Download festival hack

I didn't make it to Download this year, but the pictures look awesome. Before they disappear into nowhere I wanted to grab a copy, so I hacked up a bit of Python to do some scraping, and it works (not very well, but it was more of a five-minute thing than a let's-make-a-good-job-of-it thing). The script is below:

#!/usr/bin/env python
'''
Uber hack to download images from the Download festival website; kinda works.
Very messy, as I ripped most of the code out of other scripts I've written.
'''
import os
import urllib
import hashlib
import threading
import Queue
import urlparse
from BeautifulSoup import BeautifulSoup
# Directory the downloaded images are saved into.
base_dir = '/home/damian/Pictures/download-2011'

def get_total_pages():
    '''
    Return the number of gallery pages as an int.

    Fetches the first gallery page and takes the number from the text of
    the div with class "page_count", which is expected to read
    "Page 1 of N".
    '''
    response = urllib.urlopen("http://photos.downloadfestival.co.uk/view/")
    document = BeautifulSoup(response)
    response.close()

    counter = document.find("div", {"class": "page_count"})
    label = str(counter.string.strip())
    # Drop the fixed prefix so that only the total is left to convert.
    return int(label.replace('Page 1 of ', ''))

def get_images(page):
    '''
    Return the image URLs listed on one gallery page.

    page is the page number substituted into the "?page=%d" query string.
    The result is a list holding the src attribute of the img inside the
    link of every li found in the ul with class "item_list".
    '''
    response = urllib.urlopen(
        "http://photos.downloadfestival.co.uk/view/?page=%d" % page)
    document = BeautifulSoup(response)
    response.close()

    listing = document.find("ul", {"class": "item_list"})
    return [entry.a.img['src'] for entry in listing.findAll("li")]

class Runner(threading.Thread):
    '''
    Worker thread that downloads image URLs taken from a shared queue.

    queue must provide get() and task_done(); every item taken from it is
    treated as an image URL and saved under base_dir.
    '''

    def __init__(self, queue):
        threading.Thread.__init__(self)
        self.queue = queue

    def run(self):
        '''
        Take URLs off the queue and download them, forever.

        A failed download is reported and skipped rather than allowed to
        end the thread.
        '''
        while True:
            url = self.queue.get()
            try:
                self.get_image(url)
            except Exception as exc:
                # One bad URL must not kill the worker: the remaining
                # items would then never be processed.
                print("Failed to download %s: %s" % (url, exc))
            finally:
                # Always acknowledge the item, otherwise a join() on the
                # queue would block forever after a failure.
                self.queue.task_done()

    def get_image(self, url):
        '''
        Download url and store it in base_dir.

        The file is named after the MD5 hex digest of the URL, followed by
        the extension of the URL path when it has one. Prints the
        destination path once the file has been written.
        '''
        urldata = urlparse.urlparse(url)

        # splitext gives '' for a path without an extension; splitting on
        # '.' by hand returned the whole path (slashes included) instead.
        ext = os.path.splitext(urldata.path)[1]
        file_name = hashlib.md5(url).hexdigest() + ext
        file_path = os.path.join(base_dir, file_name)

        uh = urllib.urlopen(url)
        try:
            data = uh.read()
        finally:
            uh.close()

        # Image data is binary, so the file must not be opened in text mode.
        with open(file_path, 'wb') as fh:
            fh.write(data)
        print("Downloaded image to %s" % file_path)

if __name__ == "__main__":
    # Make sure the target directory exists before any image is written.
    if not os.path.isdir(base_dir):
        os.makedirs(base_dir)

    total = get_total_pages()
    images = []

    print("%d pages found" % total)
    # The count comes from "Page 1 of N", so pages run from 1 to total
    # inclusive; range(1, total) would skip the last page.
    for p in range(1, total + 1):
        print("Processing page %d" % p)
        found = get_images(p)

        print("Found %d images" % len(found))
        images.extend(found)

    print("Downloading %d images" % len(images))
    queue = Queue.Queue()
    for _ in range(20):
        thread = Runner(queue)
        # Runner.run() loops forever, so the workers are daemon threads
        # and do not keep the process alive once the queue is drained.
        thread.setDaemon(True)
        thread.start()

    for image in images:
        queue.put(image)
    # Block until every queued URL has been marked done by a worker.
    queue.join()

Comments