From Media Design: Networked & Lens-Based wiki
Jump to navigation Jump to search

Download all images from a 4chan thread (Python 2 script)

import time, os, re
import urllib, urllib2

# Thread to scrape. The last path segment of this URL (the thread number) is
# reused below as the name of the local download folder.
threadurl = "http://boards.4chan.org/tv/res/7240846"
# Who are we? User-agent header sent with the thread request.
# NOTE(review): the parenthesis opened after "Mozilla/5.0" is never closed, so
# part of this string (probably the "rv:" version) looks lost -- confirm.
header = {"User-agent": "Mozilla/5.0 (X11; U; Linux i686; en-US; rv: Gecko/20100106 Ubuntu/9.10 (karmic) Firefox/3.5.7"}

# Make the request and store the page source; close the connection even if
# reading the body fails.
request = urllib2.Request(url=threadurl, headers=header)
response = urllib2.urlopen(request)
try:
    imgthread = response.read()
finally:
    response.close()

# Find all the image URLs (raw string so the regex escapes are passed to the
# regex engine untouched).
imgurl = re.compile(r"http://\w+\.4chan\.org/\w+/src/\d+\.(?:jpg|jpeg|png|gif|svg)")
imgurls = imgurl.findall(imgthread)

# We need a folder to store the images. Strip a trailing slash first so a URL
# such as ".../res/7240846/" does not produce an empty folder name.
targetfolder = threadurl.rstrip('/').split('/')[-1]
if not os.path.isdir(targetfolder):
    os.mkdir(targetfolder)

# Download each distinct image URL into the target folder. The same URL can
# match several times in the page, hence the set; sorting only makes the
# download order stable between runs.
for url in sorted(set(imgurls)):
    filename = url.split('/')[-1]
    # Parenthesised single-argument form prints the same text as the plain
    # Python 2 print statement.
    print("downloading " + filename)
    urllib.urlretrieve(url, os.path.join(targetfolder, filename))