Image-Scrape
Revision as of 12:35, 4 April 2012 by Marie Wocher (talk | contribs)
This script scrapes all images from all articles of a news site (zeit.de) that are returned for a chosen search term. <source lang="python">
import html5lib, urllib2 import lxml.cssselect import re
def get(url):
    """Fetch ``url`` and return the page parsed by html5lib.

    The request is sent with a desktop Firefox ``User-Agent`` header.
    The response is parsed with html5lib's "lxml" tree builder, with
    ``namespaceHTMLElements`` switched off (presumably so that plain
    tag-name CSS selectors match the resulting elements -- confirm
    against the html5lib documentation).

    Parameters:
        url: address of the page to fetch.

    Returns:
        The document produced by ``HTMLParser.parse``.

    Raises:
        Whatever ``urllib2.urlopen`` raises for an unreachable or
        malformed URL; nothing is caught here.
    """
    parser = html5lib.HTMLParser(
        tree=html5lib.treebuilders.getTreeBuilder("lxml"),
        namespaceHTMLElements=False)
    request = urllib2.Request(url)
    request.add_header(
        "User-Agent",
        "Mozilla/5.0 (X11; U; Linux x86_64; fr; rv:1.9.1.5) Gecko/20091109 Ubuntu/9.10 (karmic) Firefox/3.5.5")
    response = urllib2.urlopen(request)
    try:
        return parser.parse(response)
    finally:
        # Release the connection even when parsing fails; this function is
        # called once per result page, so leaked responses add up.
        response.close()
def download(url, filename):
    """Download ``url`` and write the response body to ``filename``.

    Before the request is made, every occurrence of "148x84" in the URL
    is replaced by "540x304" (presumably to request a larger rendition
    of a thumbnail -- confirm against the site's image URLs). The
    rewritten URL is printed.

    Parameters:
        url: address of the image to fetch.
        filename: path of the file to create; it is opened in binary
            write mode, so an existing file is overwritten.

    Raises:
        Whatever ``urllib2`` raises for the request, and any error from
        opening or writing the file; nothing is caught here.
    """
    url = re.sub("148x84", "540x304", url)
    print(url)
    request = urllib2.Request(url)
    request.add_header(
        "User-Agent",
        "Mozilla/5.0 (X11; U; Linux x86_64; fr; rv:1.9.1.5) Gecko/20091109 Ubuntu/9.10 (karmic) Firefox/3.5.5")
    response = urllib2.urlopen(request)
    try:
        # The file is only created once the request has succeeded.
        with open(filename, "wb") as out:
            out.write(response.read())
    finally:
        response.close()
def get_images(url):
    """Download the image of each list item on the page at ``url``.

    The page is fetched with ``get``. For every ``li`` under
    ``div#main`` that contains exactly one ``img``, the image's ``src``
    is passed to ``download``. The target filename is the text of the
    item's first ``p.meta`` element up to the first comma (the original
    code treats this as a date), followed by a running counter and
    ".jpg".

    Items with no ``src`` attribute, no ``p.meta`` element, or a
    ``p.meta`` without leading text are skipped. Items whose download
    raises ``ValueError`` are skipped as well and do not advance the
    counter.

    NOTE(review): the counter restarts at 0 on every call, so two calls
    that yield the same filename prefix will write to the same file.

    Parameters:
        url: address of the result page to scan.

    Returns:
        None.
    """
    page = get(url)

    # Compile the selectors once instead of once per list item.
    select_items = lxml.cssselect.CSSSelector("div#main li")
    select_images = lxml.cssselect.CSSSelector("img")
    select_meta = lxml.cssselect.CSSSelector("p.meta")

    counter = 0
    for item in select_items(page):
        images = select_images(item)
        if len(images) != 1:
            continue

        src = images[0].attrib.get("src")
        meta = select_meta(item)
        if src is None or not meta or meta[0].text is None:
            # Incomplete item: nothing to download or no name to save under.
            continue

        date = meta[0].text.split(",")[0]
        try:
            download(src, date + str(counter) + ".jpg")
        except ValueError:
            # Best effort, as before: a ValueError from the download
            # (presumably a URL urllib2 cannot handle) skips this item.
            continue
        counter += 1
# Walk result pages 1 through 309 of the search and download the images
# found on each one. Replace SEARCHTERM in the URL with the actual query.
_SEARCH_URL = "http://www.zeit.de/suche/index?q=SEARCHTERM&sort=aktuell&rezension=0&tmode=&from=&to=&p="

for pnum in range(1, 310):
    print("page %d" % pnum)
    get_images(_SEARCH_URL + str(pnum))