Image-Scrape: Difference between revisions
Marie Wocher (talk | contribs) (Created page with "This script can scrape all images from all articles of a newspage (zeit.de) you get by a choosen searchterm <source lang="python">") |
Marie Wocher (talk | contribs) No edit summary |
||
Line 1: | Line 1: | ||
This script can scrape all images from all articles of a newspage (zeit.de) you get by a choosen searchterm | This script can scrape all images from all articles of a newspage (zeit.de) you get by a choosen searchterm | ||
<source lang="python"> | <source lang="python"> | ||
import html5lib, urllib2 | |||
import lxml.cssselect | |||
import re | |||
def get (url): | |||
htmlparser = html5lib.HTMLParser(tree=html5lib.treebuilders.getTreeBuilder("lxml"), namespaceHTMLElements=False) | |||
request = urllib2.Request(url) | |||
request.add_header("User-Agent", "Mozilla/5.0 (X11; U; Linux x86_64; fr; rv:1.9.1.5) Gecko/20091109 Ubuntu/9.10 (karmic) Firefox/3.5.5") | |||
f=urllib2.urlopen(request) | |||
page = htmlparser.parse(f) | |||
return page | |||
def download (url, filename): | |||
#tm.replace("148x84","540x304") | |||
url=re.sub("148x84","540x304", url) | |||
print url | |||
#print "downloading", url, "to", filename | |||
htmlparser = html5lib.HTMLParser(tree=html5lib.treebuilders.getTreeBuilder("lxml"), namespaceHTMLElements=False) | |||
request = urllib2.Request(url) | |||
request.add_header("User-Agent", "Mozilla/5.0 (X11; U; Linux x86_64; fr; rv:1.9.1.5) Gecko/20091109 Ubuntu/9.10 (karmic) Firefox/3.5.5") | |||
f=urllib2.urlopen(request) | |||
o=open(filename, "wb") | |||
o.write(f.read()) | |||
o.close() | |||
def get_images(url): | |||
p = get(url) | |||
s=lxml.cssselect.CSSSelector ("div#main li") | |||
lis = s (p) | |||
counter = 0 | |||
for li in lis: | |||
#print item | |||
try: | |||
s2=lxml.cssselect.CSSSelector ("img") | |||
imgs=s2 (li) | |||
if len(imgs) == 1: | |||
img = imgs[0] | |||
url= img.attrib.get("src") | |||
s2=lxml.cssselect.CSSSelector ("p.meta") | |||
p =s2 (li)[0] | |||
date = p.text | |||
date = date.split(",")[0] | |||
download(url, date+str(counter)+".jpg") | |||
counter = counter+1 | |||
except ValueError: | |||
pass | |||
for pnum in range(1, 310): | |||
print "page", pnum | |||
get_images("http://www.zeit.de/suche/index?q=SEARCHTERM&sort=aktuell&rezension=0&tmode=&from=&to=&p="+str(pnum)) |
Latest revision as of 12:35, 4 April 2012
This script scrapes every image from all articles that a news site (zeit.de) returns for a chosen search term. <source lang="python">
import html5lib, urllib2 import lxml.cssselect import re
def get(url):
    """Fetch *url* and return the document parsed by html5lib.

    The page is requested with a desktop-Firefox ``User-Agent`` header and
    parsed with html5lib's "lxml" tree builder, so the returned tree can be
    queried with ``lxml.cssselect`` selectors.  Elements are built with
    ``namespaceHTMLElements=False``.

    Exceptions raised by ``urllib2`` while opening the URL are not handled
    here and propagate to the caller.
    """
    htmlparser = html5lib.HTMLParser(
        tree=html5lib.treebuilders.getTreeBuilder("lxml"),
        namespaceHTMLElements=False
    )
    request = urllib2.Request(url)
    request.add_header("User-Agent", "Mozilla/5.0 (X11; U; Linux x86_64; fr; rv:1.9.1.5) Gecko/20091109 Ubuntu/9.10 (karmic) Firefox/3.5.5")
    response = urllib2.urlopen(request)
    try:
        return htmlparser.parse(response)
    finally:
        # Release the connection even when parsing fails; this function is
        # called once per result page, so an unclosed response adds up.
        response.close()
def download(url, filename):
    """Fetch *url* and write the response body to *filename*.

    Every occurrence of "148x84" in *url* is replaced by "540x304" before
    the request is made (presumably switching from a thumbnail to a larger
    rendition -- confirm against the site).  The rewritten URL is printed.

    The request carries a desktop-Firefox ``User-Agent`` header.  The file is
    opened in binary mode and only after the request has succeeded, so a
    failed request does not leave an empty file behind.  Exceptions from
    ``urllib2`` or from writing the file propagate to the caller.
    """
    url = re.sub("148x84", "540x304", url)
    print(url)
    request = urllib2.Request(url)
    request.add_header("User-Agent", "Mozilla/5.0 (X11; U; Linux x86_64; fr; rv:1.9.1.5) Gecko/20091109 Ubuntu/9.10 (karmic) Firefox/3.5.5")
    response = urllib2.urlopen(request)
    try:
        # `with` closes the file even if reading or writing fails midway.
        with open(filename, "wb") as out:
            out.write(response.read())
    finally:
        response.close()
def get_images(url):
    """Download the image of each result item found on the page at *url*.

    Every ``li`` below ``div#main`` that contains exactly one ``img`` is
    processed.  The image's ``src`` is passed to ``download`` and saved as
    ``<date><n>.jpg``, where ``<date>`` is the text of the item's first
    ``p.meta`` element up to the first comma and ``<n>`` is the number of
    images this call has saved so far.

    Items without a ``src`` attribute, without a ``p.meta`` element, or whose
    ``p.meta`` has no text are skipped.

    NOTE: ``<n>`` restarts at 0 on every call, so two calls that encounter
    the same date text produce the same filenames and the later download
    overwrites the earlier file.
    """
    page = get(url)

    # Compile each selector once instead of once per list item.
    select_items = lxml.cssselect.CSSSelector("div#main li")
    select_img = lxml.cssselect.CSSSelector("img")
    select_meta = lxml.cssselect.CSSSelector("p.meta")

    counter = 0
    for li in select_items(page):
        imgs = select_img(li)
        if len(imgs) != 1:
            continue

        src = imgs[0].attrib.get("src")
        metas = select_meta(li)
        # Skip incomplete items rather than letting an IndexError,
        # AttributeError or TypeError abort the whole run.
        if src is None or not metas or metas[0].text is None:
            continue

        date = metas[0].text.split(",")[0]
        try:
            download(src, date + str(counter) + ".jpg")
        except ValueError:
            # Presumably raised by urllib2 for a src it cannot interpret as
            # a URL (e.g. a relative path); such images are skipped and do
            # not consume a counter value.
            continue
        counter += 1
# Search term placed into the query string exactly as written here; it is not
# escaped, so it must already be URL-encoded.
SEARCH_TERM = "SEARCHTERM"

# Inclusive range of result pages to visit.
FIRST_PAGE = 1
LAST_PAGE = 309

# zeit.de search URL; the placeholders are the search term and page number.
SEARCH_URL = "http://www.zeit.de/suche/index?q=%s&sort=aktuell&rezension=0&tmode=&from=&to=&p=%d"

# Visit every result page in order and download the images it lists.
for pnum in range(FIRST_PAGE, LAST_PAGE + 1):
    print("page %d" % pnum)
    get_images(SEARCH_URL % (SEARCH_TERM, pnum))