Image Scrape: Difference between revisions

From XPUB & Lens-Based wiki
(Created page with "This script can scrape all Images from all articles of a newspage (zeit.de) you get by a choosen searchterm. <source lang="python">")
 
No edit summary
 
Line 2: Line 2:


<source lang="python">
<source lang="python">
import html5lib, urllib2
import lxml.cssselect
import re
def get(url):
    """Fetch *url* and return the page parsed by html5lib into an lxml tree.

    The request is sent with a desktop Firefox User-Agent header. The parser
    is built with ``namespaceHTMLElements=False``.

    Raises whatever ``urllib2.urlopen`` raises for the given URL.
    """
    # Function-scope import so this block does not depend on the file header.
    import contextlib

    htmlparser = html5lib.HTMLParser(tree=html5lib.treebuilders.getTreeBuilder("lxml"), namespaceHTMLElements=False)
    request = urllib2.Request(url)
    request.add_header("User-Agent", "Mozilla/5.0 (X11; U; Linux x86_64; fr; rv:1.9.1.5) Gecko/20091109 Ubuntu/9.10 (karmic) Firefox/3.5.5")
    # closing() makes sure the response is closed even if parsing raises.
    with contextlib.closing(urllib2.urlopen(request)) as response:
        return htmlparser.parse(response)
def download (url, filename):
    #tm.replace("148x84","540x304")
    url=re.sub("148x84","540x304", url)
    print url
    #print "downloading", url, "to", filename
    htmlparser = html5lib.HTMLParser(tree=html5lib.treebuilders.getTreeBuilder("lxml"), namespaceHTMLElements=False)
    request = urllib2.Request(url)
    request.add_header("User-Agent", "Mozilla/5.0 (X11; U; Linux x86_64; fr; rv:1.9.1.5) Gecko/20091109 Ubuntu/9.10 (karmic) Firefox/3.5.5")
    f=urllib2.urlopen(request)
    o=open(filename, "wb")
    o.write(f.read())
    o.close()
def get_images(url):
    """Download one image per list item found on the page at *url*.

    Every ``div#main li`` element that contains exactly one ``img`` is
    considered. The image ``src`` is downloaded to a file named
    ``<label><counter>.jpg``, where ``<label>`` is the text of the item's
    first ``p.meta`` element up to the first comma, and ``<counter>`` counts
    the successful downloads made so far on this page.

    Items without a ``src``, without a ``p.meta`` element, or whose
    ``p.meta`` has no text are skipped.
    """
    # Build the selectors once instead of on every iteration.
    select_items = lxml.cssselect.CSSSelector("div#main li")
    select_img = lxml.cssselect.CSSSelector("img")
    select_meta = lxml.cssselect.CSSSelector("p.meta")

    counter = 0
    for li in select_items(get(url)):
        imgs = select_img(li)
        if len(imgs) != 1:
            continue
        src = imgs[0].attrib.get("src")
        metas = select_meta(li)
        if src is None or not metas or metas[0].text is None:
            continue
        date = metas[0].text.split(",")[0]
        try:
            download(src, date + str(counter) + ".jpg")
        except ValueError:
            # Presumably raised by urllib2 for a src it cannot open (e.g. an
            # unknown URL type); the item is skipped and not counted.
            continue
        counter = counter + 1
for pnum in range(1, 310):
    print "page", pnum
    get_images("http://www.zeit.de/suche/index?q=libyen&sort=aktuell&rezension=0&tmode=&from=&to=&p="+str(pnum))

Latest revision as of 13:24, 4 April 2012

This script scrapes all images from all articles of a news site (zeit.de) that are returned for a chosen search term.

<source lang="python">

import html5lib, urllib2
import lxml.cssselect
import re

def get(url):
    """Fetch *url* and return the page parsed by html5lib into an lxml tree.

    The request is sent with a desktop Firefox User-Agent header. The parser
    is built with ``namespaceHTMLElements=False``.

    Raises whatever ``urllib2.urlopen`` raises for the given URL.
    """
    # Function-scope import so this block does not depend on the file header.
    import contextlib

    htmlparser = html5lib.HTMLParser(tree=html5lib.treebuilders.getTreeBuilder("lxml"), namespaceHTMLElements=False)
    request = urllib2.Request(url)
    request.add_header("User-Agent", "Mozilla/5.0 (X11; U; Linux x86_64; fr; rv:1.9.1.5) Gecko/20091109 Ubuntu/9.10 (karmic) Firefox/3.5.5")
    # closing() makes sure the response is closed even if parsing raises.
    with contextlib.closing(urllib2.urlopen(request)) as response:
        return htmlparser.parse(response)


def download (url, filename):

   #tm.replace("148x84","540x304")
   url=re.sub("148x84","540x304", url)
   print url
   #print "downloading", url, "to", filename
   htmlparser = html5lib.HTMLParser(tree=html5lib.treebuilders.getTreeBuilder("lxml"), namespaceHTMLElements=False)
   request = urllib2.Request(url)
   request.add_header("User-Agent", "Mozilla/5.0 (X11; U; Linux x86_64; fr; rv:1.9.1.5) Gecko/20091109 Ubuntu/9.10 (karmic) Firefox/3.5.5")
   f=urllib2.urlopen(request)
   o=open(filename, "wb")
   o.write(f.read())
   o.close()


def get_images(url):
    """Download one image per list item found on the page at *url*.

    Every ``div#main li`` element that contains exactly one ``img`` is
    considered. The image ``src`` is downloaded to a file named
    ``<label><counter>.jpg``, where ``<label>`` is the text of the item's
    first ``p.meta`` element up to the first comma, and ``<counter>`` counts
    the successful downloads made so far on this page.

    Items without a ``src``, without a ``p.meta`` element, or whose
    ``p.meta`` has no text are skipped.
    """
    # Build the selectors once instead of on every iteration.
    select_items = lxml.cssselect.CSSSelector("div#main li")
    select_img = lxml.cssselect.CSSSelector("img")
    select_meta = lxml.cssselect.CSSSelector("p.meta")

    counter = 0
    for li in select_items(get(url)):
        imgs = select_img(li)
        if len(imgs) != 1:
            continue
        src = imgs[0].attrib.get("src")
        metas = select_meta(li)
        if src is None or not metas or metas[0].text is None:
            continue
        date = metas[0].text.split(",")[0]
        try:
            download(src, date + str(counter) + ".jpg")
        except ValueError:
            # Presumably raised by urllib2 for a src it cannot open (e.g. an
            # unknown URL type); the item is skipped and not counted.
            continue
        counter = counter + 1

for pnum in range(1, 310):

   print "page", pnum
   get_images("http://www.zeit.de/suche/index?q=libyen&sort=aktuell&rezension=0&tmode=&from=&to=&p="+str(pnum))