Wikipedia Image Scraping

First example, writing out a list of Wikipedia page titles and their first thumbnail images (starting with the Gunny sack page):

from __future__ import print_function
import html5lib
from urlparse import urljoin, urldefrag
import urllib2
from urllib2 import urlopen
from xml.etree import ElementTree as ET

prefix = "http://en.wikipedia.org/wiki/"
urls = ["http://en.wikipedia.org/wiki/Gunny_sack"]
history = []
fili = open("image.html", "w")

while urls:
    url = urls.pop(0)
    print(url)
    try:
        f = urlopen(url)
        history.append(url)
        parsed = html5lib.parse(f, namespaceHTMLElements=False)
        content = parsed.find(".//*[@id='mw-content-text']")
        heading = parsed.find(".//h1")
        title = ET.tostring(heading, method="text", encoding="utf-8")
        fili.write("<h1>" + title + "</h1>")
        if content is None:
            continue
        getimg = content.findall(".//img")
        alink = content.findall(".//a")
        # step 1: check the links and add them to urls, unless they're in history
        for b in alink:
            href = b.get("href")
            if href is None:
                continue
            if href.startswith("/w/") or href.startswith("#"):
                continue
            joinity = urljoin(f.geturl(), href)
            joinity = urldefrag(joinity)[0]  # drop any #fragment
            if joinity not in history and joinity not in urls and joinity.startswith(prefix):
                urls.append(joinity)
        # step 2: write the first thumbnail image of the current page
        for img in getimg:
            if img.attrib.get("class") == "thumbimage":
                source = urljoin(f.geturl(), img.attrib.get("src"))
                fili.write('<img src="' + source + '">')
                break
        fili.flush()  # make the results visible while the crawl runs
    except urllib2.URLError:
        print("Nee!")

Second example, writing only the images, each one linked back to the page it was found on:

from __future__ import print_function
import html5lib
from urlparse import urljoin, urldefrag
import urllib2
from urllib2 import urlopen

prefix = "http://en.wikipedia.org/wiki/"
urls = ["http://en.wikipedia.org/wiki/Gunny_sack"]
history = []
fili = open("image.html", "w")

while urls:
    url = urls.pop(0)
    print(url)
    try:
        f = urlopen(url)
        history.append(url)
        parsed = html5lib.parse(f, namespaceHTMLElements=False)
        content = parsed.find(".//*[@id='mw-content-text']")
        if content is None:
            continue
        getimg = content.findall(".//img")
        alink = content.findall(".//a")
        # step 1: check the links and add them to urls, unless they're in history
        for b in alink:
            href = b.get("href")
            if href is None:
                continue
            if href.startswith("/w/") or href.startswith("#"):
                continue
            joinity = urljoin(f.geturl(), href)
            joinity = urldefrag(joinity)[0]  # drop any #fragment
            if joinity not in history and joinity not in urls and joinity.startswith(prefix):
                urls.append(joinity)
        # step 2: write the first thumbnail image, wrapped in a link back to its page
        for img in getimg:
            if img.attrib.get("class") == "thumbimage":
                source = urljoin(f.geturl(), img.attrib.get("src"))
                fili.write('<a href="' + url + '"><img src="' + source + '"></a>')
                break
        fili.flush()  # make the results visible while the crawl runs
    except urllib2.URLError:
        print("Nee!")