Wikipedia Image Scraping
First example, writing out a list of Wikipedia page titles and images (starting from the Gunny sack article):
from __future__ import print_function
import html5lib
import urllib2
from urllib2 import urlopen
from urlparse import urljoin, urldefrag
from xml.etree import ElementTree as ET

preffix = "http://en.wikipedia.org/wiki/"
urls = ["http://en.wikipedia.org/wiki/Gunny_sack"]
history = []
fili = open("image.html", "w")

while urls:
    # take the next url off the front of the queue
    url = urls[0]
    print(url)
    urls = urls[1:]
    try:
        f = urlopen(url)
        history.append(url)
        parsed = html5lib.parse(f, namespaceHTMLElements=False)
        content = parsed.find(".//*[@id='mw-content-text']")
        # write the page title as a heading
        wee = parsed.find(".//h1")
        if wee is not None:
            title = ET.tostring(wee, method="text", encoding="utf-8")
            fili.write("<h1>" + title + "</h1>")
        if content is None:
            continue
        getimg = content.findall(".//img")
        alink = content.findall(".//a")
        # step 1: check the links and add them to urls, unless they're in history
        for b in alink:
            href = b.get("href")
            if href is None:
                continue
            if href.startswith("/w/") or href.startswith("#"):
                continue
            joinity = urljoin(f.geturl(), href)
            joinity = urldefrag(joinity)[0]
            if joinity not in history and joinity not in urls and joinity.startswith(preffix):
                urls.append(joinity)
        # step 2: write the first thumbnail image of the current page
        for img in getimg:
            if img.attrib.get("class") == "thumbimage":
                source = urljoin(f.geturl(), img.attrib.get("src"))
                fili.write('<img src="' + source + '">')
                break
    except urllib2.URLError:
        print("Nee!")
Second example, writing only the images, each linked back to the page it was found on:
from __future__ import print_function
import html5lib
import urllib2
from urllib2 import urlopen
from urlparse import urljoin, urldefrag

preffix = "http://en.wikipedia.org/wiki/"
urls = ["http://en.wikipedia.org/wiki/Gunny_sack"]
history = []
fili = open("image.html", "w")

while urls:
    # take the next url off the front of the queue
    url = urls[0]
    print(url)
    urls = urls[1:]
    try:
        f = urlopen(url)
        history.append(url)
        parsed = html5lib.parse(f, namespaceHTMLElements=False)
        content = parsed.find(".//*[@id='mw-content-text']")
        if content is None:
            continue
        getimg = content.findall(".//img")
        alink = content.findall(".//a")
        # step 1: check the links and add them to urls, unless they're in history
        for b in alink:
            href = b.get("href")
            if href is None:
                continue
            if href.startswith("/w/") or href.startswith("#"):
                continue
            joinity = urljoin(f.geturl(), href)
            joinity = urldefrag(joinity)[0]
            if joinity not in history and joinity not in urls and joinity.startswith(preffix):
                urls.append(joinity)
        # step 2: write the first thumbnail image, linked back to the page it came from
        for img in getimg:
            if img.attrib.get("class") == "thumbimage":
                source = urljoin(f.geturl(), img.attrib.get("src"))
                fili.write('<a href="' + url + '"><img src="' + source + '"></a>')
                break
    except urllib2.URLError:
        print("Nee!")