Web scraping with Python
Tools
- Python 2 (the examples below use the urllib2 and urlparse modules)
- html5lib
- ElementTree (part of the standard Python library)
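These fit together simply: html5lib parses even messy real-world HTML into an ElementTree structure, which we can then query with methods like findall. A minimal sketch of the combination (the markup here is made up for illustration):

from __future__ import print_function
import html5lib

src = "<ul class='directory-url'><li><a href='http://example.com/'>Example</a> - A description.</li></ul>"

# namespaceHTMLElements=False gives plain tag names ("a" rather than
# "{http://www.w3.org/1999/xhtml}a"), which keeps findall queries simple
tree = html5lib.parse(src, namespaceHTMLElements=False)
for a in tree.findall(".//a"):
    print(a.get("href"), a.text)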
Scraping dmoz.org
From the dmoz website:
DMOZ is the largest, most comprehensive human-edited directory of the Web. It is constructed and maintained by a passionate, global community of volunteer editors. It was historically known as the Open Directory Project (ODP).
Example 1: Pulling the URLs + textual descriptions from a single page
Consider a single page on dmoz, such as:
http://www.dmoz.org/Science/Astronomy/
If you look at the source, you can see the structure around the URLs listed at the bottom of the page:
<ul style="margin-left:0;" class="directory-url">
  <li>
    <a class="listinglink" href="http://www.absoluteastronomy.com/">Absolute Astronomy</a>
    - Facts and statistical information about planets, moons, constellations, stars, galaxies, and Messier objects.
    <div class="flag"><a href="/public/flag?cat=Science%2FAstronomy&url=http%3A%2F%2Fwww.absoluteastronomy.com%2F"><img title="report an issue with this listing" alt="[!]" src="/img/flag.png"></a></div>
  </li>
  ...
</ul>
Each link is an <a> element with class "listinglink", and the description is the plain text that follows it. The following script pulls out the URLs and descriptions:

from __future__ import print_function
import urllib2, html5lib, sys

url = sys.argv[1]
f = urllib2.urlopen(url)
src = f.read()
tree = html5lib.parse(src, namespaceHTMLElements=False)
for ul in tree.findall(".//ul"):
    if "directory-url" in ul.get("class", "").split():
        for li in ul.findall("li"):
            for a in li.findall("a"):
                if "listinglink" in a.get("class", "").split():
                    linkurl = a.get("href")
                    # a.tail is the text between the closing </a> and the
                    # next tag; strip the whitespace and the leading "-"
                    linkdescription = a.tail.strip().strip("-").strip()
                    print(linkurl)
                    print("\t" + linkdescription.encode("utf-8"))
                    print()
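To try it, pass a category URL on the command line (the script name here is just an example):

python scrape_dmoz.py http://www.dmoz.org/Science/Astronomy/

This prints each link URL followed by its description on an indented line, with a blank line between entries.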
Example 2: Following links to sub- and related categories

The pages also contain links to sub- and related categories. If we want to follow these as well, we can look at how they are marked up (the trailing "@" marks a link to a related category that lives elsewhere in the directory tree):
<div class="dir-1 borN">
  <span><img style="height:2px;float:left;width:100%" src="http://o.aolcdn.com/os/dmoz/img/dividerN.gif"></span>
  <ul class="directory dir-col">
    <li class="">
      <a href="/Science/Anomalies_and_Alternative_Science/Astronomy%2C_Alternative/">Alternative</a>@
      <em>(72)</em>
    </li>
    ...
  </ul>
</div>
The crawler keeps a todo list of pages still to visit and a seen dictionary so that no page is fetched twice:

from __future__ import print_function
import urllib2, html5lib, sys
from urlparse import urljoin

url = sys.argv[1]
todo = [url]
seen = {}
while len(todo) > 0:
    # take the next URL from the front of the queue
    url, todo = todo[0], todo[1:]
    if url not in seen:
        f = urllib2.urlopen(url)
        print("VISITING", url)
        seen[url] = True
        src = f.read()
        tree = html5lib.parse(src, namespaceHTMLElements=False)
        # Extract links
        print("LINKS")
        for ul in tree.findall(".//ul"):
            if "directory-url" in ul.get("class", "").split():
                for li in ul.findall("li"):
                    for a in li.findall("a"):
                        if "listinglink" in a.get("class", "").split():
                            linkurl = a.get("href")
                            linkdescription = a.tail.strip().strip("-").strip()
                            print(linkurl)
                            print("\t" + linkdescription.encode("utf-8"))
        # Follow the related category pages
        print("RELATED")
        for ul in tree.findall(".//ul"):
            if "directory" in ul.get("class", "").split():
                for li in ul.findall("li"):
                    for a in li.findall("a"):
                        suburl = a.get("href")
                        # category links are relative, so resolve them
                        # against the current page's URL
                        suburl = urljoin(url, suburl)
                        description = a.text.strip()
                        print(suburl)
                        print("\t" + description.encode("utf-8"))
                        if suburl not in seen:
                            todo.append(suburl)
        print()
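A note on the queue handling: taking todo[0] and rebinding todo to todo[1:] gives a first-in-first-out order (a breadth-first crawl), but copies the remaining list on every iteration. A collections.deque does the same bookkeeping in constant time; a minimal sketch of the same loop skeleton:

from collections import deque
import sys

url = sys.argv[1]
todo = deque([url])
seen = {}
while todo:
    url = todo.popleft()  # O(1), instead of copying the whole list each step
    if url in seen:
        continue
    seen[url] = True
    # ... fetch and parse the page as above, calling
    # todo.append(suburl) for each new category URL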
Example 3: Scraping the links while crawling to adjacent categories

This version crawls exactly as before, but collects the outgoing links into a dictionary instead of printing them immediately, which also lets us notice when the same link appears under more than one category:
from __future__ import print_function
import urllib2, html5lib, sys
from urlparse import urljoin

url = sys.argv[1]
todo = [url]
seen = {}
links = {}
while len(todo) > 0:
    url, todo = todo[0], todo[1:]
    if url not in seen:
        f = urllib2.urlopen(url)
        print("VISITING", url)
        seen[url] = True
        src = f.read()
        tree = html5lib.parse(src, namespaceHTMLElements=False)
        # Extract links
        for ul in tree.findall(".//ul"):
            if "directory-url" in ul.get("class", "").split():
                for li in ul.findall("li"):
                    for a in li.findall("a"):
                        if "listinglink" in a.get("class", "").split():
                            linkurl = a.get("href")
                            linkdescription = a.tail.strip().strip("-").strip()
                            if linkurl in links:
                                print("already seen", linkurl)
                            # Record the link
                            links[linkurl] = linkdescription
        # Follow the related category pages
        for ul in tree.findall(".//ul"):
            if "directory" in ul.get("class", "").split():
                for li in ul.findall("li"):
                    for a in li.findall("a"):
                        suburl = a.get("href")
                        suburl = urljoin(url, suburl)
                        description = a.text.strip()
                        if suburl not in seen:
                            # Add the suburl to the todo list
                            todo.append(suburl)
    # running total of collected links
    print("links", len(links))