Web scraping with Python: Difference between revisions
Line 52:
<em>(72)</em>
</li>
</source>
<source lang="python">
# Example 2: breadth-first crawl of dmoz category pages starting from a
# seed URL. For each visited page, print the external listings (LINKS)
# and queue the not-yet-seen sub-/related-category pages (RELATED).
#
# Usage: python crawl.py http://www.dmoz.org/Science/Astronomy/
import sys
import urllib.request
from collections import deque
from urllib.parse import urljoin


def main():
    # Third-party HTML parser; imported lazily so this module can be
    # imported without html5lib installed.
    import html5lib

    todo = deque([sys.argv[1]])  # FIFO frontier; deque gives O(1) popleft
    seen = set()                 # URLs already fetched, so link loops terminate
    while todo:
        url = todo.popleft()
        if url in seen:
            continue
        # urllib.request replaces the Python 2-only urllib2 (the rest of
        # the script already uses Python 3 print()); 'with' closes the
        # connection even if parsing fails.
        with urllib.request.urlopen(url) as f:
            print("VISITING", url)
            seen.add(url)
            src = f.read()
        # namespaceHTMLElements=False keeps tag names plain ("ul" rather
        # than "{http://www.w3.org/1999/xhtml}ul") for simple ET paths.
        tree = html5lib.parse(src, namespaceHTMLElements=False)

        # Extract the external listings on this page.
        print("LINKS")
        for ul in tree.findall(".//ul"):
            if "directory-url" not in ul.get("class", "").split():
                continue
            for li in ul.findall("li"):
                for a in li.findall("a"):
                    if "listinglink" not in a.get("class", "").split():
                        continue
                    print(a.get("href"))
                    # The description is the text after </a>, formatted
                    # as " - description"; .tail may be None, and
                    # print() handles Unicode directly in Python 3 (the
                    # old .encode("utf-8") would print a bytes repr).
                    print("\t" + (a.tail or "").strip().strip("-").strip())

        # Queue the sub- and related-category pages for later visits.
        print("RELATED")
        for ul in tree.findall(".//ul"):
            if "directory" not in ul.get("class", "").split():
                continue
            for li in ul.findall("li"):
                for a in li.findall("a"):
                    # Category hrefs are site-relative; resolve them
                    # against the page we are currently on.
                    suburl = urljoin(url, a.get("href"))
                    print(suburl)
                    print("\t" + (a.text or "").strip())
                    if suburl not in seen:
                        todo.append(suburl)
        print()


if __name__ == "__main__":
    main()
</source>
Revision as of 14:50, 26 May 2014
Tools
- python
- html5lib
- ElementTree part of the standard python library
Scraping dmoz.org
Example 1: Pulling the URLs + textual descriptions from a single page
Consider a single page on dmoz, such as:
http://www.dmoz.org/Science/Astronomy/
If we look at the page source, we can see the structure around the URLs listed at the bottom of the page:
<ul style="margin-left:0;" class="directory-url">
<li>
<a class="listinglink" href="http://www.absoluteastronomy.com/">Absolute Astronomy</a>
- Facts and statistical information about planets, moons, constellations, stars, galaxies, and Messier objects.
<div class="flag"><a href="/public/flag?cat=Science%2FAstronomy&url=http%3A%2F%2Fwww.absoluteastronomy.com%2F"><img title="report an issue with this listing" alt="[!]" src="/img/flag.png"></a></div>
</li>
# Example 1: pull the listing URLs + textual descriptions from a single
# dmoz category page given on the command line.
#
# Usage: python scrape.py http://www.dmoz.org/Science/Astronomy/
import sys
import urllib.request


def main():
    # Third-party HTML parser; imported lazily so this module can be
    # imported without html5lib installed.
    import html5lib

    url = sys.argv[1]
    # urllib.request replaces the Python 2-only urllib2 (the script
    # already uses Python 3 print()); 'with' closes the connection.
    with urllib.request.urlopen(url) as f:
        src = f.read()
    # namespaceHTMLElements=False keeps tag names plain ("ul" rather
    # than "{http://www.w3.org/1999/xhtml}ul") for simple ET paths.
    tree = html5lib.parse(src, namespaceHTMLElements=False)
    for ul in tree.findall(".//ul"):
        if "directory-url" not in ul.get("class", "").split():
            continue
        for li in ul.findall("li"):
            for a in li.findall("a"):
                if "listinglink" not in a.get("class", "").split():
                    continue
                linkurl = a.get("href")
                # The description is the text that follows </a>,
                # formatted as " - description"; .tail may be None.
                linkdescription = (a.tail or "").strip().strip("-").strip()
                print(linkurl)
                # print() writes str directly in Python 3; the old
                # .encode("utf-8") would have printed a bytes repr.
                print("\t" + linkdescription)
    print()


if __name__ == "__main__":
    main()
The pages also contain links to sub- and related categories; if we want to follow these as well, we can extend the script to crawl them:
<div class="dir-1 borN">
<span><img style="height:2px;float:left;width:100%" src="http://o.aolcdn.com/os/dmoz/img/dividerN.gif"></span>
<ul class="directory dir-col">
<li class="">
<a href="/Science/Anomalies_and_Alternative_Science/Astronomy%2C_Alternative/">Alternative</a>@
<em>(72)</em>
</li>
# Example 2: breadth-first crawl of dmoz category pages starting from a
# seed URL. For each visited page, print the external listings (LINKS)
# and queue the not-yet-seen sub-/related-category pages (RELATED).
#
# Usage: python crawl.py http://www.dmoz.org/Science/Astronomy/
import sys
import urllib.request
from collections import deque
from urllib.parse import urljoin


def main():
    # Third-party HTML parser; imported lazily so this module can be
    # imported without html5lib installed.
    import html5lib

    todo = deque([sys.argv[1]])  # FIFO frontier; deque gives O(1) popleft
    seen = set()                 # URLs already fetched, so link loops terminate
    while todo:
        url = todo.popleft()
        if url in seen:
            continue
        # urllib.request replaces the Python 2-only urllib2 (the rest of
        # the script already uses Python 3 print()); 'with' closes the
        # connection even if parsing fails.
        with urllib.request.urlopen(url) as f:
            print("VISITING", url)
            seen.add(url)
            src = f.read()
        # namespaceHTMLElements=False keeps tag names plain ("ul" rather
        # than "{http://www.w3.org/1999/xhtml}ul") for simple ET paths.
        tree = html5lib.parse(src, namespaceHTMLElements=False)

        # Extract the external listings on this page.
        print("LINKS")
        for ul in tree.findall(".//ul"):
            if "directory-url" not in ul.get("class", "").split():
                continue
            for li in ul.findall("li"):
                for a in li.findall("a"):
                    if "listinglink" not in a.get("class", "").split():
                        continue
                    print(a.get("href"))
                    # The description is the text after </a>, formatted
                    # as " - description"; .tail may be None, and
                    # print() handles Unicode directly in Python 3 (the
                    # old .encode("utf-8") would print a bytes repr).
                    print("\t" + (a.tail or "").strip().strip("-").strip())

        # Queue the sub- and related-category pages for later visits.
        print("RELATED")
        for ul in tree.findall(".//ul"):
            if "directory" not in ul.get("class", "").split():
                continue
            for li in ul.findall("li"):
                for a in li.findall("a"):
                    # Category hrefs are site-relative; resolve them
                    # against the page we are currently on.
                    suburl = urljoin(url, a.get("href"))
                    print(suburl)
                    print("\t" + (a.text or "").strip())
                    if suburl not in seen:
                        todo.append(suburl)
        print()


if __name__ == "__main__":
    main()