|
|
(9 intermediate revisions by the same user not shown) |
Line 2: |
Line 2: |
| * [[python]] | | * [[python]] |
| * [[html5lib]] | | * [[html5lib]] |
| * [https://docs.python.org/2/library/xml.etree.elementtree.html ElementTree] part of the standard python library | | * [[ElementTree]] part of the standard python library |
|
| |
|
| == Scraping dmoz.org == | | == Examples == |
| | | * [[Scraping the Open Directory with Python]] |
| === Example 1: Pulling the URLs + textual descriptions from a single page ===
| | * [[Wikiwalker: Crawling wikipedia pages for images]] |
| | |
| Consider a single page on dmoz, such as:
| |
| | |
| http://www.dmoz.org/Science/Astronomy/
| |
| | |
| If we look into the source, we can see the structure around the URLs listed at the bottom of the page:
| |
| | |
| <source lang="html4strict">
| |
| <ul style="margin-left:0;" class="directory-url">
| |
| <li>
| |
| <a class="listinglink" href="http://www.absoluteastronomy.com/">Absolute Astronomy</a>
| |
| - Facts and statistical information about planets, moons, constellations, stars, galaxies, and Messier objects.
| |
| <div class="flag"><a href="/public/flag?cat=Science%2FAstronomy&url=http%3A%2F%2Fwww.absoluteastronomy.com%2F"><img title="report an issue with this listing" alt="[!]" src="/img/flag.png"></a></div>
| |
| </li>
| |
| </source>
| |
| | |
| <source lang="python">
| |
| url = sys.argv[1]
| |
| | |
| f = urllib2.urlopen(url)
| |
| src = f.read()
| |
| tree = html5lib.parse(src, namespaceHTMLElements=False)
| |
| | |
| for div in tree.findall(".//ul"):
| |
| if "directory-url" in div.get("class", "").split():
| |
| for li in div.findall("li"):
| |
| for a in li.findall("a"):
| |
| if "listinglink" in a.get("class", "").split():
| |
| linkurl = a.get("href")
| |
| linkdescription = a.tail.strip().strip("-").strip()
| |
| print (linkurl)
| |
| print ("\t"+linkdescription.encode("utf-8"))
| |
| print ()</source>
| |
| | |
| === Example 2: Digging into sub / related categories ===
| |
| The pages also contain links to sub- and related categories; if we want to follow these as well, we can...
| |
| | |
| <source lang="html4strict">
| |
| <div class="dir-1 borN">
| |
| <span><img style="height:2px;float:left;width:100%" src="http://o.aolcdn.com/os/dmoz/img/dividerN.gif"></span>
| |
| <ul class="directory dir-col">
| |
| <li class="">
| |
| <a href="/Science/Anomalies_and_Alternative_Science/Astronomy%2C_Alternative/">Alternative</a>@
| |
| <em>(72)</em>
| |
| </li>
| |
| </source>
| |
| | |
| <source lang="python">
| |
# Breadth-first crawl starting from a dmoz category page: print the external
# links listed on each visited page, then queue up the sub-/related-category
# pages it links to and visit those in turn.
from __future__ import print_function
import urllib2, html5lib, sys
from urlparse import urljoin

url = sys.argv[1]

todo = [url]   # queue of category URLs still to visit
seen = {}      # category URLs already fetched (used as a set)

while len(todo) > 0:
    # Pop the next URL off the front of the queue.
    url, todo = todo[0], todo[1:]
    if url not in seen:
        f = urllib2.urlopen(url)
        print("VISITING", url)
        seen[url] = True
        src = f.read()
        tree = html5lib.parse(src, namespaceHTMLElements=False)

        # Extract the external links on this page.
        print ("LINKS")
        for div in tree.findall(".//ul"):
            if "directory-url" in div.get("class", "").split():
                for li in div.findall("li"):
                    for a in li.findall("a"):
                        if "listinglink" in a.get("class", "").split():
                            linkurl = a.get("href")
                            # The description trails the <a> element; guard
                            # against a missing tail so an anchor with no
                            # trailing text does not raise AttributeError.
                            linkdescription = (a.tail or "").strip().strip("-").strip()
                            print (linkurl)
                            print ("\t"+linkdescription.encode("utf-8"))

        # Follow the sub-/related-category pages.
        print ("RELATED")
        for div in tree.findall(".//ul"):
            if "directory" in div.get("class", "").split():
                for li in div.findall("li"):
                    for a in li.findall("a"):
                        suburl = a.get("href")
                        # Category links are relative; resolve against the
                        # page we are currently on.
                        suburl = urljoin(url, suburl)
                        # Guard against anchors with no text content.
                        description = (a.text or "").strip()
                        print (suburl)
                        print ("\t"+description.encode("utf-8"))
                        # Skip pages already visited, and don't queue the
                        # same URL twice.
                        if suburl not in seen and suburl not in todo:
                            todo.append(suburl)

        print ()
| |
| </source>
| |
| | |
| === Example 3: Scraping the links, Crawling to adjacent categories ===
| |
| | |
| <source lang="python">
| |
# Crawl dmoz category pages starting from the URL on the command line,
# accumulating every external link into `links` (URL -> description), and
# report the total number of distinct links collected when the crawl ends.
from __future__ import print_function
import urllib2, html5lib, sys
from urlparse import urljoin

url = sys.argv[1]

todo = [url]   # queue of category URLs still to visit
seen = {}      # category URLs already fetched (used as a set)
links = {}     # external link URL -> textual description

while len(todo) > 0:
    # Pop the next URL off the front of the queue.
    url, todo = todo[0], todo[1:]
    if url not in seen:
        f = urllib2.urlopen(url)
        print("VISITING", url)
        seen[url] = True
        src = f.read()
        tree = html5lib.parse(src, namespaceHTMLElements=False)

        # Extract links
        for div in tree.findall(".//ul"):
            if "directory-url" in div.get("class", "").split():
                for li in div.findall("li"):
                    for a in li.findall("a"):
                        if "listinglink" in a.get("class", "").split():
                            linkurl = a.get("href")
                            # The description trails the <a> element; guard
                            # against a missing tail so an anchor with no
                            # trailing text does not raise AttributeError.
                            linkdescription = (a.tail or "").strip().strip("-").strip()
                            if linkurl in links:
                                print ("already seen", linkurl)

                            # Record the link
                            links[linkurl] = linkdescription

        # Follow the related category pages
        for div in tree.findall(".//ul"):
            if "directory" in div.get("class", "").split():
                for li in div.findall("li"):
                    for a in li.findall("a"):
                        suburl = a.get("href")
                        # Category links are relative; resolve against the
                        # page we are currently on.
                        suburl = urljoin(url, suburl)
                        # Skip pages already visited, and don't queue the
                        # same URL twice.
                        if suburl not in seen and suburl not in todo:

                            # Add the suburl to the todo list
                            todo.append(suburl)

# len(links) rather than len(links.keys()): same value, no key-list copy.
print ("links", len(links))
| |
| </source>
| |