Web scraping with Python

* [[python]]
* [[html5lib]]
* [https://docs.python.org/2/library/xml.etree.elementtree.html ElementTree], part of the standard Python library
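
All of the examples below follow the same pattern: download a page, parse it with html5lib into an ElementTree tree, then query that tree with <code>findall</code>. A minimal sketch of the pattern (the URL here is just a placeholder):

<source lang="python">
from __future__ import print_function
import urllib2, html5lib

# Download a page and parse it into an ElementTree tree
f = urllib2.urlopen("http://www.example.com/")
src = f.read()
tree = html5lib.parse(src, namespaceHTMLElements=False)

# The parsed tree supports the usual ElementTree queries
for a in tree.findall(".//a"):
    print(a.get("href"), a.text)
</source>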


== Examples ==

* [[Scraping the Open Directory with Python]]
* [[Wikiwalker: Crawling wikipedia pages for images]]

== Scraping dmoz.org ==

From the dmoz website:

<blockquote>
DMOZ is the largest, most comprehensive human-edited directory of the Web. It is constructed and maintained by a passionate, global community of volunteer editors. It was historically known as the Open Directory Project (ODP).
</blockquote>
 
=== Example 1: Pulling the URLs + textual descriptions from a single page ===
 
Consider a single page on dmoz, such as:
 
http://www.dmoz.org/Science/Astronomy/
 
If you look at the page source, you can see the structure around the URLs listed at the bottom of the page:
 
<source lang="html4strict">
<ul style="margin-left:0;" class="directory-url">
<li>
<a class="listinglink" href="http://www.absoluteastronomy.com/">Absolute Astronomy</a>
- Facts and statistical information about planets, moons, constellations, stars, galaxies, and Messier objects.
<div class="flag"><a href="/public/flag?cat=Science%2FAstronomy&amp;url=http%3A%2F%2Fwww.absoluteastronomy.com%2F"><img title="report an issue with this listing" alt="[!]" src="/img/flag.png"></a></div>
</li>
</source>
 
The following script pulls out each link and its description; the page URL is given as a command-line argument:

<source lang="python">
from __future__ import print_function
import sys
import urllib2
import html5lib

url = sys.argv[1]

# Download and parse the page into an ElementTree tree
f = urllib2.urlopen(url)
src = f.read()
tree = html5lib.parse(src, namespaceHTMLElements=False)

# Find the <ul class="directory-url"> list and walk its links
for ul in tree.findall(".//ul"):
    if "directory-url" in ul.get("class", "").split():
        for li in ul.findall("li"):
            for a in li.findall("a"):
                if "listinglink" in a.get("class", "").split():
                    linkurl = a.get("href")
                    # The description is the text that follows the <a> tag
                    linkdescription = a.tail.strip().strip("-").strip()
                    print(linkurl)
                    print("\t" + linkdescription.encode("utf-8"))
                    print()
</source>
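
To try it, save the script (the filename is arbitrary, e.g. <code>dmoz_links.py</code>) and pass it a category URL: <code>python dmoz_links.py http://www.dmoz.org/Science/Astronomy/</code>. It prints each external link followed by its description on an indented line.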
 
=== Example 2: Digging into sub- and related categories ===

The pages also contain links to sub- and related categories. If we want to follow these as well, we can keep a "todo" list of category pages still to visit, and a record of the pages already seen so that nothing is visited twice. In the page source the category links look like this:
 
<source lang="html4strict">
<div class="dir-1 borN">
    <span><img style="height:2px;float:left;width:100%" src="http://o.aolcdn.com/os/dmoz/img/dividerN.gif"></span>
    <ul class="directory dir-col">
        <li class="">
            <a href="/Science/Anomalies_and_Alternative_Science/Astronomy%2C_Alternative/">Alternative</a>@
            <em>(72)</em>
        </li>
</source>
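
Note that these category links are site-relative (<code>/Science/...</code>), so the crawler below uses <code>urljoin</code> to turn them back into absolute URLs:

<source lang="python">
from urlparse import urljoin

# A relative href resolved against the URL of the page it appeared on
print(urljoin("http://www.dmoz.org/Science/Astronomy/",
              "/Science/Anomalies_and_Alternative_Science/Astronomy%2C_Alternative/"))
# http://www.dmoz.org/Science/Anomalies_and_Alternative_Science/Astronomy%2C_Alternative/
</source>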
 
<source lang="python">
from __future__ import print_function
import urllib2, html5lib, sys
from urlparse import urljoin

url = sys.argv[1]

todo = [url]   # queue of pages still to visit
seen = {}      # pages already visited

while len(todo) > 0:
    # Take the next URL from the front of the queue
    url, todo = todo[0], todo[1:]
    if url not in seen:
        f = urllib2.urlopen(url)
        print("VISITING", url)
        seen[url] = True
        src = f.read()
        tree = html5lib.parse(src, namespaceHTMLElements=False)

        # Extract links
        print("LINKS")
        for ul in tree.findall(".//ul"):
            if "directory-url" in ul.get("class", "").split():
                for li in ul.findall("li"):
                    for a in li.findall("a"):
                        if "listinglink" in a.get("class", "").split():
                            linkurl = a.get("href")
                            linkdescription = a.tail.strip().strip("-").strip()
                            print(linkurl)
                            print("\t" + linkdescription.encode("utf-8"))

        # Follow the related category pages
        print("RELATED")
        for ul in tree.findall(".//ul"):
            if "directory" in ul.get("class", "").split():
                for li in ul.findall("li"):
                    for a in li.findall("a"):
                        suburl = a.get("href")
                        suburl = urljoin(url, suburl)  # make relative links absolute
                        description = (a.text or "").strip()  # guard against anchors without text
                        print(suburl)
                        print("\t" + description.encode("utf-8"))
                        if suburl not in seen:
                            todo.append(suburl)

        print()
</source>
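
Because new category URLs are appended to the end of <code>todo</code> while the next URL to visit is taken from the front, the list acts as a first-in-first-out queue and the crawl proceeds breadth-first. On a directory the size of dmoz this loop will run for a very long time, since every reachable category ends up in the queue.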
 
=== Example 3: Scraping the links, crawling to adjacent categories ===

This variant combines the two: while crawling it records every link and its description in a <code>links</code> dictionary, so a link that appears in several categories is only stored once:
 
<source lang="python">
from __future__ import print_function
import urllib2, html5lib, sys
from urlparse import urljoin


url = sys.argv[1]

todo = [url]   # queue of pages still to visit
seen = {}      # pages already visited
links = {}     # url -> description for every link found

while len(todo) > 0:
    # Take the next URL from the front of the queue
    url, todo = todo[0], todo[1:]
    if url not in seen:
        f = urllib2.urlopen(url)
        print("VISITING", url)
        seen[url] = True
        src = f.read()
        tree = html5lib.parse(src, namespaceHTMLElements=False)

        # Extract links
        for ul in tree.findall(".//ul"):
            if "directory-url" in ul.get("class", "").split():
                for li in ul.findall("li"):
                    for a in li.findall("a"):
                        if "listinglink" in a.get("class", "").split():
                            linkurl = a.get("href")
                            linkdescription = a.tail.strip().strip("-").strip()
                            if linkurl in links:
                                print("already seen", linkurl)

                            # Record the link
                            links[linkurl] = linkdescription

        # Follow the related category pages
        for ul in tree.findall(".//ul"):
            if "directory" in ul.get("class", "").split():
                for li in ul.findall("li"):
                    for a in li.findall("a"):
                        suburl = a.get("href")
                        suburl = urljoin(url, suburl)  # make relative links absolute
                        if suburl not in seen:
                            # Add the suburl to the todo list
                            todo.append(suburl)

        print("links", len(links))
</source>
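
The script only reports how many links it has collected so far; a natural follow-up (not part of the original script, and the filename here is made up) is to write the <code>links</code> dictionary to disk when the crawl finishes, for instance as JSON with the standard library:

<source lang="python">
import json

# `links` as built by the crawler above; a small stand-in here
links = {
    "http://www.absoluteastronomy.com/":
        "Facts and statistical information about planets, moons, constellations, stars, galaxies, and Messier objects.",
}

# Dump the collected links to a file
with open("links.json", "w") as f:
    json.dump(links, f, indent=2)
</source>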
