Scraping web pages with python: Difference between revisions
No edit summary
Line 39:
--------------------
More stuff.
<source lang="python">
import urllib2, html5lib, lxml, lxml.etree | |||
def getXpath (url, xpath): | |||
htmlparser = html5lib.HTMLParser(tree=html5lib.treebuilders.getTreeBuilder("lxml"), namespaceHTMLElements=False) | |||
request = urllib2.Request(url) | |||
request.add_header("User-Agent", "Mozilla/5.0 (X11; U; Linux x86_64; fr; rv:1.9.1.5) Gecko/20091109 Ubuntu/9.10 (karmic) Firefox/3.5.5") | |||
f=urllib2.urlopen(request) | |||
page = htmlparser.parse(f) | |||
return page.xpath(xpath) | |||
if __name__ == "__main__": | |||
url = "http://www.jabberwocky.com/carroll/walrus.html" | |||
xpath = "/html/body/p[6]" | |||
print lxml.etree.tostring(getXpath(url, xpath)[0]) | |||
</source>
[[Category: Cookbook]] [[Category: xpath]] [[Category: python]] [[Category: lxml]]
Revision as of 16:30, 1 April 2011
The html5lib parser is code that turns the source text of an HTML page into a structured object, allowing you, for instance, to use CSS selectors or XPath expressions to select and extract portions of a page.
You can use xpath expressions:
import html5lib, lxml
htmlsource="<html><body><p>Example page.</p><p>More stuff.</p></body></html>"
htmlparser = html5lib.HTMLParser(tree=html5lib.treebuilders.getTreeBuilder("lxml"), namespaceHTMLElements=False)
page = htmlparser.parse(htmlsource)
p = page.xpath("/html/body/p[2]")
if p:
p = p[0]
print "".join([t for t in p.itertext()])
outputs: More stuff.
Also CSS selectors are possible:
import html5lib, lxml, lxml.cssselect
htmlsource="<html><body><p>Example page.</p><p>More stuff.</p></body></html>"
htmlparser = html5lib.HTMLParser(tree=html5lib.treebuilders.getTreeBuilder("lxml"), namespaceHTMLElements=False)
page = htmlparser.parse(htmlsource)
selector = lxml.cssselect.CSSSelector("p")
for p in selector(page):
print "-"*20
print "".join([t for t in p.itertext()])
-------------------- Example page. -------------------- More stuff.
import urllib2, html5lib, lxml, lxml.etree
def getXpath (url, xpath):
htmlparser = html5lib.HTMLParser(tree=html5lib.treebuilders.getTreeBuilder("lxml"), namespaceHTMLElements=False)
request = urllib2.Request(url)
request.add_header("User-Agent", "Mozilla/5.0 (X11; U; Linux x86_64; fr; rv:1.9.1.5) Gecko/20091109 Ubuntu/9.10 (karmic) Firefox/3.5.5")
f=urllib2.urlopen(request)
page = htmlparser.parse(f)
return page.xpath(xpath)
if __name__ == "__main__":
url = "http://www.jabberwocky.com/carroll/walrus.html"
xpath = "/html/body/p[6]"
print lxml.etree.tostring(getXpath(url, xpath)[0])