Scraping web pages with python

== Scraping with standard library ==

<source lang="python">
import html5lib
import xml.etree.ElementTree as ET
from urllib.request import urlopen
from urllib.parse import urljoin
from urllib.error import HTTPError

url = 'https://news.bbc.co.uk'
todo = [url]   # queue of URLs still to crawl
seen = set()   # URLs already crawled

while todo:
    url = todo[0]
    todo = todo[1:]
    print('Scraping', url)

    try:
        with urlopen(url) as f:
            t = html5lib.parse(f, namespaceHTMLElements=False)
            seen.add(url)

        # ... or parse a local file instead:
        # with open('nytimes.html') as f:
        #     t = html5lib.parse(f, namespaceHTMLElements=False)

        for a in t.findall('.//a[@href]'):
            # make relative links absolute
            href = urljoin(url, a.attrib.get('href'))
            # print(ET.tostring(a, encoding='unicode'))

            if href not in seen and href not in todo:  # avoid queueing duplicates
                todo.append(href)
                print(href, a.text)  # link, label
    except HTTPError:
        print('Page not found!!111')

# walk the whole tree and filter on the text of each element, e.g.:
# for x in t.iter():
#     if x.text is not None and 'trump' in x.text.lower() and x.tag != 'script':
#         print(x.tag, x.text)
</source>
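As written, the crawler follows every link it finds, so it quickly wanders off to other sites. A minimal sketch of a same-host filter (the same_host helper is an illustration added here, not part of the original code):

<source lang="python">
from urllib.parse import urlsplit

host = urlsplit('https://news.bbc.co.uk').netloc

def same_host(href):
    # keep only http(s) links that stay on the starting host
    parts = urlsplit(href)
    return parts.scheme in ('http', 'https') and parts.netloc == host
</source>

Guarding the todo.append(href) line above with same_host(href) keeps the crawl on one site.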
== Open a URL with lxml ==

<source lang="python">
import html5lib
from urllib.request import Request, urlopen

def get(url):
    htmlparser = html5lib.HTMLParser(tree=html5lib.treebuilders.getTreeBuilder("lxml"), namespaceHTMLElements=False)
    # send a browser-like User-Agent header to avoid being blocked
    request = Request(url)
    request.add_header("User-Agent", "Mozilla/5.0 (X11; U; Linux x86_64; fr; rv:1.9.1.5) Gecko/20091109 Ubuntu/9.10 (karmic) Firefox/3.5.5")
    f = urlopen(request)
    page = htmlparser.parse(f)
    return page
</source>
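A quick usage sketch: with the lxml tree builder the returned page supports xpath queries, as in the examples below.

<source lang="python">
page = get("http://www.jabberwocky.com/carroll/walrus.html")
for p in page.xpath("//p"):
    print("".join(p.itertext()))
</source>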
 
== Parsing with html5lib ==
 
The html5lib parser turns the source text of an HTML page into a structured object, which can then be queried with CSS selectors or xpath expressions to select/extract portions of the page.
 
You can use xpath expressions:
 
<source lang="python">
import html5lib, lxml

htmlsource = "<html><body><p>Example page.</p><p>More stuff.</p></body></html>"
htmlparser = html5lib.HTMLParser(tree=html5lib.treebuilders.getTreeBuilder("lxml"), namespaceHTMLElements=False)
page = htmlparser.parse(htmlsource)
p = page.xpath("/html/body/p[2]")
if p:
    p = p[0]
    print("".join(p.itertext()))
</source>
 
outputs:
More stuff.
 
CSS selectors are also possible:
 
<source lang="python">
import html5lib, lxml, lxml.cssselect

htmlsource = "<html><body><p>Example page.</p><p>More stuff.</p></body></html>"
htmlparser = html5lib.HTMLParser(tree=html5lib.treebuilders.getTreeBuilder("lxml"), namespaceHTMLElements=False)
page = htmlparser.parse(htmlsource)
selector = lxml.cssselect.CSSSelector("p")
for p in selector(page):
    print("-" * 20)
    print("".join(p.itertext()))
 
</source>
 
outputs:
--------------------
Example page.
--------------------
More stuff.
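lxml.cssselect works by translating the CSS selector into an equivalent xpath expression; the CSSSelector object exposes both through its css and path attributes:

<source lang="python">
import lxml.cssselect

selector = lxml.cssselect.CSSSelector("p")
print(selector.css)   # the original selector: p
print(selector.path)  # the translated xpath, e.g. descendant-or-self::p
</source>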
 
== Function that takes a URL + xpath ==
 
NB the function returns a LIST of matching fragments (since xpaths can potentially match multiple things). So, if you expect only one result, use [0] to pull off the first (single) item. lxml.etree.tostring is used to re-serialize the result.
 
<source lang="python">
import html5lib, lxml, lxml.etree
from urllib.request import Request, urlopen


def getXpath(url, xpath):
    htmlparser = html5lib.HTMLParser(tree=html5lib.treebuilders.getTreeBuilder("lxml"), namespaceHTMLElements=False)
    request = Request(url)
    request.add_header("User-Agent", "Mozilla/5.0 (X11; U; Linux x86_64; fr; rv:1.9.1.5) Gecko/20091109 Ubuntu/9.10 (karmic) Firefox/3.5.5")
    f = urlopen(request)
    page = htmlparser.parse(f)
    return page.xpath(xpath)

if __name__ == "__main__":
    url = "http://www.jabberwocky.com/carroll/walrus.html"
    xpath = "/html/body/p[6]"
    print(lxml.etree.tostring(getXpath(url, xpath)[0], encoding="unicode"))
 
</source>
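If you want just the text of a match rather than re-serialized HTML, itertext() (used in the examples above) also works on the returned fragments:

<source lang="python">
fragment = getXpath("http://www.jabberwocky.com/carroll/walrus.html", "/html/body/p[6]")[0]
print("".join(fragment.itertext()))
</source>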
 
== Function that takes a URL + CSS selector ==
 
<source lang="python">
import html5lib, lxml, lxml.cssselect, lxml.etree
from urllib.request import Request, urlopen


def getCSS(url, selector):
    htmlparser = html5lib.HTMLParser(tree=html5lib.treebuilders.getTreeBuilder("lxml"), namespaceHTMLElements=False)
    request = Request(url)
    request.add_header("User-Agent", "Mozilla/5.0 (X11; U; Linux x86_64; fr; rv:1.9.1.5) Gecko/20091109 Ubuntu/9.10 (karmic) Firefox/3.5.5")
    f = urlopen(request)
    page = htmlparser.parse(f)
    selector = lxml.cssselect.CSSSelector(selector)
    return list(selector(page))

# TEST
if __name__ == "__main__":
    url = "http://www.jabberwocky.com/carroll/walrus.html"
    print(lxml.etree.tostring(getCSS(url, "p")[0], encoding="unicode"))
 
</source>
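The same helper can redo the link extraction from the crawler at the top of this page, since cssselect understands attribute selectors like a[href] (a sketch, using the same page as the test above):

<source lang="python">
for a in getCSS("http://www.jabberwocky.com/carroll/walrus.html", "a[href]"):
    print(a.attrib.get("href"), a.text)
</source>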
[[Category: Cookbook]] [[Category: Scraping]] [[Category: xpath]] [[Category: python]] [[Category: lxml]]
