Scraping web pages with python: Difference between revisions

From XPUB & Lens-Based wiki
Line 31: Line 31:


=== Generic page scraping ===
=== Generic page scraping ===
<source lang="python">
import html5lib
import xml.etree.ElementTree as ET
from urllib.request import urlopen
from urllib.parse import urljoin

# Fetch the page, parse it with html5lib (lenient, real-world HTML),
# then walk every element and print those whose text mentions the term.
url = "https://nytimes.com/"
with urlopen(url) as f:
    t = html5lib.parse(f, namespaceHTMLElements=False)
for x in t.iter():
    # `is not None` is the idiomatic None check (PEP 8); `!= None` can
    # misbehave with objects that override __eq__. Skip <script> bodies.
    if x.text is not None and "trump" in x.text.lower() and x.tag != "script":
        print(x.tag, x.text)
</source>


=== A spider ===
=== A spider ===

Revision as of 16:56, 2 October 2018

Using html5lib + elementtree

Back in the day, working with HTML pages with python's standard library was often frustrating as most web pages "in the wild" didn't conform to the rigid restrictions of XML. As a result, projects like Beautiful Soup were created that made working with HTML quite easy. Happily, the lessons learned from BeautifulSoup have been incorporated into modern libraries like html5lib, which gives you standard access to any XML/HTML using python's built-in ElementTree module.

Find all the links (a) on the front page of nytimes.com and print their href and label

import html5lib
import xml.etree.ElementTree as ET
from urllib.request import urlopen
from urllib.parse import urljoin


# Download the front page and list every anchor with an href attribute,
# printing the absolute link target followed by the anchor's text label.
url = "https://nytimes.com/"
with urlopen(url) as f:
    tree = html5lib.parse(f, namespaceHTMLElements=False)

print("Link", "Label")
for anchor in tree.findall('.//a[@href]'):
    # urljoin resolves relative hrefs against the page URL.
    target = urljoin(url, anchor.attrib.get('href'))
    print(target, anchor.text)  # link, label

Scraping from a local file

# Parse a locally saved page instead of fetching over HTTP:
# html5lib.parse accepts any file-like object, so an open file works directly.
# NOTE(review): no encoding is passed to open(); assumes the platform default
# matches the file's encoding — confirm, or open in binary mode and let
# html5lib sniff the charset itself.
with open("myfile.html") as f:
    t = html5lib.parse(f, namespaceHTMLElements=False)

Generic page scraping

import html5lib
import xml.etree.ElementTree as ET
from urllib.request import urlopen
from urllib.parse import urljoin


# Generic page scrape: fetch the page, then walk every element in the
# parsed tree and report those whose text mentions the search term
# (case-insensitively), skipping the contents of <script> tags.
url = "https://nytimes.com/"
with urlopen(url) as f:
    t = html5lib.parse(f, namespaceHTMLElements=False)

for x in t.iter():
    # `is not None` is the idiomatic None check (PEP 8); `!= None` can
    # misbehave with objects that override __eq__.
    if x.text is not None and "trump" in x.text.lower() and x.tag != "script":
        print(x.tag, x.text)

A spider

import html5lib
import xml.etree.ElementTree as ET
from collections import deque
from urllib.request import urlopen
from urllib.parse import urljoin
from urllib.error import HTTPError

# Breadth-first spider: fetch each page once, then queue every new link
# found on it. A deque gives O(1) pops from the left; the original
# `todo = todo[1:]` re-copied the whole list on every iteration.
start = 'https://news.bbc.co.uk'
todo = deque([start])
# Mark URLs as seen when they are QUEUED, not when fetched — otherwise the
# same URL can be appended many times before its first fetch completes.
seen = {start}

while todo:
    url = todo.popleft()
    print('Scraping', url)

    try:
        with urlopen(url) as f:
            t = html5lib.parse(f, namespaceHTMLElements=False)
    except HTTPError:
        print('Page not found!!111')
        continue

    for a in t.findall('.//a[@href]'):
        # Resolve relative hrefs against the page they appeared on.
        href = urljoin(url, a.attrib.get('href'))
        if href not in seen:
            seen.add(href)
            todo.append(href)
            print(href, a.text)  # link, label