Scraping web pages with Python

From XPUB & Lens-Based wiki
Revision as of 16:03, 2 October 2018 by Michael Murtaugh (talk | contribs)

Scraping with the standard library (plus html5lib for parsing)

import html5lib
import xml.etree.ElementTree as ET
from urllib.request import urlopen
from urllib.parse import urljoin
from urllib.error import HTTPError
# Scrape loop: take a page off the queue, parse it, and print every link on
# it that has not been printed before.
url = ''  # start page; set this to an absolute URL before running
# An empty placeholder is not queued: urlopen('') raises ValueError
# ("unknown url type"), which the HTTPError handler below would not catch.
todo = [url] if url else []
seen = set()  # absolute link targets that were already printed
while todo:
    url = todo[0]
    todo = todo[1:]
    print('Scraping', url)
    try:
        with urlopen(url) as f:
            # namespaceHTMLElements=False keeps tag names plain ('a' rather
            # than '{http://www.w3.org/1999/xhtml}a') so the path expression
            # below matches.
            t = html5lib.parse(f, namespaceHTMLElements=False)
    except HTTPError:
        print('Page not found!!111')
        continue
    # To work offline, parse a saved copy of the page instead:
    # with open('nytimes.html') as f:
    #     t = html5lib.parse(f, namespaceHTMLElements=False)
    for a in t.findall('.//a[@href]'):
        # Resolve relative hrefs against the page they were found on.
        href = urljoin(url, a.attrib.get('href'))
        #print(ET.tostring(a, encoding='unicode'))
        if href not in seen:
            # Record the link so repeated occurrences are printed only once.
            seen.add(href)
            print(href, a.text)  # link, label
# for x in t.iter():
#     if x.text != None and 'trump' in x.text.lower() and x.tag != 'script':
#         print(x.tag, x.text)
# print(t)
# for x in t:
#     print(x)
# for x in t.iter():
#     print(x)