Scraping web pages with python: Difference between revisions

Revision as of 16:03, 2 October 2018

Scraping with the standard library (plus html5lib for parsing)

import xml.etree.ElementTree as ET
from collections import deque
from urllib.error import HTTPError
from urllib.error import URLError
from urllib.parse import urldefrag
from urllib.parse import urljoin
from urllib.request import urlopen

import html5lib
 
# Breadth-first crawl starting from a single seed page: fetch a page, parse
# it with html5lib, print every newly discovered link together with its
# label, and queue that link for a later visit.
# NOTE: nothing restricts the crawl to the seed's site, so it keeps going
# until the queue drains or the process is interrupted.
url = 'https://news.bbc.co.uk'
todo = deque([url])  # FIFO frontier; popleft() is O(1), unlike list slicing
seen = {url}         # every URL ever queued, so no page is fetched twice

while todo:
    url = todo.popleft()
    print('Scraping', url)

    # Keep the try body down to the fetch + parse that can actually fail.
    try:
        with urlopen(url) as f:
            # urlopen follows redirects; relative links must be resolved
            # against the address the document was finally served from.
            base = f.geturl()
            t = html5lib.parse(f, namespaceHTMLElements=False)
    except HTTPError as err:
        # The server answered with an error status; only 404 is "not found".
        if err.code == 404:
            print('Page not found!!111')
        else:
            print('HTTP error', err.code, url)
        continue
    except URLError as err:
        # No usable response at all (DNS failure, refused connection, ...).
        # Skip this page instead of aborting the whole crawl.
        print('Could not fetch', url, err.reason)
        continue

    # Remember the post-redirect address too, so it is not fetched again
    # when another page links to it directly.
    seen.add(base)

    # To experiment offline, parse a saved copy instead:
    #     with open('nytimes.html') as f:
    #         t = html5lib.parse(f, namespaceHTMLElements=False)

    for a in t.findall('.//a[@href]'):
        # Make the link absolute and drop any "#fragment": a fragment only
        # addresses a position inside the same document.
        href = urldefrag(urljoin(base, a.get('href'))).url
        # Debugging aid: print(ET.tostring(a, encoding='unicode'))

        # urlopen cannot fetch mailto:, javascript:, tel: ... links, and
        # file: links must never be followed from remote content.
        if not href.lower().startswith(('http://', 'https://')):
            continue

        # Mark the link as seen when it is queued (not when it is fetched),
        # otherwise every further occurrence would be queued again.
        if href not in seen:
            seen.add(href)
            todo.append(href)
            print(href, a.text)  # link, label
 
# for x in t.iter():
#     if x.text != None and 'trump' in x.text.lower() and x.tag != 'script':
#         print(x.tag, x.text)
   
 
 
 
 
# print(t)
 
# for x in t:
#     print(x)
 
#print(t[0])
 
# for x in t.iter():
#     print(x)