Scraping web pages with python: Difference between revisions

Revision as of 17:43, 2 October 2018

Scraping with standard library

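Using the html5lib library (a third-party package, installed with `pip install html5lib`) together with the standard library's urllib and xml.etree.ElementTree modules, you can easily fetch and parse web pages.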

import html5lib
import xml.etree.ElementTree as ET
from urllib.request import urlopen
from urllib.parse import urljoin


# Fetch one page, parse it into an ElementTree and list every link on it.
# namespaceHTMLElements=False is passed so that the un-namespaced XPath
# expression './/a[@href]' used below can match the anchor elements.
url = "https://nytimes.com/"
with urlopen(url) as f:
    t = html5lib.parse(f, namespaceHTMLElements=False)

print("Link", "Label")
for a in t.findall('.//a[@href]'):
    # Debugging tip: print(ET.tostring(a, encoding='unicode')) shows the
    # raw element.

    # The XPath filter guarantees the attribute exists; resolve relative
    # links against the URL of the page they were found on.
    href = urljoin(url, a.attrib['href'])

    # a.text only holds the text *before* the first child element, so a link
    # such as <a><span>Story</span></a> would be labelled None.  Gather the
    # text of the whole subtree instead and collapse runs of whitespace.
    label = " ".join("".join(a.itertext()).split())
    print(href, label)  # link, label

import xml.etree.ElementTree as ET
from collections import deque
from urllib.error import HTTPError
from urllib.error import URLError
from urllib.parse import urljoin
from urllib.request import urlopen

import html5lib
 
# Breadth-first crawl: start from one page, print every newly discovered
# link with its label, and queue that link so it gets scraped in turn.
# NOTE: the crawl is not restricted to the starting site, so it keeps going
# until the queue runs dry or the script is interrupted.
url = 'https://news.bbc.co.uk'
todo = deque([url])  # URLs waiting to be scraped, oldest first
seen = {url}         # every URL ever queued, so nothing is queued twice

while todo:
    url = todo.popleft()  # O(1), unlike re-slicing a list on every pass
    print('Scraping', url)

    # Only the fetch can raise the errors handled here, so keep the try
    # body down to it.
    # To experiment offline, parse a saved copy of a page instead:
    #     with open('nytimes.html') as f:
    #         t = html5lib.parse(f, namespaceHTMLElements=False)
    try:
        with urlopen(url) as f:
            t = html5lib.parse(f, namespaceHTMLElements=False)
    except HTTPError:
        # Raised for any HTTP error status, not only 404.
        print('Page not found!!111')
        continue
    except URLError as e:
        # Unsupported schemes (mailto:, javascript:, ...), DNS failures and
        # refused connections end up here; skip the URL rather than letting
        # one bad link abort the whole crawl.
        print('Could not fetch', url, e.reason)
        continue

    for a in t.findall('.//a[@href]'):
        # Debugging tip: print(ET.tostring(a, encoding='unicode')) shows the
        # raw element.
        href = urljoin(url, a.attrib['href'])

        if href not in seen:
            # Mark the link as soon as it is queued; marking it only after a
            # successful fetch lets the same URL be queued (and scraped)
            # once for every page that links to it.
            seen.add(href)
            todo.append(href)
            # a.text stops at the first child element, so collect the text
            # of the whole subtree and collapse runs of whitespace.
            label = " ".join("".join(a.itertext()).split())
            print(href, label)  # link, label
 
# for x in t.iter():
#     if x.text != None and 'trump' in x.text.lower() and x.tag != 'script':
#         print(x.tag, x.text)
   
 
 
 
 
# print(t)
 
# for x in t:
#     print(x)
 
#print(t[0])
 
# for x in t.iter():
#     print(x)