Scraping web pages with Python
Revision as of 16:03, 2 October 2018 by Michael Murtaugh (talk | contribs)
Scraping with the standard library (plus the third-party html5lib parser)
from collections import deque
from urllib.error import HTTPError
from urllib.parse import urljoin
from urllib.request import urlopen
import xml.etree.ElementTree as ET

import html5lib
# Breadth-first crawl starting from a seed URL.
# For every page fetched, print each outgoing link (absolute URL + link text)
# and queue links that have not been seen before.
url = 'https://news.bbc.co.uk'

# Queue of URLs still to fetch; deque gives O(1) popleft, unlike the
# original `todo = todo[1:]` list-copy which was O(n) per iteration.
todo = deque([url])

# URLs already queued or fetched. Membership is recorded at *enqueue*
# time (not after fetching) so the same URL can never be queued twice —
# the original only added to `seen` after a successful fetch, so popular
# links were appended to the queue once per page that referenced them.
seen = {url}

while todo:
    url = todo.popleft()
    print('Scraping', url)
    try:
        with urlopen(url) as f:
            # namespaceHTMLElements=False keeps plain tag names
            # ('a', not '{http://www.w3.org/1999/xhtml}a') so
            # ElementTree-style .findall() paths work directly.
            t = html5lib.parse(f, namespaceHTMLElements=False)
    except HTTPError:
        print('Page not found!!111')
        # Skip link extraction: there is no parsed tree for this URL
        # (the original fell through and reused the previous page's tree).
        continue

    # Walk every anchor that carries an href, resolving relative links
    # against the page we just fetched.
    for a in t.findall('.//a[@href]'):
        href = urljoin(url, a.attrib.get('href'))
        if href not in seen:
            seen.add(href)
            todo.append(href)
            print(href, a.text)  # link, label