Scraping web pages with Python
Scraping with the standard library
Using html5lib together with the standard library's urllib and ElementTree modules, you can easily extract data from web pages. The following example prints every link on a page along with its label:
import html5lib
import xml.etree.ElementTree as ET
from urllib.request import urlopen
from urllib.parse import urljoin

url = "https://nytimes.com/"
with urlopen(url) as f:
    # html5lib parses the page into an ElementTree; without namespaces,
    # tags keep plain names like "a" rather than "{http://...}a"
    t = html5lib.parse(f, namespaceHTMLElements=False)

print("Link", "Label")
for a in t.findall('.//a[@href]'):
    # print(ET.tostring(a, encoding='unicode'))  # inspect the raw element
    # resolve relative links against the page's URL
    href = urljoin(url, a.attrib.get('href'))
    print(href, a.text)  # link, label
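A small variation (not part of the original example) is to write the links to a file instead of printing them, using the standard library's csv module; the filename links.csv is just a placeholder for this sketch.

import csv
import html5lib
from urllib.request import urlopen
from urllib.parse import urljoin

url = "https://nytimes.com/"
with urlopen(url) as f:
    t = html5lib.parse(f, namespaceHTMLElements=False)

# "links.csv" is an arbitrary output name chosen for this sketch
with open('links.csv', 'w', newline='') as out:
    w = csv.writer(out)
    w.writerow(['link', 'label'])
    for a in t.findall('.//a[@href]'):
        href = urljoin(url, a.attrib.get('href'))
        # a.text can be None, e.g. when the link wraps an image
        w.writerow([href, a.text or ''])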
The same pattern extends to a simple crawler: keep a queue of pages still to visit (todo) and a set of pages already scraped (seen).

import html5lib
from urllib.request import urlopen
from urllib.parse import urljoin
from urllib.error import HTTPError

url = 'https://news.bbc.co.uk'
todo = [url]   # queue of URLs still to visit
seen = set()   # URLs already scraped

while todo:
    url = todo.pop(0)  # take the next URL from the front of the queue
    if url in seen:
        continue       # a URL can be queued twice before it is scraped
    print('Scraping', url)
    try:
        with urlopen(url) as f:
            t = html5lib.parse(f, namespaceHTMLElements=False)
            # a saved page works too:
            # with open('nytimes.html') as f:
            #     t = html5lib.parse(f, namespaceHTMLElements=False)
            seen.add(url)
            for a in t.findall('.//a[@href]'):
                href = urljoin(url, a.attrib.get('href'))
                if href not in seen:
                    todo.append(href)
                print(href, a.text)  # link, label
    except HTTPError:
        print('Page not found!')
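As written, the crawler will happily wander off to any site the page links to. One possible refinement (an assumption of this sketch, not something the code above does) is to compare hostnames with urllib.parse.urlsplit and only queue links that stay on the starting domain:

from urllib.parse import urlsplit

start = 'https://news.bbc.co.uk'
host = urlsplit(start).netloc  # 'news.bbc.co.uk'

def same_domain(href):
    # True when the (absolute) link points at the same host as the start page
    return urlsplit(href).netloc == host

# inside the crawl loop, the queueing test then becomes:
#     if href not in seen and same_domain(href):
#         todo.append(href)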
Once a page is parsed, t is an ordinary ElementTree element: for an HTML page t[0] is the <head>, iterating over t gives its direct children, and t.iter() walks every element in the document. That makes it easy to search by text rather than by tag, for instance to print every element that mentions a particular word:

for x in t.iter():
    # skip empty text nodes and the contents of <script> elements
    if x.text is not None and 'trump' in x.text.lower() and x.tag != 'script':
        print(x.tag, x.text)
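In the same spirit, ElementTree's itertext() can pull out all of the visible text of a page at once. Stripping <script> and <style> elements first is an assumption of this sketch (otherwise their code ends up in the text), not something the examples above do:

import html5lib
from urllib.request import urlopen

url = 'https://news.bbc.co.uk'
with urlopen(url) as f:
    t = html5lib.parse(f, namespaceHTMLElements=False)

body = t.find('.//body')
# detach <script> and <style> elements so their contents are excluded
for parent in list(body.iter()):
    for child in list(parent):
        if child.tag in ('script', 'style'):
            parent.remove(child)

text = ' '.join(body.itertext())
print(' '.join(text.split()))  # collapse runs of whitespace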