Scraping web pages with python: Difference between revisions
No edit summary |
No edit summary |
||
Line 1: | Line 1: | ||
== | == Scraping with standard library == | ||
<source lang="python"> | <source lang="python"> | ||
import html5lib | import html5lib | ||
import xml.etree.ElementTree as ET | |||
from urllib.request import urlopen | |||
from urllib.parse import urljoin | |||
from urllib.error import HTTPError | |||
url = 'https://news.bbc.co.uk' | |||
todo = [url] | |||
seen = set() | |||
while todo: | |||
url = todo[0] | |||
todo = todo[1:] | |||
print('Scraping', url) | |||
try: | |||
with urlopen(url) as f: | |||
t = html5lib.parse(f, namespaceHTMLElements=False) | |||
seen.add(url) | |||
# with open('nytimes.html') as f: | |||
# t = html5lib.parse(f, namespaceHTMLElements=False) | |||
# a = t.find('.//a') | |||
for a in t.findall('.//a[@href]'): | |||
href = urljoin(url, a.attrib.get('href')) | |||
#print(ET.tostring(a, encoding='unicode')) | |||
if href not in seen: | |||
todo.append(href) | |||
print(href, a.text) # link, label | |||
except HTTPError: | |||
print('Page not found!!111') | |||
# for x in t.iter(): | |||
# if x.text != None and 'trump' in x.text.lower() and x.tag != 'script': | |||
# print(x.tag, x.text) | |||
# print(t) | |||
# for x in t: | |||
# print(x) | |||
print | |||
#print(t[0]) | |||
# for x in t.iter(): | |||
# print(x) | |||
# | |||
print | |||
</source> | </source> | ||
Revision as of 16:03, 2 October 2018
Scraping with the standard library and html5lib
import xml.etree.ElementTree as ET
from collections import deque
from urllib.error import HTTPError, URLError
from urllib.parse import urljoin
from urllib.request import urlopen

import html5lib
# Breadth-first crawl: fetch a page, print the links it contains, and queue
# every link that has not been queued before.
# NOTE: nothing restricts the crawl to the start site or caps its size, so it
# runs until the queue drains or the script is interrupted.
url = 'https://news.bbc.co.uk'
todo = deque([url])  # URLs waiting to be fetched, oldest first
seen = {url}         # every URL ever queued, so none is queued (or fetched) twice

while todo:
    url = todo.popleft()  # O(1); slicing a list would copy the whole queue
    print('Scraping', url)

    # Keep the try body down to the fetch + parse, the only part that is
    # expected to fail.
    try:
        with urlopen(url) as f:
            # namespaceHTMLElements=False keeps tag names plain ('a' rather
            # than '{http://www.w3.org/1999/xhtml}a') so the path below matches.
            t = html5lib.parse(f, namespaceHTMLElements=False)
    except HTTPError:
        # The server answered with an error status (404, 500, ...).
        print('Page not found!!111')
        continue
    except URLError as err:
        # No usable answer at all: DNS failure, refused connection, ...
        print('Could not fetch', url, err)
        continue

    for a in t.findall('.//a[@href]'):
        # Resolve relative links against the page they were found on.
        href = urljoin(url, a.attrib.get('href'))
        # Skip mailto:, javascript:, tel: and similar links; urlopen cannot
        # fetch them.
        if not href.startswith(('http://', 'https://')):
            continue
        if href not in seen:
            # Mark at enqueue time: a link that appears on many pages is
            # queued only once, and a failing URL is never retried.
            seen.add(href)
            todo.append(href)
            print(href, a.text)  # link, label

# Debugging aids kept from the original notes:
#   Parse a saved page instead of fetching one:
#       with open('nytimes.html') as f:
#           t = html5lib.parse(f, namespaceHTMLElements=False)
#   Look at the first link only, or dump a link's markup:
#       a = t.find('.//a')
#       print(ET.tostring(a, encoding='unicode'))
#   Search every element's text:
#       for x in t.iter():
#           if x.text is not None and 'trump' in x.text.lower() and x.tag != 'script':
#               print(x.tag, x.text)
#   Inspect the tree:
#       print(t)
#       print(t[0])
#       for x in t:
#           print(x)