Scraping web pages with python: Difference between revisions
No edit summary |
|||
Line 1: | Line 1: | ||
== Scraping with standard library == | == Scraping with standard library == | ||
Using the html5lib, you can easily work with web pages. | |||
<source lang="python"> | |||
import html5lib | |||
import xml.etree.ElementTree as ET | |||
from urllib.request import urlopen | |||
from urllib.parse import urljoin | |||
url = "https://nytimes.com/" | |||
with urlopen(url) as f: | |||
t = html5lib.parse(f, namespaceHTMLElements=False) | |||
print ("Link", "Label") | |||
for a in t.findall('.//a[@href]'): | |||
#print(ET.tostring(a, encoding='unicode')) | |||
href = urljoin(url, a.attrib.get('href')) | |||
print(href, a.text) # link, label | |||
</source> | |||
<source lang="python"> | <source lang="python"> |
Revision as of 16:43, 2 October 2018
Scraping with standard library
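Using html5lib (a third-party package, installed with "pip install html5lib") together with the standard library's urllib and xml.etree.ElementTree modules, you can easily download a web page and work with its contents.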
import html5lib
import xml.etree.ElementTree as ET
from urllib.request import urlopen
from urllib.parse import urljoin
def extract_links(tree, base_url):
    """Return ``(href, label)`` pairs for every ``<a href=...>`` in *tree*.

    Args:
        tree: An ElementTree element, e.g. the result of
            ``html5lib.parse(f, namespaceHTMLElements=False)``.
        base_url: URL of the page the tree was parsed from; relative
            links are resolved against it with ``urljoin``.

    Returns:
        A list of ``(absolute_url, label)`` tuples in document order.
        The label is the anchor's visible text with runs of whitespace
        collapsed to single spaces ('' when the anchor has no text).
    """
    links = []
    for a in tree.findall('.//a[@href]'):
        # print(ET.tostring(a, encoding='unicode'))  # dump the raw element
        href = urljoin(base_url, a.attrib.get('href'))
        # a.text is None when the label sits inside a child element
        # (<a><span>Label</span></a>), so collect text from all descendants.
        label = ' '.join(''.join(a.itertext()).split())
        links.append((href, label))
    return links


def main():
    """Download the front page and print every link with its label."""
    url = "https://nytimes.com/"
    with urlopen(url) as f:
        t = html5lib.parse(f, namespaceHTMLElements=False)
    print("Link", "Label")
    for href, label in extract_links(t, url):
        print(href, label)  # link, label


if __name__ == "__main__":
    main()
import xml.etree.ElementTree as ET
from collections import deque
from urllib.error import HTTPError
from urllib.parse import urldefrag
from urllib.parse import urljoin
from urllib.request import urlopen

import html5lib
def page_links(tree, base_url):
    """Return ``(href, label)`` pairs for every ``<a href=...>`` in *tree*.

    Relative links are resolved against *base_url* and any ``#fragment``
    is dropped, so ``/page`` and ``/page#top`` count as the same address.
    The label is the anchor's text (including text inside child elements)
    with whitespace collapsed.
    """
    links = []
    for a in tree.findall('.//a[@href]'):
        # print(ET.tostring(a, encoding='unicode'))  # dump the raw element
        href = urldefrag(urljoin(base_url, a.attrib.get('href')))[0]
        # a.text alone is None for <a><span>Label</span></a>.
        label = ' '.join(''.join(a.itertext()).split())
        links.append((href, label))
    return links


def fetch_tree(url):
    """Download *url* and parse it into an ElementTree element.

    To experiment offline, parse a saved page instead:
        with open('nytimes.html') as f:
            t = html5lib.parse(f, namespaceHTMLElements=False)
    """
    with urlopen(url) as f:
        return html5lib.parse(f, namespaceHTMLElements=False)


def crawl(start_url, fetch=fetch_tree, max_pages=None):
    """Breadth-first crawl starting at *start_url*, printing each link found.

    Args:
        start_url: Address of the first page to scrape.
        fetch: Callable mapping a URL to a parsed ElementTree element.
            Defaults to ``fetch_tree``; replace it to crawl canned pages.
        max_pages: Stop after this many pages were scraped successfully.
            ``None`` (the default) keeps going until the queue is empty.

    Returns:
        The list of URLs that were scraped successfully, in visit order.
    """
    todo = deque([start_url])  # popleft() is O(1); list slicing copies
    # Every URL that was ever queued. Marking at queue time (rather than
    # after a successful fetch) keeps a page from being queued or scraped
    # twice and keeps failing pages from being retried forever.
    seen = {start_url}
    scraped = []
    while todo:
        if max_pages is not None and len(scraped) >= max_pages:
            break
        url = todo.popleft()
        print('Scraping', url)
        try:
            tree = fetch(url)
        except HTTPError as err:
            # Not necessarily a 404, so report the actual status code.
            print('HTTP error', err.code, 'for', url)
            continue
        except OSError as err:
            # URLError (DNS failure, refused connection) and socket
            # timeouts are OSError subclasses; skip the page, keep crawling.
            print('Could not fetch', url, '-', err)
            continue
        scraped.append(url)
        for href, label in page_links(tree, url):
            print(href, label)  # link, label
            # mailto:, javascript:, tel: ... cannot be fetched with urlopen.
            if not href.lower().startswith(('http://', 'https://')):
                continue
            if href not in seen:
                seen.add(href)
                todo.append(href)
    return scraped


def main():
    """Crawl outward from the BBC News front page."""
    crawl('https://news.bbc.co.uk')


if __name__ == '__main__':
    main()

# Things to try on a parsed tree t (for example t = fetch_tree(url)):
#
#   a = t.find('.//a')          # first link only
#   print(t[0])                 # first child of the root element
#   for x in t.iter():          # every element in the document
#       if x.text is not None and 'trump' in x.text.lower() and x.tag != 'script':
#           print(x.tag, x.text)