Scrape
Revision as of 13:04, 19 February 2020 by Michael Murtaugh (talk | contribs) (Created page with "The basis of an HTML link scraper in python... <source lang="python"> import argparse import html5lib from urllib.request import urlopen from urllib.parse import urljoin ap...")
The basis of an HTML link scraper in Python:
import argparse
import html5lib
from urllib.request import urlopen
from urllib.parse import urljoin
# Minimal HTML link scraper: fetch the page given by --url, parse it with
# html5lib, and print every <a href="..."> target as an absolute URL.

# NOTE(review): the first positional parameter of ArgumentParser is `prog`,
# so "" is used as the program name in usage/help output — confirm whether
# a description was intended instead.
ap = argparse.ArgumentParser("")
ap.add_argument("--url", default="http://media.constantvzw.org/wefts/121/")
args = ap.parse_args()

# Use the response as a context manager so the connection is closed as soon
# as the document has been parsed, even if parsing raises.
with urlopen(args.url) as f:
    # namespaceHTMLElements=False keeps tag names un-namespaced, so the plain
    # ".//a" path expression below matches anchor elements.
    t = html5lib.parse(f, namespaceHTMLElements=False)

# The [@href] predicate selects only anchors that carry an href attribute.
for link in t.findall(".//a[@href]"):
    # Resolve relative links against the page URL to get an absolute URL.
    href = urljoin(args.url, link.attrib.get("href"))
    print(href)