Scrape

From XPUB & Lens-Based wiki

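The basis of an HTML link scraper in Python: it downloads a web page, parses it with html5lib, and prints the absolute URL of every link found on that page.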

import argparse
import html5lib
from urllib.request import urlopen
from urllib.parse import urljoin


# Command-line interface: a single optional --url naming the page to scrape.
# The text is passed as ``description`` (not positionally, which would set
# ``prog`` and blank the program name in usage and error messages).
ap = argparse.ArgumentParser(
    description="Print the absolute URL of every link found on a web page."
)
ap.add_argument(
    "--url",
    default="http://media.constantvzw.org/wefts/121/",
    help="address of the page to scrape (default: %(default)s)",
)
args = ap.parse_args()


# Fetch and parse inside a ``with`` block so the HTTP response is always
# closed, even when parsing raises. namespaceHTMLElements=False keeps tag
# names un-namespaced so the plain "a" path expression below matches.
with urlopen(args.url) as f:
    t = html5lib.parse(f, namespaceHTMLElements=False)

# The [@href] predicate selects only anchors that carry an href attribute.
for link in t.findall(".//a[@href]"):
    # Resolve relative links against the page URL so the output is absolute.
    href = urljoin(args.url, link.attrib.get("href"))
    print(href)