Filtering HTML with Python

From XPUB & Lens-Based wiki
Revision as of 16:58, 23 May 2020 by Michael Murtaugh (talk | contribs) (Created page with "<source lang="python"> from urllib.parse import urljoin def absolute_hrefs(html, baseurl): t = html5lib.parseFragment(html, treebuilder = "etree", namespaceHTMLElements =...")
(diff) ← Older revision | Latest revision (diff) | Newer revision → (diff)
The printable version is no longer supported and may have rendering errors. Please update your browser bookmarks and please use the default browser print function instead.
import xml.etree.ElementTree as ET
from urllib.parse import urljoin

import html5lib

def absolute_hrefs(html: str, baseurl: str) -> str:
    """Return *html* with every ``href`` attribute resolved against *baseurl*.

    The fragment is parsed with html5lib into an ElementTree, each element
    that carries an ``href`` attribute has that value replaced by
    ``urljoin(baseurl, href)``, and the tree is serialized back to text.

    Args:
        html: HTML fragment to rewrite.
        baseurl: URL that relative hrefs are resolved against.

    Returns:
        The re-serialized HTML as a unicode string.
    """
    # namespaceHTMLElements=False asks html5lib not to namespace-qualify
    # element tags, so the plain XPath below and the serializer see bare names.
    fragment = html5lib.parseFragment(
        html, treebuilder="etree", namespaceHTMLElements=False
    )
    # The XPath predicate only selects elements that have an href attribute,
    # so indexing attrib["href"] directly cannot raise KeyError.
    for element in fragment.findall(".//*[@href]"):
        element.attrib["href"] = urljoin(baseurl, element.attrib["href"])
    # NOTE(review): ET.tostring serializes the fragment's root element as well
    # as its children; presumably this wraps the output in html5lib's fragment
    # container tag -- confirm whether callers expect (or strip) that wrapper.
    return ET.tostring(fragment, method="html", encoding="unicode")