Filtering HTML with python

From XPUB & Lens-Based wiki
Revision as of 16:59, 23 May 2020 by Michael Murtaugh (talk | contribs) (→‎Absolutizing hrefs in HTML)
(diff) ← Older revision | Latest revision (diff) | Newer revision → (diff)
The printable version is no longer supported and may have rendering errors. Please update your browser bookmarks and use the default browser print function instead.

Absolutizing hrefs in HTML

from urllib.parse import urljoin
from xml.etree import ElementTree as ET

import html5lib

def absolute_hrefs(html, baseurl) -> str:
    """Return *html* with every ``href`` attribute resolved against *baseurl*.

    The markup is parsed as an HTML fragment with html5lib (etree tree
    builder, element tags left un-namespaced), each ``href`` value is
    rewritten with ``urllib.parse.urljoin``, and the tree is serialized back
    to text.

    Args:
        html: HTML fragment source to rewrite.
        baseurl: URL that the ``href`` values are joined onto.

    Returns:
        The re-serialized markup as a unicode string.
    """
    fragment = html5lib.parseFragment(
        html, treebuilder="etree", namespaceHTMLElements=False
    )
    # The XPath predicate matches any descendant element carrying an href
    # (not only <a>), so the attribute is always present inside the loop.
    for element in fragment.findall(".//*[@href]"):
        element.set("href", urljoin(baseurl, element.get("href")))
    # NOTE(review): the fragment's root element is passed to the serializer
    # as-is; confirm whether callers expect a wrapper tag around the output.
    return ET.tostring(fragment, method="html", encoding="unicode")