Filtering HTML with Python
Revision as of 16:59, 23 May 2020 by Michael Murtaugh (talk | contribs) (→Absolutizing hrefs in HTML)
Absolutizing hrefs in HTML
from urllib.parse import urljoin
from xml.etree import ElementTree as ET

import html5lib
def absolute_hrefs(html: str, baseurl: str) -> str:
    """Return *html* with every ``href`` attribute resolved against *baseurl*.

    The markup is parsed as an HTML fragment with html5lib (etree
    treebuilder, un-namespaced element names), each element carrying an
    ``href`` attribute has it rewritten with ``urljoin``, and the tree is
    serialized back to a string.

    Args:
        html: Source of the HTML fragment to process.
        baseurl: URL that each ``href`` value is joined with.

    Returns:
        The modified tree serialized with ``method="html"`` as a ``str``.
    """
    tree = html5lib.parseFragment(
        html,
        treebuilder="etree",
        namespaceHTMLElements=False,
    )
    # The XPath predicate only matches elements that have an href, so the
    # attribute lookup below always yields a value.
    for element in tree.findall(".//*[@href]"):
        element.set("href", urljoin(baseurl, element.get("href")))
    # NOTE(review): the root returned by parseFragment is serialized together
    # with its children -- confirm callers expect that wrapper in the output.
    return ET.tostring(tree, method="html", encoding="unicode")