Difference between revisions of "Filtering HTML with python"
Jump to navigation
Jump to search
(Created page with "<source lang="python"> from urllib.parse import urljoin def absolute_hrefs(html, baseurl): t = html5lib.parseFragment(html, treebuilder = "etree", namespaceHTMLElements =...") |
|||
(One intermediate revision by the same user not shown) | |||
Line 1: | Line 1: | ||
+ | |||
+ | == Absolutizing hrefs in HTML == | ||
+ | |||
<source lang="python"> | <source lang="python"> | ||
from urllib.parse import urljoin | from urllib.parse import urljoin | ||
+ | from xml.etree import ElementTree as ET | ||
def absolute_hrefs(html, baseurl): | def absolute_hrefs(html, baseurl): |
Latest revision as of 15:59, 23 May 2020
Absolutizing hrefs in HTML
from urllib.parse import urljoin
from xml.etree import ElementTree as ET
def absolute_hrefs(html: str, baseurl: str) -> str:
    """Return *html* with every ``href`` attribute resolved against *baseurl*.

    The markup is parsed as an HTML fragment with html5lib (etree tree
    builder, un-namespaced element names), each ``href`` value is passed
    through ``urllib.parse.urljoin`` and the tree is serialized back to a
    string with ElementTree's ``html`` output method.

    Args:
        html: HTML fragment markup to rewrite.
        baseurl: URL that relative ``href`` values are resolved against.

    Returns:
        The serialized markup as a ``str``.
    """
    # Imported here because the module's visible import block does not bring
    # html5lib into scope, which made the call below fail with NameError.
    import html5lib

    fragment = html5lib.parseFragment(
        html, treebuilder="etree", namespaceHTMLElements=False
    )

    # The XPath predicate selects every descendant carrying an href
    # attribute (a, link, area, base, ...), not only anchors.
    for element in fragment.findall(".//*[@href]"):
        element.set("href", urljoin(baseurl, element.get("href")))

    # NOTE(review): the fragment root element itself is serialized here, so
    # its own tag ends up in the output -- confirm callers expect that.
    return ET.tostring(fragment, method="html", encoding="unicode")