Difference between revisions of "Filtering HTML with python"

From Media Design: Networked & Lens-Based wiki
Jump to navigation Jump to search
(Created page with "<source lang="python"> from urllib.parse import urljoin def absolute_hrefs(html, baseurl): t = html5lib.parseFragment(html, treebuilder = "etree", namespaceHTMLElements =...")
 
 
(One intermediate revision by the same user not shown)
Line 1: Line 1:
 +
 +
== Absolutizing hrefs in HTML ==
 +
 
<source lang="python">
 
<source lang="python">
 
from urllib.parse import urljoin
 
from urllib.parse import urljoin
 +
from xml.etree import ElementTree as ET
  
 
def absolute_hrefs(html, baseurl):
 
def absolute_hrefs(html, baseurl):

Latest revision as of 16:59, 23 May 2020

Absolutizing hrefs in HTML

from urllib.parse import urljoin
from xml.etree import ElementTree as ET

import html5lib

def absolute_hrefs(html, baseurl):
    """Return *html* with every ``href`` attribute resolved against *baseurl*.

    The markup is parsed as an HTML fragment with html5lib (etree tree
    builder, HTML namespace on element names disabled). Every element that
    carries an ``href`` attribute gets that attribute replaced by
    ``urljoin(baseurl, href)``, and the tree is then serialized back to text.

    Args:
        html: HTML fragment source to rewrite.
        baseurl: URL that the href values are joined against.

    Returns:
        The serialized tree as a ``str``, produced by ElementTree's
        ``"html"`` output method with ``encoding="unicode"``.
    """
    fragment = html5lib.parseFragment(
        html, treebuilder="etree", namespaceHTMLElements=False
    )
    # The predicate matches any descendant that has an href attribute, so
    # this covers every href-bearing tag, not just <a>.
    for element in fragment.findall(".//*[@href]"):
        element.set("href", urljoin(baseurl, element.get("href")))
    # NOTE(review): ET.tostring() serializes the element it is given, so
    # whatever root element parseFragment() returns is part of the output —
    # confirm that callers expect that wrapper.
    return ET.tostring(fragment, method="html", encoding="unicode")