Href-lister

From XPUB & Lens-Based wiki
Revision as of 18:20, 19 May 2013 by Lassebosch (talk | contribs)
(diff) ← Older revision | Latest revision (diff) | Newer revision → (diff)
import sys, urllib, lxml.html
from urlparse import urlparse, urljoin, urldefrag

# Usage: pass the page to scan as the single command-line argument.
# Exit with a usage message instead of an IndexError traceback when it is
# missing.
if len(sys.argv) < 2:
    sys.exit("usage: %s URL" % sys.argv[0])

url = sys.argv[1]

# Network location (host[:port]) of the start URL; links containing it are
# filtered out further down.
temp_url = urlparse(url)
urlbase = temp_url.netloc

# Every normalised link found on the page.
availurl_list = []

# Links left over after the exclude filter has been applied.
filter_availurl_list = []

# Substrings that disqualify a link from the output.
exclude = [urlbase]

# Output file; URLs are appended to it, one per line.
listertext = "listertext.txt"

# Fetch and parse the page, releasing the connection even if reading or
# parsing fails.
connection = urllib.urlopen(url)
try:
    dom = lxml.html.fromstring(connection.read())
finally:
    connection.close()
                
# Collect the href of every <a> element. Each value is stripped of
# surrounding whitespace, resolved against the page URL, cut at the
# '#fragment', and has trailing slashes removed before being recorded.
availurl_list.extend(
    urldefrag(urljoin(url, href.strip()))[0].rstrip('/')
    for query in ['//a/@href']
    for href in dom.xpath(query)
)

# Keep only the links that contain none of the exclude patterns.
# The test is a plain substring match against the whole URL, so a link that
# merely mentions an excluded string anywhere (path, query, ...) is dropped
# as well.
filter_availurl_list.extend(
    candidate
    for candidate in availurl_list
    if not any(pattern in candidate for pattern in exclude)
)

# Remove duplicates; going through a set means the resulting order is
# arbitrary.
clean_filter_availurl_list = list(set(filter_availurl_list))

# Append the de-duplicated URLs to the output file, one per line.
# The file is opened once for the whole batch; the `with` block closes it,
# so no explicit close() is needed. The guard keeps the file untouched (and
# uncreated) when there is nothing to write.
if clean_filter_availurl_list:
    with open(listertext, "a") as textfile:
        textfile.writelines(
            line + '\n' for line in clean_filter_availurl_list
        )