Href-lister

From XPUB & Lens-Based wiki
Revision as of 19:19, 19 May 2013 by Lassebosch (talk | contribs) (Created page with "<script type="python"> import sys, urllib, lxml.html from urlparse import urlparse, urljoin, urldefrag url=sys.argv[1] temp_url=urlparse(url) urlbase=temp_url.netloc avail...")
(diff) ← Older revision | Latest revision (diff) | Newer revision → (diff)

<script type="python">

import sys
import urllib
from urlparse import urlparse, urljoin, urldefrag

import lxml.html

# --- Configuration and page download ---------------------------------------
# The page to scan is given as the first command-line argument; exit with a
# usage message instead of an IndexError traceback when it is missing.
if len(sys.argv) < 2:
    sys.exit("usage: %s URL" % sys.argv[0])
url = sys.argv[1]

# Host part (netloc) of the start URL.
temp_url = urlparse(url)
urlbase = temp_url.netloc

availurl_list = []          # every normalized link found on the page
filter_availurl_list = []   # links that survive the exclude filter

# Substrings that disqualify a link. Seeded with the page's own host, so
# links containing that host are dropped by the filter step.
exclude = [urlbase]

# Output file; the resulting links are appended to it.
listertext = "listertext.txt"

# Download and parse the page. The connection is closed even if reading or
# parsing raises, so the socket is never leaked.
connection = urllib.urlopen(url)
try:
    dom = lxml.html.fromstring(connection.read())
finally:
    connection.close()

# Collect the href of every <a> tag on the page, normalized to an absolute
# URL without fragment or trailing slash.
for query in ['//a/@href']:
    for href in dom.xpath(query):
        # Resolve the (whitespace-trimmed) href against the page URL, then
        # split off any "#fragment" part and keep only the URL itself.
        absolute, _fragment = urldefrag(urljoin(url, href.strip()))
        # Drop trailing slashes before storing the link.
        availurl_list.append(absolute.rstrip('/'))

# Keep only the links that contain none of the excluded substrings; an
# excluded entry matches when it occurs anywhere inside the URL.
filter_availurl_list.extend(
    candidate
    for candidate in availurl_list
    if all(blocked not in candidate for blocked in exclude)
)

# Collapse duplicate links. Sorting makes the output order reproducible,
# since the iteration order of a set is arbitrary.
clean_filter_availurl_list = sorted(set(filter_availurl_list))

# Append the links to the output file, one per line. The file is opened once
# for the whole batch rather than once per link, and the `with` block closes
# it, so no explicit close() is needed. Nothing is opened when there are no
# links, so an empty run does not create the file.
if clean_filter_availurl_list:
    with open(listertext, "a") as textfile:
        textfile.writelines(line + '\n' for line in clean_filter_availurl_list)

</script>