Href-lister
Revision as of 18:19, 19 May 2013 by Lassebosch (talk | contribs) (Created page with "<script type="python"> import sys, urllib, lxml.html from urlparse import urlparse, urljoin, urldefrag url=sys.argv[1] temp_url=urlparse(url) urlbase=temp_url.netloc avail...")
<script type="python">
"""List the external links found on a single web page.

Usage: pass the page URL as the first command-line argument.

Every ``<a href>`` on the page is resolved against the page URL, stripped of
its fragment and trailing slash, and appended to ``listertext.txt`` unless it
contains the page's own host name (i.e. only off-site links are kept).
"""
import contextlib
import sys

import lxml.html

try:  # Python 2 module layout, which this script was originally written for
    from urllib import urlopen
    from urlparse import urldefrag, urljoin, urlparse
except ImportError:  # Python 3 moved the same functions
    from urllib.parse import urldefrag, urljoin, urlparse
    from urllib.request import urlopen

# File the surviving links are appended to, one per line.
LISTER_TEXT = "listertext.txt"

# XPath expressions that select the raw link targets from the parsed page.
LINK_XPATHS = ('//a/@href',)


def normalize_link(base_url, href):
    """Return *href* as an absolute URL without fragment or trailing slash."""
    absolute = urljoin(base_url, href.strip())
    return urldefrag(absolute)[0].rstrip('/')


def extract_links(base_url, html):
    """Return the normalized target of every link selected by LINK_XPATHS."""
    dom = lxml.html.fromstring(html)
    return [
        normalize_link(base_url, href)
        for xpath in LINK_XPATHS
        for href in dom.xpath(xpath)
    ]


def filter_links(links, exclude):
    """Drop links containing any *exclude* substring, and drop duplicates.

    The first occurrence of each link is kept, in page order, so the output
    is deterministic from run to run.
    """
    seen = set()
    kept = []
    for link in links:
        if link in seen:
            continue
        if any(pattern in link for pattern in exclude):
            continue
        seen.add(link)
        kept.append(link)
    return kept


def main(argv):
    """Fetch the page named by ``argv[1]`` and append its off-site links."""
    if len(argv) < 2:
        sys.exit("usage: %s URL" % argv[0])
    url = argv[1]

    # Links that mention the page's own host are considered internal.
    exclude = [urlparse(url).netloc]

    # closing() works for both the Python 2 and Python 3 response objects and
    # guarantees the connection is released even if read() fails.
    with contextlib.closing(urlopen(url)) as connection:
        html = connection.read()

    links = filter_links(extract_links(url, html), exclude)

    # Open the output once; append mode keeps results from earlier runs.
    with open(LISTER_TEXT, "a") as textfile:
        textfile.writelines(link + '\n' for link in links)


if __name__ == "__main__":
    main(sys.argv)
</script>