# Href-lister
# From XPUB & Lens-Based wiki
import sys
import urllib

import lxml.html

try:  # Python 3 module layout
    from urllib.parse import urlparse, urljoin, urldefrag
    from urllib.request import urlopen
except ImportError:  # Python 2 module layout
    from urlparse import urlparse, urljoin, urldefrag
    from urllib import urlopen
# Usage: python <this script> URL
#
# Fetches URL, collects the href of every <a> tag, normalizes each link and
# appends the ones that do not contain URL's own host to `listertext`.

# Output file; links are appended (never truncated), one per line.
listertext = "listertext.txt"

# XPath expressions whose results are treated as links.
link_xpaths = ['//a/@href']


def normalize_link(base_url, href):
    """Return *href* as an absolute URL without fragment or trailing slash.

    Surrounding whitespace is stripped, the link is resolved against
    *base_url*, the ``#fragment`` part is dropped and trailing ``/``
    characters are removed.
    """
    absolute = urljoin(base_url, href.strip())
    return urldefrag(absolute)[0].rstrip('/')


def filter_links(links, exclude):
    """Return the unique *links* that contain none of the *exclude* substrings.

    Matching is plain substring containment. Empty exclusion patterns are
    ignored: an empty string is contained in every link, so it would
    otherwise discard everything (this happens when the start URL has no
    network location, e.g. when it was given without a scheme).
    The first-seen order of *links* is preserved.
    """
    patterns = [pattern for pattern in exclude if pattern]
    seen = set()
    kept = []
    for link in links:
        if link in seen:
            continue
        if any(pattern in link for pattern in patterns):
            continue
        seen.add(link)
        kept.append(link)
    return kept


def extract_links(html, base_url):
    """Parse *html* and return every normalized link, in document order."""
    dom = lxml.html.fromstring(html)
    return [
        normalize_link(base_url, href)
        for xpath in link_xpaths
        for href in dom.xpath(xpath)
    ]


def main(argv=None):
    """Append the filtered links found at ``argv[1]`` to `listertext`.

    *argv* defaults to ``sys.argv``. Returns 0 on success and 2 when no
    URL argument was supplied.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 2:
        prog = argv[0] if argv else "href-lister"
        sys.stderr.write("usage: %s URL\n" % prog)
        return 2

    url = argv[1]
    # Links containing the start URL's own host are excluded from the output.
    exclude = [urlparse(url).netloc]

    connection = urlopen(url)
    try:
        html = connection.read()
    finally:
        # The Python 2 response object is not a context manager.
        connection.close()

    links = filter_links(extract_links(html, url), exclude)
    if links:  # do not create the output file when there is nothing to write
        with open(listertext, "a") as textfile:
            textfile.writelines(link + '\n' for link in links)
    return 0


if __name__ == "__main__":
    sys.exit(main())