Href-lister: Difference between revisions
Lassebosch (talk | contribs) (Created page with "<script type="python"> import sys, urllib, lxml.html from urlparse import urlparse, urljoin, urldefrag url=sys.argv[1] temp_url=urlparse(url) urlbase=temp_url.netloc avail...") |
Lassebosch (talk | contribs) No edit summary |
||
Line 1: | Line 1: | ||
<script type="python"> | <source lang="python"> | ||
import sys, urllib, lxml.html | import sys, urllib, lxml.html | ||
from urlparse import urlparse, urljoin, urldefrag | from urlparse import urlparse, urljoin, urldefrag | ||
Line 43: | Line 42: | ||
textfile.write(line+'\n') | textfile.write(line+'\n') | ||
textfile.close() | textfile.close() | ||
</ | </source> |
Latest revision as of 19:20, 19 May 2013
import sys, urllib, lxml.html
from urlparse import urlparse, urljoin, urldefrag
# Href-lister: fetch the page given as the first command-line argument,
# collect the href of every <a> tag, normalize each link, drop links that
# point back to the page's own host, and append the remaining unique links
# (one per line) to "listertext.txt".

if len(sys.argv) < 2:
    # Fail with a usage message instead of a bare IndexError.
    sys.stderr.write("usage: %s URL\n" % sys.argv[0])
    sys.exit(1)

url = sys.argv[1]
temp_url = urlparse(url)
urlbase = temp_url.netloc

# Substrings that disqualify a link from the output. An empty netloc is left
# out on purpose: '' is a substring of every string and would filter out
# every single link.
exclude = [urlbase] if urlbase else []
listertext = "listertext.txt"

connection = urllib.urlopen(url)
try:
    dom = lxml.html.fromstring(connection.read())
finally:
    # Release the socket even if reading or parsing fails.
    connection.close()

availurl_list = []
for xpath in ['//a/@href']:
    # Select the url in href for all a tags (links).
    for link in dom.xpath(xpath):
        link = link.strip()
        # Normalize: resolve relative links against the page url, then
        # drop the #fragment part.
        link = urljoin(url, link)
        link = urldefrag(link)[0]
        # Strip trailing slashes so "http://x/a/" and "http://x/a" compare equal.
        link = link.rstrip('/')
        availurl_list.append(link)

# Keep only links that contain none of the excluded substrings.
filter_availurl_list = [
    candidate for candidate in availurl_list
    if not any(pattern in candidate for pattern in exclude)
]

# De-duplicate while keeping first-seen order, so the output is deterministic.
seen = set()
clean_filter_availurl_list = []
for candidate in filter_availurl_list:
    if candidate not in seen:
        seen.add(candidate)
        clean_filter_availurl_list.append(candidate)

# Append to the output file, opening it once rather than once per link.
# The guard keeps the original behavior of not creating the file when
# there is nothing to write.
if clean_filter_availurl_list:
    with open(listertext, "a") as textfile:
        for line in clean_filter_availurl_list:
            textfile.write(line + '\n')