Href-lister: Difference between revisions

From XPUB & Lens-Based wiki
(Created page with "<script type="python"> import sys, urllib, lxml.html from urlparse import urlparse, urljoin, urldefrag url=sys.argv[1] temp_url=urlparse(url) urlbase=temp_url.netloc avail...")
 
No edit summary
 
Line 1: Line 1:
<script type="python">
<source lang="python">
 
import sys, urllib, lxml.html
import sys, urllib, lxml.html
from urlparse import urlparse, urljoin, urldefrag
from urlparse import urlparse, urljoin, urldefrag
Line 43: Line 42:
         textfile.write(line+'\n')
         textfile.write(line+'\n')
     textfile.close()
     textfile.close()
</script>
</source>

Latest revision as of 19:20, 19 May 2013

import sys, urllib, lxml.html
from urlparse import urlparse, urljoin, urldefrag

# Page to scan, given as the first command-line argument.
if len(sys.argv) < 2:
    # Fail with a usage hint instead of a bare IndexError traceback.
    sys.exit("usage: %s URL" % sys.argv[0])
url = sys.argv[1]

# Network location (host[:port]) of the start URL.
temp_url = urlparse(url)
urlbase = temp_url.netloc

# Every normalized href found on the page.
availurl_list = []

# The hrefs that survive the exclude filter.
filter_availurl_list = []

# Substrings that disqualify a link. An empty host (e.g. a file:// start URL)
# is left out: '' is a substring of every string, so it would exclude
# every link.
exclude = [urlbase] if urlbase else []

# Name of the output text file.
listertext = "listertext.txt"

connection = urllib.urlopen(url)
try:
    dom = lxml.html.fromstring(connection.read())
finally:
    # Release the connection even when reading or parsing fails.
    connection.close()
                
# Collect the href attribute of every <a> element, normalized so that
# equivalent links compare equal.
for href_query in ['//a/@href']:
    for raw_href in dom.xpath(href_query):
        # Resolve relative links against the start URL.
        absolute = urljoin(url, raw_href.strip())
        # Drop the '#fragment' part, keeping only the URL itself.
        without_fragment, _fragment = urldefrag(absolute)
        # Remove trailing slashes before storing.
        availurl_list.append(without_fragment.rstrip('/'))

# Keep only the links that contain none of the excluded substrings.
# The loop variable gets its own name so it does not shadow the `exclude`
# list it iterates over.
for candidate in availurl_list:
    if not any(pattern in candidate for pattern in exclude):
        filter_availurl_list.append(candidate)

# Remove duplicates while keeping first-seen order, so the result is
# listed in the order the links were collected; list(set(...)) would
# give an arbitrary order.
seen_links = set()
clean_filter_availurl_list = []
for candidate in filter_availurl_list:
    if candidate not in seen_links:
        seen_links.add(candidate)
        clean_filter_availurl_list.append(candidate)

# Append one URL per line to the output file. The file is opened once for
# the whole batch (not once per URL) and the `with` block closes it, so no
# explicit close() is needed. The guard keeps the earlier behaviour of not
# creating the file when there is nothing to write.
if clean_filter_availurl_list:
    with open(listertext, "a") as textfile:
        textfile.writelines(
            line + '\n' for line in clean_filter_availurl_list
        )