Spider: Difference between revisions

From XPUB & Lens-Based wiki
No edit summary
No edit summary
Line 1: Line 1:
An attempt to write a spider which 1) prints all URL's for a desired web-page. 2) excludes/includes certain urls in a filter-script and 3) Picks one of the filterd urls and 4) eventually sends it back to 1) in a continous loop
'''1)'''
  import sys, httplib2, os, time, urllib, lxml.html, re
  import sys, httplib2, os, time, urllib, lxml.html, re
  from urlparse import urlparse, urljoin, urldefrag
  from urlparse import urlparse, urljoin, urldefrag

Revision as of 14:28, 22 February 2013

An attempt to write a spider which 1) prints all URL's for a desired web-page. 2) excludes/includes certain urls in a filter-script and 3) Picks one of the filterd urls and 4) eventually sends it back to 1) in a continous loop


1)


import sys, httplib2, os, time, urllib, lxml.html, re
from urlparse import urlparse, urljoin, urldefrag
def visit (url, depth=1):
    global visited
    #print url
    print url
    # remember we visited
    visited[url] = True
    if depth >= MAX_DEPTH: return
    connection = urllib.urlopen(url)
   
    dom =  lxml.html.fromstring(connection.read())
   
    for xpath in ['//a/@href', '//img/@src']:
        # select the url in href for all a tags(links)
        for link in dom.xpath(xpath): 
            #print link
            link = link.strip()
            if link.lower().startswith("javascript"):
                continue
            # normalize url
            link = urljoin(url,link)
            link = urldefrag(link)[0]
            # strip for /
            link = link.rstrip('/')
            # if (link not in visited) and link.startswith(PREFIX) and depth<MAX_DEPTH:
            if (link not in visited) and depth<MAX_DEPTH:
               visit(link, depth+1)
MAX_DEPTH = 2
visited = {}
starturl = sys.argv[1]
try:
   PREFIX = sys.argv[2]
except IndexError:
   PREFIX = starturl 
visit(starturl)