Spider: Difference between revisions

From XPUB & Lens-Based wiki
(Created page with " import sys, httplib2, os, time, urllib, lxml.html, re from urlparse import urlparse, urljoin, urldefrag def visit (url, depth=1): global visited #print url pri...")
 
No edit summary
Line 3: Line 3:


  def visit (url, depth=1):
  def visit (url, depth=1):
    global visited
    global visited
    #print url
    #print url
    print url
    print url
    # remember we visited
    # remember we visited
    visited[url] = True
    visited[url] = True
    if depth >= MAX_DEPTH: return
    if depth >= MAX_DEPTH: return
    connection = urllib.urlopen(url)
    connection = urllib.urlopen(url)
      
      
    dom =  lxml.html.fromstring(connection.read())
    dom =  lxml.html.fromstring(connection.read())
      
      
    for xpath in ['//a/@href', '//img/@src']:
    for xpath in ['//a/@href', '//img/@src']:
        # select the url in href for all a tags(links)
        # select the url in href for all a tags(links)
        for link in dom.xpath(xpath):  
        for link in dom.xpath(xpath):  
            #print link
            #print link
            link = link.strip()
            link = link.strip()
            if link.lower().startswith("javascript"):
            if link.lower().startswith("javascript"):
                continue
                continue
            # normalize url
            # normalize url
            link = urljoin(url,link)
            link = urljoin(url,link)
            link = urldefrag(link)[0]
            link = urldefrag(link)[0]
            # strip for /
            # strip for /
            link = link.rstrip('/')
            link = link.rstrip('/')
            # if (link not in visited) and link.startswith(PREFIX) and depth<MAX_DEPTH:
            # if (link not in visited) and link.startswith(PREFIX) and depth<MAX_DEPTH:
            if (link not in visited) and depth<MAX_DEPTH:
            if (link not in visited) and depth<MAX_DEPTH:
                 visit(link, depth+1)
                 visit(link, depth+1)



Revision as of 13:22, 22 February 2013

import sys, httplib2, os, time, urllib, lxml.html, re
from urlparse import urlparse, urljoin, urldefrag
def visit(url, depth=1):
    """Print `url`, mark it as visited, and recursively crawl what it links to.

    The page is fetched with `urllib.urlopen` and parsed with
    `lxml.html.fromstring`. Every `href` of an `<a>` element and every `src`
    of an `<img>` element is normalized (made absolute against `url`,
    fragment removed, trailing slashes stripped) and visited in turn unless
    it has already been recorded.

    Relies on two module-level names: the `visited` dict, which is updated in
    place, and `MAX_DEPTH`, which bounds the recursion.

    Parameters:
        url:   address of the page to report and (depth permitting) fetch.
        depth: 1-based recursion level of this call; pages reached at
               `MAX_DEPTH` are printed and recorded but not fetched.

    Errors raised while fetching or parsing are not caught here and
    propagate to the caller.
    """
    # Parenthesized single argument: prints the same text under the
    # Python 2 print statement the file was written for.
    print(url)
    # remember we visited
    visited[url] = True
    if depth >= MAX_DEPTH:
        return

    connection = urllib.urlopen(url)
    try:
        dom = lxml.html.fromstring(connection.read())
    finally:
        # Release the socket even when reading or parsing fails; the crawl
        # opens one connection per fetched page.
        connection.close()

    for xpath in ['//a/@href', '//img/@src']:
        # select the url attribute for all matching tags (links and images)
        for link in dom.xpath(xpath):
            link = link.strip()
            # Skip script pseudo-links. The colon matters: without it a
            # relative link such as "javascript-notes.html" would be
            # dropped as well.
            if link.lower().startswith("javascript:"):
                continue
            # normalize url: absolute, without fragment, without trailing /
            link = urljoin(url, link)
            link = urldefrag(link)[0]
            link = link.rstrip('/')
            # Links are not restricted to PREFIX; only unseen ones are
            # followed. The depth limit is already enforced by the guard at
            # the top of this function, so it is not re-tested here.
            if link not in visited:
                visit(link, depth + 1)
# --- crawl configuration and entry point ---------------------------------
# Usage: <script> STARTURL [PREFIX]

# Shared crawl state read by visit(): the set of URLs already reported,
# stored as dict keys, and the recursion limit.
visited = {}
MAX_DEPTH = 2

# First command-line argument: the page the crawl starts from.
starturl = sys.argv[1]

# Optional second argument; when it is absent the start URL doubles as the
# prefix.
if len(sys.argv) > 2:
    PREFIX = sys.argv[2]
else:
    PREFIX = starturl

visit(starturl)