Spider: Difference between revisions
Lassebosch (talk | contribs) (Created page with " import sys, httplib2, os, time, urllib, lxml.html, re from urlparse import urlparse, urljoin, urldefrag def visit (url, depth=1): global visited #print url pri...") |
Lassebosch (talk | contribs) No edit summary |
||
Line 3: | Line 3: | ||
def visit (url, depth=1): | def visit (url, depth=1): | ||
global visited | |||
#print url | |||
print url | |||
# remember we visited | |||
visited[url] = True | |||
if depth >= MAX_DEPTH: return | |||
connection = urllib.urlopen(url) | |||
dom = lxml.html.fromstring(connection.read()) | |||
for xpath in ['//a/@href', '//img/@src']: | |||
# select the url in href for all a tags(links) | |||
for link in dom.xpath(xpath): | |||
#print link | |||
link = link.strip() | |||
if link.lower().startswith("javascript"): | |||
continue | |||
# normalize url | |||
link = urljoin(url,link) | |||
link = urldefrag(link)[0] | |||
# strip for / | |||
link = link.rstrip('/') | |||
# if (link not in visited) and link.startswith(PREFIX) and depth<MAX_DEPTH: | |||
if (link not in visited) and depth<MAX_DEPTH: | |||
visit(link, depth+1) | visit(link, depth+1) | ||
Revision as of 13:22, 22 February 2013
import sys, httplib2, os, time, urllib, lxml.html, re from urlparse import urlparse, urljoin, urldefrag
def visit(url, depth=1):
    """Print *url*, record it as visited, and recursively follow its links.

    The page at *url* is fetched and parsed as HTML; every ``<a href>`` and
    ``<img src>`` target is normalised (made absolute against *url*, fragment
    removed, trailing slashes stripped) and visited in turn with
    ``depth + 1``.

    url   -- absolute URL to visit.
    depth -- current recursion level; the initial call uses 1. Once *depth*
             reaches the module-level MAX_DEPTH the URL is still printed and
             recorded, but it is not fetched.

    Reads the module-level MAX_DEPTH and mutates the module-level ``visited``
    dict. Returns None.
    """
    print(url)
    # remember we visited
    visited[url] = True
    if depth >= MAX_DEPTH:
        return

    try:
        connection = urllib.urlopen(url)
        try:
            content = connection.read()
        finally:
            # Always release the connection, even if the read fails.
            connection.close()
        dom = lxml.html.fromstring(content)
    except (IOError, lxml.etree.ParserError) as err:
        # An unreachable host, a non-fetchable scheme such as mailto:, or an
        # empty response must not abort the rest of the crawl.
        sys.stderr.write("skipping %s: %s\n" % (url, err))
        return

    for xpath in ['//a/@href', '//img/@src']:
        # select the url in href for all a tags (links) and src for all images
        for link in dom.xpath(xpath):
            link = link.strip()
            if link.lower().startswith("javascript"):
                continue
            # normalize url: make it absolute and drop the #fragment
            link = urljoin(url, link)
            link = urldefrag(link)[0]
            # strip trailing / so "http://host/a/" and "http://host/a" match
            link = link.rstrip('/')
            # To restrict the crawl to one site, additionally require
            # link.startswith(PREFIX) here.
            # depth < MAX_DEPTH is already guaranteed by the early return.
            if link not in visited:
                visit(link, depth + 1)
# Recursion limit for visit(): a URL reached at this depth is printed and
# recorded but not fetched. With 2, only the start page itself is downloaded.
MAX_DEPTH = 2
# URLs already reached by visit(), used as a set (url -> True).
visited = {}

# Usage: <script> STARTURL [PREFIX]
if len(sys.argv) < 2:
    # Fail with a readable message instead of an IndexError traceback.
    sys.exit("usage: %s STARTURL [PREFIX]" % sys.argv[0])
starturl = sys.argv[1]
# PREFIX defaults to the start URL; it is only referenced by the optional
# (currently disabled) startswith(PREFIX) filter in visit().
try:
    PREFIX = sys.argv[2]
except IndexError:
    PREFIX = starturl
visit(starturl)