Spider: Difference between revisions
Lassebosch (talk | contribs) No edit summary |
Lassebosch (talk | contribs) No edit summary |
||
Line 48: | Line 48: | ||
# examples of what you want to include: | # examples of what you want to include: | ||
include = ['www.'] | include = ['www.'] | ||
for line in sys.stdin: | for line in sys.stdin: | ||
if not any(exclude in line for exclude in exclude) and any(include in line for include in include): | if not any(exclude in line for exclude in exclude) and any(include in line for include in include): |
Revision as of 13:37, 22 February 2013
An attempt to write a spider which 1) prints all URLs for a desired web page, 2) excludes/includes certain URLs in a filter script, 3) picks one of the filtered URLs, and 4) eventually sends it back to 1) in a continuous loop.
1) Spider
import sys, httplib2, os, time, urllib, lxml.html, re from urlparse import urlparse, urljoin, urldefrag def visit (url, depth=1): global visited #print url print url # remember we visited visited[url] = True if depth >= MAX_DEPTH: return connection = urllib.urlopen(url) dom = lxml.html.fromstring(connection.read()) for xpath in ['//a/@href', '//img/@src']: # select the url in href for all a tags(links) for link in dom.xpath(xpath): #print link link = link.strip() if link.lower().startswith("javascript"): continue # normalize url link = urljoin(url,link) link = urldefrag(link)[0] # strip for / link = link.rstrip('/') # if (link not in visited) and link.startswith(PREFIX) and depth<MAX_DEPTH: if (link not in visited) and depth<MAX_DEPTH: visit(link, depth+1) MAX_DEPTH = 2 visited = {} starturl = sys.argv[1] try: PREFIX = sys.argv[2] except IndexError: PREFIX = starturl visit(starturl)
2) filter
import sys

# examples of what you want to exclude (a URL containing any of these is dropped):
exclude = ['.jpg', '.png', '.gif']
# examples of what you want to include (a URL must contain at least one of these):
include = ['www.']


def is_wanted(line):
    """Return True when *line* should be passed on to the next script.

    A line is wanted when it contains none of the substrings in ``exclude``
    and at least one of the substrings in ``include``. Matching is plain,
    case-sensitive substring search.
    """
    if any(pattern in line for pattern in exclude):
        return False
    return any(pattern in line for pattern in include)


if __name__ == "__main__":
    # Lines keep their trailing newline, so they are written back unchanged.
    for line in sys.stdin:
        if is_wanted(line):
            sys.stdout.write(line)
3) randomchoice
import sys
import random


def pick_url(urls):
    """Return one element of *urls* chosen at random, or None if it is empty.

    random.choice is used instead of indexing with random.randint(0, len(urls)):
    randint includes its upper bound, so that index could fall one past the
    end of the list and raise IndexError.
    """
    if not urls:
        return None
    return random.choice(urls)


if __name__ == "__main__":
    # Each line read from stdin keeps its trailing newline.
    urls = list(sys.stdin)
    picked = pick_url(urls)
    if picked is None:
        sys.exit("randomchoice: no URLs received on stdin")
    sys.stdout.write(picked)
4) loop??