Spider

See also [[Web Spider in Python]]
An attempt to write a spider which 1) prints all URLs for a desired web page, 2) excludes/includes certain URLs in a filter script, 3) picks one of the filtered URLs and 4) eventually sends it back to 1) in a continuous loop (see the piping example at the end).
'''1)''' Spider
<source lang="python">
import sys, urllib, lxml.html
from urlparse import urljoin, urldefrag

def visit(url, depth=1):
    global visited
    print url
    # remember we visited
    visited[url] = True
    if depth >= MAX_DEPTH: return
    connection = urllib.urlopen(url)

    dom = lxml.html.fromstring(connection.read())

    # select the urls in href for all a tags (links) and in src for all img tags
    for xpath in ['//a/@href', '//img/@src']:
        for link in dom.xpath(xpath):
            link = link.strip()
            # skip javascript: pseudo-links
            if link.lower().startswith("javascript"):
                continue
            # normalize url: make it absolute and drop any #fragment
            link = urljoin(url, link)
            link = urldefrag(link)[0]
            # strip trailing /
            link = link.rstrip('/')
            # to stay on one site, PREFIX can be used as a filter:
            # if (link not in visited) and link.startswith(PREFIX) and depth < MAX_DEPTH:
            if (link not in visited) and depth < MAX_DEPTH:
                visit(link, depth + 1)

MAX_DEPTH = 2
visited = {}
starturl = sys.argv[1]
try:
    PREFIX = sys.argv[2]
except IndexError:
    PREFIX = starturl
visit(starturl)
</source>
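To run the spider on its own, pass a start URL (and optionally a prefix, which the commented-out `startswith(PREFIX)` check would use). Assuming the script is saved as spider04.py, the name used in the piping example at the end:

<source lang="bash">
python spider04.py http://tatteredcorners.tumblr.com/post/15141435895
</source>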
'''2)''' filter
<source lang="python">
import sys

# examples of what you want to exclude:
exclude = ['.jpg', '.png', '.gif']
# examples of what you want to include:
include = ['www.']

for line in sys.stdin:
    # keep only lines that contain none of the exclude patterns
    # and at least one of the include patterns
    if not any(e in line for e in exclude) and any(i in line for i in include):
        sys.stdout.write(line)
</source>
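The filter can also be tried by hand, without the spider. This assumes the script is saved as tumblrfilter.py (the name from the piping example below); the URL here is just an illustration:

<source lang="bash">
echo "http://www.example.com/photo.html" | python tumblrfilter.py
# the line is printed: it contains 'www.' and none of the excluded extensions
</source>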
'''3)''' randomchoice
<source lang="python">
import sys, random

# collect all urls from stdin
urls = []
for line in sys.stdin:
    urls.append(line)

# pick a random index; randrange excludes the upper bound, so the index
# always stays inside the list (randint(0, len(urls)) could run one past
# the end); random.choice(urls) would do the same in one step
randPick = random.randrange(len(urls))
sys.stdout.write(urls[randPick])
</source>
'''4)''' loop??
Currently I'm working on the loop; a rough sketch follows the piping example below.
Example of how the [[Pipelines | piping]] works (the link leads to a nice article on piping):
<source lang="bash">
python spider04.py http://tatteredcorners.tumblr.com/post/15141435895 | python tumblrfilter.py | python randomc.py > pickedurl.txt
</source>
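One possible way to close the loop is a small shell script that feeds the picked URL back into the spider. This is only a sketch reusing the pipeline above, not a finished solution: if the filter rejects every URL on a page, $URL ends up empty and the loop breaks.

<source lang="bash">
#!/bin/sh
# step 4, sketched: start from a seed url, then keep feeding
# the randomly picked url back into the spider
URL=http://tatteredcorners.tumblr.com/post/15141435895
while true; do
    URL=$(python spider04.py "$URL" | python tumblrfilter.py | python randomc.py)
    echo "picked: $URL"
done
</source>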
