User:Laurier Rochon/prototyping/??????????soft
Revision as of 17:48, 22 February 2011
Links scraper w/ API
Scrapes blog URLs, checks them against the current archive and stores them in a tab-separated file.
Adding a cron job then lets the results pile up, running the scrape every hour; a sample crontab entry is sketched below.
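A minimal crontab sketch for that hourly run, assuming the scraper below is saved as scraper.py; the filename, path and log file are placeholders, not taken from the original:

# run the blog-links scraper at the top of every hour (paths are hypothetical)
0 * * * * cd /home/user/prototyping && /usr/bin/python2.6 scraper.py >> scrape.log 2>&1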
#!/usr/bin/python2.6
# -*- coding: utf-8 -*-

# CGI header, so the script can also be called over the web
print "Content-Type: text/html"
print

import urllib2
import json
from datetime import date
import os

txt = 'blogs'
start = 0
scrapedate = date.today()
entries = []
urllist = []

# create the blogs file on the first run; otherwise load the URLs already stored
if not os.path.exists(txt):
    f = open(txt, 'w')
    f.close()
else:
    f = open(txt, 'r')
    data = f.read()
    if len(data) > 0:
        urls = data.split('\n')
        for a in urls:
            line = a.split('\t')
            if len(line) > 1:
                urllist.append(line[2])

c = 0
# page through the Google blog search results, 8 at a time, up to 64
while start < 64:
    url = ('https://ajax.googleapis.com/ajax/services/search/blogs?v=1.0&q=myself&start=' + str(start) + '&rsz=large')
    f = urllib2.urlopen(url)
    data = json.load(f)
    for r in data['responseData']['results']:
        # keep only posts that are not already in the file
        if r['postUrl'] not in urllist:
            entry = "%s\t%s\t%s\t%s\t%s\t%s" % (scrapedate, r['title'], r['postUrl'], r['publishedDate'], r['blogUrl'], r['author'])
            entry = entry.encode("utf-8")
            entries.append(entry)
            c = c + 1
    start += 8

print 'added %s entries' % (c)

# append the new tab-separated entries to the blogs file
se = '\n'.join(map(str, entries))
f = open(txt, 'a')
if c > 0:
    f.write(se)
    f.write('\n')
f.close()

# chain into spider.py, presumably the page scraper below
execfile("spider.py")
Verbose blog page scraper: Python + BeautifulSoup
- - - updated 22nd feb 2011
- - - updated 19th feb 2011
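The heart of the script below is the question extraction: it records the position of every '.', '!' and '?' in the cleaned page text, and each time it reaches a '?' it slices out the stretch running back three sentence boundaries, so the question keeps a couple of sentences of context. A stripped-down sketch of just that step, run on a made-up sample string:

import re

cleartext = "First sentence. Second sentence. Third sentence. But what was I thinking? More text."
positions = []
k = 0
for match in re.finditer('[!?.]', cleartext):
    n = match.start()
    positions.append(n)
    # on a '?', take everything from three boundaries back up to and including it
    if cleartext[n] == '?' and k > 1:
        question = cleartext[positions[k-3]+1:n+1].strip(" \t\n\r")
        print question
    k = k + 1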
#!/usr/bin/python2.6
import os
import urllib2
from BeautifulSoup import BeautifulSoup
import re

blogfile = 'blogs'
qfile = 'questions'
archive = 'archive'
forbiddenlist = ['comment','wordpress','related','categories','dtd','w3','archives','tags','admin','php','twitter','subscribe','articles','.com','says:','linkback','post','password','statcounter','class=']

# load the questions already collected, so duplicates can be skipped
if os.path.exists(qfile):
    ff = open(qfile,'r')
    pureq = ff.read()
else:
    pureq = ''

f = open(blogfile,'r')
data = f.read()
f.close()
rows = data.split('\n')

#put the text file in a 2D array, I imagine this will save me some work later
a = []
c = 0
for row in rows:
    items = row.split('\t')
    if len(items) > 1:
        a.append([])
        for item in items:
            a[c].append(item)
        c = c+1

p = 1
qs = ""
for element in a:
    print '----------------------------------------------------------------------'
    print 'scraping link %d of %d' % (p,len(a))
    print '----------------------------------------------------------------------'
    target = element[2]
    request = urllib2.Request(target)
    request.add_header("User-Agent", "Mozilla/5.0 (X11; U; Linux x86_64;fr; rv:1.9.1.5) Gecko/20091109 Ubuntu/9.10 (karmic) Firefox/3.5.5")
    try:
        f = urllib2.urlopen(request)
        c = f.read()
    except Exception, e:
        print 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
        print 'urllib error ---> %s' % (e)
        print 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
        c = ''   # don't re-parse the previous page if this fetch failed
    try:
        # keep only the text that is not inside <script> or <style> tags
        soup = BeautifulSoup(''.join(BeautifulSoup(c).findAll(text=lambda text:text.parent.name != "script" and text.parent.name != "style")))
        cleartext = ""
        for line in soup:
            if len(line) > 10:
                line = line.replace("\n"," ")
                line = line.replace(" ","")
                cleartext = cleartext + line
        #starts = [match.start() for match in re.finditer(re.escape('?'), cleartext)]
        # index every sentence boundary, then slice out the stretch ending in each '?'
        l = re.compile('[!?.]')
        it = l.finditer(cleartext)
        k = 0
        positions = []
        for match in it:
            n = match.start()
            positions.append(n)
            #start cleaning
            if cleartext[n] == '?' and k > 1:
                e = cleartext[positions[k-3]+1:n+1].strip(" \t\n\r")
                if len(e) > 10 and len(e) < 600:
                    f = True
                    for m in forbiddenlist:
                        if m in e.lower():
                            f = False
                    if f:
                        if e[0].islower() or e[0] == ' ' or not e[0].isalpha():
                            e = '...'+e
                        #end cleaning
                        if e.encode("utf-8") not in pureq:
                            qs = qs+element[0]+'\t'+element[1]+'\t'+element[2]+'\t'+element[3]+'\t'+element[4]+'\t'+element[5]+'\t'+e+'\n'
                            print e
                            print
                        else:
                            print "duplicate...this question already exists"
            k = k+1
    except Exception, e:
        print 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
        print 'beautifulsoup error ---> %s ' % (e)
        print 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
    p = p+1

#dump clean questions to questions file
if not os.path.exists(qfile):
    f = open(qfile,'w')
else:
    f = open(qfile,'a')
if len(qs) > 1:
    f.write(qs.encode("utf-8"))
f.close()

#dump old urls to archive
if not os.path.exists(archive):
    f = open(archive,'w')
else:
    f = open(archive,'a')
f.write(data)
f.close()

#clean scraped urls file
f = open(blogfile,'w')
f.write('')
f.close()
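Each line of the resulting questions file carries the six fields copied over from the blogs file plus the extracted question as a seventh tab-separated column. A minimal sketch for inspecting it, assuming the file already exists; the snippet is an illustration, not part of the original scripts:

# print extracted questions together with the post URL they came from (illustration only)
for line in open('questions').read().decode('utf-8').split('\n'):
    cols = line.split('\t')
    if len(cols) == 7:
        print (u'%s  (from %s)' % (cols[6], cols[2])).encode('utf-8')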