User:Laurier Rochon/prototyping/??????????soft
== Tragedy generator - v0.1 ==

Searches Google News for 'tragedy' (geo-targeted to China), crawls the resulting links and boils each page down to its heading plus a handful of filtered sentences.

<source lang="python">
#!/usr/bin/python2.6
#-*- coding:utf-8 -*-

#print "Content-Type: text/html"
#print

from __future__ import division
import urllib2
from BeautifulSoup import BeautifulSoup, Comment
import json
import os
import fileinput
import random
import safe_html
import nltk.util

terms = ['tragedy']
entries = []
us = []
titles = 10
content = 50000
maxcrawl = 64
blacklist = ['|','comment','ENCODING','Login','.com','Favorites','DOCTYPE','login','password','loading']

live = 1

if live == 0:
    # offline mode : work from pages saved earlier in items/
    for a in range(40):
        fn = 'items/item'+str(a)
        #fn = 'items/item'+str(random.randrange(40))
        c = ''
        for line in fileinput.input(fn):
            c = c+line
        soup = BeautifulSoup(safe_html.safe_html(c))
        #soup = BeautifulSoup(''.join(BeautifulSoup(c).findAll(text=True)))
        heading = soup.h1
        if heading:
            heading = str(heading.renderContents())
            print heading.strip(" \t\n\r")
        a = BeautifulSoup(''.join(BeautifulSoup(safe_html.safe_html(c)).findAll(text=lambda text:text.parent.name != "script" and text.parent.name != "style")))
        cleartext = ''
        a = str(a)
        body = a.split("\n")
        for line in body:
            if len(line) > 250:
                line = line.replace("&nbsp;"," ")  # strip '&nbsp;' entities (assumed replace target)
                cleartext = cleartext+'\n\n'+line
        print cleartext
        print '---------------------------------'
else:
    # live mode : harvest links from the Google News search API
    c = 0
    for term in terms:
        start = 0
        while start < maxcrawl:
            url = ('https://ajax.googleapis.com/ajax/services/search/news?v=1.0&q='+term+'&start='+str(start)+'&rsz=large&geo=china')
            f = urllib2.urlopen(url)
            data = json.load(f)
            nb = data['responseData']['cursor']['estimatedResultCount']
            if nb > maxcrawl:
                nb = maxcrawl
            for r in data['responseData']['results']:
                entry = r['unescapedUrl']
                if entry not in us:
                    us.append(entry)
                    #print entry
                    entries.append(entry)
                    c = c+1
                    percent = int(round((c/maxcrawl)*100))
                    print 'harvesting links...'+str(percent)+'% of possible maximum'
            start += 8

    print '----------------------------------------------------------------------'
    print 'done getting links - crawling them to find titles and content'
    print '----------------------------------------------------------------------'

    p = 0
    w = 0
    #random.shuffle(entries)
    for b in entries:
        if w < content:
            print '----------------------------------------------------------------------'
            print 'crawling : '+b
            request = urllib2.Request(b)
            request.add_header("User-Agent", "Mozilla/5.0 (X11; U; Linux x86_64;fr; rv:1.9.1.5) Gecko/20091109 Ubuntu/9.10 (karmic) Firefox/3.5.5")
            try:
                f = urllib2.urlopen(request)
                q = f.read()
            except Exception, e:
                print 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
                print 'urllib error ---> %s' % (e)
                print 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
            try:
                c = ''
                for line in q:
                    c = c+line
                soup = BeautifulSoup(safe_html.safe_html(c))
                #find headings
                heading = soup.h1
                if heading:
                    heading = str(heading.renderContents())
                    heading = nltk.util.clean_html(heading.strip(" \t\n\r"))
                a = BeautifulSoup(''.join(BeautifulSoup(safe_html.safe_html(c)).findAll(text=lambda text:text.parent.name != "script" and text.parent.name != "style")))
            except Exception, e:
                print 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
                print 'beautifulsoup error ---> %s ' % (e)
                print 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'

            cleartext = ''
            a = str(a)
            body = a.split("\n")
            article = ''
            #go line by line
            for line in body:
                #are you at least 250 chars?
                if len(line) > 250:
                    line = line.replace("&nbsp;"," ")  # strip '&nbsp;' entities (assumed replace target)
                    line = nltk.util.clean_html(line)
                    article = article+line

            sentences = article.split('. ')
            #give 15 sentences max per article...this is shaky
            if len(sentences) > 14:
                sentences = sentences[:14]
            sents = ''
            for sent in sentences:
                found = False
                #do you contain blacklist elements (passed the pre-html filter)?
                for x in blacklist:
                    if x in sent:
                        found = True
                        break
                #too many capital words (over 40%) = probably up to no good
                capitals = 0
                words = sent.split(' ')
                if len(words) > 0:
                    for ww in words:
                        if ww:
                            if ww[0].isupper():
                                capitals = capitals+1
                    if round(capitals/len(words)*100) > 40:
                        found = True
                #if filters above are passed...guess you're ok to join the bunch
                if not found:
                    sents = sents+str(sent)+'. '
            cleartext = cleartext+'\n\n'+str(sents)

            #make things more readable for humans -> this converts HTML entities
            #clrstr = BeautifulSoup(cleartext,convertEntities=BeautifulSoup.HTML_ENTITIES).contents[0]
            clrstr = cleartext

            #this shouldn't be needed...somehow small fragments still make their way down here
            if len(clrstr) > 200:
                if heading:
                    print
                    print heading
                print clrstr
                w = w+len(cleartext)
                per = w/content*100
                print
                print 'found new content : '+str(per)+'%'
        p = p+1

    print
    print 'done crawling, we have enough content now'
</source>
== Links scraper /w API ==
Scraping blog urls, checking them against the current archive and storing them in a tab-separated file. Adding a cron job then lets the results pile up, running the scrape every hour.
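For instance, a crontab entry along these lines runs the scrape at the top of every hour; the working directory, script name and log file here are placeholders, not taken from the project:

<source lang="bash">
# hypothetical crontab line : run the link scraper hourly, appending its output to a log
0 * * * * cd /path/to/prototyping && /usr/bin/python2.6 scrape_links.py >> scrape.log 2>&1
</source>

The scraper itself: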
<source lang="python">
#!/usr/bin/python2.6
#-*- coding:utf-8 -*-

print "Content-Type: text/html"
print

import urllib2
import json
from datetime import date
import os

txt = 'blogs'
start = 0
scrapedate = date.today()
entries = []
urllist = []

# load the urls we already have, so we don't store duplicates
if not os.path.exists(txt):
    f = open(txt,'w')
    f.close()
else:
    f = open(txt,'r')
    data = f.read()
    if len(data) > 0:
        urls = data.split('\n')
        for a in urls:
            line = a.split('\t')
            if len(line) > 1:
                urllist.append(line[2])

# pull up to 64 results, 8 at a time, from the Google blog search API
c = 0
while start < 64:
    url = ('https://ajax.googleapis.com/ajax/services/search/blogs?v=1.0&q=myself&start='+str(start)+'&rsz=large')
    f = urllib2.urlopen(url)
    data = json.load(f)
    for r in data['responseData']['results']:
        if r['postUrl'] not in urllist:
            entry = "%s\t%s\t%s\t%s\t%s\t%s" % (scrapedate, r['title'], r['postUrl'], r['publishedDate'], r['blogUrl'], r['author'])
            entry = entry.encode("utf-8")
            entries.append(entry)
            c = c+1
    start += 8

print 'added %s entries' % (c)

# append the new entries to the tab-separated file
se = '\n'.join(map(str, entries))
f = open(txt,'a')
if c > 0:
    f.write(se)
    f.write('\n')
f.close()

execfile("spider.py")
</source>
== Verbose blog page scraper : python + beautifulsoup ==

- - - updated 22nd feb 2011
- - - updated 19th feb 2011

Reads the tab-separated 'blogs' file written by the scraper above, extracts question sentences from each post and appends the new ones to a 'questions' file; the processed urls are moved to an 'archive' file and the 'blogs' file is emptied.
<source lang="python">
#!/usr/bin/python2.6

import os
import urllib2
from BeautifulSoup import BeautifulSoup
import re

blogfile = 'blogs'
qfile = 'questions'
archive = 'archive'
forbiddenlist = ['comment','wordpress','related','categories','dtd','w3','archives','tags','admin','php','twitter','subscribe','articles','.com','says:','linkback','post','password','statcounter','class=']

# questions we already have, used to skip duplicates
if os.path.exists(qfile):
    ff = open(qfile,'r')
    pureq = ff.read()
else:
    pureq = ''

f = open(blogfile,'r')
data = f.read()
f.close()
rows = data.split('\n')

#put the text file in a 2D array, I imagine this will save me some work later
a = []
c = 0
for row in rows:
    items = row.split('\t')
    if len(items) > 1:
        a.append([])
        for item in items:
            a[c].append(item)
        c = c+1

p = 1
qs = ""
for element in a:
    print '----------------------------------------------------------------------'
    print 'scraping link %d of %d' % (p,len(a))
    print '----------------------------------------------------------------------'
    target = element[2]
    request = urllib2.Request(target)
    request.add_header("User-Agent", "Mozilla/5.0 (X11; U; Linux x86_64;fr; rv:1.9.1.5) Gecko/20091109 Ubuntu/9.10 (karmic) Firefox/3.5.5")
    try:
        f = urllib2.urlopen(request)
        c = f.read()
    except Exception, e:
        print 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
        print 'urllib error ---> %s' % (e)
        print 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
    try:
        # keep only the text nodes, dropping scripts and stylesheets
        soup = BeautifulSoup(''.join(BeautifulSoup(c).findAll(text=lambda text:text.parent.name != "script" and text.parent.name != "style")))
        cleartext = ""
        for line in soup:
            if len(line) > 10:
                line = line.replace("\n"," ")
                line = line.replace("&nbsp;","")  # strip '&nbsp;' entities (assumed replace target)
                cleartext = cleartext + line

        #starts = [match.start() for match in re.finditer(re.escape('?'), cleartext)]
        l = re.compile('[!?.]')
        it = l.finditer(cleartext)
        k = 0
        positions = []
        for match in it:
            n = match.start()
            positions.append(n)
            #start cleaning
            if cleartext[n] == '?' and k > 1:
                e = cleartext[positions[k-3]+1:n+1].strip(" \t\n\r")
                if len(e) > 10 and len(e) < 600:
                    f = True
                    for m in forbiddenlist:
                        if m in e.lower():
                            f = False
                    if f:
                        if e[0].islower() or e[0] == ' ' or not e[0].isalpha():
                            e = '...'+e
                        #end cleaning
                        if e.encode("utf-8") not in pureq:
                            qs = qs+element[0]+'\t'+element[1]+'\t'+element[2]+'\t'+element[3]+'\t'+element[4]+'\t'+element[5]+'\t'+e+'\n'
                            print e
                            print
                        else:
                            print "duplicate...this question already exists"
            k = k+1
    except Exception, e:
        print 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
        print 'beautifulsoup error ---> %s ' % (e)
        print 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
    p = p+1

#dump clean questions to questions file
if not os.path.exists(qfile):
    f = open(qfile,'w')
else:
    f = open(qfile,'a')
if len(qs) > 1:
    f.write(qs.encode("utf-8"))
f.close()

#dump old urls to archive
if not os.path.exists(archive):
    f = open(archive,'w')
else:
    f = open(archive,'a')
f.write(data)
f.close()

#clean scraped urls file
f = open(blogfile,'w')
f.write('')
f.close()
</source>