User:Laurier Rochon/work/tragicnine/soft
< User:Laurier Rochon | work/tragicnine
Revision as of 17:24, 5 May 2011 by Laurier Rochon (talk | contribs) (Created page with "== news.cgi == <source lang='python'> #!/usr/bin/python2.6 #-*- coding:utf-8 -*- #print "Content-Type: text/html" #print from __future__ import division import urllib2 import js...")
news.cgi
#!/usr/bin/python2.6
#-*- coding:utf-8 -*-
#print "Content-Type: text/html"
#print
from __future__ import division
import urllib2
import json
import html5lib, lxml, lxml.cssselect
from BeautifulSoup import BeautifulSoup, Comment
import random
import datetime
import narrator
import filters
# news.cgi main script: harvest news links for each search term, then crawl
# them in random order until one article yields enough clean sentences to be
# narrated and saved as today's episode.

# --- configuration and shared state ----------------------------------------
entries = []          # unique article URLs, in harvest order (shuffled later)
us = []               # URLs already seen; used to de-duplicate results
titles = []           # never filled in below; only printed as a debug aid
search = True         # stays True until one article has produced an episode
maxcrawl = 64         # paging stops once `start` reaches this value
terms = ['tragedy']   # search terms sent to the news search service
c = 0                 # number of unique links harvested so far

# --- step 1: harvest links --------------------------------------------------
for term in terms:
    start = 0
    while start < maxcrawl:
        url = ('https://ajax.googleapis.com/ajax/services/search/news?v=1.0&q='
               + term + '&start=' + str(start) + '&rsz=large')
        page = urllib2.urlopen(url)
        try:
            data = json.load(page)
        finally:
            page.close()
        response = data.get('responseData')
        if not response:
            # No payload for this page: stop paging this term instead of
            # crashing with a TypeError when indexing None.
            print('no results for ' + term + ' at start=' + str(start))
            break
        for r in response['results']:
            entry = r['unescapedUrl']
            if entry not in us:
                us.append(entry)
                entries.append(entry)
                c = c + 1
                # true division comes from the module's __future__ import
                percent = int(round((c / maxcrawl) * 100))
                print('harvesting links...' + str(percent) + '% of possible maximum')
        start += 8   # the loop advances by 8 results per request

print(titles)
print('----------------------------------------------------------------------')
print('done getting links - crawling them to find titles and content')
print('----------------------------------------------------------------------')

# --- step 2: crawl until one article can be narrated ------------------------
random.shuffle(entries)
# Debug aid: uncomment to run against a single known article.
#entries = ['http://www.thelantern.com/sports/commentary-sports-give-hope-escape-to-those-affected-by-tragedy-1.2218560']
for b in entries:
    if not search:
        # an episode has been written; nothing left to do
        break
    print('----------------------------------------------------------------------')
    print('crawling : ' + b)
    request = urllib2.Request(b)
    request.add_header("User-Agent", "Mozilla/5.0 (X11; U; Linux x86_64;fr; rv:1.9.1.5) Gecko/20091109 Ubuntu/9.10 (karmic) Firefox/3.5.5")

    # Download the page. On failure move on to the next link rather than
    # re-processing whatever the previous iteration left behind.
    try:
        page = urllib2.urlopen(request)
        try:
            q = page.read()
        finally:
            page.close()
    except Exception as e:
        print(e)
        continue

    # Keep only the text nodes that are not inside <script> or <style>.
    try:
        visible = BeautifulSoup(q).findAll(text=lambda text: text.parent.name != "script" and text.parent.name != "style")
        a = BeautifulSoup(''.join(visible))
    except Exception as e:
        print(e)
        continue

    try:
        body = str(a).split("\n")
        # check min size of our chunk
        article = filters.minsize(body)
        sentences = article.split('. ')
        print(sentences)
        cleartext = '\n\n' + str(filters.cleanup(sentences))
        clrstr = BeautifulSoup(cleartext, convertEntities=BeautifulSoup.HTML_ENTITIES).contents[0]
        listed = clrstr.split('.')
        if len(listed) > 10:
            search = False
            # the first fragment is skipped; the next nine are narrated
            l = ''.join(v + '. ' for v in listed[1:10])
            print('')
            t = narrator.narrate(l.encode('utf-8'))
            print(t)
            fname = 'public_html/talk_files/episodes/' + str(datetime.date.today())
            # `with` closes the file even if the write fails
            with open(fname, 'w') as out:
                out.write(t)
    except Exception as e:
        # best effort: report the problem and try the next article
        print(e)
</source>
== narrator.py ==
<source lang='python'>
import random
import re
import urllib
# Module-level state shared by the functions in narrator.py.
newd = []  # not referenced by any of the code shown in this module
activeppl = []  # narrate() appends the speaker it picks for every line
people = ['Mandy', 'Jules', 'Winnie']  # cast the dialogue is attributed to
def narrate(d):
    """Turn a block of prose into a short scripted conversation.

    The text is split on '.'; every fragment longer than 10 characters is
    cleaned up and becomes one quoted line spoken by a randomly chosen
    member of ``people``.  The result starts with a scene-setting sentence
    from ``situate``.

    Args:
        d: the source text.  The apostrophe clean-up goes through
           urllib.quote/unquote, so a byte string is expected (the caller
           in news.cgi passes UTF-8 encoded text).

    Returns:
        The script as a single string, one dialogue line per kept fragment.
    """
    lines = []
    previous = None      # speaker of the previously emitted line
    inserttime = False   # at most one time-passing interlude per script
    c = 0                # number of lines emitted so far

    for l in d.split('.'):
        if len(l) <= 10:
            continue
        ol = len(l)      # length before clean-up decides '?' versus '.'

        l = l.replace('"', '')
        # turn the UTF-8 right single quotation mark (E2 80 99) into a
        # plain apostrophe; everything else survives the round trip
        v = urllib.quote(l)
        v = v.replace("%E2%80%99", "%27")
        l = urllib.unquote(v)
        l = l.replace("''", "")

        # make sure the 1st character is always a letter; a fragment with
        # no letters at all is skipped instead of raising IndexError
        while l and not l[0].isalpha():
            l = l[1:]
        if not l:
            continue
        l = l[0].upper() + l[1:]

        # take out the 'he said' 'she said' blablabla
        l = re.sub(r',\s\bsaid\b.*', '', l)
        l = re.sub(r',.*\bsaid\b', '', l)

        # small sentence? make it a question!
        l = l + ('?' if ol < 30 else '.')

        person = people[rnd(len(people))]
        # kept so the module-level list still records every speaker
        activeppl.append(person)

        # NOTE: despite its name, `same` is True when the speaker CHANGES
        # (or on the very first line); that is when the line is attributed.
        same = previous is None or person != previous
        previous = person

        quoted = '"' + l.strip(' \t\n\r') + '"'
        if rnd(2) == 0:
            # attribution after the quote (empty when the speaker continues)
            l = quoted + str(postverb(person, same))
        elif same:
            l = str(person) + ' : ' + quoted
        else:
            l = quoted

        # a blank line separates one speaker's lines from the next speaker's
        if same:
            l = '\n' + l

        # once, after the fourth line, a continuing speaker's line is
        # replaced by a stage direction about time passing
        if c > 3 and not same and not inserttime:
            l = '\n' + str(givemetime())
            inserttime = True

        lines.append(str(l) + '\n')
        c = c + 1

    return situate('') + '\n' + ''.join(lines)
def postverb(person, same):
    """Return the attribution that follows a quoted line.

    When ``same`` is false the result is an empty string.  Otherwise one of
    three layouts is picked at random: ' <person> <verb>',
    ' <verb> <person>' or ' - <person>'.
    """
    if not same:
        return ''

    layout = rnd(3)
    verbs = ['said', 'offered', 'affirmed', 'mumbled', 'voiced', 'said', 'declared', 'suggested']
    if layout not in (0, 1):
        return ' - ' + str(person)

    verb = str(verbs[rnd(len(verbs))])
    if layout == 0:
        return ' ' + str(person) + ' ' + verb
    return ' ' + verb + ' ' + str(person)
def givemetime():
    """Return a randomly chosen stage direction marking the passage of time."""
    interludes = [
        '(a few minutes later)',
        'long hours pass, eventually they agree to continue the conversation...',
        'the sun goes down slowly, as they finish their conversation',
        # spelling fixed: the emitted text used to read 'slghtly'
        'the air gets slightly heavier, weighing on everyone present',
        'a distant ringing is heard in the distance...',
        'half an hour later',
        'the next day...',
        'suddenly something springs up...',
    ]
    return random.choice(interludes)
def context(param):
    """Return one randomly chosen fragment of the scene-setting sentence.

    Args:
        param: 0 for who is acting, 1 for where they go, 2 for what they do.

    Returns:
        A string for params 0, 1 and 2; None for any other value.
    """
    if param == 0:
        # every pair of cast members, in the order (0,1), (0,2), (1,2), ...
        pairs = [str(first) + ' and ' + str(second)
                 for i, first in enumerate(people)
                 for second in people[i + 1:]]
        group = ['they', 'the gang', 'the friends', 'all of them'] + pairs
        return random.choice(group)
    if param == 1:
        # the empty entry means "no particular place"
        place = ['downstairs', 'upstairs', 'outside', 'to the lake', 'towards the frosted bay window', 'in the kitchen', 'into the hallway', 'in the office', 'on the porch', 'towards the bathroom', 'sit on the sofa in front of the tv', 'take a seat in the kitchen', 'to the table', 'grab a drink', '']
        return random.choice(place)
    if param == 2:
        action = [
            'ponder life for a short moment',
            # stray leading space removed
            'stare at each other, like they had never met before',
            'wait for someone to break the silence',
            str(random.choice(people)) + ' sighs, looking at the other two',
            # wording fixed: used to read 'they did could not comprehend'
            'act as if somehow, they could not comprehend what had just been said',
        ]
        return random.choice(action)
    # unknown selector: made explicit, callers still get None
    return None
def rnd(nm):
    """Return a random integer n such that 0 <= n < nm."""
    return random.randrange(nm)
def situate(line):
    """Build the scene-setting sentence that opens a script.

    The sentence has the shape '<who> go <where> and <what>', with each
    part drawn from ``context``.

    Args:
        line: not used; kept so existing callers keep working.

    Returns:
        The sentence as a string.
    """
    who = str(context(0)).strip()
    where = str(context(1)).strip()
    what = str(context(2)).strip()
    # Upper-case only the first letter: str.capitalize() would also
    # lower-case the rest and turn 'Mandy and Jules' into 'Mandy and jules'.
    who = who[:1].upper() + who[1:]
    # Skip empty parts (the place may be '') so no double space appears.
    pieces = [who, 'go', where, 'and', what]
    return ' '.join(piece for piece in pieces if piece)
filters.py
This file in particular needs massive reworking... sigh.
from __future__ import division
import nltk.util
import re
def minsize(body):
    """Keep the long lines of a page and normalise them into one string.

    Only lines longer than 100 characters are kept.  Each kept line has
    curly quotes, em dashes and straight double quotes removed, the period
    dropped from common titles, HTML stripped, and abbreviated month names
    expanded.  The periods are removed or expanded so that they are not
    mistaken for sentence ends later on.

    Args:
        body: iterable of text lines (byte strings in the existing caller).

    Returns:
        The kept lines concatenated with no separator; '' if none qualify.
    """
    # applied before HTML stripping, in this order
    before_html = (
        ("\xe2\x80\x9c", ""),   # UTF-8 bytes of the left curly double quote
        ("\xe2\x80\x94", ""),   # UTF-8 bytes of the em dash
        ("\xe2\x80\x9d", ""),   # UTF-8 bytes of the right curly double quote
        ("Mrs.", "Mrs"),
        ("Ms.", "Ms"),
        ("Mr.", "Mr"),
        ("Dr.", "Dr"),
        ('"', ''),
    )
    # applied after HTML stripping, in this order
    months = (
        ("Jan.", "January"),
        ("Feb.", "February"),
        ("Mar.", "March"),
        ("Apr.", "April"),
        ("Jun.", "June"),
        ("Jul.", "July"),
        ("Aug.", "August"),
        ("Sep.", "September"),   # spelling fixed: used to be 'Septembre'
        ("Oct.", "October"),
        ("Nov.", "November"),
        ("Dec.", "December"),
    )

    kept = []
    for line in body:
        # only lines of more than 100 chars are treated as article text
        if len(line) <= 100:
            continue
        for old, new in before_html:
            line = line.replace(old, new)
        # NOTE(review): clean_html may be missing from newer NLTK releases;
        # confirm against the installed version.
        line = nltk.util.clean_html(line)
        for old, new in months:
            line = line.replace(old, new)
        kept.append(line)
    return ''.join(kept)
def cleanup(sentences):
    """Drop sentences that look like page chrome and join the rest.

    A sentence is rejected when it contains a blacklisted substring, has a
    word longer than 25 characters, or when more than 40% (after rounding)
    of its space-separated words start with a capital letter.  Before the
    checks, every period that is not followed by two letters is removed.

    Args:
        sentences: iterable of sentence strings.

    Returns:
        The surviving sentences, each followed by '. ', as one string.
    """
    # The comma between 'ET' and 'DOCTYPE' was missing, so the two literals
    # merged into 'ETDOCTYPE' and neither was filtered.
    blacklist = ['|', 'Internet Options', 'IE', 'comment', 'ENCODING', 'rss', 'Login', 'article', '.com', 'Favorites', 'Advertisement', 'PM', 'AM', 'ET', 'DOCTYPE', 'login', 'password', 'loading', 'Loading', 'form', 'Stories', '/', 'Buy', 'buy', 'Comment', 'News', 'Feed']
    kept = []
    for sent in sentences:
        sent = re.sub(r'\.(?![a-zA-Z]{2})', '', sent)

        # do you contain blacklist elements (passed the pre-html filter)?
        if any(x in sent for x in blacklist):
            continue

        words = sent.split(' ')
        # words that are too long (over 25 chars) are probably not prose
        if any(len(ww) > 25 for ww in words):
            continue

        # too many capital words (over 40%) = probably up to no good;
        # empty strings from repeated spaces still count as words
        capitals = sum(1 for ww in words if ww and ww[0].isupper())
        # float() keeps this true division with or without the module's
        # __future__ import
        if round(float(capitals) / len(words) * 100) > 40:
            continue

        # if filters above are passed...guess you're ok to join the bunch
        kept.append(str(sent) + '. ')
    return ''.join(kept)