User:Laurier Rochon/work/tragicnine/soft

From XPUB & Lens-Based wiki

news.cgi

#!/usr/bin/python2.6
#-*- coding:utf-8 -*-
#print "Content-Type: text/html"
#print
from __future__ import division
import urllib2
import json
import html5lib, lxml, lxml.cssselect
from BeautifulSoup import BeautifulSoup, Comment
import random
import datetime

import narrator
import filters

# Harvested article URLs, in discovery order (shuffled later).
entries = []
# Seen-URL list used for de-duplication (membership-tested below).
us = []
# NOTE(review): `titles` is never appended to anywhere in this script,
# so the `print titles` below always shows an empty list — confirm intent.
titles = []

# Gates the crawl loop further down; flipped off once one article
# yields a usable episode.
search = True
# Hard cap on links harvested per search term.
maxcrawl = 64
# Search terms sent to the Google AJAX news-search API.
terms = ['tragedy']
# Running count of unique links, used only for the progress percentage.
c = 0

for term in terms:
		start=0
		# Page through the Google AJAX news-search API, 8 results a page
		# ('rsz=large'), until the cap is reached.
		while start<maxcrawl:
			url = ('https://ajax.googleapis.com/ajax/services/search/news?v=1.0&q='+term+'&start='+ str (start)+'&rsz=large')
		 
			f = urllib2.urlopen(url)
			data = json.load(f)
			# NOTE(review): estimatedResultCount is a JSON *string*; in
			# Python 2 a str always compares greater than an int, so this
			# clamp always fires — and `nb` is never read afterwards.
			nb = data['responseData']['cursor']['estimatedResultCount']
			if nb>maxcrawl:
				nb=maxcrawl
			for r in data['responseData']['results']:
				entry = r['unescapedUrl']
				# Keep only URLs we have not seen yet.
				if entry not in us:
					us.append(entry)
					entries.append(entry)
					c = c+1
					# True division is active (__future__ import above).
					percent = int (round((c/maxcrawl)*100))
					print 'harvesting links...'+str (percent)+'% of possible maximum'
			start += 8
print titles
print '----------------------------------------------------------------------'
print 'done getting links - crawling them to find titles and content'
print '----------------------------------------------------------------------'

# Randomise crawl order so a different article can lead each run.
random.shuffle(entries)

#entries = ['http://www.thelantern.com/sports/commentary-sports-give-hope-escape-to-those-affected-by-tragedy-1.2218560']
for b in entries:
	# Once one article produces an episode, `search` goes False and the
	# remaining URLs are skipped (the loop itself keeps iterating).
	if search:
		print '----------------------------------------------------------------------'
		print 'crawling : '+b
		# Spoof a desktop Firefox UA — some news sites block default
		# urllib2 requests.
		request = urllib2.Request(b)
		request.add_header("User-Agent", "Mozilla/5.0 (X11; U; Linux x86_64;fr; rv:1.9.1.5) Gecko/20091109 Ubuntu/9.10 (karmic) Firefox/3.5.5")


		try:
			f=urllib2.urlopen(request)
			q = f.read()

		# NOTE(review): on fetch failure this only prints, then the next
		# try-block reads `q` anyway — NameError on the first failure, or
		# silently reuses the PREVIOUS page's content on later ones.
		except Exception, e:
			print e



		try:
			# NOTE(review): `q` is a string, so this iterates character by
			# character — a quadratic no-op copy of q into cc.
			cc=''
			for line in q:
				cc=cc+line
			# Strip <script>/<style> text nodes, then re-parse the result.
			a = BeautifulSoup(''.join(BeautifulSoup(cc).findAll(text=lambda text:text.parent.name != "script" and text.parent.name != "style")))

		except Exception, e:
			print e



		try:
			a = str (a)
			body = a.split("\n")
	
			#check min size of our chunk
			article = filters.minsize(body)
			sentences = article.split('. ')
			print sentences
			cleartext = ''
			cleartext = cleartext +'\n\n'+ str (filters.cleanup(sentences))
			# Decode HTML entities (&amp; etc.) via BeautifulSoup 3.
			clrstr = BeautifulSoup(cleartext,convertEntities=BeautifulSoup.HTML_ENTITIES).contents[0]
			listed = clrstr.split('.')
			# Need at least 11 sentence fragments to build an episode.
			if len(listed)>10:
				search = False
				l = ''
				# Fragments 1..9 only; fragment 0 is the '\n\n' padding.
				for v in listed[1:10]:
					l = l + v + '. '
				print 
				t = narrator.narrate(l.encode('utf-8'))
				print t
				# One episode file per day, named by ISO date.
				fname = 'public_html/talk_files/episodes/'+str (datetime.date.today())
				f = open(fname, 'w')
				f.write(t)
				f.close()

		except Exception, e:
			print e

narrator.py

import random
import re
import urllib

# NOTE(review): `newd` is never used in this module — confirm before removal.
newd = []
# History of speakers; narrate() appends to it and it persists across calls.
activeppl = []
# The cast of characters dialogue lines are attributed to.
people = ['Mandy','Jules','Winnie']

def narrate(d):
	"""Turn a blob of article text `d` into a short dialogue script.

	`d` is split on '.'; each fragment longer than 10 chars is cleaned up,
	quoted, and attributed to a randomly chosen character from the
	module-level `people` list. Returns the assembled script as a string,
	prefixed with a scene-setting line from situate().
	"""
	c=0
	script = ''
	data = d.split('.')
	# NOTE(review): occ/lined feed only the commented-out context insert
	# near the bottom — currently unused.
	occ = rnd(3)
	lined = rnd(10)
	inserttime = False

	for l in data:
		if l and len(l)>10:
			# Original (pre-cleanup) length decides question vs statement.
			ol = len(l)
			person = people[rnd(3)]
			# NOTE(review): activeppl persists across calls while `c`
			# resets, so activeppl[c-1] below mis-indexes from the second
			# narrate() call onward — confirm intent.
			activeppl.append(person)
			
			l = l.replace('"','')
			#replace 2 non-ascii quotation marks by nothing
			# Round-trip through percent-encoding to swap the UTF-8 curly
			# apostrophe (%E2%80%99) for an ASCII one (%27).
			v = urllib.quote(l)
			v = v.replace("%E2%80%99","%27")
			l = urllib.unquote(v)
			l = l.replace("''","")

			#make sure the 1st character is always a letter
			# NOTE(review): raises IndexError if a fragment contains no
			# letters at all — nothing catches it here.
			while not l[0].isalpha():
				l = l[1:]
			try:
				if not l[0].isupper():
					# count=1 replaces the first occurrence of that char,
					# which is position 0 here.
					l = l.replace(l[0],l[0].upper(),1)
			except Exception, e:
				print e

			#take out the 'he said' 'she said' blablabla
			l = re.sub(r',\s\bsaid\b.*','',l)
			l = re.sub(r',.*\bsaid\b','',l)

			#small sentence? make it a question!
			if ol < 30:
				l = l + '?'
			else:
				l = l + '.'
			
			#check if it's the same person talking
			# NOTE(review): `same` ends up True when the speaker DIFFERS
			# from the previous one (and on the first line) — the name
			# reads inverted; confirm intent before changing.
			if c>0 and person != activeppl[c-1]:
				same = True
			else:
				same = False
			if c==0:
				same = True

			# Coin flip: attribution after the quote, or 'Name : ' before.
			before = rnd(2)
			if before==0:
				l = '"'+l.strip(' \t\n\r')+'"' +str (postverb(person,same))
			else:
				if same:
					add = str (person) + ' : '
				else:
					add = ''
				l = add+'"'+l.strip(' \t\n\r')+'"'

			#group many lines by same person together
			if same:
				l = '\n' + l

			# NOTE(review): this REPLACES the prepared line with a time
			# transition, discarding the quote built above — confirm that
			# dropping one line of dialogue is intended.
			if c>3 and not same and not inserttime:
				l = '\n'+str (givemetime())
				inserttime = True

			#give a bit of context...
			#if occ!=0 and lined==c:
			script = str (script) + str (l)+'\n'
			#script = script.replace('\n\n\n','\n\n')
			c = c + 1
	z = situate('')
	script = z +'\n'+script
	return script

def postverb(person,same):
	"""Return a speech-attribution tag for `person`, or '' when `same`
	is falsy.

	Picks one of three shapes at random: 'Name verb', 'verb Name', or
	' - Name'. Consumes random state only when `same` is truthy.
	"""
	if not same:
		return ''
	shape = rnd(3)
	verbs = ['said','offered','affirmed','mumbled','voiced','said','declared','suggested']
	if shape == 0:
		return ' ' + str(person) + ' ' + str(verbs[rnd(len(verbs))])
	if shape == 1:
		return ' ' + str(verbs[rnd(len(verbs))]) + ' ' + str(person)
	return ' - ' + str(person)

def givemetime():
	"""Return a random time-transition line to insert between dialogue.

	FIX: corrected the output typo 'slghtly' -> 'slightly'.
	"""
	time = ['(a few minutes later)','long hours pass, eventually they agree to continue the conversation...','the sun goes down slowly, as they finish their conversation','the air gets slightly heavier, weighing on everyone present','a distant ringing is heard in the distance...','half an hour later','the next day...','suddenly something springs up...']
	return time[rnd(len(time))]

def context(param):
	"""Return a random scene fragment of the requested kind.

	param 0 -> a group of speakers ('they', 'Mandy and  Jules', ...)
	param 1 -> a place/direction ('downstairs', 'to the lake', ...)
	param 2 -> an action ('ponder life for a short moment', ...)
	Any other value falls through and returns None (unchanged behaviour).

	FIX: corrected the output grammar bug 'they did could not comprehend'
	-> 'they could not comprehend'.
	"""
	if param==0:
		g1 = str(people[0]) + ' and  ' + str(people[1])
		g2 = str(people[0]) + ' and  ' + str(people[2])
		g3 = str(people[1]) + ' and  ' + str(people[2])
		group = ['they','the gang','the friends','all of them',g1,g2,g3]
		return group[rnd(len(group))]

	if param==1:
		place = ['downstairs','upstairs','outside','to the lake','towards the frosted bay window','in the kitchen','into the hallway','in the office', 'on the porch','towards the bathroom','sit on the sofa in front of the tv','take a seat in the kitchen','to the table','grab a drink','']
		return place[rnd(len(place))]

	if param==2:
		action = ['ponder life for a short moment',' stare at each other, like they had never met before','wait for someone to break the silence',str(people[rnd(3)])+' sighs, looking at the other two','act as if somehow, they could not comprehend what had just been said']
		return action[rnd(len(action))]

def rnd(nm):
	"""Return a random integer in the half-open range [0, nm)."""
	return random.randrange(nm)

def situate(line):
	"""Compose a scene-setting sentence from random context fragments.

	`line` is accepted for interface compatibility but unused. Calls
	context(0), context(1), context(2) in that order (random state is
	consumed left to right).
	"""
	opening = context(0).capitalize()
	destination = context(1)
	activity = context(2)
	return str(opening) + ' go ' + str(destination) + ' and ' + str(activity)


filters.py

This file in particular needs massive reworking... sigh.

from __future__ import division
import nltk.util
import re
def minsize(body):
	"""Keep only lines long enough to be prose (>100 chars), normalise
	them, and concatenate the survivors into one article string.

	FIX: 'Sep.' now expands to 'September' (was the typo 'Septembre').
	Also builds the result with ''.join instead of quadratic +=.
	"""
	# Applied BEFORE html cleaning, in the original order: strip UTF-8
	# curly quotes / em-dash bytes, drop the period from honorifics and
	# remove stray quotes/double spaces, so later splitting on '. ' does
	# not cut mid-sentence. ('Mrs.' must precede 'Mr.'/'Ms.'.)
	pre = [("\xe2\x80\x9c",""), ("\xe2\x80\x94",""), ("\xe2\x80\x9d",""),
	       ("Mrs.","Mrs"), ("Ms.","Ms"), ("Mr.","Mr"), ("Dr.","Dr"),
	       ('"',''), ('  ',' ')]
	# Month abbreviations expanded AFTER html cleaning ('May' is never
	# abbreviated, hence absent).
	post = [("Jan.","January"), ("Feb.","February"), ("Mar.","March"),
	        ("Apr.","April"), ("Jun.","June"), ("Jul.","July"),
	        ("Aug.","August"), ("Sep.","September"), ("Oct.","October"),
	        ("Nov.","November"), ("Dec.","December")]

	chunks = []
	for line in body:
		# Shorter lines are navigation/boilerplate, not sentences.
		if len(line) <= 100:
			continue
		for old, new in pre:
			line = line.replace(old, new)
		# NOTE(review): clean_html was removed from modern NLTK releases —
		# confirm the installed version still provides it.
		line = nltk.util.clean_html(line)
		for old, new in post:
			line = line.replace(old, new)
		chunks.append(line)
	return ''.join(chunks)

def cleanup(sentences):
	"""Filter web-cruft sentences and join the survivors into one string.

	A sentence is dropped when it contains a blacklisted token, contains
	a 'word' longer than 25 characters, or is more than 40% capitalised
	words. Kept sentences are concatenated with '. ' appended to each.
	"""
	sents = ''
	# BUG FIX: 'ET' 'DOCTYPE' were fused into 'ETDOCTYPE' by a missing
	# comma, so neither token was actually filtered; they are now two
	# separate entries.
	blacklist = ['|','Internet Options','IE','comment','ENCODING','rss','Login','article','.com','Favorites','Advertisement','PM','AM','ET','DOCTYPE','login','password','loading','Loading','form','Stories','/','Buy','buy','Comment','News','Feed']

	for sent in sentences:
		# Strip periods NOT followed by two letters (keeps e.g. '.com'
		# intact so the blacklist can still catch it).
		sent = re.sub(r'\.(?![a-zA-Z]{2})', '', sent)

		# Any blacklisted substring disqualifies the sentence.
		found = any(x in sent for x in blacklist)

		# Too many capitalised words (over 40%) = probably a headline/menu.
		capitals = 0
		words = sent.split(' ')
		for ww in words:
			if len(ww) > 25:
				# Absurdly long 'word' -> markup or URL junk.
				found = True
			elif ww and ww[0].isupper():
				capitals = capitals + 1
		# str.split(' ') always yields at least one element, so no
		# division by zero; true division is active (__future__ import).
		if round(capitals / len(words) * 100) > 40:
			found = True

		# Passed every filter: join the bunch.
		if not found:
			sents = sents + str(sent) + '. '
	return sents