User:Laurier Rochon/prototyping/??????????soft
== Tragedy generator - v0.1 ==
<source lang="python">
#!/usr/bin/python2.6
#-*- coding:utf-8 -*-
#print "Content-Type: text/html"
#print

from __future__ import division
import urllib2
from BeautifulSoup import BeautifulSoup, Comment
import json
import os
import fileinput
import random
import safe_html
import nltk.util

terms = ['tragedy']
entries=[]
us = []
titles=10
content=50000
maxcrawl = 64
blacklist = ['|','comment','ENCODING','Login','.com','Favorites', 'DOCTYPE','login','password','loading']
live=1

if live==0:
    for a in range(40):
        fn = 'items/item'+str(a)
        #fn = 'items/item'+str(random.randrange(40))
        c=''
        for line in fileinput.input(fn):
            c=c+line
        soup = BeautifulSoup(safe_html.safe_html(c))
        #soup = BeautifulSoup(''.join(BeautifulSoup(c).findAll(text=True)))
        heading = soup.h1
        if heading:
            heading = str(heading.renderContents())
            print heading.strip(" \t\n\r")
        a = BeautifulSoup(''.join(BeautifulSoup(safe_html.safe_html(c)).findAll(text=lambda text:text.parent.name != "script" and text.parent.name != "style")))
        cleartext = ''
        a = str(a)
        body = a.split("\n")
        for line in body:
            if len(line)>250:
                line = line.replace(" "," ")
                cleartext = cleartext +'\n\n'+ line
        print cleartext
        print '---------------------------------'
else:
    c=0
    for term in terms:
        start=0
        while start<maxcrawl:
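            # Editor's note: the Google AJAX Search API returns at most 8 results
            # per page when rsz=large, hence start advancing by 8 below; it also
            # won't page past ~64 results, which is presumably what maxcrawl reflects.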
            url = ('https://ajax.googleapis.com/ajax/services/search/news?v=1.0&q='+term+'&start='+str(start)+'&rsz=large&geo=china')
            f = urllib2.urlopen(url)
            data = json.load(f)
            nb = data['responseData']['cursor']['estimatedResultCount']
            if nb>maxcrawl:
                nb=maxcrawl
            for r in data['responseData']['results']:
                entry = r['unescapedUrl']
                if entry not in us:
                    us.append(entry)
                    #print entry
                    entries.append(entry)
                    c = c+1
                    percent = int(round((c/maxcrawl)*100))
                    print 'harvesting links...'+str(percent)+'% of possible maximum'
            start += 8

    print '----------------------------------------------------------------------'
    print 'done getting links - crawling them to find titles and content'
    print '----------------------------------------------------------------------'

    p=0
    w=0
    #random.shuffle(entries)
    for b in entries:
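        # Editor's note: w counts the characters of clear text kept so far;
        # a link is only processed while w is below the 'content' budget (50000 chars).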
        if w<content:
            print '----------------------------------------------------------------------'
            print 'crawling : '+b
            request = urllib2.Request(b)
            request.add_header("User-Agent", "Mozilla/5.0 (X11; U; Linux x86_64;fr; rv:1.9.1.5) Gecko/20091109 Ubuntu/9.10 (karmic) Firefox/3.5.5")
            try:
                f=urllib2.urlopen(request)
                q = f.read()
            except Exception, e:
                print 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
                print 'urllib error ---> %s' % (e)
                print 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
            try:
                c=''
                for line in q:
                    c=c+line
                soup = BeautifulSoup(safe_html.safe_html(c))
                #find headings
                heading = soup.h1
                if heading:
                    heading = str(heading.renderContents())
                    heading = nltk.util.clean_html(heading.strip(" \t\n\r"))
                a = BeautifulSoup(''.join(BeautifulSoup(safe_html.safe_html(c)).findAll(text=lambda text:text.parent.name != "script" and text.parent.name != "style")))
            except Exception, e:
                print 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
                print 'beautifulsoup error ---> %s ' % (e)
                print 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
            cleartext = ''
            a = str(a)
            body = a.split("\n")
            article = ''
            #go line by line
            for line in body:
                #are you at least 250 chars?
                if len(line)>250:
                    line = line.replace(" "," ")
                    line = nltk.util.clean_html(line)
                    article = article + line
            sentences = article.split('. ')
            #give 15 sentences max per article...this is shaky
            if len(sentences)>14:
                sentences = sentences[:14]
            sents = ''
            for sent in sentences:
                found = False
                #do you contain blacklist elements (passed the pre-html filter)?
                for x in blacklist:
                    if x in sent:
                        found = True
                        break
                #
                #too many capital words (over 40%) = probably up to no good
                capitals = 0
                words = sent.split(' ')
                if len(words)>0:
                    for ww in words:
                        if ww:
                            if ww[0].isupper():
                                capitals = capitals + 1
                    if round(capitals/len(words)*100) > 40:
                        found = True
                #
                #if filters above are passed...guess you're ok to join the bunch
                if not found:
                    sents = sents + str(sent) +'. '
            cleartext = cleartext +'\n\n'+ str(sents)
            #make things more readable for humans -> this converts HTML entities
            #clrstr = BeautifulSoup(cleartext,convertEntities=BeautifulSoup.HTML_ENTITIES).contents[0]
            clrstr = cleartext
            #this shouldn't be needed...somehow small fragments still make their way down here
            if len(clrstr)>200:
                if heading:
                    print
                    print heading
                    print clrstr
                    w = w+len(cleartext)
                    per = w/content*100
                    print
                    print 'found new content : '+str(per)+'%'
                    p=p+1

    print
    print 'done crawling, we have enough content now'
</source>
== Output ==
<source lang='text'>
harvesting links...2% of possible maximum
harvesting links...3% of possible maximum
harvesting links...5% of possible maximum
harvesting links...6% of possible maximum
harvesting links...8% of possible maximum
harvesting links...9% of possible maximum
harvesting links...11% of possible maximum
harvesting links...13% of possible maximum
harvesting links...14% of possible maximum
harvesting links...16% of possible maximum
harvesting links...17% of possible maximum
harvesting links...19% of possible maximum
harvesting links...20% of possible maximum
harvesting links...22% of possible maximum
harvesting links...23% of possible maximum
harvesting links...25% of possible maximum
harvesting links...27% of possible maximum
harvesting links...28% of possible maximum
harvesting links...30% of possible maximum
harvesting links...31% of possible maximum
harvesting links...33% of possible maximum
harvesting links...34% of possible maximum
harvesting links...36% of possible maximum
harvesting links...38% of possible maximum
harvesting links...39% of possible maximum
harvesting links...41% of possible maximum
harvesting links...42% of possible maximum
harvesting links...44% of possible maximum
harvesting links...45% of possible maximum
harvesting links...47% of possible maximum
harvesting links...48% of possible maximum
harvesting links...50% of possible maximum
harvesting links...52% of possible maximum
harvesting links...53% of possible maximum
harvesting links...55% of possible maximum
harvesting links...56% of possible maximum
harvesting links...58% of possible maximum
harvesting links...59% of possible maximum
harvesting links...61% of possible maximum
harvesting links...63% of possible maximum
harvesting links...64% of possible maximum
harvesting links...66% of possible maximum
harvesting links...67% of possible maximum
harvesting links...69% of possible maximum
harvesting links...70% of possible maximum
harvesting links...72% of possible maximum
harvesting links...73% of possible maximum
harvesting links...75% of possible maximum
harvesting links...77% of possible maximum
harvesting links...78% of possible maximum
harvesting links...80% of possible maximum
harvesting links...81% of possible maximum
harvesting links...83% of possible maximum
harvesting links...84% of possible maximum
harvesting links...86% of possible maximum
harvesting links...88% of possible maximum
harvesting links...89% of possible maximum
harvesting links...91% of possible maximum
harvesting links...92% of possible maximum
harvesting links...94% of possible maximum
harvesting links...95% of possible maximum
harvesting links...97% of possible maximum
harvesting links...98% of possible maximum
harvesting links...100% of possible maximum
----------------------------------------------------------------------
done getting links - crawling them to find titles and content
----------------------------------------------------------------------
----------------------------------------------------------------------
crawling : http://www.atimes.com/atimes/China/MC25Ad01.html
----------------------------------------------------------------------
crawling : http://af.reuters.com/article/metalsNews/idAFL3E7EO0N020110324
----------------------------------------------------------------------
crawling : http://www.nypress.com/article-22247-education-chinese-rising-in-language-popularity.html
EDUCATION: Chinese Rising in Language Popularity
In this melting pot of a city, different dialects are thrown left and right, and as politics and business get concentrated overseas, Americans are picking up these languages. One way they learn to do it is at one of the dozens of schools specializing in foreign language studies, from the ever-popular Spanish to French to Japanese. Though Arabic too has recently come on strong, nothing appears to tromp the latest language craze: Chinese."It's become a buzz word, kind of like Japan was in the '80s when America concentrated on it and its growth," said Jon Hills, director of Hills Learning, a language school in Manhattan. "Then, we thought we would be speaking Japanese soon, but now, Chinese has taken its place."No matter what city you go to, you can expect to find at least some version of a Chinatown, or a small pocket of Chinese immigrants living and working. Based on the CIA's World Fact Book, the estimated number of people living in China by July 2011 is 1,336,718,015. The number of people in the United States is approximately one billion less than that. So it comes as no surprise that the interest in learning Chinese has steadily been climbing."It's kind of a no-brainer for people following the language markets," said Hills. "Schools in particular are putting investments into Chinese language classes, which also reflect the parents and what they think the next important language is going to be."Take Xiao Bao Chinese, a school run by Danielle Chang, who also organized the first all-Asian food and culture festival, Luckyrice. The school, in partnership with the Museum of Chinese in America, teaches children ages 5 and under about Chinese language and culture. Chang, who is Chinese, started the school five years ago in response to her own young daughter who declared she wasn't Chinese, she was American."That really upset me, so I basically started this program for young children as a response," said Chang. "It snowballed, and one class turned into seven. I think parents really want to expose their kids to Chinese from an early age."Many of the kids that go to Xiao Bao aren't Chinese, nor have a strong affiliation to the country. Chang said it wasn't just about learning her language; she thinks most smart, cultured and sophisticated people see being bilingual as a tool, both in the utilitarian sense and as a way to expand the mind. Before Mandarin Chinese became popular, there was French and Japanese, but in the long run, not many people use these languages compared to the number of Chinese and Spanish speakers.Given that, Spanish is still one of the top languages being studied at ABC Language Exchange.
found new content : 5.42%
----------------------------------------------------------------------
crawling : http://www.ft.com/cms/s/39b273f4-5556-11e0-87fe-00144feab49a.html
Edinburgh festival to honour Asian influence
Commenting on them, Jonathan Mills, the festival’s director, said: “European artists, explorers and philosophers have drawn inspiration from the Far East for centuries. This festival draws inspiration from the diverse cultures of Asia – from Vietnam to China.”One production to bring together east and west will be the Peony Pavilion, performed by the National Ballet of China with western classical ballet, a classic symphony orchestra and traditional Chinese instruments. It is based on a love story by one of China’s greatest writers, Tang Xianzu, a contemporary of Shakespeare.The Tempest will be re-imagined by Mokwha Repertory Company from Seoul in a production that weaves Shakespeare’s play with fifth-century Korean chronicles. In another adaptation of Shakespeare, Shanghai Peking Opera Troupe retells the familiar tale of Hamlet, setting it in China and performing the tragedy in the acrobatic and elaborately costumed style of Jingju opera.Scottish Ballet and the Royal Scottish National Orchestra will perform Sir Kenneth MacMillan’s ballet The Song of the Earth, set to Gustav Mahler’s song cycle inspired by Chinese poetry from the Tang Dynasty.Valery Gergiev and the Mariinsky Opera will bring a large scale production by Jonathan Kent, designed by Paul Brown, of Richard Strauss’s epic opera Die Frau ohne Schatten, spanning life on earth and in the spirit world. The fireworks concert that provides a traditional finale to the festival has found a new sponsor in Virgin Money, the financial services division of the Virgin Group, which is expanding from a new base in Edinburgh.It takes over that role from Bank of Scotland, which is now part of Lloyds Banking Group, though both Bank of Scotland and Royal Bank of Scotland continue as corporate friends of the festival. Besides Virgin Money, HSBC and Shell UK will sponsor festival productions for the first time.Mr Mills said this meant the festival’s corporate sponsorship was very healthy, in what was a very challenging environment.Copyright The Financial Times Limited 2011. You may share using our article tools.
found new content : 9.64%
----------------------------------------------------------------------
</source>
== Links scraper /w API ==
Scrapes blog URLs, checks them against the current archive and stores the new ones in a tab-separated file.
Adding a cron job then lets the results pile up, running the scrape every hour; a rough crontab entry is sketched below.
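A minimal crontab sketch for that hourly run (the script name and path are placeholders, not taken from this page):
<source lang='text'>
# hypothetical entry: run the link scraper at the top of every hour
0 * * * * cd /path/to/scraper && python2.6 scrape_links.py >> scrape.log 2>&1
</source>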
<source lang="python">
#!/usr/bin/python2.6
#-*- coding:utf-8 -*-
print "Content-Type: text/html"
print

import urllib2
import json
from datetime import date
import os

txt = 'blogs'
start=0
scrapedate=date.today()
entries=[]
urllist=[]

if not os.path.exists(txt):
    f = open(txt,'w')
    f.close()
else:
    f = open(txt,'r')
    data = f.read()
    if len(data)>0:
        urls = data.split('\n')
        for a in urls:
            line = a.split('\t')
            if len(line)>1:
                urllist.append(line[2])

c=0
while start<64:
    url = ('https://ajax.googleapis.com/ajax/services/search/blogs?v=1.0&q=myself&start='+str(start)+'&rsz=large')
    f = urllib2.urlopen(url)
    data = json.load(f)
    for r in data['responseData']['results']:
        if r['postUrl'] not in urllist:
            #columns: scrapedate, title, postUrl, publishedDate, blogUrl, author
            entry = "%s\t%s\t%s\t%s\t%s\t%s" % (scrapedate, r['title'], r['postUrl'], r['publishedDate'], r['blogUrl'], r['author'])
            entry = entry.encode("utf-8")
            entries.append(entry)
            c = c+1
    start += 8

print 'added %s entries' % (c)
se = '\n'.join(map(str, entries))
f = open(txt,'a')
if c>0:
    f.write(se)
    f.write('\n')
f.close()
execfile("spider.py")
</source>
== Verbose blog page scraper : python + beautifulsoup ==
- - - updated 22nd feb 2011
- - - updated 19th feb 2011
<source lang="python">
#!/usr/bin/python2.6

import os
import urllib2
from BeautifulSoup import BeautifulSoup
import re

blogfile = 'blogs'
qfile = 'questions'
archive = 'archive'
forbiddenlist=['comment','wordpress','related','categories','dtd','w3','archives','tags','admin','php','twitter','subscribe','articles','.com','says:','linkback','post','password','statcounter','class=']

if os.path.exists(qfile):
    ff = open(qfile,'r')
    pureq = ff.read()
else:
    pureq = ''

f = open(blogfile,'r')
data = f.read()
f.close()
rows = data.split('\n')

#put the text file in a 2D array, I imagine this will save me some work later
a = []
c=0
for row in rows:
    items = row.split('\t')
    if len(items) > 1:
        a.append([])
        for item in items:
            a[c].append(item)
        c=c+1

p=1
qs = ""
for element in a:
    print '----------------------------------------------------------------------'
    print 'scraping link %d of %d' % (p,len(a))
    print '----------------------------------------------------------------------'
    target = element[2]
    request = urllib2.Request(target)
    request.add_header("User-Agent", "Mozilla/5.0 (X11; U; Linux x86_64;fr; rv:1.9.1.5) Gecko/20091109 Ubuntu/9.10 (karmic) Firefox/3.5.5")
    try:
        f=urllib2.urlopen(request)
        c = f.read()
    except Exception, e:
        print 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
        print 'urllib error ---> %s' % (e)
        print 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
    try:
        soup = BeautifulSoup(''.join(BeautifulSoup(c).findAll(text=lambda text:text.parent.name != "script" and text.parent.name != "style")))
        cleartext = ""
        for line in soup:
            if len(line)>10:
                line = line.replace("\n"," ")
                line = line.replace(" ","")
                cleartext = cleartext + line
        #starts = [match.start() for match in re.finditer(re.escape('?'), cleartext)]
        l = re.compile('[!?.]')
        it = l.finditer(cleartext)
        k=0
        positions=[]
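        # Editor's note: scan for sentence-ending punctuation; whenever a '?' is hit,
        # take the span from three boundaries back up to the '?', so the extracted
        # question keeps a couple of sentences of leading context.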
        for match in it:
            n = match.start()
            positions.append(n)
            #start cleaning
            if cleartext[n]=='?' and k>1:
                e = cleartext[positions[k-3]+1:n+1].strip(" \t\n\r")
                if len(e)>10 and len(e)<600:
                    f = True
                    for m in forbiddenlist:
                        if m in e.lower():
                            f = False
                    if f:
                        if e[0].islower() or e[0]==' ' or not e[0].isalpha():
                            e = '...'+e
                        #end cleaning
                        if e.encode("utf-8") not in pureq:
                            qs = qs+element[0]+'\t'+element[1]+'\t'+element[2]+'\t'+element[3]+'\t'+element[4]+'\t'+element[5]+'\t'+e+'\n'
                            print e
                            print
                        else:
                            print "duplicate...this question already exists"
            k=k+1
    except Exception, e:
        print 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
        print 'beautifulsoup error ---> %s ' % (e)
        print 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
    p=p+1

#dump clean questions to questions file
if not os.path.exists(qfile):
    f = open(qfile,'w')
else:
    f = open(qfile,'a')
if len(qs)>1:
    f.write(qs.encode("utf-8"))
f.close()

#dump old urls to archive
if not os.path.exists(archive):
    f = open(archive,'w')
else:
    f = open(archive,'a')
f.write(data)
f.close()

#clean scraped urls file
f=open(blogfile,'w')
f.write('')
f.close()
</source>