User:Laurier Rochon/prototyping/??????????soft
== Tragedy generator - v0.1 ==
<source lang="python">
#!/usr/bin/python2.6
#-*- coding:utf-8 -*-
#print "Content-Type: text/html"
#print

from __future__ import division
import urllib2
from BeautifulSoup import BeautifulSoup, Comment
import json
import os
import fileinput
import random
import safe_html
import nltk.util

terms = ['tragedy']
entries=[]
us = []
titles=10
content=50000
maxcrawl = 64
blacklist = ['|','comment','ENCODING','Login','.com','Favorites', 'DOCTYPE','login','password','loading']
live=1

if live==0:
    for a in range(40):
        fn = 'items/item'+str(a)
        #fn = 'items/item'+str(random.randrange(40))
        c=''
        for line in fileinput.input(fn):
            c=c+line
        soup = BeautifulSoup(safe_html.safe_html(c))
        #soup = BeautifulSoup(''.join(BeautifulSoup(c).findAll(text=True)))
        heading = soup.h1
        if heading:
            heading = str(heading.renderContents())
            print heading.strip(" \t\n\r")
        a = BeautifulSoup(''.join(BeautifulSoup(safe_html.safe_html(c)).findAll(text=lambda text:text.parent.name != "script" and text.parent.name != "style")))
        cleartext = ''
        a = str(a)
        body = a.split("\n")
        for line in body:
            if len(line)>250:
                line = line.replace(" "," ")
                cleartext = cleartext +'\n\n'+ line
        print cleartext
        print '---------------------------------'
else:
    c=0
    for term in terms:
        start=0
        while start<maxcrawl:
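            # Editor's note: the Google AJAX Search API returns at most 8 results
            # per page when rsz=large, hence start advancing by 8 below; it also
            # won't page past ~64 results, which is presumably what maxcrawl reflects.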
            url = ('https://ajax.googleapis.com/ajax/services/search/news?v=1.0&q='+term+'&start='+str(start)+'&rsz=large&geo=china')
            f = urllib2.urlopen(url)
            data = json.load(f)
            nb = data['responseData']['cursor']['estimatedResultCount']
            if nb>maxcrawl:
                nb=maxcrawl
            for r in data['responseData']['results']:
                entry = r['unescapedUrl']
                if entry not in us:
                    us.append(entry)
                    #print entry
                    entries.append(entry)
                    c = c+1
                    percent = int(round((c/maxcrawl)*100))
                    print 'harvesting links...'+str(percent)+'% of possible maximum'
            start += 8

    print '----------------------------------------------------------------------'
    print 'done getting links - crawling them to find titles and content'
    print '----------------------------------------------------------------------'

    p=0
    w=0
    #random.shuffle(entries)
    for b in entries:
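        # Editor's note: w counts the characters of clear text kept so far;
        # a link is only processed while w is below the 'content' budget (50000 chars).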
        if w<content:
            print '----------------------------------------------------------------------'
            print 'crawling : '+b
            request = urllib2.Request(b)
            request.add_header("User-Agent", "Mozilla/5.0 (X11; U; Linux x86_64;fr; rv:1.9.1.5) Gecko/20091109 Ubuntu/9.10 (karmic) Firefox/3.5.5")
            try:
                f=urllib2.urlopen(request)
                q = f.read()
            except Exception, e:
                print 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
                print 'urllib error ---> %s' % (e)
                print 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
            try:
                c=''
                for line in q:
                    c=c+line
                soup = BeautifulSoup(safe_html.safe_html(c))
                #find headings
                heading = soup.h1
                if heading:
                    heading = str(heading.renderContents())
                    heading = nltk.util.clean_html(heading.strip(" \t\n\r"))
                a = BeautifulSoup(''.join(BeautifulSoup(safe_html.safe_html(c)).findAll(text=lambda text:text.parent.name != "script" and text.parent.name != "style")))
            except Exception, e:
                print 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
                print 'beautifulsoup error ---> %s ' % (e)
                print 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
            cleartext = ''
            a = str(a)
            body = a.split("\n")
            article = ''
            #go line by line
            for line in body:
                #are you at least 250 chars?
                if len(line)>250:
                    line = line.replace(" "," ")
                    line = nltk.util.clean_html(line)
                    article = article + line
            sentences = article.split('. ')
            #give 15 sentences max per article...this is shaky
            if len(sentences)>14:
                sentences = sentences[:14]
            sents = ''
            for sent in sentences:
                found = False
                #do you contain blacklist elements (passed the pre-html filter)?
                for x in blacklist:
                    if x in sent:
                        found = True
                        break
                #
                #too many capital words (over 40%) = probably up to no good
                capitals = 0
                words = sent.split(' ')
                if len(words)>0:
                    for ww in words:
                        if ww:
                            if ww[0].isupper():
                                capitals = capitals + 1
                    if round(capitals/len(words)*100) > 40:
                        found = True
                #
                #if filters above are passed...guess you're ok to join the bunch
                if not found:
                    sents = sents + str(sent) +'. '
            cleartext = cleartext +'\n\n'+ str(sents)
            #make things more readable for humans -> this converts HTML entities
            #clrstr = BeautifulSoup(cleartext,convertEntities=BeautifulSoup.HTML_ENTITIES).contents[0]
            clrstr = cleartext
            #this shouldn't be needed...somehow small fragments still make their way down here
            if len(clrstr)>200:
                if heading:
                    print
                    print heading
                    print clrstr
                    w = w+len(cleartext)
                    per = w/content*100
                    print
                    print 'found new content : '+str(per)+'%'
                    p=p+1

    print
    print 'done crawling, we have enough content now'
</source>
== Output ==
<source lang='text'>
harvesting links...2% of possible maximum
harvesting links...3% of possible maximum
harvesting links...5% of possible maximum
harvesting links...6% of possible maximum
harvesting links...8% of possible maximum
harvesting links...9% of possible maximum
harvesting links...11% of possible maximum
harvesting links...13% of possible maximum
harvesting links...14% of possible maximum
harvesting links...16% of possible maximum
harvesting links...17% of possible maximum
harvesting links...19% of possible maximum
harvesting links...20% of possible maximum
harvesting links...22% of possible maximum
harvesting links...23% of possible maximum
harvesting links...25% of possible maximum
harvesting links...27% of possible maximum
harvesting links...28% of possible maximum
harvesting links...30% of possible maximum
harvesting links...31% of possible maximum
harvesting links...33% of possible maximum
harvesting links...34% of possible maximum
harvesting links...36% of possible maximum
harvesting links...38% of possible maximum
harvesting links...39% of possible maximum
harvesting links...41% of possible maximum
harvesting links...42% of possible maximum
harvesting links...44% of possible maximum
harvesting links...45% of possible maximum
harvesting links...47% of possible maximum
harvesting links...48% of possible maximum
harvesting links...50% of possible maximum
harvesting links...52% of possible maximum
harvesting links...53% of possible maximum
harvesting links...55% of possible maximum
harvesting links...56% of possible maximum
harvesting links...58% of possible maximum
harvesting links...59% of possible maximum
harvesting links...61% of possible maximum
harvesting links...63% of possible maximum
harvesting links...64% of possible maximum
harvesting links...66% of possible maximum
harvesting links...67% of possible maximum
harvesting links...69% of possible maximum
harvesting links...70% of possible maximum
harvesting links...72% of possible maximum
harvesting links...73% of possible maximum
harvesting links...75% of possible maximum
harvesting links...77% of possible maximum
harvesting links...78% of possible maximum
harvesting links...80% of possible maximum
harvesting links...81% of possible maximum
harvesting links...83% of possible maximum
harvesting links...84% of possible maximum
harvesting links...86% of possible maximum
harvesting links...88% of possible maximum
harvesting links...89% of possible maximum
harvesting links...91% of possible maximum
harvesting links...92% of possible maximum
harvesting links...94% of possible maximum
harvesting links...95% of possible maximum
harvesting links...97% of possible maximum
harvesting links...98% of possible maximum
harvesting links...100% of possible maximum
----------------------------------------------------------------------
done getting links - crawling them to find titles and content
----------------------------------------------------------------------
----------------------------------------------------------------------
crawling : http://www.atimes.com/atimes/China/MC25Ad01.html
----------------------------------------------------------------------
crawling : http://af.reuters.com/article/metalsNews/idAFL3E7EO0N020110324
----------------------------------------------------------------------
crawling : http://www.nypress.com/article-22247-education-chinese-rising-in-language-popularity.html
EDUCATION: Chinese Rising in Language Popularity
In this melting pot of a city, different dialects are thrown left and right, and as politics and business get concentrated overseas, Americans are picking up these languages. One way they learn to do it is at one of the dozens of schools specializing in foreign language studies, from the ever-popular Spanish to French to Japanese. Though Arabic too has recently come on strong, nothing appears to tromp the latest language craze: Chinese."It's become a buzz word, kind of like Japan was in the '80s when America concentrated on it and its growth," said Jon Hills, director of Hills Learning, a language school in Manhattan. "Then, we thought we would be speaking Japanese soon, but now, Chinese has taken its place."No matter what city you go to, you can expect to find at least some version of a Chinatown, or a small pocket of Chinese immigrants living and working. Based on the CIA's World Fact Book, the estimated number of people living in China by July 2011 is 1,336,718,015. The number of people in the United States is approximately one billion less than that. So it comes as no surprise that the interest in learning Chinese has steadily been climbing."It's kind of a no-brainer for people following the language markets," said Hills. "Schools in particular are putting investments into Chinese language classes, which also reflect the parents and what they think the next important language is going to be."Take Xiao Bao Chinese, a school run by Danielle Chang, who also organized the first all-Asian food and culture festival, Luckyrice. The school, in partnership with the Museum of Chinese in America, teaches children ages 5 and under about Chinese language and culture. Chang, who is Chinese, started the school five years ago in response to her own young daughter who declared she wasn't Chinese, she was American."That really upset me, so I basically started this program for young children as a response," said Chang. "It snowballed, and one class turned into seven. I think parents really want to expose their kids to Chinese from an early age."Many of the kids that go to Xiao Bao aren't Chinese, nor have a strong affiliation to the country. Chang said it wasn't just about learning her language; she thinks most smart, cultured and sophisticated people see being bilingual as a tool, both in the utilitarian sense and as a way to expand the mind. Before Mandarin Chinese became popular, there was French and Japanese, but in the long run, not many people use these languages compared to the number of Chinese and Spanish speakers.Given that, Spanish is still one of the top languages being studied at ABC Language Exchange.
found new content : 5.42%
----------------------------------------------------------------------
crawling : http://www.ft.com/cms/s/39b273f4-5556-11e0-87fe-00144feab49a.html
Edinburgh festival to honour Asian influence
Commenting on them, Jonathan Mills, the festival’s director, said: “European artists, explorers and philosophers have drawn inspiration from the Far East for centuries. This festival draws inspiration from the diverse cultures of Asia – from Vietnam to China.”One production to bring together east and west will be the Peony Pavilion, performed by the National Ballet of China with western classical ballet, a classic symphony orchestra and traditional Chinese instruments. It is based on a love story by one of China’s greatest writers, Tang Xianzu, a contemporary of Shakespeare.The Tempest will be re-imagined by Mokwha Repertory Company from Seoul in a production that weaves Shakespeare’s play with fifth-century Korean chronicles. In another adaptation of Shakespeare, Shanghai Peking Opera Troupe retells the familiar tale of Hamlet, setting it in China and performing the tragedy in the acrobatic and elaborately costumed style of Jingju opera.Scottish Ballet and the Royal Scottish National Orchestra will perform Sir Kenneth MacMillan’s ballet The Song of the Earth, set to Gustav Mahler’s song cycle inspired by Chinese poetry from the Tang Dynasty.Valery Gergiev and the Mariinsky Opera will bring a large scale production by Jonathan Kent, designed by Paul Brown, of Richard Strauss’s epic opera Die Frau ohne Schatten, spanning life on earth and in the spirit world. The fireworks concert that provides a traditional finale to the festival has found a new sponsor in Virgin Money, the financial services division of the Virgin Group, which is expanding from a new base in Edinburgh.It takes over that role from Bank of Scotland, which is now part of Lloyds Banking Group, though both Bank of Scotland and Royal Bank of Scotland continue as corporate friends of the festival. Besides Virgin Money, HSBC and Shell UK will sponsor festival productions for the first time.Mr Mills said this meant the festival’s corporate sponsorship was very healthy, in what was a very challenging environment.Copyright The Financial Times Limited 2011. You may share using our article tools.
found new content : 9.64%
----------------------------------------------------------------------
</source>
== Links scraper /w API ==
Scrapes blog URLs, checks them against the current archive and stores the new ones in a tab-separated file.
Adding a cron job then lets the results pile up, running the scrape every hour; a rough crontab entry is sketched below.
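A minimal crontab sketch for that hourly run (the script name and path are placeholders, not taken from this page):
<source lang='text'>
# hypothetical entry: run the link scraper at the top of every hour
0 * * * * cd /path/to/scraper && python2.6 scrape_links.py >> scrape.log 2>&1
</source>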
<source lang="python">
#!/usr/bin/python2.6
#-*- coding:utf-8 -*-
print "Content-Type: text/html"
print

import urllib2
import json
from datetime import date
import os

txt = 'blogs'
start=0
scrapedate=date.today()
entries=[]
urllist=[]

if not os.path.exists(txt):
    f = open(txt,'w')
    f.close()
else:
    f = open(txt,'r')
    data = f.read()
    if len(data)>0:
        urls = data.split('\n')
        for a in urls:
            line = a.split('\t')
            if len(line)>1:
                urllist.append(line[2])

c=0
while start<64:
    url = ('https://ajax.googleapis.com/ajax/services/search/blogs?v=1.0&q=myself&start='+str(start)+'&rsz=large')
    f = urllib2.urlopen(url)
    data = json.load(f)
    for r in data['responseData']['results']:
        if r['postUrl'] not in urllist:
            #columns: scrapedate, title, postUrl, publishedDate, blogUrl, author
            entry = "%s\t%s\t%s\t%s\t%s\t%s" % (scrapedate, r['title'], r['postUrl'], r['publishedDate'], r['blogUrl'], r['author'])
            entry = entry.encode("utf-8")
            entries.append(entry)
            c = c+1
    start += 8

print 'added %s entries' % (c)
se = '\n'.join(map(str, entries))
f = open(txt,'a')
if c>0:
    f.write(se)
    f.write('\n')
f.close()
execfile("spider.py")
</source>
== Verbose blog page scraper : python + beautifulsoup ==
- - - updated 22nd feb 2011
- - - updated 19th feb 2011
<source lang="python">
#!/usr/bin/python2.6

import os
import urllib2
from BeautifulSoup import BeautifulSoup
import re

blogfile = 'blogs'
qfile = 'questions'
archive = 'archive'
forbiddenlist=['comment','wordpress','related','categories','dtd','w3','archives','tags','admin','php','twitter','subscribe','articles','.com','says:','linkback','post','password','statcounter','class=']

if os.path.exists(qfile):
    ff = open(qfile,'r')
    pureq = ff.read()
else:
    pureq = ''

f = open(blogfile,'r')
data = f.read()
f.close()
rows = data.split('\n')

#put the text file in a 2D array, I imagine this will save me some work later
a = []
c=0
for row in rows:
    items = row.split('\t')
    if len(items) > 1:
        a.append([])
        for item in items:
            a[c].append(item)
        c=c+1

p=1
qs = ""
for element in a:
    print '----------------------------------------------------------------------'
    print 'scraping link %d of %d' % (p,len(a))
    print '----------------------------------------------------------------------'
    target = element[2]
    request = urllib2.Request(target)
    request.add_header("User-Agent", "Mozilla/5.0 (X11; U; Linux x86_64;fr; rv:1.9.1.5) Gecko/20091109 Ubuntu/9.10 (karmic) Firefox/3.5.5")
    try:
        f=urllib2.urlopen(request)
        c = f.read()
    except Exception, e:
        print 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
        print 'urllib error ---> %s' % (e)
        print 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
    try:
        soup = BeautifulSoup(''.join(BeautifulSoup(c).findAll(text=lambda text:text.parent.name != "script" and text.parent.name != "style")))
        cleartext = ""
        for line in soup:
            if len(line)>10:
                line = line.replace("\n"," ")
                line = line.replace(" ","")
                cleartext = cleartext + line
        #starts = [match.start() for match in re.finditer(re.escape('?'), cleartext)]
        l = re.compile('[!?.]')
        it = l.finditer(cleartext)
        k=0
        positions=[]
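        # Editor's note: scan for sentence-ending punctuation; whenever a '?' is hit,
        # take the span from three boundaries back up to the '?', so the extracted
        # question keeps a couple of sentences of leading context.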
        for match in it:
            n = match.start()
            positions.append(n)
            #start cleaning
            if cleartext[n]=='?' and k>1:
                e = cleartext[positions[k-3]+1:n+1].strip(" \t\n\r")
                if len(e)>10 and len(e)<600:
                    f = True
                    for m in forbiddenlist:
                        if m in e.lower():
                            f = False
                    if f:
                        if e[0].islower() or e[0]==' ' or not e[0].isalpha():
                            e = '...'+e
                        #end cleaning
                        if e.encode("utf-8") not in pureq:
                            qs = qs+element[0]+'\t'+element[1]+'\t'+element[2]+'\t'+element[3]+'\t'+element[4]+'\t'+element[5]+'\t'+e+'\n'
                            print e
                            print
                        else:
                            print "duplicate...this question already exists"
            k=k+1
    except Exception, e:
        print 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
        print 'beautifulsoup error ---> %s ' % (e)
        print 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
    p=p+1

#dump clean questions to questions file
if not os.path.exists(qfile):
    f = open(qfile,'w')
else:
    f = open(qfile,'a')
if len(qs)>1:
    f.write(qs.encode("utf-8"))
f.close()

#dump old urls to archive
if not os.path.exists(archive):
    f = open(archive,'w')
else:
    f = open(archive,'a')
f.write(data)
f.close()

#clean scraped urls file
f=open(blogfile,'w')
f.write('')
f.close()
</source>