User:Laurier Rochon/prototyping/??????????soft
== Tragedy generator - v0.1 ==

Searches Google News for 'tragedy' (geo-targeted to China), crawls the resulting links and boils each page down to its heading plus a handful of filtered sentences.

<source lang="python">
#!/usr/bin/python2.6
#-*- coding:utf-8 -*-

#print "Content-Type: text/html"
#print

from __future__ import division
import urllib2
from BeautifulSoup import BeautifulSoup, Comment
import json
import os
import fileinput
import random
import safe_html
import nltk.util

terms = ['tragedy']
entries = []
us = []
titles = 10
content = 50000
maxcrawl = 64
blacklist = ['|','comment','ENCODING','Login','.com','Favorites','DOCTYPE','login','password','loading']

live = 1

if live == 0:
    # offline mode : work from pages saved earlier in items/
    for a in range(40):
        fn = 'items/item'+str(a)
        #fn = 'items/item'+str(random.randrange(40))
        c = ''
        for line in fileinput.input(fn):
            c = c+line
        soup = BeautifulSoup(safe_html.safe_html(c))
        #soup = BeautifulSoup(''.join(BeautifulSoup(c).findAll(text=True)))
        heading = soup.h1
        if heading:
            heading = str(heading.renderContents())
            print heading.strip(" \t\n\r")
        a = BeautifulSoup(''.join(BeautifulSoup(safe_html.safe_html(c)).findAll(text=lambda text:text.parent.name != "script" and text.parent.name != "style")))
        cleartext = ''
        a = str(a)
        body = a.split("\n")
        for line in body:
            if len(line) > 250:
                line = line.replace("&nbsp;"," ")  # strip '&nbsp;' entities (assumed replace target)
                cleartext = cleartext+'\n\n'+line
        print cleartext
        print '---------------------------------'
else:
    # live mode : harvest links from the Google News search API
    c = 0
    for term in terms:
        start = 0
        while start < maxcrawl:
            url = ('https://ajax.googleapis.com/ajax/services/search/news?v=1.0&q='+term+'&start='+str(start)+'&rsz=large&geo=china')
            f = urllib2.urlopen(url)
            data = json.load(f)
            nb = data['responseData']['cursor']['estimatedResultCount']
            if nb > maxcrawl:
                nb = maxcrawl
            for r in data['responseData']['results']:
                entry = r['unescapedUrl']
                if entry not in us:
                    us.append(entry)
                    #print entry
                    entries.append(entry)
                    c = c+1
                    percent = int(round((c/maxcrawl)*100))
                    print 'harvesting links...'+str(percent)+'% of possible maximum'
            start += 8

    print '----------------------------------------------------------------------'
    print 'done getting links - crawling them to find titles and content'
    print '----------------------------------------------------------------------'

    p = 0
    w = 0
    #random.shuffle(entries)
    for b in entries:
        if w < content:
            print '----------------------------------------------------------------------'
            print 'crawling : '+b
            request = urllib2.Request(b)
            request.add_header("User-Agent", "Mozilla/5.0 (X11; U; Linux x86_64;fr; rv:1.9.1.5) Gecko/20091109 Ubuntu/9.10 (karmic) Firefox/3.5.5")
            try:
                f = urllib2.urlopen(request)
                q = f.read()
            except Exception, e:
                print 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
                print 'urllib error ---> %s' % (e)
                print 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
            try:
                c = ''
                for line in q:
                    c = c+line
                soup = BeautifulSoup(safe_html.safe_html(c))
                #find headings
                heading = soup.h1
                if heading:
                    heading = str(heading.renderContents())
                    heading = nltk.util.clean_html(heading.strip(" \t\n\r"))
                a = BeautifulSoup(''.join(BeautifulSoup(safe_html.safe_html(c)).findAll(text=lambda text:text.parent.name != "script" and text.parent.name != "style")))
            except Exception, e:
                print 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
                print 'beautifulsoup error ---> %s ' % (e)
                print 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'

            cleartext = ''
            a = str(a)
            body = a.split("\n")
            article = ''
            #go line by line
            for line in body:
                #are you at least 250 chars?
                if len(line) > 250:
                    line = line.replace("&nbsp;"," ")  # strip '&nbsp;' entities (assumed replace target)
                    line = nltk.util.clean_html(line)
                    article = article+line

            sentences = article.split('. ')
            #give 15 sentences max per article...this is shaky
            if len(sentences) > 14:
                sentences = sentences[:14]
            sents = ''
            for sent in sentences:
                found = False
                #do you contain blacklist elements (passed the pre-html filter)?
                for x in blacklist:
                    if x in sent:
                        found = True
                        break
                #too many capital words (over 40%) = probably up to no good
                capitals = 0
                words = sent.split(' ')
                if len(words) > 0:
                    for ww in words:
                        if ww:
                            if ww[0].isupper():
                                capitals = capitals+1
                    if round(capitals/len(words)*100) > 40:
                        found = True
                #if filters above are passed...guess you're ok to join the bunch
                if not found:
                    sents = sents+str(sent)+'. '
            cleartext = cleartext+'\n\n'+str(sents)

            #make things more readable for humans -> this converts HTML entities
            #clrstr = BeautifulSoup(cleartext,convertEntities=BeautifulSoup.HTML_ENTITIES).contents[0]
            clrstr = cleartext

            #this shouldn't be needed...somehow small fragments still make their way down here
            if len(clrstr) > 200:
                if heading:
                    print
                    print heading
                print clrstr
                w = w+len(cleartext)
                per = w/content*100
                print
                print 'found new content : '+str(per)+'%'
        p = p+1

    print
    print 'done crawling, we have enough content now'
</source>
== Links scraper /w API ==
Scraping blog urls, checking them against the current archive and storing them in a tab-separated file. Adding a cron job then lets the results pile up, running the scrape every hour.
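For instance, a crontab entry along these lines runs the scrape at the top of every hour; the working directory, script name and log file here are placeholders, not taken from the project:

<source lang="bash">
# hypothetical crontab line : run the link scraper hourly, appending its output to a log
0 * * * * cd /path/to/prototyping && /usr/bin/python2.6 scrape_links.py >> scrape.log 2>&1
</source>

The scraper itself: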
<source lang="python">
#!/usr/bin/python2.6
#-*- coding:utf-8 -*-

print "Content-Type: text/html"
print

import urllib2
import json
from datetime import date
import os

txt = 'blogs'
start = 0
scrapedate = date.today()
entries = []
urllist = []

# load the urls we already have, so we don't store duplicates
if not os.path.exists(txt):
    f = open(txt,'w')
    f.close()
else:
    f = open(txt,'r')
    data = f.read()
    if len(data) > 0:
        urls = data.split('\n')
        for a in urls:
            line = a.split('\t')
            if len(line) > 1:
                urllist.append(line[2])

# pull up to 64 results, 8 at a time, from the Google blog search API
c = 0
while start < 64:
    url = ('https://ajax.googleapis.com/ajax/services/search/blogs?v=1.0&q=myself&start='+str(start)+'&rsz=large')
    f = urllib2.urlopen(url)
    data = json.load(f)
    for r in data['responseData']['results']:
        if r['postUrl'] not in urllist:
            entry = "%s\t%s\t%s\t%s\t%s\t%s" % (scrapedate, r['title'], r['postUrl'], r['publishedDate'], r['blogUrl'], r['author'])
            entry = entry.encode("utf-8")
            entries.append(entry)
            c = c+1
    start += 8

print 'added %s entries' % (c)

# append the new entries to the tab-separated file
se = '\n'.join(map(str, entries))
f = open(txt,'a')
if c > 0:
    f.write(se)
    f.write('\n')
f.close()

execfile("spider.py")
</source>
== Verbose blog page scraper : python + beautifulsoup ==

- - - updated 22nd feb 2011
- - - updated 19th feb 2011

Reads the tab-separated 'blogs' file written by the scraper above, extracts question sentences from each post and appends the new ones to a 'questions' file; the processed urls are moved to an 'archive' file and the 'blogs' file is emptied.
<source lang="python">
#!/usr/bin/python2.6

import os
import urllib2
from BeautifulSoup import BeautifulSoup
import re

blogfile = 'blogs'
qfile = 'questions'
archive = 'archive'
forbiddenlist = ['comment','wordpress','related','categories','dtd','w3','archives','tags','admin','php','twitter','subscribe','articles','.com','says:','linkback','post','password','statcounter','class=']

# questions we already have, used to skip duplicates
if os.path.exists(qfile):
    ff = open(qfile,'r')
    pureq = ff.read()
else:
    pureq = ''

f = open(blogfile,'r')
data = f.read()
f.close()
rows = data.split('\n')

#put the text file in a 2D array, I imagine this will save me some work later
a = []
c = 0
for row in rows:
    items = row.split('\t')
    if len(items) > 1:
        a.append([])
        for item in items:
            a[c].append(item)
        c = c+1

p = 1
qs = ""
for element in a:
    print '----------------------------------------------------------------------'
    print 'scraping link %d of %d' % (p,len(a))
    print '----------------------------------------------------------------------'
    target = element[2]
    request = urllib2.Request(target)
    request.add_header("User-Agent", "Mozilla/5.0 (X11; U; Linux x86_64;fr; rv:1.9.1.5) Gecko/20091109 Ubuntu/9.10 (karmic) Firefox/3.5.5")
    try:
        f = urllib2.urlopen(request)
        c = f.read()
    except Exception, e:
        print 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
        print 'urllib error ---> %s' % (e)
        print 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
    try:
        # keep only the text nodes, dropping scripts and stylesheets
        soup = BeautifulSoup(''.join(BeautifulSoup(c).findAll(text=lambda text:text.parent.name != "script" and text.parent.name != "style")))
        cleartext = ""
        for line in soup:
            if len(line) > 10:
                line = line.replace("\n"," ")
                line = line.replace("&nbsp;","")  # strip '&nbsp;' entities (assumed replace target)
                cleartext = cleartext + line

        #starts = [match.start() for match in re.finditer(re.escape('?'), cleartext)]
        l = re.compile('[!?.]')
        it = l.finditer(cleartext)
        k = 0
        positions = []
        for match in it:
            n = match.start()
            positions.append(n)
            #start cleaning
            if cleartext[n] == '?' and k > 1:
                e = cleartext[positions[k-3]+1:n+1].strip(" \t\n\r")
                if len(e) > 10 and len(e) < 600:
                    f = True
                    for m in forbiddenlist:
                        if m in e.lower():
                            f = False
                    if f:
                        if e[0].islower() or e[0] == ' ' or not e[0].isalpha():
                            e = '...'+e
                        #end cleaning
                        if e.encode("utf-8") not in pureq:
                            qs = qs+element[0]+'\t'+element[1]+'\t'+element[2]+'\t'+element[3]+'\t'+element[4]+'\t'+element[5]+'\t'+e+'\n'
                            print e
                            print
                        else:
                            print "duplicate...this question already exists"
            k = k+1
    except Exception, e:
        print 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
        print 'beautifulsoup error ---> %s ' % (e)
        print 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
    p = p+1

#dump clean questions to questions file
if not os.path.exists(qfile):
    f = open(qfile,'w')
else:
    f = open(qfile,'a')
if len(qs) > 1:
    f.write(qs.encode("utf-8"))
f.close()

#dump old urls to archive
if not os.path.exists(archive):
    f = open(archive,'w')
else:
    f = open(archive,'a')
f.write(data)
f.close()

#clean scraped urls file
f = open(blogfile,'w')
f.write('')
f.close()
</source>