User:Laurier Rochon/prototyping/??????????soft



== Links scraper w/ API ==

Scraping blog URLs, checking them against the current archive + storing them in a tab-separated file.
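
Each line of the archive holds six tab-separated fields (scrape date, title, post URL, published date, blog URL, author), and the post URL in column index 2 is the field new results are checked against. A minimal sketch for listing what is already archived, assuming the same 'blogs' file already exists next to the script:

<source lang='python'>
#!/usr/bin/python2.6
# sketch: print the post URLs already stored in the tab-separated 'blogs' archive,
# i.e. the field the scraper below compares new results against

f = open('blogs','r')
for row in f.read().split('\n'):
	fields = row.split('\t')
	if len(fields) > 1:
		print fields[2]	# column 2 = post URL
f.close()
</source>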

Adding a cron job then lets the results pile up, running the scrape every hour (see the crontab sketch after the script).

<source lang='python'>
#!/usr/bin/python2.6

import urllib2
import json
from datetime import date
import os

#txt = '../cgiscrape/blogs'
txt = 'blogs'

start=0
scrapedate=date.today()
entries=[]
urllist=[]

# read the existing archive (if any) and collect the post URLs already stored,
# so the same post is not added twice
if not os.path.exists(txt):
	f = open(txt,'w')
	f.close()
else:
	f = open(txt,'r')
	data = f.read()
	if len(data)>0:
		urls = data.split('\n')
		for a in urls:
			line = a.split('\t')
			if len(line)>1:
				urllist.append(line[2])
c=0
# page through the Google AJAX blog search API, 8 results per page (rsz=large)
while start<64:
	url = ('https://ajax.googleapis.com/ajax/services/search/blogs?v=1.0&q=myself&start='+str(start)+'&rsz=large')

	f = urllib2.urlopen(url)
	data = json.load(f)
	for r in data['responseData']['results']:
		if r['postUrl'] not in urllist:
			entry = "%s\t%s\t%s\t%s\t%s\t%s" % (scrapedate, r['title'], r['postUrl'], r['publishedDate'], r['blogUrl'], r['author'])
			entry = entry.encode("utf-8")
			entries.append(entry)
			c = c+1
	start += 8

print 'added %s entries' % (c)

se = '\n'.join(map(str, entries))
f = open(txt,'a')
if len(entries)>0:
	# end each batch with a newline so the next run's append starts on a fresh line
	f.write(se + '\n')
f.close()
</source>
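
A crontab entry along these lines runs the scrape at the top of every hour; the directory and script name are placeholders, and the cd matters because the script opens 'blogs' via a relative path:

<source lang='bash'>
# hypothetical paths: adjust to wherever the scraper and its 'blogs' file live
0 * * * * cd /path/to/scraper && ./linkscraper.py >> scrape.log 2>&1
</source>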


== Verbose blog page scraper : python + beautifulsoup ==

- - - updated 19th feb 2011

<source lang='python'>
#!/usr/bin/python2.6

import os
import urllib2
from BeautifulSoup import BeautifulSoup
import re

blogfile = 'blogs'	# tab-separated archive produced by the links scraper above

f = open(blogfile,'r')
data = f.read()
f.close()

rows = data.split('\n')

#put the text file in a 2D array, I imagine this will save me some work later
a = []
c=0
for row in rows:
	items = row.split('\t')
	if len(items) > 1:
		a.append([])
		for item in items:
			a[c].append(item)
		c=c+1

p=1
for element in a:
	print '----------------------------------------------------------------------'
	print 'scraping link %d of %d' % (p,len(a))
	print '----------------------------------------------------------------------'
	target = element[2]	# column 2 of the archive = post URL
	request = urllib2.Request(target)
	request.add_header("User-Agent", "Mozilla/5.0 (X11; U; Linux x86_64;fr; rv:1.9.1.5) Gecko/20091109 Ubuntu/9.10 (karmic) Firefox/3.5.5")
	try:
		f=urllib2.urlopen(request)
		c = f.read()
	except Exception, e:
		print 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
		print 'urllib error ---> %s' % (e)
		print 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
		# the page could not be fetched, so skip the parsing step below
		p=p+1
		continue
	try:
		soup = BeautifulSoup(''.join(BeautifulSoup(c).findAll(text=lambda text:text.parent.name != "script" and text.parent.name != "style")))
		
		cleartext = ""
		for line in soup:
			if len(line)>10:
				line = line.replace("\n"," ")
				line = line.replace("  "," ")	# collapse double spaces instead of gluing words together
				cleartext = cleartext + line

		#starts = [match.start() for match in re.finditer(re.escape('?'), cleartext)]
		# record every sentence-ending mark; when a '?' turns up, print that question
		# together with the sentence that precedes it
		l = re.compile('[!?.]')
		it = l.finditer(cleartext)
		k=0
		positions=[]
		for match in it:
			n = match.start()
			positions.append(n)
			if cleartext[n]=='?':
				if k>1:
					onesentence = cleartext[positions[k-2]+1:n+1]
					print onesentence
					print
			k=k+1
				
		
		
	except Exception, e:
		print 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
		print 'beautifulsoup error ---> %s ' % (e)
		print 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
	p=p+1
</source>
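
The slicing logic in the last block is easiest to see on a small piece of text: every '!', '?' or '.' position is recorded, and when a '?' comes up the script prints everything from just after the punctuation mark two positions back, i.e. the question plus the sentence before it. A minimal sketch with made-up cleartext:

<source lang='python'>
#!/usr/bin/python2.6
# sketch: the same sentence-slicing idea as above, on a hypothetical piece of cleaned text
import re

cleartext = "I went home. It was late. Why was the door open? Nobody answered."

positions = []
k = 0
for match in re.finditer('[!?.]', cleartext):
	n = match.start()
	positions.append(n)
	if cleartext[n] == '?' and k > 1:
		# from just after the mark two positions back, up to and including the '?'
		print cleartext[positions[k-2]+1:n+1]
	k = k + 1
</source>

On this input it prints "It was late. Why was the door open?", which is what the scraper does for every question it finds in a fetched page.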