== Links scraper w/ API ==
Scraping blog URLs, checking them against the current archive and storing them in a tab-separated file. Adding a cron job on top then lets the results pile up, doing the scrape every hour.
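For example, a crontab entry along these lines would run it at the top of every hour (the script location is just a placeholder):

<source lang="text">
# hypothetical crontab line: run the scraper at minute 0 of every hour
0 * * * * /usr/bin/python2.6 /path/to/scrape.py
</source>

The scraper itself: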
<source lang="python">
#!/usr/bin/python2.6
import urllib2
import json
from datetime import date
import os

#txt = '../cgiscrape/blogs'
txt = 'blogs'
start = 0
scrapedate = date.today()
entries = []
urllist = []

# Create the archive if it doesn't exist yet, otherwise load the
# post URLs already stored in it so we don't add duplicates.
if not os.path.exists(txt):
    f = open(txt, 'w')
    f.close()
else:
    f = open(txt, 'r')
    data = f.read()
    f.close()
    if len(data) > 0:
        urls = data.split('\n')
        for a in urls:
            line = a.split('\t')
            if len(line) > 1:
                urllist.append(line[2])    # third column is the post URL

c = 0
# The blog search API hands back 8 results per request ('rsz=large'),
# so stepping start by 8 up to 64 pages through the first 64 hits.
while start < 64:
    url = ('https://ajax.googleapis.com/ajax/services/search/blogs?v=1.0&q=myself&start=' + str(start) + '&rsz=large')
    f = urllib2.urlopen(url)
    data = json.load(f)
    for r in data['responseData']['results']:
        if r['postUrl'] not in urllist:    # only keep posts we haven't archived yet
            entry = "%s\t%s\t%s\t%s\t%s\t%s" % (scrapedate, r['title'], r['postUrl'], r['publishedDate'], r['blogUrl'], r['author'])
            entry = entry.encode("utf-8")
            entries.append(entry)
            c = c + 1
    start += 8

print 'added %s entries' % (c)

# Append the new entries to the archive, one per line.
if entries:
    se = '\n'.join(entries)
    f = open(txt, 'a')
    f.write(se + '\n')
    f.close()
</source>
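Each line in the blogs file ends up with six tab-separated columns: scrape date, post title, post URL, published date, blog URL and author. A minimal sketch (assuming the blogs file sits in the working directory, same as above) to read the archive back and list the stored post URLs:

<source lang="python">
#!/usr/bin/python2.6
# Read the tab-separated archive back and print the post URL column.
# Column order assumed from the scraper above:
# scrapedate, title, postUrl, publishedDate, blogUrl, author
f = open('blogs', 'r')
for row in f.read().split('\n'):
    cols = row.split('\t')
    if len(cols) > 2:
        print cols[2]    # the post URL, the same column both scrapers key on
f.close()
</source>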
== Verbose blog page scraper: Python + BeautifulSoup ==
<source lang="python">
#!/usr/bin/python2.6
import os
import urllib2
from BeautifulSoup import BeautifulSoup

blogfile = 'blogs'
f = open(blogfile, 'r')
data = f.read()
f.close()

rows = data.split('\n')

# Put the text file in a 2D array, I imagine this will save me some work later
a = []
c = 0
for row in rows:
    items = row.split('\t')
    if len(items) > 1:
        a.append([])
        for item in items:
            a[c].append(item)
        c = c + 1

p = 1
for element in a:
    print '----------------------------------------------------------------------'
    print 'scraping link %d of %d' % (p, len(a))
    print '----------------------------------------------------------------------'
    target = element[2]    # third column holds the post URL
    request = urllib2.Request(target)
    request.add_header("User-Agent", "Mozilla/5.0 (X11; U; Linux x86_64;fr; rv:1.9.1.5) Gecko/20091109 Ubuntu/9.10 (karmic) Firefox/3.5.5")
    try:
        f = urllib2.urlopen(request)
        content = f.read()
    except Exception, e:
        print 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
        print 'urllib error ---> %s' % (e)
        print 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
        p = p + 1
        continue    # skip this page instead of re-parsing the previous one
    try:
        # Inner pass strips out script and style text nodes,
        # outer pass gives an iterable soup of what's left.
        soup = BeautifulSoup(''.join(BeautifulSoup(content).findAll(text=lambda text: text.parent.name != "script" and text.parent.name != "style")))
        for line in soup:
            if len(line) > 10:
                line = ' '.join(line.split())    # collapse newlines and repeated whitespace
                print line
    except Exception, e:
        print 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
        print 'beautifulsoup error ---> %s ' % (e)
        print 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
    p = p + 1
</source>
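The nested BeautifulSoup call above does the heavy lifting: the inner pass collects every text node whose parent isn't a script or style tag, and the outer pass wraps the concatenated result back into something iterable. A minimal sketch of that filter on its own, assuming the same Python 2.6 + BeautifulSoup 3 setup as the scripts above (the HTML snippet is made up):

<source lang="python">
#!/usr/bin/python2.6
from BeautifulSoup import BeautifulSoup

html = ("<html><head><style>body{color:red}</style></head>"
        "<body><p>Hello blog</p><script>alert('hi');</script></body></html>")

# Keep only the text nodes that don't sit inside <script> or <style>
visible = BeautifulSoup(html).findAll(text=lambda text: text.parent.name != "script" and text.parent.name != "style")
print ''.join(visible)    # prints: Hello blog
</source>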