New News scraper
This script collects the 8 most recent news items from Google News for a given search term. It writes each item's title, content, URL and image URL (if it has one) to a text file. Each title is also appended to an HTML file as a link to the item's URL. <source lang="python">
# CGI script (Python 2): fetch the news results for `searchterm`, append each
# result's title/content/url/image to a text log, and append a link line to an
# HTML page unless that exact line is already present in the page.
import cgi
import cgitb
import os
import pprint
import urllib
import urllib2

import simplejson

# Show tracebacks in the browser instead of a bare server error.
cgitb.enable()

# CGI response header, followed by the blank line that ends the headers.
print("Content-Type: text/html")
print("")

searchterm = 'and'

# NOTE(review): the assignment of `url` is missing from this copy of the
# script, so the request below raised NameError.  The endpoint here is
# reconstructed from the response shape the loop reads
# (responseData -> results -> title/content/unescapedUrl/image), which matches
# the Google AJAX News Search API; rsz=8 follows the "8 most recent" in the
# description.  Confirm against the original source.
url = ('https://ajax.googleapis.com/ajax/services/search/news?'
       + urllib.urlencode([('v', '1.0'), ('q', searchterm), ('rsz', '8')]))

request = urllib2.Request(url)
response = urllib2.urlopen(request)

# Process the JSON string.
try:
    results = simplejson.load(response)
finally:
    response.close()

# Not used in the visible part of the script; kept in case later code needs it.
pp = pprint.PrettyPrinter(indent=4)

# An error reply may carry a null/missing responseData; treat it as no results
# rather than crashing on the subscript.
newsitems = (results.get('responseData') or {}).get('results') or []

# NOTE(review): the separator literal lost its quotes in this copy; only the
# dashes survived.  Confirm the exact text (and any embedded newlines).
separator = '\n\n---------- ----------'

with open("/home/pmilicki/newsresults.txt", "a") as myfile:
    with open("/home/pmilicki/public_html/empty.html", "r+") as myhtml:
        alllines = myhtml.readlines()
        print(alllines)

        # A file opened "r+" needs a seek between reading and writing;
        # position explicitly at the end so new links are appended.
        myhtml.seek(0, os.SEEK_END)

        # Set lookup instead of rescanning the list for every result; it is
        # also updated below so duplicates within one run are caught.
        knownlines = set(alllines)

        for result in newsitems:
            title = result['title'].encode('utf-8')
            content = result['content'].encode('utf-8')
            link = result['unescapedUrl'].encode('utf-8')

            # The image is optional: fall back to an empty string when the
            # key is absent or holds no usable URL.
            try:
                image = result['image']['url'].encode('utf-8')
            except (KeyError, TypeError, AttributeError):
                image = ''

            outstring = ''.join([
                '\n\ntitle:', title,
                '\n\ncontent:', content,
                '\n\nurl:', link,
                '\n\nimage:', image,
                separator,
            ])

            # NOTE(review): the HTML tags wrapping the anchor were stripped
            # from this copy; <p>...</p> is a reconstruction.  If the existing
            # page uses different markup, lines written by the old script will
            # not match and will be added once more.  The URL is escaped
            # because it lands inside an attribute; the title is written as
            # received (presumably already HTML from the API -- confirm).
            htmloutstring = ('<p><a href="' + cgi.escape(link, True) + '">'
                             + title + '</a></p>\n')

            myfile.write(outstring)
            if htmloutstring not in knownlines:
                myhtml.write(htmloutstring)
                knownlines.add(htmloutstring)