New News scraper

From XPUB & Lens-Based wiki

This script collects the 8 most recent news items from Google News for a given search term. It writes each item's title, content, URL and image URL (if it has one) to a text file. The titles are also appended to an HTML file as links to their source URLs. <source lang="python">

# CGI preamble: turn on traceback reporting, emit the HTTP header, then pull
# in the modules used by the rest of the script.  The original statement
# order is kept (cgitb is enabled before the remaining imports run).
import cgi
import cgitb
cgitb.enable()

# Header line followed by the mandatory blank line that ends the CGI headers.
# A parenthesised single-argument print behaves identically on Python 2.
print("Content-Type: text/html")
print("")

import os
import urllib2
import simplejson
import pprint

# Term to search Google News for.
# NOTE(review): it is concatenated into the query string verbatim, so it must
# already be URL-safe (no spaces, '&', '#', ...) -- confirm before changing it.
searchterm = 'and'

# Query URL for the Google AJAX news search service; the fixed parameters are
# kept exactly as in the original (rsz=8 presumably selects the eight results
# mentioned in the page description).
url = ('https://ajax.googleapis.com/ajax/services/search/news?'+'v=1.0&rsz=8&ned=us&hl=en&scoring=d&q='+searchterm)

request = urllib2.Request(url)
response = urllib2.urlopen(request)
try:
    # Process the JSON string.
    results = simplejson.load(response)
finally:
    # The original never closed the HTTP response; release it as soon as the
    # body has been parsed (or parsing has failed).
    response.close()

pp = pprint.PrettyPrinter(indent=4)


def _utf8(value):
    """Return the text *value* encoded as a UTF-8 byte string."""
    return value.encode('utf-8')


# Trailer appended after every record in the text log.
# NOTE(review): the quotes (and probably line breaks) of this literal were
# lost in the wiki export, which shows only "---------- ----------".
# Reconstructed as two dashed lines -- confirm against the original script.
RECORD_SEPARATOR = '\n----------\n----------\n'

# Append every fetched result to the text log, and add a link for each result
# that is not yet present in the HTML page.  Both files are opened with
# `with` so they are closed even if a result is malformed (the original
# never closed them).
with open("/home/pmilicki/newsresults.txt", "a") as myfile:
    with open("/home/pmilicki/public_html/empty.html", "r+") as myhtml:
        alllines = myhtml.readlines()
        print(alllines)

        # Lines already in the HTML page; built once so each result needs a
        # single membership test instead of a scan over the whole file.
        known_lines = set(alllines)

        # A file opened in "r+" mode needs an explicit seek when switching
        # from reading to writing; position at end-of-file so links append.
        myhtml.seek(0, 2)

        for result in results['responseData']['results']:
            title = _utf8(result['title'])
            link = _utf8(result['unescapedUrl'])
            content = _utf8(result['content'])

            # Not every result carries an image; only the "missing" cases are
            # treated as empty instead of swallowing every exception.
            try:
                image = _utf8(result['image']['url'])
            except (KeyError, TypeError):
                image = ''

            # NOTE(review): the export rendered the tags around the anchor as
            # paragraph breaks, so they are reconstructed as <p>...</p>.  This
            # keeps the entry on one line, which the line-by-line duplicate
            # check relies on -- confirm against the original script.
            # NOTE(review): title and URL are inserted into the HTML
            # unescaped, exactly as in the original.
            htmloutstring = '<p><a href="' + link + '">' + title + '</a></p>\n'

            outstring = ''.join([
                '\n\ntitle:', title,
                '\n\ncontent:', content,
                '\n\nurl:', link,
                '\n\nimage:', image,
                # ------------------------------------------
                RECORD_SEPARATOR,
            ])

            myfile.write(outstring)
            if htmloutstring not in known_lines:
                myhtml.write(htmloutstring)
                # Remember it so a duplicate within this run is skipped too.
                known_lines.add(htmloutstring)