User:Renee Oldemonnikhof/ebooks
< User:Renee Oldemonnikhof
Revision as of 20:42, 23 September 2010 by Michael Murtaugh (talk | contribs) (Created page with " == Internet News Graph == '''The newsfeeds that i worked with''' <source lang="text"> feeds = [ {"url":"http://allafrica.com/tools/headlines/rdf/latest/headlines.rdf",...")
Internet News Graph
The newsfeeds that I worked with
# Feed registry: one dict per news source.
#   url      - RSS/Atom feed address
#   name     - display name shown in the table header
#   location - geographic label shown under the name
#   timezone - hour offset added to UTC for the feed's local clock
#              (NOTE(review): every entry is 2, which looks unfinished --
#              confirm the intended per-feed offsets)
feeds = [
    {"url":"http://allafrica.com/tools/headlines/rdf/latest/headlines.rdf", "name":"All Africa", "location":"Africa", "timezone":2},
    {"url":"http://www.nytimes.com/services/xml/rss/nyt/GlobalHome.xml", "name":"New York Times", "location":"America", "timezone":2},
    {"url":"http://rssfeeds.usatoday.com/usatoday-NewsTopStories", "name":"USA Today", "location":"America", "timezone":2},
    {"url":"http://english.peopledaily.com.cn/rss/China.xml", "name":"People Daily", "location":"China", "timezone":2},
    # Disabled entry -- reason not recorded.
    #{"url":"http://feeds.feedburner.com/japantimes", "name":"Japan Times", "location":"Japan", "timezone":2},
    {"url":"http://feeds.news.com.au/public/rss/2.0/heraldsun_afl_205.xml", "name":"Herald Sun", "location":"Australia", "timezone":2},
    # NOTE(review): the space in "the guardian" looks like a typo in the
    # feed path -- verify this URL actually resolves.
    {"url":"http://feeds.guardian.co.uk/the guardian/rss", "name":"Guardian", "location":"England", "timezone":2}
]
The code that reads the feeds
#!/usr/bin/python
# Harvesting script: downloads each configured newsfeed and buckets the
# articles by date/hour/feed into a pickled database file.
import feedparser
from pprint import pprint
# Feed registry (url, display name, location label, UTC hour offset).
# NOTE(review): this list is duplicated in the CGI reader script below --
# keep the two copies in sync, or move them into a shared module.
feeds = [
    {"url":"http://allafrica.com/tools/headlines/rdf/latest/headlines.rdf", "name":"All Africa", "location":"Africa", "timezone":2},
    {"url":"http://www.nytimes.com/services/xml/rss/nyt/GlobalHome.xml", "name":"New York Times", "location":"America", "timezone":2},
    {"url":"http://rssfeeds.usatoday.com/usatoday-NewsTopStories", "name":"USA Today", "location":"America", "timezone":2},
    {"url":"http://english.peopledaily.com.cn/rss/China.xml", "name":"People Daily", "location":"China", "timezone":2},
    {"url":"http://feeds.feedburner.com/japantimes", "name":"Japan Times", "location":"Japan", "timezone":2},
    {"url":"http://feeds.news.com.au/public/rss/2.0/heraldsun_afl_205.xml", "name":"Herald Sun", "location":"Australia", "timezone":2},
    # NOTE(review): the space in "the guardian" looks like a typo in the
    # feed path -- verify this URL actually resolves.
    {"url":"http://feeds.guardian.co.uk/the guardian/rss", "name":"Guardian", "location":"England", "timezone":2}
]
def parseFeed(url, db):
    """Fetch the feed at *url* and bucket its entries into *db*.

    db maps "YYYY-MM-DD H <url>" -> list of article dicts with keys
    "title", "summary", "url" and (when available) "updated" /
    "updated_parsed".  Entries with no parsable timestamp are skipped,
    since they cannot be placed in an hour bucket.
    """
    feed = feedparser.parse(url)
    for e in feed.entries:
        news = {"title": e.title, "summary": e.summary, "url": e.link}
        try:
            news["updated"] = e.updated
        except AttributeError:
            pass  # raw timestamp string is optional
        try:
            news["updated_parsed"] = e.updated_parsed
            t = e.updated_parsed
        except AttributeError:
            # No machine-readable timestamp: the article cannot be bucketed.
            continue
        # The reader script rebuilds keys with str(datetime.date), which
        # zero-pads BOTH month and day; the hour stays unpadded there too.
        # The original code padded only the month, so articles dated the
        # 1st-9th of a month never matched any table cell.
        key = "%04d-%02d-%02d %d %s" % (t.tm_year, t.tm_mon, t.tm_mday,
                                        t.tm_hour, url)
        db.setdefault(key, []).append(news)
# Harvest every configured feed into one in-memory database dict, then
# persist it to disk for the CGI front-end to read.
import pickle

db = {}
for entry in feeds:
    parseFeed(entry["url"], db)

#pprint(db)
# Pickle the grouped headlines to the shared data file.
dbfile = open("/var/lib/newsgraph/database.dat", "w")
pickle.dump(db, dbfile)
dbfile.close()
The code that uses the database to create the table structure
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# CGI script: renders the pickled news database as an HTML table with one
# column per feed per day and one row per hour, highlighting articles that
# match the user's search terms.
import pickle
import datetime, time
import cgi
# List with dicts describing the newsfeeds
# NOTE(review): duplicated from the harvesting script -- keep the two
# copies in sync, or move them into a shared module.
feeds = [
    {"url":"http://allafrica.com/tools/headlines/rdf/latest/headlines.rdf", "name":"All Africa", "location":"Africa", "timezone":2},
    {"url":"http://www.nytimes.com/services/xml/rss/nyt/GlobalHome.xml", "name":"New York Times", "location":"America", "timezone":2},
    {"url":"http://rssfeeds.usatoday.com/usatoday-NewsTopStories", "name":"USA Today", "location":"America", "timezone":2},
    {"url":"http://english.peopledaily.com.cn/rss/China.xml", "name":"People Daily", "location":"China", "timezone":2},
    {"url":"http://feeds.feedburner.com/japantimes", "name":"Japan Times", "location":"Japan", "timezone":2},
    {"url":"http://feeds.news.com.au/public/rss/2.0/heraldsun_afl_205.xml", "name":"Herald Sun", "location":"Australia", "timezone":2},
    {"url":"http://feeds.guardian.co.uk/the guardian/rss", "name":"Guardian", "location":"England", "timezone":2}
]
# Colour palette for search terms: each term is assigned the next
# foreground/background pair (cycling when there are more terms than
# colours).  Used for both the legend and the highlighted cells/links.
searchcolors= [
    {"fg":"000000", "bg":"FF0099"},# black - Pink
    {"fg":"000000", "bg":"FFFF00"},# black - yellow
    {"fg":"000000", "bg":"3399FF"},# black - light blue
    {"fg":"000000", "bg":"66FF00"},# black - light green
    {"fg":"000000", "bg":"CC00FF"},# black - light purple
    {"fg":"ffffff", "bg":"660033"},# white - dark purple
    {"fg":"ffffff", "bg":"FF0000"},# white - red
    {"fg":"ffffff", "bg":"FF6600"},# white - orange
    {"fg":"ffffff", "bg":"006600"},# white - dark green
    {"fg":"ffffff", "bg":"663300"},# white - brown
]
# Load the database written by the harvesting script.  Keys are
# "YYYY-MM-DD H <feed-url>", values are lists of article dicts.
# NOTE(review): dbfile is never closed; harmless for a short-lived CGI
# process but worth tidying.
dbfile = file("/var/lib/newsgraph/database.dat", "r")
db = pickle.load(dbfile)
#search = "obama"
# parse form-data: "search" is a comma-separated list of terms,
# e.g. ?search=obama,oil
formdata = cgi.FieldStorage()
if "search" in formdata:
    all_search_terms = formdata["search"].value
    search = all_search_terms.split(",")
else:
    all_search_terms = ""
    search = []
# Daylight Savings Time offset applied to every feed's local clock:
# 1 in summer, 0 in winter.
dst = 1
# Number of past days (column groups) to render, today included.
days_history = 4
#for (key, articles) in db.items():
#    print key
#    for a in articles:
#        print "\t", a['title']
# Apache webserver expects the first two lines of output from ANY cgi-script to be content-type data.
print """Content-type: text/html\n\n"""
# Static XHTML head: stylesheet + charset declaration.
print """<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" >
<head>
<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />
<link rel="stylesheet" href="/newsgraph.css" type="text/css" media="screen" />
<title>Internet Newsgraph</title>
</head>"""
# Page banner with the search form; the input echoes the current query so
# the terms stay visible after submitting.
print """<body>
<div>
<h1 style="font-family:verdana; color:#ffffff; font-size:30px"><b>Internet News Graph</b><br/>
<form>"""
print """Search: <input type="text" name="search" value="%s"/>""" % all_search_terms
print """</form>
</h1>
</div>
"""
# Legend: one coloured cell per search term, cycling through searchcolors
# in the same order used for highlighting cells further down.
print """<table><tr>"""
color_nr = 0
for search_str in search:
    print """<td class="searchterm" style="background-color: #%(bg)s; color: #%(fg)s;">""" % searchcolors[color_nr]
    print search_str
    print "</td>"
    color_nr = (color_nr + 1) % len(searchcolors)
print "</tr></table>"
# Each day spans one column per feed plus a leading GMT time column.
colspan = len(feeds) + 1
# Container width in percent (80% of the viewport per day) so the table
# can scroll horizontally across several days.
divwidth = 80 * days_history
print """<div style="width: %d%%;">""" % divwidth
print """<table>"""
# Table header for each day
print """<tr>"""
for d in reversed(range(days_history)):
    # d counts back from today, so reversed() renders oldest day first.
    day = datetime.date.today() - datetime.timedelta(days = d)
    print '<th colspan="%d" style="font-family:verdana; color:#ffffff; font-size:20px; background-color: #545454;">' % colspan
    print str(day)
    print """</th>"""
print """</tr>"""
# top row with names of newsfeeds
print """<tr>"""
for d in reversed(range(days_history)):
    print """<td class="time">GMT</td>"""
    for feed in feeds:
        print """<td class="feedname">"""+feed["name"].encode("utf-8")
        print feed["location"].encode("utf-8")
        print "<br/>"
        # Current wall-clock time at the feed's location (UTC + offset + DST).
        print time.strftime("%H:%M", (datetime.datetime.utcnow() + datetime.timedelta(hours=(feed["timezone"] + dst))).timetuple())+"""</td>"""
print """</tr>"""
# one table-row for each hour of the day
for hour in range(0, 24):
print """<tr>"""
for d in reversed(range(days_history)):
day = datetime.date.today() - datetime.timedelta(days = d)
print """<td class="time">"""
print "%2d:00" % hour
print """</td>"""
for feed in feeds:
# search summary for the "search" var
# add the database
key = str(day)+" "+str(hour)+" "+feed["url"]
found=False
if key in db:
for article in db[key]:
color_nr = 0
for search_str in search:
found = search_str.lower() in article['summary'].lower()
if found:
break
color_nr = (color_nr + 1) % len(searchcolors)
if found:
break
if found:
print """<td class="news" style="background: #%s">""" % searchcolors[color_nr]["bg"]
else:
print """<td class="news">"""
print """<div style="height:100px;">"""
# key = day.year+"-"+day.month+"-"+day.mday+" "+hour+" "+feed
#key = str(day)+" "+str(hour)+" "+feed["url"]
# print "searching "+key
# for k in db.keys():
# print "in "+k
if key in db:
for article in db[key]:
if found:
print """<a class="news" style="color: #%s" href="%s">""" % (searchcolors[color_nr]["fg"], article["url"].encode("utf-8"))
else:
print """<a class="news" href="%s">""" % article["url"].encode("utf-8")
print article['title'].encode("utf-8")
print """</a>"""
print "<br/>"
print """</div>"""
print """</td>"""
print """</tr>"""
print """</table>"""
print "</div>"
print "</body>"
print "</html>"