User:Renee Oldemonnikhof/ebooks

From XPUB & Lens-Based wiki


Internet News Graph

The newsfeeds that I worked with:

# Feed list used by both scripts below: "url" is the RSS endpoint (it is
# also embedded in the database keys), "name"/"location" label the grid
# column, "timezone" is the feed's UTC offset for the local-time display.
# NOTE(review): every timezone is 2 — looks like a placeholder; verify.
feeds = [
    {"url":"http://allafrica.com/tools/headlines/rdf/latest/headlines.rdf", "name":"All Africa", "location":"Africa", "timezone":2},
    {"url":"http://www.nytimes.com/services/xml/rss/nyt/GlobalHome.xml", "name":"New York Times", "location":"America", "timezone":2},
    {"url":"http://rssfeeds.usatoday.com/usatoday-NewsTopStories", "name":"USA Today", "location":"America", "timezone":2},
    {"url":"http://english.peopledaily.com.cn/rss/China.xml", "name":"People Daily", "location":"China", "timezone":2},
    # Japan Times feed disabled in this copy of the list:
    #{"url":"http://feeds.feedburner.com/japantimes", "name":"Japan Times", "location":"Japan", "timezone":2},
    {"url":"http://feeds.news.com.au/public/rss/2.0/heraldsun_afl_205.xml", "name":"Herald Sun", "location":"Australia", "timezone":2},
    # NOTE(review): this URL contains a literal space ("the guardian") —
    # probably meant "theguardian"; verify that the feed resolves.
    {"url":"http://feeds.guardian.co.uk/the guardian/rss", "name":"Guardian", "location":"England", "timezone":2}
]




The code that reads the feeds:

#!/usr/bin/python
import feedparser

from pprint import pprint


# One record per news source.  "url" is the RSS endpoint (also embedded in
# the database keys), "name"/"location" label the grid column, "timezone"
# is the feed's UTC offset used for the local-time display.
feeds = [
    {"url": url, "name": name, "location": location, "timezone": tz}
    for (url, name, location, tz) in (
        ("http://allafrica.com/tools/headlines/rdf/latest/headlines.rdf", "All Africa", "Africa", 2),
        ("http://www.nytimes.com/services/xml/rss/nyt/GlobalHome.xml", "New York Times", "America", 2),
        ("http://rssfeeds.usatoday.com/usatoday-NewsTopStories", "USA Today", "America", 2),
        ("http://english.peopledaily.com.cn/rss/China.xml", "People Daily", "China", 2),
        ("http://feeds.feedburner.com/japantimes", "Japan Times", "Japan", 2),
        ("http://feeds.news.com.au/public/rss/2.0/heraldsun_afl_205.xml", "Herald Sun", "Australia", 2),
        ("http://feeds.guardian.co.uk/the guardian/rss", "Guardian", "England", 2),
    )
]



def entry_key(t, url):
    # Internal helper: database key "YYYY-MM-DD H url" for one feed-hour
    # bucket.  Month AND day are zero-padded so that producer keys line up
    # with the reader script, which prefixes its lookup keys with
    # str(datetime.date) ("YYYY-MM-DD") — the original code only padded the
    # month, so articles dated on days 1-9 of a month never matched a cell.
    # The hour is deliberately NOT padded: the reader uses plain str(hour).
    return "%04d-%02d-%02d %d %s" % (t.tm_year, t.tm_mon, t.tm_mday, t.tm_hour, url)


def parseFeed(url, db):
    """Fetch the RSS/Atom feed at url and bucket its entries into db.

    db maps "YYYY-MM-DD H url" keys to lists of article dicts holding
    "title", "summary", "url" and, when the feed supplies them,
    "updated" / "updated_parsed".  Entries without a parseable date are
    skipped, since they cannot be placed in an hour bucket.
    """
    feed = feedparser.parse(url)

    for e in feed.entries:
        news = {"title": e.title, "summary": e.summary, "url": e.link}

        # The raw "updated" string is optional; keep it when present.
        try:
            news["updated"] = e.updated
        except AttributeError:
            pass

        # Without a parsed timestamp the entry cannot be bucketed at all.
        try:
            updated = e.updated_parsed
        except AttributeError:
            continue
        news["updated_parsed"] = updated

        db.setdefault(entry_key(updated, url), []).append(news)


# Scrape every configured feed into one hour-bucketed dict, then persist
# it for the CGI front-end to read.
db = {}
for newsfeed in feeds:
    parseFeed(newsfeed["url"], db)

# pickle the db...
import pickle

# open() instead of the Python-2-only file() builtin, binary mode so the
# pickle round-trips cleanly, and finally-close so the handle is not
# leaked if dump() raises.
myfile = open("/var/lib/newsgraph/database.dat", "wb")
try:
    pickle.dump(db, myfile)
finally:
    myfile.close()




The code that uses the database to create the table structure:

#!/usr/bin/env python

# -*- coding:utf-8 -*-



import pickle

import datetime, time

import cgi



# List with dicts describing the newsfeeds

# One record per news source, mirroring the scraper's list: "url" must
# match the keys in the pickled database exactly, "name"/"location" label
# the grid column, "timezone" is the feed's UTC offset for the clock row.
feeds = [
    {"url": url, "name": name, "location": location, "timezone": tz}
    for (url, name, location, tz) in (
        ("http://allafrica.com/tools/headlines/rdf/latest/headlines.rdf", "All Africa", "Africa", 2),
        ("http://www.nytimes.com/services/xml/rss/nyt/GlobalHome.xml", "New York Times", "America", 2),
        ("http://rssfeeds.usatoday.com/usatoday-NewsTopStories", "USA Today", "America", 2),
        ("http://english.peopledaily.com.cn/rss/China.xml", "People Daily", "China", 2),
        ("http://feeds.feedburner.com/japantimes", "Japan Times", "Japan", 2),
        ("http://feeds.news.com.au/public/rss/2.0/heraldsun_afl_205.xml", "Herald Sun", "Australia", 2),
        ("http://feeds.guardian.co.uk/the guardian/rss", "Guardian", "England", 2),
    )
]



# Highlight palette for search terms: five black-on-light pairs followed
# by five white-on-dark pairs.  The legend row and the grid-colouring loop
# both cycle through this list in order, so index N is term N's colour.
searchcolors = (
    [{"fg": "000000", "bg": bg}
     for bg in ("FF0099",    # pink
                "FFFF00",    # yellow
                "3399FF",    # light blue
                "66FF00",    # light green
                "CC00FF")]   # light purple
    +
    [{"fg": "ffffff", "bg": bg}
     for bg in ("660033",    # dark purple
                "FF0000",    # red
                "FF6600",    # orange
                "006600",    # dark green
                "663300")]   # brown
)



# Load the pickled database produced by the scraper script.  open()+close
# replaces the Python-2-only file() builtin, which also leaked the handle;
# binary mode is required for pickles written with protocol >= 1 and is
# harmless for protocol 0.
dbfile = open("/var/lib/newsgraph/database.dat", "rb")
db = pickle.load(dbfile)
dbfile.close()



#search = "obama"



# parse form-data

# Parse the CGI form data; the "search" field holds a comma-separated list
# of terms to highlight in the grid.
formdata = cgi.FieldStorage()

if "search" in formdata:
    all_search_terms = formdata["search"].value
    # Strip whitespace around each term so "obama, iraq" also matches
    # "iraq", and drop empty terms (e.g. from a trailing comma), which
    # previously produced blank legend cells that could never match.
    search = [term.strip() for term in all_search_terms.split(",") if term.strip()]
else:
    all_search_terms = ""
    search = []



# Daylight Savings Time offset, added to every feed's timezone for the
# local-time display: '1' in summer, '0' in winter.
dst = 1

# Number of day-columns in the grid (today plus the previous three days).
days_history = 4

# Debug dump of the database, kept for development:
#for (key, articles) in db.items():
#    print key
#    for a in articles:
#        print "\t", a['title']

# Apache expects the first lines of output from ANY cgi-script to be
# content-type data, with the header block terminated by a blank line.
print """Content-type: text/html\n\n"""



# Static page head: charset, external stylesheet, title.
print """<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">

<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" >

<head>

	<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />

    <link rel="stylesheet" href="/newsgraph.css" type="text/css" media="screen" />

    <title>Internet Newsgraph</title>

</head>"""

# Page banner with the search form; the form GETs back to this script.
print """<body>

    <div>

    <h1 style="font-family:verdana; color:#ffffff; font-size:30px"><b>Internet News Graph</b><br/>

    <form>"""

# Pre-fill the search box with the terms from the current request.
# NOTE(review): all_search_terms is echoed without HTML-escaping — this is
# reflected XSS; should pass it through cgi.escape(..., quote=True).
print """Search: <input type="text" name="search" value="%s"/>""" % all_search_terms

print """</form> 

    </h1>

    </div>

"""



# Legend: one coloured cell per active search term, stepping through the
# palette in the same order the grid-colouring loop below does, so the
# legend colour for term N matches the cells that term highlights.
# NOTE(review): search_str is echoed without HTML-escaping — verify the
# input is trusted or escape it.
print """<table><tr>"""

color_nr = 0

for search_str in search:

    print """<td class="searchterm" style="background-color: #%(bg)s; color: #%(fg)s;">""" % searchcolors[color_nr]

    print search_str

    print "</td>"

    # Wrap around if there are more terms than palette entries.
    color_nr = (color_nr + 1) % len(searchcolors)

print "</tr></table>"



# Every day column group spans one GMT cell plus one cell per feed.
colspan = len(feeds) + 1

# Stretch the wrapper to 80% of the viewport width per day of history so
# the columns stay readable (the page scrolls horizontally).
divwidth = 80 * days_history

print """<div style="width: %d%%;">""" % divwidth

print """<table>"""



# Table header for each day: reversed(range(...)) walks d = 3, 2, 1, 0,
# so the oldest day is the leftmost group and today is the rightmost.
print """<tr>"""

for d in reversed(range(days_history)):

    day = datetime.date.today() - datetime.timedelta(days = d)

    print '<th colspan="%d" style="font-family:verdana; color:#ffffff; font-size:20px; background-color: #545454;">' % colspan

    # str(date) renders as ISO "YYYY-MM-DD".
    print str(day)

    print """</th>"""

print """</tr>"""



# Second header row, repeated per day group: a GMT label plus each feed's
# name, location and current local clock time (the feed's timezone offset
# plus the global dst flag, applied to UTC now).
print """<tr>"""

for d in reversed(range(days_history)):

    print """<td class="time">GMT</td>""" 

    for feed in feeds:

        print """<td class="feedname">"""+feed["name"].encode("utf-8")

        print feed["location"].encode("utf-8")

        print "<br/>"

        # NOTE(review): assumes feed["timezone"] is a whole-hour UTC offset;
        # all are currently 2 — confirm per-feed values are intended.
        print time.strftime("%H:%M", (datetime.datetime.utcnow() + datetime.timedelta(hours=(feed["timezone"] + dst))).timetuple())+"""</td>"""

print """</tr>"""



# Main grid: one table-row for each hour of the day.  Within a row, each
# day group contributes an hour label plus one cell per feed; a cell holds
# every article that feed published in that hour (per the scraper's keys).
for hour in range(0, 24):

    print """<tr>"""

    for d in reversed(range(days_history)):

        day = datetime.date.today() - datetime.timedelta(days = d)

        print """<td class="time">"""

        # %2d space-pads single-digit hours (" 2:00") for alignment.
        print "%2d:00" % hour

        print """</td>"""

        for feed in feeds:

            # Database key "YYYY-MM-DD H url" — must match the format the
            # scraper script writes into the pickle.
            key = str(day)+" "+str(hour)+" "+feed["url"]

            # First pass: does any search term occur in any article summary
            # in this bucket?  On a hit, color_nr is left at the matching
            # term's palette slot (same ordering as the legend above).
            # Note: only 'summary' is searched, not the title.
            found=False 

            if key in db:

                for article in db[key]:

                    color_nr = 0

                    for search_str in search:

                        found = search_str.lower() in article['summary'].lower()

                        if found:

                            break

                        color_nr = (color_nr + 1) % len(searchcolors)

                    if found:

                        break

            # Highlight the whole cell with the matched term's background.
            if found:

                print """<td class="news" style="background: #%s">""" % searchcolors[color_nr]["bg"]

            else:

                print """<td class="news">"""

            print """<div style="height:100px;">"""

            # Second pass: emit every article in the bucket as a link; the
            # links take the matched term's foreground colour when the cell
            # matched, so text stays readable on the coloured background.
            if key in db:

                for article in db[key]:

                    if found:

                        print """<a class="news" style="color: #%s" href="%s">""" % (searchcolors[color_nr]["fg"], article["url"].encode("utf-8"))

                    else:

                        print """<a class="news" href="%s">""" % article["url"].encode("utf-8")

                    print article['title'].encode("utf-8")

                    print """</a>"""

                    print "<br/>"

            print """</div>"""            

            print """</td>"""

    print """</tr>"""



# Close the grid, the width wrapper and the document.
print """</table>"""

print "</div>"

print "</body>"

print "</html>"