Generating Slash Fiction from BBC News RSS Feeds

From XPUB & Lens-Based wiki
Revision as of 10:23, 9 December 2013 by Lidia.Pereira (talk | contribs)
(diff) ← Older revision | Latest revision (diff) | Newer revision → (diff)
import re
from nltk.probability import FreqDist, LidstoneProbDist
from nltk.probability import ConditionalFreqDist as CFD
from nltk.util import tokenwrap, LazyConcatenation
from nltk.model import NgramModel
from nltk.metrics import f_measure, BigramAssocMeasures
from nltk.collocations import BigramCollocationFinder
from nltk.compat import python_2_unicode_compatible, text_type
import feedparser, pickle, nltk
import random
import nltk.tokenize

# Feed to read, and the substrings looked for in each lowercased entry summary.
url = "http://feeds.bbci.co.uk/news/rss.xml"
searchwords = [
    "president",
    "minister",
    "cameron",
    "presidential",
    "obama",
    "angela",
    "barroso",
    "pm",
    "chancellor",
]
# Parsed feed; the items are read from rawFeed.entries further down.
rawFeed = feedparser.parse(url)

# thank you mathijs for my own personalized generate function!
def lidia_generate(dumpty, length=100):
    """Return text generated from a trigram model of `dumpty`.

    The model (NgramModel of order 3 with a Lidstone estimator, gamma 0.2)
    is built on the first call and stored on `dumpty` as `_trigram_model`,
    so later calls with the same object reuse it. `length` is passed to the
    model's generate(); the result is joined into one string by tokenwrap.
    """
    def lidstone(fdist, bins):
        # `bins` is part of the estimator call signature but is not used.
        return LidstoneProbDist(fdist, 0.2)

    if '_trigram_model' not in vars(dumpty):
        dumpty._trigram_model = NgramModel(3, dumpty, estimator=lidstone)
    model = dumpty._trigram_model
    return tokenwrap(model.generate(length))

def convert(humpty):
    """Tokenize the string `humpty` and wrap the tokens in an nltk.Text."""
    return nltk.Text(nltk.word_tokenize(humpty))

# Collect the lowercased summary of every feed entry that mentions at least
# one of the search words, then save the combined text to afiltertest.txt.
superSumario = " "
for i in rawFeed.entries:
    # Entries without a summary are skipped instead of raising KeyError.
    sumario = i.get("summary", "").lower()
    # any() adds a summary once even when several search words match it
    # (e.g. "president" and "presidential"); looping over the words and
    # appending per hit duplicated such summaries.
    # NOTE: matching is by substring, so "pm" also matches inside longer words.
    if any(w in sumario for w in searchwords):
        superSumario = superSumario + sumario

# Written once, after the loop, so the file exists (and is current) even when
# no entry matched, and so the handle is closed. The text is stored as plain
# UTF-8 rather than pickled: the file is read back with a plain .read(), which
# would otherwise pull the pickle framing into the text.
payload = superSumario
if not isinstance(payload, bytes):
    payload = payload.encode("utf-8")
with open("afiltertest.txt", "wb") as out:
    out.write(payload)

# Load the filtered feed text and the text in harrypotterslash.txt.
# `with` closes each file as soon as it has been read.
with open('afiltertest.txt') as rss_file:
    textrss = rss_file.read()
with open('harrypotterslash.txt') as slash_file:
    textslash = slash_file.read()

# Find every capitalised word that directly follows a lowercase letter and one
# whitespace character (i.e. capitalised words that do not start a sentence).
names = re.compile(r"[a-z]\s([A-Z]\w+)")
pi = names.findall(textslash)

# De-duplicate while keeping first-seen order; the set makes each membership
# test O(1) instead of rescanning the list.
lista = []
seen = set()
for name in pi:
    if name not in seen:
        seen.add(name)
        lista.append(name)

variavel = "|".join(lista)

if lista:
    # \b on both sides: only whole words are matched, so a short name listed
    # first can no longer replace just the prefix of a longer word.
    ze = r"\b(" + variavel + r")\b"
else:
    # With no names the group would be "()", which matches the empty string
    # at every position; (?!) is a pattern that never matches.
    ze = r"(?!)"

nomes = re.compile(ze)

# One pattern per office title; each captures the title, one whitespace
# character and whatever ".\w*.\w*" matches after it in the feed text.
office1 = re.compile(r"(president\s.\w*.\w*)")
office2 = re.compile(r"(minister\s.\w*.\w*)")
office3 = re.compile(r"(chancellor\s.\w*.\w*)")

# All matches, presidents first, then ministers, then chancellors.
repls = []
for office in (office1, office2, office3):
    repls.extend(office.findall(textrss))

def r(m):
    """Replacement callback for re.sub: return a random entry of `repls`.

    `m` is the match object for the name being replaced. When `repls` is
    empty (no office holder was found in the feed text) the matched text is
    returned unchanged instead of letting random.choice raise IndexError.
    """
    if not repls:
        return m.group(0)
    return random.choice(repls)

yes = nomes.sub(r,textslash)
lala = re.sub("\"","",yes)

both = lala.strip() + textrss.strip()

puff = convert(both)
title = lidia_generate(puff,5)
title = "~ " + str(title) + " ~"
title = re.sub(r"\.","",title)
print title
herpDerp = lidia_generate(puff, 500)
herpDerp = str(herpDerp) + "."
print herpDerp
# ~~~~~ html file writing bit ~~~~~ #
# The title and body are generated from feed text, so they are escaped before
# being placed in the page. html.escape is the Python 3 spelling, cgi.escape
# the Python 2 one.
try:
    from html import escape as _escape_html
except ImportError:
    from cgi import escape as _escape_html

# `with` closes test.html even if the write fails. The paragraph is closed
# with </p> (it used to be the malformed </p1>).
with open('test.html','w') as f:
    f.write("""
    <!DOCTYPE html>
<html>
<head>
    <link href='http://fonts.googleapis.com/css?family=Alegreya+SC' rel='stylesheet' type='text/css'>
    <link href='http://fonts.googleapis.com/css?family=Oleo+Script+Swash+Caps' rel='stylesheet' type='text/css'>
    <style>
        h1 {
        color:#078;
        font-family: 'Oleo Script Swash Caps', sans-serif;
        font-size:44px;
        margin:10% 0 0 5%;}
        p {margin:9% 30% 0 5%;
        font-family: 'Alegreya SC', serif;
        font-size:27px;
        text-indent:2%;
        text-align:justify;
        punctuation-trim:[start];}
        body {background-color:#CB9;}
    </style>
</head>
<body>
    <h1>""" + _escape_html(title) + """</h1>
    <p>""" + _escape_html(herpDerp) + """</p>
</body>
</html>""")


Rssfeedslashtml.png


president barack obama was kissing

in the course of making a two-part bbc profile of angela merkel put her panties down, which soon lay disguarded on the floor simultaneously. this was something he wanted to keep to himself. president barack obama is to be held at westminster, sources say, as david cameron signs a book of condolences. former uk prime minister tony blair started getting carried away and before he knew it his hands were up angela merkel's mind would be dragged back into the classroom. in these moments he was quiet surprised when she spoke to him. Hope you liked the view she whispered in president nicolas maduro. where's president mohammed morsi has double divination and the heavily scented room mixed with the girls, and they'd ignored him since the disaster of the hottest girls by his side. you know i never really gave you a chance after the president mohammed morsi's answer I'm off to the classroom 's trapdoor only president passos coelho , kissing president barack obama's friend and this was something he wanted to keep to himself. # bbctrending looks at the intense activity on social media in the corridor during the week, and I thought it was soon full of thoughts of his own good fortune. For years he'd been a nervous wreck , unable to speak to anybody and almost been sick with worry and dread about the impending game. Now seven days later he was one of the school also were taking a keen interest into president barack obama ball said minister tony blair had used his success on the floor simultaneously president barack obama had been on purpose to allow them to acknowledge president barack obama. Was officially announced by south african president jacob zuma's classroom. But the girls , and i thought it wasn't just the among the mass of quidditch fans that angela merkel reached the bottom. 
people in venezuela have begun voting in municipal elections seen as a crucial test for president jacob zuma had been looking at the same time as reaching under her skirt above her waist and president nicolas maduro minister describing them as "utterly incomprehensible". david and samantha cameron choose a black and white portrait of the last ones to leave, by the time president jacob zuma she was waiting for her best friend. president barack obama smiled across at her. chancellor angela merkel put her hands up.




Unfortunately, the CGI script doesn't seem to be working, so for now I'm writing HTML files to get a sense of what the final result will actually look like.

#!/usr/bin/env python
#-*- coding:utf-8 -*-

import re
from nltk.probability import FreqDist, LidstoneProbDist
from nltk.probability import ConditionalFreqDist as CFD
from nltk.util import tokenwrap, LazyConcatenation
from nltk.model import NgramModel
from nltk.metrics import f_measure, BigramAssocMeasures
from nltk.collocations import BigramCollocationFinder
from nltk.compat import python_2_unicode_compatible, text_type
import feedparser, pickle, nltk
import random
import nltk.tokenize
import cgi



# Feed to read, and the substrings looked for in each lowercased entry summary.
url = "http://feeds.bbci.co.uk/news/rss.xml"
searchwords = [
    "president",
    "minister",
    "cameron",
    "presidential",
    "obama",
    "angela",
    "barroso",
    "pm",
    "chancellor",
]
# Parsed feed; the items are read from rawFeed.entries further down.
rawFeed = feedparser.parse(url)

# thank you mathijs for my own personalized generate function!
def lidia_generate(dumpty, length=100):
    """Return text generated from a trigram model of `dumpty`.

    The model (NgramModel of order 3 with a Lidstone estimator, gamma 0.2)
    is built on the first call and stored on `dumpty` as `_trigram_model`,
    so later calls with the same object reuse it. `length` is passed to the
    model's generate(); the result is joined into one string by tokenwrap.
    """
    def lidstone(fdist, bins):
        # `bins` is part of the estimator call signature but is not used.
        return LidstoneProbDist(fdist, 0.2)

    if '_trigram_model' not in vars(dumpty):
        dumpty._trigram_model = NgramModel(3, dumpty, estimator=lidstone)
    generated = dumpty._trigram_model.generate(length)
    # Return `generated` itself instead if the tokens should not be wrapped.
    return tokenwrap(generated)

def convert(humpty):
    """Tokenize the string `humpty` and wrap the tokens in an nltk.Text."""
    return nltk.Text(nltk.word_tokenize(humpty))

# Collect the lowercased summary of every feed entry that mentions at least
# one of the search words, then save the combined text to afiltertest.txt.
superSumario = " "
for i in rawFeed.entries:
    # Entries without a summary are skipped instead of raising KeyError.
    sumario = i.get("summary", "").lower()
    # any() adds a summary once even when several search words match it
    # (e.g. "president" and "presidential"); looping over the words and
    # appending per hit duplicated such summaries.
    # NOTE: matching is by substring, so "pm" also matches inside longer words.
    if any(w in sumario for w in searchwords):
        superSumario = superSumario + sumario

# Written once, after the loop, so the file exists (and is current) even when
# no entry matched, and so the handle is closed. The text is stored as plain
# UTF-8 rather than pickled: the file is read back with a plain .read(), which
# would otherwise pull the pickle framing into the text.
payload = superSumario
if not isinstance(payload, bytes):
    payload = payload.encode("utf-8")
with open("afiltertest.txt", "wb") as out:
    out.write(payload)

# Load the filtered feed text and the text in harrypotterslash.txt.
# `with` closes each file as soon as it has been read.
with open('afiltertest.txt') as rss_file:
    textrss = rss_file.read()
with open('harrypotterslash.txt') as slash_file:
    textslash = slash_file.read()

# Find every capitalised word that directly follows a lowercase letter and one
# whitespace character (i.e. capitalised words that do not start a sentence).
names = re.compile(r"[a-z]\s([A-Z]\w+)")
pi = names.findall(textslash)

# De-duplicate while keeping first-seen order; the set makes each membership
# test O(1) instead of rescanning the list.
lista = []
seen = set()
for name in pi:
    if name not in seen:
        seen.add(name)
        lista.append(name)

variavel = "|".join(lista)

if lista:
    # \b on both sides: only whole words are matched, so a short name listed
    # first can no longer replace just the prefix of a longer word.
    ze = r"\b(" + variavel + r")\b"
else:
    # With no names the group would be "()", which matches the empty string
    # at every position; (?!) is a pattern that never matches.
    ze = r"(?!)"

nomes = re.compile(ze)

# One pattern per office title; each captures the title followed by the next
# two whitespace-separated words (e.g. a first name and a surname).
# The previous "\w+\w+" had no whitespace between the two word groups, so it
# could only ever capture a single word after the title.
office1 = re.compile(r"(president\s\w+\s\w+)")
office2 = re.compile(r"(minister\s\w+\s\w+)")
office3 = re.compile(r"(chancellor\s\w+\s\w+)")
repls = office1.findall(textrss) + office2.findall(textrss) + office3.findall(textrss)

def r(m):
    """Replacement callback for re.sub: return a random entry of `repls`.

    `m` is the match object for the name being replaced. When `repls` is
    empty (no office holder was found in the feed text) the matched text is
    returned unchanged instead of letting random.choice raise IndexError.
    """
    if not repls:
        return m.group(0)
    return random.choice(repls)

# Replace every matched name with a random office holder, then drop the
# double quotes.
yes = nomes.sub(r, textslash)
lala = yes.replace("\"", "")

# Both halves are stripped, so they need an explicit separator; without it
# the last word of one text and the first word of the other were fused.
both = lala.strip() + " " + textrss.strip()

puff = convert(both)

# Short title: 5 generated tokens, framed with tildes, full stops removed.
title = lidia_generate(puff, 5)
title = "~ " + str(title) + " ~"
title = title.replace(".", "")

# Body text: 500 generated tokens, closed with a full stop.
herpDerp = lidia_generate(puff, 500)
herpDerp = str(herpDerp) + "."


print "Content-Type: text/html"
print 
print """
    <!DOCTYPE html>
<html>
<head>
    <link href='http://fonts.googleapis.com/css?family=Alegreya+SC' rel='stylesheet' type='text/css'>
    <link href='http://fonts.googleapis.com/css?family=Oleo+Script+Swash+Caps' rel='stylesheet' type='text/css'>
    <style>
        h1 {
        color:#078;
        font-family: 'Oleo Script Swash Caps', sans-serif;
        font-size:44px;
        margin:10% 0 0 5%;}
        p {margin:9% 30% 0 5%;
        font-family: 'Alegreya SC', serif;
        font-size:27px;
        text-indent:2%;
        text-align:justify;
        punctuation-trim:[start];}
        body {background-color:#CB9;}
    </style>
</head>
<body>
    <h1>""" + titulo + """</h1>
    <p>""" + herpDerp + """</p1>
</body>
</html>"""