User:Manetta/i-could-have-written-that/twitter-truth
Twitter truth exercise with a KNN classifier, from Pattern's example folder
The following code generates a static HTML page that reloads itself every 10 seconds. Roughly every 20 seconds the script queries the Twitter API for up to 200 tweets (two result pages of 100) that are tagged with #fantastic or #horrible. Five test sentences are provided to see in which category they would fall. This is a pattern-recognition exercise with the k-nearest-neighbour (KNN) algorithm. The script is written in Python and based on an example script from Pattern: pattern-2.6/examples/vector/04-knn.py.
# Example from the README.md file in the main pattern-2.6 folder
import cgi
import time

from pattern.en import tag
from pattern.vector import KNN, count
from pattern.web import Twitter
# Twitter truth: repeatedly train a KNN classifier on the adjectives found in
# tweets tagged #fantastic or #horrible, then render the collected adjectives
# and the classification of a few test sentences into a static HTML page.
#
# This script targets Python 2 (Pattern 2.6). Every print call below takes a
# single argument, so the parenthesised form prints exactly what the bare
# print statement would.

OUTPUT_PATH = 'output.html'
SEARCH_QUERY = '#fantastic OR #horrible'
SEARCH_PAGES = (1, 2)       # result pages requested per round
TWEETS_PER_PAGE = 100
SLEEP_SECONDS = 20          # pause between two rounds

TEST_SENTENCES = [
    "the cloud is the future",
    "technology will bring a beautiful future",
    "the future will change thanks to innovation",
    "the future will be global",
    "the future will be ridiculous",
]

# The page asks the browser to reload it every 10 seconds (meta refresh).
HTML_OPEN = """
<!DOCTYPE html>
<html>
<head>
<meta http-equiv="refresh" content="10; URL=output.html">
<meta charset="utf-8">
<link rel="stylesheet" href="css/stylesheet.css" type="text/css" media="screen" />
<title>*TT*</title>
</head>
<body>
<div id="title">TWITTER TRUTH</div>
"""

HTML_CLOSE = """
</body>
</html>
"""

# The classifier is created once, outside the loop: it is never reset, so each
# round adds the freshly fetched tweets to everything trained before.
twitter, knn = Twitter(), KNN()


def _train_on_tweets(engine, classifier):
    """Fetch tagged tweets, train the classifier and collect adjective counts.

    engine     -- search engine object providing search(query, start, count)
    classifier -- classifier object providing train(vector, type)

    Returns a (fantastic, horrible) tuple of lists; each list holds one
    {adjective: count} dict per tweet of that class that had adjectives.
    """
    fantastic, horrible = [], []
    for page_number in SEARCH_PAGES:
        results = engine.search(SEARCH_QUERY, start=page_number,
                                count=TWEETS_PER_PAGE)
        for tweet in results:
            print(tweet)
            text = tweet.text.lower()
            # A tweet without the #fantastic tag is filed under 'horrible'.
            label = 'fantastic' if '#fantastic' in text else 'horrible'
            print(label)
            # word --> POS tag (NN/VB/NNS/DT/JJ/..); keep only adjectives (JJ)
            adjectives = [word for word, pos in tag(text) if pos == 'JJ']
            vector = count(adjectives)  # e.g. {'sweet': 1}
            print('annotated adjective for %s >>> %s' % (label, vector))
            print('')
            if not vector:
                continue  # nothing to learn from a tweet without adjectives
            classifier.train(vector, type=label)
            if label == 'fantastic':
                fantastic.append(vector)
            else:
                horrible.append(vector)
    return fantastic, horrible


def _write_training_section(page, element_id, label, vectors):
    """Write one class block (<div id=...>) listing every collected adjective.

    page       -- open, writable file object
    element_id -- HTML id of the block ('a' or 'b'), also used in the log line
    label      -- hashtag name shown in the heading, without the '#'
    vectors    -- list of {adjective: count} dicts belonging to this class
    """
    print('')
    print('***** class %s *****' % element_id.upper())
    page.write("""
<div id="%s" class="train">
<h1>#%s</h1>
""" % (element_id, label))
    for vector in vectors:
        for word in vector:
            print(word)
            # Words come straight from tweets: escape them before they are
            # embedded in markup, then encode for the byte-oriented file.
            safe_word = cgi.escape(word, quote=True).encode('utf-8')
            page.write('<div class="indicator">' + safe_word + '</div>')
    page.write('</div>')


def _write_test_section(page, classifier, sentences):
    """Classify each test sentence and write the sentence with its result.

    page       -- open, writable file object
    classifier -- classifier object providing classify(text)
    sentences  -- iterable of plain-text sentences to classify
    """
    page.write("<div class='test'>")
    for sentence in sentences:
        print(sentence)
        # If classify() is unable to classify a document, it returns the
        # baseline (by default, the most frequent class).
        label = classifier.classify(sentence)
        print(label)
        print('')
        if label is None:
            # NOTE(review): the baseline appears to be None while nothing has
            # been trained yet (e.g. an empty search result) -- confirm
            # against Pattern. Fall back to a placeholder class instead of
            # failing on the string formatting below.
            label = 'unknown'
        page.write("""
<div class="sentence %(label)s ">%(text)s</div>
<div class="result %(label)s ">#%(label)s</div>
""" % {'label': label, 'text': cgi.escape(sentence, quote=True)})
    page.write("</div>")


while True:
    fantastic_vectors, horrible_vectors = _train_on_tweets(twitter, knn)

    # The with-block guarantees the file is closed even if rendering fails.
    with open(OUTPUT_PATH, 'w') as page:
        page.write(HTML_OPEN)
        _write_training_section(page, 'a', 'fantastic', fantastic_vectors)
        _write_training_section(page, 'b', 'horrible', horrible_vectors)
        _write_test_section(page, knn, TEST_SENTENCES)
        page.write(HTML_CLOSE)

    print('')
    print(sorted(knn.features))
    time.sleep(SLEEP_SECONDS)