Twitter truth exercise with a KNN classifier, from Pattern's example folder

The following code generates a static HTML page that reloads itself every 10 seconds. In a loop, pausing 20 seconds between rounds, the script asks the Twitter API for up to 200 tweets (two pages of 100) that are tagged with #fantastic or #horrible. Five test sentences are provided to see in which category they would fall. This is a pattern-recognition exercise with the K-Nearest-Neighbour algorithm. The script is written in Python and based on an example script from Pattern: pattern-2.6/examples/vector/04-knn.py.

# Example from the README.md file in the main pattern-2.6 folder

import time
from xml.sax.saxutils import escape

from pattern.web    import Twitter
from pattern.en     import tag
from pattern.vector import KNN, count

twitter, knn = Twitter(), KNN()

while True: 

	a = []
	b = []

	for i in range(1, 3):
		for tweet in twitter.search('#fantastic OR #horrible', start=i, count=100):

			print tweet
			
			s = tweet.text.lower()

			# detecting if tweet is hashtagged 
			p = '#fantastic' in s and 'fantastic' or 'horrible'
			print p

			# word --> POS (NN/VB/NNS/DT/JJ/..) + selecting only adjectives
			v = tag(s)
			v = [word for word, pos in v if pos == 'JJ'] # JJ = adjective

			# adjectives in tweets are counted
			v = count(v) # {'sweet': 1}
			print 'annotated adjective for', p, '>>>', v
			print 

			# adjectives are grouped under either one of the hashtags
			if v:
				knn.train(v, type=p)
				if p == 'fantastic':
					a.append(v)
				if p == 'horrible':
					b.append(v)

	# print 'list of adjectives for A:',a
	# print
	# print 'list of adjectives for B:',b
	# print

	f = open('output.html','w')

	htmlOpen = """
	<!DOCTYPE html>
	<html>
	<head>
		<meta http-equiv="refresh" content="10; URL=output.html">
		<meta charset="utf-8">
		<link rel="stylesheet" href="css/stylesheet.css" type="text/css" media="screen" />
		<title>*TT*</title>
	</head>
	<body>
	<div id="title">TWITTER TRUTH</div>
	"""
	f.write(htmlOpen)


	# ---------------------------------------------------------
	# class A 

	print
	print "***** class A *****"
	aOpen = """
	<div id="a" class="train">
	<h1>#fantastic</h1>
	"""
	f.write(aOpen)

	for item in a: 
		for key in item.keys(): 
			print key
			key = """<div class="indicator">"""+key.encode('utf-8')+"""</div>"""
			f.write(key)

	aClose = """</div>"""
	f.write(aClose)


	# ---------------------------------------------------------
	# class B

	print
	print "***** class B *****"
	bOpen = """
	<div id="b" class="train">
	<h1>#horrible</h1>
	"""
	f.write(bOpen)

	for item in b: 
		for key in item.keys(): 
			print key
			key = """<div class="indicator">"""+key.encode('utf-8')+"""</div>"""
			f.write(key)

	bClose = """</div>"""
	f.write(bClose)


	# ---------------------------------------------------------
	# test sentences

	sentences = [
		"the cloud is the future", 
		"technology will bring a beautiful future", 
		"the future will change thanks to innovation",
		"the future will be global",
		"the future will be ridiculous"
	]

	testOpen = "<div class='test'>"
	f.write(testOpen)

	for sentence in sentences: 
		print sentence
		c = knn.classify(sentence) #  If KNN.classify() is unable to classify a document, it returns the baseline (by default, the most frequent class).
		print c
		print 
		test = """
			<div class="sentence """+c+""" ">"""+sentence+"""</div>
			<div class="result """+c+""" ">#"""+c+"""</div>
		"""
		f.write(test)

	testClose = "</div>"
	f.write(testClose)


	htmlClose = """
	</body>
	</html>
	"""
	f.write(htmlClose)

	f.close()

	print
	print sorted(knn.features)
	time.sleep(20)