User:Laurier Rochon/prototyping/slinkybookreader: Difference between revisions
No edit summary |
No edit summary |
||
Line 31: | Line 31: | ||
* nodes and links : I wanted to write links dynamically as the nodes popped up - but the links are required 2 points to draw from and to. So how do you draw a line to a node that doesn't exist yet? I had make a little change (that took me a while to figure out) to report that drawing to later, after the "target" node is drawn. | * nodes and links : I wanted to write links dynamically as the nodes popped up - but the links are required 2 points to draw from and to. So how do you draw a line to a node that doesn't exist yet? I had make a little change (that took me a while to figure out) to report that drawing to later, after the "target" node is drawn. | ||
* related to previous problem - my data structure reflected links, but I was considering it as "words" of the markov chain. Things didn't add up logically until I wrapped my head around the fact that every json dictionary describes a link, not a node. | * related to previous problem - my data structure reflected links, but I was considering it as "words" of the markov chain. Things didn't add up logically until I wrapped my head around the fact that every json dictionary describes a link, not a node. | ||
== Soft == | |||
Python shittez | |||
<source lang="python"> | |||
#!/usr/bin/python2.6 | |||
#-*- coding:utf-8 -*- | |||
print "Content-Type: text/html" | |||
print | |||
#command-line/query arg/param : http://www.gutenberg.org/cache/epub/14838/pg14838.txt | |||
import urllib2 | |||
import cgi | |||
import re | |||
import sys | |||
import simplejson | |||
get = cgi.FieldStorage() | |||
url = get.getlist("url") | |||
limit = get.getlist("limit") | |||
class Work: | |||
#instantiate | |||
def __init__(self): | |||
try: | |||
u = url[0] | |||
except: | |||
u = "http://www.gutenberg.org/cache/epub/14838/pg14838.txt" | |||
try: | |||
#open up the url given | |||
txt = urllib2.urlopen(u).read().strip('') | |||
#cleaning | |||
s = r'\*\*\* .* OF THIS PROJECT GUTENBERG EBOOK .* \*\*\*' | |||
r = re.compile(s) | |||
usable = re.split(r,txt) | |||
story = usable[1] | |||
story = re.sub("\[Illustration\]","",story) | |||
story = re.sub(",|\.|;|:|'|\!|\?","",story) | |||
self.story = story | |||
self.unique_words = self.unique_words() | |||
except: | |||
#error | |||
print "could not load url" | |||
sys.exit() | |||
#print to sdout | |||
def readme(self): | |||
print self.story | |||
#return unique words | |||
def unique_words(self): | |||
unique_words = list(set(self.story.lower().split())) | |||
return unique_words | |||
#return a list of tuples with all words | |||
def markovize(self): | |||
pairs = [] | |||
words = self.story.lower().split() | |||
#limit... | |||
words = words[0:int(limit[0])] | |||
a = 0 | |||
for w in words: | |||
if a<len(words)-1: | |||
pairs.append([self.unique_words.index(w),self.unique_words.index(words[a+1]),w]); | |||
a=a+1 | |||
#this does it a bit differently...unique words first | |||
#for w in self.unique_words: | |||
# matches = [i for i,val in enumerate(words) if val.lower()==w.lower()] | |||
# for match in matches: | |||
# pairs.append([a,match,words[match]]) | |||
# a=a+1 | |||
jsonfile = open("../../slinkfiles/words","w") | |||
simplejson.dump(pairs, jsonfile) | |||
work = Work() | |||
work.markovize() | |||
print '''<!DOCTYPE html> | |||
<html> | |||
<head> | |||
<title>Slink-A-Dink SVG</title> | |||
<script type="text/javascript" src="../../slinkfiles/d3_all.js"></script> | |||
<script type="text/javascript" src="../../slinkfiles/jquery-1.6.4min.js"></script> | |||
<link rel="stylesheet" href="../../slinkfiles/main.css" media="screen" /> | |||
</head> | |||
<body> | |||
<form type="get"> | |||
<div id="txt">(you can click and drag stuff) | |||
<select name="limit"> | |||
''' | |||
for b in range(2,201): | |||
print '<option value="%d">%d</option>' % (b,b) | |||
print '''</select> | |||
<input type="submit" value="go"> | |||
</div> | |||
</form> | |||
<div id="chart"></div> | |||
<div id="low" class="low">Default text, if "url" GET param not specified : <a href="http://www.gutenberg.org/cache/epub/14838/pg14838.txt" target="_blank">http://www.gutenberg.org/cache/epub/14838/pg14838.txt</a></div> | |||
<script type="text/javascript" src="../../slinkfiles/main.js"></script> | |||
</body> | |||
</html> | |||
''' | |||
</source> | |||
Main js shittez | |||
<source lang="javascript"> | |||
var w = 960, | |||
h = 500, | |||
fill = d3.scale.category20(), | |||
nodes = [], | |||
links = []; | |||
var vis = d3.select("#chart").append("svg:svg") | |||
.attr("width", w) | |||
.attr("height", h); | |||
vis.append("svg:rect") | |||
.attr("width", w) | |||
.attr("height", h); | |||
var force = d3.layout.force() | |||
.distance(30) | |||
.nodes(nodes) | |||
.links(links) | |||
.size([w, h]); | |||
force.on("tick", function() { | |||
vis.selectAll("line.link") | |||
.attr("x1", function(d) { return d.source.x; }) | |||
.attr("y1", function(d) { return d.source.y; }) | |||
.attr("x2", function(d) { return d.target.x; }) | |||
.attr("y2", function(d) { return d.target.y; }); | |||
vis.selectAll("circle.node") | |||
.attr("cx", function(d) { return d.x; }) | |||
.attr("cy", function(d) { return d.y; }); | |||
vis.selectAll("text.label") | |||
.attr("x", function(d) { return d.x+5; }) | |||
.attr("y", function(d) { return d.y+5; }); | |||
}); | |||
//declare shittez | |||
forlater = false; | |||
a = 0; | |||
dcounter=0; | |||
data = []; | |||
function deliver(data){ | |||
if(dcounter<data.length){ | |||
var f = function(){deliver(data)} | |||
setTimeout(f,200); | |||
anew(data[dcounter][2],data[dcounter][0],data[dcounter][1]); | |||
dcounter++; | |||
} | |||
} | |||
function findmatches(no,myid){ | |||
m_list = [] | |||
for(q=0;q<nodes.length;q++){ | |||
if(nodes[q].t==no && q!=myid){ | |||
m_list.push(q) | |||
} | |||
} | |||
return m_list; | |||
} | |||
jQuery.getJSON("words",function(data){ | |||
deliver(data); | |||
}); | |||
function node_exists(t){ | |||
found = false | |||
for(c=0;c<nodes.length;c++){ | |||
if(nodes[c].sou==t){ | |||
found = c; | |||
} | |||
} | |||
return found; | |||
} | |||
function anew(txt,sou,tar){ | |||
exists = node_exists(sou); | |||
if(!exists){ | |||
node = {x: w/2, y: h/2, sou:sou, tar:tar}; | |||
n = nodes.push(node) | |||
vis.append("svg:text") | |||
.attr("x",nodes[a]['x']) | |||
.attr("y",nodes[a]['y']) | |||
.attr("class", "label") | |||
.style("fill", "black") | |||
.style("font-size", "10px") | |||
.style("font-family", "Arial") | |||
.text(function(d) { return txt; }); | |||
if(a>0){ | |||
links.push({source: nodes[a], target: nodes[a-1]}) | |||
} | |||
if(forlater){ | |||
links.push({source: nodes[forlater], target: nodes[a]}) | |||
links.push({source: nodes[forlater], target: nodes[a-1]}) | |||
forlater = false; | |||
} | |||
a++; | |||
}else{ | |||
//stupid shit...can't create a link to a node that doesn't exist can we? | |||
//why do something now when we can do it later huh | |||
forlater = exists; | |||
} | |||
restart(); | |||
} | |||
restart(); | |||
function restart() { | |||
force.start(); | |||
vis.selectAll("line.link") | |||
.data(links) | |||
.enter().insert("svg:line", "circle.node") | |||
.attr("class", "link") | |||
.attr("x1", function(d) { return d.source.x; }) | |||
.attr("y1", function(d) { return d.source.y; }) | |||
.attr("x2", function(d) { return d.target.x; }) | |||
.attr("y2", function(d) { return d.target.y; }); | |||
vis.selectAll("circle.node") | |||
.data(nodes) | |||
.enter().insert("svg:circle", "circle.cursor") | |||
.attr("class", "node") | |||
.attr("cx", function(d) { return d.x; }) | |||
.attr("cy", function(d) { return d.y; }) | |||
.attr("r", 3.5) | |||
.call(force.drag); | |||
//er...wtf? | |||
vis.selectAll("text.label") | |||
.data(nodes) | |||
.enter().insert("svg:text", "text.label") | |||
.attr("class", "label") | |||
.attr("x", function(d) { return d.x; }) | |||
.attr("y", function(d) { return d.y; }) | |||
.call(force.drag); | |||
} | |||
</source> |
Revision as of 19:54, 26 October 2011
CGI/Python + D3 js library + markov chains = Slinky Text Reader
- Check it out here
- You can change the number of nodes (words) displayed by changing the "limit" param
- You can change the URL to open/parse by passing another page in the "url" param (preferably from the Gutenberg collection...there is some cleaning code specific to those files)
What it does
So I was wandering rather aimlessly in the Gutenberg database, and couldn't find much that was compelling to me. So I decided to just make a simple "work" parser that would pick apart the different pieces of a text and maybe rearrange them. I created a small class that made a "work" object, on which you could call the markovize() method, which effectively does what it says. I thought then, for fun, that I could try to replicate the graphs I built last year, but in animation. The idea was that the system would "read" a book out to you, and when a non-unique word occurred, the story would "split" into many paths, so that you could follow the one you preferred. Erm...I can't say the graph reads in that way, but that was the objective. As new nodes are pumped out, the rest move away and create a certain narrative that one can follow on the screen. D3 for js seemed a good fit for this.
What happens, in order
- checks the url param - uses urllib2 to grab it
- parses the file, creates a markov chain by gathering all the unique words of the text, then pairing every word with an index from the unique words list
- write the whole thing in JSON format using simplejson (very nice JSON lib...). The data structure is a list of [source, target, word] triples, e.g. [1, 2, "bananas"]
- if all of the above occurs smoothly, use JS and jQuery to load up the JSON file
- start writing the SVG nodes (circles), whenever you find a node that already exists, create another link (SVG line) instead of another node
A few issues encountered along the way
- D3 documentation sucks. I had worked with it before though
- nodes and links : I wanted to write links dynamically as the nodes popped up - but a link requires 2 points to draw from and to. So how do you draw a line to a node that doesn't exist yet? I had to make a little change (that took me a while to figure out) to defer that drawing until later, after the "target" node is drawn.
- related to previous problem - my data structure reflected links, but I was considering it as "words" of the markov chain. Things didn't add up logically until I wrapped my head around the fact that every json dictionary describes a link, not a node.
Soft
Python shittez
#!/usr/bin/python2.6
#-*- coding:utf-8 -*-
print "Content-Type: text/html"
print
#command-line/query arg/param : http://www.gutenberg.org/cache/epub/14838/pg14838.txt
import urllib2
import cgi
import re
import sys
import simplejson
# Parse the CGI request; getlist() returns a (possibly empty) list of values for a field.
get = cgi.FieldStorage()
# Values of the "url" field; Work.__init__ uses the first one as the text to fetch.
url = get.getlist("url")
# Values of the "limit" field; Work.markovize uses the first one as the number of words to keep.
limit = get.getlist("limit")
class Work:
    """A text fetched from a URL, cleaned up and turned into word-to-word links.

    Reads the module-level ``url`` and ``limit`` lists (query-string values).
    Construction fetches and cleans the text; on any failure it prints
    "could not load url" and exits the script.
    """

    # Text used when no "url" value was supplied.
    DEFAULT_URL = "http://www.gutenberg.org/cache/epub/14838/pg14838.txt"
    # Number of words used when "limit" is missing or is not an integer.
    DEFAULT_LIMIT = 50

    #instantiate
    def __init__(self):
        if url:
            u = url[0]
        else:
            u = self.DEFAULT_URL
        try:
            # The address comes straight from the query string and urllib2
            # also opens file:// and ftp:// URLs, so only accept web URLs.
            if not u.lower().startswith(("http://", "https://")):
                raise ValueError("unsupported url scheme")
            #open up the url given
            txt = urllib2.urlopen(u).read()
            #cleaning: keep what sits between the "*** START ..." and
            #"*** END ..." Project Gutenberg marker lines
            marker = re.compile(r'\*\*\* .* OF THIS PROJECT GUTENBERG EBOOK .* \*\*\*')
            usable = marker.split(txt)
            story = usable[1]
            story = re.sub(r"\[Illustration\]", "", story)
            # drop punctuation so "word," and "word" count as the same word
            story = re.sub(r"[,.;:'!?]", "", story)
            self.story = story
            # From here on the instance attribute (a list) shadows the method
            # of the same name; markovize() relies on the list.
            self.unique_words = self.unique_words()
        except Exception:
            #error (network failure, bad scheme, or markers not found)
            print("could not load url")
            sys.exit()

    #print to stdout
    def readme(self):
        """Print the cleaned story text."""
        print(self.story)

    #return unique words
    def unique_words(self):
        """Return the distinct lower-cased words of the story, in arbitrary order."""
        return list(set(self.story.lower().split()))

    #write and return a list of [source, target, word] entries
    def markovize(self):
        """Write the word links of the first ``limit`` words as JSON.

        Each entry is ``[index of word, index of following word, word]``,
        where the indices point into ``self.unique_words``. The list is
        dumped to ``../../slinkfiles/words`` and also returned.
        """
        try:
            count = int(limit[0])
        except (IndexError, ValueError):
            # no usable "limit" in the request
            count = self.DEFAULT_LIMIT
        words = self.story.lower().split()[0:count]
        # one dictionary lookup per word instead of a list scan
        index_of = dict((word, i) for i, word in enumerate(self.unique_words))
        pairs = []
        # the last word has no follower, so it never starts a link
        for pos in range(len(words) - 1):
            w = words[pos]
            pairs.append([index_of[w], index_of[words[pos + 1]], w])
        with open("../../slinkfiles/words", "w") as jsonfile:
            simplejson.dump(pairs, jsonfile)
        return pairs
# Fetch the text and write the JSON link file before emitting the page,
# so the data is in place when the page's script asks for it.
work = Work()
work.markovize()

# Page head and the start of the "limit" selector.
# The form uses method="get" ("type" is not a form attribute).
print('''<!DOCTYPE html>
<html>
<head>
<title>Slink-A-Dink SVG</title>
<script type="text/javascript" src="../../slinkfiles/d3_all.js"></script>
<script type="text/javascript" src="../../slinkfiles/jquery-1.6.4min.js"></script>
<link rel="stylesheet" href="../../slinkfiles/main.css" media="screen" />
</head>
<body>
<form method="get">
<div id="txt">(you can click and drag stuff)
<select name="limit">
''')
# One option per selectable word count (2 to 200 inclusive).
for b in range(2,201):
    print('<option value="%d">%d</option>' % (b,b))
# Rest of the page: submit button, chart container and the graph script.
print('''</select>
<input type="submit" value="go">
</div>
</form>
<div id="chart"></div>
<div id="low" class="low">Default text, if "url" GET param not specified : <a href="http://www.gutenberg.org/cache/epub/14838/pg14838.txt" target="_blank">http://www.gutenberg.org/cache/epub/14838/pg14838.txt</a></div>
<script type="text/javascript" src="../../slinkfiles/main.js"></script>
</body>
</html>
''')
Main js shittez
// Drawing area size, a colour scale (not used by the code shown here),
// and the node/link arrays handed to the force layout.
var w = 960;
var h = 500;
var fill = d3.scale.category20();
var nodes = [];
var links = [];

// Root <svg> inside #chart, plus a rectangle covering the whole area.
var vis = d3.select("#chart")
  .append("svg:svg")
  .attr("width", w)
  .attr("height", h);

vis.append("svg:rect")
  .attr("width", w)
  .attr("height", h);

// Force layout working on the two arrays declared above.
var force = d3.layout.force()
  .distance(30)
  .nodes(nodes)
  .links(links)
  .size([w, h]);

// On every tick, copy the current coordinates onto the lines, circles and labels.
force.on("tick", function onTick() {
  var lineSel = vis.selectAll("line.link");
  lineSel
    .attr("x1", function(d) { return d.source.x; })
    .attr("y1", function(d) { return d.source.y; })
    .attr("x2", function(d) { return d.target.x; })
    .attr("y2", function(d) { return d.target.y; });

  var circleSel = vis.selectAll("circle.node");
  circleSel
    .attr("cx", function(d) { return d.x; })
    .attr("cy", function(d) { return d.y; });

  // Labels sit 5px right of and below their node.
  var labelSel = vis.selectAll("text.label");
  labelSel
    .attr("x", function(d) { return d.x+5; })
    .attr("y", function(d) { return d.y+5; });
});
// Shared state, declared explicitly so the script does not rely on implicit globals.
// forlater: false, or the position in `nodes` of an already-drawn word that
// still has to be linked to the next newly created node (see anew()).
var forlater = false;
// a: number of nodes created so far, i.e. the position the next node will get.
var a = 0;
// dcounter: position in the loaded data of the next entry deliver() will hand out.
var dcounter = 0;
// data: not read at this level; deliver() and the getJSON callback take their own `data` parameter.
var data = [];
/**
 * Hand the next entry of `data` to anew() and schedule the following one
 * 200 ms later. Stops once the global `dcounter` has reached the end.
 * @param {Array} data - entries shaped [source, target, word].
 */
function deliver(data){
  if (!(dcounter < data.length)) {
    return;
  }
  // The next round is queued before this entry is processed.
  setTimeout(function(){ deliver(data); }, 200);
  var entry = data[dcounter];
  anew(entry[2], entry[0], entry[1]);
  dcounter++;
}
/**
 * Collect the positions of all nodes whose `t` field equals `no`, skipping `myid`.
 * NOTE(review): nothing in the visible code calls this, and anew() creates
 * nodes with `sou`/`tar` fields but no `t` - confirm whether this is still needed.
 * @param {*} no - value compared (loosely) against each node's `t`.
 * @param {number} myid - position in `nodes` to leave out of the result.
 * @returns {number[]} matching positions, in ascending order.
 */
function findmatches(no,myid){
  // Locals instead of the leaked globals `m_list` and `q`.
  var matches = [];
  for (var i = 0; i < nodes.length; i++) {
    if (nodes[i].t == no && i != myid) {
      matches.push(i);
    }
  }
  return matches;
}
// Load the "words" JSON resource and, once it has arrived, start handing
// its entries to deliver() one at a time.
jQuery.getJSON("words",function(data){
deliver(data);
});
/**
 * Find the node that was created for a given source-word index.
 * @param {number} t - index compared against each node's `sou` field.
 * @returns {number|false} position in `nodes` of the last match, or `false`
 *   when there is none. Position 0 is a valid hit, so callers must compare
 *   the result with `=== false` rather than test its truthiness.
 */
function node_exists(t){
  for (var i = nodes.length - 1; i >= 0; i--) {
    if (nodes[i].sou === t) {
      return i;
    }
  }
  return false;
}

/**
 * Process one [source, target, word] entry.
 *
 * A word seen for the first time gets a node and a text label, and is linked
 * to the node created just before it. A word that already has a node creates
 * nothing now; its position is remembered in `forlater`, and the next new
 * node is linked back to it.
 *
 * @param {string} txt - the word, used as the label text.
 * @param {number} sou - index of the word in the unique-word list.
 * @param {number} tar - index of the word that follows it.
 */
function anew(txt,sou,tar){
  var existing = node_exists(sou);

  if (existing !== false) {
    // The node this link should lead to is not drawn yet, so postpone it.
    // (Explicit comparison: the very first node sits at position 0.)
    forlater = existing;
    restart();
    return;
  }

  var node = {x: w/2, y: h/2, sou: sou, tar: tar};
  nodes.push(node); // ends up at position `a`

  // Label for the new node; it carries class "label" so the tick handler
  // and restart() select it together with the other labels.
  vis.append("svg:text")
    .attr("x", node.x)
    .attr("y", node.y)
    .attr("class", "label")
    .style("fill", "black")
    .style("font-size", "10px")
    .style("font-family", "Arial")
    .text(function() { return txt; });

  if (a > 0) {
    // chain the new word to the previously created one
    links.push({source: node, target: nodes[a-1]});
  }
  if (forlater !== false) {
    // a repeated word came just before: tie it to the new node and to the
    // node created before that
    links.push({source: nodes[forlater], target: node});
    links.push({source: nodes[forlater], target: nodes[a-1]});
    forlater = false;
  }
  a++;
  restart();
}
// Run once at load; the function declaration below is hoisted.
restart();

/**
 * (Re)start the force layout and create SVG elements for any links and
 * nodes that do not have one yet.
 */
function restart() {
  force.start();

  // Lines for new links, inserted before the circles.
  var linkJoin = vis.selectAll("line.link").data(links);
  var newLines = linkJoin.enter().insert("svg:line", "circle.node");
  newLines
    .attr("class", "link")
    .attr("x1", function(d) { return d.source.x; })
    .attr("y1", function(d) { return d.source.y; })
    .attr("x2", function(d) { return d.target.x; })
    .attr("y2", function(d) { return d.target.y; });

  // Draggable circles for new nodes.
  var nodeJoin = vis.selectAll("circle.node").data(nodes);
  var newCircles = nodeJoin.enter().insert("svg:circle", "circle.cursor");
  newCircles
    .attr("class", "node")
    .attr("cx", function(d) { return d.x; })
    .attr("cy", function(d) { return d.y; })
    .attr("r", 3.5)
    .call(force.drag);

  // Join the labels with the node data as well. anew() already appends one
  // "label" text per node, so presumably this mostly binds data to those
  // elements rather than creating new ones - TODO confirm.
  var labelJoin = vis.selectAll("text.label").data(nodes);
  var newLabels = labelJoin.enter().insert("svg:text", "text.label");
  newLabels
    .attr("class", "label")
    .attr("x", function(d) { return d.x; })
    .attr("y", function(d) { return d.y; })
    .call(force.drag);
}