User:Birgit Bachler/openevents/code
ah04.py:
#!/usr/local/bin/python
import html5lib, urllib2, urlparse
import os, glob
from os import system, sys
import subprocess
import time, random
# Card number currently being processed. The main loop further down assigns it
# for every line of cardnumbers.txt and resets it to "" when a card is done.
cardnumber=""
def runBash(cmd):
    """Run cmd through the shell and return its stripped standard output.

    cmd -- complete command line as one string. It is interpreted by the
           shell (shell=True), so it must not contain unquoted untrusted text.

    Only stdout is captured; stderr goes to the terminal and the exit status
    is ignored.
    """
    p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE)
    # communicate() drains the pipe and waits for the child to exit, so no
    # zombie process is left behind after each call.
    out, _ = p.communicate()
    return out.strip() #This is the stdout from the shell command
def absolutize (href, base):
    """Return href as an absolute URL.

    href -- link target as found in a page
    base -- URL of the page the link was found on

    A href that already starts with "http://" (case-insensitive) is returned
    untouched; anything else is resolved against base with urlparse.urljoin.
    """
    if href.lower().startswith("http://"):
        return href
    return urlparse.urljoin(base, href)
def openURL (url, data):
    """
    Open url and return (page, actualurl).

    url  -- address to fetch. If it contains a "?", everything after the
            first "?" is sent as the POST body and replaces the data argument.
    data -- POST body, or None / "" for a plain GET.

    page is the file-like response from urllib2.urlopen; the caller must
    close it. actualurl is page.geturl(), which may differ from url in the
    case of a redirect. A browser User-Agent header is set on the request.
    Errors raised by urllib2.urlopen (e.g. urllib2.URLError) propagate.
    """
    request = urllib2.Request(url)
    user_agent = "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.14) Gecko/20080418 Ubuntu/7.10 (gutsy) Firefox/2.0.0.14"
    request.add_header("User-Agent", user_agent)
    if "?" in url:
        # Split on the first "?" only: a query string may itself contain "?",
        # and unpacking an unlimited split() into two names raises ValueError.
        # Note that the request still targets the full url, query included.
        data = url.split("?", 1)[1]
    # An empty body means GET; urlopen() posts whenever data is not None.
    pagefile = urllib2.urlopen(request, data or None)
    realurl = pagefile.geturl()
    return (pagefile, realurl)
def downloadURL (url, foldername):
    """
    Download url into foldername, keeping the file name from the url path.

    url        -- address of the resource to fetch
    foldername -- existing directory the file is written into

    Returns None. A file of the same name is overwritten. A browser
    User-Agent header is set on the request. Errors from urllib2.urlopen
    (e.g. urllib2.URLError) and from opening or writing the local file
    propagate to the caller.
    """
    request = urllib2.Request(url)
    user_agent = "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.14) Gecko/20080418 Ubuntu/7.10 (gutsy) Firefox/2.0.0.14"
    request.add_header("User-Agent", user_agent)
    pagefile = urllib2.urlopen(request)
    try:
        data = pagefile.read()
    finally:
        pagefile.close()
    # The local name is the last component of the url path; the query string
    # is not part of element [2] and is therefore ignored.
    urlpath = urlparse.urlparse(url)[2]
    filename = os.path.join(foldername, os.path.split(urlpath)[1])
    out = open(filename, "wb")
    try:
        out.write(data)
    finally:
        # Close explicitly so the bytes are on disk before any other program
        # is pointed at the file.
        out.close()
def topProducts(limit=20, filename="alle.txt", out=None):
    """Write the most frequent words of a product list as HTML lines.

    limit    -- number of words to report (default 20)
    filename -- UTF-8 text file to count words in (default "alle.txt")
    out      -- object with a write() method that receives the result;
                defaults to the module-level html2 handle

    Words are split on whitespace, stripped of surrounding ASCII punctuation
    and lower-cased. They are ranked by descending count, ties broken
    alphabetically. Each entry is written as "word: count", a newline,
    "<br>" and another newline. Returns None.
    """
    import codecs
    from string import punctuation
    if out is None:
        out = html2
    words = {}
    # Decode as UTF-8, the encoding the file is written with; raw byte
    # strings with non-ASCII characters cannot be passed to a codecs writer.
    source = codecs.open(filename, "r", encoding="utf-8")
    try:
        for line in source:
            for word in line.split():
                word = word.strip(punctuation).lower()
                # A token made only of punctuation strips down to "" and is
                # not a word.
                if word:
                    words[word] = words.get(word, 0) + 1
    finally:
        source.close()
    top_words = sorted(words.items(),
                       key=lambda item: (-item[1], item[0]))[:limit]
    for word, frequency in top_words:
        out.write("%s: %d\n<br>\n" % (word, frequency))
print "*"*50
print "*"*50
print "New Bonuskaartfriend? Otherwise just press enter. Example:"
print "2601234567890 YourName"
newCard=raw_input()
print "*"*50
print "*"*50
if newCard != "":
# read the current contents of the file
f = open("cardnumbers.txt")
text = f.read()
f.close()
# open the file again for writing
f = open("cardnumbers.txt", "w")
f.write(str(newCard)+"\n")
# write the original contents
f.write(text)
f.close()
print "*"*50
print "START"
print "*"*50
txx = open("cardnumbers.txt", "r")
for line in txx:
print "*"*50
print "Busy with Bonuskaartfriend", line
print "*"*50
url = "http://www.ah.nl/previouslybought/PreviouslyBought.do"
line = line.rstrip("\n")
line2 = line[0:13]
print line, line2
cardnumber = line2
system("mkdir "+str(cardnumber)+"")
system("mkdir "+str(cardnumber)+"/output")
#print type(cardnumber)
#f, url2 = openURL(url, "cardNumber=2620481956139")
#cardnumber = "2620474055641"
f, url = openURL(url, "cardNumber="+cardnumber)
parser = html5lib.HTMLParser(tree=html5lib.treebuilders.getTreeBuilder("BeautifulSoup"))
firstPage = parser.parse(f)
# print type(firstPage)
f.close()
#fillet stands for pythonnumber of item pages
loadedURL = {}
import codecs
txt = codecs.open(str(cardnumber)+"/"+str(cardnumber)+".txt", "w", encoding="utf-8")
for a in firstPage.findAll("a", { "class" : "fillet" }):
pageurl = a["href"]
pageurl = absolutize(pageurl, url)
if not loadedURL.has_key(pageurl):
#LOAD URL...
print "PAGE ", pageurl
f2, pageurl = openURL(pageurl, None)
# print f2.read()
parser = html5lib.HTMLParser(tree=html5lib.treebuilders.getTreeBuilder("BeautifulSoup"))
page = parser.parse(f2)
for img in page.findAll("img"):
# print " ", img
# src = img.getAttribute("src")
src = img["src"]
if src:
src = absolutize(src, pageurl)
#print " ",src
time.sleep(random.random())
try:
downloadURL(src, ""+str(cardnumber)+"/output")
except urllib2.URLError, msg:
print "ncfiles: Urllib2 error (%s)" % msg
except socket.error, (errno, strerror):
print "ncfiles: Socket error (%s) for host %s (%s)" % (errno, host, strerror)
for h3 in page.findAll("h3"):
value = (h3.contents[0])
if value != "Afdeling":
print >> txt, value
import codecs
f = codecs.open("alle.txt", "r", encoding="utf-8")
text = f.read()
f.close()
# open the file again for writing
f = codecs.open("alle.txt", "w", encoding="utf-8")
f.write(value+"\n")
# write the original contents
f.write(text)
f.close()
loadedURL[pageurl] = True
f.close()
f2.close()
system("mkdir "+str(cardnumber)+"/products")
system("mv "+str(cardnumber)+"/output/*_80.jpg "+str(cardnumber)+"/products")
system("montage "+str(cardnumber)+"/products/*.jpg "+str(cardnumber)+"/"+str(cardnumber)+".jpg")
system("convert -delay 100 "+str(cardnumber)+"/products/*.jpg "+str(cardnumber)+"/"+str(cardnumber)+".gif")
system("rm -r "+str(cardnumber)+"/output")
system("rm -r "+str(cardnumber)+"/products")
#system("display "+str(cardnumber)+"/"+str(cardnumber)+".jpg &")
#TRY TO GRAB ALL EXISTING BONUSCARD-DIRECTORIES FOR COMPARISON-LOOP
path = '/home/merglind/Documents/OpenEvents/try04'
html = codecs.open(str(cardnumber)+"/index.html", "w", encoding="utf-8")
friends=[]
for infile in glob.glob( os.path.join(path, '26*') ):
(head, tail) = os.path.split(infile)
#print "current file is: " + tail
compareNumber = tail
if compareNumber != cardnumber:
cmd="./compare.sh "+str(cardnumber)+"/"+str(cardnumber)+".txt "+str(compareNumber)+"/"+str(compareNumber)+".txt"
value=runBash(cmd)
value=round(float(value),2)
# print value, "% Match with bonuscard number", compareNumber
friend = value, compareNumber
#print >> txt02, friend
if value > 10.0:
friends.append(friend)
#print friends
friends.sort(reverse=True)
# import pprint
# pprint.pprint(friends)
##alle.close()
#### TOP10 products
#################HTML STARTS HERE#########################
if line[14:]!="":
nickname=line[14:]
else:
nickname=cardnumber
print >> html, "<html><head><title>"+line[14:]+"'s Profile</title>"
print >> html, """<link rel="stylesheet" href="../style.css"></head>"""
print >> html, """
<body>
<table bgcolor="#00A0E2"width="100%" height="40px "border="0">
<tr>
<td width="25%"><img src="../ah_friends_t.gif"></td>
<td><h1>
"""
print >> html, ""+cardnumber+"'s Profile</></h1></td>"
print >> html, """
</tr>
</table>
<div class="outer"><div class="inner">
"""
print >> html, "<p><h2>This is "+nickname+"</h2>"
print >> html, "<div><a href='../"+cardnumber+"/"+cardnumber+".jpg'><img src='../"+cardnumber+"/"+cardnumber+".gif'></a></div>"
print >> html, "<h3>"+nickname+"'s BEST BONUS-FRIENDS:</h3>"
print >> html, "<ul>"
for friend in friends:
card=friend[1]
score="%0.2f" % friend[0]
if score > 10.:
print >> html, "<li><a href='../"+card+"/index.html'>"+card+"</a> "+score+"% Match</li>"
else:
print >> html, " "
print >> html, "</ul>"
print >> html, "<h3><a href='../"+cardnumber+"/"+cardnumber+".txt'>"+nickname+"'s Shopping List</a></h3>"
#print >> html, "<div><img src='../"+cardnumber+"/"+cardnumber+".jpg'></div>"
print >> html, "</div></div></body></html>"
f2.close()
txt.close()
cardnumber=""
#system("sys.stdout.flush()")
print "*"*50
print "END"
print "*"*50
# Statistics page: index.html with the most frequent words across all
# shopping lists. topProducts() writes its entries to the module-level html2.
import codecs
html2 = codecs.open("index.html", "w", encoding="utf-8")
try:
    print >> html2, "<html><head><title>Statistics</title>"
    print >> html2, """<link rel="stylesheet" href="style.css"></head>"""
    print >> html2, """
<body>
<table bgcolor="#00A0E2"width="100%" height="40px "border="0">
<tr>
<td width="25%"><img src="ah_friends_t.gif"></td>
<td><h1>
"""
    print >> html2, "Top 20</h1></td>"
    print >> html2, """
</tr>
</table>
<div class="outer"><div class="inner">
"""
    print >> html2, "<p><h2>Today's Top 20:</h2>"
    # Without any recorded products there is nothing to rank; leave the
    # list empty instead of failing on the missing file.
    if os.path.exists("alle.txt"):
        topProducts()
    # Close the containers opened above, as the profile pages do.
    print >> html2, "</div></div></body></html>"
finally:
    html2.close()
compare.sh:
#!/bin/bash
# Print how much of shopping list LIST1 also occurs in LIST2, as a percentage.
#
# usage: compare.sh LIST1 LIST2
#
# score = (distinct lines present in both files) * 100 / (line count of LIST1,
# duplicates included), printed by bc as a decimal number. Prints 0 when LIST1
# is empty or unreadable.
if [ "$#" -ne 2 ]; then
    echo "usage: $0 LIST1 LIST2" >&2
    exit 1
fi

# Byte-wise collation so that sort and comm agree on ordering and equality.
export LC_ALL=C

# Redirect instead of naming the file so wc prints the bare number.
TOTAL=$(wc -l < "$1")
#echo $TOTAL
# Guard the division below: bc prints nothing usable on a division by zero.
if [ -z "$TOTAL" ] || [ "$TOTAL" -eq 0 ]; then
    echo "0"
    exit 0
fi

# comm -12 keeps only the lines common to both sorted, de-duplicated lists.
# Process substitution avoids fixed-name temp files that concurrent runs
# would overwrite.
MATCHES=$(comm -12 <(sort -u "$1") <(sort -u "$2") | wc -l)
BFF=$(echo "$MATCHES*100/$TOTAL" | bc -l)
echo "$BFF"