User:Birgit Bachler/openevents/code

From XPUB & Lens-Based wiki

ah04.py:

 #!/usr/local/bin/python
import html5lib, urllib2, urlparse
import os, glob
from os import system, sys
import subprocess
import time, random

cardnumber=""



def runBash(cmd):
    """Run ``cmd`` through the shell and return its stripped standard output.

    cmd -- shell command line; it is executed with ``shell=True``, so it must
           not be assembled from untrusted text.

    Returns everything the command wrote to stdout with leading and trailing
    whitespace removed.  stderr is not captured and the exit status is
    ignored.
    """
    p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE)
    # communicate() drains the pipe, closes it and waits for the child, so the
    # process is reaped instead of being left behind as a zombie.
    out = p.communicate()[0]
    return out.strip()

def absolutize(href, base):
    """Return ``href`` as an absolute URL.

    href -- link as found in a page; may be relative.
    base -- URL of the page the link was found on.

    An ``href`` that already starts with "http://" (in any letter case) is
    returned untouched; anything else is resolved against ``base``.
    """
    already_absolute = href.lower().startswith("http://")
    if already_absolute:
        return href
    return urlparse.urljoin(base, href)

def openURL(url, data):
    """Open ``url`` with a browser-like User-Agent and return the response.

    url  -- address to fetch.  When it contains a "?", everything after the
            first "?" replaces ``data`` and is sent as the request body; the
            request itself still targets the full ``url``.
    data -- body to send with the request (e.g. "cardNumber=..."), or
            None / "" to send none.

    Returns (pagefile, realurl): the open response object, which the caller
    must close, and the URL it was finally served from.  realurl may differ
    from ``url`` in the case of a redirect.  Errors raised by
    urllib2.urlopen propagate to the caller.
    """
    request = urllib2.Request(url)
    user_agent = "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.14) Gecko/20080418 Ubuntu/7.10 (gutsy) Firefox/2.0.0.14"
    request.add_header("User-Agent", user_agent)

    if "?" in url:
        # Split on the first "?" only: a query string may itself contain "?",
        # and an unbounded split would then yield more than two parts.
        data = url.split("?", 1)[1]

    if data:
        pagefile = urllib2.urlopen(request, data)
    else:
        pagefile = urllib2.urlopen(request)

    realurl = pagefile.geturl()
    return (pagefile, realurl)

def downloadURL(url, foldername):
    """Download ``url`` and save it inside ``foldername``.

    url        -- address of the file to fetch; a browser-like User-Agent is
                  sent with the request.
    foldername -- existing directory to write into.

    The file is stored under the last component of the URL's path.  A URL
    whose path has no last component (for example one ending in "/") is
    skipped, because there is no file name to save it under.

    Returns None.  Errors from urllib2.urlopen and from writing the file
    propagate to the caller.
    """
    urlpath = urlparse.urlparse(url)[2]
    basename = os.path.basename(urlpath)
    if not basename:
        return
    filename = os.path.join(foldername, basename)

    request = urllib2.Request(url)
    user_agent = "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.14) Gecko/20080418 Ubuntu/7.10 (gutsy) Firefox/2.0.0.14"
    request.add_header("User-Agent", user_agent)
    pagefile = urllib2.urlopen(request)
    try:
        data = pagefile.read()
    finally:
        # Always release the connection, even if reading fails.
        pagefile.close()

    out = open(filename, "wb")
    try:
        out.write(data)
    finally:
        # Close explicitly so the bytes are on disk before callers use them.
        out.close()


def topProducts():
    """Write the 20 most frequent words of alle.txt to the statistics page.

    Reads "alle.txt" from the current directory, splits every line on
    whitespace, strips surrounding punctuation, lower-cases each word and
    counts it.  The N most frequent words (ties broken alphabetically) are
    written to the module-level ``html2`` writer, one "word: count" entry
    per line followed by "<br>".

    Takes no arguments and returns None.  Raises IOError when alle.txt
    cannot be opened.
    """
    import codecs
    from string import punctuation

    N = 20
    words = {}

    # alle.txt is written as UTF-8 through codecs by this script, and html2
    # is a UTF-8 codecs writer.  Decoding here keeps everything unicode, so
    # non-ASCII product names no longer raise UnicodeDecodeError on output.
    source = codecs.open("alle.txt", "r", encoding="utf-8")
    try:
        for line in source:
            for token in line.split():
                word = token.strip(punctuation).lower()
                # A token made only of punctuation strips down to "", which
                # is not a product word and must not show up in the ranking.
                if word:
                    words[word] = words.get(word, 0) + 1
    finally:
        source.close()

    # Highest count first; equal counts fall back to alphabetical order.
    top_words = sorted(words.iteritems(),
                       key=lambda item: (-item[1], item[0]))[:N]

    for word, frequency in top_words:
        alle = "%s: %d" % (word, frequency)+"\n"
        print >> html2, alle+"<br>"

print "*"*50
print "*"*50
print "New Bonuskaartfriend? Otherwise just press enter. Example:"
print "2601234567890 YourName"
newCard=raw_input()
print "*"*50
print "*"*50
if newCard != "":
    # read the current contents of the file
    f = open("cardnumbers.txt")
    text = f.read()
    f.close()
    # open the file again for writing
    f = open("cardnumbers.txt", "w")
    f.write(str(newCard)+"\n")
    # write the original contents
    f.write(text)
    f.close()

print "*"*50
print "START"
print "*"*50


txx = open("cardnumbers.txt", "r")
for line in txx:
    print "*"*50
    print "Busy with Bonuskaartfriend", line
    print "*"*50
    url = "http://www.ah.nl/previouslybought/PreviouslyBought.do"

    line = line.rstrip("\n")
    line2 = line[0:13]
    print line, line2
    cardnumber = line2
    system("mkdir "+str(cardnumber)+"")
    system("mkdir "+str(cardnumber)+"/output")
    #print type(cardnumber)

    #f, url2 = openURL(url, "cardNumber=2620481956139")
    #cardnumber = "2620474055641"



    f, url = openURL(url, "cardNumber="+cardnumber)
    parser = html5lib.HTMLParser(tree=html5lib.treebuilders.getTreeBuilder("BeautifulSoup"))
    firstPage = parser.parse(f)
#    print type(firstPage)
    f.close()


    #fillet stands for pythonnumber of item pages
    loadedURL = {}
    import codecs
    txt = codecs.open(str(cardnumber)+"/"+str(cardnumber)+".txt", "w", encoding="utf-8")

    for a in firstPage.findAll("a", { "class" : "fillet" }):
        pageurl = a["href"]
        pageurl = absolutize(pageurl, url)
        if not loadedURL.has_key(pageurl):
            #LOAD URL...
            print "PAGE ", pageurl
            f2, pageurl = openURL(pageurl, None)
            # print f2.read()


            parser = html5lib.HTMLParser(tree=html5lib.treebuilders.getTreeBuilder("BeautifulSoup"))
            page = parser.parse(f2)
            for img in page.findAll("img"):
                # print "    ", img
    #            src = img.getAttribute("src")
                src = img["src"]
                if src:
                    src = absolutize(src, pageurl)
                    #print "   ",src
                    time.sleep(random.random()) 
                    try:
                        downloadURL(src, ""+str(cardnumber)+"/output")
                    except urllib2.URLError, msg:
		                print "ncfiles: Urllib2 error (%s)" % msg
                    except socket.error, (errno, strerror):
	                    print "ncfiles: Socket error (%s) for host %s (%s)" % (errno, host, strerror)
                 
            for h3 in page.findAll("h3"):
                value = (h3.contents[0])
                if value != "Afdeling":
                    print >> txt, value
                    import codecs
                    f = codecs.open("alle.txt", "r", encoding="utf-8")      
                    text = f.read()
                    f.close()
                    # open the file again for writing
                    f = codecs.open("alle.txt", "w", encoding="utf-8")
                    f.write(value+"\n")
                    # write the original contents
                    f.write(text)
                    f.close()

            loadedURL[pageurl] = True
        f.close()
        f2.close()

    system("mkdir "+str(cardnumber)+"/products")
    system("mv "+str(cardnumber)+"/output/*_80.jpg "+str(cardnumber)+"/products")
    system("montage "+str(cardnumber)+"/products/*.jpg "+str(cardnumber)+"/"+str(cardnumber)+".jpg")
    system("convert -delay 100 "+str(cardnumber)+"/products/*.jpg "+str(cardnumber)+"/"+str(cardnumber)+".gif")
    system("rm -r "+str(cardnumber)+"/output")
    system("rm -r "+str(cardnumber)+"/products")
    #system("display "+str(cardnumber)+"/"+str(cardnumber)+".jpg &")


    #TRY TO GRAB ALL EXISTING BONUSCARD-DIRECTORIES FOR COMPARISON-LOOP

    path = '/home/merglind/Documents/OpenEvents/try04'
    html = codecs.open(str(cardnumber)+"/index.html", "w", encoding="utf-8")
    friends=[]
    for infile in glob.glob( os.path.join(path, '26*') ):
        (head, tail) = os.path.split(infile)
        #print "current file is: " + tail
        compareNumber = tail
        if compareNumber != cardnumber:
            cmd="./compare.sh "+str(cardnumber)+"/"+str(cardnumber)+".txt  "+str(compareNumber)+"/"+str(compareNumber)+".txt"
            value=runBash(cmd)
            value=round(float(value),2)
#            print value, "% Match with bonuscard number", compareNumber
            friend = value, compareNumber
            #print >> txt02, friend
            if value > 10.0:
                friends.append(friend)
    #print friends
    friends.sort(reverse=True)
#    import pprint
#    pprint.pprint(friends)

##alle.close()
#### TOP10 products







#################HTML STARTS HERE#########################
    if line[14:]!="":
        nickname=line[14:]
    else:
        nickname=cardnumber
    print >> html, "<html><head><title>"+line[14:]+"'s Profile</title>"
    print >> html, """<link rel="stylesheet" href="../style.css"></head>"""

    print >> html, """
<body>
<table bgcolor="#00A0E2"width="100%" height="40px "border="0">
  <tr>
    <td width="25%"><img src="../ah_friends_t.gif"></td>
    <td><h1>
"""
    print >> html, ""+cardnumber+"'s Profile</></h1></td>"
    print >> html, """
  </tr>
</table>
<div class="outer"><div class="inner">
"""
    print >> html, "<p><h2>This is "+nickname+"</h2>"

    print >> html, "<div><a href='../"+cardnumber+"/"+cardnumber+".jpg'><img src='../"+cardnumber+"/"+cardnumber+".gif'></a></div>"
    print >> html, "<h3>"+nickname+"'s BEST BONUS-FRIENDS:</h3>"
    print >> html, "<ul>"
    for friend in friends:
        card=friend[1]
        score="%0.2f" % friend[0]
        if score > 10.:
            print >>  html, "<li><a href='../"+card+"/index.html'>"+card+"</a> "+score+"% Match</li>"
        else: 
            print >> html, " "
    print >> html, "</ul>"
    print >> html, "<h3><a href='../"+cardnumber+"/"+cardnumber+".txt'>"+nickname+"'s Shopping List</a></h3>"
    #print >> html, "<div><img src='../"+cardnumber+"/"+cardnumber+".jpg'></div>"
    print >> html, "</div></div></body></html>"

    f2.close()
    txt.close() 
    cardnumber=""
    #system("sys.stdout.flush()")

print "*"*50
print "END"
print "*"*50


import  codecs
html2 = codecs.open("index.html", "w", encoding="utf-8")
print >> html2, "<html><head><title>Statistics</title>"
print >> html2, """<link rel="stylesheet" href="style.css"></head>"""

print >> html2, """
<body>
<table bgcolor="#00A0E2"width="100%" height="40px "border="0">
  <tr>
    <td width="25%"><img src="ah_friends_t.gif"></td>
    <td><h1>
"""
print >> html2, "Top 20</></h1></td>"
print >> html2, """
  </tr>
</table>
<div class="outer"><div class="inner">
"""
print >> html2, "<p><h2>Today's Top 20:</h2>"
topProducts()


compare.sh:

#!/bin/bash
# Print how much of shopping list $1 also occurs in shopping list $2.
#
# Usage:  compare.sh LIST1 LIST2
# Output: (distinct lines present in both lists) * 100 / (line count of
#         LIST1), as printed by `bc -l`.  Prints "0" when either list cannot
#         be read or LIST1 has no lines.

if [ ! -r "$1" ] || [ ! -r "$2" ]; then
    echo "0"
    exit 1
fi

# wc pads its output with blanks on some systems; strip them for the test below
TOTAL=$(wc -l < "$1" | tr -d '[:space:]')

# avoid a division by zero in bc for an empty list
if [ "$TOTAL" -eq 0 ]; then
    echo "0"
    exit 0
fi

# comm -12 keeps only the lines common to both de-duplicated, sorted lists.
# Process substitution replaces the fixed /tmp file names, which two
# comparisons running at the same time would have overwritten.
MATCHES=$(comm -12 <(sort "$1" | uniq) <(sort "$2" | uniq) | wc -l)
BFF=$(echo "$MATCHES*100/$TOTAL" | bc -l)

echo "$BFF"