User:Birgit bachler/openevents/

From XPUB & Lens-Based wiki

Description

Bonuskaart-friends is a social network that connects people according to their shopping behaviour.
Just provide the system with your bonuskaart number and, optionally, your name, and you will find
like-minded shoppers. Your profile does not require any further maintenance, since it is generated
entirely from the shopping data you give to albertheijn by swiping the barcode of your card
every time you do your groceries. Enjoy photo galleries of your purchased products, have a look at
your detailed shopping list and get to know your new friends.

Like any customer card, the albertheijn bonuskaart is an effective tool for data mining and creating
customer profiles. The Dutch supermarket chain albertheijn allows visitors to its website to view
their previous purchases just by filling in the unique number from the back of their popular
bonuskaart. By questioning the relevance and sensitivity of this data, the social network of
Bonuskaart-friends, whose look is modelled on Facebook, tries to portray the possible abuse of that
data and the false conclusions that can be drawn about a person from it.

To counteract this mechanism, the card number 2620496071032 is available as a sticker to paste onto
your personal card in order to conceal your shopping behaviour and share one identity with many other
shoppers.

Documentation

Openevents02.png

Openevents products.jpg

Bonuskaart club.JPG

Source Code

ah04.py:

 #!/usr/local/bin/python
import html5lib, urllib2, urlparse
import os, glob
from os import system, sys
import subprocess
import time, random

cardnumber=""



def runBash(cmd):
    """Run *cmd* through the shell and return what it printed on stdout.

    cmd -- complete command line; it is passed to the shell unchanged
           (shell=True), so only trusted text may be passed in.

    Returns the captured standard output with surrounding whitespace
    stripped.  Standard error is not captured and the exit status is
    ignored.
    """
    p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE)
    # communicate() reads until EOF, closes the pipe and waits for the child,
    # so neither a zombie process nor an open descriptor is left behind.
    out = p.communicate()[0]
    return out.strip()  #This is the stdout from the shell command

def absolutize (href, base):
    """
    Return href as an absolute URL.

    A href that already begins with "http://" (compared case-insensitively)
    is handed back untouched; anything else is resolved against base.
    """
    already_absolute = href.lower().startswith("http://")
    if already_absolute:
        return href
    return urlparse.urljoin(base, href)

def openURL (url, data):
    """
    Open url with a browser-like User-Agent and return (page, actualurl).

    url  -- address to fetch
    data -- request body to send, or None / "" for a plain request.  When
            url contains a "?", everything after the first "?" replaces
            data and is sent as the request body.

    page is the open response object (the caller must close it); actualurl
    is the address reported by the response, which may differ from url
    after a redirect.
    """
    request = urllib2.Request(url)
    user_agent = "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.14) Gecko/20080418 Ubuntu/7.10 (gutsy) Firefox/2.0.0.14"
    request.add_header("User-Agent", user_agent)

    if "?" in url:
        # Split on the first "?" only: a query string that itself contains
        # a "?" used to make the two-way unpacking raise ValueError.
        # The request above was already built from the full url, so only
        # the body is derived here.
        data = url.split("?", 1)[1]

    if data:
        pagefile = urllib2.urlopen(request, data)
    else:
        pagefile = urllib2.urlopen(request)

    realurl = pagefile.geturl()
    return (pagefile, realurl)

def downloadURL (url, foldername):
    """
    Download url and store it inside foldername.

    url        -- address of the resource to fetch
    foldername -- existing directory that receives the file

    The file is named after the last component of the URL's path and is
    written in binary mode.  The request carries a browser-like User-Agent.
    Returns nothing.
    """
    request = urllib2.Request(url)
    user_agent = "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.14) Gecko/20080418 Ubuntu/7.10 (gutsy) Firefox/2.0.0.14"
    request.add_header("User-Agent", user_agent)
    pagefile = urllib2.urlopen(request)
    try:
        data = pagefile.read()
    finally:
        # release the connection even when reading fails
        pagefile.close()

    urlpath = urlparse.urlparse(url)[2]
    filename = os.path.join(foldername, os.path.split(urlpath)[1])
    out = open(filename, "wb")
    try:
        out.write(data)
    finally:
        # close explicitly so the data is flushed before other tools
        # (mv, montage, convert) touch the file
        out.close()


def topProducts(out=None, path="alle.txt", limit=20):
    """Write the most frequent words of the product log as HTML lines.

    out   -- object with a write() method that receives the output; when
             omitted, the module-level html2 file is used
    path  -- UTF-8 text file to analyse (one product name per line)
    limit -- maximum number of words to report

    Every whitespace-separated token has surrounding punctuation stripped
    and is lower-cased before counting.  Words are ranked by descending
    count, ties broken alphabetically, and each one is written as
    "word: count", a newline, "<br>" and another newline.
    """
    import codecs
    from string import punctuation

    if out is None:
        out = html2

    words = {}
    # Decode as UTF-8: the log is written as UTF-8 and the raw bytes of a
    # non-ASCII product name cannot be passed to a UTF-8 codecs writer.
    source = codecs.open(path, "r", encoding="utf-8")
    try:
        for line in source:
            for token in line.split():
                word = token.strip(punctuation).lower()
                if not word:
                    # token consisted of punctuation only (e.g. "&")
                    continue
                words[word] = words.get(word, 0) + 1
    finally:
        source.close()

    top_words = sorted(words.items(),
                       key=lambda item: (-item[1], item[0]))[:limit]

    for word, frequency in top_words:
        out.write("%s: %d\n<br>\n" % (word, frequency))

# Ask for an optional new card ("<13-digit number> <name>") and put it at the
# top of cardnumbers.txt so that it is processed first.
print("*"*50)
print("*"*50)
print("New Bonuskaartfriend? Otherwise just press enter. Example:")
print("2601234567890 YourName")
newCard = raw_input().strip()  # whitespace-only input counts as "no new card"
print("*"*50)
print("*"*50)
if newCard != "":
    # read the current contents of the file; a missing file (first run)
    # simply means there are no earlier cards
    text = ""
    if os.path.exists("cardnumbers.txt"):
        f = open("cardnumbers.txt")
        try:
            text = f.read()
        finally:
            f.close()
    # open the file again for writing
    f = open("cardnumbers.txt", "w")
    try:
        f.write(str(newCard)+"\n")
        # write the original contents
        f.write(text)
    finally:
        f.close()

print("*"*50)
print("START")
print("*"*50)


# ---------------------------------------------------------------------------
# Main loop.  For every line of cardnumbers.txt ("<13 digits> <optional name>"):
#   1. fetch the card's "previously bought" page and every page it links to
#      through <a class="fillet">,
#   2. download the images of those pages and record each <h3> text (except
#      "Afdeling") in <card>/<card>.txt and at the top of alle.txt,
#   3. build <card>/<card>.jpg and <card>/<card>.gif with ImageMagick,
#   4. score the card against every other 26* directory with compare.sh,
#   5. write <card>/index.html.
# ---------------------------------------------------------------------------
import codecs
import socket


def _prependLine(filename, text):
    """Insert text plus a newline at the top of the UTF-8 file filename.

    A missing file is treated as empty so that the very first run works.
    """
    previous = ""
    if os.path.exists(filename):
        f = codecs.open(filename, "r", encoding="utf-8")
        try:
            previous = f.read()
        finally:
            f.close()
    f = codecs.open(filename, "w", encoding="utf-8")
    try:
        f.write(text + "\n")
        f.write(previous)
    finally:
        f.close()


def _escapeHTML(text):
    """Return text with &, < and > replaced by HTML entities."""
    return text.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")


txx = open("cardnumbers.txt", "r")
try:
    cardlines = txx.readlines()
finally:
    txx.close()

for line in cardlines:
    print("*"*50)
    print("Busy with Bonuskaartfriend %s" % line)
    print("*"*50)
    url = "http://www.ah.nl/previouslybought/PreviouslyBought.do"

    line = line.rstrip("\n")
    line2 = line[0:13]
    print("%s %s" % (line, line2))
    cardnumber = line2

    # The card number becomes part of directory names and shell commands
    # (including "rm -r"), so anything that is not exactly 13 digits is
    # skipped.  An empty number would otherwise address /output and
    # /products in the filesystem root.
    if len(cardnumber) != 13 or not cardnumber.isdigit():
        print("Skipping invalid line in cardnumbers.txt: %r" % line)
        continue

    outputdir = os.path.join(cardnumber, "output")
    if not os.path.isdir(outputdir):
        os.makedirs(outputdir)

    f, url = openURL(url, "cardNumber="+cardnumber)
    try:
        parser = html5lib.HTMLParser(tree=html5lib.treebuilders.getTreeBuilder("BeautifulSoup"))
        firstPage = parser.parse(f)
    finally:
        f.close()

    #fillet is the class of the links to the item pages
    loadedURL = {}
    txt = codecs.open(cardnumber+"/"+cardnumber+".txt", "w", encoding="utf-8")
    try:
        for a in firstPage.findAll("a", { "class" : "fillet" }):
            requested = absolutize(a["href"], url)
            if requested in loadedURL:
                continue

            #LOAD URL...
            print("PAGE  %s" % requested)
            f2, pageurl = openURL(requested, None)
            try:
                parser = html5lib.HTMLParser(tree=html5lib.treebuilders.getTreeBuilder("BeautifulSoup"))
                page = parser.parse(f2)
            finally:
                f2.close()
            # Remember both the address found in the page and the address
            # actually served, so a redirected page is not fetched twice.
            loadedURL[requested] = True
            loadedURL[pageurl] = True

            for img in page.findAll("img"):
                src = img.get("src")  # None when the tag has no src
                if not src:
                    continue
                src = absolutize(src, pageurl)
                # random pause of up to one second between downloads
                time.sleep(random.random())
                try:
                    downloadURL(src, outputdir)
                except urllib2.URLError:
                    print("ncfiles: Urllib2 error (%s)" % (sys.exc_info()[1],))
                except socket.error:
                    print("ncfiles: Socket error (%s) for %s" % (sys.exc_info()[1], src))

            for h3 in page.findAll("h3"):
                if not h3.contents:
                    continue
                value = h3.contents[0]
                if value != "Afdeling":
                    txt.write(value + "\n")
                    _prependLine("alle.txt", value)
    finally:
        # Must be closed before compare.sh runs below, otherwise the
        # script reads a list that is still sitting in the write buffer.
        txt.close()

    system("mkdir "+cardnumber+"/products")
    system("mv "+cardnumber+"/output/*_80.jpg "+cardnumber+"/products")
    system("montage "+cardnumber+"/products/*.jpg "+cardnumber+"/"+cardnumber+".jpg")
    system("convert -delay 100 "+cardnumber+"/products/*.jpg "+cardnumber+"/"+cardnumber+".gif")
    system("rm -r "+cardnumber+"/output")
    system("rm -r "+cardnumber+"/products")
    #system("display "+str(cardnumber)+"/"+str(cardnumber)+".jpg &")


    #GRAB ALL EXISTING BONUSCARD-DIRECTORIES FOR COMPARISON-LOOP
    # NOTE(review): directories are listed from this absolute path while
    # compare.sh receives paths relative to the working directory, so the
    # script presumably has to be started from this directory -- confirm.
    path = '/home/merglind/Documents/OpenEvents/try04'
    friends = []
    for infile in glob.glob( os.path.join(path, '26*') ):
        compareNumber = os.path.split(infile)[1]
        if compareNumber == cardnumber:
            continue
        cmd = "./compare.sh "+cardnumber+"/"+cardnumber+".txt  "+compareNumber+"/"+compareNumber+".txt"
        try:
            value = round(float(runBash(cmd)), 2)
        except ValueError:
            # compare.sh printed nothing numeric (e.g. a list is missing)
            continue
        if value > 10.0:
            friends.append((value, compareNumber))
    friends.sort(reverse=True)


#################HTML STARTS HERE#########################
    if line[14:] != "":
        nickname = line[14:]
    else:
        nickname = cardnumber
    # the name is free text typed by a user; escape it before embedding
    shown = _escapeHTML(nickname)

    chunks = [
        "<html><head><title>"+shown+"'s Profile</title>",
        """<link rel="stylesheet" href="../style.css"></head>""",
        """
<body>
<table bgcolor="#00A0E2"width="100%" height="40px "border="0">
  <tr>
    <td width="25%"><img src="../ah_friends_t.gif"></td>
    <td><h1>
""",
        ""+cardnumber+"'s Profile</></h1></td>",
        """
  </tr>
</table>
<div class="outer"><div class="inner">
""",
        "<p><h2>This is "+shown+"</h2>",
        "<div><a href='../"+cardnumber+"/"+cardnumber+".jpg'><img src='../"+cardnumber+"/"+cardnumber+".gif'></a></div>",
        "<h3>"+shown+"'s BEST BONUS-FRIENDS:</h3>",
        "<ul>",
    ]
    # friends only holds matches above 10%, best match first
    for value, card in friends:
        score = "%0.2f" % value
        chunks.append("<li><a href='../"+card+"/index.html'>"+card+"</a> "+score+"% Match</li>")
    chunks.append("</ul>")
    chunks.append("<h3><a href='../"+cardnumber+"/"+cardnumber+".txt'>"+shown+"'s Shopping List</a></h3>")
    #chunks.append("<div><img src='../"+cardnumber+"/"+cardnumber+".jpg'></div>")
    chunks.append("</div></div></body></html>")

    html = codecs.open(cardnumber+"/index.html", "w", encoding="utf-8")
    try:
        html.write("\n".join(chunks) + "\n")
    finally:
        html.close()

    cardnumber = ""

print("*"*50)
print("END")
print("*"*50)


# Write the overall statistics page (index.html in the working directory).
# html2 has to stay a module-level name: topProducts() writes to it.
import  codecs
html2 = codecs.open("index.html", "w", encoding="utf-8")
try:
    html2.write("<html><head><title>Statistics</title>" + "\n")
    html2.write("""<link rel="stylesheet" href="style.css"></head>""" + "\n")

    html2.write("""
<body>
<table bgcolor="#00A0E2"width="100%" height="40px "border="0">
  <tr>
    <td width="25%"><img src="ah_friends_t.gif"></td>
    <td><h1>
""" + "\n")
    html2.write("Top 20</></h1></td>" + "\n")
    html2.write("""
  </tr>
</table>
<div class="outer"><div class="inner">
""" + "\n")
    html2.write("<p><h2>Today's Top 20:</h2>" + "\n")
    topProducts()
    # terminate the document the same way the profile pages do
    html2.write("</div></div></body></html>" + "\n")
finally:
    # without an explicit close the buffered page may never reach the disk
    html2.close()


compare.sh:


#!/bin/bash
# Usage: compare.sh LIST1 LIST2
#
# Prints MATCHES*100/TOTAL, where
#   TOTAL   = number of lines in LIST1 (duplicates included)
#   MATCHES = number of distinct lines that occur in both LIST1 and LIST2
# Prints 0 when an input cannot be read or LIST1 is empty, so the caller
# always receives a number.

if [ "$#" -ne 2 ] || [ ! -r "$1" ] || [ ! -r "$2" ]; then
    echo 0
    exit 1
fi

# $(( ... )) strips the padding some wc implementations add
TOTAL=$(( $(wc -l < "$1") ))

#echo $TOTAL

if [ "$TOTAL" -eq 0 ]; then
    # nothing to compare against; avoids a division by zero in bc
    echo 0
    exit 0
fi

# comm -12 keeps only the lines common to both sorted, de-duplicated lists.
# Process substitution replaces the fixed /tmp files, which collided when
# two comparisons ran at the same time.
MATCHES=$(( $(comm -12 <(sort -u -- "$1") <(sort -u -- "$2") | wc -l) ))
BFF=$(echo "$MATCHES*100/$TOTAL" | bc -l)

echo "$BFF"