User:Max Dovey/PT/TRIMESTER 3: Difference between revisions

Revision as of 23:21, 26 June 2014

Around the world in 200 Googles

google.com
google.ac
google.ad
google.ae
google.com.af
google.com.ag
google.com.ai
google.al
google.am
google.co.ao
google.com.ar
google.as
google.at
google.com.au
google.az
google.ba
google.com.bd
google.be
google.bf
google.bg
google.com.bh
google.bi
google.bj
google.com.bn
google.com.bo
google.com.br
google.bs
google.bt
google.co.bw
google.by
google.com.bz
google.ca
google.com.kh
google.cc
google.cd
google.cf
google.cat
google.cg
google.ch
google.ci
google.co.ck
google.cl
google.cm
google.cn
g.cn
google.com.co
google.co.cr
google.com.cu
google.cv
google.com.cy
google.cz
google.de
google.dj
google.dk
google.dm
google.com.do
google.dz
google.com.ec
google.ee
google.com.eg
google.es
google.com.et
google.fi
google.com.fj
google.fm
google.fr
google.ga
google.ge
google.gf
google.gg
google.com.gh
google.com.gi
google.gl
google.gm
google.gp
google.gr
google.com.gt
google.gy
google.com.hk
google.hn
google.hr
google.ht
google.hu
google.co.id
google.ir
google.iq
google.ie
google.co.il
google.im
google.co.in
google.io
google.is
google.it
google.je
google.com.jm
google.jo
google.co.jp
google.co.ke
google.ki
google.kg
google.co.kr
google.com.kw
google.kz
google.la
google.com.lb
google.com.lc
google.li
google.lk
google.co.ls
google.lt
google.lu
google.lv
google.com.ly
google.co.ma
google.md
google.me
google.mg
google.mk
google.ml
google.com.mm
google.mn
google.ms
google.com.mt
google.mu
google.mv
google.mw
google.com.mx
google.com.my
google.co.mz
google.com.na
google.ne
google.com.nf
google.com.ng
google.com.ni
google.nl
google.no
google.com.np
google.nr
google.nu
google.co.nz
google.com.om
google.com.pa
google.com.pe
google.com.ph
google.com.pk
google.pl
google.com.pg
google.pn
google.com.pr
google.ps
google.pt
google.com.py
google.com.qa
google.ro
google.rs
google.ru
google.rw
google.com.sa
google.com.sb
google.sc
google.se
google.com.sg
google.sh
google.si
google.sk
google.com.sl
google.sn
google.sm
google.so
google.st
google.com.sv
google.td
google.tg
google.co.th
google.com.tj
google.tk
google.tl
google.tm
google.to
google.tn
google.com.tn
google.com.tr
google.tt
google.com.tw
google.co.tz
google.com.ua
google.co.ug
google.co.uk
google.us
google.com.uy
google.co.uz
google.com.vc
google.co.ve
google.vg
google.co.vi
google.com.vn
google.vu
google.ws
google.co.za
google.co.zm
google.co.zw

import webbrowser, time 

# Open every entry of googleindex.txt in a new browser tab, one tab every
# four seconds. Each non-blank line of the file is used as a hostname and
# prefixed with 'http://'.
with open("googleindex.txt") as f:
    for line in f:
        line = line.strip()
        if not line:
            # A blank line would otherwise open the bare URL 'http://'.
            continue
        link = 'http://' + line
        time.sleep(4)  # pause before each tab is opened
        webbrowser.open_new_tab(link)

Bank Of Broken Dreams

Process Documentation

STEP 1

Make a list of the URLs of all failed Kickstarter projects. The script below grabs the links to failed projects from every category (over 150), using the prefix http://www.kickspy.com/browse/all-failed/. The output is a text file, 'kickurlscate.txt', containing 6000 failed projects from all the different categories, taking the maximum (100) failed projects from each category.

import urllib2
from bs4 import BeautifulSoup
import requests
import time , re
from itertools import izip
import csv 
from urlparse import urljoin, urldefrag

# Collect links to failed projects from the kickspy.com category listings.
#
# Reads one listing URL per line from 'allfailedurls.txt', fetches each page,
# and appends every link whose href matches "/projects" to 'kickurlscate.txt'
# (one absolute URL per line) so the projects can be scraped later.
prefix = "http://www.kickspy.com"
project_href = re.compile("/projects")  # compiled once, reused for every page
urllist = []

# Strip line endings and skip blank lines so every entry is a clean URL.
with open('allfailedurls.txt', 'rb') as s:
    startlist = [line.strip() for line in s if line.strip()]
limit = len(startlist)
print(limit)

# The 'with' block closes (and therefore flushes) the output file even if a
# fetch raises part-way through the run.
with open('kickurlscate.txt', "a") as f:
    for counter, i in enumerate(startlist, 1):
        page = urllib2.urlopen(i).read()
        soup = BeautifulSoup(page)
        print(counter)
        for a in soup.find_all('a', href=project_href):
            # Resolve the href against the site prefix to get an absolute URL.
            c = urljoin(prefix, a.get('href'))
            urllist.append(c)
            print(c)
            f.write(c + "\n")

example output >>>

http://www.kickspy.com/projects/artastictime/artastic-time
http://www.kickspy.com/projects/561192189/epermanent
http://www.kickspy.com/projects/salgallery/marseille-art
http://www.kickspy.com/projects/htavos/kooky-plush-monsters-and-more
http://www.kickspy.com/projects/18521543/dingbats-handmade-treasures-farmers-market-booth

Step 2

The script below is a basic scraper that goes through the list of failed Kickstarter pages, extracts the title, description and category information from each page, and saves it in CSV format.

import urllib2
from bs4 import BeautifulSoup
import requests
import time , re
from itertools import izip
import csv 
from urlparse import urljoin, urldefrag
# Scrape title, description and category from each project page listed in
# 'fullcleanurls.txt' and save them as rows of 'fullkicklist2.csv'.

# Strip line endings and skip blank lines so every entry is a clean URL.
with open("fullcleanurls.txt", "r") as s:
    startlist = [line.strip() for line in s if line.strip()]
limit = len(startlist)
print(limit)

counter = 0
# Open the CSV once and append one row per page, instead of rewriting the
# whole file after every page. 'wb' is the mode the Python 2 csv module expects.
with open('fullkicklist2.csv', 'wb') as f:
    writer = csv.writer(f)
    for i in startlist[0:500]:  # only the first 500 URLs are processed per run
        counter += 1
        page = urllib2.urlopen(i).read()
        soup = BeautifulSoup(page)
        print(counter)

        heading = soup.find("h1")
        description = soup.find("div", {"class": "description"})
        category = soup.find("div", {"class": "category"})
        if heading is None or description is None or category is None:
            # find() returns None when the element is absent; skip the page
            # rather than crash the whole run with an AttributeError.
            print("skipping (missing title/description/category): " + i)
        else:
            writer.writerow((
                heading.text.encode('utf-8'),
                description.text.encode('utf-8'),
                category.text.encode('utf-8'),
            ))
            f.flush()  # keep rows scraped so far on disk if a later fetch fails
        time.sleep(1)  # pause between requests

output example >>>

Step 3

Now that I have 6000 failed Kickstarters in a nicely formatted CSV, it's time to make some $$$$$$. The script below reads the CSV and turns each row into a dictionary so that I can insert its fields into different parts of an SVG using a brutal find-and-replace method.

import sys, os , time 
import csv
from collections import defaultdict
import re
from xml.sax.saxutils import escape

# Render one PNG "bill" per CSV row: fill the SVG template with the row's
# title, description and category, write it to tmp.svg, and export it with
# Inkscape as bill000001.png, bill000002.png, ...

# The template never changes, so read it once and close the handle.
with open('run/template4.svg', 'r') as template_file:
    template = template_file.read()

# Placeholder words in the template and the CSV column that replaces each:
# "unique" -> title, "world" -> desc, "bath" -> cat.
placeholder = re.compile("unique|world|bath")
field_names = ['title', 'desc', 'cat']
count = 0
with open('fullkickfaillist5.csv', 'rU') as csvfile:
    reader = csv.DictReader(csvfile, delimiter=",", quotechar='"', fieldnames=field_names)
    for row in reader:
        title = row['title']
        desc = row['desc']
        cat = row['cat']
        print(title + desc)
        # escape() turns &, < and > into XML entities so the scraped text
        # cannot produce a malformed SVG.
        # NOTE(review): assumes the placeholders sit in text content or
        # attribute values of the template -- confirm against template4.svg.
        values = {
            "unique": escape(title),
            "world": escape(desc),
            "bath": escape(cat),
        }
        # A single substitution pass: chained .replace() calls would also
        # rewrite a placeholder word occurring inside an inserted value
        # (e.g. a title that contains "world").
        filled = placeholder.sub(lambda match: values[match.group(0)], template)
        with open("tmp.svg", "w") as tmp:
            tmp.write(filled)
        count += 1
        cmd = '/Applications/Inkscape.app/Contents/Resources/bin/inkscape --export-png=bill{0:06d}.png tmp.svg'.format(count)
        print(cmd)
        os.system(cmd)

output 6000 of these bills >>>>

Revision as of 23:14, 26 June 2014 (view source) Max Dovey (talk \| contribs) (→‎Step 3) ← Older edit		Revision as of 23:21, 26 June 2014 (view source) Max Dovey (talk \| contribs) (→‎Step 3) Newer edit →
Line 365:		Line 365:

	output 6000 of these bills >>>>		output 6000 of these bills >>>>
	[[File:~~Slide~~.gif~~\|300 x 100px\|thumbnail~~]]		[[File:Slide2.gif]]