User:Andre Castro/prototyping/1.2/Archiveorg-seachTerm: Difference between revisions
Andrecastro (talk | contribs) No edit summary |
Andrecastro (talk | contribs) No edit summary |
||
Line 2: | Line 2: | ||
==Structure== | ==Structure== | ||
< | |||
<source lang="email"> | |||
[Player / Front End] < --------- | |||
\ | |||
[Player / Front End] < ------- | \ | ||
\ | |||
[Sound Scraper] -----> { Sound sequences pool } | |||
^----------------- | |||
\ | |||
[Texts spide] ----> { Texts pool } | [Texts spide] ----> { Texts pool } | ||
^ | ^ | ||
| | | | ||
topics --- { topics pool } | topics ---> { topics pool } | ||
</source> | |||
Revision as of 22:53, 19 February 2012
Blind Archive
Structure
[Player / Front End] < ---------
\
\
\
[Sound Scraper] -----> { Sound sequences pool }
^-----------------
\
[Texts spider] ----> { Texts pool }
^
|
topics ---> { topics pool }
Searching soundfiles per term
Fetching sound files from archive.org based on search terms
In order to do that I am making 2 API requests:
- 1 - searching for a given term within mediaType:Audio
- getting the identifier of the first search occurrence id_0
- 2 - requesting details on identifier (id_0)
I use the 2nd (details) API query to look for the contained files.
From this list I get the first ogg (in case ogg files are present)
Downloading soundfile: in archive.org, files are stored at http://www.archive.org/download/ + identifier + filename
12/02/2012 - Latest script
#!/usr/bin/python
"""Collect one small Ogg file from archive.org for every word of a sentence.

Python 2 script (it relies on urllib2).

Steps:
  1. Split the sentence into words and punctuation marks.
  2. For every word, query the archive.org advanced search restricted to
     mediatype:Audio.
  3. For every word with results, walk the returned items, ask for the
     item details and pick the first .ogg file that is not larger than
     MAX_SIZE bytes.
  4. Download the selected files into a freshly created, time-stamped
     directory, numbered in sentence order.
"""
import datetime
import json
import os
import re
import shutil
import urllib
import urllib2

SEARCH_URL = 'http://www.archive.org/advancedsearch.php?q='
DETAILS_URL = 'http://www.archive.org/details/'
DOWNLOAD_URL = 'http://www.archive.org/download/'
MAX_SIZE = 1000000  # largest sound file accepted, in bytes

# create the directory where the sound files will be saved
dt_obj = datetime.datetime.now()
date_str = dt_obj.strftime("%Y%m%d-%H%M%S")
archive_dir = 'sf-archive-' + date_str
os.makedirs(archive_dir)

sentence = "US President Obama unveils a $3.8 trillion budget, with plans to raise taxes on the wealthy"
# find words and punctuation marks and split them into a list
sentance_list = re.findall(r"[\w']+|[.,!?;]", sentence)

# One entry per token, always four fields:
#   [term, url-or-marker, response, num of results]
# For punctuation the second field is the marker 'fullstop' or 'comma'.
info_list = []
# One entry per token, in sentence order: a download url, 0 (no usable
# file), 'comma' or 'fullstop'. The download loop below indexes
# sentance_list with the same position, so the two lists must stay aligned.
download_urls = []

# Results of search term + mediatype:Audio
for term in sentance_list:
    if ('.' in term) or ('?' in term) or ('!' in term):
        info_list.append([term, "fullstop", "No Respose", "No Results"])
        print('stop: ' + term)
    elif (',' in term) or (';' in term):
        info_list.append([term, "comma", "No Respose", "No Results"])
        print('comma: ' + term)
    else:
        print('word: ' + term)
        # quote the term so characters such as "'" cannot break the query
        url = (SEARCH_URL + urllib.quote_plus(term)
               + '+AND+mediatype:Audio&rows=300&output=json')  # api query
        print('url: ' + url)
        search = urllib2.urlopen(url)
        try:
            search_result = json.load(search)
        finally:
            search.close()
        response = search_result['response']
        info_list.append([term, url, response, response['numFound']])

# Go through the info list, checking whether the entry is a punctuation
# mark, whether there are any search results, and whether one of the
# found items contains a small enough ogg file.
for term, url, response, num_results in info_list:
    print(term)
    print(url)
    print(num_results)
    print('')
    # exact comparison: a search url may itself contain the word "comma"
    if url == 'comma':
        download_urls.append('comma')
        print('comma found')
    elif url == 'fullstop':
        download_urls.append('fullstop')
    elif num_results < 1:
        print('num_results is 0')
        print('')
        download_urls.append(0)
    else:
        audio_url = None
        # Iterate over the docs actually returned: numFound can be larger
        # than the number of rows requested.
        for doc in response['docs']:
            identifier = doc['identifier']
            print('')
            print(identifier)
            if "Ogg Vorbis" not in doc.get('format', []):
                continue
            # details on identifier, e.g.
            # http://www.archive.org/details/electroacoustic_music&output=json
            details_url = DETAILS_URL + identifier + '&output=json'
            print(details_url)
            try:
                details_search = urllib2.urlopen(details_url)
            except urllib2.HTTPError:
                print('404' + details_url)
                continue
            try:
                details_result = json.load(details_search)
            finally:
                details_search.close()
            files = details_result.get('files', {})  # the contained files
            for ogg in files:
                if not ogg.lower().endswith('.ogg'):
                    continue
                print("ogg found")
                print(ogg)
                size = files[ogg].get('size')
                if size is None:
                    continue
                print(size)
                if int(size) > MAX_SIZE:  # check file size
                    print("file TOO large")
                else:
                    print("RIGHT SIZE")
                    # NOTE(review): file keys appear to start with '/';
                    # normalise so the url is valid either way - confirm.
                    audio_url = (DOWNLOAD_URL + identifier + '/'
                                 + ogg.lstrip('/'))
                    break
            if audio_url is not None:
                break
        # Record exactly one entry even when nothing usable was found, so
        # positions keep matching sentance_list.
        download_urls.append(audio_url if audio_url is not None else 0)

print(download_urls)

# silence and punctuation soundfiles - WILL LEAVE THEM OUT FOR NOW - since
# you don't have it in your machine
#silence = "silences/silence.ogg"
#comma = "silences/comma.ogg"
#fullstop = "silences/fullstop.ogg"

for i, url in enumerate(download_urls):  # Download files from url
    num = '%02d' % (1 + i)
    if url == 0:
        #silence_file = archive_dir+"/"+str(num)+'silence.ogg'
        #shutil.copyfile(silence, silence_file)
        pass
    elif url == 'comma':
        #comma_file = archive_dir+"/"+str(num)+'comma.ogg'
        #shutil.copyfile(comma, comma_file)
        pass
    elif url == 'fullstop':
        #fullstop_file = archive_dir+"/"+str(num)+'fullstop.ogg'
        #shutil.copyfile(fullstop, fullstop_file)
        pass
    else:
        file_name = str(num) + sentance_list[i] + '.ogg'
        print(file_name + ' ' + url)
        urllib.urlretrieve(url, os.path.join(archive_dir, file_name))
09/02/2012 - Old
#!/usr/bin/python
"""Download the first .ogg/.mp3 file of the first archive.org audio hit.

Python 2 script (it relies on urllib2). It searches archive.org for
`term` within mediatype:Audio, asks for the details of the first item
found, and saves the first listed .ogg or .mp3 file as term + extension
in the current directory.
"""
import json
import re
import urllib
import urllib2

# matches the extension of the sound files we are interested in
AUDIO_EXTENSION = re.compile(r'\.(?:ogg|mp3)$')

# ====API Query====
term = 'orange'
# quote the term so it cannot break the query string
url = ('http://www.archive.org/advancedsearch.php?q=' + urllib.quote_plus(term)
       + '+AND+mediatype:Audio&rows=15&output=json')  # api query
search = urllib2.urlopen(url)
try:
    search_result = json.load(search)
finally:
    search.close()

docs = search_result['response']['docs']
if not docs:
    raise SystemExit('no audio results for ' + term)
id_0 = docs[0]['identifier']  # look for the identifier in json dict

details_url = 'http://www.archive.org/details/' + id_0 + '&output=json'  # details on identifier
details_search = urllib2.urlopen(details_url)
try:
    details_result = json.load(details_search)
finally:
    details_search.close()

files = details_result['files'].keys()  # look for the contained files
files_list = [i for i in files if AUDIO_EXTENSION.search(i)]
print(files_list)
if not files_list:
    raise SystemExit('no .ogg or .mp3 file in ' + id_0)

# Take the extension from the file that is actually downloaded, so the
# saved name always matches its content.
first_file = files_list[0]
extension = AUDIO_EXTENSION.search(first_file).group(0)
# NOTE(review): file keys appear to start with '/'; normalise so the url
# is valid either way - confirm.
audio_url = ('http://www.archive.org/download/' + id_0 + '/'
             + first_file.lstrip('/'))
urllib.urlretrieve(audio_url, term + extension)
print(first_file)
print(audio_url)