Selenium
An automated browser engine -- useful for things like scraping and browser-based installations. It has been used in projects like PlaySureVeillance and In The Company Of Bots.
Using with python
To use Selenium with Python you just need the selenium Python library (installable with pip) and the "webdriver" for your specific browser (for instance Firefox or Chrome -- possibly Chromium too, though I haven't tested that).
pip install selenium
You also need a "driver" that connects to a specific browser:
- Firefox: https://github.com/mozilla/geckodriver/releases
- Chrome: https://chromedriver.storage.googleapis.com/index.html?path=2.25/
Untar/unzip them and place them in ~/bin, which is in my PATH. Sadly, I found that the gecko/Firefox driver failed with some drop-down selects, so I used chromedriver + Chrome with the scraper code pasted below.
- TODO: add some simpler cookbook-style examples... but for now here's a (somewhat) complicated working script that scrapes a particular online museum's website. This work was done for the Diversions worksession in Brussels, 2017. A scrape result, ordered by name, was visualized with Leaflet.
from __future__ import print_function
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from time import sleep
from selenium.common.exceptions import NoSuchElementException, NoSuchWindowException
from urllib2 import urlopen
import sys, json, os
from argparse import ArgumentParser
# Chunk size for streaming downloads (≈1 MB).
BUFSIZE = 1024 * 1000
def wget (url, tofile):
    """Download url to the local path tofile in BUFSIZE chunks.

    Returns the number of bytes written. The response is always closed,
    even if the local write fails.
    """
    f = urlopen(url)
    count = 0
    try:
        with open(tofile, "wb") as fout:
            while True:
                data = f.read(BUFSIZE)
                # `if not data` works for both str (Py2) and bytes (Py3);
                # the original `data == ""` is never true for bytes and
                # would loop forever under Python 3.
                if not data:
                    break
                count += len(data)
                fout.write(data)
    finally:
        f.close()
    return count
def image_path (x):
    """Turn an inventory number into a filesystem-safe .jpg filename:
    spaces become underscores, slashes become dashes, everything lowercased."""
    safe = x.replace(" ", "_").replace("/", "-")
    return safe.lower() + ".jpg"
def log (*msg):
    """Write a space-joined diagnostic message to stderr, keeping stdout
    free for the scraped data (csv/json)."""
    sys.stderr.write(" ".join(str(m) for m in msg) + "\n")
# ---- command-line interface -------------------------------------------------
ap = ArgumentParser("MRAH scraper")  # fixed typo: was "scaper"
ap.add_argument("--starturl", default="http://carmentis.be",
    help="page to open before the interactive search (default: %(default)s)")
# Help text corrected: the actual default is chrome, not firefox.
ap.add_argument("--browser", choices=("firefox", "chrome", "opera"), default="chrome",
    help="browser driver: chrome (default), firefox, opera")
# Help text corrected: the actual default is csv, not json.
ap.add_argument("--format", choices=("json", "csv"), default="csv",
    help="output format: csv (default), json")
ap.add_argument("--imagepath", default="images",
    help="directory for downloaded images (default: %(default)s)")
ap.add_argument("--skipimages", action="store_true", default=False,
    help="do not open/download the high-resolution images")
# NOTE(review): --limit is accepted but never used in the visible script.
ap.add_argument("--limit", type=int, default=None)
ap.add_argument("--sleeptime", type=float, default=None,
    help="pause (seconds) after clicking through to the next item")
args = ap.parse_args()
# ---- setup: image dir, browser, interactive search --------------------------
if not args.skipimages:
    try:
        os.makedirs(args.imagepath)
    except OSError:
        # Directory already exists (or can't be created) -- best effort, carry on.
        pass
sleeptime = args.sleeptime
log("Opening browser...")
# (removed unused `driver = None` -- the driver is bound to `b` below)
if args.browser == "opera":
    b = webdriver.Opera()
elif args.browser == "chrome":
    b = webdriver.Chrome()
else:
    b = webdriver.Firefox()
if sleeptime is None:  # was `== None`; identity test is the idiom
    sleeptime = 0.5    # default pause between page advances
b.get(args.starturl)
# The search itself is done by hand in the opened browser; scraping starts on Enter.
log("Perform a search and select detail mode, then press enter to start scraping items... (Ctrl-c to cancel)")
raw_input()  # NOTE: Python 2 builtin; would be input() under Python 3
# The metadata fields scraped from each detail page; each name doubles as the
# CSS class of the <li> element that carries the value.
props = [
    "collectionName",
    "inventoryNb",
    "objectName",
    "objectTitle",
    "objectCulture",
    "geography",
    "dating",
    "material",
    "technique",
    "dimensions",
    "legalRightOwner",
]
if args.format == "csv":
    from csv import DictWriter
    # Columns: all scraped fields, the permalink, and (unless skipped) the image info.
    fieldnames = list(props) + ["url"]
    if not args.skipimages:
        fieldnames += ["imageurl", "image"]
    csvout = DictWriter(sys.stdout, fieldnames=fieldnames)
    csvout.writeheader()
# ---- main scrape loop -------------------------------------------------------
# For each detail page: extract the labelled fields, fetch the permalink and
# (optionally) the high-resolution image, emit one record on stdout, then
# click through to the next item until no "next" link remains.
count = 0
while True:
    b.implicitly_wait(0)  # fail fast on the per-field lookups below
    count += 1
    item = {}
    # Each property is an <li class="PROP"> holding .tspPrefix / .tspValue spans;
    # the .tspPrefix lookup doubles as a presence check (it can raise too).
    for p in props:
        try:
            li = b.find_element_by_css_selector("li."+p)
            name = li.find_element_by_css_selector(".tspPrefix")
            span = li.find_element_by_css_selector(".tspValue")
            item[p] = span.text
        except NoSuchElementException:
            # Field absent on this record -- simply leave it out of the row.
            pass
    # Permalink / bookmark widget loads late: retry up to 5 times, 0.1s apart.
    tries = 0
    while tries < 5:
        try:
            permalink = b.find_element_by_css_selector("li.bookmark")\
                .find_element_by_css_selector("input")\
                .get_attribute("value")
            item['url'] = permalink
            break
        except NoSuchElementException:
            tries += 1
            sleep(0.1)
    if not args.skipimages:
        # Clicking the thumbnail opens a pop-up window named 'HighResImage'.
        imglink = b.find_element_by_css_selector("dt.detailImg a")
        imglink.click()
        img_src = None
        tries = 0
        while tries < 5:
            try:
                b.switch_to_window('HighResImage')
                img = b.find_element_by_css_selector("img")
                img_src = img.get_attribute("src")
                item['imageurl'] = img_src
                b.close()
                b.switch_to_window("")  # return to the main window
                break
            except NoSuchWindowException:
                # Pop-up not there yet; wait a second and retry.
                log("NoSuchWindowException", tries)
                sleep(1)
                tries += 1
        if img_src:
            # NOTE(review): assumes 'inventoryNb' was scraped above --
            # a record without it would raise KeyError here; confirm.
            ifilename = image_path(item['inventoryNb'])
            ipath = os.path.join(args.imagepath, ifilename)
            if wget(img_src, ipath):
                item['image'] = ifilename
    if args.format == "json":
        print (json.dumps(item))
    elif args.format == "csv":
        # Python 2: encode unicode cell values to UTF-8 before csv writes them.
        csvout.writerow({k: v.encode('utf8') for k, v in item.items()})
    log(u"{0} {1}/{2}".format(count, item.get('objectName', u''), item.get('objectTitle', u'')).encode("utf-8"))
    # Advance to the next record; a missing link means the end of the result set.
    next_link = None  # renamed from `next` to avoid shadowing the builtin
    try:
        next_link = b.find_element_by_css_selector('#pageSetEntries-nextSet a')
    except NoSuchElementException:
        pass
    if next_link is None:  # was `== None`
        log("END OF LIST")
        break
    b.implicitly_wait(10)  # allow the next page time to render
    next_link.click()
    if sleeptime:
        sleep(sleeptime)
log("output {0} items".format(count))
b.close()