Selenium

From XPUB & Lens-Based wiki

An automated browser engine -- useful for things like scraping and browser-based installations. It has been used in projects like PlaySureVeillance and In The Company Of Bots.

Using with python

To use Selenium with Python you just need the selenium Python library (installable with pip) and the "webdriver" for your specific browser -- for instance Firefox or Chrome (and possibly Chromium, though that hasn't been tested).

from __future__ import print_function
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from time import sleep
from selenium.common.exceptions import NoSuchElementException, NoSuchWindowException
from urllib2 import urlopen
import sys, json, os
from argparse import ArgumentParser


BUFSIZE = 1024 * 1000  # download chunk size: ~1 MB per read


def wget(url, tofile):
    """Download *url* to the local path *tofile* in chunks.

    Returns the number of bytes written (0 for an empty response).

    Fixes: the original tested ``data == ""`` to detect EOF, which never
    matches the ``bytes`` returned by a binary read on Python 3 (infinite
    loop); a truthiness test works on both Python 2 and 3.  The response
    handle is now also closed explicitly instead of leaking.
    """
    f = urlopen(url)
    count = 0
    try:
        with open(tofile, "wb") as fout:
            while True:
                data = f.read(BUFSIZE)
                if not data:  # EOF: empty str (Py2) or empty bytes (Py3)
                    break
                count += len(data)
                fout.write(data)
    finally:
        f.close()
    return count

def image_path(title):
    """Map an item title to a filesystem-safe .jpg filename.

    Spaces become underscores, slashes become hyphens, and the whole
    name is lower-cased.
    """
    safe = title.lower().replace(" ", "_").replace("/", "-")
    return safe + ".jpg"

def log(*parts):
    """Write a space-separated status line to stderr, keeping stdout clean
    for the scraped data output."""
    sys.stderr.write(" ".join(str(p) for p in parts) + "\n")


# Command-line interface.  NOTE: the help texts previously contradicted the
# actual defaults (--browser defaults to chrome, --format to csv); they now
# agree with the configured defaults.
ap = ArgumentParser("MRAH scraper")
ap.add_argument("--starturl", default="http://carmentis.be",
                help="page to open before the interactive search step")
ap.add_argument("--browser", choices=("firefox", "chrome", "opera"), default="chrome",
                help="browser driver: chrome (default), firefox, opera")
ap.add_argument("--format", choices=("json", "csv"), default="csv",
                help="output format: csv (default), json")
ap.add_argument("--imagepath", default="images",
                help="directory where downloaded images are stored")
ap.add_argument("--skipimages", action="store_true", default=False,
                help="do not open the high-res popup or download images")
ap.add_argument("--limit", type=int, default=None,
                help="maximum number of items (NOTE(review): not referenced anywhere in this script)")
ap.add_argument("--sleeptime", type=float, default=None,
                help="pause (seconds) after advancing to the next item")

args = ap.parse_args()
if not args.skipimages:
    try:
        os.makedirs(args.imagepath)
    except OSError:
        # Directory most likely exists already; a truly unwritable path
        # will surface later when the first image download fails.
        pass

sleeptime = args.sleeptime
log("Opening browser...")
if args.browser == "opera":
    b = webdriver.Opera()
elif args.browser == "chrome":
    b = webdriver.Chrome()
else:
    b = webdriver.Firefox()
    # Firefox needs a small settle time between page advances unless the
    # user asked for a specific one.
    if sleeptime is None:
        sleeptime = 0.5

b.get(args.starturl)

# The search itself is performed manually by the operator; scraping starts
# from whatever detail view is on screen when enter is pressed.
log("Perform a search and select detail mode, then press enter to start scraping items... (Ctrl-c to cancel)")
raw_input()

# CSS class names of the detail fields scraped from each item page; one
# property per line, in the column order used for CSV output.
props = """
collectionName
inventoryNb
objectName
objectTitle
objectCulture
geography
dating
material
technique
dimensions
legalRightOwner
""".strip().splitlines()

if args.format == "csv":
    from csv import DictWriter
    # Columns: the scraped properties, then the permalink, then (unless
    # images are skipped) the image URL and local filename.
    fieldnames = list(props) + ["url"]
    if not args.skipimages:
        fieldnames += ["imageurl", "image"]
    csvout = DictWriter(sys.stdout, fieldnames=fieldnames)
    csvout.writeheader()

count = 0

# Main scrape loop: read every property of the currently displayed item,
# grab its permalink and (optionally) its high-res image, emit one record,
# then click through to the next item until no "next" control exists.
while True:
    b.implicitly_wait(0)  # don't block on the per-property lookups; missing fields are expected
    count += 1
    item = {}
    # Each property is rendered as <li class="PROP"> containing a
    # .tspPrefix (label) and .tspValue (value) span.
    for p in props:
        try:
            li = b.find_element_by_css_selector("li."+p)
            name = li.find_element_by_css_selector(".tspPrefix")
            span = li.find_element_by_css_selector(".tspValue")
            # item[name.text] = span.text
            item[p] = span.text
        except NoSuchElementException as e:
            pass  # property absent on this item -- simply leave it out

    # permalink / bookmark
    # Retry briefly: the bookmark widget can appear slightly after the rest
    # of the page has rendered.
    tries = 0
    while tries < 5:
        try:
            permalink = b.find_element_by_css_selector("li.bookmark")\
                .find_element_by_css_selector("input")\
                .get_attribute("value")
            item['url'] = permalink
            break
        except NoSuchElementException:
            tries += 1
            sleep(0.1)
    # print ("PERMALINK: {0}".format(permalink))

    if not args.skipimages:
        # Clicking the thumbnail opens the high-res image in a popup
        # window named 'HighResImage' (targeted by switch_to_window below).
        imglink = b.find_element_by_css_selector("dt.detailImg a")
        imglink.click()


        img_src = None
        tries = 0
        # The popup may not exist yet right after the click; retry up to
        # 5 times with a 1s pause.
        while tries < 5:
            try:
                b.switch_to_window('HighResImage')
                img = b.find_element_by_css_selector("img")
                img_src = img.get_attribute("src")
                item['imageurl'] = img_src
                b.close()  # close the popup window...
                b.switch_to_window("")  # ...and return to the main window (NOTE(review): "" presumably selects the unnamed original window -- confirm per driver)
                break
            except NoSuchWindowException as e:
                log("NoSuchWindowException", tries)
                sleep(1)
                tries += 1
        

        if img_src:
            # log("IMAGE: {0}".format(img_src))
            # Name the local file after the inventory number
            # (assumes inventoryNb was scraped above -- KeyError otherwise).
            ifilename = image_path(item['inventoryNb'])
            ipath = os.path.join(args.imagepath, ifilename)
            if wget(img_src, ipath):
                item['image'] = ifilename

    if args.format == "json":
        print (json.dumps(item))
    elif args.format == "csv":
        # Encode values to UTF-8 bytes for Python 2's csv module.
        csvout.writerow({k:v.encode('utf8') for k,v in item.items()})

    # Progress line on stderr so it doesn't pollute the data on stdout.
    log(u"{0} {1}/{2}".format(count, item.get('objectName', u''), item.get('objectTitle', u'')).encode("utf-8"))

    # Find the "next item" link; its absence marks the end of the result set.
    next = None
    try:
        next = b.find_element_by_css_selector('#pageSetEntries-nextSet a')
    except NoSuchElementException as e:
        pass
    if next == None:
        log("END OF LIST")
        break
        
    b.implicitly_wait(10)  # the next page may take a while to render
    next.click()
    if sleeptime:
        sleep(sleeptime)

log("output {0} items".format(count))
b.close()