Selenium

From XPUB & Lens-Based wiki
Revision as of 17:00, 20 February 2017 by Michael Murtaugh (talk | contribs) (→‎Using with python)
(diff) ← Older revision | Latest revision (diff) | Newer revision → (diff)

An automated browser engine -- useful for things like scraping and browser based installations. Has been used in projects like PlaySureVeillance and In The Company Of Bots.

Using with python

To use selenium with python you just need the selenium python library (installable with pip) and the "webdriver" for your specific browser (for instance: firefox or chrome (maybe chromium, though haven't tested that).

pip install selenium

You also need a "driver" that connects to a specific browser:

Untar/zip them and place in ~/bin which is in my PATH. Sadly, I found that the gecko / firefox one failed with some drop down selects. So I used the chromedriver + Chrome with the scraper code pasted below.

    • TODO** have some simpler cookbook style examples... but for now here's a (somewhat) complicated working script to do some particular scraping of a [ particular online museum's website]. This work was done for the Diversions worksession in Brussels, 2017. A scrape result ordered by name and visualized with leaflet
from __future__ import print_function
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from time import sleep
from selenium.common.exceptions import NoSuchElementException, NoSuchWindowException
from urllib2 import urlopen
import sys, json, os
from argparse import ArgumentParser


BUFSIZE  = 1024 * 1000
def wget (url, tofile):
    f = urlopen(url)
    count = 0
    with open(tofile, "wb") as fout:
        while True:
            data = f.read(BUFSIZE)
            if data == "":
                break
            count += len(data)
            fout.write(data)
    return count

def image_path (x):
    x = x.replace(" ", "_").replace("/", "-").lower()
    x = x+".jpg"
    return x

def log (*msg):
    print (*msg, file=sys.stderr)


ap = ArgumentParser("MRAH scaper")
ap.add_argument("--starturl", default="http://carmentis.be")
ap.add_argument("--browser", choices=("firefox", "chrome", "opera"), default="chrome", help="browser driver: firefox (default), chrome, opera")
ap.add_argument("--format", choices=("json", "csv"), default="csv", help="output format: json (default), csv")
ap.add_argument("--imagepath", default="images")
ap.add_argument("--skipimages", action="store_true", default=False)
ap.add_argument("--limit", type=int, default=None)
ap.add_argument("--sleeptime", type=float, default=None, help="sleeptime")

args = ap.parse_args()
if not args.skipimages:
    try:
        os.makedirs(args.imagepath)
    except OSError:
        pass

sleeptime = args.sleeptime
log("Opening browser...")
driver = None
if args.browser == "opera":
    b = webdriver.Opera()
elif args.browser == "chrome":
    b = webdriver.Chrome()
else:
    b = webdriver.Firefox()
    if sleeptime == None:
        sleeptime = 0.5

b.get(args.starturl)

log("Perform a search and select detail mode, then press enter to start scraping items... (Ctrl-c to cancel)")
raw_input()

props = """
collectionName
inventoryNb
objectName
objectTitle
objectCulture
geography
dating
material
technique
dimensions
legalRightOwner
""".strip().splitlines()

if args.format == "csv":
    from csv import DictWriter
    fieldnames = props[:]
    fieldnames.append("url")
    if not args.skipimages:
        fieldnames.extend(("imageurl", "image"))
    csvout = DictWriter(sys.stdout, fieldnames=fieldnames)
    csvout.writeheader()

count = 0

while True:
    b.implicitly_wait(0)
    count += 1
    item = {}
    for p in props:
        try:
            li = b.find_element_by_css_selector("li."+p)
            name = li.find_element_by_css_selector(".tspPrefix")
            span = li.find_element_by_css_selector(".tspValue")
            # item[name.text] = span.text
            item[p] = span.text
        except NoSuchElementException as e:
            pass

    # permalink / bookmark
    tries = 0
    while tries < 5:
        try:
            permalink = b.find_element_by_css_selector("li.bookmark")\
                .find_element_by_css_selector("input")\
                .get_attribute("value")
            item['url'] = permalink
            break
        except NoSuchElementException:
            tries += 1
            sleep(0.1)
    # print ("PERMALINK: {0}".format(permalink))

    if not args.skipimages:
        imglink = b.find_element_by_css_selector("dt.detailImg a")
        imglink.click()


        img_src = None
        tries = 0
        while tries < 5:
            try:
                b.switch_to_window('HighResImage')
                img = b.find_element_by_css_selector("img")
                img_src = img.get_attribute("src")
                item['imageurl'] = img_src
                b.close()
                b.switch_to_window("")
                break
            except NoSuchWindowException as e:
                log("NoSuchWindowException", tries)
                sleep(1)
                tries += 1
        

        if img_src:
            # log("IMAGE: {0}".format(img_src))
            ifilename = image_path(item['inventoryNb'])
            ipath = os.path.join(args.imagepath, ifilename)
            if wget(img_src, ipath):
                item['image'] = ifilename

    if args.format == "json":
        print (json.dumps(item))
    elif args.format == "csv":
        csvout.writerow({k:v.encode('utf8') for k,v in item.items()})

    log(u"{0} {1}/{2}".format(count, item.get('objectName', u''), item.get('objectTitle', u'')).encode("utf-8"))

    next = None
    try:
        next = b.find_element_by_css_selector('#pageSetEntries-nextSet a')
    except NoSuchElementException as e:
        pass
    if next == None:
        log("END OF LIST")
        break
        
    b.implicitly_wait(10)
    next.click()
    if sleeptime:
        sleep(sleeptime)

log("output {0} items".format(count))
b.close()