Selenium: Difference between revisions
No edit summary |
|||
Line 4: | Line 4: | ||
== Using with python == | == Using with python == | ||
To use selenium with python you just need the selenium python library (installable with [[pip]]) and the "webdriver" for your specific browser (for instance: firefox or chrome (maybe chromium, though haven't tested that). | |||
<source lang="python"> | <source lang="python"> |
Revision as of 15:53, 20 February 2017
An automated browser engine -- useful for things like scraping and browser-based installations. Has been used in projects like PlaySureVeillance and In The Company Of Bots.
Using with python
To use selenium with python you just need the selenium python library (installable with pip) and the "webdriver" for your specific browser (for instance: firefox or chrome (maybe chromium, though I haven't tested that)).
from __future__ import print_function
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from time import sleep
from selenium.common.exceptions import NoSuchElementException, NoSuchWindowException
from urllib2 import urlopen
import sys, json, os
from argparse import ArgumentParser
BUFSIZE = 1024 * 1000  # download in ~1 MB chunks

def wget (url, tofile):
    """Download *url* to the path *tofile* in BUFSIZE chunks.

    Returns the number of bytes written.
    """
    f = urlopen(url)
    count = 0
    try:
        with open(tofile, "wb") as fout:
            while True:
                data = f.read(BUFSIZE)
                # `not data` detects EOF for both str and bytes streams,
                # unlike the previous `data == ""`, which never matches a
                # bytes response and would loop forever.
                if not data:
                    break
                count += len(data)
                fout.write(data)
    finally:
        # The response object was previously leaked; always close it.
        f.close()
    return count
def image_path (x):
    """Turn an inventory number into a safe lowercase .jpg filename.

    Spaces become underscores and slashes become dashes so the result is
    usable as a plain file name.
    """
    safe = x.replace(" ", "_").replace("/", "-")
    return safe.lower() + ".jpg"
def log (*msg):
    """Write a space-separated message line to stderr.

    stdout is reserved for the scraped data output, so all progress and
    diagnostic messages go to stderr.
    """
    sys.stderr.write(" ".join(str(part) for part in msg) + "\n")
# ---- command-line arguments -------------------------------------------------
ap = ArgumentParser("MRAH scraper")
ap.add_argument("--starturl", default="http://carmentis.be")
# Help text fixed: the actual defaults are chrome and csv (the old strings
# wrongly claimed firefox and json were the defaults).
ap.add_argument("--browser", choices=("firefox", "chrome", "opera"), default="chrome",
                help="browser driver: chrome (default), firefox, opera")
ap.add_argument("--format", choices=("json", "csv"), default="csv",
                help="output format: csv (default), json")
ap.add_argument("--imagepath", default="images")
ap.add_argument("--skipimages", action="store_true", default=False)
ap.add_argument("--limit", type=int, default=None)
ap.add_argument("--sleeptime", type=float, default=None,
                help="seconds to sleep between result pages")
args = ap.parse_args()

# Ensure the image output directory exists (an already-existing directory
# raises OSError, which is fine to ignore).
if not args.skipimages:
    try:
        os.makedirs(args.imagepath)
    except OSError:
        pass

sleeptime = args.sleeptime

# ---- browser startup --------------------------------------------------------
log("Opening browser...")
if args.browser == "opera":
    b = webdriver.Opera()
elif args.browser == "chrome":
    b = webdriver.Chrome()
else:
    b = webdriver.Firefox()

if sleeptime is None:
    sleeptime = 0.5

b.get(args.starturl)
# The operator sets up the search interactively before scraping begins.
log("Perform a search and select detail mode, then press enter to start scraping items... (Ctrl-c to cancel)")
raw_input()
# The detail-page <li> CSS class names we harvest, in output-column order.
props = """
collectionName
inventoryNb
objectName
objectTitle
objectCulture
geography
dating
material
technique
dimensions
legalRightOwner
""".strip().splitlines()

# CSV mode streams rows to stdout. The "url" (permalink) column is always
# appended after the scraped fields; "imageurl"/"image" are appended too
# unless image downloading is disabled.
if args.format == "csv":
    from csv import DictWriter
    fieldnames = props[:]
    fieldnames.append("url")
    if not args.skipimages:
        fieldnames.extend(("imageurl", "image"))
    csvout = DictWriter(sys.stdout, fieldnames=fieldnames)
    csvout.writeheader()
# ---- main scrape loop -------------------------------------------------------
# Walks the result set one detail page at a time: scrapes the property list,
# the permalink and (optionally) the high-resolution image, emits one record
# per page, then clicks through to the next result until the list ends.
count = 0
while True:
    b.implicitly_wait(0)  # poll immediately: missing <li>s are expected here
    count += 1
    item = {}
    # Scrape each known property <li class="{prop}"> if present on this page.
    for p in props:
        try:
            li = b.find_element_by_css_selector("li."+p)
            # The .tspPrefix lookup is kept even though its value is unused:
            # its NoSuchElementException deliberately skips malformed entries.
            name = li.find_element_by_css_selector(".tspPrefix")
            span = li.find_element_by_css_selector(".tspValue")
            item[p] = span.text
        except NoSuchElementException:
            pass  # property absent on this record
    # Permalink / bookmark field can render late, so retry a few times.
    tries = 0
    while tries < 5:
        try:
            permalink = b.find_element_by_css_selector("li.bookmark")\
                .find_element_by_css_selector("input")\
                .get_attribute("value")
            item['url'] = permalink
            break
        except NoSuchElementException:
            tries += 1
            sleep(0.1)
    if not args.skipimages:
        # Open the pop-up window with the high-resolution image and grab
        # its src; the pop-up may take a moment to appear.
        imglink = b.find_element_by_css_selector("dt.detailImg a")
        imglink.click()
        img_src = None
        tries = 0
        while tries < 5:
            try:
                b.switch_to_window('HighResImage')
                img = b.find_element_by_css_selector("img")
                img_src = img.get_attribute("src")
                item['imageurl'] = img_src
                b.close()
                b.switch_to_window("")  # back to the main (unnamed) window
                break
            except NoSuchWindowException:
                log("NoSuchWindowException", tries)
                sleep(1)
                tries += 1
        if img_src:
            ifilename = image_path(item['inventoryNb'])
            ipath = os.path.join(args.imagepath, ifilename)
            if wget(img_src, ipath):
                item['image'] = ifilename
    # Emit the record on stdout; progress goes to stderr via log().
    if args.format == "json":
        print (json.dumps(item))
    elif args.format == "csv":
        csvout.writerow({k:v.encode('utf8') for k,v in item.items()})
    log(u"{0} {1}/{2}".format(count, item.get('objectName', u''), item.get('objectTitle', u'')).encode("utf-8"))
    # --limit was accepted on the command line but silently ignored before;
    # honour it now (default None keeps the old unlimited behavior).
    if args.limit is not None and count >= args.limit:
        log("LIMIT {0} reached".format(args.limit))
        break
    # Advance to the next result, if any.
    next_link = None  # renamed from `next`, which shadowed the builtin
    try:
        next_link = b.find_element_by_css_selector('#pageSetEntries-nextSet a')
    except NoSuchElementException:
        pass
    if next_link is None:
        log("END OF LIST")
        break
    b.implicitly_wait(10)  # the next page can take a while to load
    next_link.click()
    if sleeptime:
        sleep(sleeptime)
log("output {0} items".format(count))
b.close()