Selenium
An automated browser engine, useful for things like scraping and browser-based installations. It has been used in projects like PlaySureVeillance and In The Company Of Bots.

* http://docs.seleniumhq.org/
== Using with python ==
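Selenium's Python bindings drive a real browser process. As a minimal sketch (assuming Selenium is installed via pip and, for newer versions, the matching driver binary such as geckodriver is on your PATH), a session looks like this:

<source lang="python">
from selenium import webdriver

b = webdriver.Firefox()               # launches an actual browser window
b.get("http://docs.seleniumhq.org/")  # navigate; blocks until the page has loaded
print(b.title)                        # <title> of the loaded document
b.quit()                              # shut the browser down again
</source>

The script below uses the same API to scrape object records, and optionally their images, from the collection database at http://carmentis.be: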
<source lang="python">
from __future__ import print_function
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, NoSuchWindowException
from time import sleep
from urllib2 import urlopen
from argparse import ArgumentParser
import sys, json, os
BUFSIZE = 1024 * 1000

def wget (url, tofile):
    """Download url to tofile in BUFSIZE chunks; return the number of bytes written."""
    f = urlopen(url)
    count = 0
    with open(tofile, "wb") as fout:
        while True:
            data = f.read(BUFSIZE)
            if not data:
                break
            count += len(data)
            fout.write(data)
    return count

def image_path (x):
    # turn an inventory number into a safe filename
    x = x.replace(" ", "_").replace("/", "-").lower()
    return x + ".jpg"

def log (*msg):
    # progress messages go to stderr so stdout stays clean for the data
    print (*msg, file=sys.stderr)
ap = ArgumentParser("MRAH scraper")
ap.add_argument("--starturl", default="http://carmentis.be")
ap.add_argument("--browser", choices=("firefox", "chrome", "opera"), default="chrome", help="browser driver: chrome (default), firefox, opera")
ap.add_argument("--format", choices=("json", "csv"), default="csv", help="output format: csv (default), json")
ap.add_argument("--imagepath", default="images", help="directory to download images into")
ap.add_argument("--skipimages", action="store_true", default=False, help="don't download images")
ap.add_argument("--limit", type=int, default=None, help="stop after this many items")
ap.add_argument("--sleeptime", type=float, default=None, help="seconds to pause between result pages (default 0.5)")
args = ap.parse_args()
if not args.skipimages:
    try:
        os.makedirs(args.imagepath)
    except OSError:
        # directory already exists
        pass

log("Opening browser...")
if args.browser == "opera":
    b = webdriver.Opera()
elif args.browser == "chrome":
    b = webdriver.Chrome()
else:
    b = webdriver.Firefox()

sleeptime = args.sleeptime
if sleeptime is None:
    sleeptime = 0.5

b.get(args.starturl)
log("Perform a search and select detail mode, then press enter to start scraping items... (Ctrl-c to cancel)")
raw_input()
# CSS class names of the detail fields to scrape from each item page
props = """
collectionName
inventoryNb
objectName
objectTitle
objectCulture
geography
dating
material
technique
dimensions
legalRightOwner
""".strip().splitlines()
if args.format == "csv":
    # csv output needs its header row up front; json items are just printed one per line
    from csv import DictWriter
fieldnames = props[:]
fieldnames.append("url")
if not args.skipimages:
fieldnames.extend(("imageurl", "image"))
csvout = DictWriter(sys.stdout, fieldnames=fieldnames)
csvout.writeheader()
count = 0
while True:
    # honor --limit, which was previously parsed but never used
    if args.limit is not None and count >= args.limit:
        log("REACHED LIMIT")
        break
    # don't block waiting for optional fields that may simply be absent
    b.implicitly_wait(0)
    count += 1
    item = {}
    for p in props:
        try:
            li = b.find_element_by_css_selector("li." + p)
            span = li.find_element_by_css_selector(".tspValue")
            item[p] = span.text
        except NoSuchElementException:
            # field not present on this item
            pass
    # permalink / bookmark -- appears asynchronously, so retry a few times
    tries = 0
    while tries < 5:
        try:
            permalink = b.find_element_by_css_selector("li.bookmark")\
                .find_element_by_css_selector("input")\
                .get_attribute("value")
            item['url'] = permalink
            break
        except NoSuchElementException:
            tries += 1
            sleep(0.1)
    if not args.skipimages:
        try:
            imglink = b.find_element_by_css_selector("dt.detailImg a")
        except NoSuchElementException:
            # some items have no image at all
            imglink = None
        img_src = None
        if imglink is not None:
            # the link opens the high-res image in a popup window named HighResImage
            main_window = b.current_window_handle
            imglink.click()
            tries = 0
            while tries < 5:
                try:
                    b.switch_to_window('HighResImage')
                    img = b.find_element_by_css_selector("img")
                    img_src = img.get_attribute("src")
                    item['imageurl'] = img_src
                    b.close()
                    # return to the results window
                    b.switch_to_window(main_window)
                    break
                except NoSuchWindowException:
                    # popup may not have opened yet
                    log("NoSuchWindowException", tries)
                    sleep(1)
                    tries += 1
        if img_src and item.get('inventoryNb'):
            ifilename = image_path(item['inventoryNb'])
            ipath = os.path.join(args.imagepath, ifilename)
            if wget(img_src, ipath):
                item['image'] = ifilename
if args.format == "json":
print (json.dumps(item))
elif args.format == "csv":
csvout.writerow({k:v.encode('utf8') for k,v in item.items()})
log(u"{0} {1}/{2}".format(count, item.get('objectName', u''), item.get('objectTitle', u'')).encode("utf-8"))
    # advance to the next page of results, if any
    next_link = None
    try:
        next_link = b.find_element_by_css_selector('#pageSetEntries-nextSet a')
    except NoSuchElementException:
        pass
    if next_link is None:
        log("END OF LIST")
        break
    b.implicitly_wait(10)
    next_link.click()
if sleeptime:
sleep(sleeptime)
log("output {0} items".format(count))
b.quit()
</source>
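A typical run writes one record per item to stdout and progress messages to stderr, e.g. <code>python carmentis-scraper.py --browser firefox > items.csv</code> (the script filename here is hypothetical; use whatever you saved it as). Note that this is Python 2 code (urllib2, raw_input) written against the Selenium 2/3 API; Selenium 4 later removed the find_element_by_* helpers and switch_to_window in favour of By locators and switch_to.window:

<source lang="python">
# Selenium 4 spelling of the same lookups, for reference
from selenium.webdriver.common.by import By

li = b.find_element(By.CSS_SELECTOR, "li.bookmark")
b.switch_to.window(main_window)
</source>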