User:Pedro Sá Couto/Prototyping 2nd: Difference between revisions
No edit summary |
|||
Line 1: | Line 1: | ||
=Mastodon API= | =Mastodon Python API= | ||
==Python scripts== | |||
===Scrape peers from an instance=== | |||
<source lang="python"> | |||
# import libraries | |||
from selenium import webdriver | |||
from selenium.webdriver.common.keys import Keys | |||
import os | |||
import time | |||
import datetime | |||
from pprint import pprint | |||
import requests | |||
import multiprocessing | |||
from mastodon import Mastodon | |||
from pprint import pprint | |||
# token is stored in the same folder | |||
with open('token.txt','r') as token: | |||
print(token.read()) | |||
mastodon = Mastodon(access_token=token.read(),api_base_url="https://todon.nl") | |||
peers = mastodon.instance_peers() | |||
#save in file date and then content | |||
today = datetime.date.today() | |||
text_file = open("scrape/results.txt", "a+") | |||
text_file.write("Data collected on : "+str(today)+"\n"+"\n") | |||
for n, peer in enumerate(peers): | |||
if n < 200: | |||
time.sleep(0.5) | |||
# get the url from the terminal | |||
# url ("Enter instance.social url (include https:// ): ") | |||
url = "https://"+(str(peer)) | |||
print(peer) | |||
# Tell Selenium to open a new Firefox session | |||
# and specify the path to the driver | |||
driver = webdriver.Firefox(executable_path=os.path.dirname(os.path.realpath(__file__)) + '/geckodriver') | |||
# Implicit wait tells Selenium how long it should wait before it throws an exception | |||
driver.implicitly_wait(5) | |||
driver.get(url) | |||
time.sleep(3) | |||
print ('Instance: ', "\n", peer) | |||
text_file.write("Instance:"+"\n"+(peer)+"\n") | |||
try: | |||
description = driver.find_element_by_xpath('/html/body/div[1]/div/div/div[3]/div[2]/div') | |||
print ('Description:') | |||
print(description.text) | |||
text_file.write("Description:"+"\n"+description.text+"\n"+"\n") | |||
time.sleep(1) | |||
try: | |||
# get the image source | |||
img = driver.find_element_by_css_selector('.landing-page__hero > img:nth-child(1)') | |||
src = img.get_attribute('src') | |||
# download the image | |||
Picture_request = requests.get(src) | |||
if Picture_request.status_code == 200: | |||
with open("{}/scrape/{}.jpg".format(peer), 'wb') as f: | |||
f.write(Picture_request.content) | |||
print("Printed Image") | |||
except: | |||
print("Impossible to print image") | |||
text_file.write("Impossible to print image"+"\n"+"\n") | |||
time.sleep(0.5) | |||
except: | |||
text_file.write("Impossible to check instance"+"\n"+"\n") | |||
print("Status:"+"\n"+"Impossible to check instance") | |||
time.sleep(1) | |||
# close new tab | |||
driver.close() | |||
print("Closing Window") | |||
text_file.close() | |||
# close the browser | |||
driver.close() | |||
</source> | |||
===Scrape instances.social=== | |||
# import libraries | |||
from selenium import webdriver | |||
from selenium.webdriver.common.keys import Keys | |||
import os | |||
import time | |||
import datetime | |||
from pprint import pprint | |||
import requests | |||
import multiprocessing | |||
today = datetime.date.today() | |||
text_file = open("results.txt", "a+") | |||
text_file.write("Data collected on : "+str(today)+"\n"+"\n") | |||
# get the url from the terminal | |||
# url = input("Enter instance.social url (include https:// ): ") | |||
url = "https://instances.social/list#lang=en&allowed=nudity_nocw,nudity_all,pornography_nocw,pornography_all,illegalContentLinks&prohibited=spam,advertising,spoilers_nocw&users=" | |||
# Tell Selenium to open a new Firefox session | |||
# and specify the path to the driver | |||
driver = webdriver.Firefox(executable_path=os.path.dirname(os.path.realpath(__file__)) + '/geckodriver') | |||
# Implicit wait tells Selenium how long it should wait before it throws an exception | |||
driver.implicitly_wait(10) | |||
driver.get(url) | |||
time.sleep(3) | |||
d = 1 | |||
e = [52,102,152,202,252,302,352,402] | |||
f = 0 | |||
i = 0 | |||
while True: | |||
try: | |||
driver.find_element_by_css_selector('a.list-group-item:nth-child(%d)'%d).click() | |||
instance_url = driver.find_element_by_css_selector('#modalInstanceInfoLabel') | |||
description = driver.find_element_by_id('modalInstanceInfo-description') | |||
print ('Instance:') | |||
print(instance_url.text) | |||
text_file.write("Instance: "+"\n"+instance_url.text+"\n") | |||
print ('Description:') | |||
print(description.text) | |||
text_file.write("Description: "+"\n"+description.text+"\n"+"\n") | |||
time.sleep(0.5) | |||
# open instance in new tab | |||
driver.find_element_by_css_selector('#modalInstanceInfo-btn-go').send_keys(Keys.COMMAND + Keys.ENTER) | |||
time.sleep(0.5) | |||
#go to new tab | |||
driver.switch_to.window(driver.window_handles[-1]) | |||
time.sleep(1) | |||
try: | |||
# get the image source | |||
img = driver.find_element_by_css_selector('.landing-page__hero > img:nth-child(1)') | |||
src = img.get_attribute('src') | |||
# download the image | |||
Picture_request = requests.get(src) | |||
if Picture_request.status_code == 200: | |||
with open("image%i.jpg"%i, 'wb') as f: | |||
f.write(Picture_request.content) | |||
print("Printed Image") | |||
except: | |||
print("Impossible to print image") | |||
time.sleep(0.5) | |||
# close new tab | |||
driver.close() | |||
print("Closing Window") | |||
#back to original tab | |||
driver.switch_to.window(driver.window_handles[0]) | |||
# closes pop up | |||
driver.find_element_by_css_selector('.btn.btn-secondary').click() | |||
time.sleep(1) | |||
d+=1 | |||
i+=1 | |||
except: | |||
print("This is an exception") | |||
driver.find_element_by_css_selector('#load-more-instances > a:nth-child(1)').click() | |||
d = (e[f]) | |||
f+=1 | |||
pass | |||
text_file.close() | |||
# close the browser | |||
driver.close() | |||
</code> | |||
=WeasyPrint= | =WeasyPrint= |
Revision as of 10:52, 21 March 2019
Mastodon Python API
Python scripts
Scrape peers from an instance
# import libraries
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import os
import time
import datetime
from pprint import pprint
import requests
import multiprocessing
from mastodon import Mastodon
from pprint import pprint
# Scrape the landing page of every peer known to a Mastodon instance:
# for each peer, append its description to scrape/results.txt and try to
# download its hero image into the same folder.

# token is stored in the same folder
# Read it exactly once: a second read() on the same handle returns '' and
# would hand Mastodon an empty access token. The token is a secret, so it
# is deliberately not printed.
with open('token.txt', 'r') as token:
    access_token = token.read().strip()

mastodon = Mastodon(access_token=access_token, api_base_url="https://todon.nl")
peers = mastodon.instance_peers()

# only this many peers are visited per run
max_peers = 200

# geckodriver is expected next to this script
script_dir = os.path.dirname(os.path.realpath(__file__))

# results.txt and the images all go into ./scrape
output_dir = "scrape"
os.makedirs(output_dir, exist_ok=True)

# save in file date and then content
today = datetime.date.today()
with open(os.path.join(output_dir, "results.txt"), "a+") as text_file:
    text_file.write("Data collected on : " + str(today) + "\n" + "\n")

    for n, peer in enumerate(peers):
        # stop instead of idling through the remaining peers
        if n >= max_peers:
            break

        time.sleep(0.5)
        peer = str(peer)
        url = "https://" + peer
        print(peer)

        # Tell Selenium to open a new Firefox session
        # and specify the path to the driver
        driver = webdriver.Firefox(executable_path=script_dir + '/geckodriver')
        try:
            # Implicit wait tells Selenium how long it should wait before it throws an exception
            driver.implicitly_wait(5)

            print('Instance: ', "\n", peer)
            text_file.write("Instance:" + "\n" + peer + "\n")

            try:
                # loading the page is inside the try so that one unreachable
                # peer does not abort the whole run
                driver.get(url)
                time.sleep(3)
                description = driver.find_element_by_xpath('/html/body/div[1]/div/div/div[3]/div[2]/div')
                print('Description:')
                print(description.text)
                text_file.write("Description:" + "\n" + description.text + "\n" + "\n")
                time.sleep(1)
            except Exception as err:
                # Exception (not a bare except) so Ctrl-C still stops the script
                text_file.write("Impossible to check instance" + "\n" + "\n")
                print("Status:" + "\n" + "Impossible to check instance")
                print(err)
                time.sleep(1)
            else:
                try:
                    # get the image source
                    img = driver.find_element_by_css_selector('.landing-page__hero > img:nth-child(1)')
                    src = img.get_attribute('src')
                    # download the image; the timeout keeps a stalled server
                    # from blocking the run forever
                    picture_request = requests.get(src, timeout=30)
                    if picture_request.status_code == 200:
                        # The original format string had two placeholders but
                        # only one argument, so it always raised IndexError.
                        # NOTE(review): images are written next to results.txt;
                        # confirm this is the intended folder.
                        image_path = os.path.join(output_dir, "{}.jpg".format(peer))
                        with open(image_path, 'wb') as image_file:
                            image_file.write(picture_request.content)
                        print("Printed Image")
                except Exception as err:
                    print("Impossible to print image")
                    print(err)
                    text_file.write("Impossible to print image" + "\n" + "\n")
                    time.sleep(0.5)
        finally:
            # quit() ends the whole browser session started for this peer,
            # even when something above failed; no extra close is needed
            # after the loop
            driver.quit()
            print("Closing Window")
Scrape instances.social
# Scrape instances.social: walk the instance list, store every instance
# name and description in results.txt and download each instance's hero
# image as image<N>.jpg.

# import libraries
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import os
import sys
import time
import datetime
from pprint import pprint
import requests
import multiprocessing

# get the url from the terminal
# url = input("Enter instance.social url (include https:// ): ")
url = "https://instances.social/list#lang=en&allowed=nudity_nocw,nudity_all,pornography_nocw,pornography_all,illegalContentLinks&prohibited=spam,advertising,spoilers_nocw&users="

# nth-child index to continue from after each click on "load more"
# NOTE(review): values kept from the original script; they presumably
# follow the list layout of instances.social -- confirm against the page.
page_starts = [52, 102, 152, 202, 252, 302, 352, 402]

# Cmd+Enter opens a link in a new tab on macOS; elsewhere it is Ctrl+Enter
new_tab_modifier = Keys.COMMAND if sys.platform == "darwin" else Keys.CONTROL

today = datetime.date.today()
with open("results.txt", "a+") as text_file:
    text_file.write("Data collected on : " + str(today) + "\n" + "\n")

    # Tell Selenium to open a new Firefox session
    # and specify the path to the driver
    driver = webdriver.Firefox(executable_path=os.path.dirname(os.path.realpath(__file__)) + '/geckodriver')
    try:
        # Implicit wait tells Selenium how long it should wait before it throws an exception
        driver.implicitly_wait(10)
        driver.get(url)
        time.sleep(3)
        main_window = driver.current_window_handle

        child_index = 1   # nth-child position of the next list entry
        next_page = 0     # position in page_starts used after "load more"
        image_index = 0   # number used in the next image file name

        while True:
            try:
                driver.find_element_by_css_selector('a.list-group-item:nth-child(%d)' % child_index).click()
                instance_url = driver.find_element_by_css_selector('#modalInstanceInfoLabel')
                description = driver.find_element_by_id('modalInstanceInfo-description')

                print('Instance:')
                print(instance_url.text)
                text_file.write("Instance: " + "\n" + instance_url.text + "\n")

                print('Description:')
                print(description.text)
                text_file.write("Description: " + "\n" + description.text + "\n" + "\n")

                time.sleep(0.5)

                # open instance in new tab
                handles_before = set(driver.window_handles)
                driver.find_element_by_css_selector('#modalInstanceInfo-btn-go').send_keys(new_tab_modifier + Keys.ENTER)
                time.sleep(0.5)

                # go to new tab -- only if one really opened, otherwise the
                # close() below would shut the main listing window
                new_tabs = [h for h in driver.window_handles if h not in handles_before]
                if new_tabs:
                    driver.switch_to.window(new_tabs[-1])
                    time.sleep(1)
                    try:
                        # get the image source
                        img = driver.find_element_by_css_selector('.landing-page__hero > img:nth-child(1)')
                        src = img.get_attribute('src')
                        # download the image
                        picture_request = requests.get(src, timeout=30)
                        if picture_request.status_code == 200:
                            # the file handle needs its own name: the original
                            # reused the page counter's name here and broke
                            # the "load more" handling afterwards
                            with open("image%i.jpg" % image_index, 'wb') as image_file:
                                image_file.write(picture_request.content)
                            print("Printed Image")
                    except Exception as err:
                        print("Impossible to print image")
                        print(err)
                    finally:
                        time.sleep(0.5)
                        # close new tab
                        driver.close()
                        print("Closing Window")
                        # back to original tab
                        driver.switch_to.window(main_window)

                # closes pop up
                driver.find_element_by_css_selector('.btn.btn-secondary').click()
                time.sleep(1)

                child_index += 1
                image_index += 1

            except Exception as err:
                # Exception (not a bare except) so Ctrl-C still stops the script
                print("This is an exception")
                print(err)
                # no known offset left: stop instead of raising IndexError
                if next_page >= len(page_starts):
                    break
                try:
                    driver.find_element_by_css_selector('#load-more-instances > a:nth-child(1)').click()
                except Exception:
                    # no "load more" link to click: the list is finished
                    break
                child_index = page_starts[next_page]
                next_page += 1
    finally:
        # close the browser, also when the scrape stops on an error
        driver.quit()
WeasyPrint
GETTING STARTED
https://weasyprint.readthedocs.io/en/latest/install.html
https://weasyprint.org/
CHECK THE WIKI PAGE ON IT
http://pzwiki.wdka.nl/mediadesign/Weasyprint
SOME CSS TESTED FOR WEASYPRINT
http://test.weasyprint.org/