User:Pedro Sá Couto/Prototyping 2nd: Difference between revisions
Line 80: | Line 80: | ||
</source> | </source> | ||
===Scrape peers from an instance=== | ===Scrape peers description and image from an instance=== | ||
#https://git.xpub.nl/pedrosaclout/scrape_peers_mastodon_ai | #https://git.xpub.nl/pedrosaclout/scrape_peers_mastodon_ai | ||
Line 96: | Line 96: | ||
import multiprocessing | import multiprocessing | ||
from mastodon import Mastodon | from mastodon import Mastodon | ||
with open('token.txt','r') as token: | with open('token.txt','r') as token: | ||
print(token.read()) | print(token.read()) | ||
Line 105: | Line 103: | ||
peers = mastodon.instance_peers() | peers = mastodon.instance_peers() | ||
today = datetime.date.today() | today = datetime.date.today() | ||
text_file = open("scrape/results.txt", "a+") | text_file = open("scrape/results.txt", "a+") | ||
Line 132: | Line 129: | ||
try: | try: | ||
about = driver.find_element_by_css_selector('.landing-page__short-description') | |||
print (' | print ('About:') | ||
print( | print(about.text) | ||
text_file.write(" | text_file.write("About:"+"\n"+about.text+"\n"+"\n") | ||
time.sleep(1) | time.sleep(1) | ||
try: | try: | ||
# get the image source | # get the image source | ||
img = driver. | img = driver.find_element_by_xpath('/html/body/div[1]/div/div/div[3]/div[1]/img') | ||
src = img.get_attribute('src') | src = img.get_attribute('src') | ||
# download the image | # download the image | ||
Picture_request = requests.get(src) | Picture_request = requests.get(src) | ||
if Picture_request.status_code == 200: | if Picture_request.status_code == 200: | ||
with open(" | with open("scrape/{}.jpg".format(peer), 'wb') as f: | ||
f.write(Picture_request.content) | f.write(Picture_request.content) | ||
print("Printed Image") | print("Printed Image") | ||
Line 152: | Line 149: | ||
print("Impossible to print image") | print("Impossible to print image") | ||
text_file.write("Impossible to print image"+"\n"+"\n") | text_file.write("Impossible to print image"+"\n"+"\n") | ||
time.sleep(0.5) | time.sleep(0.5) | ||
except: | except: | ||
Line 166: | Line 163: | ||
# close the browser | # close the browser | ||
driver.close() | driver.close() | ||
</source> | </source> | ||
Revision as of 18:51, 23 March 2019
Mastodon Python API
Python scripts
My git
https://git.xpub.nl/pedrosaclout/
Gather Direct messages
- Based on André's code in — https://git.xpub.nl/XPUB/MastodonAPI/src/branch/master/direct-msgs.py
- https://git.xpub.nl/pedrosaclout/dms_mastodon_api
Gather Answers from a comment
# Gather the answers (descendants) of a list of toots, one toot per
# Mastodon instance, and append them to answers_results.txt.
#
# IMPORTANT: toot_id[i], instances[i] and line i of token.txt all refer
# to the same post -- the three sequences MUST be kept in the same order.
from mastodon import Mastodon
from pprint import pprint
import datetime
import time

toot_id = [101767654564328802, 101767613341391125, 101767845648108564, 101767859212894722, 101767871935737959, 101767878557811327, 101772545369017811, 101767981291624379, 101767995246055609, 101772553091703710, 101768372248628715, 101768407716536393, 101768414826737939, 101768421746838431, 101771801100381784, 101771425484725792, 101771434017039442,
101771437693805317, 101782008831021451, 101795233034162198]
instances = ["https://todon.nl/", "https://meow.social/", "https://queer.party/", "https://scholar.social/", "https://eletusk.club/", "https://abdl.link/", "https://mastodon.starrevolution.org/", "https://mastodon.technology/", "https://quey.org/", "https://bsd.network/", "https://freeradical.zone/", "https://linuxrocks.online/", "https://mastodont.cat/", "https://qoto.org/", "https://mastodon.host/", "https://mastodon.gamedev.place/", "https://hotwife.social/", "https://hostux.social/", "https://mastodon.social/", "https://post.lurk.org/"]

today = datetime.date.today()

# "with" guarantees the results file is closed even if an API call fails
# (the original left it open).
with open("answers_results.txt", "a+") as text_file:
    text_file.write("Data collected on : "+str(today)+"\n"+"\n")

    with open('token.txt', 'r') as token:
        for n, token_line in enumerate(token.readlines()):
            base_url = instances[n]
            print(token_line, base_url)
            # One access token per line of token.txt; strip the trailing
            # newline before handing it to the client.
            mastodon = Mastodon(access_token=token_line.replace('\n', ''),
                                api_base_url=str(base_url))
            toot_id_current = toot_id[n]
            print(toot_id_current)
            # Only the descendants (answers) of the toot are needed, so a
            # single status_context call is enough.
            descendants = mastodon.status_context(id=toot_id_current)["descendants"]
            for answer in descendants:
                pprint(answer["id"])
                # Fetch the status ONCE -- the original issued four
                # identical API requests per answer.
                status = mastodon.status(answer["id"])
                avatar = status['account']['avatar']
                name = status['account']['display_name']
                bot = status['account']['bot']
                content = status['content']
                pprint("Avatar:" + "\n" + str(avatar) + "\n" + "\n")
                pprint("Name:" + "\n" + str(name) + "\n" + "\n")
                pprint("Bot:" + "\n" + str(bot) + "\n" + "\n")
                pprint("Content:" + "\n" + str(content) + "\n" + "\n")
                text_file.write("Avatar:" + "\n" + str(avatar) + "\n" + "\n")
                text_file.write("Name:" + "\n" + str(name) + "\n" + "\n")
                text_file.write("Bot:" + "\n" + str(bot) + "\n" + "\n")
                text_file.write("Content:" + "\n" + str(content) + "\n" + "\n" + "\n")
                # Stay well under the API rate limit.
                time.sleep(2)
Scrape peers description and image from an instance
# Scrape the short description and hero image of every peer instance of
# todon.nl (first 200 peers) into scrape/results.txt and scrape/<peer>.jpg.
from selenium import webdriver
import os
import time
import datetime
import requests
from mastodon import Mastodon

# BUG FIX: the original called token.read() twice -- after the first read
# the file pointer sits at EOF, so the Mastodon client received an EMPTY
# access token. Read once, keep the value.
with open('token.txt', 'r') as token:
    access_token = token.read().strip()
print(access_token)

mastodon = Mastodon(access_token=access_token, api_base_url="https://todon.nl")
peers = mastodon.instance_peers()

today = datetime.date.today()
text_file = open("scrape/results.txt", "a+")
text_file.write("Data collected on : "+str(today)+"\n"+"\n")

# geckodriver is expected to sit next to this script.
driver_path = os.path.dirname(os.path.realpath(__file__)) + '/geckodriver'

for n, peer in enumerate(peers):
    if n >= 200:
        break  # only the first 200 peers (original looped on doing nothing)
    time.sleep(0.5)
    url = "https://" + str(peer)
    print(peer)
    # A fresh Firefox session per peer, so one broken page cannot poison
    # the next iteration.
    driver = webdriver.Firefox(executable_path=driver_path)
    # How long Selenium waits for an element before raising.
    driver.implicitly_wait(5)
    try:
        driver.get(url)
        time.sleep(3)
        print ('Instance: ', "\n", peer)
        text_file.write("Instance:"+"\n"+(peer)+"\n")
        try:
            about = driver.find_element_by_css_selector('.landing-page__short-description')
            print ('About:')
            print(about.text)
            text_file.write("About:"+"\n"+about.text+"\n"+"\n")
            time.sleep(1)
            try:
                # Hero image of the landing page.
                img = driver.find_element_by_xpath('/html/body/div[1]/div/div/div[3]/div[1]/img')
                src = img.get_attribute('src')
                picture_request = requests.get(src)
                if picture_request.status_code == 200:
                    with open("scrape/{}.jpg".format(peer), 'wb') as f:
                        f.write(picture_request.content)
                    print("Printed Image")
            except Exception:
                print("Impossible to print image")
                text_file.write("Impossible to print image"+"\n"+"\n")
                time.sleep(0.5)
        except Exception:
            text_file.write("Impossible to check instance"+"\n"+"\n")
            print("Status:"+"\n"+"Impossible to check instance")
            time.sleep(1)
    finally:
        # quit() tears the whole session down; the original called close()
        # twice on the last driver, which raised a second-close error.
        driver.quit()
        print("Closing Window")

text_file.close()
Scrapes about more page of peers from an instance
# Scrape the "/about/more" page (long description + image) of every peer
# instance of todon.nl (first 200 peers) into AboutMore/.
from selenium import webdriver
import os
import time
import datetime
import requests
from mastodon import Mastodon

# BUG FIX: the original called token.read() twice -- after the first read
# the file pointer sits at EOF, so the Mastodon client received an EMPTY
# access token. Read once, keep the value.
with open('token.txt', 'r') as token:
    access_token = token.read().strip()
print(access_token)

mastodon = Mastodon(access_token=access_token, api_base_url="https://todon.nl")
peers = mastodon.instance_peers()

today = datetime.date.today()
text_file = open("AboutMore/results.txt", "a+")
text_file.write("Data collected on : "+str(today)+"\n"+"\n")

# geckodriver is expected to sit next to this script.
driver_path = os.path.dirname(os.path.realpath(__file__)) + '/geckodriver'

for n, peer in enumerate(peers):
    if n >= 200:
        break  # only the first 200 peers (original looped on doing nothing)
    time.sleep(0.5)
    url = "https://" + (str(peer) + "/about/more")
    print(peer)
    # A fresh Firefox session per peer, so one broken page cannot poison
    # the next iteration.
    driver = webdriver.Firefox(executable_path=driver_path)
    # How long Selenium waits for an element before raising.
    driver.implicitly_wait(5)
    try:
        driver.get(url)
        time.sleep(3)
        print ('Instance: ', "\n", peer)
        text_file.write("Instance:"+"\n"+(peer)+"\n")
        try:
            about_more = driver.find_element_by_css_selector('.rich-formatting')
            print ('About more:')
            print(about_more.text)
            text_file.write("About more:"+"\n"+about_more.text+"\n"+"\n")
            time.sleep(1)
            try:
                # Header image of the about/more page.
                img = driver.find_element_by_xpath('/html/body/div/div[2]/div/div[1]/div/div/img')
                src = img.get_attribute('src')
                picture_request = requests.get(src)
                if picture_request.status_code == 200:
                    with open("AboutMore/{}.jpg".format(peer), 'wb') as f:
                        f.write(picture_request.content)
                    print("Printed Image")
            except Exception:
                print("Impossible to print image")
                text_file.write("Impossible to print image"+"\n"+"\n")
                time.sleep(0.5)
        except Exception:
            text_file.write("No about more"+"\n"+"\n")
            print("Status:"+"\n"+"No about more")
            time.sleep(1)
    finally:
        # quit() tears the whole session down; the original called close()
        # twice on the last driver, which raised a second-close error.
        driver.quit()
        print("Closing Window")

text_file.close()
Scrape instances.social
# Scrape instance names, descriptions and hero images from an
# instances.social listing page, clicking through its "load more" pager.
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import os
import time
import datetime
import requests

today = datetime.date.today()
text_file = open("results.txt", "a+")
text_file.write("Data collected on : "+str(today)+"\n"+"\n")

# get the url from the terminal
url = input("Enter instance.social url (include https:// ): ")
# For example, if you want to look into instances on which everything is allowed:
# url = "https://instances.social/list#lang=en&allowed=nudity_nocw,nudity_all,pornography_nocw,pornography_all,illegalContentLinks,spam,advertising,spoilers_nocw&prohibited=&users="

# Tell Selenium to open a new Firefox session; geckodriver is expected
# to sit next to this script.
driver = webdriver.Firefox(executable_path=os.path.dirname(os.path.realpath(__file__)) + '/geckodriver')
# Implicit wait tells Selenium how long to wait before throwing an exception.
driver.implicitly_wait(10)
driver.get(url)
time.sleep(3)

# d: 1-based nth-child index of the next list entry to click.
# e: first entry index of each freshly loaded page -- the site appends
#    50 entries per "load more" click (52, 102, 152, ...).
# f: index into e (how many "load more" clicks were done).
# i: running counter used to name the downloaded images.
d = 1
e = [52, 102, 152, 202, 252, 302, 352, 402]
f = 0
i = 0

while True:
    try:
        driver.find_element_by_css_selector('a.list-group-item:nth-child(%d)'%d).click()
        instance_url = driver.find_element_by_css_selector('#modalInstanceInfoLabel')
        description = driver.find_element_by_id('modalInstanceInfo-description')
        print ('Instance:')
        print(instance_url.text)
        text_file.write("Instance: "+"\n"+instance_url.text+"\n")
        print ('Description:')
        print(description.text)
        text_file.write("Description: "+"\n"+description.text+"\n"+"\n")
        time.sleep(0.5)
        # Open the instance in a new tab.
        # NOTE(review): Keys.COMMAND is macOS-only -- use Keys.CONTROL on
        # Linux/Windows.
        driver.find_element_by_css_selector('#modalInstanceInfo-btn-go').send_keys(Keys.COMMAND + Keys.ENTER)
        time.sleep(0.5)
        # Switch to the new tab.
        driver.switch_to.window(driver.window_handles[-1])
        time.sleep(1)
        try:
            # Hero image of the instance's landing page.
            img = driver.find_element_by_css_selector('.landing-page__hero > img:nth-child(1)')
            src = img.get_attribute('src')
            picture_request = requests.get(src)
            if picture_request.status_code == 200:
                with open("image%i.jpg"%i, 'wb') as fh:
                    fh.write(picture_request.content)
                print("Printed Image")
        except Exception:
            print("Impossible to print image")
            time.sleep(0.5)
        # Close the new tab and return to the listing.
        driver.close()
        print("Closing Window")
        driver.switch_to.window(driver.window_handles[0])
        # Close the instance-info pop up.
        driver.find_element_by_css_selector('.btn.btn-secondary').click()
        time.sleep(1)
        d += 1
        i += 1
    except Exception:
        # Ran out of entries on this page: load the next batch.
        print("This is an exception")
        try:
            driver.find_element_by_css_selector('#load-more-instances > a:nth-child(1)').click()
        except Exception:
            break  # no "load more" button left -- we are done
        if f >= len(e):
            break  # more pages than anticipated in e -- stop cleanly
        # Jump d to the first entry of the newly loaded page.
        d = e[f]
        f += 1

text_file.close()
# Close the browser (only the original listing window remains).
driver.close()
WeasyPrint
GETTING STARTED
https://weasyprint.readthedocs.io/en/latest/install.html
https://weasyprint.org/
CHECK THE WIKI PAGE ON IT
http://pzwiki.wdka.nl/mediadesign/Weasyprint
SOME CSS TESTED FOR WEASYPRINT
http://test.weasyprint.org/