User:Pedro Sá Couto/Prototyping 2nd: Difference between revisions

From XPUB & Lens-Based wiki
No edit summary
Line 1: Line 1:
=Mastodon API=
=Mastodon Python API=
 
==Python scripts==
 
===Scrape peers from an instance===
 
<source lang="python">
 
# import libraries
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import os
import time
import datetime
from pprint import pprint
import requests
import multiprocessing
from mastodon import Mastodon
from pprint import pprint
 
# Scrape the public landing page of the peers of a Mastodon instance.
#
# For each of the first MAX_PEERS peers reported by API_BASE_URL, a fresh
# Firefox session opens https://PEER, the instance description is appended to
# scrape/results.txt and the landing-page hero image is saved as
# scrape/PEER.jpg.
#
# Needs token.txt (API access token) in the working directory and the
# geckodriver binary in the same folder as this script.

API_BASE_URL = "https://todon.nl"
TOKEN_FILE = 'token.txt'
RESULTS_DIR = "scrape"
MAX_PEERS = 200


def read_token(path=TOKEN_FILE):
    """Return the API access token stored in *path*.

    The file is read exactly once: a second ``read()`` on the same handle
    returns an empty string, which is what the API client used to receive.
    Surrounding whitespace (such as a trailing newline) is stripped, and the
    token is deliberately not printed.
    """
    with open(path, 'r') as token:
        return token.read().strip()


def image_path(peer):
    """Return the path inside RESULTS_DIR where the hero image of *peer* goes."""
    # Peer names come from the network: keep them from pointing outside
    # RESULTS_DIR by neutralising path separators.
    safe_name = str(peer).replace("/", "_").replace("\\", "_")
    return "{}/{}.jpg".format(RESULTS_DIR, safe_name)


def download_hero_image(driver, peer):
    """Save the hero image of the page loaded in *driver*.

    Returns True when the image was written to ``image_path(peer)`` and False
    when the server answered with a status other than 200. Lookup, network
    and file errors propagate to the caller.
    """
    img = driver.find_element_by_css_selector('.landing-page__hero > img:nth-child(1)')
    src = img.get_attribute('src')
    # A timeout keeps one unresponsive server from stalling the whole run.
    picture_request = requests.get(src, timeout=30)
    if picture_request.status_code != 200:
        return False
    with open(image_path(peer), 'wb') as f:
        f.write(picture_request.content)
    return True


def scrape_peer(peer, text_file):
    """Visit https://PEER and record its description and hero image.

    Results and failure notes are appended to *text_file*. A peer that cannot
    be loaded or has no description block is logged and skipped; it never
    aborts the run.
    """
    from selenium.common.exceptions import WebDriverException

    print(peer)
    print('Instance: ', "\n", peer)
    text_file.write("Instance:" + "\n" + str(peer) + "\n")

    # Tell Selenium to open a new Firefox session
    # and specify the path to the driver
    driver = webdriver.Firefox(executable_path=os.path.dirname(os.path.realpath(__file__)) + '/geckodriver')
    try:
        # Implicit wait tells Selenium how long it should wait before it throws an exception
        driver.implicitly_wait(5)

        try:
            driver.get("https://" + str(peer))
            time.sleep(3)
            description = driver.find_element_by_xpath('/html/body/div[1]/div/div/div[3]/div[2]/div').text
        except WebDriverException:
            text_file.write("Impossible to check instance" + "\n" + "\n")
            print("Status:" + "\n" + "Impossible to check instance")
            time.sleep(1)
            return

        print('Description:')
        print(description)
        text_file.write("Description:" + "\n" + description + "\n" + "\n")
        time.sleep(1)

        try:
            saved = download_hero_image(driver, peer)
        except (WebDriverException, requests.RequestException, OSError):
            saved = False

        if saved:
            print("Printed Image")
        else:
            print("Impossible to print image")
            text_file.write("Impossible to print image" + "\n" + "\n")
            time.sleep(0.5)
    finally:
        # quit() ends the whole session (browser and driver process), also
        # when the peer failed half-way; nothing is left to close afterwards.
        driver.quit()
        print("Closing Window")


def main():
    """Scrape the first MAX_PEERS peers of API_BASE_URL into RESULTS_DIR."""
    mastodon = Mastodon(access_token=read_token(), api_base_url=API_BASE_URL)
    peers = mastodon.instance_peers()

    os.makedirs(RESULTS_DIR, exist_ok=True)

    # save in file date and then content
    today = datetime.date.today()
    with open(RESULTS_DIR + "/results.txt", "a+") as text_file:
        text_file.write("Data collected on : " + str(today) + "\n" + "\n")
        for peer in list(peers)[:MAX_PEERS]:
            time.sleep(0.5)
            scrape_peer(peer, text_file)


if __name__ == "__main__":
    main()
</source>
 
===Scrape instances.social===
 
# import libraries
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import os
import time
import datetime
from pprint import pprint
import requests
import multiprocessing
 
# Scrape the instance list published on instances.social.
#
# Every list entry is clicked open; the instance name and description shown
# in the pop-up are appended to results.txt, the instance itself is opened in
# a new tab and its landing-page hero image is saved as imageN.jpg.
#
# Needs the geckodriver binary in the same folder as this script.

LIST_URL = "https://instances.social/list#lang=en&allowed=nudity_nocw,nudity_all,pornography_nocw,pornography_all,illegalContentLinks&prohibited=spam,advertising,spoilers_nocw&users="

# nth-child index at which scraping resumes after the 1st, 2nd, ... click on
# "load more". Once the list is used up the scrape ends.
RESUME_INDEXES = [52, 102, 152, 202, 252, 302, 352, 402]


def resume_index(load_more_clicks):
    """Return the list index to resume at, or None when no page is left.

    *load_more_clicks* is the number of "load more" clicks done so far.
    Returning None (instead of indexing past the end of RESUME_INDEXES) is
    what lets the main loop stop cleanly rather than crash.
    """
    if load_more_clicks < len(RESUME_INDEXES):
        return RESUME_INDEXES[load_more_clicks]
    return None


def image_name(index):
    """Return the file name used for the hero image of entry number *index*."""
    return "image%i.jpg" % index


def download_hero_image(driver, index):
    """Save the hero image of the page loaded in *driver*.

    Returns True when the image was written to ``image_name(index)`` and
    False when the server answered with a status other than 200. Lookup,
    network and file errors propagate to the caller.
    """
    img = driver.find_element_by_css_selector('.landing-page__hero > img:nth-child(1)')
    src = img.get_attribute('src')
    # A timeout keeps one unresponsive server from stalling the whole run.
    picture_request = requests.get(src, timeout=30)
    if picture_request.status_code != 200:
        return False
    with open(image_name(index), 'wb') as f:
        f.write(picture_request.content)
    return True


def scrape_entry(driver, text_file, child_index, image_index):
    """Record list entry *child_index* and download its hero image.

    Raises a Selenium exception when the entry (or any element of its
    pop-up) cannot be found; the caller treats that as "end of this page".
    """
    from selenium.common.exceptions import WebDriverException

    driver.find_element_by_css_selector('a.list-group-item:nth-child(%d)' % child_index).click()
    instance_url = driver.find_element_by_css_selector('#modalInstanceInfoLabel')
    description = driver.find_element_by_id('modalInstanceInfo-description')

    print('Instance:')
    print(instance_url.text)
    text_file.write("Instance: " + "\n" + instance_url.text + "\n")

    print('Description:')
    print(description.text)
    text_file.write("Description: " + "\n" + description.text + "\n" + "\n")

    time.sleep(0.5)

    # open instance in new tab
    # NOTE: Keys.COMMAND is the macOS modifier for "open in new tab".
    driver.find_element_by_css_selector('#modalInstanceInfo-btn-go').send_keys(Keys.COMMAND + Keys.ENTER)
    time.sleep(0.5)

    if len(driver.window_handles) > 1:
        # go to new tab
        driver.switch_to.window(driver.window_handles[-1])
        time.sleep(1)

        try:
            saved = download_hero_image(driver, image_index)
        except (WebDriverException, requests.RequestException, OSError):
            saved = False
        print("Printed Image" if saved else "Impossible to print image")

        time.sleep(0.5)

        # close new tab
        driver.close()
        print("Closing Window")

        # back to original tab
        driver.switch_to.window(driver.window_handles[0])
    else:
        # No tab was opened: closing "the new tab" here would close the list
        # page itself, so only report the missing image.
        print("Impossible to print image")

    # closes pop up
    driver.find_element_by_css_selector('.btn.btn-secondary').click()
    time.sleep(1)


def main():
    """Walk the instances.social list until no further page can be loaded."""
    from selenium.common.exceptions import WebDriverException

    today = datetime.date.today()
    with open("results.txt", "a+") as text_file:
        text_file.write("Data collected on : " + str(today) + "\n" + "\n")

        # Tell Selenium to open a new Firefox session
        # and specify the path to the driver
        driver = webdriver.Firefox(executable_path=os.path.dirname(os.path.realpath(__file__)) + '/geckodriver')
        try:
            # Implicit wait tells Selenium how long it should wait before it throws an exception
            driver.implicitly_wait(10)
            driver.get(LIST_URL)
            time.sleep(3)

            child_index = 1      # nth-child position of the next list entry
            load_more_clicks = 0
            image_index = 0      # running number used in the image file names

            while True:
                try:
                    scrape_entry(driver, text_file, child_index, image_index)
                except WebDriverException:
                    print("This is an exception")
                    next_index = resume_index(load_more_clicks)
                    if next_index is None:
                        break
                    try:
                        driver.find_element_by_css_selector('#load-more-instances > a:nth-child(1)').click()
                    except WebDriverException:
                        # No "load more" link (or no usable window): done.
                        break
                    child_index = next_index
                    load_more_clicks += 1
                else:
                    child_index += 1
                    image_index += 1
        finally:
            # close the browser, also when the scrape stops on an error
            driver.quit()


if __name__ == "__main__":
    main()
</code>
 


=WeasyPrint=
=WeasyPrint=

Revision as of 11:52, 21 March 2019

Mastodon Python API

Python scripts

Scrape peers from an instance

# import libraries
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import os
import time
import datetime
from pprint import pprint
import requests
import multiprocessing
from mastodon import Mastodon
from pprint import pprint

# Scrape the public landing page of the peers of a Mastodon instance.
#
# For each of the first MAX_PEERS peers reported by API_BASE_URL, a fresh
# Firefox session opens https://PEER, the instance description is appended to
# scrape/results.txt and the landing-page hero image is saved as
# scrape/PEER.jpg.
#
# Needs token.txt (API access token) in the working directory and the
# geckodriver binary in the same folder as this script.

API_BASE_URL = "https://todon.nl"
TOKEN_FILE = 'token.txt'
RESULTS_DIR = "scrape"
MAX_PEERS = 200


def read_token(path=TOKEN_FILE):
    """Return the API access token stored in *path*.

    The file is read exactly once: a second ``read()`` on the same handle
    returns an empty string, which is what the API client used to receive.
    Surrounding whitespace (such as a trailing newline) is stripped, and the
    token is deliberately not printed.
    """
    with open(path, 'r') as token:
        return token.read().strip()


def image_path(peer):
    """Return the path inside RESULTS_DIR where the hero image of *peer* goes."""
    # Peer names come from the network: keep them from pointing outside
    # RESULTS_DIR by neutralising path separators.
    safe_name = str(peer).replace("/", "_").replace("\\", "_")
    return "{}/{}.jpg".format(RESULTS_DIR, safe_name)


def download_hero_image(driver, peer):
    """Save the hero image of the page loaded in *driver*.

    Returns True when the image was written to ``image_path(peer)`` and False
    when the server answered with a status other than 200. Lookup, network
    and file errors propagate to the caller.
    """
    img = driver.find_element_by_css_selector('.landing-page__hero > img:nth-child(1)')
    src = img.get_attribute('src')
    # A timeout keeps one unresponsive server from stalling the whole run.
    picture_request = requests.get(src, timeout=30)
    if picture_request.status_code != 200:
        return False
    with open(image_path(peer), 'wb') as f:
        f.write(picture_request.content)
    return True


def scrape_peer(peer, text_file):
    """Visit https://PEER and record its description and hero image.

    Results and failure notes are appended to *text_file*. A peer that cannot
    be loaded or has no description block is logged and skipped; it never
    aborts the run.
    """
    from selenium.common.exceptions import WebDriverException

    print(peer)
    print('Instance: ', "\n", peer)
    text_file.write("Instance:" + "\n" + str(peer) + "\n")

    # Tell Selenium to open a new Firefox session
    # and specify the path to the driver
    driver = webdriver.Firefox(executable_path=os.path.dirname(os.path.realpath(__file__)) + '/geckodriver')
    try:
        # Implicit wait tells Selenium how long it should wait before it throws an exception
        driver.implicitly_wait(5)

        try:
            driver.get("https://" + str(peer))
            time.sleep(3)
            description = driver.find_element_by_xpath('/html/body/div[1]/div/div/div[3]/div[2]/div').text
        except WebDriverException:
            text_file.write("Impossible to check instance" + "\n" + "\n")
            print("Status:" + "\n" + "Impossible to check instance")
            time.sleep(1)
            return

        print('Description:')
        print(description)
        text_file.write("Description:" + "\n" + description + "\n" + "\n")
        time.sleep(1)

        try:
            saved = download_hero_image(driver, peer)
        except (WebDriverException, requests.RequestException, OSError):
            saved = False

        if saved:
            print("Printed Image")
        else:
            print("Impossible to print image")
            text_file.write("Impossible to print image" + "\n" + "\n")
            time.sleep(0.5)
    finally:
        # quit() ends the whole session (browser and driver process), also
        # when the peer failed half-way; nothing is left to close afterwards.
        driver.quit()
        print("Closing Window")


def main():
    """Scrape the first MAX_PEERS peers of API_BASE_URL into RESULTS_DIR."""
    mastodon = Mastodon(access_token=read_token(), api_base_url=API_BASE_URL)
    peers = mastodon.instance_peers()

    os.makedirs(RESULTS_DIR, exist_ok=True)

    # save in file date and then content
    today = datetime.date.today()
    with open(RESULTS_DIR + "/results.txt", "a+") as text_file:
        text_file.write("Data collected on : " + str(today) + "\n" + "\n")
        for peer in list(peers)[:MAX_PEERS]:
            time.sleep(0.5)
            scrape_peer(peer, text_file)


if __name__ == "__main__":
    main()

Scrape instances.social

# import libraries

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import os
import time
import datetime
from pprint import pprint
import requests
import multiprocessing

# Scrape the instance list published on instances.social.
#
# Every list entry is clicked open; the instance name and description shown
# in the pop-up are appended to results.txt, the instance itself is opened in
# a new tab and its landing-page hero image is saved as imageN.jpg.
#
# Needs the geckodriver binary in the same folder as this script.

LIST_URL = "https://instances.social/list#lang=en&allowed=nudity_nocw,nudity_all,pornography_nocw,pornography_all,illegalContentLinks&prohibited=spam,advertising,spoilers_nocw&users="

# nth-child index at which scraping resumes after the 1st, 2nd, ... click on
# "load more". Once the list is used up the scrape ends.
RESUME_INDEXES = [52, 102, 152, 202, 252, 302, 352, 402]


def resume_index(load_more_clicks):
    """Return the list index to resume at, or None when no page is left.

    *load_more_clicks* is the number of "load more" clicks done so far.
    Returning None (instead of indexing past the end of RESUME_INDEXES) is
    what lets the main loop stop cleanly rather than crash.
    """
    if load_more_clicks < len(RESUME_INDEXES):
        return RESUME_INDEXES[load_more_clicks]
    return None


def image_name(index):
    """Return the file name used for the hero image of entry number *index*."""
    return "image%i.jpg" % index


def download_hero_image(driver, index):
    """Save the hero image of the page loaded in *driver*.

    Returns True when the image was written to ``image_name(index)`` and
    False when the server answered with a status other than 200. Lookup,
    network and file errors propagate to the caller.
    """
    img = driver.find_element_by_css_selector('.landing-page__hero > img:nth-child(1)')
    src = img.get_attribute('src')
    # A timeout keeps one unresponsive server from stalling the whole run.
    picture_request = requests.get(src, timeout=30)
    if picture_request.status_code != 200:
        return False
    with open(image_name(index), 'wb') as f:
        f.write(picture_request.content)
    return True


def scrape_entry(driver, text_file, child_index, image_index):
    """Record list entry *child_index* and download its hero image.

    Raises a Selenium exception when the entry (or any element of its
    pop-up) cannot be found; the caller treats that as "end of this page".
    """
    from selenium.common.exceptions import WebDriverException

    driver.find_element_by_css_selector('a.list-group-item:nth-child(%d)' % child_index).click()
    instance_url = driver.find_element_by_css_selector('#modalInstanceInfoLabel')
    description = driver.find_element_by_id('modalInstanceInfo-description')

    print('Instance:')
    print(instance_url.text)
    text_file.write("Instance: " + "\n" + instance_url.text + "\n")

    print('Description:')
    print(description.text)
    text_file.write("Description: " + "\n" + description.text + "\n" + "\n")

    time.sleep(0.5)

    # open instance in new tab
    # NOTE: Keys.COMMAND is the macOS modifier for "open in new tab".
    driver.find_element_by_css_selector('#modalInstanceInfo-btn-go').send_keys(Keys.COMMAND + Keys.ENTER)
    time.sleep(0.5)

    if len(driver.window_handles) > 1:
        # go to new tab
        driver.switch_to.window(driver.window_handles[-1])
        time.sleep(1)

        try:
            saved = download_hero_image(driver, image_index)
        except (WebDriverException, requests.RequestException, OSError):
            saved = False
        print("Printed Image" if saved else "Impossible to print image")

        time.sleep(0.5)

        # close new tab
        driver.close()
        print("Closing Window")

        # back to original tab
        driver.switch_to.window(driver.window_handles[0])
    else:
        # No tab was opened: closing "the new tab" here would close the list
        # page itself, so only report the missing image.
        print("Impossible to print image")

    # closes pop up
    driver.find_element_by_css_selector('.btn.btn-secondary').click()
    time.sleep(1)


def main():
    """Walk the instances.social list until no further page can be loaded."""
    from selenium.common.exceptions import WebDriverException

    today = datetime.date.today()
    with open("results.txt", "a+") as text_file:
        text_file.write("Data collected on : " + str(today) + "\n" + "\n")

        # Tell Selenium to open a new Firefox session
        # and specify the path to the driver
        driver = webdriver.Firefox(executable_path=os.path.dirname(os.path.realpath(__file__)) + '/geckodriver')
        try:
            # Implicit wait tells Selenium how long it should wait before it throws an exception
            driver.implicitly_wait(10)
            driver.get(LIST_URL)
            time.sleep(3)

            child_index = 1      # nth-child position of the next list entry
            load_more_clicks = 0
            image_index = 0      # running number used in the image file names

            while True:
                try:
                    scrape_entry(driver, text_file, child_index, image_index)
                except WebDriverException:
                    print("This is an exception")
                    next_index = resume_index(load_more_clicks)
                    if next_index is None:
                        break
                    try:
                        driver.find_element_by_css_selector('#load-more-instances > a:nth-child(1)').click()
                    except WebDriverException:
                        # No "load more" link (or no usable window): done.
                        break
                    child_index = next_index
                    load_more_clicks += 1
                else:
                    child_index += 1
                    image_index += 1
        finally:
            # close the browser, also when the scrape stops on an error
            driver.quit()


if __name__ == "__main__":
    main()


WeasyPrint

GETTING STARTED
https://weasyprint.readthedocs.io/en/latest/install.html
https://weasyprint.org/

CHECK THE WIKI PAGE ON IT
http://pzwiki.wdka.nl/mediadesign/Weasyprint

SOME CSS TESTED FOR WEASYPRINT
http://test.weasyprint.org/