User:Alice/IFL
My contribution to Xpub Library
Research questions
- How can we represent the books in the collection in a different way, for example by treating stacks as mixtapes that are organised by study path and reading time?
Reading time
For this first challenge, I looked into representing a text by the amount of time it takes to read (similar to the reading-time estimates on Medium). I adapted this script (license), which uses BeautifulSoup to extract the text from an HTML file and prints the estimated reading time in minutes and hours.
import bs4
import urllib.request, re
from math import ceil
def extract_text(url):
    """Fetch a page and return every text node found in its HTML.

    Args:
        url: Address handed to ``urllib.request.urlopen``.

    Returns:
        The BeautifulSoup result set of all text nodes in the parsed
        document. Nothing is filtered here, so text inside tags such as
        ``<script>`` or ``<style>`` and HTML comments are included.
    """
    # Read inside a ``with`` block so the response is closed as soon as the
    # body has been fetched instead of being left open.
    with urllib.request.urlopen(url) as response:
        html = response.read()
    soup = bs4.BeautifulSoup(html, 'html.parser')
    # ``find_all(string=True)`` is the current spelling of the deprecated
    # ``findAll(text=True)`` and selects the same text nodes.
    return soup.find_all(string=True)
def is_visible(element):
    """Tell whether a text node should count as readable page text.

    A node is rejected when its parent tag is one of ``style``, ``script``,
    ``[document]``, ``head`` or ``title``, when it is an HTML comment, or
    when its string is exactly a single newline. Everything else is kept.

    Args:
        element: A text node exposing ``parent.name`` and ``string``.

    Returns:
        ``True`` if the node is kept, ``False`` otherwise.
    """
    hidden_parents = ['style', 'script', '[document]', 'head', 'title']
    if element.parent.name in hidden_parents:
        return False
    if isinstance(element, bs4.element.Comment):
        return False
    # Only a lone "\n" is dropped; other whitespace-only strings are kept.
    return element.string != "\n"
def filter_visible_text(page_texts):
    """Keep only the text nodes that ``is_visible`` accepts.

    Args:
        page_texts: Iterable of text nodes.

    Returns:
        A lazy ``filter`` iterator over the accepted nodes; it can be
        consumed only once.
    """
    visible_only = filter(is_visible, page_texts)
    return visible_only
# Assumed reading speed in words per minute; the estimated word count is
# divided by this value to obtain the reading time in minutes.
WPM = 180
# Assumed average word length in characters; each text's character count is
# divided by this value to estimate how many words it contains.
WORD_LENGTH = 5
def count_words_in_text(text_list, word_length):
    """Estimate the number of words in a collection of text fragments.

    No tokenising is done: each fragment contributes its character count
    divided by ``word_length``, and the contributions are added up.

    Args:
        text_list: Iterable of strings (or other sized objects).
        word_length: Assumed number of characters per word.

    Returns:
        The estimated word count; ``0`` when ``text_list`` is empty.
    """
    estimate = 0
    for fragment in text_list:
        fragment_words = len(fragment) / word_length
        estimate = estimate + fragment_words
    return estimate
def estimate_reading_time(url):
    """Estimate how long the page at ``url`` takes to read.

    The page's text nodes are extracted, filtered down to the visible ones,
    converted to an estimated word count using ``WORD_LENGTH``, and divided
    by the ``WPM`` reading speed.

    Args:
        url: Address of the HTML page to analyse.

    Returns:
        A two-item list ``[minutes, hours]``: minutes rounded up to a whole
        number, and that same duration expressed in hours (minutes / 60).
    """
    visible_texts = filter_visible_text(extract_text(url))
    word_estimate = count_words_in_text(visible_texts, WORD_LENGTH)
    whole_minutes = ceil(word_estimate / WPM)
    return [whole_minutes, whole_minutes / 60]
# Local HTML file to analyse, given as a file:// URL so urlopen can read it.
html_file = 'file:///home/alice/Documents/Reader%20final/txt/where_is.html'
# values[0] is the reading time in whole minutes, values[1] the same in hours.
values = estimate_reading_time(html_file)
# An f-string avoids the doubled space that print's separator produced before
# "minutes", and the hours are shown with two decimals instead of a raw float.
print(f'It will take you {values[0]} minutes or {values[1]:.2f} hours to read this text')