User:Pedro Sá Couto/TW/REPUBLISHING FLOW
< User:Pedro Sá Couto | TW
STEPS
Republishing is separated into 6 steps:
1. Move the book from the webserver to a work directory
- 1.1 Replacing all spaces with underscores
2. Creating the watermark from the gathered form in Tactical Watermarks
- 2.1 Create the watermark as a pdf with reportlab
- 2.2 Convert to a png
3. Append the watermark to the pdf
- 3.1 Burst the pdf cover
- 3.2 Rotate the watermark with PIL
- 3.3 Overlay the watermark with PIL
- 3.4 OCR the new cover
- 3.5 Resize the OCRed cover to fit the book
- 3.6 Merge the cover and the pdf into one
4. OCR the pdf if not OCRed already
5. Delete all the unwanted traces
6. Save the file in a directory open to Library Genesis Staff
FLOW
RUN.SH
To start the flow I run ./run.sh
# Entry point: run every republishing stage, one script per stage, in order.
# NOTE(review): mode 777 makes every file in this directory world-writable — confirm this is intended.
sudo chmod 777 *
for stage in movebookfolder watermarkformtxt appendwatermarktopdf republish deletetraces; do
    ./"$stage".sh
done
1. Moving the book from the webserver to a work directory
# Stage the newest upload: replace spaces with underscores in the folder and
# file names, then move the book folder into the work directory.
incoming=/home/psc/tacticalbooks/republish
inprogress=/home/psc/scripts/autorepublish/inprogress

cd "$incoming" || exit 1
# Rename only entries that actually contain a space (mv onto the same name just errors).
for name in *; do
    [ "$name" = "${name// /_}" ] || mv -- "$name" "${name// /_}"
done

# Most recently modified entry of the upload folder.
newest=$(ls -td -- "$incoming"/* | head -n 1)
# With nothing uploaded, a bare `cd` would switch to $HOME and the loop below
# would rename files there, so stop instead.
[ -n "$newest" ] || exit 1
cd "$newest" || exit 1
for name in *; do
    [ "$name" = "${name// /_}" ] || mv -- "$name" "${name// /_}"
done

mv -- "$newest" "$inprogress"
2. Creating the watermark from the gathered form in Tactical Watermarks
Bash
# Turn the uploader's form (the newest .txt in the book folder) into
# watermark.png and put the image back next to the book.
workdir=/home/psc/scripts/autorepublish/watermark
book=$(ls -td -- /home/psc/scripts/autorepublish/inprogress/* | head -n 1)
# Without a book folder, a bare `cd` would silently switch to $HOME.
[ -n "$book" ] || exit 1
cd "$book" || exit 1

form=$(ls -td -- *.txt | head -n 1)
[ -n "$form" ] || exit 1
# watermark.py reads the form as watermark.txt from its own directory.
mv -- "$form" "$workdir"/watermark.txt

cd "$workdir" || exit 1
python3 watermark.py
rm watermark.txt
convert -density 300 -trim watermark.pdf -quality 100 watermark.png
mv "$workdir"/watermark.png "$book"
Python watermark.py
from reportlab.pdfgen import canvas
from reportlab.pdfbase.ttfonts import TTFont
from reportlab.pdfbase import pdfmetrics
from reportlab.lib import colors
from reportlab.lib.colors import pink, green, brown, white, black
import textwrap
from textwrap import wrap
from reportlab.lib.units import inch
from reportlab.lib.pagesizes import letter
from reportlab.lib.pagesizes import A4
from reportlab.lib.units import inch
from reportlab.lib.units import cm
from reportlab.lib.colors import HexColor
import datetime
from reportlab.lib.utils import ImageReader
import ast
# Read the form answers; the file must hold a Python literal that the rest of
# the script indexes by position (watermark[0] .. watermark[4]).
with open('watermark.txt', 'r') as form_file:
    raw_answers = form_file.read()
watermark = ast.literal_eval(raw_answers)
#The Ruler
def DrawTheRuler(pdf):
    """Draw ruler marks along the top and left edges of the page.

    Args:
        pdf: canvas-like object exposing ``drawString(x, y, text)``.
    """
    # Top edge: a '|' every 30 units from x=30 to x=630, all at y=960.
    for x in range(30, 631, 30):
        pdf.drawString(x, 960, '|')
    # Left edge: a '—' every 30 units from y=940 down to y=40, all at x=10.
    for y in range(940, 39, -30):
        pdf.drawString(10, y, '—')
# Fixed output name and texts, identical for every watermark
fileName = "watermark.pdf"
documentTitle = title = "TACTICAL WATERMARKS"
subTitle = "REPUBLISHED THROUGH"

# Open the output canvas
pdf = canvas.Canvas(fileName)

# Paint the whole 660 x 1000 page area in the background colour
page_width, page_height = 660, 1000
background_colour = HexColor(0xceff00)
pdf.setFillColor(background_colour)
pdf.rect(0, 0, page_width, page_height, fill=1)

# Back to black before the ruler marks are drawn
pdf.setFillColorRGB(0, 0, 0)
DrawTheRuler(pdf)

# Document title and page dimensions
pdf.setTitle(documentTitle)
pdf.setPageSize((page_width, page_height))
#DRAW LOGO
# Small logo near the bottom-right corner, at a third of 300 units per side
small_side = 300/3
pdf.drawImage(ImageReader('logojump.png'), 560, 10,
              width=small_side, height=small_side, mask='auto')
# Large 300 x 300 logo at (100, 400)
large_side = 300
pdf.drawImage(ImageReader('logostar.png'), 100, 400,
              width=large_side, height=large_side, mask='auto')
#TITLE
# The title has its own typeface
pdfmetrics.registerFont(TTFont('header', 'LyonJeanTrue.ttf'))
pdf.setFont('header', 45)
pdf.drawString(40, 30, title)

#SUBTITLE, DATE, HEADER
# These three lines share the 'subtitle' typeface and differ only in size,
# position and text.
pdfmetrics.registerFont(TTFont('subtitle', 'Favorit_Medium.ttf'))
date = datetime.datetime.now()
header = "UPLOADERS SIGNATURE"
subtitle_face_lines = (
    (14, 40, 80, subTitle),
    (10, 30, 980, str(date)),
    (35, 30, 920, header),
)
for font_size, x, y, label in subtitle_face_lines:
    pdf.setFont('subtitle', font_size)
    pdf.setFillColorRGB(0, 0, 0)
    pdf.drawString(x, y, label)
# BODY
# Register the regular and italic faces used for the questions and answers
pdfmetrics.registerFont(TTFont('body', 'Favorit_Regular.ttf'))
pdfmetrics.registerFont(TTFont('bodyitalic', 'Favorit_Regular_Italic.ttf'))

# One row per form question:
# (question text, index of its answer in `watermark`, y position of the question)
questions = (
    ("Name—Nickname—Pseudonim of the uploader", 0, 890),
    ("Did you digitise the file?", 1, 835),
    ("How long did it take to scan?", 2, 780),
    ("Where was the source found?", 3, 725),
)
for question, answer_index, question_y in questions:
    # Q: italic face, indented to x=60
    pdf.setFont('bodyitalic', 14)
    pdf.setFillColorRGB(0, 0, 0)
    pdf.drawString(60, question_y, question)
    # A: regular face at x=30, 20 units below its question
    pdf.setFont('body', 14)
    pdf.setFillColorRGB(0, 0, 0)
    pdf.drawString(30, question_y - 20, watermark[answer_index])
#ANECDOTE
# Q: three italic prompt lines, stacked 15 units apart at x=60
pdf.setFont('bodyitalic', 14)
pdf.setFillColorRGB(0, 0, 0)
prompts = (
    (670, "Why are you sharing this file?"),
    (655, "You can tell an anecdote"),
    (640, "You can leave a personal message!"),
)
for prompt_y, prompt in prompts:
    pdf.drawString(60, prompt_y, prompt)

# A: the free-text answer, wrapped at 75 characters and written line by line
# into a text object that starts at (40, 615)
answer_text = pdf.beginText(40, 615)
answer_text.setFont("body", 14)
answer_text.setFillColor(colors.black)
for wrapped_line in wrap(watermark[4], 75):
    answer_text.textLine(wrapped_line)
pdf.drawText(answer_text)

# Write watermark.pdf to disk
pdf.save()
3. Append the watermark to the pdf
Bash
# Build a watermarked, OCRed cover for the newest book in progress and merge
# it in front of the original pdf.
overlay=/home/psc/scripts/autorepublish/overlay
book=$(ls -td -- /home/psc/scripts/autorepublish/inprogress/* | head -n 1)
# Without a book folder, a bare `cd` would silently switch to $HOME.
[ -n "$book" ] || exit 1
cd "$book" || exit 1

# Copy the newest pdf (the book) and the newest png (the watermark) to the overlay work dir.
cp "$(ls -td -- *.pdf | head -n 1)" "$overlay"
cp "$(ls -td -- *.png | head -n 1)" "$overlay"

cd "$overlay" || exit 1
# The Python helpers below all read the book as target.pdf.
mv "$(ls -td -- *.pdf | head -n 1)" target.pdf
python3 burstcover.py
python3 rotatelogo.py
python3 overlaylogo.py
# OCR the watermarked cover; resizecover.py reads the result as out.pdf.
tesseract page1.png out pdf
python3 resizecover.py
# Concatenate the new cover and the book into name.pdf.
gs -dBATCH -dNOPAUSE -q -sDEVICE=pdfwrite -sOutputFile=name.pdf cover.pdf target.pdf

# Overwrite the newest pdf under inprogress/ with the merged file.
var1=$(ls -td -- /home/psc/scripts/autorepublish/inprogress/*/*.pdf | head -n 1)
mv name.pdf "$var1"
3.1 Burst the pdf cover
from pdf2image import convert_from_path

# Render the cover (page 1) of the book to page1.png at 300 dpi.
dpi = 300
pdf_file = 'target.pdf'
# Restrict rendering to the first page: only the cover is kept, so rasterising
# the whole book would waste time and memory.
pages = convert_from_path(pdf_file, dpi, first_page=1, last_page=1)
page = pages[0]
# The file name has no placeholder, so no .format() call is needed.
page.save('page1.png', 'PNG')
3.2 Rotate the watermark with PIL
from PIL import Image
import PIL.ImageOps

# Load the watermark, rotate it by 10 degrees (expand=True, so the output is
# resized to hold the whole rotated image) and store it as rotated.png.
Image.open('watermark.png').rotate(10, expand=True).save('rotated.png')
3.3 Overlay the watermark with PIL
3.4 OCR the new cover (done by the tesseract call in the Bash script above; the script below is the overlay from 3.3)
from PIL import Image
import os

# Scale the rotated watermark to 60% of the cover width and paste it onto the cover.
filepath = 'page1.png'
basewidth, height = Image.open(filepath).size
# Target watermark width in whole pixels (fraction truncated).
base = int(basewidth * 0.6)

#rescaling the logo
finallogo = Image.open("rotated.png")
# Scale the height by the same factor as the width to keep the aspect ratio.
wpercent = base / float(finallogo.size[0])
hsize = int(float(finallogo.size[1]) * wpercent)
# Image.LANCZOS is the same filter as the Image.ANTIALIAS alias, which Pillow 10 removed.
finallogo = finallogo.resize((base, hsize), Image.LANCZOS)
finallogo.save('rotatedwatermark.png')

background = Image.open("page1.png")
foreground = Image.open('rotatedwatermark.png')
# The RGBA copy is passed as the paste mask (third argument).
background.paste(foreground, (40, 70), foreground.convert('RGBA'))
# Overwrites the cover image in place.
background.save("page1.png")
3.5 Resize the OCRed cover to fit the book
from pdfrw import PdfReader
from PIL import Image
import os
from time import sleep
from pdf2image import convert_from_path
import cv2
import numpy as np
from PyPDF2 import PdfFileReader, PdfFileWriter

# Scale the OCRed cover (out.pdf) to the page size of the book (target.pdf)
# and write it as cover.pdf.
pdf = PdfReader('target.pdf')
measures = pdf.pages[0].MediaBox
# MediaBox is [x0, y0, x1, y1]; the page size is the difference of the corners.
# Parse the entries as numbers and truncate to whole units.
x0, y0, x1, y1 = (float(value) for value in measures)
basewidth = int(x1 - x0)
baseheight = int(y1 - y0)
print(basewidth)
print(baseheight)

fileName = ("out.pdf")
# Keep the input open until the scaled page has been written, then close it.
with open(fileName, 'rb') as source:
    pdfFile = PdfFileReader(source)
    # Getting only first page!
    newPage = pdfFile.getPage(0)
    newPage.scaleTo(basewidth, baseheight)
    writer = PdfFileWriter()
    writer.addPage(newPage)
    with open('cover.pdf', 'wb') as f:
        writer.write(f)
4. OCR the pdf if not OCRed already
5. Delete all the unwanted traces
# Remove the intermediate files of this run and archive the watermarked cover.
watermark_dir=/home/psc/scripts/autorepublish/watermark
overlay_dir=/home/psc/scripts/autorepublish/overlay

rm "$watermark_dir"/watermark.pdf
for leftover in watermark.png rotated.png rotatedwatermark.png target.pdf cover.pdf out.pdf; do
    rm "$overlay_dir/$leftover"
done

# Keep the cover image under a day-month-time stamped name.
stamp=$(date +%d-%b-%H_%M_%S)
mv "$overlay_dir"/page1.png /home/psc/tacticalbooks/covers/"$stamp".png
6. Save the file in a directory open to Library Genesis Staff
# Hand the finished pdf over to /home/libgen/books and remove the book's work folder.
book=$(ls -td -- /home/psc/scripts/autorepublish/inprogress/* | head -n 1)
# Without a book folder, a bare `cd` would switch to $HOME and the `rm` below
# would run there, so stop instead.
[ -n "$book" ] || exit 1
cd "$book" || exit 1
rm watermark.png
mv "$(ls -td -- /home/psc/scripts/autorepublish/inprogress/*/*.pdf | head -n 1)" /home/libgen/books
rm -r -- "$book"