|
|
Line 4: |
Line 4: |
|
| |
|
| ==Merge PDF== | | ==Merge PDF== |
| This shell script uses pdftk to merge all the OCR'd PDFs that were created.
| | TEXT HERE |
| <source lang="shell"> | | <source lang="shell"> |
#!/bin/bash
# Merge every OCR'd page PDF in ./ocred into a single final.pdf using pdftk.

# Uncomment to run relative to the script's own location instead of the
# caller's working directory:
# cd "$(dirname "$0")"

# Stop if the directory is missing; otherwise pdftk would run on whatever
# PDFs happen to be in the current directory.
cd ocred || { echo "ocred directory not found" >&2; exit 1; }
pwd

# A plain *.pdf glob sorts lexicographically (page1, page10, page11, page2, ...),
# which scrambles the page order, and it would also pick up a final.pdf left
# over from a previous run. Collect only the page files, in numeric order.
pages=()
while IFS= read -r page; do
    pages+=("$page")
done < <(printf '%s\n' page*.pdf | sort -V)

# An unmatched glob stays literal, so check that the first entry really exists.
if [ ! -e "${pages[0]}" ]; then
    echo "no page*.pdf files found in $(pwd)" >&2
    exit 1
fi

pdftk "${pages[@]}" cat output final.pdf
| |
|
| |
| </source> | | </source> |
|
| |
|
| ==Crop Bounding Box== | | ==Crop Bounding Box== |
| While capturing the pages of the book a bounding box is created. With this script, you iterate through a folder and crop the images.
| | TEXT HERE |
| <source lang="python"> | | <source lang="python"> |
| import cv2
| |
| import time
| |
| import logging
| |
|
| |
| d = 1
| |
|
| |
| while True:
| |
| try:
| |
| threshold = 25
| |
| time.sleep(1)
| |
|
| |
| input = ('input%d.jpg'%d)
| |
| page = ('page%d.jpg'%d)
| |
|
| |
| print("Value of d is:",d,"\n","Page name:",input)
| |
| img = cv2.imread(input, 0) # load grayscale version
| |
|
| |
| # the indeces where the useful region starts and ends
| |
| hStrart = 0
| |
| hEnd = img.shape[0]
| |
| vStart = 0
| |
| vEnd = img.shape[1]
| |
|
| |
| # get row and column maxes for each row and column
| |
| hMax = img.max(1)
| |
| vMax = img.max(0)
| |
|
| |
| hDone_flag = False
| |
| vDone_flag = False
| |
|
| |
| # go through the list of max and begin where the pixel value is greater
| |
| # than the threshold
| |
| for i in range(hMax.size):
| |
| if not hDone_flag:
| |
| if hMax[i] > threshold:
| |
| hStart = i
| |
| hDone_flag = True
| |
|
| |
| if hDone_flag:
| |
| if hMax[i] < threshold:
| |
| hEnd = i
| |
| break
| |
|
| |
| for i in range(vMax.size):
| |
| if not vDone_flag:
| |
| if vMax[i] > threshold:
| |
| vStart = i
| |
| vDone_flag = True
| |
|
| |
| if vDone_flag:
| |
| if vMax[i] < threshold:
| |
| vEnd = i
| |
| break
| |
|
| |
| # load the color image and choose only the useful area from it
| |
| img2 = (cv2.imread(input))[hStart:hEnd, vStart:vEnd,:]
| |
|
| |
| # write the cropped image
| |
| cv2.imwrite(page, img2)
| |
|
| |
| d+=1
| |
| print("Value of d is:", d)
| |
|
| |
| except:
| |
| logging.exception("message")
| |
| print("All pages must be ready!")
| |
| break
| |
|
| |
| </source> | | </source> |
|
| |
|
|
| |
|
| ==OCR== | | ==OCR== |
| OCR all the JPEGs in one batch, producing one searchable PDF per page.
| | TEXT HERE |
| <source lang="python"> | | <source lang="python"> |
| # import libraries
| |
| from PIL import Image
| |
| import pytesseract
| |
| import time
| |
|
| |
def ocr_pages():
    """OCR split/page1.jpg, split/page2.jpg, ... into ocred/pageN.pdf files.

    Stops at the first page image that does not exist. Any other failure
    (for example a problem running Tesseract) is allowed to propagate rather
    than being reported as a normal end of the batch.
    """
    import os

    os.makedirs("ocred", exist_ok=True)

    page_number = 1
    while True:
        try:
            img = Image.open("split/page%i.jpg" % page_number)
        except FileNotFoundError:
            # Running out of numbered pages is the expected way to finish.
            print("All pages must be ready!")
            break

        # The context manager closes the image file once OCR is done.
        with img:
            print(img)
            pdf = pytesseract.image_to_pdf_or_hocr(img, lang="eng", extension='pdf')

        time.sleep(1)

        # "with" guarantees the PDF is flushed and closed even if write fails.
        with open("ocred/page%i.pdf" % page_number, "wb") as pdf_file:
            pdf_file.write(pdf)

        page_number += 1
        print(page_number)


if __name__ == "__main__":
    ocr_pages()
| |
|
| |
| </source> | | </source> |
|
| |
|
| ==Rotate JPGS== | | ==Rotate JPGS== |
| The book scanner takes a picture of each book page in landscape format, so the pictures have to be rotated. This script iterates through the pages and rotates even and odd pages in opposite directions.
| | TEXT HERE |
| <source lang="python"> | | <source lang="python"> |
| from PIL import Image
| |
| import time
| |
|
| |
| i = 1
| |
|
| |
| while True:
| |
|
| |
| page = Image.open("split/input%i.jpg"%i)
| |
|
| |
| if i % 2 == 0:
| |
| #check where the for loop is
| |
| print("trying even")
| |
|
| |
| #rotate image by 90 degrees
| |
| angle = 90
| |
| out = page.rotate(angle, expand=True)
| |
| out.save('rotated/input%i.jpg'%i)
| |
| print('This is an even page number')
| |
|
| |
| time.sleep(2)
| |
| print("variable i: ", i)
| |
|
| |
| else:
| |
| #check where the for loop is
| |
| print("trying odd")
| |
|
| |
| #rotate image by 90 degrees
| |
| angle = 270
| |
| out = page.rotate(angle, expand=True)
| |
| out.save('rotated/input%i.jpg'%i)
| |
| print('This is an even page number')
| |
|
| |
| time.sleep(1)
| |
| print("variable i: ", i)
| |
|
| |
| i+=1
| |
|
| |
| </source> | | </source> |
|
| |
|
| ==Burst PDF== | | ==Burst PDF== |
| Burst a pdf into separate jpegs.
| | TEXT HERE |
| <source lang="python"> | | <source lang="python"> |
| # Based on the code at https://iq.opengenus.org/pdf_to_image_in_python/
| |
|
| |
| import pdf2image
| |
| from PIL import Image
| |
| import time
| |
|
| |
# Conversion settings handed to pdf2image.convert_from_path by pdftopil().
# NOTE: the prompt below runs as soon as the module is loaded, so this file
# is meant to be run as a script rather than imported.
PDF_PATH = input("What pdf do you want to use? (include extension as example.pdf): ")
DPI = 200            # resolution of the generated images
FIRST_PAGE = None    # first page to convert
LAST_PAGE = None     # last page to convert
FORMAT = 'jpg'       # image format requested from the conversion
THREAD_COUNT = 1     # number of threads used for the conversion
USERPWD = None       # password for a protected PDF
USE_CROPBOX = False  # use the crop box instead of the media box
STRICT = False       # surface pdftoppm syntax errors as exceptions
| |
|
| |
def pdftopil():
    """Read the PDF at PDF_PATH and convert it into a sequence of images.

    The module-level constants are forwarded to pdf2image.convert_from_path:

    * dpi -- adjusts the resolution of the images
    * first_page / last_page -- first and last page processed by pdftoppm
    * fmt -- format of the pdftoppm conversion
    * thread_count -- how many threads are used for the conversion
    * userpw -- password used to unlock the PDF
    * use_cropbox -- use the crop box instead of the media box
    * strict -- catch pdftoppm syntax errors as PDFSyntaxError

    Prints the time the conversion took and returns the converted images.
    """
    began = time.time()
    options = dict(
        dpi=DPI,
        first_page=FIRST_PAGE,
        last_page=LAST_PAGE,
        fmt=FORMAT,
        thread_count=THREAD_COUNT,
        userpw=USERPWD,
        use_cropbox=USE_CROPBOX,
        strict=STRICT,
    )
    converted = pdf2image.convert_from_path(PDF_PATH, **options)
    elapsed = time.time() - began
    print("Time taken : " + str(elapsed))
    return converted
| |
|
| |
def save_images(pil_images):
    """Save each image as split/input<N>.jpg, numbering from 1.

    Args:
        pil_images: iterable of objects with a ``save(path)`` method, such as
            the images returned by ``pdftopil()``.
    """
    import os

    # Create the output folder up front so the first save cannot fail on it.
    os.makedirs("split", exist_ok=True)

    for number, image in enumerate(pil_images, start=1):
        image.save(("split/input%d" % number) + ".jpg")


if __name__ == "__main__":
    pil_images = pdftopil()
    save_images(pil_images)
| |
|
| |
| </source> | | </source> |
|
| |
|