User:Alexander Roidl/pdf2html

From XPUB & Lens-Based wiki
The printable version is no longer supported and may have rendering errors. Please update your browser bookmarks and please use the default browser print function instead.

PDF2HTML

PDF to .txt

import PyPDF2


def get_text(file_path, filename):
    """Extract the text of every page of a PDF into a ``.txt`` file.

    Args:
        file_path: Path of the PDF file to read.
        filename: Base name of the output file. The text is written to
            ``app/uploads/<filename>.txt``; an existing file is overwritten.

    Any error raised while opening the files or parsing the PDF propagates
    to the caller.
    """
    output_path = "app/uploads/" + filename + '.txt'

    # Write as UTF-8 explicitly: text extracted from PDFs often contains
    # non-ASCII characters that a platform-default codec may fail to encode.
    with open(file_path, 'rb') as pdf_file, \
            open(output_path, 'w', encoding='utf-8') as text_file:
        reader = PyPDF2.PdfFileReader(pdf_file)
        for page_number in range(reader.getNumPages()):
            page = reader.getPage(page_number)
            text_file.write(page.extractText())



def extract_text(file_path, filename):
    """Extract a PDF's text, falling back to an empty text file on failure.

    Delegates to ``get_text``. If that raises, the error is logged and an
    empty ``app/uploads/<filename>.txt`` is written instead, i.e. at the same
    location ``get_text`` writes to on success.

    Args:
        file_path: Path of the PDF file to read.
        filename: Base name (without extension) of the output text file.
    """
    import logging
    import os

    try:
        get_text(file_path, filename)
    except Exception:
        # Best-effort: keep going with an empty result, but record why the
        # extraction failed. KeyboardInterrupt/SystemExit are not swallowed.
        logging.getLogger(__name__).exception(
            "Text extraction failed for %s", file_path)
        # The failure may have happened before the output file was created,
        # so make sure the target directory exists before writing.
        os.makedirs("app/uploads", exist_ok=True)
        with open("app/uploads/" + filename + '.txt', 'w') as text_file:
            text_file.write("")

pdf2htmlEX

+

  • very exact representation of every PDF, seems stable

-

  • not maintained anymore
  • heavy processing (high cpu usage)
  • takes a long time to run
  • leaves little room for modifying the output

poppler

+

  • very simple
  • lightweight
  • fast

-

  • not very accurate
  • one image per page

PyPDF2

import PyPDF2
# Print the extracted text of every page of sample.pdf.
# The context manager closes the file handle once all pages have been read.
with open('sample.pdf', 'rb') as pdfFileObject:
    pdfReader = PyPDF2.PdfFileReader(pdfFileObject)
    count = pdfReader.numPages
    for i in range(count):
        page = pdfReader.getPage(i)
        print(page.extractText())