User:Alexander Roidl/pdf2html

From Media Design: Networked & Lens-Based wiki
Jump to navigation Jump to search

PDF2HTML

PDF to .txt

import PyPDF2


def get_text(file_path, filename):
    """Extract the text of every page of a PDF into a ``.txt`` file.

    The text is written to ``app/uploads/<filename>.txt``, one page after the
    other, with nothing inserted between pages.

    Args:
        file_path: Path of the PDF file to read.
        filename: Base name (without extension) of the text file to create
            inside ``app/uploads/``.

    Raises:
        OSError: If the PDF cannot be opened or the text file cannot be
            created.

    Any exception raised by PyPDF2 while reading the PDF is not caught here
    and propagates to the caller.
    """
    output_path = "app/uploads/" + filename + '.txt'

    # The encoding is pinned to UTF-8 so that non-ASCII characters extracted
    # from the PDF do not depend on (or fail under) the platform's default
    # locale encoding.
    with open(file_path, 'rb') as pdf_file, \
            open(output_path, 'w', encoding='utf-8') as text_file:
        reader = PyPDF2.PdfFileReader(pdf_file)
        for page_number in range(reader.getNumPages()):
            page = reader.getPage(page_number)
            text_file.write(page.extractText())



def extract_text(file_path, filename):
    """Run ``get_text`` and fall back to an empty text file if it fails.

    Calls ``get_text(file_path, filename)``. If that raises, an empty
    ``app/uploads/<filename>.txt`` is written instead, so the output file
    exists at the same location ``get_text`` would have used.

    Args:
        file_path: Path of the PDF file to read.
        filename: Base name (without extension) of the text file.
    """
    import os

    try:
        get_text(file_path, filename)
    except Exception:
        # ``Exception`` rather than a bare ``except`` so KeyboardInterrupt and
        # SystemExit still propagate.
        # The placeholder goes to the upload directory (not the current
        # working directory) so it replaces any partial output and is found
        # where the extracted text is expected.
        os.makedirs("app/uploads", exist_ok=True)
        with open("app/uploads/" + filename + '.txt', 'w') as text_file:
            text_file.write("")

PDF2HTMLEX

+

  • very accurate representation of every PDF; seems stable

-

  • not maintained anymore
  • heavy processing (high cpu usage)
  • takes a long time
  • little modification possible

poppler

+

  • very simple
  • lightweight
  • fast

-

  • not very accurate
  • one image per page

PyPDF2

import PyPDF2
# Minimal PyPDF2 example: print the extracted text of every page of sample.pdf.
# The context manager closes the file once all pages have been printed.
with open('sample.pdf', 'rb') as pdfFileObject:
    pdfReader = PyPDF2.PdfFileReader(pdfFileObject)
    count = pdfReader.numPages
    for i in range(count):
        page = pdfReader.getPage(i)
        print(page.extractText())