User:Alexander Roidl/pdf2html
PDF2HTML
PDF to .txt
import logging

import PyPDF2

logger = logging.getLogger(__name__)


def _text_path(filename):
    """Return the path of the .txt file that belongs to *filename*."""
    return "app/uploads/" + filename + ".txt"


def get_text(file_path, filename):
    """Write the extracted text of every page of a PDF to a .txt file.

    Args:
        file_path: Path of the PDF file to read.
        filename: Base name of the output file; the text is written to
            ``app/uploads/<filename>.txt``.

    Any error raised while opening the files or while PyPDF2 reads the
    PDF propagates to the caller. The output file is opened (and thereby
    truncated) before the PDF is parsed, so it may be left empty or
    partially written when that happens.
    """
    with open(file_path, "rb") as pdf_file, \
            open(_text_path(filename), "w") as text_file:
        reader = PyPDF2.PdfFileReader(pdf_file)
        # range() is fine on Python 3; the original note suggested xrange
        # for Python 2.
        for page_number in range(reader.getNumPages()):
            text_file.write(reader.getPage(page_number).extractText())


def extract_text(file_path, filename):
    """Best-effort wrapper around :func:`get_text`.

    Args:
        file_path: Path of the PDF file to read.
        filename: Base name of the output file (see :func:`get_text`).

    If the extraction fails, the failure is logged and an empty
    ``app/uploads/<filename>.txt`` is written instead, so the output file
    exists at the same location in both cases.

    Raises:
        OSError: If the empty fallback file cannot be created either.
    """
    try:
        get_text(file_path, filename)
    except Exception:
        # Deliberately broad: any extraction problem falls back to an empty
        # text file. KeyboardInterrupt/SystemExit are no longer swallowed.
        logger.exception("Text extraction failed for %s", file_path)
        # Overwrite whatever get_text may have partially written, using
        # the same path get_text writes to.
        with open(_text_path(filename), "w") as text_file:
            text_file.write("")
PDF2HTMLEX
+
- very exact representation of every PDF, seems stable
-
- no longer maintained
- heavy processing (high cpu usage)
- takes a long time
- little modification possible
poppler
+
- very simple
- lightweight
- fast
-
- not very accurate
- one image per page
PyPDF2
import PyPDF2

# Print the extracted text of every page of sample.pdf.
# NOTE(review): PdfFileReader / numPages / getPage / extractText look like the
# pre-3.0 PyPDF2 API — confirm against the installed PyPDF2 version.
# The `with` block makes sure the PDF file handle is closed once all pages
# have been printed (the original never closed it).
with open('sample.pdf', 'rb') as pdfFileObject:
    pdfReader = PyPDF2.PdfFileReader(pdfFileObject)
    count = pdfReader.numPages
    for i in range(count):
        page = pdfReader.getPage(i)
        print(page.extractText())