User:Alexander Roidl/pdf2html: Difference between revisions
No edit summary |
No edit summary |
||
Line 1: | Line 1: | ||
= PDF2HTML = | = PDF2HTML = | ||
==PDF to .txt== | |||
<pre> | |||
import PyPDF2 | |||
def get_text(file_path, filename):
    """Extract the text of every page of a PDF into a ``.txt`` file.

    The PDF at ``file_path`` is read with PyPDF2 and the text of each page
    is written, in page order, to ``app/uploads/<filename>.txt``.

    Args:
        file_path: Path of the PDF file to read.
        filename: Base name (without extension) of the text file created
            inside ``app/uploads/``.

    Raises:
        OSError: If the PDF cannot be opened or the output file cannot be
            created (for example when ``app/uploads/`` does not exist).
        Exception: Anything PyPDF2 raises while parsing the PDF propagates
            to the caller unchanged.
    """
    output_path = "app/uploads/" + filename + '.txt'
    # Write UTF-8 explicitly: extracted text can contain non-ASCII characters
    # that the platform default encoding may be unable to encode.
    with open(file_path, 'rb') as pdf_file, \
            open(output_path, 'w', encoding='utf-8') as text_file:
        # NOTE(review): these are the legacy PyPDF2 names; newer releases
        # are believed to expose PdfReader / pages / extract_text instead.
        # Confirm against the installed PyPDF2 version.
        pdf_reader = PyPDF2.PdfFileReader(pdf_file)
        for page_number in range(pdf_reader.getNumPages()):
            page = pdf_reader.getPage(page_number)
            text_file.write(page.extractText())
def extract_text(file_path, filename):
    """Extract a PDF's text, writing an empty text file if extraction fails.

    Delegates to ``get_text``, which writes ``app/uploads/<filename>.txt``.
    If that call raises, the failure is logged and an empty file is written
    to the same ``app/uploads/<filename>.txt`` location, so the output is
    found in one place whether or not extraction succeeded.

    Args:
        file_path: Path of the PDF file to read.
        filename: Base name (without extension) of the text file to create.
    """
    import logging
    import os

    try:
        get_text(file_path, filename)
    except Exception:
        # Best effort: `Exception` (not a bare `except`) keeps
        # KeyboardInterrupt and SystemExit propagating while still
        # tolerating whatever PyPDF2 raises on a malformed PDF.
        logging.getLogger(__name__).warning(
            "Text extraction failed for %s; writing empty file",
            file_path, exc_info=True)
        # The upload directory may be the reason get_text failed, so make
        # sure it exists before writing the placeholder.
        os.makedirs("app/uploads", exist_ok=True)
        with open("app/uploads/" + filename + '.txt', 'w') as text_file:
            text_file.write("")
</pre> | |||
==PDF2HTMLEX== | ==PDF2HTMLEX== |
Latest revision as of 21:08, 10 June 2018
PDF2HTML
PDF to .txt
import PyPDF2


def get_text(file_path, filename):
    """Extract the text of every page of a PDF into a ``.txt`` file.

    The PDF at ``file_path`` is read with PyPDF2 and the text of each page
    is written, in page order, to ``app/uploads/<filename>.txt``.

    Args:
        file_path: Path of the PDF file to read.
        filename: Base name (without extension) of the text file created
            inside ``app/uploads/``.

    Raises:
        OSError: If the PDF cannot be opened or the output file cannot be
            created (for example when ``app/uploads/`` does not exist).
        Exception: Anything PyPDF2 raises while parsing the PDF propagates
            to the caller unchanged.
    """
    output_path = "app/uploads/" + filename + '.txt'
    # Write UTF-8 explicitly: extracted text can contain non-ASCII characters
    # that the platform default encoding may be unable to encode.
    with open(file_path, 'rb') as pdf_file, \
            open(output_path, 'w', encoding='utf-8') as text_file:
        # NOTE(review): these are the legacy PyPDF2 names; newer releases
        # are believed to expose PdfReader / pages / extract_text instead.
        # Confirm against the installed PyPDF2 version.
        pdf_reader = PyPDF2.PdfFileReader(pdf_file)
        for page_number in range(pdf_reader.getNumPages()):
            page = pdf_reader.getPage(page_number)
            text_file.write(page.extractText())


def extract_text(file_path, filename):
    """Extract a PDF's text, writing an empty text file if extraction fails.

    Delegates to ``get_text``. If that call raises, the failure is logged
    and an empty file is written to the same ``app/uploads/<filename>.txt``
    location, so the output is found in one place whether or not extraction
    succeeded.

    Args:
        file_path: Path of the PDF file to read.
        filename: Base name (without extension) of the text file to create.
    """
    import logging
    import os

    try:
        get_text(file_path, filename)
    except Exception:
        # Best effort: `Exception` (not a bare `except`) keeps
        # KeyboardInterrupt and SystemExit propagating while still
        # tolerating whatever PyPDF2 raises on a malformed PDF.
        logging.getLogger(__name__).warning(
            "Text extraction failed for %s; writing empty file",
            file_path, exc_info=True)
        # The upload directory may be the reason get_text failed, so make
        # sure it exists before writing the placeholder.
        os.makedirs("app/uploads", exist_ok=True)
        with open("app/uploads/" + filename + '.txt', 'w') as text_file:
            text_file.write("")
PDF2HTMLEX
+
- very exact representation of every PDF, seems stable
-
- not maintained anymore
- heavy processing (high cpu usage)
- takes a long time to convert
- leaves little room for modifying the output
poppler
+
- very simple
- lightweight
- fast
-
- not very accurate
- one image per page
PyPDF2
import PyPDF2

# Print the extracted text of every page of sample.pdf, one page at a time.
# The `with` block closes the file once all pages have been read.
with open('sample.pdf', 'rb') as pdfFileObject:
    pdfReader = PyPDF2.PdfFileReader(pdfFileObject)
    count = pdfReader.numPages
    for i in range(count):
        page = pdfReader.getPage(i)
        print(page.extractText())