from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LTTextBoxHorizontal, LAParams
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfinterp import PDFTextExtractionNotAllowed
from pdfminer.pdfparser import PDFParser, PDFDocument
import pandas as pd
_COLUMN_NAMES = ('Lin1', 'Lin2', 'Lin3', 'Lin4', 'Lin5', 'Lin6', 'Lin7', 'Lin8')


def _page_texts(layout):
    """Return the text of every horizontal text box in *layout*, in order.

    Leading and trailing newlines are stripped from each text; every text is
    also echoed to stdout with a ``results: `` prefix.
    """
    texts = []
    for element in layout:
        if isinstance(element, LTTextBoxHorizontal):
            text = element.get_text().strip('\n')
            print("results: " + text)
            texts.append(text)
    return texts


def _split_title_and_columns(texts):
    """Split a non-empty list of page texts into a title and eight columns.

    The first text is the title. The remaining texts are dealt round-robin
    into the eight columns named in ``_COLUMN_NAMES``; a trailing incomplete
    row is dropped so that all columns have the same length.
    """
    title, cells = texts[0], texts[1:]
    width = len(_COLUMN_NAMES)
    # Only complete rows are kept: the last column decides the row count.
    end = (len(cells) // width) * width
    columns = {
        name: cells[offset:end:width]
        for offset, name in enumerate(_COLUMN_NAMES)
    }
    return title, columns


def Pdf_to_txt(fp):
    """Extract an eight-column table from each page of a PDF into text files.

    For every page, the first horizontal text box is used as the title and
    the following text boxes are read as rows of eight cells. Each page is
    written as a tab-separated file (no index column) to
    ``tool/pdf解析/<title>_page<N>.txt``, where ``N`` is the 1-based page
    number. Pages without any horizontal text box are skipped.

    Args:
        fp: File object for the PDF, opened in binary mode.

    Raises:
        PDFTextExtractionNotAllowed: If the document reports that it is not
            extractable.
    """
    parser = PDFParser(fp)
    document = PDFDocument(parser)
    parser.set_document(document)
    document.set_parser(parser)
    # NOTE(review): some pdfminer releases also require document.initialize()
    # before use -- confirm against the installed version.
    if not document.is_extractable:
        raise PDFTextExtractionNotAllowed

    resource_manager = PDFResourceManager()
    device = PDFPageAggregator(resource_manager, laparams=LAParams())
    interpreter = PDFPageInterpreter(resource_manager, device)

    # enumerate supplies the page index that the output file name needs
    # (the original referenced an undefined ``i`` here).
    for page_index, page in enumerate(document.get_pages()):
        interpreter.process_page(page)
        texts = _page_texts(device.get_result())
        if not texts:
            # No text box means no title to name the file after; skip the page
            # instead of failing on an empty list.
            continue

        title, columns = _split_title_and_columns(texts)
        df = pd.DataFrame(columns, columns=list(_COLUMN_NAMES))
        file_name = title + '_page' + str(page_index + 1)
        df.to_csv('tool/pdf解析/%s.txt' % file_name, index=False, sep='\t')
if __name__ == '__main__':
    # Script entry point: convert one hard-coded PDF.
    filename = 'E:/tempFile/Monthly_new_MA_listing_April_2017.pdf'
    # The context manager closes the PDF handle even if parsing raises.
    with open(filename, 'rb') as fp:
        Pdf_to_txt(fp)