在Python中,可以使用多个不同的库来提取PDF的文本内容。以下是几种常用的方法:
方法一:使用PyPDF2库
import PyPDF2
def extract_text_from_pdf(file_path):
    """Return the text of every page in a PDF, concatenated in page order.

    Uses the ``PdfReader`` / ``pages`` / ``extract_text`` API. The older
    ``PdfFileReader`` / ``numPages`` / ``getPage`` / ``extractText`` names
    were deprecated and then removed in PyPDF2 3.0.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        One string holding the extracted text of all pages, with no
        separator inserted between pages.
    """
    with open(file_path, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)
        # Extraction happens inside the ``with`` block because the reader
        # reads from the still-open file object.
        return ''.join(page.extract_text() for page in pdf_reader.pages)
# Example usage: extract the text of 'example.pdf' with the function
# defined above and print it.
pdf_text = extract_text_from_pdf('example.pdf')
print(pdf_text)
方法二:使用pdfminer库(Python 3 环境请安装 pdfminer.six)
from pdfminer.converter import TextConverter
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from io import StringIO
def extract_text_from_pdf(pdf_path):
    """Return the text of a PDF as rendered by pdfminer's ``TextConverter``.

    Every page yielded by ``PDFPage.get_pages`` is run through a
    ``PDFPageInterpreter`` whose output device writes into an in-memory
    text buffer; the buffer's contents are returned.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text written to the buffer for all processed pages.
    """
    # Imported here because the surrounding import list does not bring
    # LAParams into scope; without it the call below raises NameError.
    from pdfminer.layout import LAParams

    resource_manager = PDFResourceManager()
    return_string = StringIO()
    device = TextConverter(
        resource_manager, return_string, codec='utf-8', laparams=LAParams()
    )
    try:
        interpreter = PDFPageInterpreter(resource_manager, device)
        with open(pdf_path, 'rb') as file:
            for page in PDFPage.get_pages(file, check_extractable=True):
                interpreter.process_page(page)
        # The value is read before the ``finally`` clause closes the buffer.
        return return_string.getvalue()
    finally:
        # Release the converter and the buffer even if parsing fails.
        device.close()
        return_string.close()
方法三:使用Spire.PDF库
from spire.pdf import *
def extract_text_from_pdf(pdf_path):
    """Write the text of every page of a PDF to ``Output/提取文本.txt``.

    Pages are processed in index order and each page's text is followed by
    a newline. The output file is UTF-8 encoded and is overwritten if it
    already exists.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        None. The extracted text is only written to the output file.
    """
    import os

    pdf = PdfDocument()
    try:
        pdf.LoadFromFile(pdf_path)
        # open(..., 'w') does not create missing parent directories.
        os.makedirs('Output', exist_ok=True)
        # The context manager closes the file even if extraction raises.
        with open('Output/提取文本.txt', 'w', encoding='utf-8') as extractedText:
            for i in range(pdf.Pages.Count):
                page = pdf.Pages.get_Item(i)
                extractedText.write(page.ExtractText() + '\n')
    finally:
        # Release the document whether or not extraction succeeded.
        pdf.Close()
方法四:使用pdfplumber库
import pdfplumber
def extract_text_from_pdf(file_path):
    """Print the extracted text of each page of a PDF, one ``print`` per page.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        None. The text is only written to standard output.
    """
    with pdfplumber.open(file_path) as document:
        # Lazy generator: each page is extracted right before it is printed.
        page_texts = (current.extract_text() for current in document.pages)
        for page_text in page_texts:
            print(page_text)
方法五:使用tika库
from tika import parser
def extract_text_from_pdf(file_path):
    """Print the text content that Tika extracts from a file.

    Leading and trailing whitespace is stripped before printing. If the
    parse result has no text content, an empty line is printed instead of
    raising.

    Args:
        file_path: Path of the file to parse.

    Returns:
        None. The text is only written to standard output.
    """
    text_raw = parser.from_file(file_path)
    # 'content' can be None (e.g. when no text could be extracted), in
    # which case calling .strip() on it would raise AttributeError.
    content = text_raw.get('content') or ''
    print(content.strip())
请根据您的需求选择合适的方法,并确保已通过 pip 安装相应的Python库(其中 tika 库还需要本机安装 Java 运行环境)。