|
@@ -0,0 +1,95 @@
|
|
|
+import base64
|
|
|
+import concurrent.futures
|
|
|
+
|
|
|
+import fitz
|
|
|
+import requests
|
|
|
+
|
|
|
+from tool.tyqw import getAiOcrBase64
|
|
|
+
|
|
|
+
|
|
|
def pdf_pages_to_base64(pdf_path):
    """Render every page of a local PDF to a Base64-encoded PNG string.

    Args:
        pdf_path: Location of the PDF; passed unchanged to ``fitz.open``.

    Returns:
        A list with one Base64 string per page, in page order. An empty
        list is returned for a document with zero pages.

    Raises:
        Whatever ``fitz`` raises when the document cannot be opened or a
        page cannot be rendered; the document is still closed in that case.
    """
    pdf_document = fitz.open(pdf_path)
    try:
        base64_images = []
        for page_number in range(pdf_document.page_count):
            page = pdf_document.load_page(page_number)
            # Rasterise the page, then serialise the pixmap as PNG bytes.
            pix = page.get_pixmap()
            image_bytes = pix.tobytes(output="png")
            # Base64 output is pure ASCII, so decoding to text is lossless.
            base64_images.append(base64.b64encode(image_bytes).decode('utf-8'))
        return base64_images
    finally:
        # Release the document even when rendering a page fails part-way.
        pdf_document.close()
|
|
|
+
|
|
|
+
|
|
|
def urlpdf_pages_to_base64(pdf_url, page_num, timeout=30):
    """Download a PDF and render its leading pages to Base64-encoded PNGs.

    Args:
        pdf_url: URL the PDF is fetched from with an HTTP GET.
        page_num: Maximum number of pages to render, counted from the first
            page. Values larger than the document's page count are clamped;
            zero or negative values yield an empty list.
        timeout: Seconds handed to ``requests.get`` so that an unresponsive
            server cannot block the caller indefinitely.

    Returns:
        A list of Base64 strings, one per rendered page, in page order.

    Raises:
        requests.RequestException: If the download fails, times out, or the
            server answers with an error status.
    """
    response = requests.get(pdf_url, timeout=timeout)
    response.raise_for_status()  # Fail early on 4xx/5xx responses.

    # Open the downloaded bytes in memory; the type is stated explicitly
    # because a byte stream carries no file extension to infer it from.
    pdf_document = fitz.open(stream=response.content, filetype="pdf")
    try:
        base64_images = []
        for page_number in range(min(pdf_document.page_count, page_num)):
            page = pdf_document.load_page(page_number)
            # Rasterise the page, then serialise the pixmap as PNG bytes.
            pix = page.get_pixmap()
            image_bytes = pix.tobytes(output="png")
            # Base64 output is pure ASCII, so decoding to text is lossless.
            base64_images.append(base64.b64encode(image_bytes).decode('utf-8'))
        return base64_images
    finally:
        # Release the document even when rendering a page fails part-way.
        pdf_document.close()
|
|
|
+
|
|
|
+
|
|
|
def pdf2ai(pdf_url, prompt, page_num=10):
    """Run ``getAiOcrBase64`` over the leading pages of a remote PDF.

    Each page image obtained from ``urlpdf_pages_to_base64`` is submitted to
    a thread pool together with ``prompt``. Pages whose task raises are
    reported on stdout and left out of the result.

    Args:
        pdf_url: URL of the PDF to process.
        prompt: Prompt forwarded to ``getAiOcrBase64`` for every page.
        page_num: Maximum number of pages to process (default 10).

    Returns:
        A Markdown string: a top-level heading followed by one section per
        successfully processed page, ordered by page number.
    """
    page_images = urlpdf_pages_to_base64(pdf_url, page_num)

    # 1-based page number -> extracted content, filled as tasks finish.
    page_contents = {}

    with concurrent.futures.ThreadPoolExecutor() as pool:
        # Remember which page each future belongs to.
        page_of = {}
        for page_no, image in enumerate(page_images, start=1):
            page_of[pool.submit(getAiOcrBase64, prompt, image)] = page_no

        for finished in concurrent.futures.as_completed(page_of):
            page_no = page_of[finished]
            try:
                page_contents[page_no] = finished.result()
            except Exception as exc:
                print(f'生成第 {page_no} 页内容时发生错误: {exc}')

    # Tasks complete in arbitrary order, so emit sections sorted by page.
    pieces = ["# 提取结果\n\n"]
    for page_no in sorted(page_contents):
        pieces.append(f"## 第 {page_no} 页\n\n")
        pieces.append(f" {page_contents[page_no]}\n\n")

    return "".join(pieces)
|
|
|
+
|
|
|
+
|
|
|
if __name__ == "__main__":
    # Manual smoke run: process the first three pages of a sample scanned
    # PDF and print the resulting Markdown.
    sample_url = 'http://42.194.163.46:9007/ywd/%E6%89%AB%E6%8F%8F%E5%85%A8%E8%83%BD%E7%8E%8B%202025-03-20%2021.38.pdf'
    sample_prompt = "提取有效信息,表格信息以表格输出,去掉信息提示和页码相关信息。"

    print(pdf2ai(sample_url, sample_prompt, page_num=3))
|