10 miesięcy temu · 9b356b6fff
--- a/README.md
+++ b/README.md
@@ -0,0 +1 @@
 
				+#执行版本 pip install --no-cache-dir -r requirements.txt
			
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -3,5 +3,5 @@ services:
 
				   my_python_app:
			
 
				     build: .
			
 
				     ports:
			
 
				-      - "29015:29015"
			
 
				+      - "9020:29015"
			
 
				     restart: always   
			
--- a/main.py
+++ b/main.py
@@ -1,6 +1,8 @@
 
				+import uvicorn
			
 
				 from fastapi import FastAPI
			
 
				 from pydantic import BaseModel
			
 
				 
			
 
				+from tool.pdf2md import pdf2ai
			
 
				 from utils import dmQuery
			
 
				 from utils.shellUtil import execute_command_on_linux
			
 
				 
			
@@ -19,6 +21,11 @@ class Item(BaseModel):
 
				     sql: str
			
 
				 
			
 
				 
			
 
				class Pdf2md(BaseModel):
				    """Request body of the ``PUT /pdf2md`` endpoint."""

				    # URL the PDF is downloaded from.
				    pdfUrl: str
				    # Instruction sent to the vision model together with each page image.
				    prompt: str
				    # Maximum number of leading pages to convert. Defaults to the same
				    # limit as ``pdf2ai`` so clients may omit it.
				    pageNum: int = 10
			
 
				+
			
 
				 
			
 
				 @app.put("/exsql")
			
 
				 async def create_item( item: Item, q: str = None):
			
@@ -42,3 +49,16 @@ async def create_item( item: Item, q: str = None):
 
				 
			
 
				     return result
			
 
				 
			
 
				+
			
 
				@app.put("/pdf2md")
				async def pdf2md(item: Pdf2md, q: str = None):
				    """Convert the leading pages of a remote PDF to Markdown.

				    Args:
				        item: Request body carrying the PDF URL, the extraction prompt
				            and the page limit.
				        q: Unused; retained so the endpoint signature stays unchanged.

				    Returns:
				        The Markdown string produced by ``pdf2ai``.
				    """
				    import asyncio

				    # NOTE(review): ``item.pdfUrl`` is fetched by the server as-is. If this
				    # endpoint is reachable by untrusted clients, restrict the allowed hosts.

				    # ``pdf2ai`` does blocking HTTP and model calls, so run it in the
				    # default executor instead of stalling the event loop.
				    loop = asyncio.get_running_loop()
				    return await loop.run_in_executor(
				        None, pdf2ai, item.pdfUrl, item.prompt, item.pageNum
				    )
			
 
				+
			
 
				+
			
 
				if __name__ == "__main__":
				    # Serve the FastAPI application when this file is run as a script.
				    uvicorn.run(
				        app,
				        host="0.0.0.0",
				        port=29015,
				        timeout_keep_alive=600,
				    )
			
--- a/requirements.txt
+++ b/requirements.txt
--- a/tool/__init__.py
+++ b/tool/__init__.py
@@ -0,0 +1,2 @@
 
				+from . import pdf2md
			
 
				+from . import tyqw
			
--- a/tool/pdf2md.py
+++ b/tool/pdf2md.py
@@ -0,0 +1,95 @@
 
				+import base64
			
 
				+import concurrent.futures
			
 
				+
			
 
				+import fitz
			
 
				+import requests
			
 
				+
			
 
				+from tool.tyqw import getAiOcrBase64
			
 
				+
			
 
				+
			
 
				def pdf_pages_to_base64(pdf_path):
				    """Render every page of a local PDF to a Base64-encoded PNG.

				    Args:
				        pdf_path: Path of the PDF file, passed to ``fitz.open``.

				    Returns:
				        A list with one Base64 string per page, in page order.
				    """
				    pdf_document = fitz.open(pdf_path)
				    try:
				        base64_images = []
				        for page_number in range(pdf_document.page_count):
				            # Rasterize the page, then serialize the pixmap as PNG bytes.
				            pix = pdf_document.load_page(page_number).get_pixmap()
				            png_bytes = pix.tobytes(output="png")
				            base64_images.append(base64.b64encode(png_bytes).decode("utf-8"))
				        return base64_images
				    finally:
				        # Release the document even when rendering a page fails.
				        pdf_document.close()
			
 
				+
			
 
				+
			
 
				def urlpdf_pages_to_base64(pdf_url, page_num):
				    """Download a PDF and render its leading pages to Base64-encoded PNGs.

				    Args:
				        pdf_url: URL the PDF is fetched from with an HTTP GET.
				        page_num: Upper bound on the number of pages rendered; the
				            document's page count is used when it is smaller.

				    Returns:
				        A list of Base64 strings, one per rendered page, in page order.

				    Raises:
				        requests.HTTPError: If the server answers with an error status.
				        requests.Timeout: If connecting or reading stalls past the timeout.
				    """
				    # (connect, read) timeouts so an unresponsive host cannot block the
				    # caller indefinitely.
				    response = requests.get(pdf_url, timeout=(10, 60))
				    response.raise_for_status()

				    # Open the downloaded bytes as an in-memory PDF.
				    pdf_document = fitz.open(stream=response.content, filetype="pdf")
				    try:
				        base64_images = []
				        for page_number in range(min(pdf_document.page_count, page_num)):
				            # Rasterize the page, then serialize the pixmap as PNG bytes.
				            pix = pdf_document.load_page(page_number).get_pixmap()
				            png_bytes = pix.tobytes(output="png")
				            base64_images.append(base64.b64encode(png_bytes).decode("utf-8"))
				        return base64_images
				    finally:
				        # Release the document even when rendering a page fails.
				        pdf_document.close()
			
 
				+
			
 
				+
			
 
				def pdf2ai(pdf_url, prompt, page_num=10):
				    """Run the vision model over a remote PDF's pages and build Markdown.

				    Args:
				        pdf_url: URL of the PDF, handed to ``urlpdf_pages_to_base64``.
				        prompt: Instruction passed to ``getAiOcrBase64`` for every page.
				        page_num: Maximum number of leading pages to process.

				    Returns:
				        A Markdown string with a title followed by one section per page
				        that was processed successfully, ordered by page number. Pages
				        whose model call raised are reported on stdout and left out.
				    """
				    page_images = urlpdf_pages_to_base64(pdf_url, page_num)

				    page_contents = {}
				    with concurrent.futures.ThreadPoolExecutor() as pool:
				        # One task per page; remember which 1-based page each future is for.
				        pending = {}
				        for page_no, page_image in enumerate(page_images, start=1):
				            pending[pool.submit(getAiOcrBase64, prompt, page_image)] = page_no

				        for done in concurrent.futures.as_completed(pending):
				            page_no = pending[done]
				            try:
				                page_contents[page_no] = done.result()
				            except Exception as exc:
				                print(f'生成第 {page_no} 页内容时发生错误: {exc}')

				    # Assemble the sections in page order.
				    sections = ["# 提取结果\n\n"]
				    for page_no in sorted(page_contents):
				        sections.append(f"## 第 {page_no} 页\n\n")
				        sections.append(f" {page_contents[page_no]}\n\n")
				    return "".join(sections)
			
 
				+
			
 
				+
			
 
				if __name__ == "__main__":
				    # Manual smoke test: convert the first three pages of a sample scan.
				    sample_url = 'http://42.194.163.46:9007/ywd/%E6%89%AB%E6%8F%8F%E5%85%A8%E8%83%BD%E7%8E%8B%202025-03-20%2021.38.pdf'
				    sample_prompt = "提取有效信息，表格信息以表格输出，去掉信息提示和页码相关信息。"
				    print(pdf2ai(sample_url, sample_prompt, 3))
			
--- a/tool/pdf2md222.py
+++ b/tool/pdf2md222.py
@@ -0,0 +1,95 @@
 
				+import base64
			
 
				+import concurrent.futures
			
 
				+
			
 
				+import fitz
			
 
				+import requests
			
 
				+
			
 
				+from tool.tyqw import getAiOcrBase64
			
 
				+
			
 
				+
			
 
				def pdf_pages_to_base64(pdf_path):
				    """Render every page of a local PDF to a Base64-encoded PNG.

				    Args:
				        pdf_path: Path of the PDF file, passed to ``fitz.open``.

				    Returns:
				        A list with one Base64 string per page, in page order.
				    """
				    pdf_document = fitz.open(pdf_path)
				    try:
				        base64_images = []
				        for page_number in range(pdf_document.page_count):
				            # Rasterize the page, then serialize the pixmap as PNG bytes.
				            pix = pdf_document.load_page(page_number).get_pixmap()
				            png_bytes = pix.tobytes(output="png")
				            base64_images.append(base64.b64encode(png_bytes).decode("utf-8"))
				        return base64_images
				    finally:
				        # Release the document even when rendering a page fails.
				        pdf_document.close()
			
 
				+
			
 
				+
			
 
				def urlpdf_pages_to_base64(pdf_url, page_num):
				    """Download a PDF and render its leading pages to Base64-encoded PNGs.

				    Args:
				        pdf_url: URL the PDF is fetched from with an HTTP GET.
				        page_num: Upper bound on the number of pages rendered; the
				            document's page count is used when it is smaller.

				    Returns:
				        A list of Base64 strings, one per rendered page, in page order.

				    Raises:
				        requests.HTTPError: If the server answers with an error status.
				        requests.Timeout: If connecting or reading stalls past the timeout.
				    """
				    # (connect, read) timeouts so an unresponsive host cannot block the
				    # caller indefinitely.
				    response = requests.get(pdf_url, timeout=(10, 60))
				    response.raise_for_status()

				    # Open the downloaded bytes as an in-memory PDF.
				    pdf_document = fitz.open(stream=response.content, filetype="pdf")
				    try:
				        base64_images = []
				        for page_number in range(min(pdf_document.page_count, page_num)):
				            # Rasterize the page, then serialize the pixmap as PNG bytes.
				            pix = pdf_document.load_page(page_number).get_pixmap()
				            png_bytes = pix.tobytes(output="png")
				            base64_images.append(base64.b64encode(png_bytes).decode("utf-8"))
				        return base64_images
				    finally:
				        # Release the document even when rendering a page fails.
				        pdf_document.close()
			
 
				+
			
 
				+
			
 
				def pdf2ai(pdf_url, prompt, page_num=10):
				    """Run the vision model over a remote PDF's pages and build Markdown.

				    Args:
				        pdf_url: URL of the PDF, handed to ``urlpdf_pages_to_base64``.
				        prompt: Instruction passed to ``getAiOcrBase64`` for every page.
				        page_num: Maximum number of leading pages to process.

				    Returns:
				        A Markdown string with a title followed by one section per page
				        that was processed successfully, ordered by page number. Pages
				        whose model call raised are reported on stdout and left out.
				    """
				    page_images = urlpdf_pages_to_base64(pdf_url, page_num)

				    page_contents = {}
				    with concurrent.futures.ThreadPoolExecutor() as pool:
				        # One task per page; remember which 1-based page each future is for.
				        pending = {}
				        for page_no, page_image in enumerate(page_images, start=1):
				            pending[pool.submit(getAiOcrBase64, prompt, page_image)] = page_no

				        for done in concurrent.futures.as_completed(pending):
				            page_no = pending[done]
				            try:
				                page_contents[page_no] = done.result()
				            except Exception as exc:
				                print(f'生成第 {page_no} 页内容时发生错误: {exc}')

				    # Assemble the sections in page order.
				    sections = ["# 提取结果\n\n"]
				    for page_no in sorted(page_contents):
				        sections.append(f"## 第 {page_no} 页\n\n")
				        sections.append(f" {page_contents[page_no]}\n\n")
				    return "".join(sections)
			
 
				+
			
 
				+
			
 
				if __name__ == "__main__":
				    # Manual smoke test: convert the first three pages of a sample scan.
				    sample_url = 'http://42.194.163.46:9007/ywd/%E6%89%AB%E6%8F%8F%E5%85%A8%E8%83%BD%E7%8E%8B%202025-03-20%2021.38.pdf'
				    sample_prompt = "提取有效信息，表格信息以表格输出，去掉信息提示和页码相关信息。"
				    print(pdf2ai(sample_url, sample_prompt, 3))
			
--- a/tool/test_ocr.pdf
+++ b/tool/test_ocr.pdf
--- a/tool/tyqw.py
+++ b/tool/tyqw.py
@@ -0,0 +1,96 @@
 
				+import base64
			
 
				+
			
 
				+from openai import OpenAI
			
 
				+
			
 
				+
			
 
				def encode_image_to_base64(image_path):
				    """Read a local file and return its contents as Base64 text.

				    Args:
				        image_path: Path of the file to read in binary mode.

				    Returns:
				        The Base64 encoding of the file's bytes, as a ``str``.
				    """
				    with open(image_path, "rb") as fh:
				        raw = fh.read()
				    encoded = base64.b64encode(raw)
				    return encoded.decode("utf-8")
			
 
				+
			
 
				+
			
 
				def getAiOcr(prompt, image_path):
				    """Send a local image to the ``qwen-vl-plus`` model and return its reply.

				    Args:
				        prompt: Instruction for the model. ``None`` selects a default
				            prompt asking for a Markdown conversion.
				        image_path: Path of the image file. ``None`` falls back to a
				            hard-coded sample path.

				    Returns:
				        The text content of the first completion choice.
				    """
				    import mimetypes
				    import os

				    # SECURITY: prefer the DASHSCOPE_API_KEY environment variable. The
				    # literal fallback only keeps existing deployments working; the key
				    # has been committed to version control and should be rotated and
				    # removed from the source.
				    client = OpenAI(
				        api_key=os.getenv("DASHSCOPE_API_KEY", "sk-383770c29f4c41f0bec52a640af80e64"),
				        base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
				    )

				    # Fall back to the sample image only when no path was supplied. The
				    # previous check tested ``prompt`` and so discarded the caller's path
				    # whenever the default prompt was requested.
				    if image_path is None:
				        image_path = "G:\\wd\\sfz33.jpg"

				    image_base64 = encode_image_to_base64(image_path)

				    # Label the data URL with the file's real image type; keep JPEG as the
				    # fallback when the extension is unknown or not an image type.
				    mime_type = mimetypes.guess_type(image_path)[0] or "image/jpeg"
				    if not mime_type.startswith("image/"):
				        mime_type = "image/jpeg"

				    if prompt is None:
				        prompt = "Convert the provided image of a PDF document strictly into valid Markdown. If there are mathematical formulas in the document, use Mathjax."

				    completion = client.chat.completions.create(
				        model="qwen-vl-plus",  # Example model; swap in another model name as needed.
				        messages=[
				            {"role": "system", "content": "You are a tool to parse documents."},
				            {
				                "role": "user",
				                "content": [
				                    {"type": "text", "text": prompt},
				                    {
				                        "type": "image_url",
				                        # The image travels inline as a Base64 data URL.
				                        "image_url": {"url": f"data:{mime_type};base64,{image_base64}"},
				                    },
				                ],
				            },
				        ],
				    )

				    # Echo the raw response and the extracted text to stdout.
				    print(completion.model_dump_json())
				    print(completion.choices[0].message.content)
				    return completion.choices[0].message.content
			
 
				+
			
 
				+
			
 
				def getAiOcrBase64(prompt, image_base64):
				    """Send a Base64-encoded image to ``qwen-vl-plus`` and return its reply.

				    Args:
				        prompt: Instruction for the model. ``None`` selects a default
				            prompt asking for a Markdown conversion.
				        image_base64: Base64 text of the image to analyse.

				    Returns:
				        The text content of the first completion choice.
				    """
				    import os

				    # SECURITY: prefer the DASHSCOPE_API_KEY environment variable. The
				    # literal fallback only keeps existing deployments working; the key
				    # has been committed to version control and should be rotated and
				    # removed from the source.
				    client = OpenAI(
				        api_key=os.getenv("DASHSCOPE_API_KEY", "sk-383770c29f4c41f0bec52a640af80e64"),
				        base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
				    )

				    if prompt is None:
				        prompt = "Convert the provided image of a PDF document strictly into valid Markdown. If there are mathematical formulas in the document, use Mathjax."

				    # "iVBORw0KGg" is the Base64 form of the PNG file signature. PNG
				    # payloads were previously labelled JPEG; everything else keeps the
				    # original JPEG label.
				    if image_base64.startswith("iVBORw0KGg"):
				        mime_type = "image/png"
				    else:
				        mime_type = "image/jpeg"

				    completion = client.chat.completions.create(
				        model="qwen-vl-plus",  # Example model; swap in another model name as needed.
				        messages=[
				            {"role": "system", "content": "You are a tool to parse documents."},
				            {
				                "role": "user",
				                "content": [
				                    {"type": "text", "text": prompt},
				                    {
				                        "type": "image_url",
				                        # The image travels inline as a Base64 data URL.
				                        "image_url": {"url": f"data:{mime_type};base64,{image_base64}"},
				                    },
				                ],
				            },
				        ],
				    )

				    return completion.choices[0].message.content
			
 
				+
			
 
				+
			
 
				if __name__ == "__main__":
				    # Manual check against a local sample image.
				    getAiOcr(prompt='提取有效信息', image_path="G:\\wd\\sfz33.jpg")
		`@@ -0,0 +1 @@`
		`+#执行版本 pip install --no-cache-dir -r requirements.txt`