zrd 3 miesięcy temu
rodzic
commit
9b356b6fff
9 zmienionych plików z 310 dodań i 1 usunięć
  1. 1 0
      README.md
  2. 1 1
      docker-compose.yml
  3. 20 0
      main.py
  4. BIN
      requirements.txt
  5. 2 0
      tool/__init__.py
  6. 95 0
      tool/pdf2md.py
  7. 95 0
      tool/pdf2md222.py
  8. BIN
      tool/test_ocr.pdf
  9. 96 0
      tool/tyqw.py

+ 1 - 0
README.md

@@ -0,0 +1 @@
+#执行版本 pip install --no-cache-dir -r requirements.txt

+ 1 - 1
docker-compose.yml

@@ -3,5 +3,5 @@ services:
   my_python_app:
     build: .
     ports:
-      - "29015:29015"
+      - "9020:29015"
     restart: always   

+ 20 - 0
main.py

@@ -1,6 +1,8 @@
+import uvicorn
 from fastapi import FastAPI
 from pydantic import BaseModel
 
+from tool.pdf2md import pdf2ai
 from utils import dmQuery
 from utils.shellUtil import execute_command_on_linux
 
@@ -19,6 +21,11 @@ class Item(BaseModel):
     sql: str
 
 
+class Pdf2md(BaseModel):
+    # Request body of the PUT /pdf2md endpoint; the three fields are forwarded
+    # unchanged to tool.pdf2md.pdf2ai.
+    pdfUrl: str  # URL the PDF is downloaded from
+    prompt: str  # instruction text sent to the OCR model together with each page image
+    pageNum: int  # upper bound on how many leading pages are rendered and processed
+
 
 @app.put("/exsql")
 async def create_item( item: Item, q: str = None):
@@ -42,3 +49,16 @@ async def create_item( item: Item, q: str = None):
 
     return result
 
+
+@app.put("/pdf2md")
+async def pdf2md(item: Pdf2md, q: str = None):
+    """Convert the leading pages of a remote PDF to Markdown via the OCR model.
+
+    Args:
+        item: Request body carrying the PDF URL, the prompt and the page limit.
+        q: Unused; kept so the signature stays the same as ``create_item``.
+
+    Returns:
+        The Markdown string produced by ``pdf2ai``.
+    """
+    # Local import keeps this block self-contained; fastapi is already a
+    # dependency of this module.
+    from fastapi.concurrency import run_in_threadpool
+
+    # NOTE(review): item.pdfUrl is fetched server-side exactly as supplied by
+    # the client; consider restricting the allowed hosts.
+    # pdf2ai blocks on an HTTP download and on model calls, so it is run in the
+    # threadpool to keep the event loop free for other requests.
+    return await run_in_threadpool(pdf2ai, item.pdfUrl, item.prompt, item.pageNum)
+
+
+if __name__ == "__main__":
+    # Start the API with uvicorn on all interfaces when the file is run as a
+    # script. Port 29015 is the container port that docker-compose.yml
+    # publishes on the host.
+    uvicorn.run(app, host="0.0.0.0", port=29015, timeout_keep_alive=600)

BIN
requirements.txt


+ 2 - 0
tool/__init__.py

@@ -0,0 +1,2 @@
+from . import pdf2md
+from . import tyqw

+ 95 - 0
tool/pdf2md.py

@@ -0,0 +1,95 @@
+import base64
+import concurrent.futures
+
+import fitz
+import requests
+
+from tool.tyqw import getAiOcrBase64
+
+
+def pdf_pages_to_base64(pdf_path):
+    """Render every page of a local PDF to a Base64-encoded PNG string.
+
+    Args:
+        pdf_path: Path of the PDF file, handed to ``fitz.open``.
+
+    Returns:
+        A list with one Base64 string (decoded as UTF-8 text) per page, in
+        page order.
+    """
+    # The context manager closes the document even when rendering a page
+    # raises, which a trailing close() call would not.
+    with fitz.open(pdf_path) as pdf_document:
+        return [
+            base64.b64encode(
+                # Render the page to a pixmap and serialise it as PNG bytes.
+                pdf_document.load_page(page_number).get_pixmap().tobytes(output="png")
+            ).decode('utf-8')
+            for page_number in range(pdf_document.page_count)
+        ]
+
+
+def urlpdf_pages_to_base64(pdf_url, page_num, timeout=60):
+    """Download a PDF and render its leading pages to Base64-encoded PNG strings.
+
+    Args:
+        pdf_url: URL the PDF is fetched from with an HTTP GET.
+        page_num: Maximum number of pages to render, counted from the first
+            page; the document's page count is used when it is smaller.
+        timeout: Seconds passed to ``requests.get`` so an unresponsive server
+            cannot block the caller indefinitely.
+
+    Returns:
+        A list with one Base64 string (decoded as UTF-8 text) per rendered
+        page, in page order.
+
+    Raises:
+        requests.exceptions.RequestException: If the download fails, times
+            out, or the server answers with an error status.
+    """
+    response = requests.get(pdf_url, timeout=timeout)
+    response.raise_for_status()  # surface HTTP error statuses as exceptions
+
+    # Open the downloaded bytes in memory; the context manager closes the
+    # document even when rendering a page raises.
+    with fitz.open(stream=response.content, filetype="pdf") as pdf_document:
+        page_total = min(pdf_document.page_count, page_num)
+        return [
+            base64.b64encode(
+                # Render the page to a pixmap and serialise it as PNG bytes.
+                pdf_document.load_page(page_number).get_pixmap().tobytes(output="png")
+            ).decode('utf-8')
+            for page_number in range(page_total)
+        ]
+
+
+def pdf2ai(pdf_url, prompt, page_num=10):
+    """Run the OCR model over the leading pages of a remote PDF and build Markdown.
+
+    Args:
+        pdf_url: URL of the PDF to download.
+        prompt: Instruction sent to the model together with every page image.
+        page_num: Maximum number of leading pages to process.
+
+    Returns:
+        A Markdown string with a title followed by one section per page that
+        was processed successfully, ordered by page number. Pages whose model
+        call raised are reported on stdout and left out of the result.
+    """
+    page_images = urlpdf_pages_to_base64(pdf_url, page_num)
+
+    page_results = []
+    # One model call per page, executed concurrently on worker threads.
+    with concurrent.futures.ThreadPoolExecutor() as pool:
+        pending = {}
+        for page_no, image in enumerate(page_images, start=1):
+            pending[pool.submit(getAiOcrBase64, prompt, image)] = page_no
+
+        # Futures finish in arbitrary order; remember the page each belongs to.
+        for done in concurrent.futures.as_completed(pending):
+            page_no = pending[done]
+            try:
+                text = done.result()
+            except Exception as exc:
+                print(f'生成第 {page_no} 页内容时发生错误: {exc}')
+            else:
+                page_results.append((page_no, text))
+
+    # Restore page order before assembling the document.
+    sections = ["# 提取结果\n\n"]
+    for page_no, text in sorted(page_results, key=lambda pair: pair[0]):
+        sections.append(f"## 第 {page_no} 页\n\n")
+        sections.append(f" {text}\n\n")
+
+    return "".join(sections)
+
+
+if __name__ == "__main__":
+    # Manual smoke test: run the pipeline on the first 3 pages of a sample PDF
+    # and print the resulting Markdown.
+    pdf_url = 'http://42.194.163.46:9007/ywd/%E6%89%AB%E6%8F%8F%E5%85%A8%E8%83%BD%E7%8E%8B%202025-03-20%2021.38.pdf'
+    prompt = "提取有效信息,表格信息以表格输出,去掉信息提示和页码相关信息。"
+
+    result = pdf2ai(pdf_url, prompt, 3)
+    print(result)

+ 95 - 0
tool/pdf2md222.py

@@ -0,0 +1,95 @@
+import base64
+import concurrent.futures
+
+import fitz
+import requests
+
+from tool.tyqw import getAiOcrBase64
+
+
+def pdf_pages_to_base64(pdf_path):
+    """Render every page of a local PDF to a Base64-encoded PNG string.
+
+    Args:
+        pdf_path: Path of the PDF file, handed to ``fitz.open``.
+
+    Returns:
+        A list with one Base64 string (decoded as UTF-8 text) per page, in
+        page order.
+    """
+    # The context manager closes the document even when rendering a page
+    # raises, which a trailing close() call would not.
+    with fitz.open(pdf_path) as pdf_document:
+        return [
+            base64.b64encode(
+                # Render the page to a pixmap and serialise it as PNG bytes.
+                pdf_document.load_page(page_number).get_pixmap().tobytes(output="png")
+            ).decode('utf-8')
+            for page_number in range(pdf_document.page_count)
+        ]
+
+
+def urlpdf_pages_to_base64(pdf_url, page_num, timeout=60):
+    """Download a PDF and render its leading pages to Base64-encoded PNG strings.
+
+    Args:
+        pdf_url: URL the PDF is fetched from with an HTTP GET.
+        page_num: Maximum number of pages to render, counted from the first
+            page; the document's page count is used when it is smaller.
+        timeout: Seconds passed to ``requests.get`` so an unresponsive server
+            cannot block the caller indefinitely.
+
+    Returns:
+        A list with one Base64 string (decoded as UTF-8 text) per rendered
+        page, in page order.
+
+    Raises:
+        requests.exceptions.RequestException: If the download fails, times
+            out, or the server answers with an error status.
+    """
+    response = requests.get(pdf_url, timeout=timeout)
+    response.raise_for_status()  # surface HTTP error statuses as exceptions
+
+    # Open the downloaded bytes in memory; the context manager closes the
+    # document even when rendering a page raises.
+    with fitz.open(stream=response.content, filetype="pdf") as pdf_document:
+        page_total = min(pdf_document.page_count, page_num)
+        return [
+            base64.b64encode(
+                # Render the page to a pixmap and serialise it as PNG bytes.
+                pdf_document.load_page(page_number).get_pixmap().tobytes(output="png")
+            ).decode('utf-8')
+            for page_number in range(page_total)
+        ]
+
+
+def pdf2ai(pdf_url, prompt, page_num=10):
+    """Run the OCR model over the leading pages of a remote PDF and build Markdown.
+
+    Args:
+        pdf_url: URL of the PDF to download.
+        prompt: Instruction sent to the model together with every page image.
+        page_num: Maximum number of leading pages to process.
+
+    Returns:
+        A Markdown string with a title followed by one section per page that
+        was processed successfully, ordered by page number. Pages whose model
+        call raised are reported on stdout and left out of the result.
+    """
+    page_images = urlpdf_pages_to_base64(pdf_url, page_num)
+
+    page_results = []
+    # One model call per page, executed concurrently on worker threads.
+    with concurrent.futures.ThreadPoolExecutor() as pool:
+        pending = {}
+        for page_no, image in enumerate(page_images, start=1):
+            pending[pool.submit(getAiOcrBase64, prompt, image)] = page_no
+
+        # Futures finish in arbitrary order; remember the page each belongs to.
+        for done in concurrent.futures.as_completed(pending):
+            page_no = pending[done]
+            try:
+                text = done.result()
+            except Exception as exc:
+                print(f'生成第 {page_no} 页内容时发生错误: {exc}')
+            else:
+                page_results.append((page_no, text))
+
+    # Restore page order before assembling the document.
+    sections = ["# 提取结果\n\n"]
+    for page_no, text in sorted(page_results, key=lambda pair: pair[0]):
+        sections.append(f"## 第 {page_no} 页\n\n")
+        sections.append(f" {text}\n\n")
+
+    return "".join(sections)
+
+
+if __name__ == "__main__":
+    # Manual smoke test: run the pipeline on the first 3 pages of a sample PDF
+    # and print the resulting Markdown.
+    # NOTE(review): the added lines of tool/pdf2md222.py in this commit are
+    # identical to those of tool/pdf2md.py, and main.py imports only
+    # tool.pdf2md — this looks like a leftover copy; confirm and consider
+    # deleting it.
+    pdf_url = 'http://42.194.163.46:9007/ywd/%E6%89%AB%E6%8F%8F%E5%85%A8%E8%83%BD%E7%8E%8B%202025-03-20%2021.38.pdf'
+    prompt = "提取有效信息,表格信息以表格输出,去掉信息提示和页码相关信息。"
+
+    result = pdf2ai(pdf_url, prompt, 3)
+    print(result)

BIN
tool/test_ocr.pdf


+ 96 - 0
tool/tyqw.py

@@ -0,0 +1,96 @@
+import base64
+
+from openai import OpenAI
+
+
+# Read a local image file and return its contents Base64-encoded.
+def encode_image_to_base64(image_path, ):
+    """Return the bytes of the file at ``image_path`` as a Base64 text string."""
+    with open(image_path, "rb") as handle:
+        raw_bytes = handle.read()
+    encoded = base64.b64encode(raw_bytes)
+    return encoded.decode("utf-8")
+
+
+def getAiOcr(prompt, image_path):
+    """Send a local image to the qwen-vl-plus model and return the reply text.
+
+    Args:
+        prompt: Instruction for the model; ``None`` selects a default
+            "convert to Markdown" prompt.
+        image_path: Path of the image file; ``None`` selects the sample path.
+
+    Returns:
+        The message content of the first completion choice. The raw JSON
+        response and that content are also printed to stdout.
+    """
+    import mimetypes
+    import os
+
+    # SECURITY(review): the fallback key below is committed to the repository.
+    # Set DASHSCOPE_API_KEY, rotate this key, and then drop the default.
+    client = OpenAI(
+        api_key=os.getenv("DASHSCOPE_API_KEY", "sk-383770c29f4c41f0bec52a640af80e64"),
+        base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
+    )
+
+    # Keyed on image_path (not prompt) so that a caller-supplied path is never
+    # discarded just because the prompt was omitted.
+    if image_path is None:
+        image_path = "G:\\wd\\sfz33.jpg"  # sample path; replace with a local image
+
+    if prompt is None:
+        prompt = "Convert the provided image of a PDF document strictly into valid Markdown. If there are mathematical formulas in the document, use Mathjax."
+
+    image_base64 = encode_image_to_base64(image_path)
+
+    # Declare the media type implied by the file extension; keep JPEG as the
+    # fallback when the extension is unknown or not an image type.
+    guessed_type = mimetypes.guess_type(image_path)[0]
+    if guessed_type is None or not guessed_type.startswith("image/"):
+        guessed_type = "image/jpeg"
+
+    completion = client.chat.completions.create(
+        model="qwen-vl-plus",  # example model; can be swapped for another one
+        messages=[
+            {"role": "system", "content": "You are a tool to parse documents."},
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": prompt},
+                    {
+                        "type": "image_url",
+                        # The image travels inline as a Base64 data URL.
+                        "image_url": {"url": f"data:{guessed_type};base64,{image_base64}"}
+                    }
+                ]
+            }
+        ]
+    )
+
+    # Print the raw response and the extracted text.
+    print(completion.model_dump_json())
+    print(completion.choices[0].message.content)
+    return completion.choices[0].message.content
+
+
+def getAiOcrBase64(prompt, image_base64):
+    """Send a Base64-encoded image to the qwen-vl-plus model and return the reply text.
+
+    Args:
+        prompt: Instruction for the model; ``None`` selects a default
+            "convert to Markdown" prompt.
+        image_base64: Base64 text of the image bytes.
+
+    Returns:
+        The message content of the first completion choice.
+    """
+    import os
+
+    # SECURITY(review): the fallback key below is committed to the repository.
+    # Set DASHSCOPE_API_KEY, rotate this key, and then drop the default.
+    client = OpenAI(
+        api_key=os.getenv("DASHSCOPE_API_KEY", "sk-383770c29f4c41f0bec52a640af80e64"),
+        base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
+    )
+
+    if prompt is None:
+        prompt = "Convert the provided image of a PDF document strictly into valid Markdown. If there are mathematical formulas in the document, use Mathjax."
+
+    # The 8-byte PNG signature always encodes to this Base64 prefix; anything
+    # else keeps the previous JPEG label.
+    mime_type = "image/png" if image_base64.startswith("iVBORw0KGgo") else "image/jpeg"
+
+    completion = client.chat.completions.create(
+        model="qwen-vl-plus",  # example model; can be swapped for another one
+        messages=[
+            {"role": "system", "content": "You are a tool to parse documents."},
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": prompt},
+                    {
+                        "type": "image_url",
+                        # The image travels inline as a Base64 data URL.
+                        "image_url": {"url": f"data:{mime_type};base64,{image_base64}"}
+                    }
+                ]
+            }
+        ]
+    )
+
+    return completion.choices[0].message.content
+
+
+if __name__ == "__main__":
+    # Manual smoke test against a local sample image (Windows path); getAiOcr
+    # prints the model response itself, so the return value is not used here.
+    getAiOcr('提取有效信息', "G:\\wd\\sfz33.jpg")