|
@@ -0,0 +1,97 @@
|
|
|
|
+import os
|
|
|
|
+from datetime import datetime
|
|
|
|
+
|
|
|
|
+import fitz # PyMuPDF
|
|
|
|
+from minio import Minio
|
|
|
|
+from minio.error import S3Error
|
|
|
|
+
|
|
|
|
+
|
|
|
|
def pdf_to_images(pdf_path, output_folder, dpi=300):
    """
    Render every page of a PDF to a PNG file.

    Images are written to ``output_folder`` as ``page_<n>.png``, with ``n``
    counting from 1 in page order.

    :param pdf_path: path of the PDF file to read
    :param output_folder: directory receiving the images (created if missing)
    :param dpi: rendering resolution in dots per inch
    :return: list of the generated image paths, in page order
    """
    # Make sure the target directory exists before anything is written.
    os.makedirs(output_folder, exist_ok=True)

    image_paths = []
    pdf_document = fitz.open(pdf_path)
    try:
        for page_index in range(len(pdf_document)):
            page_number = page_index + 1  # file names and messages are 1-based
            page = pdf_document.load_page(page_index)

            # Only ``dpi`` is passed: PyMuPDF ignores ``matrix`` whenever
            # ``dpi`` is given, so supplying both was redundant.
            pix = page.get_pixmap(dpi=dpi, colorspace="rgb")

            image_name = f"page_{page_number}.png"
            image_path = os.path.join(output_folder, image_name)

            pix.save(image_path)
            image_paths.append(image_path)
            print(f"已生成第 {page_number} 页图片: {image_path}")
    finally:
        # Release the document even when rendering or saving a page fails.
        pdf_document.close()

    return image_paths
|
|
|
|
+
|
|
|
|
def upload_to_minio(
    image_paths,
    bucket_name,
    *,
    endpoint="localhost:9000",
    access_key="your-access-key",
    secret_key="your-secret-key",
    secure=False,
):
    """
    Upload image files to a MinIO bucket.

    The bucket is created when it does not exist yet. Each file is stored as
    ``pdf_images/<YYYYMMDD>/<file name>``, where the date is the local date
    taken once when the upload starts. An upload that raises ``S3Error`` is
    reported on stdout and the remaining files are still attempted.

    :param image_paths: list of local image file paths to upload
    :param bucket_name: name of the target MinIO bucket
    :param endpoint: MinIO server address as ``host:port``
    :param access_key: access key used to authenticate
    :param secret_key: secret key used to authenticate
    :param secure: whether to connect over HTTPS
    """
    minio_client = Minio(
        endpoint=endpoint,
        access_key=access_key,
        secret_key=secret_key,
        secure=secure,
    )

    # Make sure the target bucket exists.
    if not minio_client.bucket_exists(bucket_name):
        minio_client.make_bucket(bucket_name)
        print(f"创建存储桶: {bucket_name}")

    # Compute the date prefix once so that every file of this run lands under
    # the same prefix, even if the uploads run past midnight.
    date_prefix = datetime.now().strftime('%Y%m%d')

    for image_path in image_paths:
        object_name = f"pdf_images/{date_prefix}/{os.path.basename(image_path)}"

        try:
            minio_client.fput_object(
                bucket_name=bucket_name,
                object_name=object_name,
                file_path=image_path,
                content_type="image/png",
            )
            print(f"成功上传: {object_name}")
        except S3Error as exc:
            # Best effort: report the failure and continue with the next file.
            print(f"上传失败: {exc}")
|
|
|
|
+
|
|
|
|
if __name__ == "__main__":
    # Script configuration.
    PDF_PATH = "G:\\wx\\33333333.pdf"  # input PDF file
    OUTPUT_FOLDER = "G:\\wx\\temp_images"  # directory for the rendered page images
    BUCKET_NAME = "documents"  # target MinIO bucket

    # Step 1: render every page of the PDF to an image file.
    images = pdf_to_images(pdf_path=PDF_PATH, output_folder=OUTPUT_FOLDER)

    # Step 2 (currently disabled): push the rendered pages to MinIO.
    # upload_to_minio(images, BUCKET_NAME)

    # Optional cleanup (currently disabled): delete every file listed in
    # `images` with os.remove(), then remove OUTPUT_FOLDER with os.rmdir().
|