import os
from datetime import datetime

import fitz  # PyMuPDF
from minio import Minio
from minio.error import S3Error


def pdf_to_images(pdf_path: str, output_folder: str, dpi: int = 300) -> list:
    """
    Render every page of a PDF to a PNG file.

    Files are written as ``page_<n>.png`` (1-based page number) inside
    ``output_folder``, which is created if it does not exist.

    :param pdf_path: path of the PDF file to convert
    :param output_folder: directory that receives the generated images
    :param dpi: rendering resolution
    :return: list of generated image paths, in page order
    """
    os.makedirs(output_folder, exist_ok=True)

    # PDF user space is 72 units per inch, so dpi / 72 is the zoom factor.
    # The matrix does not depend on the page, so build it once.
    zoom = dpi / 72
    render_matrix = fitz.Matrix(zoom, zoom)

    image_paths = []
    # The context manager closes the document even if rendering or saving
    # a page raises; the original only closed it on the success path.
    with fitz.open(pdf_path) as pdf_document:
        for page_number, page in enumerate(pdf_document, start=1):
            # Both `matrix` and `dpi` are passed, as in the original call.
            # NOTE(review): per the PyMuPDF docs `dpi` takes precedence over
            # `matrix` when the installed version supports it - confirm
            # before dropping either argument.
            pix = page.get_pixmap(
                matrix=render_matrix,
                dpi=dpi,
                colorspace="rgb",
            )

            image_path = os.path.join(output_folder, f"page_{page_number}.png")
            pix.save(image_path)
            image_paths.append(image_path)
            print(f"已生成第 {page_number} 页图片: {image_path}")

    return image_paths


def upload_to_minio(
    image_paths,
    bucket_name,
    *,
    endpoint="localhost:9000",
    access_key="your-access-key",
    secret_key="your-secret-key",
    secure=False,
):
    """
    Upload image files to a MinIO bucket.

    Objects are stored as ``pdf_images/<YYYYMMDD>/<file name>``. The bucket is
    created when it does not exist yet. A failed upload of one file is
    reported on stdout and does not stop the remaining uploads.

    :param image_paths: iterable of local image file paths
    :param bucket_name: name of the target MinIO bucket
    :param endpoint: MinIO server address as ``host:port``
    :param access_key: MinIO access key
    :param secret_key: MinIO secret key
    :param secure: whether to connect over HTTPS
    """
    minio_client = Minio(
        endpoint=endpoint,
        access_key=access_key,
        secret_key=secret_key,
        secure=secure,
    )

    # Make sure the target bucket exists before uploading anything.
    if not minio_client.bucket_exists(bucket_name):
        minio_client.make_bucket(bucket_name)
        print(f"创建存储桶: {bucket_name}")

    # Compute the dated prefix once so that every file of this batch lands
    # under the same prefix, even if the run crosses midnight.
    object_prefix = f"pdf_images/{datetime.now().strftime('%Y%m%d')}"

    for image_path in image_paths:
        object_name = f"{object_prefix}/{os.path.basename(image_path)}"
        try:
            minio_client.fput_object(
                bucket_name=bucket_name,
                object_name=object_name,
                file_path=image_path,
                content_type="image/png",
            )
            print(f"成功上传: {object_name}")
        except S3Error as exc:
            # Best effort: report the failure and continue with the next file.
            print(f"上传失败: {exc}")


if __name__ == "__main__":
    # Configuration
    PDF_PATH = "G:\\wx\\33333333.pdf"  # input PDF file
    OUTPUT_FOLDER = "G:\\wx\\temp_images"  # directory for the temporary images
    BUCKET_NAME = "documents"  # MinIO bucket name

    # Bound before the `try` so the optional cleanup below can be enabled
    # safely even when the conversion fails before returning.
    images = []
    try:
        # Step 1: convert the PDF to images.
        images = pdf_to_images(PDF_PATH, OUTPUT_FOLDER)

        # Step 2 (currently disabled): upload the images to MinIO.
        # upload_to_minio(images, BUCKET_NAME)
    finally:
        # Optional cleanup of the temporary files (currently disabled).
        # for img in images:
        #     os.remove(img)
        # os.rmdir(OUTPUT_FOLDER)
        pass