12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697 |
- import os
- from datetime import datetime
- import fitz # PyMuPDF
- from minio import Minio
- from minio.error import S3Error
def pdf_to_images(pdf_path, output_folder, dpi=300):
    """Render every page of a PDF to a PNG file.

    Pages are written into ``output_folder`` as ``page_1.png``,
    ``page_2.png``, ... (1-based page numbers). The folder is created if it
    does not exist yet.

    :param pdf_path: Path of the PDF file to open.
    :param output_folder: Directory that receives the generated images.
    :param dpi: Rendering resolution handed to PyMuPDF.
    :return: List of the generated image paths, in page order.
    """
    os.makedirs(output_folder, exist_ok=True)

    image_paths = []
    # The context manager closes the document even when rendering or saving
    # a page raises, so the file handle is never leaked.
    with fitz.open(pdf_path) as pdf_document:
        for page_number, page in enumerate(pdf_document, start=1):
            # `dpi` alone fixes the resolution; PyMuPDF ignores `matrix`
            # whenever `dpi` is supplied, so no zoom matrix is passed.
            pix = page.get_pixmap(dpi=dpi, colorspace="rgb")

            image_path = os.path.join(output_folder, f"page_{page_number}.png")
            pix.save(image_path)
            image_paths.append(image_path)
            print(f"已生成第 {page_number} 页图片: {image_path}")

    return image_paths
def upload_to_minio(
    image_paths,
    bucket_name,
    *,
    endpoint="localhost:9000",
    access_key="your-access-key",
    secret_key="your-secret-key",
    secure=False,
):
    """Upload PNG images to a MinIO bucket.

    Each file is stored as ``pdf_images/<YYYYMMDD>/<file name>``, where the
    date is the local date at the time of the call. The bucket is created
    when it does not exist. A failed upload is reported on stdout and the
    remaining files are still attempted.

    :param image_paths: Iterable of local image file paths.
    :param bucket_name: Name of the target bucket.
    :param endpoint: ``host:port`` of the MinIO server.
    :param access_key: Access key used to authenticate.
    :param secret_key: Secret key used to authenticate.
    :param secure: Whether to connect over HTTPS.
    """
    minio_client = Minio(
        endpoint=endpoint,
        access_key=access_key,
        secret_key=secret_key,
        secure=secure,
    )

    # Make sure the target bucket exists before uploading anything.
    if not minio_client.bucket_exists(bucket_name=bucket_name):
        minio_client.make_bucket(bucket_name=bucket_name)
        print(f"创建存储桶: {bucket_name}")

    # Computed once so a batch that runs across midnight is not split
    # between two date folders.
    date_prefix = f"pdf_images/{datetime.now().strftime('%Y%m%d')}"

    for image_path in image_paths:
        object_name = f"{date_prefix}/{os.path.basename(image_path)}"
        try:
            minio_client.fput_object(
                bucket_name=bucket_name,
                object_name=object_name,
                file_path=image_path,
                content_type="image/png",
            )
            print(f"成功上传: {object_name}")
        except S3Error as exc:
            # Best effort: report the failure and carry on with the next file.
            print(f"上传失败: {exc}")
if __name__ == "__main__":
    # Script configuration.
    PDF_PATH = "G:\\wx\\33333333.pdf"  # input PDF file
    OUTPUT_FOLDER = "G:\\wx\\temp_images"  # temporary folder for page images
    BUCKET_NAME = "documents"  # MinIO bucket name

    # Step 1: render the PDF pages to images.
    images = pdf_to_images(pdf_path=PDF_PATH, output_folder=OUTPUT_FOLDER)

    # Step 2 (currently disabled): upload the images to MinIO with
    # upload_to_minio(images, BUCKET_NAME).
    #
    # Optional cleanup (currently disabled): remove every file in `images`
    # and then the OUTPUT_FOLDER directory itself.
|