# test.py (2.9 KB) - convert each page of a PDF to a PNG image and optionally upload the images to MinIO.
  1. import os
  2. from datetime import datetime
  3. import fitz # PyMuPDF
  4. from minio import Minio
  5. from minio.error import S3Error
  6. def pdf_to_images(pdf_path, output_folder, dpi=300):
  7. """
  8. 将PDF每一页转换为图片
  9. :param pdf_path: PDF文件路径
  10. :param output_folder: 图片输出目录
  11. :param dpi: 图片分辨率
  12. :return: 生成的图片路径列表
  13. """
  14. # 创建输出目录
  15. os.makedirs(output_folder, exist_ok=True)
  16. # 打开PDF文件
  17. pdf_document = fitz.open(pdf_path)
  18. image_paths = []
  19. # 遍历每一页
  20. for page_num in range(len(pdf_document)):
  21. page = pdf_document.load_page(page_num)
  22. # 将页面转换为图片(使用RGB格式)
  23. pix = page.get_pixmap(matrix=fitz.Matrix(dpi/72, dpi/72), dpi=dpi, colorspace="rgb")
  24. # 生成图片文件名
  25. image_name = f"page_{page_num+1}.png"
  26. image_path = os.path.join(output_folder, image_name)
  27. # 保存图片
  28. pix.save(image_path)
  29. image_paths.append(image_path)
  30. print(f"已生成第 {page_num+1} 页图片: {image_path}")
  31. pdf_document.close()
  32. return image_paths
  33. def upload_to_minio(image_paths, bucket_name):
  34. """
  35. 上传图片到Minio
  36. :param image_paths: 图片路径列表
  37. :param bucket_name: Minio存储桶名称
  38. """
  39. # 初始化Minio客户端
  40. minio_client = Minio(
  41. endpoint="localhost:9000", # Minio服务器地址
  42. access_key="your-access-key", # 访问密钥
  43. secret_key="your-secret-key", # 私有密钥
  44. secure=False # 是否使用HTTPS
  45. )
  46. # 确保存储桶存在
  47. found = minio_client.bucket_exists(bucket_name)
  48. if not found:
  49. minio_client.make_bucket(bucket_name)
  50. print(f"创建存储桶: {bucket_name}")
  51. # 上传所有图片
  52. for image_path in image_paths:
  53. object_name = f"pdf_images/{datetime.now().strftime('%Y%m%d')}/{os.path.basename(image_path)}"
  54. try:
  55. minio_client.fput_object(
  56. bucket_name=bucket_name,
  57. object_name=object_name,
  58. file_path=image_path,
  59. content_type="image/png"
  60. )
  61. print(f"成功上传: {object_name}")
  62. except S3Error as exc:
  63. print(f"上传失败: {exc}")
  64. if __name__ == "__main__":
  65. # 配置参数
  66. PDF_PATH = "G:\\wx\\33333333.pdf" # 输入PDF文件路径
  67. OUTPUT_FOLDER = "G:\\wx\\temp_images" # 临时图片存储目录
  68. BUCKET_NAME = "documents" # Minio存储桶名称
  69. # 执行转换和上传
  70. try:
  71. # 步骤1: PDF转图片
  72. images = pdf_to_images(PDF_PATH, OUTPUT_FOLDER)
  73. # 步骤2: 上传到Minio
  74. #upload_to_minio(images, BUCKET_NAME)
  75. finally:
  76. # 清理临时文件(可选)
  77. # for img in images:
  78. # os.remove(img)
  79. # os.rmdir(OUTPUT_FOLDER)
  80. pass