Browse Source

feat: 添加PDF处理工具,支持将PDF文档转换为图像

zhch158_admin 2 months ago
parent
commit
98fdff2935
2 changed files with 65 additions and 0 deletions
  1. 5 0
      zhch/utils/__init__.py
  2. 60 0
      zhch/utils/doc_utils.py

+ 5 - 0
zhch/utils/__init__.py

@@ -22,6 +22,11 @@ from .file_utils import (
     collect_pid_files,
 )
 
+from .doc_utils import (
+    load_images_from_pdf,
+    fitz_doc_to_image,
+)
+
 __all__ = [
     # CUDA环境检测
     'check_nvidia_environment',

+ 60 - 0
zhch/utils/doc_utils.py

@@ -0,0 +1,60 @@
+import fitz
+import numpy as np
+import enum
+from pydantic import BaseModel, Field
+from PIL import Image
+
+
+class SupportedPdfParseMethod(enum.Enum):
+    """Closed set of PDF parse-method identifiers.
+
+    Each member's value is the lowercase string form of its name. The
+    functions in this file do not reference the enum.
+    """
+    # Presumably selects OCR-based parsing -- confirm against callers.
+    OCR = 'ocr'
+    # Presumably selects direct text extraction -- confirm against callers.
+    TXT = 'txt'
+
+
+class PageInfo(BaseModel):
+    """Dimensions of a single page: its width and height.
+
+    The unit of measurement is not specified here.
+    """
+    # Both fields are required: Field() supplies a description but no default.
+    w: float = Field(description='the width of page')
+    h: float = Field(description='the height of page')
+
+
+def fitz_doc_to_image(doc, target_dpi=200, origin_dpi=None) -> dict:
+    """Convert fitz.Document to image, Then convert the image to numpy array.
+
+    Args:
+        doc (_type_): pymudoc page
+        dpi (int, optional): reset the dpi of dpi. Defaults to 200.
+
+    Returns:
+        dict:  {'img': numpy array, 'width': width, 'height': height }
+    """
+    from PIL import Image
+    mat = fitz.Matrix(target_dpi / 72, target_dpi / 72)
+    pm = doc.get_pixmap(matrix=mat, alpha=False)
+
+    if pm.width > 4500 or pm.height > 4500:
+        mat = fitz.Matrix(72 / 72, 72 / 72)  # use fitz default dpi
+        pm = doc.get_pixmap(matrix=mat, alpha=False)
+
+    image = Image.frombytes('RGB', (pm.width, pm.height), pm.samples)
+    return image
+
+
+def load_images_from_pdf(pdf_file, dpi=200, start_page_id=0, end_page_id=None) -> list:
+    images = []
+    with fitz.open(pdf_file) as doc:
+        pdf_page_num = doc.page_count
+        end_page_id = (
+            end_page_id
+            if end_page_id is not None and end_page_id >= 0
+            else pdf_page_num - 1
+        )
+        if end_page_id > pdf_page_num - 1:
+            print('end_page_id is out of range, use images length')
+            end_page_id = pdf_page_num - 1
+
+        for index in range(0, doc.page_count):
+            if start_page_id <= index <= end_page_id:
+                page = doc[index]
+                img = fitz_doc_to_image(page, target_dpi=dpi)
+                images.append(img)
+    return images