2 months ago · 98fdff2935
--- a/zhch/utils/__init__.py
+++ b/zhch/utils/__init__.py
@@ -22,6 +22,11 @@ from .file_utils import (
 
				     collect_pid_files,
			
 
				 )
			
 
				 
			
 
				+from .doc_utils import (
			
 
				+    load_images_from_pdf,
			
 
				+    fitz_doc_to_image,
			
 
				+)
			
 
				+
			
 
				 __all__ = [
			
 
				     # CUDA环境检测
			
 
				     'check_nvidia_environment',
			
--- a/zhch/utils/doc_utils.py
+++ b/zhch/utils/doc_utils.py
@@ -0,0 +1,60 @@
 
				+import fitz
			
 
				+import numpy as np
			
 
				+import enum
			
 
				+from pydantic import BaseModel, Field
			
 
				+from PIL import Image
			
 
				+
			
 
				+
			
 
				+class SupportedPdfParseMethod(enum.Enum):
			
 
				+    OCR = 'ocr'
			
 
				+    TXT = 'txt'
			
 
				+
			
 
				+
			
 
				+class PageInfo(BaseModel):
			
 
				+    """The width and height of page
			
 
				+    """
			
 
				+    w: float = Field(description='the width of page')
			
 
				+    h: float = Field(description='the height of page')
			
 
				+
			
 
				+
			
 
				+def fitz_doc_to_image(doc, target_dpi=200, origin_dpi=None) -> dict:
			
 
				+    """Convert fitz.Document to image, Then convert the image to numpy array.
			
 
				+
			
 
				+    Args:
			
 
				+        doc (_type_): pymudoc page
			
 
				+        dpi (int, optional): reset the dpi of dpi. Defaults to 200.
			
 
				+
			
 
				+    Returns:
			
 
				+        dict:  {'img': numpy array, 'width': width, 'height': height }
			
 
				+    """
			
 
				+    from PIL import Image
			
 
				+    mat = fitz.Matrix(target_dpi / 72, target_dpi / 72)
			
 
				+    pm = doc.get_pixmap(matrix=mat, alpha=False)
			
 
				+
			
 
				+    if pm.width > 4500 or pm.height > 4500:
			
 
				+        mat = fitz.Matrix(72 / 72, 72 / 72)  # use fitz default dpi
			
 
				+        pm = doc.get_pixmap(matrix=mat, alpha=False)
			
 
				+
			
 
				+    image = Image.frombytes('RGB', (pm.width, pm.height), pm.samples)
			
 
				+    return image
			
 
				+
			
 
				+
			
 
				+def load_images_from_pdf(pdf_file, dpi=200, start_page_id=0, end_page_id=None) -> list:
			
 
				+    images = []
			
 
				+    with fitz.open(pdf_file) as doc:
			
 
				+        pdf_page_num = doc.page_count
			
 
				+        end_page_id = (
			
 
				+            end_page_id
			
 
				+            if end_page_id is not None and end_page_id >= 0
			
 
				+            else pdf_page_num - 1
			
 
				+        )
			
 
				+        if end_page_id > pdf_page_num - 1:
			
 
				+            print('end_page_id is out of range, use images length')
			
 
				+            end_page_id = pdf_page_num - 1
			
 
				+
			
 
				+        for index in range(0, doc.page_count):
			
 
				+            if start_page_id <= index <= end_page_id:
			
 
				+                page = doc[index]
			
 
				+                img = fitz_doc_to_image(page, target_dpi=dpi)
			
 
				+                images.append(img)
			
 
				+    return images