|
|
@@ -4,6 +4,7 @@ import fitz
|
|
|
import numpy as np
|
|
|
from loguru import logger
|
|
|
|
|
|
+from magic_pdf.libs.clean_memory import clean_memory
|
|
|
from magic_pdf.libs.config_reader import get_local_models_dir, get_device, get_table_recog_config
|
|
|
from magic_pdf.model.model_list import MODEL
|
|
|
import magic_pdf.model as model_config
|
|
|
@@ -23,7 +24,7 @@ def remove_duplicates_dicts(lst):
|
|
|
return unique_dicts
|
|
|
|
|
|
|
|
|
-def load_images_from_pdf(pdf_bytes: bytes, dpi=200) -> list:
|
|
|
+def load_images_from_pdf(pdf_bytes: bytes, dpi=200, start_page_id=0, end_page_id=None) -> list:
|
|
|
try:
|
|
|
from PIL import Image
|
|
|
except ImportError:
|
|
|
@@ -32,18 +33,28 @@ def load_images_from_pdf(pdf_bytes: bytes, dpi=200) -> list:
|
|
|
|
|
|
images = []
|
|
|
with fitz.open("pdf", pdf_bytes) as doc:
|
|
|
+ pdf_page_num = doc.page_count
|
|
|
+ end_page_id = end_page_id if end_page_id is not None and end_page_id >= 0 else pdf_page_num - 1
|
|
|
+ if end_page_id > pdf_page_num - 1:
|
|
|
+ logger.warning("end_page_id is out of range, use images length")
|
|
|
+ end_page_id = pdf_page_num - 1
|
|
|
+
|
|
|
for index in range(0, doc.page_count):
|
|
|
- page = doc[index]
|
|
|
- mat = fitz.Matrix(dpi / 72, dpi / 72)
|
|
|
- pm = page.get_pixmap(matrix=mat, alpha=False)
|
|
|
+ if start_page_id <= index <= end_page_id:
|
|
|
+ page = doc[index]
|
|
|
+ mat = fitz.Matrix(dpi / 72, dpi / 72)
|
|
|
+ pm = page.get_pixmap(matrix=mat, alpha=False)
|
|
|
+
|
|
|
+ # If the width or height exceeds 9000 after scaling, do not scale further.
|
|
|
+ if pm.width > 9000 or pm.height > 9000:
|
|
|
+ pm = page.get_pixmap(matrix=fitz.Matrix(1, 1), alpha=False)
|
|
|
|
|
|
- # If the width or height exceeds 9000 after scaling, do not scale further.
|
|
|
- if pm.width > 9000 or pm.height > 9000:
|
|
|
- pm = page.get_pixmap(matrix=fitz.Matrix(1, 1), alpha=False)
|
|
|
+ img = Image.frombytes("RGB", (pm.width, pm.height), pm.samples)
|
|
|
+ img = np.array(img)
|
|
|
+ img_dict = {"img": img, "width": pm.width, "height": pm.height}
|
|
|
+ else:
|
|
|
+ img_dict = {"img": [], "width": 0, "height": 0}
|
|
|
|
|
|
- img = Image.frombytes("RGB", (pm.width, pm.height), pm.samples)
|
|
|
- img = np.array(img)
|
|
|
- img_dict = {"img": img, "width": pm.width, "height": pm.height}
|
|
|
images.append(img_dict)
|
|
|
return images
|
|
|
|
|
|
@@ -111,14 +122,14 @@ def doc_analyze(pdf_bytes: bytes, ocr: bool = False, show_log: bool = False,
|
|
|
model_manager = ModelSingleton()
|
|
|
custom_model = model_manager.get_model(ocr, show_log, lang)
|
|
|
|
|
|
- images = load_images_from_pdf(pdf_bytes)
|
|
|
-
|
|
|
- # end_page_id = end_page_id if end_page_id else len(images) - 1
|
|
|
- end_page_id = end_page_id if end_page_id is not None and end_page_id >= 0 else len(images) - 1
|
|
|
+ with fitz.open("pdf", pdf_bytes) as doc:
|
|
|
+ pdf_page_num = doc.page_count
|
|
|
+ end_page_id = end_page_id if end_page_id is not None and end_page_id >= 0 else pdf_page_num - 1
|
|
|
+ if end_page_id > pdf_page_num - 1:
|
|
|
+ logger.warning("end_page_id is out of range, use images length")
|
|
|
+ end_page_id = pdf_page_num - 1
|
|
|
|
|
|
- if end_page_id > len(images) - 1:
|
|
|
- logger.warning("end_page_id is out of range, use images length")
|
|
|
- end_page_id = len(images) - 1
|
|
|
+ images = load_images_from_pdf(pdf_bytes, start_page_id=start_page_id, end_page_id=end_page_id)
|
|
|
|
|
|
model_json = []
|
|
|
doc_analyze_start = time.time()
|
|
|
@@ -135,6 +146,11 @@ def doc_analyze(pdf_bytes: bytes, ocr: bool = False, show_log: bool = False,
|
|
|
page_dict = {"layout_dets": result, "page_info": page_info}
|
|
|
model_json.append(page_dict)
|
|
|
|
|
|
+ gc_start = time.time()
|
|
|
+ clean_memory()
|
|
|
+ gc_time = round(time.time() - gc_start, 2)
|
|
|
+ logger.info(f"gc time: {gc_time}")
|
|
|
+
|
|
|
doc_analyze_time = round(time.time() - doc_analyze_start, 2)
|
|
|
doc_analyze_speed = round( (end_page_id + 1 - start_page_id) / doc_analyze_time, 2)
|
|
|
logger.info(f"doc analyze time: {round(time.time() - doc_analyze_start, 2)},"
|