
cut_image no longer errors on formula images; add parse_union_pdf logic

赵小蒙 1 year ago
parent commit 51bb3b3646

+ 9 - 24
magic_pdf/libs/pdf_image_tools.py

@@ -5,7 +5,7 @@ from magic_pdf.libs.commons import join_path
 from magic_pdf.libs.hash_utils import compute_sha256
 
 
-def cut_image(bbox: tuple, page_num: int, page: fitz.Page, return_path, imageWriter, upload=True):
+def cut_image(bbox: tuple, page_num: int, page: fitz.Page, return_path, imageWriter):
     """
     Crop a jpg image from `page` (page number page_num) according to bbox and return the image path.
     save_path: must support both s3 and local storage; the image is saved under save_path with the file name {page_num}_{bbox[0]}_{bbox[1]}_{bbox[2]}_{bbox[3]}.jpg, where the bbox values are rounded to integers.
@@ -19,17 +19,16 @@ def cut_image(bbox: tuple, page_num: int, page: fitz.Page, return_path, imageWri
     # The new version generates a flat (hash-based) path
     img_hash256_path = f"{compute_sha256(img_path)}.jpg"
 
-    if upload:
-        # Convert the coordinates into a fitz.Rect object
-        rect = fitz.Rect(*bbox)
-        # Set the zoom factor to 3x
-        zoom = fitz.Matrix(3, 3)
-        # Capture the image region
-        pix = page.get_pixmap(clip=rect, matrix=zoom)
+    # Convert the coordinates into a fitz.Rect object
+    rect = fitz.Rect(*bbox)
+    # Set the zoom factor to 3x
+    zoom = fitz.Matrix(3, 3)
+    # Capture the image region
+    pix = page.get_pixmap(clip=rect, matrix=zoom)
 
-        byte_data = pix.tobytes(output='jpeg', jpg_quality=95)
+    byte_data = pix.tobytes(output='jpeg', jpg_quality=95)
 
-        imageWriter.write(data=byte_data, path=img_hash256_path, mode="binary")
+    imageWriter.write(data=byte_data, path=img_hash256_path, mode="binary")
 
     return img_hash256_path
 
@@ -74,18 +73,4 @@ def save_images_by_bboxes(page_num: int, page: fitz.Page, pdf_bytes_md5: str,
         image_path = cut_image(bbox, page_num, page, return_path("tables"), imageWriter)
         table_info.append({"bbox": bbox, "image_path": image_path})
 
-    for bbox in equation_inline_bboxes:
-        if any([bbox[0] >= bbox[2], bbox[1] >= bbox[3]]):
-            logger.warning(f"equation_inline_bboxes: invalid box, {bbox}")
-            continue
-        image_path = cut_image(bbox[:4], page_num, page, return_path("equations_inline"), imageWriter, upload=False)
-        inline_eq_info.append({'bbox': bbox[:4], "image_path": image_path, "latex_text": bbox[4]})
-
-    for bbox in equation_interline_bboxes:
-        if any([bbox[0] >= bbox[2], bbox[1] >= bbox[3]]):
-            logger.warning(f"equation_interline_bboxes: invalid box, {bbox}")
-            continue
-        image_path = cut_image(bbox[:4], page_num, page, return_path("equation_interline"), imageWriter, upload=False)
-        interline_eq_info.append({"bbox": bbox[:4], "image_path": image_path, "latex_text": bbox[4]})
-
     return image_info, image_backup_info, table_info, inline_eq_info, interline_eq_info
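
With this change cut_image drops the upload flag: the crop is always rendered at 3x zoom and written through imageWriter, and the inline/interline equation bboxes are no longer cropped in save_images_by_bboxes. Below is a minimal usage sketch of the new signature; the stub writer, the "sample.pdf" file and the "images" path prefix are illustrative assumptions, not part of this commit.

import fitz  # PyMuPDF, already a dependency of magic_pdf

from magic_pdf.libs.pdf_image_tools import cut_image


class _StubWriter:
    # Hypothetical in-memory writer; a real AbsReaderWriter implementation would be used in practice.
    def __init__(self):
        self.files = {}

    def write(self, data, path, mode="binary"):
        # cut_image calls write(data=..., path=..., mode="binary")
        self.files[path] = data


doc = fitz.open("sample.pdf")   # assumed local test file
page = doc[0]
bbox = (50, 50, 300, 200)       # arbitrary region on the first page

writer = _StubWriter()
# New signature: no upload flag, the crop is always rendered and written.
image_path = cut_image(bbox, 0, page, "images", writer)
print(image_path, len(writer.files[image_path]), "bytes")
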

+ 27 - 3
magic_pdf/spark/spark_api.py

@@ -12,7 +12,7 @@
 The remaining parts, such as constructing the s3cli and obtaining the ak/sk, are implemented in code-clean. Do not introduce reverse dependencies!!!
 
 """
-
+from loguru import logger
 
 from magic_pdf.io import AbsReaderWriter
 from magic_pdf.pdf_parse_by_ocr import parse_pdf_by_ocr
@@ -31,7 +31,6 @@ def parse_txt_pdf(pdf_bytes:bytes, pdf_models:list, imageWriter: AbsReaderWriter
         debug_mode=is_debug,
     )
     return pdf_info_dict
-    pass
 
 
 def parse_ocr_pdf(pdf_bytes:bytes,  pdf_models:list, imageWriter: AbsReaderWriter, is_debug=False, start_page=0, *args, **kwargs):
@@ -52,4 +51,29 @@ def parse_union_pdf(pdf_bytes:bytes,  pdf_models:list, imageWriter: AbsReaderWri
     """
     Parse a PDF that mixes OCR and text content in full.
     """
-    pass
+    def parse_pdf(method):
+        try:
+            return method(
+                pdf_bytes,
+                pdf_models,
+                imageWriter,
+                start_page_id=start_page,
+                debug_mode=is_debug,
+            )
+        except Exception as e:
+            logger.error(f"{method.__name__} error: {e}")
+            return None
+
+    pdf_info_dict = parse_pdf(parse_pdf_by_txt)
+    if pdf_info_dict is None or pdf_info_dict.get("need_drop", False):
+        logger.warning(f"parse_pdf_by_txt drop or error, switch to parse_pdf_by_ocr")
+        pdf_info_dict = parse_pdf(parse_pdf_by_ocr)
+
+    if pdf_info_dict is None:
+        raise Exception("Both parse_pdf_by_txt and parse_pdf_by_ocr failed.")
+
+    return pdf_info_dict
+
+
+def spark_json_extractor(jso:dict):
+    pass
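
parse_union_pdf now attempts parse_pdf_by_txt first and falls back to parse_pdf_by_ocr when the text parse raises or returns need_drop, raising only if both fail. A hedged calling sketch, assuming a minimal duck-typed writer in place of a real AbsReaderWriter and a pdf_models list normally produced by the upstream model inference step:

import os

from loguru import logger

from magic_pdf.spark.spark_api import parse_union_pdf


class _LocalImageWriter:
    # Assumed stand-in for an AbsReaderWriter implementation; only write() is exercised here.
    def __init__(self, root):
        self.root = root

    def write(self, data, path, mode="binary"):
        full_path = os.path.join(self.root, path)
        os.makedirs(os.path.dirname(full_path), exist_ok=True)
        with open(full_path, "wb") as f:
            f.write(data)


if __name__ == "__main__":
    with open("sample.pdf", "rb") as f:   # assumed local test file
        pdf_bytes = f.read()

    pdf_models = []   # placeholder: the real list comes from the model inference pipeline
    writer = _LocalImageWriter("/tmp/magic_pdf_images")

    # Text-based parse first; OCR fallback on error or need_drop; raises if both fail.
    pdf_info_dict = parse_union_pdf(pdf_bytes, pdf_models, writer, is_debug=True)
    logger.info(f"parse_union_pdf returned {len(pdf_info_dict)} top-level entries")
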