Explorar el Código

切图逻辑重构

赵小蒙 hace 1 año
padre
commit
d438b97a0a

+ 7 - 45
magic_pdf/libs/pdf_image_tools.py

@@ -1,11 +1,12 @@
+from loguru import logger
+
 from magic_pdf.io.AbsReaderWriter import AbsReaderWriter
 from magic_pdf.libs.commons import fitz
-from loguru import logger
 from magic_pdf.libs.commons import join_path
 from magic_pdf.libs.hash_utils import compute_sha256
 
 
-def cut_image(bbox: tuple, page_num: int, page: fitz.Page, return_path, imageWriter:AbsReaderWriter):
+def cut_image(bbox: tuple, page_num: int, page: fitz.Page, return_path, imageWriter: AbsReaderWriter):
     """
     从第page_num页的page中,根据bbox进行裁剪出一张jpg图片,返回图片路径
     save_path:需要同时支持s3和本地, 图片存放在save_path下,文件名是: {page_num}_{bbox[0]}_{bbox[1]}_{bbox[2]}_{bbox[3]}.jpg , bbox内数字取整。
@@ -19,6 +20,10 @@ def cut_image(bbox: tuple, page_num: int, page: fitz.Page, return_path, imageWri
     # 新版本生成平铺路径
     img_hash256_path = f"{compute_sha256(img_path)}.jpg"
 
+    if any([bbox[0] >= bbox[2], bbox[1] >= bbox[3]]):
+        logger.warning(f"image_bboxes: 错误的box, {bbox}")
+        return img_hash256_path
+
     # 将坐标转换为fitz.Rect对象
     rect = fitz.Rect(*bbox)
     # 配置缩放倍数为3倍
@@ -31,46 +36,3 @@ def cut_image(bbox: tuple, page_num: int, page: fitz.Page, return_path, imageWri
     imageWriter.write(byte_data, img_hash256_path, AbsReaderWriter.MODE_BIN)
 
     return img_hash256_path
-
-
-def save_images_by_bboxes(page_num: int, page: fitz.Page, pdf_bytes_md5: str,
-                          image_bboxes: list, images_overlap_backup: list, table_bboxes: list,
-                          equation_inline_bboxes: list,
-                          equation_interline_bboxes: list, imageWriter) -> dict:
-    """
-    返回一个dict, key为bbox, 值是图片地址
-    """
-    image_info = []
-    image_backup_info = []
-    table_info = []
-    inline_eq_info = []
-    interline_eq_info = []
-
-    # 图片的保存路径组成是这样的: {s3_or_local_path}/{book_name}/{images|tables|equations}/{page_num}_{bbox[0]}_{bbox[1]}_{bbox[2]}_{bbox[3]}.jpg
-
-    def return_path(type):
-        return join_path(pdf_bytes_md5, type)
-
-    for bbox in image_bboxes:
-        if any([bbox[0] >= bbox[2], bbox[1] >= bbox[3]]):
-            logger.warning(f"image_bboxes: 错误的box, {bbox}")
-            continue
-
-        image_path = cut_image(bbox, page_num, page, return_path("images"), imageWriter)
-        image_info.append({"bbox": bbox, "image_path": image_path})
-
-    for bbox in images_overlap_backup:
-        if any([bbox[0] >= bbox[2], bbox[1] >= bbox[3]]):
-            logger.warning(f"images_overlap_backup: 错误的box, {bbox}")
-            continue
-        image_path = cut_image(bbox, page_num, page, return_path("images"), imageWriter)
-        image_backup_info.append({"bbox": bbox, "image_path": image_path})
-
-    for bbox in table_bboxes:
-        if any([bbox[0] >= bbox[2], bbox[1] >= bbox[3]]):
-            logger.warning(f"table_bboxes: 错误的box, {bbox}")
-            continue
-        image_path = cut_image(bbox, page_num, page, return_path("tables"), imageWriter)
-        table_info.append({"bbox": bbox, "image_path": image_path})
-
-    return image_info, image_backup_info, table_info, inline_eq_info, interline_eq_info

+ 6 - 7
magic_pdf/pdf_parse_by_ocr.py

@@ -16,7 +16,7 @@ from magic_pdf.pre_proc.detect_footer_by_model import parse_footers
 from magic_pdf.pre_proc.detect_footnote import parse_footnotes_by_model
 from magic_pdf.pre_proc.detect_header import parse_headers
 from magic_pdf.pre_proc.detect_page_number import parse_pageNos
-from magic_pdf.pre_proc.ocr_cut_image import cut_image_and_table
+from magic_pdf.pre_proc.cut_image import ocr_cut_image_and_table
 from magic_pdf.pre_proc.ocr_detect_layout import layout_detect
 from magic_pdf.pre_proc.ocr_dict_merge import (
     merge_spans_to_line_by_layout, merge_lines_to_block,
@@ -27,7 +27,6 @@ from magic_pdf.pre_proc.ocr_span_list_modify import remove_spans_by_bboxes, remo
 from magic_pdf.pre_proc.remove_bbox_overlap import remove_overlap_between_bbox
 
 
-
 def parse_pdf_by_ocr(
         pdf_bytes,
         pdf_model_output,
@@ -148,7 +147,7 @@ def parse_pdf_by_ocr(
         spans, dropped_spans_by_removed_bboxes = remove_spans_by_bboxes_dict(spans, need_remove_spans_bboxes_dict)
 
         '''对image和table截图'''
-        spans = cut_image_and_table(spans, page, page_id, pdf_bytes_md5, imageWriter)
+        spans = ocr_cut_image_and_table(spans, page, page_id, pdf_bytes_md5, imageWriter)
 
         '''行内公式调整, 高度调整至与同行文字高度一致(优先左侧, 其次右侧)'''
         displayed_list = []
@@ -202,10 +201,10 @@ def parse_pdf_by_ocr(
 
         '''构造pdf_info_dict'''
         page_info = ocr_construct_page_component(blocks, layout_bboxes, page_id, page_w, page_h, layout_tree,
-                                             images, tables, interline_equations, inline_equations,
-                                             dropped_text_block, dropped_image_block, dropped_table_block,
-                                             dropped_equation_block,
-                                             need_remove_spans_bboxes_dict)
+                                                 images, tables, interline_equations, inline_equations,
+                                                 dropped_text_block, dropped_image_block, dropped_table_block,
+                                                 dropped_equation_block,
+                                                 need_remove_spans_bboxes_dict)
         pdf_info_dict[f"page_{page_id}"] = page_info
 
     """分段"""

+ 2 - 3
magic_pdf/pdf_parse_by_txt.py

@@ -17,6 +17,7 @@ from magic_pdf.libs.hash_utils import compute_md5
 from magic_pdf.libs.markdown_utils import escape_special_markdown_char
 from magic_pdf.libs.safe_filename import sanitize_filename
 from magic_pdf.libs.vis_utils import draw_bbox_on_page, draw_layout_bbox_on_page
+from magic_pdf.pre_proc.cut_image import txt_save_images_by_bboxes
 from magic_pdf.pre_proc.detect_images import parse_images
 from magic_pdf.pre_proc.detect_tables import parse_tables  # 获取tables的bbox
 from magic_pdf.pre_proc.detect_equation import parse_equations  # 获取equations的bbox
@@ -48,8 +49,6 @@ from para.exceptions import (
 )
 '''
 
-from magic_pdf.libs.commons import read_file, join_path
-from magic_pdf.libs.pdf_image_tools import save_images_by_bboxes
 from magic_pdf.post_proc.remove_footnote import merge_footnote_blocks, remove_footnote_blocks
 from magic_pdf.pre_proc.citationmarker_remove import remove_citation_marker
 from magic_pdf.pre_proc.equations_replace import combine_chars_to_pymudict, remove_chars_in_text_blocks, replace_equations_in_textblock
@@ -194,7 +193,7 @@ def parse_pdf_by_txt(
         """
 
         # 把图、表、公式都进行截图,保存到存储上,返回图片路径作为内容
-        image_info, image_backup_info, table_info, inline_eq_info, interline_eq_info = save_images_by_bboxes(
+        image_info, image_backup_info, table_info, inline_eq_info, interline_eq_info = txt_save_images_by_bboxes(
             page_id,
             page,
             pdf_bytes_md5,

+ 2 - 2
magic_pdf/pdf_parse_for_train.py

@@ -26,6 +26,7 @@ from magic_pdf.libs.drop_reason import DropReason
 from magic_pdf.libs.markdown_utils import escape_special_markdown_char
 from magic_pdf.libs.safe_filename import sanitize_filename
 from magic_pdf.libs.vis_utils import draw_bbox_on_page, draw_layout_bbox_on_page
+from magic_pdf.pre_proc.cut_image import txt_save_images_by_bboxes
 from magic_pdf.pre_proc.detect_images import parse_images
 from magic_pdf.pre_proc.detect_tables import parse_tables  # 获取tables的bbox
 from magic_pdf.pre_proc.detect_equation import parse_equations  # 获取equations的bbox
@@ -62,7 +63,6 @@ from para.exceptions import (
 """
 
 from magic_pdf.libs.commons import read_file, join_path
-from magic_pdf.libs.pdf_image_tools import save_images_by_bboxes
 from magic_pdf.post_proc.remove_footnote import (
     merge_footnote_blocks,
     remove_footnote_blocks,
@@ -323,7 +323,7 @@ def parse_pdf_for_train(
 
         # 把图、表、公式都进行截图,保存到存储上,返回图片路径作为内容
         image_info, image_backup_info, table_info, inline_eq_info, interline_eq_info = (
-            save_images_by_bboxes(
+            txt_save_images_by_bboxes(
                 book_name,
                 page_id,
                 page,

+ 52 - 0
magic_pdf/pre_proc/cut_image.py

@@ -0,0 +1,52 @@
+from magic_pdf.libs.commons import join_path
+from magic_pdf.libs.ocr_content_type import ContentType
+from magic_pdf.libs.pdf_image_tools import cut_image
+
+
+def ocr_cut_image_and_table(spans, page, page_id, pdf_bytes_md5, imageWriter):
+    def return_path(type):
+        return join_path(pdf_bytes_md5, type)
+
+    for span in spans:
+        span_type = span['type']
+        if span_type == ContentType.Image:
+            span['image_path'] = cut_image(span['bbox'], page_id, page, return_path=return_path('images'),
+                                           imageWriter=imageWriter)
+        elif span_type == ContentType.Table:
+            span['image_path'] = cut_image(span['bbox'], page_id, page, return_path=return_path('tables'),
+                                           imageWriter=imageWriter)
+
+    return spans
+
+
+def txt_save_images_by_bboxes(page_num: int, page, pdf_bytes_md5: str,
+                              image_bboxes: list, images_overlap_backup: list, table_bboxes: list,
+                              equation_inline_bboxes: list,
+                              equation_interline_bboxes: list, imageWriter) -> dict:
+    """
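+    Return a 5-tuple of lists: (image_info, image_backup_info, table_info, inline_eq_info, interline_eq_info). Entries of the first three are {"bbox": ..., "image_path": ...} dicts; the two equation lists are never filled here and are returned empty.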
+    """
+    image_info = []
+    image_backup_info = []
+    table_info = []
+    inline_eq_info = []
+    interline_eq_info = []
+
+    # return_path(type) joins pdf_bytes_md5 with the category ("images" or "tables"); the final image path is whatever cut_image returns for that bbox.
+
+    def return_path(type):
+        return join_path(pdf_bytes_md5, type)
+
+    for bbox in image_bboxes:
+        image_path = cut_image(bbox, page_num, page, return_path("images"), imageWriter)
+        image_info.append({"bbox": bbox, "image_path": image_path})
+
+    for bbox in images_overlap_backup:
+        image_path = cut_image(bbox, page_num, page, return_path("images"), imageWriter)
+        image_backup_info.append({"bbox": bbox, "image_path": image_path})
+
+    for bbox in table_bboxes:
+        image_path = cut_image(bbox, page_num, page, return_path("tables"), imageWriter)
+        table_info.append({"bbox": bbox, "image_path": image_path})
+
+    return image_info, image_backup_info, table_info, inline_eq_info, interline_eq_info

+ 0 - 18
magic_pdf/pre_proc/ocr_cut_image.py

@@ -1,18 +0,0 @@
-from magic_pdf.libs.commons import join_path
-from magic_pdf.libs.ocr_content_type import ContentType
-from magic_pdf.libs.pdf_image_tools import cut_image
-
-
-def cut_image_and_table(spans, page, page_id, pdf_bytes_md5, imageWriter):
-
-    def return_path(type):
-        return join_path(pdf_bytes_md5, type)
-
-    for span in spans:
-        span_type = span['type']
-        if span_type == ContentType.Image:
-            span['image_path'] = cut_image(span['bbox'], page_id, page, return_path=return_path('images'), imageWriter=imageWriter)
-        elif span_type == ContentType.Table:
-            span['image_path'] = cut_image(span['bbox'], page_id, page, return_path=return_path('tables'), imageWriter=imageWriter)
-
-    return spans