赵小蒙 1 год назад
Родитель
Коммит
a5f8de9882
3 изменённых файла: 43 строки добавлено и 3 удалено
  1. 3 2
      demo/ocr_demo.py
  2. 21 1
      magic_pdf/pdf_parse_by_ocr.py
  3. 19 0
      magic_pdf/pre_proc/ocr_cut_image.py

+ 3 - 2
demo/ocr_demo.py

@@ -30,8 +30,8 @@ def read_json_file(file_path):
 
 
 if __name__ == '__main__':
-    ocr_pdf_path = r"D:\project\20231108code-clean\ocr\new\demo_4\ocr_demo\ocr_1_org.pdf"
-    ocr_json_file_path = r"D:\project\20231108code-clean\ocr\new\demo_4\ocr_demo\ocr_1.json"
+    ocr_pdf_path = r"D:\project\20231108code-clean\ocr\new\demo_4\ocr_demo\ocr_0_org.pdf"
+    ocr_json_file_path = r"D:\project\20231108code-clean\ocr\new\demo_4\ocr_demo\ocr_0.json"
     try:
         ocr_pdf_model_info = read_json_file(ocr_json_file_path)
         pth = Path(ocr_json_file_path)
@@ -43,6 +43,7 @@ if __name__ == '__main__':
             ocr_pdf_path,
             None,
             ocr_pdf_model_info,
+            save_path,
             book_name,
             debug_mode=True)
         parent_dir = os.path.dirname(text_content_save_path)

+ 21 - 1
magic_pdf/pdf_parse_by_ocr.py

@@ -1,3 +1,4 @@
+import json
 import os
 import time
 
@@ -10,6 +11,7 @@ from magic_pdf.pre_proc.detect_footer_by_model import parse_footers
 from magic_pdf.pre_proc.detect_footnote import parse_footnotes_by_model
 from magic_pdf.pre_proc.detect_header import parse_headers
 from magic_pdf.pre_proc.detect_page_number import parse_pageNos
+from magic_pdf.pre_proc.ocr_cut_image import cut_image_and_table
 from magic_pdf.pre_proc.ocr_detect_layout import layout_detect
 from magic_pdf.pre_proc.ocr_dict_merge import remove_overlaps_min_spans, merge_spans_to_line_by_layout
 from magic_pdf.pre_proc.ocr_remove_spans import remove_spans_by_bboxes
@@ -28,6 +30,7 @@ def parse_pdf_by_ocr(
         pdf_path,
         s3_pdf_profile,
         pdf_model_output,
+        save_path,
         book_name,
         pdf_model_profile=None,
         image_s3_config=None,
@@ -148,6 +151,10 @@ def parse_pdf_by_ocr(
         # 删除remove_span_block_bboxes中的bbox
         spans = remove_spans_by_bboxes(spans, need_remove_spans_bboxes)
 
+        # 对image和table截图
+        spans = cut_image_and_table(spans, page, page_id, book_name, save_path)
+
+
         # 行内公式调整, 高度调整至与同行文字高度一致(优先左侧, 其次右侧)
 
         # 模型识别错误的行间公式, type类型转换成行内公式
@@ -161,7 +168,7 @@ def parse_pdf_by_ocr(
 
         # 将spans合并成line(在layout内,从上到下,从左到右)
         lines = merge_spans_to_line_by_layout(spans, layout_bboxes)
-        # logger.info(lines)
+
 
         # 目前不做block拼接,先做个结构,每个block中只有一个line,block的bbox就是line的bbox
         blocks = []
@@ -175,4 +182,17 @@ def parse_pdf_by_ocr(
         page_info = construct_page_component(page_id, blocks, layout_bboxes)
         pdf_info_dict[f"page_{page_id}"] = page_info
 
+        # 在测试时,保存调试信息
+        if debug_mode:
+            params_file_save_path = join_path(save_tmp_path, "md", book_name, "preproc_out.json")
+            page_draw_rect_save_path = join_path(save_tmp_path, "md", book_name, "layout.pdf")
+
+            with open(params_file_save_path, "w", encoding="utf-8") as f:
+                json.dump(pdf_info_dict, f, ensure_ascii=False, indent=4)
+            # 先检测本地 page_draw_rect_save_path 是否存在,如果存在则删除
+            if os.path.exists(page_draw_rect_save_path):
+                os.remove(page_draw_rect_save_path)
+            # 绘制bbox和layout到pdf
+
+
     return pdf_info_dict

+ 19 - 0
magic_pdf/pre_proc/ocr_cut_image.py

@@ -0,0 +1,19 @@
+from magic_pdf.libs.commons import join_path
+from magic_pdf.libs.pdf_image_tools import cut_image
+
+
+def cut_image_and_table(spans, page, page_id, book_name, save_path):
+    def s3_return_path(type):
+        return join_path(book_name, type)
+
+    def img_save_path(type):
+        return join_path(save_path, s3_return_path(type))
+
+    for span in spans:
+        span_type = span['type']
+        if span_type == 'image':
+            span['image_path'] = cut_image(span['bbox'], page_id, page, img_save_path('image'))
+        elif span_type == 'table':
+            span['image_path'] = cut_image(span['bbox'], page_id, page, img_save_path('table'))
+
+    return spans