浏览代码

refactor: clean up unused OCR area calculation and update demo PDF path

myhloli 5 月之前
父节点
当前提交
3334157f15
共有 3 个文件被更改,包括 10 次插入和 19 次删除
  1. +0 -13
      mineru/backend/pipeline/batch_analyze.py
  2. +3 -0
      mineru/backend/pipeline/model_json_to_middle_json.py
  3. +7 -6
      mineru/cli/common.py

+ 0 - 13
mineru/backend/pipeline/batch_analyze.py

@@ -230,19 +230,6 @@ class BatchAnalyze:
                         ocr_result_list = get_ocr_result_list(ocr_res, useful_list, ocr_res_list_dict['ocr_enable'],
                                                               new_image, _lang)
 
-                        # if res["category_id"] == 3 and ocr_res_list_dict['ocr_enable']:
-                        #     # ocr_result_list中所有bbox的面积之和
-                        #     ocr_res_area = sum(
-                        #         get_coords_and_area(ocr_res_item)[4] for ocr_res_item in ocr_result_list if 'poly' in ocr_res_item)
-                        #     # 求ocr_res_area和res的面积的比值
-                        #     res_area = get_coords_and_area(res)[4]
-                        #     if res_area > 0:
-                        #         ratio = ocr_res_area / res_area
-                        #         if ratio > 0.25:
-                        #             res["category_id"] = 1
-                        #         else:
-                        #             continue
-
                         ocr_res_list_dict['layout_res'].extend(ocr_result_list)
 
         # 表格识别 table recognition

+ 3 - 0
mineru/backend/pipeline/model_json_to_middle_json.py

@@ -48,6 +48,7 @@ def page_model_info_to_page_info(page_model_info, image_dict, page, image_writer
     """获取所有的spans信息"""
     spans = magic_model.get_all_spans()
 
+    """某些图可能是文本块,通过简单的规则判断一下"""
     if len(maybe_text_image_blocks) > 0:
         for block in maybe_text_image_blocks:
             span_in_block_list = []
@@ -64,8 +65,10 @@ def page_model_info_to_page_info(page_model_info, image_dict, page, image_writer
                     if ratio > 0.25 and ocr:
                         # 移除block的group_id
                         block.pop('group_id', None)
+                        # 符合文本图的条件就把块加入到文本块列表中
                         text_blocks.append(block)
                     else:
+                        # 如果不符合文本图的条件,就把块加回到图片块列表中
                         img_body_blocks.append(block)
             else:
                 img_body_blocks.append(block)

+ 7 - 6
mineru/cli/common.py

@@ -215,9 +215,10 @@ def do_parse(
 
 
 if __name__ == "__main__":
-    pdf_path = "../../demo/pdfs/demo2.pdf"
-    with open(pdf_path, "rb") as f:
-        try:
-           do_parse("./output", [Path(pdf_path).stem], [f.read()],["ch"], end_page_id=20,)
-        except Exception as e:
-            logger.exception(e)
+    pdf_path = "../../demo/pdfs/hello-algo-1.1.0-zh-c-word转换的span有问题.pdf"
+    # pdf_path = "C:/Users/zhaoxiaomeng/Downloads/input_img_0.jpg"
+
+    try:
+       do_parse("./output", [Path(pdf_path).stem], [read_fn(Path(pdf_path))],["ch"], end_page_id=20,)
+    except Exception as e:
+        logger.exception(e)