5 月之前 · 3334157f15
--- a/mineru/backend/pipeline/batch_analyze.py
+++ b/mineru/backend/pipeline/batch_analyze.py
@@ -230,19 +230,6 @@ class BatchAnalyze:
 
				                         ocr_result_list = get_ocr_result_list(ocr_res, useful_list, ocr_res_list_dict['ocr_enable'],
			
 
				                                                               new_image, _lang)
			
 
				 
			
 
				-                        # if res["category_id"] == 3 and ocr_res_list_dict['ocr_enable']:
			
 
				-                        #     # ocr_result_list中所有bbox的面积之和
			
 
				-                        #     ocr_res_area = sum(
			
 
				-                        #         get_coords_and_area(ocr_res_item)[4] for ocr_res_item in ocr_result_list if 'poly' in ocr_res_item)
			
 
				-                        #     # 求ocr_res_area和res的面积的比值
			
 
				-                        #     res_area = get_coords_and_area(res)[4]
			
 
				-                        #     if res_area > 0:
			
 
				-                        #         ratio = ocr_res_area / res_area
			
 
				-                        #         if ratio > 0.25:
			
 
				-                        #             res["category_id"] = 1
			
 
				-                        #         else:
			
 
				-                        #             continue
			
 
				-
			
 
				                         ocr_res_list_dict['layout_res'].extend(ocr_result_list)
			
 
				 
			
 
				         # 表格识别 table recognition
			
--- a/mineru/backend/pipeline/model_json_to_middle_json.py
+++ b/mineru/backend/pipeline/model_json_to_middle_json.py
@@ -48,6 +48,7 @@ def page_model_info_to_page_info(page_model_info, image_dict, page, image_writer
 
				     """获取所有的spans信息"""
			
 
				     spans = magic_model.get_all_spans()
			
 
				 
			
 
				+    """某些图可能是文本块，通过简单的规则判断一下"""
			
 
				     if len(maybe_text_image_blocks) > 0:
			
 
				         for block in maybe_text_image_blocks:
			
 
				             span_in_block_list = []
			
@@ -64,8 +65,10 @@ def page_model_info_to_page_info(page_model_info, image_dict, page, image_writer
 
				                     if ratio > 0.25 and ocr:
			
 
				                         # 移除block的group_id
			
 
				                         block.pop('group_id', None)
			
 
				+                        # 符合文本图的条件就把块加入到文本块列表中
			
 
				                         text_blocks.append(block)
			
 
				                     else:
			
 
				+                        # 如果不符合文本图的条件，就把块加回到图片块列表中
			
 
				                         img_body_blocks.append(block)
			
 
				             else:
			
 
				                 img_body_blocks.append(block)
			
--- a/mineru/cli/common.py
+++ b/mineru/cli/common.py
@@ -215,9 +215,10 @@ def do_parse(
 
				 
			
 
				 
			
 
				 if __name__ == "__main__":
			
 
				-    pdf_path = "../../demo/pdfs/demo2.pdf"
			
 
				-    with open(pdf_path, "rb") as f:
			
 
				-        try:
			
 
				-           do_parse("./output", [Path(pdf_path).stem], [f.read()],["ch"], end_page_id=20,)
			
 
				-        except Exception as e:
			
 
				-            logger.exception(e)
			
 
				+    pdf_path = "../../demo/pdfs/hello-algo-1.1.0-zh-c-word转换的span有问题.pdf"
			
 
				+    # pdf_path = "C:/Users/zhaoxiaomeng/Downloads/input_img_0.jpg"
			
 
				+
			
 
				+    try:
			
 
				+       do_parse("./output", [Path(pdf_path).stem], [read_fn(Path(pdf_path))],["ch"], end_page_id=20,)
			
 
				+    except Exception as e:
			
 
				+        logger.exception(e)