Эх сурвалжийг харах

refactor: enhance HTML table extraction and add block content conversion to HTML

myhloli 3 сар өмнө
parent
commit
28553212c3

+ 6 - 3
mineru/backend/pipeline/batch_analyze.py

@@ -256,9 +256,12 @@ class BatchAnalyze:
                 html_code, table_cell_bboxes, logic_points, elapse = table_model.predict(table_res_dict['table_img'])
                 # 判断是否返回正常
                 if html_code:
-                    expected_ending = html_code.strip().endswith('</html>') or html_code.strip().endswith('</table>')
-                    if expected_ending:
-                        table_res_dict['table_res']['html'] = html_code
+                    # 检查html_code是否包含'<table>'和'</table>'
+                    if '<table>' in html_code and '</table>' in html_code:
+                        # 选用<table>到</table>的内容,放入table_res_dict['table_res']['html']
+                        start_index = html_code.find('<table>')
+                        end_index = html_code.rfind('</table>') + len('</table>')
+                        table_res_dict['table_res']['html'] = html_code[start_index:end_index]
                     else:
                         logger.warning(
                             'table recognition processing fails, not found expected HTML table end'

+ 6 - 11
mineru/backend/vlm/vlm_magic_model.py

@@ -5,7 +5,7 @@ from loguru import logger
 
 from mineru.utils.enum_class import ContentType, BlockType, SplitFlag
 from mineru.backend.vlm.vlm_middle_json_mkcontent import merge_para_with_text
-from mineru.utils.format_utils import convert_otsl_to_html
+from mineru.utils.format_utils import block_content_to_html
 from mineru.utils.magic_model_utils import reduct_overlap, tie_up_category_by_distance_v3
 
 
@@ -40,6 +40,10 @@ class MagicModel:
                 block_type = block_info[1].strip()
                 block_content = block_info[2].strip()
 
+                # 如果bbox是0,0,999,999,且type为text,按notes增加表格处理
+                if x1 == 0 and y1 == 0 and x2 == 999 and y2 == 999 and block_type == "text":
+                    block_content = block_content_to_html(block_content)
+
                 # print(f"坐标: {block_bbox}")
                 # print(f"类型: {block_type}")
                 # print(f"内容: {block_content}")
@@ -77,16 +81,7 @@ class MagicModel:
                     "type": span_type,
                 }
                 if span_type == ContentType.TABLE:
-                    if "<fcel>" in block_content or "<ecel>" in block_content:
-                        lines = block_content.split("\n\n")
-                        new_lines = []
-                        for line in lines:
-                            if "<fcel>" in line or "<ecel>" in line:
-                                line = convert_otsl_to_html(line)
-                            new_lines.append(line)
-                        span["html"] = "\n\n".join(new_lines)
-                    else:
-                        span["html"] = block_content
+                    span["html"] = block_content_to_html(block_content)
             elif span_type in [ContentType.INTERLINE_EQUATION]:
                 span = {
                     "bbox": block_bbox,

+ 10 - 0
mineru/utils/format_utils.py

@@ -317,3 +317,13 @@ def convert_otsl_to_html(otsl_content: str):
     )
 
     return export_to_html(table_data)
+
+
+def block_content_to_html(block_content: str) -> str:  # Return block_content with table-token chunks run through convert_otsl_to_html.
+    lines = block_content.split("\n\n")  # chunks are delimited by a double newline, not by single line breaks
+    new_lines = []
+    for line in lines:
+        if "<fcel>" in line or "<ecel>" in line:  # only chunks carrying these cell tokens are converted
+            line = convert_otsl_to_html(line)
+        new_lines.append(line)  # chunks without the tokens are kept unchanged
+    return "\n\n".join(new_lines)  # rejoin with the same double-newline delimiter used for splitting