Bläddra i källkod

fix(compare_ocr_results): 更新测试文件路径和输出目录以反映新的数据结构
fix(content_extractor): 改进表格匹配正则表达式以支持带属性的表格

zhch158_admin 6 dagar sedan
förälder
incheckning
4715244ada
2 ändrade filer med 6 tillägg och 6 borttagningar
  1. ocr_comparator/compare_ocr_results.py (+3 −3)
  2. ocr_comparator/content_extractor.py (+3 −3)

+ 3 - 3
ocr_comparator/compare_ocr_results.py

@@ -90,9 +90,9 @@ if __name__ == "__main__":
         # 测试流水表格对比
         import time
         result = compare_ocr_results(
-            file1_path='/Users/zhch158/workspace/data/流水分析/B用户_扫描流水/dotsocr_vllm_results_cell_bbox/B用户_扫描流水_page_008.md',
-            file2_path='/Users/zhch158/workspace/data/流水分析/B用户_扫描流水/mineru_vllm_results/B用户_扫描流水_page_008.md',
-            output_file=f'/Users/zhch158/workspace/repository.git/ocr_verify/output/flow_list_comparison_{time.strftime("%Y%m%d_%H%M%S")}',
+            file1_path='/Users/zhch158/workspace/data/流水分析/B用户_扫描流水/bank_statement_yusys_v4/B用户_扫描流水_page_001.md',
+            file2_path='/Users/zhch158/workspace/data/流水分析/B用户_扫描流水/bank_statement_yusys_v3/B用户_扫描流水_page_001.md',
+            output_file=f'/Users/zhch158/workspace/data/流水分析/B用户_扫描流水/logs/flow_list_comparison_{time.strftime("%Y%m%d_%H%M%S")}',
             output_format='both',
             ignore_images=True,
             table_mode='flow_list',  # 使用流水表格模式

+ 3 - 3
ocr_comparator/content_extractor.py

@@ -61,14 +61,14 @@ class ContentExtractor:
                 ]
             }
         """
-        # 查找所有表格的位置
-        table_pattern = r'<table>.*?</table>'
+        # 匹配一个可能带任意属性的 <table ...> 到对应的 </table> 区间
+        table_pattern = r'<table\b[^>]*>.*?</table>'
         tables = []
         paragraph_blocks = []
         
         last_pos = 0
         
-        for match in re.finditer(table_pattern, content, re.DOTALL):
+        for match in re.finditer(table_pattern, content, re.DOTALL | re.IGNORECASE):
             start_pos = match.start()
             end_pos = match.end()