Forráskód Böngészése

feat: 更新OCR结果比较功能,添加日期时间格式检测和解析逻辑

zhch158_admin 1 hete
szülő
commit
cc3e15d2d7
2 módosított fájl, 67 hozzáadás és 20 törlés
  1. comparator/compare_ocr_results.py (+2 −2)
  2. comparator/data_type_detector.py (+65 −18)

+ 2 - 2
comparator/compare_ocr_results.py

@@ -80,8 +80,8 @@ if __name__ == "__main__":
         # 测试流水表格对比
         import time
         result = compare_ocr_results(
-            file1_path='/Users/zhch158/workspace/data/流水分析/2023年度报告母公司/paddleocr_vl_results_cell_bbox/2023年度报告母公司_page_003.md',
-            file2_path='/Users/zhch158/workspace/data/流水分析/2023年度报告母公司/mineru_vllm_results_cell_bbox/2023年度报告母公司_page_003.md',
+            file1_path='/Users/zhch158/workspace/data/流水分析/B用户_扫描流水/dotsocr_vllm_results_cell_bbox/B用户_扫描流水_page_008.md',
+            file2_path='/Users/zhch158/workspace/data/流水分析/B用户_扫描流水/mineru_vllm_results/B用户_扫描流水_page_008.md',
             output_file=f'/Users/zhch158/workspace/repository.git/ocr_verify/output/flow_list_comparison_{time.strftime("%Y%m%d_%H%M%S")}',
             output_format='both',
             ignore_images=True,

+ 65 - 18
comparator/data_type_detector.py

@@ -30,6 +30,18 @@ class DataTypeDetector:
         
         clean_text = re.sub(r'[\s-]', '', text)
         
+        # 排除日期格式 yyyymmdd
+        if len(clean_text) == 8 and clean_text.isdigit():
+            # 检查是否为合法日期
+            try:
+                year = int(clean_text[:4])
+                month = int(clean_text[4:6])
+                day = int(clean_text[6:8])
+                if 1900 <= year <= 2100 and 1 <= month <= 12 and 1 <= day <= 31:
+                    return False  # 这是日期,不是文本型数字
+            except ValueError:
+                pass
+        
         if clean_text.isdigit() and len(clean_text) > 15:
             return True
         
@@ -39,6 +51,37 @@ class DataTypeDetector:
         return False
     
     @staticmethod
+    def is_datetime(text: str) -> bool:
+        """判断文本是否为日期时间格式"""
+        if not text:
+            return False
+        
+        datetime_patterns = [
+            r'\d{4}[-/]\d{1,2}[-/]\d{1,2}',
+            r'\d{4}[-/]\d{1,2}[-/]\d{1,2}\s*\d{1,2}:\d{1,2}:\d{1,2}',
+            r'\d{4}年\d{1,2}月\d{1,2}日',
+            r'^\d{8}$',  # yyyymmdd 格式
+        ]
+        
+        for pattern in datetime_patterns:
+            if re.search(pattern, str(text).strip()):
+                # 对于 yyyymmdd 格式,验证日期合法性
+                if pattern == r'^\d{8}$':
+                    try:
+                        clean_text = str(text).strip()
+                        year = int(clean_text[:4])
+                        month = int(clean_text[4:6])
+                        day = int(clean_text[6:8])
+                        if 1900 <= year <= 2100 and 1 <= month <= 12 and 1 <= day <= 31:
+                            return True
+                    except (ValueError, IndexError):
+                        continue
+                else:
+                    return True
+        
+        return False
+    
+    @staticmethod
     def parse_number(text: str) -> float:
         """解析数字,处理千分位和货币符号"""
         if not text:
@@ -65,17 +108,33 @@ class DataTypeDetector:
     def extract_datetime(text: str) -> str:
         """提取并标准化日期时间"""
         patterns = [
+            # yyyy-mm-dd hh:mm:ss
             (r'(\d{4})[-/](\d{1,2})[-/](\d{1,2})\s*(\d{1,2}):(\d{1,2}):(\d{1,2})', 
              lambda m: f"{m.group(1)}-{m.group(2).zfill(2)}-{m.group(3).zfill(2)} {m.group(4).zfill(2)}:{m.group(5).zfill(2)}:{m.group(6).zfill(2)}"),
+            # yyyy-mm-dd
             (r'(\d{4})[-/](\d{1,2})[-/](\d{1,2})', 
              lambda m: f"{m.group(1)}-{m.group(2).zfill(2)}-{m.group(3).zfill(2)}"),
+            # yyyy年mm月dd日
             (r'(\d{4})年(\d{1,2})月(\d{1,2})日', 
              lambda m: f"{m.group(1)}-{m.group(2).zfill(2)}-{m.group(3).zfill(2)}"),
+            # yyyymmdd
+            (r'^(\d{4})(\d{2})(\d{2})$',
+             lambda m: f"{m.group(1)}{m.group(2)}{m.group(3)}"),
         ]
         
         for pattern, formatter in patterns:
-            match = re.search(pattern, text)
+            match = re.search(pattern, str(text).strip())
             if match:
+                if pattern == r'^(\d{4})(\d{2})(\d{2})$':
+                    # 验证日期合法性
+                    try:
+                        year = int(match.group(1))
+                        month = int(match.group(2))
+                        day = int(match.group(3))
+                        if not (1900 <= year <= 2100 and 1 <= month <= 12 and 1 <= day <= 31):
+                            continue
+                    except ValueError:
+                        continue
                 return formatter(match)
         
         return text
@@ -90,28 +149,16 @@ class DataTypeDetector:
         if not non_empty_values:
             return 'text'
         
+        # 先检测日期时间(优先级最高,避免 yyyymmdd 被误判为文本型数字)
+        datetime_count = sum(1 for v in non_empty_values[:5] if DataTypeDetector.is_datetime(v))
+        if datetime_count >= len(non_empty_values[:5]) * 0.6:
+            return 'datetime'
+        
         # 检测文本型数字
         text_number_count = sum(1 for v in non_empty_values[:5] if DataTypeDetector.is_text_number(v))
         if text_number_count >= len(non_empty_values[:5]) * 0.6:
             return 'text'
         
-        # 检测日期时间
-        datetime_patterns = [
-            r'\d{4}[-/]\d{1,2}[-/]\d{1,2}',
-            r'\d{4}[-/]\d{1,2}[-/]\d{1,2}\s*\d{1,2}:\d{1,2}:\d{1,2}',
-            r'\d{4}年\d{1,2}月\d{1,2}日',
-        ]
-        
-        datetime_count = 0
-        for value in non_empty_values[:5]:
-            for pattern in datetime_patterns:
-                if re.search(pattern, value):
-                    datetime_count += 1
-                    break
-        
-        if datetime_count >= len(non_empty_values[:5]) * 0.6:
-            return 'datetime'
-        
         # 检测数字
         numeric_count = sum(1 for v in non_empty_values[:5] 
                            if DataTypeDetector.is_numeric(v) and not DataTypeDetector.is_text_number(v))