há 1 mês atrás · 2e1eecddfd
--- a/ocr_by_vlm.py
+++ b/ocr_by_vlm.py
@@ -7,89 +7,11 @@ from pathlib import Path
 
				 from openai import OpenAI
			
 
				 from dotenv import load_dotenv
			
 
				 from typing import Any, Dict, List
			
 
				+from normalize_financial_numbers import normalize_financial_numbers, normalize_markdown_table
			
 
				 
			
 
				 # 加载环境变量
			
 
				 load_dotenv()
			
 
				 
			
 
				-def normalize_financial_numbers(text: str) -> str:
			
 
				-    """
			
 
				-    标准化财务数字：将全角字符转换为半角字符
			
 
				-    
			
 
				-    Args:
			
 
				-        text: 原始文本
			
 
				-    
			
 
				-    Returns:
			
 
				-        标准化后的文本
			
 
				-    """
			
 
				-    if not text:
			
 
				-        return text
			
 
				-    
			
 
				-    # 定义全角到半角的映射
			
 
				-    fullwidth_to_halfwidth = {
			
 
				-        '０': '0', '１': '1', '２': '2', '３': '3', '４': '4',
			
 
				-        '５': '5', '６': '6', '７': '7', '８': '8', '９': '9',
			
 
				-        '，': ',',  # 全角逗号转半角逗号
			
 
				-        '。': '.',  # 全角句号转半角句号  
			
 
				-        '．': '.',  # 全角句点转半角句点
			
 
				-        '：': ':',  # 全角冒号转半角冒号
			
 
				-        '；': ';',  # 全角分号转半角分号
			
 
				-        '（': '(',  # 全角左括号转半角左括号
			
 
				-        '）': ')',  # 全角右括号转半角右括号
			
 
				-        '－': '-',  # 全角减号转半角减号
			
 
				-        '＋': '+',  # 全角加号转半角加号
			
 
				-        '％': '%',  # 全角百分号转半角百分号
			
 
				-    }
			
 
				-    
			
 
				-    # 执行字符替换
			
 
				-    normalized_text = text
			
 
				-    for fullwidth, halfwidth in fullwidth_to_halfwidth.items():
			
 
				-        normalized_text = normalized_text.replace(fullwidth, halfwidth)
			
 
				-    
			
 
				-    # 特别处理金额格式：识别数字模式并标准化
			
 
				-    # 匹配金额模式：数字+全角逗号+数字+小数点+数字
			
 
				-    amount_pattern = r'(\d+(?:[，,]\d{3})*(?:[。．.]\d{2})?)'
			
 
				-    
			
 
				-    def normalize_amount(match):
			
 
				-        amount = match.group(1)
			
 
				-        # 将全角逗号替换为半角逗号
			
 
				-        amount = amount.replace('，', ',')
			
 
				-        # 将全角句号、句点替换为半角小数点
			
 
				-        amount = re.sub(r'[。．]', '.', amount)
			
 
				-        return amount
			
 
				-    
			
 
				-    normalized_text = re.sub(amount_pattern, normalize_amount, normalized_text)
			
 
				-    
			
 
				-    return normalized_text
			
 
				-
			
 
				-def normalize_markdown_table(markdown_content: str) -> str:
			
 
				-    """
			
 
				-    专门处理Markdown表格中的数字标准化
			
 
				-    
			
 
				-    Args:
			
 
				-        markdown_content: Markdown内容
			
 
				-    
			
 
				-    Returns:
			
 
				-        标准化后的Markdown内容
			
 
				-    """
			
 
				-    # 使用BeautifulSoup处理HTML表格
			
 
				-    from bs4 import BeautifulSoup
			
 
				-    
			
 
				-    soup = BeautifulSoup(markdown_content, 'html.parser')
			
 
				-    tables = soup.find_all('table')
			
 
				-    
			
 
				-    for table in tables:
			
 
				-        cells = table.find_all(['td', 'th'])
			
 
				-        for cell in cells:
			
 
				-            original_text = cell.get_text()
			
 
				-            normalized_text = normalize_financial_numbers(original_text)
			
 
				-            
			
 
				-            # 如果内容发生了变化，更新单元格内容
			
 
				-            if original_text != normalized_text:
			
 
				-                cell.string = normalized_text
			
 
				-    
			
 
				-    # 返回更新后的HTML
			
 
				-    return str(soup)
			
 
				-
			
 
				 def ocr_with_vlm(image_path, output_dir="./output", 
			
 
				                         api_key=None, api_base=None, model_id=None, 
			
 
				                         temperature=0.1, max_tokens=4096, timeout=180,