|
|
@@ -7,89 +7,11 @@ from pathlib import Path
|
|
|
from openai import OpenAI
|
|
|
from dotenv import load_dotenv
|
|
|
from typing import Any, Dict, List
|
|
|
+from normalize_financial_numbers import normalize_financial_numbers, normalize_markdown_table
|
|
|
|
|
|
# Load environment variables
load_dotenv()
|
|
|
|
|
|
# Translation table, built once at import time.  Every key is spelled as an
# explicit code point so the table cannot silently degrade into an identity
# mapping (e.g. '0' -> '0') if this file is ever passed through a tool that
# applies Unicode compatibility normalization to the source text.
_FULLWIDTH_TO_HALFWIDTH = str.maketrans({
    # Fullwidth digits U+FF10..U+FF19 -> ASCII "0".."9".
    **{chr(0xFF10 + digit): str(digit) for digit in range(10)},
    "\uff0c": ",",  # fullwidth comma
    "\u3002": ".",  # ideographic full stop, used as a decimal point in OCR output
    "\uff0e": ".",  # fullwidth full stop
    "\uff1a": ":",  # fullwidth colon
    "\uff1b": ";",  # fullwidth semicolon
    "\uff08": "(",  # fullwidth left parenthesis
    "\uff09": ")",  # fullwidth right parenthesis
    "\uff0d": "-",  # fullwidth hyphen-minus
    "\uff0b": "+",  # fullwidth plus sign
    "\uff05": "%",  # fullwidth percent sign
})


def normalize_financial_numbers(text: str) -> str:
    """Convert fullwidth digits and punctuation in *text* to their ASCII forms.

    The characters handled are the fullwidth digits 0-9, the fullwidth
    comma, full stop, colon, semicolon, parentheses, hyphen-minus, plus and
    percent signs, and the ideographic full stop (U+3002), which is mapped
    to ``.``.  All other characters are left untouched.

    The substitution is applied to the whole string in a single pass.  A
    separate regex pass over amount-like substrings is not needed: it would
    only repeat the comma / full-stop substitutions that the table already
    performs everywhere.

    Args:
        text: The raw text.  Falsy values (``""`` or ``None``) are returned
            unchanged.

    Returns:
        The text with the characters listed above replaced by ASCII.
    """
    if not text:
        return text

    return text.translate(_FULLWIDTH_TO_HALFWIDTH)
|
|
|
-
|
|
|
def normalize_markdown_table(markdown_content: str) -> str:
    """Normalize numbers inside HTML tables embedded in Markdown content.

    The content is parsed with BeautifulSoup's ``html.parser``.  Every text
    node found inside a ``<td>`` or ``<th>`` of every ``<table>`` is passed
    through ``normalize_financial_numbers``.  Text nodes are rewritten one
    by one rather than by assigning ``cell.string``, because that assignment
    replaces *all* children of the cell and would therefore drop inline
    markup (``<b>``, ``<br>``, nested tables, ...) whenever a cell changes.

    Args:
        markdown_content: Markdown text that may contain HTML tables.
            Falsy values (``""`` or ``None``) are returned unchanged.

    Returns:
        The document as re-serialized by BeautifulSoup (``str(soup)``), with
        the table cell text normalized.
    """
    if not markdown_content:
        return markdown_content

    # Imported inside the function, as the original did, so bs4 is only
    # required when this function is actually called.
    from bs4 import BeautifulSoup, Comment

    soup = BeautifulSoup(markdown_content, 'html.parser')

    for table in soup.find_all('table'):
        for cell in table.find_all(['td', 'th']):
            # find_all returns a list, so replacing nodes while looping over
            # it is safe.
            for node in cell.find_all(string=True):
                # Comments are not visible cell text; leave them verbatim.
                if isinstance(node, Comment):
                    continue

                original_text = str(node)
                normalized_text = normalize_financial_numbers(original_text)

                # Only touch the tree when the text actually changed.
                if normalized_text != original_text:
                    node.replace_with(normalized_text)

    # Return the updated HTML.
    return str(soup)
|
|
|
-
|
|
|
def ocr_with_vlm(image_path, output_dir="./output",
|
|
|
api_key=None, api_base=None, model_id=None,
|
|
|
temperature=0.1, max_tokens=4096, timeout=180,
|