normalize_financial_numbers.py 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469
  1. import re
  2. import os
  3. from pathlib import Path
  4. def _normalize_amount_token(token: str) -> str:
  5. """
  6. 规范单个金额 token 中逗号/小数点的用法,统一输出美式格式(千分位逗号 + 点小数)。
  7. 算法:
  8. 1. 找小数分隔符:优先取最后一个 '.'(若其后恰好为 1-2 位纯数字),
  9. 次选最后一个 ','(同条件);均不满足则视为纯整数。
  10. 2. 整数部分去除所有逗号和点,得到纯数字串,重新按三位一组插入千分位逗号。
  11. 3. 与小数部分拼接,统一输出 xxx,xxx.xx 格式。
  12. """
  13. if not token:
  14. return token
  15. # 只处理含分隔符的数字串,避免误改年份/ID 等纯数字
  16. if not re.fullmatch(r"[+-]?\d[\d,\.]*\d", token):
  17. return token
  18. if "," not in token and "." not in token:
  19. return token
  20. sign = ""
  21. core = token
  22. if core[0] in "+-":
  23. sign, core = core[0], core[1:]
  24. # 条件1:去符号后为纯整数(无分隔符),无需处理
  25. if core.isdigit():
  26. return token
  27. # 条件2:最后一个小数点之前无逗号/小数点(整数部分是纯数字)→ 已是正确小数格式,直接返回
  28. dot_pos = core.rfind('.')
  29. if dot_pos != -1 and core[:dot_pos].isdigit() and core[dot_pos + 1:].isdigit():
  30. return token
  31. # 步骤 1:确定小数分隔符('.' 优先于 ',')
  32. dec_digits: str | None = None
  33. int_part = core
  34. for sep in (".", ","):
  35. pos = core.rfind(sep)
  36. if pos == -1:
  37. continue
  38. after = core[pos + 1 :]
  39. if 1 <= len(after) <= 2 and after.isdigit():
  40. dec_digits = after
  41. int_part = core[:pos]
  42. break
  43. # 步骤 2:整数部分去除所有分隔符,得到纯数字串
  44. int_digits = re.sub(r"[,.]", "", int_part)
  45. if not int_digits or not int_digits.isdigit():
  46. return token # 无法解析,保留原样
  47. # 步骤 2.5:整数部分本身没有分隔符(如 1101,55 中的 1101)
  48. # → 原数字未使用千分位,只修正小数点符号,不添加千分位
  49. if int_part == int_digits:
  50. result = sign + int_digits
  51. if dec_digits is not None:
  52. result += "." + dec_digits
  53. return result
  54. # 步骤 3:重新做千分位分组
  55. n = len(int_digits)
  56. rem = n % 3 or 3
  57. groups = [int_digits[:rem]] + [int_digits[i : i + 3] for i in range(rem, n, 3)]
  58. result = sign + ",".join(groups)
  59. if dec_digits is not None:
  60. result += "." + dec_digits
  61. return result
  62. def normalize_financial_numbers(text: str) -> str:
  63. """
  64. 标准化财务数字:将全角字符转换为半角字符,并纠正常见的逗号/小数点错用。
  65. """
  66. if not text:
  67. return text
  68. # 定义全角到半角的映射
  69. fullwidth_to_halfwidth = {
  70. '0': '0', '1': '1', '2': '2', '3': '3', '4': '4',
  71. '5': '5', '6': '6', '7': '7', '8': '8', '9': '9',
  72. ',': ',', # 全角逗号转半角逗号
  73. '。': '.', # 全角句号转半角句号
  74. '.': '.', # 全角句点转半角句点
  75. ':': ':', # 全角冒号转半角冒号
  76. ';': ';', # 全角分号转半角分号
  77. '(': '(', # 全角左括号转半角左括号
  78. ')': ')', # 全角右括号转半角右括号
  79. '-': '-', # 全角减号转半角减号
  80. '+': '+', # 全角加号转半角加号
  81. '%': '%', # 全角百分号转半角百分号
  82. }
  83. # 第一步:执行基础字符替换(全角 -> 半角)
  84. normalized_text = text
  85. for fullwidth, halfwidth in fullwidth_to_halfwidth.items():
  86. normalized_text = normalized_text.replace(fullwidth, halfwidth)
  87. # 第二步:处理数字序列中的空格和分隔符(保留原有逻辑)
  88. number_sequence_pattern = r'(\d+(?:\s*[,,]\s*\d+)*(?:\s*[。..]\s*\d+)?)'
  89. def normalize_number_sequence(match):
  90. sequence = match.group(1)
  91. sequence = re.sub(r'(\d)\s*[,,]\s*(\d)', r'\1,\2', sequence)
  92. sequence = re.sub(r'(\d)\s*[。..]\s*(\d)', r'\1.\2', sequence)
  93. return sequence
  94. normalized_text = re.sub(number_sequence_pattern, normalize_number_sequence, normalized_text)
  95. # 第三步:对疑似金额 token 做逗号/小数点纠错
  96. amount_pattern = r'(?P<tok>[+-]?\d[\d,\.]*\d)'
  97. def _amount_sub(m: re.Match) -> str:
  98. tok = m.group('tok')
  99. return _normalize_amount_token(tok)
  100. normalized_text = re.sub(amount_pattern, _amount_sub, normalized_text)
  101. return normalized_text
  102. def normalize_markdown_table(markdown_content: str) -> str:
  103. """
  104. 专门处理Markdown表格中的数字标准化
  105. 注意:保留原始markdown中的换行符,只替换表格内的文本内容
  106. Args:
  107. markdown_content: Markdown内容
  108. Returns:
  109. 标准化后的Markdown内容
  110. """
  111. # 使用BeautifulSoup处理HTML表格
  112. from bs4 import BeautifulSoup, Tag
  113. import re
  114. # 使用正则表达式找到所有表格的位置,并保留其前后的内容
  115. # 匹配完整的HTML表格标签(包括嵌套)
  116. table_pattern = r'(<table[^>]*>.*?</table>)'
  117. def normalize_table_match(match):
  118. """处理单个表格匹配,保留原始格式,并追加数字标准化说明注释。"""
  119. table_html = match.group(1)
  120. original_table_html = table_html # 保存原始HTML用于比较
  121. # 解析表格HTML
  122. soup = BeautifulSoup(table_html, 'html.parser')
  123. tables = soup.find_all('table')
  124. # 记录本表格中所有数值修改
  125. changes: list[dict] = []
  126. for table in tables:
  127. if not isinstance(table, Tag):
  128. continue
  129. # 通过 tr / td(th) 计算行列位置
  130. for row_idx, tr in enumerate(table.find_all('tr')): # type: ignore[reportAttributeAccessIssue]
  131. cells = tr.find_all(['td', 'th']) # type: ignore[reportAttributeAccessIssue]
  132. for col_idx, cell in enumerate(cells):
  133. if not isinstance(cell, Tag):
  134. continue
  135. # 与 normalize_json_table 一致:整格取文本、只标准化一次、再写回
  136. original_text = cell.get_text()
  137. normalized_text = normalize_financial_numbers(original_text)
  138. if original_text == normalized_text:
  139. continue
  140. # 记录一条修改
  141. changes.append(
  142. {
  143. "row": row_idx,
  144. "col": col_idx,
  145. "old": original_text,
  146. "new": normalized_text,
  147. }
  148. )
  149. # 整格替换为标准化后的文本(与 normalize_json_table 的 cell.string = normalized_text 一致)
  150. cell.string = normalized_text
  151. # 如果没有任何数值修改,直接返回原始 HTML
  152. if not changes:
  153. return original_table_html
  154. # 获取修改后的HTML
  155. modified_html = str(soup)
  156. # 在表格后追加注释,说明哪些单元格被修改
  157. lines = ["<!-- 数字标准化说明:"]
  158. for ch in changes:
  159. lines.append(
  160. f" - [row={ch['row']},col={ch['col']}] {ch['old']} -> {ch['new']}"
  161. )
  162. lines.append("-->")
  163. comment = "\n".join(lines)
  164. return modified_html + "\n\n" + comment
  165. # 使用正则替换,只替换表格内容,保留其他部分(包括换行符)不变
  166. normalized_content = re.sub(table_pattern, normalize_table_match, markdown_content, flags=re.DOTALL)
  167. return normalized_content
  168. def normalize_json_table(
  169. json_content: str,
  170. *,
  171. table_type_key: str = "category",
  172. table_type_value: str = "Table",
  173. html_key: str = "text",
  174. cells_key: str | None = None,
  175. ) -> str:
  176. """
  177. 专门处理JSON格式OCR结果中表格的数字标准化。
  178. 通过参数指定提取用的 key,以兼容不同 OCR 工具的 JSON 结构。
  179. Args:
  180. json_content: JSON格式的OCR结果内容(字符串或已解析的 list)
  181. table_type_key: 用于判断“是否为表格”的字段名,如 "type" 或 "category"
  182. table_type_value: 上述字段等于该值时视为表格,如 "table" 或 "Table"
  183. html_key: 存放表格 HTML 的字段名,如 "table_body" 或 "text"
  184. cells_key: 存放单元格列表的字段名,如 "table_cells";为 None 则不处理 cells,
  185. 仅标准化 html_key 中的表格
  186. Returns:
  187. 标准化后的JSON内容(字符串)
  188. 常见格式示例:
  189. - 旧格式: category="Table", html 在 "text"
  190. normalize_json_table(s) # 默认即此
  191. - mineru_vllm_results_cell_bbox: type="table", html 在 "table_body", cells 在 "table_cells"
  192. normalize_json_table(s, table_type_key="type", table_type_value="table",
  193. html_key="table_body", cells_key="table_cells")
  194. """
  195. import json
  196. from ast import literal_eval
  197. try:
  198. data = json.loads(json_content) if isinstance(json_content, str) else json_content
  199. if not isinstance(data, list):
  200. return json_content
  201. for item in data:
  202. if not isinstance(item, dict):
  203. continue
  204. # 按参数判断是否为表格项,且包含 HTML
  205. if item.get(table_type_key) != table_type_value or html_key not in item:
  206. continue
  207. table_html = item[html_key]
  208. if not table_html or not isinstance(table_html, str):
  209. continue
  210. from bs4 import BeautifulSoup, Tag
  211. soup = BeautifulSoup(table_html, "html.parser")
  212. tables = soup.find_all("table")
  213. table_changes: list[dict] = []
  214. for table in tables:
  215. if not isinstance(table, Tag):
  216. continue
  217. for row_idx, tr in enumerate(table.find_all("tr")): # type: ignore[reportAttributeAccessIssue]
  218. cells_tag = tr.find_all(["td", "th"]) # type: ignore[reportAttributeAccessIssue]
  219. for col_idx, cell in enumerate(cells_tag):
  220. if not isinstance(cell, Tag):
  221. continue
  222. original_text = cell.get_text()
  223. normalized_text = normalize_financial_numbers(original_text)
  224. if original_text == normalized_text:
  225. continue
  226. change: dict[str, object] = {
  227. "row": row_idx,
  228. "col": col_idx,
  229. "old": original_text,
  230. "new": normalized_text,
  231. }
  232. bbox_attr = cell.get("data-bbox")
  233. if isinstance(bbox_attr, str):
  234. try:
  235. change["bbox"] = literal_eval(bbox_attr)
  236. except Exception:
  237. change["bbox"] = bbox_attr
  238. table_changes.append(change)
  239. cell.string = normalized_text
  240. # 写回 HTML
  241. item[html_key] = str(soup)
  242. if table_changes:
  243. item["number_normalization_changes"] = table_changes
  244. # 若指定了 cells_key,同时标准化 cells 中每格的 text(及 matched_text)
  245. # for key in ("text", "matched_text"):
  246. table_cell_text_keys = ["text"]
  247. if cells_key and cells_key in item and isinstance(item[cells_key], list):
  248. for cell in item[cells_key]:
  249. if not isinstance(cell, dict):
  250. continue
  251. for key in table_cell_text_keys:
  252. if key not in cell or not isinstance(cell[key], str):
  253. continue
  254. orig = cell[key]
  255. norm = normalize_financial_numbers(orig)
  256. if norm != orig:
  257. cell[key] = norm
  258. return json.dumps(data, ensure_ascii=False, indent=2)
  259. except json.JSONDecodeError as e:
  260. print(f"⚠️ JSON解析失败: {e}")
  261. return json_content
  262. except Exception as e:
  263. print(f"⚠️ JSON表格标准化失败: {e}")
  264. return json_content
  265. def normalize_json_file(
  266. file_path: str,
  267. output_path: str | None = None,
  268. *,
  269. table_type_key: str = "category",
  270. table_type_value: str = "Table",
  271. html_key: str = "text",
  272. cells_key: str | None = None,
  273. ) -> str:
  274. """
  275. 标准化JSON文件中的表格数字。
  276. 提取表格时使用的 key 可通过参数指定,以兼容不同 OCR 工具。
  277. Args:
  278. file_path: 输入JSON文件路径
  279. output_path: 输出文件路径,如果为None则覆盖原文件
  280. table_type_key: 判断表格的字段名(见 normalize_json_table)
  281. table_type_value: 判断表格的字段值
  282. html_key: 表格 HTML 所在字段名
  283. cells_key: 单元格列表所在字段名,None 表示不处理 cells
  284. Returns:
  285. 标准化后的JSON内容
  286. """
  287. input_file = Path(file_path)
  288. output_file = Path(output_path) if output_path else input_file
  289. if not input_file.exists():
  290. raise FileNotFoundError(f"找不到文件: {file_path}")
  291. with open(input_file, "r", encoding="utf-8") as f:
  292. original_content = f.read()
  293. print(f"🔧 正在标准化JSON文件: {input_file.name}")
  294. normalized_content = normalize_json_table(
  295. original_content,
  296. table_type_key=table_type_key,
  297. table_type_value=table_type_value,
  298. html_key=html_key,
  299. cells_key=cells_key,
  300. )
  301. # 保存标准化后的文件
  302. with open(output_file, 'w', encoding='utf-8') as f:
  303. f.write(normalized_content)
  304. # 统计变化
  305. changes = sum(1 for o, n in zip(original_content, normalized_content) if o != n)
  306. if changes > 0:
  307. print(f"✅ 标准化了 {changes} 个字符")
  308. # 如果输出路径不同,也保存原始版本
  309. if output_path and output_path != file_path:
  310. original_backup = Path(output_path).parent / f"{Path(output_path).stem}_original.json"
  311. with open(original_backup, 'w', encoding='utf-8') as f:
  312. f.write(original_content)
  313. print(f"📄 原始版本已保存到: {original_backup}")
  314. else:
  315. print("ℹ️ 无需标准化(已是标准格式)")
  316. print(f"📄 标准化结果已保存到: {output_file}")
  317. return normalized_content
  318. if __name__ == "__main__":
  319. """
  320. 简单验证:构造一份“故意打乱逗号/小数点”的 JSON / Markdown 示例,
  321. 并打印标准化前后的差异。
  322. """
  323. import json
  324. print("=== JSON 示例:金额格式纠错 + 变更记录 ===")
  325. demo_json_data = [
  326. {
  327. "category": "Table",
  328. "text": (
  329. "<table><tbody>"
  330. "<tr><td data-bbox=\"[0,0,10,10]\">项目</td>"
  331. "<td data-bbox=\"[10,0,20,10]\">2023 年12 月31 日</td></tr>"
  332. # 故意打乱的数字:应为 12,123,456.00 和 1,234,567.89
  333. "<tr><td data-bbox=\"[0,10,10,20]\">测试金额A</td>"
  334. "<td data-bbox=\"[10,10,20,20]\">12.123,456,00</td></tr>"
  335. "<tr><td data-bbox=\"[0,20,10,30]\">测试金额B</td>"
  336. "<td data-bbox=\"[10,20,20,30]\">1,234,567,89</td></tr>"
  337. "<tr><td data-bbox=\"[0,20,10,40]\">测试金额C</td>"
  338. "<td data-bbox=\"[10,20,20,40]\">301,55</td></tr>"
  339. "<tr><td data-bbox=\"[0,20,10,50]\">测试金额D</td>"
  340. "<td data-bbox=\"[10,20,20,40]\">1.068.987,094.02</td></tr>"
  341. "</tbody></table>"
  342. ),
  343. }
  344. ]
  345. demo_json_str = json.dumps(demo_json_data, ensure_ascii=False, indent=2)
  346. print("原始 JSON:")
  347. print(demo_json_str)
  348. normalized_json_str = normalize_json_table(demo_json_str)
  349. print("\n标准化后 JSON:")
  350. print(normalized_json_str)
  351. print("\n=== Markdown 示例:金额格式纠错 + 注释说明 ===")
  352. demo_md = """<table><tbody>
  353. <tr><td>项目</td><td>2023 年12 月31 日</td></tr>
  354. <tr><td>测试金额A</td><td>12.123,456,00</td></tr>
  355. <tr><td>测试金额B</td><td>1,234,567,89</td></tr>
  356. <tr><td>测试金额C</td><td>301,55</td></tr>
  357. <tr><td>测试金额D</td><td>1.068.987,094.02</td></tr>
  358. </tbody></table>
  359. """
  360. print("原始 Markdown:")
  361. print(demo_md)
  362. normalized_md = normalize_markdown_table(demo_md)
  363. print("\n标准化后 Markdown:")
  364. print(normalized_md)
  365. cases = [
  366. # A 类:标准美式格式,不应被修改
  367. ("10,000.00", "10,000.00"),
  368. ("67,455.00", "67,455.00"),
  369. ("89,400.00", "89,400.00"),
  370. ("100,200.00", "100,200.00"),
  371. ("494,339.63", "494,339.63"),
  372. ("1,179.05", "1,179.05"),
  373. ("27,396.05", "27,396.05"),
  374. # B 类:混合/大数格式,需被修正
  375. ("19.879,111.45", "19,879,111.45"),
  376. ("27.072,795.05", "27,072,795.05"),
  377. ("468.348,422.85", "468,348,422.85"),
  378. ("4740,251.56", "4,740,251.56"),
  379. # C 类:多余分隔符
  380. ("585,515.936.19", "585,515,936.19"),
  381. ("22,240.761.60", "22,240,761.60"),
  382. ("198,757.280.38", "198,757,280.38"),
  383. ("618,846.219.71", "618,846,219.71"),
  384. # 原 demo 案例
  385. ("12.123,456,00", "12,123,456.00"),
  386. ("1,234,567,89", "1,234,567.89"),
  387. ("301,55", "301.55"),
  388. ("1.068.987,094.02", "1,068,987,094.02"),
  389. # 标准欧洲格式
  390. ("1.234,56", "1,234.56"),
  391. ("1.234.567,89", "1,234,567.89"),
  392. ]
  393. ok = fail = 0
  394. for inp, expected in cases:
  395. got = _normalize_amount_token(inp)
  396. status = "✅" if got == expected else "❌"
  397. if got != expected:
  398. fail += 1
  399. print(f"{status} {inp!r:30s} → {got!r}" + (f" (期望 {expected!r})" if got != expected else ""))
  400. print(f"\n共 {ok+fail} 个,通过 {len(cases)-fail},失败 {fail}")