# normalize_financial_numbers.py (18 KB)
  1. import re
  2. import os
  3. from pathlib import Path
  4. def _normalize_amount_token(token: str) -> str:
  5. """
  6. 规范单个金额 token 中逗号/小数点的用法,统一输出美式格式(千分位逗号 + 点小数)。
  7. 算法:
  8. 1. 找小数分隔符:优先取最后一个 '.'(若其后恰好为 1-2 位纯数字),
  9. 次选最后一个 ','(同条件);均不满足则视为纯整数。
  10. 2. 整数部分去除所有逗号和点,得到纯数字串,重新按三位一组插入千分位逗号。
  11. 3. 与小数部分拼接,统一输出 xxx,xxx.xx 格式。
  12. """
  13. if not token:
  14. return token
  15. # 只处理含分隔符的数字串,避免误改年份/ID 等纯数字
  16. if not re.fullmatch(r"[+-]?\d[\d,\.]*\d", token):
  17. return token
  18. if "," not in token and "." not in token:
  19. return token
  20. sign = ""
  21. core = token
  22. if core[0] in "+-":
  23. sign, core = core[0], core[1:]
  24. # 步骤 1:确定小数分隔符('.' 优先于 ',')
  25. dec_digits: str | None = None
  26. int_part = core
  27. for sep in (".", ","):
  28. pos = core.rfind(sep)
  29. if pos == -1:
  30. continue
  31. after = core[pos + 1 :]
  32. if 1 <= len(after) <= 2 and after.isdigit():
  33. dec_digits = after
  34. int_part = core[:pos]
  35. break
  36. # 步骤 2:整数部分去除所有分隔符,得到纯数字串
  37. int_digits = re.sub(r"[,.]", "", int_part)
  38. if not int_digits or not int_digits.isdigit():
  39. return token # 无法解析,保留原样
  40. # 步骤 3:重新做千分位分组
  41. n = len(int_digits)
  42. rem = n % 3 or 3
  43. groups = [int_digits[:rem]] + [int_digits[i : i + 3] for i in range(rem, n, 3)]
  44. result = sign + ",".join(groups)
  45. if dec_digits is not None:
  46. result += "." + dec_digits
  47. return result
  48. def normalize_financial_numbers(text: str) -> str:
  49. """
  50. 标准化财务数字:将全角字符转换为半角字符,并纠正常见的逗号/小数点错用。
  51. """
  52. if not text:
  53. return text
  54. # 定义全角到半角的映射
  55. fullwidth_to_halfwidth = {
  56. '0': '0', '1': '1', '2': '2', '3': '3', '4': '4',
  57. '5': '5', '6': '6', '7': '7', '8': '8', '9': '9',
  58. ',': ',', # 全角逗号转半角逗号
  59. '。': '.', # 全角句号转半角句号
  60. '.': '.', # 全角句点转半角句点
  61. ':': ':', # 全角冒号转半角冒号
  62. ';': ';', # 全角分号转半角分号
  63. '(': '(', # 全角左括号转半角左括号
  64. ')': ')', # 全角右括号转半角右括号
  65. '-': '-', # 全角减号转半角减号
  66. '+': '+', # 全角加号转半角加号
  67. '%': '%', # 全角百分号转半角百分号
  68. }
  69. # 第一步:执行基础字符替换(全角 -> 半角)
  70. normalized_text = text
  71. for fullwidth, halfwidth in fullwidth_to_halfwidth.items():
  72. normalized_text = normalized_text.replace(fullwidth, halfwidth)
  73. # 第二步:处理数字序列中的空格和分隔符(保留原有逻辑)
  74. number_sequence_pattern = r'(\d+(?:\s*[,,]\s*\d+)*(?:\s*[。..]\s*\d+)?)'
  75. def normalize_number_sequence(match):
  76. sequence = match.group(1)
  77. sequence = re.sub(r'(\d)\s*[,,]\s*(\d)', r'\1,\2', sequence)
  78. sequence = re.sub(r'(\d)\s*[。..]\s*(\d)', r'\1.\2', sequence)
  79. return sequence
  80. normalized_text = re.sub(number_sequence_pattern, normalize_number_sequence, normalized_text)
  81. # 第三步:对疑似金额 token 做逗号/小数点纠错
  82. amount_pattern = r'(?P<tok>[+-]?\d[\d,\.]*\d)'
  83. def _amount_sub(m: re.Match) -> str:
  84. tok = m.group('tok')
  85. return _normalize_amount_token(tok)
  86. normalized_text = re.sub(amount_pattern, _amount_sub, normalized_text)
  87. return normalized_text
  88. def normalize_markdown_table(markdown_content: str) -> str:
  89. """
  90. 专门处理Markdown表格中的数字标准化
  91. 注意:保留原始markdown中的换行符,只替换表格内的文本内容
  92. Args:
  93. markdown_content: Markdown内容
  94. Returns:
  95. 标准化后的Markdown内容
  96. """
  97. # 使用BeautifulSoup处理HTML表格
  98. from bs4 import BeautifulSoup, Tag
  99. import re
  100. # 使用正则表达式找到所有表格的位置,并保留其前后的内容
  101. # 匹配完整的HTML表格标签(包括嵌套)
  102. table_pattern = r'(<table[^>]*>.*?</table>)'
  103. def normalize_table_match(match):
  104. """处理单个表格匹配,保留原始格式,并追加数字标准化说明注释。"""
  105. table_html = match.group(1)
  106. original_table_html = table_html # 保存原始HTML用于比较
  107. # 解析表格HTML
  108. soup = BeautifulSoup(table_html, 'html.parser')
  109. tables = soup.find_all('table')
  110. # 记录本表格中所有数值修改
  111. changes: list[dict] = []
  112. for table in tables:
  113. if not isinstance(table, Tag):
  114. continue
  115. # 通过 tr / td(th) 计算行列位置
  116. for row_idx, tr in enumerate(table.find_all('tr')): # type: ignore[reportAttributeAccessIssue]
  117. cells = tr.find_all(['td', 'th']) # type: ignore[reportAttributeAccessIssue]
  118. for col_idx, cell in enumerate(cells):
  119. if not isinstance(cell, Tag):
  120. continue
  121. # 与 normalize_json_table 一致:整格取文本、只标准化一次、再写回
  122. original_text = cell.get_text()
  123. normalized_text = normalize_financial_numbers(original_text)
  124. if original_text == normalized_text:
  125. continue
  126. # 记录一条修改
  127. changes.append(
  128. {
  129. "row": row_idx,
  130. "col": col_idx,
  131. "old": original_text,
  132. "new": normalized_text,
  133. }
  134. )
  135. # 整格替换为标准化后的文本(与 normalize_json_table 的 cell.string = normalized_text 一致)
  136. cell.string = normalized_text
  137. # 如果没有任何数值修改,直接返回原始 HTML
  138. if not changes:
  139. return original_table_html
  140. # 获取修改后的HTML
  141. modified_html = str(soup)
  142. # 在表格后追加注释,说明哪些单元格被修改
  143. lines = ["<!-- 数字标准化说明:"]
  144. for ch in changes:
  145. lines.append(
  146. f" - [row={ch['row']},col={ch['col']}] {ch['old']} -> {ch['new']}"
  147. )
  148. lines.append("-->")
  149. comment = "\n".join(lines)
  150. return modified_html + "\n\n" + comment
  151. # 使用正则替换,只替换表格内容,保留其他部分(包括换行符)不变
  152. normalized_content = re.sub(table_pattern, normalize_table_match, markdown_content, flags=re.DOTALL)
  153. return normalized_content
  154. def normalize_json_table(
  155. json_content: str,
  156. *,
  157. table_type_key: str = "category",
  158. table_type_value: str = "Table",
  159. html_key: str = "text",
  160. cells_key: str | None = None,
  161. ) -> str:
  162. """
  163. 专门处理JSON格式OCR结果中表格的数字标准化。
  164. 通过参数指定提取用的 key,以兼容不同 OCR 工具的 JSON 结构。
  165. Args:
  166. json_content: JSON格式的OCR结果内容(字符串或已解析的 list)
  167. table_type_key: 用于判断“是否为表格”的字段名,如 "type" 或 "category"
  168. table_type_value: 上述字段等于该值时视为表格,如 "table" 或 "Table"
  169. html_key: 存放表格 HTML 的字段名,如 "table_body" 或 "text"
  170. cells_key: 存放单元格列表的字段名,如 "table_cells";为 None 则不处理 cells,
  171. 仅标准化 html_key 中的表格
  172. Returns:
  173. 标准化后的JSON内容(字符串)
  174. 常见格式示例:
  175. - 旧格式: category="Table", html 在 "text"
  176. normalize_json_table(s) # 默认即此
  177. - mineru_vllm_results_cell_bbox: type="table", html 在 "table_body", cells 在 "table_cells"
  178. normalize_json_table(s, table_type_key="type", table_type_value="table",
  179. html_key="table_body", cells_key="table_cells")
  180. """
  181. import json
  182. from ast import literal_eval
  183. try:
  184. data = json.loads(json_content) if isinstance(json_content, str) else json_content
  185. if not isinstance(data, list):
  186. return json_content
  187. for item in data:
  188. if not isinstance(item, dict):
  189. continue
  190. # 按参数判断是否为表格项,且包含 HTML
  191. if item.get(table_type_key) != table_type_value or html_key not in item:
  192. continue
  193. table_html = item[html_key]
  194. if not table_html or not isinstance(table_html, str):
  195. continue
  196. from bs4 import BeautifulSoup, Tag
  197. soup = BeautifulSoup(table_html, "html.parser")
  198. tables = soup.find_all("table")
  199. table_changes: list[dict] = []
  200. for table in tables:
  201. if not isinstance(table, Tag):
  202. continue
  203. for row_idx, tr in enumerate(table.find_all("tr")): # type: ignore[reportAttributeAccessIssue]
  204. cells_tag = tr.find_all(["td", "th"]) # type: ignore[reportAttributeAccessIssue]
  205. for col_idx, cell in enumerate(cells_tag):
  206. if not isinstance(cell, Tag):
  207. continue
  208. original_text = cell.get_text()
  209. normalized_text = normalize_financial_numbers(original_text)
  210. if original_text == normalized_text:
  211. continue
  212. change: dict[str, object] = {
  213. "row": row_idx,
  214. "col": col_idx,
  215. "old": original_text,
  216. "new": normalized_text,
  217. }
  218. bbox_attr = cell.get("data-bbox")
  219. if isinstance(bbox_attr, str):
  220. try:
  221. change["bbox"] = literal_eval(bbox_attr)
  222. except Exception:
  223. change["bbox"] = bbox_attr
  224. table_changes.append(change)
  225. cell.string = normalized_text
  226. # 写回 HTML
  227. item[html_key] = str(soup)
  228. if table_changes:
  229. item["number_normalization_changes"] = table_changes
  230. # 若指定了 cells_key,同时标准化 cells 中每格的 text(及 matched_text)
  231. # for key in ("text", "matched_text"):
  232. table_cell_text_keys = ["text"]
  233. if cells_key and cells_key in item and isinstance(item[cells_key], list):
  234. for cell in item[cells_key]:
  235. if not isinstance(cell, dict):
  236. continue
  237. for key in table_cell_text_keys:
  238. if key not in cell or not isinstance(cell[key], str):
  239. continue
  240. orig = cell[key]
  241. norm = normalize_financial_numbers(orig)
  242. if norm != orig:
  243. cell[key] = norm
  244. return json.dumps(data, ensure_ascii=False, indent=2)
  245. except json.JSONDecodeError as e:
  246. print(f"⚠️ JSON解析失败: {e}")
  247. return json_content
  248. except Exception as e:
  249. print(f"⚠️ JSON表格标准化失败: {e}")
  250. return json_content
  251. def normalize_json_file(
  252. file_path: str,
  253. output_path: str | None = None,
  254. *,
  255. table_type_key: str = "category",
  256. table_type_value: str = "Table",
  257. html_key: str = "text",
  258. cells_key: str | None = None,
  259. ) -> str:
  260. """
  261. 标准化JSON文件中的表格数字。
  262. 提取表格时使用的 key 可通过参数指定,以兼容不同 OCR 工具。
  263. Args:
  264. file_path: 输入JSON文件路径
  265. output_path: 输出文件路径,如果为None则覆盖原文件
  266. table_type_key: 判断表格的字段名(见 normalize_json_table)
  267. table_type_value: 判断表格的字段值
  268. html_key: 表格 HTML 所在字段名
  269. cells_key: 单元格列表所在字段名,None 表示不处理 cells
  270. Returns:
  271. 标准化后的JSON内容
  272. """
  273. input_file = Path(file_path)
  274. output_file = Path(output_path) if output_path else input_file
  275. if not input_file.exists():
  276. raise FileNotFoundError(f"找不到文件: {file_path}")
  277. with open(input_file, "r", encoding="utf-8") as f:
  278. original_content = f.read()
  279. print(f"🔧 正在标准化JSON文件: {input_file.name}")
  280. normalized_content = normalize_json_table(
  281. original_content,
  282. table_type_key=table_type_key,
  283. table_type_value=table_type_value,
  284. html_key=html_key,
  285. cells_key=cells_key,
  286. )
  287. # 保存标准化后的文件
  288. with open(output_file, 'w', encoding='utf-8') as f:
  289. f.write(normalized_content)
  290. # 统计变化
  291. changes = sum(1 for o, n in zip(original_content, normalized_content) if o != n)
  292. if changes > 0:
  293. print(f"✅ 标准化了 {changes} 个字符")
  294. # 如果输出路径不同,也保存原始版本
  295. if output_path and output_path != file_path:
  296. original_backup = Path(output_path).parent / f"{Path(output_path).stem}_original.json"
  297. with open(original_backup, 'w', encoding='utf-8') as f:
  298. f.write(original_content)
  299. print(f"📄 原始版本已保存到: {original_backup}")
  300. else:
  301. print("ℹ️ 无需标准化(已是标准格式)")
  302. print(f"📄 标准化结果已保存到: {output_file}")
  303. return normalized_content
  304. if __name__ == "__main__":
  305. """
  306. 简单验证:构造一份“故意打乱逗号/小数点”的 JSON / Markdown 示例,
  307. 并打印标准化前后的差异。
  308. """
  309. import json
  310. print("=== JSON 示例:金额格式纠错 + 变更记录 ===")
  311. demo_json_data = [
  312. {
  313. "category": "Table",
  314. "text": (
  315. "<table><tbody>"
  316. "<tr><td data-bbox=\"[0,0,10,10]\">项目</td>"
  317. "<td data-bbox=\"[10,0,20,10]\">2023 年12 月31 日</td></tr>"
  318. # 故意打乱的数字:应为 12,123,456.00 和 1,234,567.89
  319. "<tr><td data-bbox=\"[0,10,10,20]\">测试金额A</td>"
  320. "<td data-bbox=\"[10,10,20,20]\">12.123,456,00</td></tr>"
  321. "<tr><td data-bbox=\"[0,20,10,30]\">测试金额B</td>"
  322. "<td data-bbox=\"[10,20,20,30]\">1,234,567,89</td></tr>"
  323. "<tr><td data-bbox=\"[0,20,10,40]\">测试金额C</td>"
  324. "<td data-bbox=\"[10,20,20,40]\">301,55</td></tr>"
  325. "<tr><td data-bbox=\"[0,20,10,50]\">测试金额D</td>"
  326. "<td data-bbox=\"[10,20,20,40]\">1.068.987,094.02</td></tr>"
  327. "</tbody></table>"
  328. ),
  329. }
  330. ]
  331. demo_json_str = json.dumps(demo_json_data, ensure_ascii=False, indent=2)
  332. print("原始 JSON:")
  333. print(demo_json_str)
  334. normalized_json_str = normalize_json_table(demo_json_str)
  335. print("\n标准化后 JSON:")
  336. print(normalized_json_str)
  337. print("\n=== Markdown 示例:金额格式纠错 + 注释说明 ===")
  338. demo_md = """<table><tbody>
  339. <tr><td>项目</td><td>2023 年12 月31 日</td></tr>
  340. <tr><td>测试金额A</td><td>12.123,456,00</td></tr>
  341. <tr><td>测试金额B</td><td>1,234,567,89</td></tr>
  342. <tr><td>测试金额C</td><td>301,55</td></tr>
  343. <tr><td>测试金额D</td><td>1.068.987,094.02</td></tr>
  344. </tbody></table>
  345. """
  346. print("原始 Markdown:")
  347. print(demo_md)
  348. normalized_md = normalize_markdown_table(demo_md)
  349. print("\n标准化后 Markdown:")
  350. print(normalized_md)
  351. cases = [
  352. # A 类:标准美式格式,不应被修改
  353. ("10,000.00", "10,000.00"),
  354. ("67,455.00", "67,455.00"),
  355. ("89,400.00", "89,400.00"),
  356. ("100,200.00", "100,200.00"),
  357. ("494,339.63", "494,339.63"),
  358. ("1,179.05", "1,179.05"),
  359. ("27,396.05", "27,396.05"),
  360. # B 类:混合/大数格式,需被修正
  361. ("19.879,111.45", "19,879,111.45"),
  362. ("27.072,795.05", "27,072,795.05"),
  363. ("468.348,422.85", "468,348,422.85"),
  364. ("4740,251.56", "4,740,251.56"),
  365. # C 类:多余分隔符
  366. ("585,515.936.19", "585,515,936.19"),
  367. ("22,240.761.60", "22,240,761.60"),
  368. ("198,757.280.38", "198,757,280.38"),
  369. ("618,846.219.71", "618,846,219.71"),
  370. # 原 demo 案例
  371. ("12.123,456,00", "12,123,456.00"),
  372. ("1,234,567,89", "1,234,567.89"),
  373. ("301,55", "301.55"),
  374. ("1.068.987,094.02", "1,068,987,094.02"),
  375. # 标准欧洲格式
  376. ("1.234,56", "1,234.56"),
  377. ("1.234.567,89", "1,234,567.89"),
  378. ]
  379. ok = fail = 0
  380. for inp, expected in cases:
  381. got = _normalize_amount_token(inp)
  382. status = "✅" if got == expected else "❌"
  383. if got != expected:
  384. fail += 1
  385. print(f"{status} {inp!r:30s} → {got!r}" + (f" (期望 {expected!r})" if got != expected else ""))
  386. print(f"\n共 {ok+fail} 个,通过 {len(cases)-fail},失败 {fail}")