# normalize_financial_numbers.py (17 KB)
import json
import os
import re
from ast import literal_eval
from decimal import Decimal, InvalidOperation
from pathlib import Path
  5. def _normalize_amount_token(token: str) -> str:
  6. """
  7. 规范单个金额 token 中逗号/小数点的用法。
  8. 仅在形态明显为金额时进行纠错,其他情况原样返回。
  9. """
  10. if not token:
  11. return token
  12. # 只处理包含数字的简单 token,避免带字母/其他符号的误改
  13. if not re.fullmatch(r"[+-]?\d[\d,\.]*\d", token):
  14. return token
  15. sign = ""
  16. core = token
  17. if core[0] in "+-":
  18. sign, core = core[0], core[1:]
  19. has_dot = "." in core
  20. has_comma = "," in core
  21. # 辅助: 尝试解析为 Decimal;失败则认为不安全,回退原值
  22. def _safe_decimal(s: str) -> bool:
  23. try:
  24. Decimal(s.replace(",", ""))
  25. return True
  26. except (InvalidOperation, ValueError):
  27. return False
  28. # 规则A:同时包含 . 和 ,,最后一个分隔符是逗号,且其后为 1-2 位数字
  29. if has_dot and has_comma:
  30. last_comma = core.rfind(",")
  31. last_dot = core.rfind(".")
  32. if last_comma > last_dot and last_comma != -1:
  33. frac = core[last_comma + 1 :]
  34. if 1 <= len(frac) <= 2 and frac.isdigit():
  35. # 先把所有点当作千分位逗号,再把最后一个逗号当作小数点
  36. temp = core.replace(".", ",")
  37. idx = temp.rfind(",")
  38. if idx != -1:
  39. candidate = temp[:idx] + "." + temp[idx + 1 :]
  40. if _safe_decimal(candidate):
  41. return sign + candidate
  42. # 规则B:只有 .,多个点,最后一段视为小数,其余为千分位
  43. if has_dot and not has_comma:
  44. parts = core.split(".")
  45. if len(parts) >= 3:
  46. last = parts[-1]
  47. ints = parts[:-1]
  48. if 1 <= len(last) <= 2 and all(len(p) == 3 for p in ints[1:]):
  49. candidate = ",".join(ints) + "." + last
  50. if _safe_decimal(candidate):
  51. return sign + candidate
  52. # 规则C:只有 ,,多个逗号,最后一段长度为 1-2 且前面为 3 位分组
  53. if has_comma and not has_dot:
  54. parts = core.split(",")
  55. if len(parts) >= 3:
  56. last = parts[-1]
  57. ints = parts[:-1]
  58. if 1 <= len(last) <= 2 and all(len(p) == 3 for p in ints[1:]):
  59. # 将最后一个逗号视为小数点
  60. idx = core.rfind(",")
  61. candidate = core[:idx] + "." + core[idx + 1 :]
  62. if _safe_decimal(candidate):
  63. return sign + candidate
  64. # 规则D:只有 ,,且仅有一个逗号、逗号后 1-2 位数字 → 欧洲格式小数,如 301,55 → 301.55
  65. elif len(parts) == 2:
  66. left, right = parts[0], parts[1]
  67. if 1 <= len(right) <= 2 and right.isdigit() and left.isdigit():
  68. candidate = left + "." + right
  69. if _safe_decimal(candidate):
  70. return sign + candidate
  71. # 没有需要纠错的典型形态,直接返回原 token
  72. return token
  73. def normalize_financial_numbers(text: str) -> str:
  74. """
  75. 标准化财务数字:将全角字符转换为半角字符,并纠正常见的逗号/小数点错用。
  76. """
  77. if not text:
  78. return text
  79. # 定义全角到半角的映射
  80. fullwidth_to_halfwidth = {
  81. '0': '0', '1': '1', '2': '2', '3': '3', '4': '4',
  82. '5': '5', '6': '6', '7': '7', '8': '8', '9': '9',
  83. ',': ',', # 全角逗号转半角逗号
  84. '。': '.', # 全角句号转半角句号
  85. '.': '.', # 全角句点转半角句点
  86. ':': ':', # 全角冒号转半角冒号
  87. ';': ';', # 全角分号转半角分号
  88. '(': '(', # 全角左括号转半角左括号
  89. ')': ')', # 全角右括号转半角右括号
  90. '-': '-', # 全角减号转半角减号
  91. '+': '+', # 全角加号转半角加号
  92. '%': '%', # 全角百分号转半角百分号
  93. }
  94. # 第一步:执行基础字符替换(全角 -> 半角)
  95. normalized_text = text
  96. for fullwidth, halfwidth in fullwidth_to_halfwidth.items():
  97. normalized_text = normalized_text.replace(fullwidth, halfwidth)
  98. # 第二步:处理数字序列中的空格和分隔符(保留原有逻辑)
  99. number_sequence_pattern = r'(\d+(?:\s*[,,]\s*\d+)*(?:\s*[。..]\s*\d+)?)'
  100. def normalize_number_sequence(match):
  101. sequence = match.group(1)
  102. sequence = re.sub(r'(\d)\s*[,,]\s*(\d)', r'\1,\2', sequence)
  103. sequence = re.sub(r'(\d)\s*[。..]\s*(\d)', r'\1.\2', sequence)
  104. return sequence
  105. normalized_text = re.sub(number_sequence_pattern, normalize_number_sequence, normalized_text)
  106. # 第三步:对疑似金额 token 做逗号/小数点纠错
  107. amount_pattern = r'(?P<tok>[+-]?\d[\d,\.]*\d)'
  108. def _amount_sub(m: re.Match) -> str:
  109. tok = m.group('tok')
  110. return _normalize_amount_token(tok)
  111. normalized_text = re.sub(amount_pattern, _amount_sub, normalized_text)
  112. return normalized_text
  113. def normalize_markdown_table(markdown_content: str) -> str:
  114. """
  115. 专门处理Markdown表格中的数字标准化
  116. 注意:保留原始markdown中的换行符,只替换表格内的文本内容
  117. Args:
  118. markdown_content: Markdown内容
  119. Returns:
  120. 标准化后的Markdown内容
  121. """
  122. # 使用BeautifulSoup处理HTML表格
  123. from bs4 import BeautifulSoup, Tag
  124. import re
  125. # 使用正则表达式找到所有表格的位置,并保留其前后的内容
  126. # 匹配完整的HTML表格标签(包括嵌套)
  127. table_pattern = r'(<table[^>]*>.*?</table>)'
  128. def normalize_table_match(match):
  129. """处理单个表格匹配,保留原始格式,并追加数字标准化说明注释。"""
  130. table_html = match.group(1)
  131. original_table_html = table_html # 保存原始HTML用于比较
  132. # 解析表格HTML
  133. soup = BeautifulSoup(table_html, 'html.parser')
  134. tables = soup.find_all('table')
  135. # 记录本表格中所有数值修改
  136. changes: list[dict] = []
  137. for table in tables:
  138. if not isinstance(table, Tag):
  139. continue
  140. # 通过 tr / td(th) 计算行列位置
  141. for row_idx, tr in enumerate(table.find_all('tr')): # type: ignore[reportAttributeAccessIssue]
  142. cells = tr.find_all(['td', 'th']) # type: ignore[reportAttributeAccessIssue]
  143. for col_idx, cell in enumerate(cells):
  144. if not isinstance(cell, Tag):
  145. continue
  146. # 与 normalize_json_table 一致:整格取文本、只标准化一次、再写回
  147. original_text = cell.get_text()
  148. normalized_text = normalize_financial_numbers(original_text)
  149. if original_text == normalized_text:
  150. continue
  151. # 记录一条修改
  152. changes.append(
  153. {
  154. "row": row_idx,
  155. "col": col_idx,
  156. "old": original_text,
  157. "new": normalized_text,
  158. }
  159. )
  160. # 整格替换为标准化后的文本(与 normalize_json_table 的 cell.string = normalized_text 一致)
  161. cell.string = normalized_text
  162. # 如果没有任何数值修改,直接返回原始 HTML
  163. if not changes:
  164. return original_table_html
  165. # 获取修改后的HTML
  166. modified_html = str(soup)
  167. # 在表格后追加注释,说明哪些单元格被修改
  168. lines = ["<!-- 数字标准化说明:"]
  169. for ch in changes:
  170. lines.append(
  171. f" - [row={ch['row']},col={ch['col']}] {ch['old']} -> {ch['new']}"
  172. )
  173. lines.append("-->")
  174. comment = "\n".join(lines)
  175. return modified_html + "\n\n" + comment
  176. # 使用正则替换,只替换表格内容,保留其他部分(包括换行符)不变
  177. normalized_content = re.sub(table_pattern, normalize_table_match, markdown_content, flags=re.DOTALL)
  178. return normalized_content
  179. def normalize_json_table(
  180. json_content: str,
  181. *,
  182. table_type_key: str = "category",
  183. table_type_value: str = "Table",
  184. html_key: str = "text",
  185. cells_key: str | None = None,
  186. ) -> str:
  187. """
  188. 专门处理JSON格式OCR结果中表格的数字标准化。
  189. 通过参数指定提取用的 key,以兼容不同 OCR 工具的 JSON 结构。
  190. Args:
  191. json_content: JSON格式的OCR结果内容(字符串或已解析的 list)
  192. table_type_key: 用于判断“是否为表格”的字段名,如 "type" 或 "category"
  193. table_type_value: 上述字段等于该值时视为表格,如 "table" 或 "Table"
  194. html_key: 存放表格 HTML 的字段名,如 "table_body" 或 "text"
  195. cells_key: 存放单元格列表的字段名,如 "table_cells";为 None 则不处理 cells,
  196. 仅标准化 html_key 中的表格
  197. Returns:
  198. 标准化后的JSON内容(字符串)
  199. 常见格式示例:
  200. - 旧格式: category="Table", html 在 "text"
  201. normalize_json_table(s) # 默认即此
  202. - mineru_vllm_results_cell_bbox: type="table", html 在 "table_body", cells 在 "table_cells"
  203. normalize_json_table(s, table_type_key="type", table_type_value="table",
  204. html_key="table_body", cells_key="table_cells")
  205. """
  206. import json
  207. from ast import literal_eval
  208. try:
  209. data = json.loads(json_content) if isinstance(json_content, str) else json_content
  210. if not isinstance(data, list):
  211. return json_content
  212. for item in data:
  213. if not isinstance(item, dict):
  214. continue
  215. # 按参数判断是否为表格项,且包含 HTML
  216. if item.get(table_type_key) != table_type_value or html_key not in item:
  217. continue
  218. table_html = item[html_key]
  219. if not table_html or not isinstance(table_html, str):
  220. continue
  221. from bs4 import BeautifulSoup, Tag
  222. soup = BeautifulSoup(table_html, "html.parser")
  223. tables = soup.find_all("table")
  224. table_changes: list[dict] = []
  225. for table in tables:
  226. if not isinstance(table, Tag):
  227. continue
  228. for row_idx, tr in enumerate(table.find_all("tr")): # type: ignore[reportAttributeAccessIssue]
  229. cells_tag = tr.find_all(["td", "th"]) # type: ignore[reportAttributeAccessIssue]
  230. for col_idx, cell in enumerate(cells_tag):
  231. if not isinstance(cell, Tag):
  232. continue
  233. original_text = cell.get_text()
  234. normalized_text = normalize_financial_numbers(original_text)
  235. if original_text == normalized_text:
  236. continue
  237. change: dict[str, object] = {
  238. "row": row_idx,
  239. "col": col_idx,
  240. "old": original_text,
  241. "new": normalized_text,
  242. }
  243. bbox_attr = cell.get("data-bbox")
  244. if isinstance(bbox_attr, str):
  245. try:
  246. change["bbox"] = literal_eval(bbox_attr)
  247. except Exception:
  248. change["bbox"] = bbox_attr
  249. table_changes.append(change)
  250. cell.string = normalized_text
  251. # 写回 HTML
  252. item[html_key] = str(soup)
  253. if table_changes:
  254. item["number_normalization_changes"] = table_changes
  255. # 若指定了 cells_key,同时标准化 cells 中每格的 text(及 matched_text)
  256. # for key in ("text", "matched_text"):
  257. table_cell_text_keys = ["text"]
  258. if cells_key and cells_key in item and isinstance(item[cells_key], list):
  259. for cell in item[cells_key]:
  260. if not isinstance(cell, dict):
  261. continue
  262. for key in table_cell_text_keys:
  263. if key not in cell or not isinstance(cell[key], str):
  264. continue
  265. orig = cell[key]
  266. norm = normalize_financial_numbers(orig)
  267. if norm != orig:
  268. cell[key] = norm
  269. return json.dumps(data, ensure_ascii=False, indent=2)
  270. except json.JSONDecodeError as e:
  271. print(f"⚠️ JSON解析失败: {e}")
  272. return json_content
  273. except Exception as e:
  274. print(f"⚠️ JSON表格标准化失败: {e}")
  275. return json_content
  276. def normalize_json_file(
  277. file_path: str,
  278. output_path: str | None = None,
  279. *,
  280. table_type_key: str = "category",
  281. table_type_value: str = "Table",
  282. html_key: str = "text",
  283. cells_key: str | None = None,
  284. ) -> str:
  285. """
  286. 标准化JSON文件中的表格数字。
  287. 提取表格时使用的 key 可通过参数指定,以兼容不同 OCR 工具。
  288. Args:
  289. file_path: 输入JSON文件路径
  290. output_path: 输出文件路径,如果为None则覆盖原文件
  291. table_type_key: 判断表格的字段名(见 normalize_json_table)
  292. table_type_value: 判断表格的字段值
  293. html_key: 表格 HTML 所在字段名
  294. cells_key: 单元格列表所在字段名,None 表示不处理 cells
  295. Returns:
  296. 标准化后的JSON内容
  297. """
  298. input_file = Path(file_path)
  299. output_file = Path(output_path) if output_path else input_file
  300. if not input_file.exists():
  301. raise FileNotFoundError(f"找不到文件: {file_path}")
  302. with open(input_file, "r", encoding="utf-8") as f:
  303. original_content = f.read()
  304. print(f"🔧 正在标准化JSON文件: {input_file.name}")
  305. normalized_content = normalize_json_table(
  306. original_content,
  307. table_type_key=table_type_key,
  308. table_type_value=table_type_value,
  309. html_key=html_key,
  310. cells_key=cells_key,
  311. )
  312. # 保存标准化后的文件
  313. with open(output_file, 'w', encoding='utf-8') as f:
  314. f.write(normalized_content)
  315. # 统计变化
  316. changes = sum(1 for o, n in zip(original_content, normalized_content) if o != n)
  317. if changes > 0:
  318. print(f"✅ 标准化了 {changes} 个字符")
  319. # 如果输出路径不同,也保存原始版本
  320. if output_path and output_path != file_path:
  321. original_backup = Path(output_path).parent / f"{Path(output_path).stem}_original.json"
  322. with open(original_backup, 'w', encoding='utf-8') as f:
  323. f.write(original_content)
  324. print(f"📄 原始版本已保存到: {original_backup}")
  325. else:
  326. print("ℹ️ 无需标准化(已是标准格式)")
  327. print(f"📄 标准化结果已保存到: {output_file}")
  328. return normalized_content
  329. if __name__ == "__main__":
  330. """
  331. 简单验证:构造一份“故意打乱逗号/小数点”的 JSON / Markdown 示例,
  332. 并打印标准化前后的差异。
  333. """
  334. import json
  335. print("=== JSON 示例:金额格式纠错 + 变更记录 ===")
  336. demo_json_data = [
  337. {
  338. "category": "Table",
  339. "text": (
  340. "<table><tbody>"
  341. "<tr><td data-bbox=\"[0,0,10,10]\">项目</td>"
  342. "<td data-bbox=\"[10,0,20,10]\">2023 年12 月31 日</td></tr>"
  343. # 故意打乱的数字:应为 12,123,456.00 和 1,234,567.89
  344. "<tr><td data-bbox=\"[0,10,10,20]\">测试金额A</td>"
  345. "<td data-bbox=\"[10,10,20,20]\">12.123,456,00</td></tr>"
  346. "<tr><td data-bbox=\"[0,20,10,30]\">测试金额B</td>"
  347. "<td data-bbox=\"[10,20,20,30]\">1,234,567,89</td></tr>"
  348. "<tr><td data-bbox=\"[0,20,10,40]\">测试金额C</td>"
  349. "<td data-bbox=\"[10,20,20,40]\">301,55</td></tr>"
  350. "</tbody></table>"
  351. ),
  352. }
  353. ]
  354. demo_json_str = json.dumps(demo_json_data, ensure_ascii=False, indent=2)
  355. print("原始 JSON:")
  356. print(demo_json_str)
  357. normalized_json_str = normalize_json_table(demo_json_str)
  358. print("\n标准化后 JSON:")
  359. print(normalized_json_str)
  360. print("\n=== Markdown 示例:金额格式纠错 + 注释说明 ===")
  361. demo_md = """<table><tbody>
  362. <tr><td>项目</td><td>2023 年12 月31 日</td></tr>
  363. <tr><td>测试金额A</td><td>12.123,456,00</td></tr>
  364. <tr><td>测试金额B</td><td>1,234,567,89</td></tr>
  365. <tr><td>测试金额C</td><td>301,55</td></tr>
  366. </tbody></table>
  367. """
  368. print("原始 Markdown:")
  369. print(demo_md)
  370. normalized_md = normalize_markdown_table(demo_md)
  371. print("\n标准化后 Markdown:")
  372. print(normalized_md)