token_to_middle_json.py 8.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139
  1. import re
  2. from mineru.utils.cut_image import cut_image_and_table
  3. from mineru.utils.enum_class import BlockType, ContentType
  4. from mineru.utils.hash_utils import str_md5
  5. from mineru.utils.vlm_magic_model import fix_two_layer_blocks
  6. from mineru.version import __version__
  7. def token_to_page_info(token, image_dict, page, image_writer, page_index) -> dict:
  8. """将token转换为页面信息"""
  9. # 解析token,提取坐标和类型
  10. # 假设token格式为:<|box_start|>x0 y0 x1 y1<|box_end|><|ref_start|>type<|ref_end|><|md_start|>content<|md_end|>
  11. # 这里需要根据实际的token格式进行解析
  12. # 提取所有完整块,每个块从<|box_start|>开始到<|md_end|>或<|im_end|>结束
  13. scale = image_dict["scale"]
  14. page_pil_img = image_dict["img_pil"]
  15. page_img_md5 = str_md5(image_dict["img_base64"])
  16. width, height = map(int, page.get_size())
  17. # 使用正则表达式查找所有块
  18. pattern = (
  19. r"<\|box_start\|>(.*?)<\|box_end\|><\|ref_start\|>(.*?)<\|ref_end\|><\|md_start\|>(.*?)(?:<\|md_end\|>|<\|im_end\|>)"
  20. )
  21. block_infos = re.findall(pattern, token, re.DOTALL)
  22. blocks = []
  23. # 解析每个块
  24. for index, block_info in enumerate(block_infos):
  25. block_bbox = block_info[0].strip()
  26. x1, y1, x2, y2 = map(int, block_bbox.split())
  27. x_1, y_1, x_2, y_2 = (
  28. int(x1 * width / 1000),
  29. int(y1 * height / 1000),
  30. int(x2 * width / 1000),
  31. int(y2 * height / 1000),
  32. )
  33. if x_2 < x_1:
  34. x_1, x_2 = x_2, x_1
  35. if y_2 < y_1:
  36. y_1, y_2 = y_2, y_1
  37. block_bbox = (x_1, y_1, x_2, y_2)
  38. block_type = block_info[1].strip()
  39. block_content = block_info[2].strip()
  40. # print(f"坐标: {block_bbox}")
  41. # print(f"类型: {block_type}")
  42. # print(f"内容: {block_content}")
  43. # print("-" * 50)
  44. span_type = "unknown"
  45. if block_type in [
  46. "text",
  47. "title",
  48. "image_caption",
  49. "image_footnote",
  50. "table_caption",
  51. "table_footnote",
  52. "list",
  53. "index",
  54. ]:
  55. span_type = ContentType.TEXT
  56. elif block_type in ["image"]:
  57. block_type = BlockType.IMAGE_BODY
  58. span_type = ContentType.IMAGE
  59. elif block_type in ["table"]:
  60. block_type = BlockType.TABLE_BODY
  61. span_type = ContentType.TABLE
  62. elif block_type in ["equation"]:
  63. block_type = BlockType.INTERLINE_EQUATION
  64. span_type = ContentType.INTERLINE_EQUATION
  65. if span_type in ["image", "table"]:
  66. span = {
  67. "bbox": block_bbox,
  68. "type": span_type,
  69. }
  70. if span_type == ContentType.TABLE:
  71. span["html"] = block_content
  72. span = cut_image_and_table(span, page_pil_img, page_img_md5, page_index, image_writer, scale=scale)
  73. else:
  74. span = {
  75. "bbox": block_bbox,
  76. "type": span_type,
  77. "content": block_content,
  78. }
  79. line = {
  80. "bbox": block_bbox,
  81. "spans": [span],
  82. }
  83. blocks.append(
  84. {
  85. "bbox": block_bbox,
  86. "type": block_type,
  87. "lines": [line],
  88. "index": index,
  89. }
  90. )
  91. image_blocks = fix_two_layer_blocks(blocks, BlockType.IMAGE)
  92. table_blocks = fix_two_layer_blocks(blocks, BlockType.TABLE)
  93. page_blocks = [
  94. block
  95. for block in blocks
  96. if block["type"] in [BlockType.TEXT, BlockType.TITLE, BlockType.LIST, BlockType.INDEX, BlockType.INTERLINE_EQUATION]
  97. ]
  98. page_blocks.extend([*image_blocks, *table_blocks])
  99. # 对page_blocks根据index的值进行排序
  100. page_blocks.sort(key=lambda x: x["index"])
  101. page_info = {"para_blocks": page_blocks, "discarded_blocks": [], "page_size": [width, height], "page_idx": page_index}
  102. return page_info
  103. def result_to_middle_json(token_list, images_list, pdf_doc, image_writer):
  104. middle_json = {"pdf_info": [], "_version_name": __version__}
  105. for index, token in enumerate(token_list):
  106. page = pdf_doc[index]
  107. image_dict = images_list[index]
  108. page_info = token_to_page_info(token, image_dict, page, image_writer, index)
  109. middle_json["pdf_info"].append(page_info)
  110. return middle_json
  111. if __name__ == "__main__":
  112. output = r"<|box_start|>088 119 472 571<|box_end|><|ref_start|>image<|ref_end|><|md_start|>![]('img_url')<|md_end|>\n<|box_start|>079 582 482 608<|box_end|><|ref_start|>image_caption<|ref_end|><|md_start|>Fig. 2. (a) Schematic of the change in the FDC over time, and (b) definition of model parameters.<|md_end|>\n<|box_start|>079 624 285 638<|box_end|><|ref_start|>title<|ref_end|><|md_start|># 2.2. Zero flow day analysis<|md_end|>\n<|box_start|>079 656 482 801<|box_end|><|ref_start|>text<|ref_end|><|md_start|>A notable feature of Fig. 1 is the increase in the number of zero flow days. A similar approach to Eq. (2), using an inverse sigmoidal function was employed to assess the impact of afforestation on the number of zero flow days per year \((N_{\mathrm{zero}})\). In this case, the left hand side of Eq. (2) is replaced by \(N_{\mathrm{zero}}\) and \(b\) and \(S\) are constrained to negative as \(N_{\mathrm{zero}}\) decreases as rainfall increases, and increases with plantation growth:<|md_end|>\n<|box_start|>076 813 368 853<|box_end|><|ref_start|>equation<|ref_end|><|md_start|>\[\nN_{\mathrm{zero}}=a+b(\Delta P)+\frac{Y}{1+\exp\left(\frac{T-T_{\mathrm{half}}}{S}\right)}\n\]<|md_end|>\n<|box_start|>079 865 482 895<|box_end|><|ref_start|>text<|ref_end|><|md_start|>For the average pre-treatment condition \(\Delta P=0\) and \(T=0\), \(N_{\mathrm{zero}}\) approximately equals \(a\). \(Y\) gives<|md_end|>\n<|box_start|>525 119 926 215<|box_end|><|ref_start|>text<|ref_end|><|md_start|>the magnitude of change in zero flow days due to afforestation, and \(S\) describes the shape of the response. For the average climate condition \(\Delta P=0\), \(a+Y\) becomes the number of zero flow days when the new equilibrium condition under afforestation is reached.<|md_end|>\n<|box_start|>525 240 704 253<|box_end|><|ref_start|>title<|ref_end|><|md_start|># 2.3. Statistical analyses<|md_end|>\n<|box_start|>525 271 926 368<|box_end|><|ref_start|>text<|ref_end|><|md_start|>The coefficient of efficiency \((E)\) (Nash and Sutcliffe, 1970; Chiew and McMahon, 1993; Legates and McCabe, 1999) was used as the 'goodness of fit' measure to evaluate the fit between observed and predicted flow deciles (2) and zero flow days (3). \(E\) is given by:<|md_end|>\n<|box_start|>520 375 735 415<|box_end|><|ref_start|>equation<|ref_end|><|md_start|>\[\nE=1.0-\frac{\sum_{i=1}^{N}(O_{i}-P_{i})^{2}}{\sum_{i=1}^{N}(O_{i}-\bar{O})^{2}}\n\]<|md_end|>\n<|box_start|>525 424 926 601<|box_end|><|ref_start|>text<|ref_end|><|md_start|>where \(O\) are observed data, \(P\) are predicted values, and \(\bar{O}\) is the mean for the entire period. \(E\) is unity minus the ratio of the mean square error to the variance in the observed data, and ranges from \(-\infty\) to 1.0. Higher values indicate greater agreement between observed and predicted data as per the coefficient of determination \((r^{2})\). \(E\) is used in preference to \(r^{2}\) in evaluating hydrologic modelling because it is a measure of the deviation from the 1:1 line. As \(E\) is always \(<r^{2}\) we have arbitrarily considered \(E>0.7\) to indicate adequate model fits.<|md_end|>\n<|box_start|>525 603 926 731<|box_end|><|ref_start|>text<|ref_end|><|md_start|>It is important to assess the significance of the model parameters to check the model assumptions that rainfall and forest age are driving changes in the FDC. The model (2) was split into simplified forms, where only the rainfall or time terms were included by setting \(b=0\), as shown in Eq. (5), or \(Y=0\) as shown in Eq. (6). The component models (5) and (6) were then tested against the complete model, (2).<|md_end|>\n<|box_start|>520 739 735 778<|box_end|><|ref_start|>equation<|ref_end|><|md_start|>\[\nQ_{\%}=a+\frac{Y}{1+\exp\left(\frac{T-T_{\mathrm{half}}^{\prime}}{S}\right)}\n\]<|md_end|>\n<|box_start|>525 787 553 799<|box_end|><|ref_start|>text<|ref_end|><|md_start|>and<|md_end|>\n<|box_start|>520 807 646 825<|box_end|><|ref_start|>equation<|ref_end|><|md_start|>\[\nQ_{\%}=a+b\Delta P\n\]<|md_end|>\n<|box_start|>525 833 926 895<|box_end|><|ref_start|>text<|ref_end|><|md_start|>For both the flow duration curve analysis and zero flow days analysis, a \(t\)-test was then performed to test whether (5) and (6) were significantly different to (2). A critical value of \(t\) exceeding the calculated \(t\)-value<|md_end|><|im_end|>"
  113. p_info = token_to_page_info(output)
  114. # 将blocks 转换为json文本
  115. import json
  116. json_str = json.dumps(p_info, ensure_ascii=False, indent=4)
  117. print(json_str)