stats.py 9.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268
  1. from collections import Counter
  2. import numpy as np
  3. from magic_pdf.para.commons import *
  4. if sys.version_info[0] >= 3:
  5. sys.stdout.reconfigure(encoding="utf-8") # type: ignore
  6. class BlockStatisticsCalculator:
  7. def __init__(self) -> None:
  8. pass
  9. def __calc_stats_of_new_lines(self, new_lines):
  10. """
  11. This function calculates the paragraph metrics
  12. Parameters
  13. ----------
  14. combined_lines : list
  15. combined lines
  16. Returns
  17. -------
  18. X0 : float
  19. Median of x0 values, which represents the left average boundary of the block
  20. X1 : float
  21. Median of x1 values, which represents the right average boundary of the block
  22. avg_char_width : float
  23. Average of char widths, which represents the average char width of the block
  24. avg_char_height : float
  25. Average of line heights, which represents the average line height of the block
  26. """
  27. x0_values = []
  28. x1_values = []
  29. char_widths = []
  30. char_heights = []
  31. block_font_types = []
  32. block_font_sizes = []
  33. block_directions = []
  34. if len(new_lines) > 0:
  35. for i, line in enumerate(new_lines):
  36. line_bbox = line["bbox"]
  37. line_text = line["text"]
  38. line_spans = line["spans"]
  39. num_chars = len([ch for ch in line_text if not ch.isspace()])
  40. x0_values.append(line_bbox[0])
  41. x1_values.append(line_bbox[2])
  42. if num_chars > 0:
  43. char_width = (line_bbox[2] - line_bbox[0]) / num_chars
  44. char_widths.append(char_width)
  45. for span in line_spans:
  46. block_font_types.append(span["font"])
  47. block_font_sizes.append(span["size"])
  48. if "dir" in line:
  49. block_directions.append(line["dir"])
  50. # line_font_types = [span["font"] for span in line_spans]
  51. char_heights = [span["size"] for span in line_spans]
  52. X0 = np.median(x0_values) if x0_values else 0
  53. X1 = np.median(x1_values) if x1_values else 0
  54. avg_char_width = sum(char_widths) / len(char_widths) if char_widths else 0
  55. avg_char_height = sum(char_heights) / len(char_heights) if char_heights else 0
  56. # max_freq_font_type = max(set(block_font_types), key=block_font_types.count) if block_font_types else None
  57. max_span_length = 0
  58. max_span_font_type = None
  59. for line in new_lines:
  60. line_spans = line["spans"]
  61. for span in line_spans:
  62. span_length = span["bbox"][2] - span["bbox"][0]
  63. if span_length > max_span_length:
  64. max_span_length = span_length
  65. max_span_font_type = span["font"]
  66. max_freq_font_type = max_span_font_type
  67. avg_font_size = sum(block_font_sizes) / len(block_font_sizes) if block_font_sizes else None
  68. avg_dir_horizontal = sum([dir[0] for dir in block_directions]) / len(block_directions) if block_directions else 0
  69. avg_dir_vertical = sum([dir[1] for dir in block_directions]) / len(block_directions) if block_directions else 0
  70. median_font_size = float(np.median(block_font_sizes)) if block_font_sizes else None
  71. return (
  72. X0,
  73. X1,
  74. avg_char_width,
  75. avg_char_height,
  76. max_freq_font_type,
  77. avg_font_size,
  78. (avg_dir_horizontal, avg_dir_vertical),
  79. median_font_size,
  80. )
  81. def __make_new_block(self, input_block):
  82. new_block = {}
  83. raw_lines = input_block["lines"]
  84. stats = self.__calc_stats_of_new_lines(raw_lines)
  85. block_id = input_block["block_id"]
  86. block_bbox = input_block["bbox"]
  87. block_text = input_block["text"]
  88. block_lines = raw_lines
  89. block_avg_left_boundary = stats[0]
  90. block_avg_right_boundary = stats[1]
  91. block_avg_char_width = stats[2]
  92. block_avg_char_height = stats[3]
  93. block_font_type = stats[4]
  94. block_font_size = stats[5]
  95. block_direction = stats[6]
  96. block_median_font_size = stats[7]
  97. new_block["block_id"] = block_id
  98. new_block["bbox"] = block_bbox
  99. new_block["text"] = block_text
  100. new_block["dir"] = block_direction
  101. new_block["X0"] = block_avg_left_boundary
  102. new_block["X1"] = block_avg_right_boundary
  103. new_block["avg_char_width"] = block_avg_char_width
  104. new_block["avg_char_height"] = block_avg_char_height
  105. new_block["block_font_type"] = block_font_type
  106. new_block["block_font_size"] = block_font_size
  107. new_block["lines"] = block_lines
  108. new_block["median_font_size"] = block_median_font_size
  109. return new_block
  110. def batch_process_blocks(self, pdf_dic):
  111. """
  112. This function processes the blocks in batch.
  113. Parameters
  114. ----------
  115. self : object
  116. The instance of the class.
  117. ----------
  118. blocks : list
  119. Input block is a list of raw blocks. Schema can refer to the value of key ""preproc_blocks", demo file is app/pdf_toolbox/tests/preproc_2_parasplit_example.json
  120. Returns
  121. -------
  122. result_dict : dict
  123. result dictionary
  124. """
  125. for page_id, blocks in pdf_dic.items():
  126. if page_id.startswith("page_"):
  127. para_blocks = []
  128. if "para_blocks" in blocks.keys():
  129. input_blocks = blocks["para_blocks"]
  130. for input_block in input_blocks:
  131. new_block = self.__make_new_block(input_block)
  132. para_blocks.append(new_block)
  133. blocks["para_blocks"] = para_blocks
  134. return pdf_dic
  135. class DocStatisticsCalculator:
  136. def __init__(self) -> None:
  137. pass
  138. def calc_stats_of_doc(self, pdf_dict):
  139. """
  140. This function computes the statistics of the document
  141. Parameters
  142. ----------
  143. result_dict : dict
  144. result dictionary
  145. Returns
  146. -------
  147. statistics : dict
  148. statistics of the document
  149. """
  150. total_text_length = 0
  151. total_num_blocks = 0
  152. for page_id, blocks in pdf_dict.items():
  153. if page_id.startswith("page_"):
  154. if "para_blocks" in blocks.keys():
  155. para_blocks = blocks["para_blocks"]
  156. for para_block in para_blocks:
  157. total_text_length += len(para_block["text"])
  158. total_num_blocks += 1
  159. avg_text_length = total_text_length / total_num_blocks if total_num_blocks else 0
  160. font_list = []
  161. for page_id, blocks in pdf_dict.items():
  162. if page_id.startswith("page_"):
  163. if "para_blocks" in blocks.keys():
  164. input_blocks = blocks["para_blocks"]
  165. for input_block in input_blocks:
  166. block_text_length = len(input_block.get("text", ""))
  167. if block_text_length < avg_text_length * 0.5:
  168. continue
  169. block_font_type = safe_get(input_block, "block_font_type", "")
  170. block_font_size = safe_get(input_block, "block_font_size", 0)
  171. font_list.append((block_font_type, block_font_size))
  172. font_counter = Counter(font_list)
  173. most_common_font = font_counter.most_common(1)[0] if font_list else (("", 0), 0)
  174. second_most_common_font = font_counter.most_common(2)[1] if len(font_counter) > 1 else (("", 0), 0)
  175. statistics = {
  176. "num_pages": 0,
  177. "num_blocks": 0,
  178. "num_paras": 0,
  179. "num_titles": 0,
  180. "num_header_blocks": 0,
  181. "num_footer_blocks": 0,
  182. "num_watermark_blocks": 0,
  183. "num_vertical_margin_note_blocks": 0,
  184. "most_common_font_type": most_common_font[0][0],
  185. "most_common_font_size": most_common_font[0][1],
  186. "number_of_most_common_font": most_common_font[1],
  187. "second_most_common_font_type": second_most_common_font[0][0],
  188. "second_most_common_font_size": second_most_common_font[0][1],
  189. "number_of_second_most_common_font": second_most_common_font[1],
  190. "avg_text_length": avg_text_length,
  191. }
  192. for page_id, blocks in pdf_dict.items():
  193. if page_id.startswith("page_"):
  194. blocks = pdf_dict[page_id]["para_blocks"]
  195. statistics["num_pages"] += 1
  196. for block_id, block_data in enumerate(blocks):
  197. statistics["num_blocks"] += 1
  198. if "paras" in block_data.keys():
  199. statistics["num_paras"] += len(block_data["paras"])
  200. for line in block_data["lines"]:
  201. if line.get("is_title", 0):
  202. statistics["num_titles"] += 1
  203. if block_data.get("is_header", 0):
  204. statistics["num_header_blocks"] += 1
  205. if block_data.get("is_footer", 0):
  206. statistics["num_footer_blocks"] += 1
  207. if block_data.get("is_watermark", 0):
  208. statistics["num_watermark_blocks"] += 1
  209. if block_data.get("is_vertical_margin_note", 0):
  210. statistics["num_vertical_margin_note_blocks"] += 1
  211. pdf_dict["statistics"] = statistics
  212. return pdf_dict