# calc_span_stats.py (9.9 KB)

  1. import os
  2. import csv
  3. import json
  4. import pandas as pd
  5. from pandas import DataFrame as df
  6. from matplotlib import pyplot as plt
  7. from termcolor import cprint
"""
How to run this script:

1. Make sure there are pdf_dic.json files under the directory code-clean/tmp/unittest/md/, for example:
   code-clean/tmp/unittest/md/scihub/scihub_00500000/libgen.scimag00527000-00527999.zip_10.1002/app.25178/pdf_dic.json
2. From the code-clean directory, run the following command:
   $ python -m libs.calc_span_stats
"""
  15. def print_green_on_red(text):
  16. cprint(text, "green", "on_red", attrs=["bold"], end="\n\n")
  17. def print_green(text):
  18. print()
  19. cprint(text, "green", attrs=["bold"], end="\n\n")
  20. def print_red(text):
  21. print()
  22. cprint(text, "red", attrs=["bold"], end="\n\n")
  23. def safe_get(dict_obj, key, default):
  24. val = dict_obj.get(key)
  25. if val is None:
  26. return default
  27. else:
  28. return val
  29. class SpanStatsCalc:
  30. """Calculate statistics of span."""
  31. def draw_charts(self, span_stats: pd.DataFrame, fig_num: int, save_path: str):
  32. """Draw multiple figures in one figure."""
  33. # make a canvas
  34. fig = plt.figure(fig_num, figsize=(20, 20))
  35. pass
  36. def calc_stats_per_dict(self, pdf_dict) -> pd.DataFrame:
  37. """Calculate statistics per pdf_dict."""
  38. span_stats = pd.DataFrame()
  39. span_stats = []
  40. span_id = 0
  41. for page_id, blocks in pdf_dict.items():
  42. if page_id.startswith("page_"):
  43. if "para_blocks" in blocks.keys():
  44. for para_block in blocks["para_blocks"]:
  45. for line in para_block["lines"]:
  46. for span in line["spans"]:
  47. span_text = safe_get(span, "text", "")
  48. span_font_name = safe_get(span, "font", "")
  49. span_font_size = safe_get(span, "size", 0)
  50. span_font_color = safe_get(span, "color", "")
  51. span_font_flags = safe_get(span, "flags", 0)
  52. span_font_flags_decoded = safe_get(span, "decomposed_flags", {})
  53. span_is_super_script = safe_get(span_font_flags_decoded, "is_superscript", False)
  54. span_is_italic = safe_get(span_font_flags_decoded, "is_italic", False)
  55. span_is_serifed = safe_get(span_font_flags_decoded, "is_serifed", False)
  56. span_is_sans_serifed = safe_get(span_font_flags_decoded, "is_sans_serifed", False)
  57. span_is_monospaced = safe_get(span_font_flags_decoded, "is_monospaced", False)
  58. span_is_proportional = safe_get(span_font_flags_decoded, "is_proportional", False)
  59. span_is_bold = safe_get(span_font_flags_decoded, "is_bold", False)
  60. span_stats.append(
  61. {
  62. "span_id": span_id, # id of span
  63. "page_id": page_id, # page number of pdf
  64. "span_text": span_text, # text of span
  65. "span_font_name": span_font_name, # font name of span
  66. "span_font_size": span_font_size, # font size of span
  67. "span_font_color": span_font_color, # font color of span
  68. "span_font_flags": span_font_flags, # font flags of span
  69. "span_is_superscript": int(
  70. span_is_super_script
  71. ), # indicate whether the span is super script or not
  72. "span_is_italic": int(span_is_italic), # indicate whether the span is italic or not
  73. "span_is_serifed": int(span_is_serifed), # indicate whether the span is serifed or not
  74. "span_is_sans_serifed": int(
  75. span_is_sans_serifed
  76. ), # indicate whether the span is sans serifed or not
  77. "span_is_monospaced": int(
  78. span_is_monospaced
  79. ), # indicate whether the span is monospaced or not
  80. "span_is_proportional": int(
  81. span_is_proportional
  82. ), # indicate whether the span is proportional or not
  83. "span_is_bold": int(span_is_bold), # indicate whether the span is bold or not
  84. }
  85. )
  86. span_id += 1
  87. span_stats = pd.DataFrame(span_stats)
  88. # print(span_stats)
  89. return span_stats
  90. def __find_pdf_dic_files(
  91. jf_name="pdf_dic.json",
  92. base_code_name="code-clean",
  93. tgt_base_dir_name="tmp",
  94. unittest_dir_name="unittest",
  95. md_dir_name="md",
  96. book_names=[
  97. "scihub",
  98. ], # other possible values: "zlib", "arxiv" and so on
  99. ):
  100. pdf_dict_files = []
  101. curr_dir = os.path.dirname(__file__)
  102. for i in range(len(curr_dir)):
  103. if curr_dir[i : i + len(base_code_name)] == base_code_name:
  104. base_code_dir_name = curr_dir[: i + len(base_code_name)]
  105. for book_name in book_names:
  106. search_dir_relative_name = os.path.join(tgt_base_dir_name, unittest_dir_name, md_dir_name, book_name)
  107. if os.path.exists(base_code_dir_name):
  108. search_dir_name = os.path.join(base_code_dir_name, search_dir_relative_name)
  109. for root, dirs, files in os.walk(search_dir_name):
  110. for file in files:
  111. if file == jf_name:
  112. pdf_dict_files.append(os.path.join(root, file))
  113. break
  114. return pdf_dict_files
  115. def combine_span_texts(group_df, span_stats):
  116. combined_span_texts = []
  117. for _, row in group_df.iterrows():
  118. curr_span_id = row.name
  119. curr_span_text = row["span_text"]
  120. pre_span_id = curr_span_id - 1
  121. pre_span_text = span_stats.at[pre_span_id, "span_text"] if pre_span_id in span_stats.index else ""
  122. next_span_id = curr_span_id + 1
  123. next_span_text = span_stats.at[next_span_id, "span_text"] if next_span_id in span_stats.index else ""
  124. # pointer_sign is a right arrow if the span is superscript, otherwise it is a down arrow
  125. pointer_sign = "→ → → "
  126. combined_text = "\n".join([pointer_sign + pre_span_text, pointer_sign + curr_span_text, pointer_sign + next_span_text])
  127. combined_span_texts.append(combined_text)
  128. return "\n\n".join(combined_span_texts)
# pd.set_option("display.max_colwidth", None)  # set to None to display the full text of each cell
pd.set_option("display.max_rows", None)  # set to None to display more rows when printing
  131. def main():
  132. pdf_dict_files = __find_pdf_dic_files()
  133. # print(pdf_dict_files)
  134. span_stats_calc = SpanStatsCalc()
  135. for pdf_dict_file in pdf_dict_files:
  136. print("-" * 100)
  137. print_green_on_red(f"Processing {pdf_dict_file}")
  138. with open(pdf_dict_file, "r", encoding="utf-8") as f:
  139. pdf_dict = json.load(f)
  140. raw_df = span_stats_calc.calc_stats_per_dict(pdf_dict)
  141. save_path = pdf_dict_file.replace("pdf_dic.json", "span_stats_raw.csv")
  142. raw_df.to_csv(save_path, index=False)
  143. filtered_df = raw_df[raw_df["span_is_superscript"] == 1]
  144. if filtered_df.empty:
  145. print("No superscript span found!")
  146. continue
  147. filtered_grouped_df = filtered_df.groupby(["span_font_name", "span_font_size", "span_font_color"])
  148. combined_span_texts = filtered_grouped_df.apply(combine_span_texts, span_stats=raw_df) # type: ignore
  149. final_df = filtered_grouped_df.size().reset_index(name="count")
  150. final_df["span_texts"] = combined_span_texts.reset_index(level=[0, 1, 2], drop=True)
  151. print(final_df)
  152. final_df["span_texts"] = final_df["span_texts"].apply(lambda x: x.replace("\n", "\r\n"))
  153. save_path = pdf_dict_file.replace("pdf_dic.json", "span_stats_final.csv")
  154. # 使用 UTF-8 编码并添加 BOM,确保所有字段被双引号包围
  155. final_df.to_csv(save_path, index=False, encoding="utf-8-sig", quoting=csv.QUOTE_ALL)
  156. # 创建一个 2x2 的图表布局
  157. fig, axs = plt.subplots(2, 2, figsize=(15, 10))
  158. # 按照 span_font_name 分类作图
  159. final_df.groupby("span_font_name")["count"].sum().plot(kind="bar", ax=axs[0, 0], title="By Font Name")
  160. # 按照 span_font_size 分类作图
  161. final_df.groupby("span_font_size")["count"].sum().plot(kind="bar", ax=axs[0, 1], title="By Font Size")
  162. # 按照 span_font_color 分类作图
  163. final_df.groupby("span_font_color")["count"].sum().plot(kind="bar", ax=axs[1, 0], title="By Font Color")
  164. # 按照 span_font_name、span_font_size 和 span_font_color 共同分类作图
  165. grouped = final_df.groupby(["span_font_name", "span_font_size", "span_font_color"])
  166. grouped["count"].sum().unstack().plot(kind="bar", ax=axs[1, 1], title="Combined Grouping")
  167. # 调整布局
  168. plt.tight_layout()
  169. # 显示图表
  170. # plt.show()
  171. # 保存图表到 PNG 文件
  172. save_path = pdf_dict_file.replace("pdf_dic.json", "span_stats_combined.png")
  173. plt.savefig(save_path)
  174. # 清除画布
  175. plt.clf()
# Run the analysis only when executed as a script (e.g. `python -m libs.calc_span_stats`), not on import.
if __name__ == "__main__":
    main()