para_pipeline.py 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297
  1. import os
  2. import json
  3. from magic_pdf.para.commons import *
  4. from magic_pdf.para.raw_processor import RawBlockProcessor
  5. from magic_pdf.para.layout_match_processor import LayoutFilterProcessor
  6. from magic_pdf.para.stats import BlockStatisticsCalculator
  7. from magic_pdf.para.stats import DocStatisticsCalculator
  8. from magic_pdf.para.title_processor import TitleProcessor
  9. from magic_pdf.para.block_termination_processor import BlockTerminationProcessor
  10. from magic_pdf.para.block_continuation_processor import BlockContinuationProcessor
  11. from magic_pdf.para.draw import DrawAnnos
  12. from magic_pdf.para.exceptions import (
  13. DenseSingleLineBlockException,
  14. TitleDetectionException,
  15. TitleLevelException,
  16. ParaSplitException,
  17. ParaMergeException,
  18. DiscardByException,
  19. )
  20. if sys.version_info[0] >= 3:
  21. sys.stdout.reconfigure(encoding="utf-8") # type: ignore
  22. class ParaProcessPipeline:
  23. def __init__(self) -> None:
  24. pass
  25. def para_process_pipeline(self, pdf_info_dict, para_debug_mode=None, input_pdf_path=None, output_pdf_path=None):
  26. """
  27. This function processes the paragraphs, including:
  28. 1. Read raw input json file into pdf_dic
  29. 2. Detect and replace equations
  30. 3. Combine spans into a natural line
  31. 4. Check if the paragraphs are inside bboxes passed from "layout_bboxes" key
  32. 5. Compute statistics for each block
  33. 6. Detect titles in the document
  34. 7. Detect paragraphs inside each block
  35. 8. Divide the level of the titles
  36. 9. Detect and combine paragraphs from different blocks into one paragraph
  37. 10. Check whether the final results after checking headings, dividing paragraphs within blocks, and merging paragraphs between blocks are plausible and reasonable.
  38. 11. Draw annotations on the pdf file
  39. Parameters
  40. ----------
  41. pdf_dic_json_fpath : str
  42. path to the pdf dictionary json file.
  43. Notice: data noises, including overlap blocks, header, footer, watermark, vertical margin note have been removed already.
  44. input_pdf_doc : str
  45. path to the input pdf file
  46. output_pdf_path : str
  47. path to the output pdf file
  48. Returns
  49. -------
  50. pdf_dict : dict
  51. result dictionary
  52. """
  53. error_info = None
  54. output_json_file = ""
  55. output_dir = ""
  56. if input_pdf_path is not None:
  57. input_pdf_path = os.path.abspath(input_pdf_path)
  58. # print_green_on_red(f">>>>>>>>>>>>>>>>>>> Process the paragraphs of {input_pdf_path}")
  59. if output_pdf_path is not None:
  60. output_dir = os.path.dirname(output_pdf_path)
  61. output_json_file = f"{output_dir}/pdf_dic.json"
  62. def __save_pdf_dic(pdf_dic, output_pdf_path, stage="0", para_debug_mode=para_debug_mode):
  63. """
  64. Save the pdf_dic to a json file
  65. """
  66. output_pdf_file_name = os.path.basename(output_pdf_path)
  67. # output_dir = os.path.dirname(output_pdf_path)
  68. output_dir = "\\tmp\\pdf_parse"
  69. output_pdf_file_name = output_pdf_file_name.replace(".pdf", f"_stage_{stage}.json")
  70. pdf_dic_json_fpath = os.path.join(output_dir, output_pdf_file_name)
  71. if not os.path.exists(output_dir):
  72. os.makedirs(output_dir)
  73. if para_debug_mode == "full":
  74. with open(pdf_dic_json_fpath, "w", encoding="utf-8") as f:
  75. json.dump(pdf_dic, f, indent=2, ensure_ascii=False)
  76. # Validate the output already exists
  77. if not os.path.exists(pdf_dic_json_fpath):
  78. print_red(f"Failed to save the pdf_dic to {pdf_dic_json_fpath}")
  79. return None
  80. else:
  81. print_green(f"Succeed to save the pdf_dic to {pdf_dic_json_fpath}")
  82. return pdf_dic_json_fpath
  83. """
  84. Preprocess the lines of block
  85. """
  86. # Find and replace the interline and inline equations, should be better done before the paragraph processing
  87. # Create "para_blocks" for each page.
  88. # equationProcessor = EquationsProcessor()
  89. # pdf_dic = equationProcessor.batch_process_blocks(pdf_info_dict)
  90. # Combine spans into a natural line
  91. rawBlockProcessor = RawBlockProcessor()
  92. pdf_dic = rawBlockProcessor.batch_process_blocks(pdf_info_dict)
  93. # print(f"pdf_dic['page_0']['para_blocks'][0]: {pdf_dic['page_0']['para_blocks'][0]}", end="\n\n")
  94. # Check if the paragraphs are inside bboxes passed from "layout_bboxes" key
  95. layoutFilter = LayoutFilterProcessor()
  96. pdf_dic = layoutFilter.batch_process_blocks(pdf_dic)
  97. # Compute statistics for each block
  98. blockStatisticsCalculator = BlockStatisticsCalculator()
  99. pdf_dic = blockStatisticsCalculator.batch_process_blocks(pdf_dic)
  100. # print(f"pdf_dic['page_0']['para_blocks'][0]: {pdf_dic['page_0']['para_blocks'][0]}", end="\n\n")
  101. # Compute statistics for all blocks(namely this pdf document)
  102. docStatisticsCalculator = DocStatisticsCalculator()
  103. pdf_dic = docStatisticsCalculator.calc_stats_of_doc(pdf_dic)
  104. # print(f"pdf_dic['statistics']: {pdf_dic['statistics']}", end="\n\n")
  105. # Dump the first three stages of pdf_dic to a json file
  106. if para_debug_mode == "full":
  107. pdf_dic_json_fpath = __save_pdf_dic(pdf_dic, output_pdf_path, stage="0", para_debug_mode=para_debug_mode)
  108. """
  109. Detect titles in the document
  110. """
  111. doc_statistics = pdf_dic["statistics"]
  112. titleProcessor = TitleProcessor(doc_statistics)
  113. pdf_dic = titleProcessor.batch_process_blocks_detect_titles(pdf_dic)
  114. if para_debug_mode == "full":
  115. pdf_dic_json_fpath = __save_pdf_dic(pdf_dic, output_pdf_path, stage="1", para_debug_mode=para_debug_mode)
  116. """
  117. Detect and divide the level of the titles
  118. """
  119. titleProcessor = TitleProcessor()
  120. pdf_dic = titleProcessor.batch_process_blocks_recog_title_level(pdf_dic)
  121. if para_debug_mode == "full":
  122. pdf_dic_json_fpath = __save_pdf_dic(pdf_dic, output_pdf_path, stage="2", para_debug_mode=para_debug_mode)
  123. """
  124. Detect and split paragraphs inside each block
  125. """
  126. blockInnerParasProcessor = BlockTerminationProcessor()
  127. pdf_dic = blockInnerParasProcessor.batch_process_blocks(pdf_dic)
  128. if para_debug_mode == "full":
  129. pdf_dic_json_fpath = __save_pdf_dic(pdf_dic, output_pdf_path, stage="3", para_debug_mode=para_debug_mode)
  130. # pdf_dic_json_fpath = __save_pdf_dic(pdf_dic, output_pdf_path, stage="3", para_debug_mode="full")
  131. # print_green(f"pdf_dic_json_fpath: {pdf_dic_json_fpath}")
  132. """
  133. Detect and combine paragraphs from different blocks into one paragraph
  134. """
  135. blockContinuationProcessor = BlockContinuationProcessor()
  136. pdf_dic = blockContinuationProcessor.batch_tag_paras(pdf_dic)
  137. pdf_dic = blockContinuationProcessor.batch_merge_paras(pdf_dic)
  138. if para_debug_mode == "full":
  139. pdf_dic_json_fpath = __save_pdf_dic(pdf_dic, output_pdf_path, stage="4", para_debug_mode=para_debug_mode)
  140. # pdf_dic_json_fpath = __save_pdf_dic(pdf_dic, output_pdf_path, stage="4", para_debug_mode="full")
  141. # print_green(f"pdf_dic_json_fpath: {pdf_dic_json_fpath}")
  142. """
  143. Discard pdf files by checking exceptions and return the error info to the caller
  144. """
  145. discardByException = DiscardByException()
  146. is_discard_by_single_line_block = discardByException.discard_by_single_line_block(
  147. pdf_dic, exception=DenseSingleLineBlockException()
  148. )
  149. is_discard_by_title_detection = discardByException.discard_by_title_detection(
  150. pdf_dic, exception=TitleDetectionException()
  151. )
  152. is_discard_by_title_level = discardByException.discard_by_title_level(pdf_dic, exception=TitleLevelException())
  153. is_discard_by_split_para = discardByException.discard_by_split_para(pdf_dic, exception=ParaSplitException())
  154. is_discard_by_merge_para = discardByException.discard_by_merge_para(pdf_dic, exception=ParaMergeException())
  155. """
  156. if any(
  157. info is not None
  158. for info in [
  159. is_discard_by_single_line_block,
  160. is_discard_by_title_detection,
  161. is_discard_by_title_level,
  162. is_discard_by_split_para,
  163. is_discard_by_merge_para,
  164. ]
  165. ):
  166. error_info = next(
  167. (
  168. info
  169. for info in [
  170. is_discard_by_single_line_block,
  171. is_discard_by_title_detection,
  172. is_discard_by_title_level,
  173. is_discard_by_split_para,
  174. is_discard_by_merge_para,
  175. ]
  176. if info is not None
  177. ),
  178. None,
  179. )
  180. return pdf_dic, error_info
  181. if any(
  182. info is not None
  183. for info in [
  184. is_discard_by_single_line_block,
  185. is_discard_by_title_detection,
  186. is_discard_by_title_level,
  187. is_discard_by_split_para,
  188. is_discard_by_merge_para,
  189. ]
  190. ):
  191. error_info = next(
  192. (
  193. info
  194. for info in [
  195. is_discard_by_single_line_block,
  196. is_discard_by_title_detection,
  197. is_discard_by_title_level,
  198. is_discard_by_split_para,
  199. is_discard_by_merge_para,
  200. ]
  201. if info is not None
  202. ),
  203. None,
  204. )
  205. return pdf_dic, error_info
  206. """
  207. """
  208. Dump the final pdf_dic to a json file
  209. """
  210. if para_debug_mode is not None:
  211. with open(output_json_file, "w", encoding="utf-8") as f:
  212. json.dump(pdf_info_dict, f, ensure_ascii=False, indent=4)
  213. """
  214. Draw the annotations
  215. """
  216. if is_discard_by_single_line_block is not None:
  217. error_info = is_discard_by_single_line_block
  218. elif is_discard_by_title_detection is not None:
  219. error_info = is_discard_by_title_detection
  220. elif is_discard_by_title_level is not None:
  221. error_info = is_discard_by_title_level
  222. elif is_discard_by_split_para is not None:
  223. error_info = is_discard_by_split_para
  224. elif is_discard_by_merge_para is not None:
  225. error_info = is_discard_by_merge_para
  226. if error_info is not None:
  227. return pdf_dic, error_info
  228. """
  229. Dump the final pdf_dic to a json file
  230. """
  231. if para_debug_mode is not None:
  232. with open(output_json_file, "w", encoding="utf-8") as f:
  233. json.dump(pdf_info_dict, f, ensure_ascii=False, indent=4)
  234. """
  235. Draw the annotations
  236. """
  237. if para_debug_mode is not None:
  238. drawAnnos = DrawAnnos()
  239. drawAnnos.draw_annos(input_pdf_path, pdf_dic, output_pdf_path)
  240. """
  241. Remove the intermediate files which are generated in the process of paragraph processing if debug_mode is simple
  242. """
  243. if para_debug_mode is not None:
  244. for fpath in os.listdir(output_dir):
  245. if fpath.endswith(".json") and "stage" in fpath:
  246. os.remove(os.path.join(output_dir, fpath))
  247. return pdf_dic, error_info