# block_continuation_processor.py

  1. import os
  2. import unicodedata
  3. from magic_pdf.para.commons import *
  4. if sys.version_info[0] >= 3:
  5. sys.stdout.reconfigure(encoding="utf-8") # type: ignore
  6. class BlockContinuationProcessor:
  7. """
  8. This class is used to process the blocks to detect block continuations.
  9. """
  10. def __init__(self) -> None:
  11. pass
  12. def __is_similar_font_type(self, font_type1, font_type2, prefix_length_ratio=0.3):
  13. """
  14. This function checks if the two font types are similar.
  15. Definition of similar font types: the two font types have a common prefix,
  16. and the length of the common prefix is at least a certain ratio of the length of the shorter font type.
  17. Parameters
  18. ----------
  19. font_type1 : str
  20. font type 1
  21. font_type2 : str
  22. font type 2
  23. prefix_length_ratio : float
  24. minimum ratio of the common prefix length to the length of the shorter font type
  25. Returns
  26. -------
  27. bool
  28. True if the two font types are similar, False otherwise.
  29. """
  30. if isinstance(font_type1, list):
  31. font_type1 = font_type1[0] if font_type1 else ""
  32. if isinstance(font_type2, list):
  33. font_type2 = font_type2[0] if font_type2 else ""
  34. if font_type1 == font_type2:
  35. return True
  36. # Find the length of the common prefix
  37. common_prefix_length = len(os.path.commonprefix([font_type1, font_type2]))
  38. # Calculate the minimum prefix length based on the ratio
  39. min_prefix_length = int(min(len(font_type1), len(font_type2)) * prefix_length_ratio)
  40. return common_prefix_length >= min_prefix_length
  41. def __is_same_block_font(self, block1, block2):
  42. """
  43. This function compares the font of block1 and block2
  44. Parameters
  45. ----------
  46. block1 : dict
  47. block1
  48. block2 : dict
  49. block2
  50. Returns
  51. -------
  52. is_same : bool
  53. True if block1 and block2 have the same font, else False
  54. """
  55. block_1_font_type = safe_get(block1, "block_font_type", "")
  56. block_1_font_size = safe_get(block1, "block_font_size", 0)
  57. block_1_avg_char_width = safe_get(block1, "avg_char_width", 0)
  58. block_2_font_type = safe_get(block2, "block_font_type", "")
  59. block_2_font_size = safe_get(block2, "block_font_size", 0)
  60. block_2_avg_char_width = safe_get(block2, "avg_char_width", 0)
  61. if isinstance(block_1_font_size, list):
  62. block_1_font_size = block_1_font_size[0] if block_1_font_size else 0
  63. if isinstance(block_2_font_size, list):
  64. block_2_font_size = block_2_font_size[0] if block_2_font_size else 0
  65. block_1_text = safe_get(block1, "text", "")
  66. block_2_text = safe_get(block2, "text", "")
  67. if block_1_avg_char_width == 0 or block_2_avg_char_width == 0:
  68. return False
  69. if not block_1_text or not block_2_text:
  70. return False
  71. else:
  72. text_len_ratio = len(block_2_text) / len(block_1_text)
  73. if text_len_ratio < 0.2:
  74. avg_char_width_condition = (
  75. abs(block_1_avg_char_width - block_2_avg_char_width) / min(block_1_avg_char_width, block_2_avg_char_width)
  76. < 0.5
  77. )
  78. else:
  79. avg_char_width_condition = (
  80. abs(block_1_avg_char_width - block_2_avg_char_width) / min(block_1_avg_char_width, block_2_avg_char_width)
  81. < 0.2
  82. )
  83. block_font_size_condtion = abs(block_1_font_size - block_2_font_size) < 1
  84. return (
  85. self.__is_similar_font_type(block_1_font_type, block_2_font_type)
  86. and avg_char_width_condition
  87. and block_font_size_condtion
  88. )
  89. def _is_alphabet_char(self, char):
  90. if (char >= "\u0041" and char <= "\u005a") or (char >= "\u0061" and char <= "\u007a"):
  91. return True
  92. else:
  93. return False
  94. def _is_chinese_char(self, char):
  95. if char >= "\u4e00" and char <= "\u9fa5":
  96. return True
  97. else:
  98. return False
  99. def _is_other_letter_char(self, char):
  100. try:
  101. cat = unicodedata.category(char)
  102. if cat == "Lu" or cat == "Ll":
  103. return not self._is_alphabet_char(char) and not self._is_chinese_char(char)
  104. except TypeError:
  105. print("The input to the function must be a single character.")
  106. return False
  107. def _is_year(self, s: str):
  108. try:
  109. number = int(s)
  110. return 1900 <= number <= 2099
  111. except ValueError:
  112. return False
  113. def __is_para_font_consistent(self, para_1, para_2):
  114. """
  115. This function compares the font of para1 and para2
  116. Parameters
  117. ----------
  118. para1 : dict
  119. para1
  120. para2 : dict
  121. para2
  122. Returns
  123. -------
  124. is_same : bool
  125. True if para1 and para2 have the same font, else False
  126. """
  127. if para_1 is None or para_2 is None:
  128. return False
  129. para_1_font_type = safe_get(para_1, "para_font_type", "")
  130. para_1_font_size = safe_get(para_1, "para_font_size", 0)
  131. para_1_font_color = safe_get(para_1, "para_font_color", "")
  132. para_2_font_type = safe_get(para_2, "para_font_type", "")
  133. para_2_font_size = safe_get(para_2, "para_font_size", 0)
  134. para_2_font_color = safe_get(para_2, "para_font_color", "")
  135. if isinstance(para_1_font_type, list): # get the most common font type
  136. para_1_font_type = max(set(para_1_font_type), key=para_1_font_type.count)
  137. if isinstance(para_2_font_type, list):
  138. para_2_font_type = max(set(para_2_font_type), key=para_2_font_type.count)
  139. if isinstance(para_1_font_size, list): # compute average font type
  140. para_1_font_size = sum(para_1_font_size) / len(para_1_font_size)
  141. if isinstance(para_2_font_size, list): # compute average font type
  142. para_2_font_size = sum(para_2_font_size) / len(para_2_font_size)
  143. return (
  144. self.__is_similar_font_type(para_1_font_type, para_2_font_type)
  145. and abs(para_1_font_size - para_2_font_size) < 1.5
  146. # and para_font_color1 == para_font_color2
  147. )
  148. def _is_para_puncs_consistent(self, para_1, para_2):
  149. """
  150. This function determines whether para1 and para2 are originally from the same paragraph by checking the puncs of para1(former) and para2(latter)
  151. Parameters
  152. ----------
  153. para1 : dict
  154. para1
  155. para2 : dict
  156. para2
  157. Returns
  158. -------
  159. is_same : bool
  160. True if para1 and para2 are from the same paragraph by using the puncs, else False
  161. """
  162. para_1_text = safe_get(para_1, "para_text", "").strip()
  163. para_2_text = safe_get(para_2, "para_text", "").strip()
  164. para_1_bboxes = safe_get(para_1, "para_bbox", [])
  165. para_1_font_sizes = safe_get(para_1, "para_font_size", 0)
  166. para_2_bboxes = safe_get(para_2, "para_bbox", [])
  167. para_2_font_sizes = safe_get(para_2, "para_font_size", 0)
  168. # print_yellow(" Features of determine puncs_consistent:")
  169. # print(f" para_1_text: {para_1_text}")
  170. # print(f" para_2_text: {para_2_text}")
  171. # print(f" para_1_bboxes: {para_1_bboxes}")
  172. # print(f" para_2_bboxes: {para_2_bboxes}")
  173. # print(f" para_1_font_sizes: {para_1_font_sizes}")
  174. # print(f" para_2_font_sizes: {para_2_font_sizes}")
  175. if is_nested_list(para_1_bboxes):
  176. x0_1, y0_1, x1_1, y1_1 = para_1_bboxes[-1]
  177. else:
  178. x0_1, y0_1, x1_1, y1_1 = para_1_bboxes
  179. if is_nested_list(para_2_bboxes):
  180. x0_2, y0_2, x1_2, y1_2 = para_2_bboxes[0]
  181. para_2_font_sizes = para_2_font_sizes[0] # type: ignore
  182. else:
  183. x0_2, y0_2, x1_2, y1_2 = para_2_bboxes
  184. right_align_threshold = 0.5 * (para_1_font_sizes + para_2_font_sizes) * 0.8
  185. are_two_paras_right_aligned = abs(x1_1 - x1_2) < right_align_threshold
  186. left_indent_threshold = 0.5 * (para_1_font_sizes + para_2_font_sizes) * 0.8
  187. is_para1_left_indent_than_papa2 = x0_1 - x0_2 > left_indent_threshold
  188. is_para2_left_indent_than_papa1 = x0_2 - x0_1 > left_indent_threshold
  189. # Check if either para_text1 or para_text2 is empty
  190. if not para_1_text or not para_2_text:
  191. return False
  192. # Define the end puncs for a sentence to end and hyphen
  193. end_puncs = [".", "?", "!", "。", "?", "!", "…"]
  194. hyphen = ["-", "—"]
  195. # Check if para_text1 ends with either hyphen or non-end punctuation or spaces
  196. para_1_end_with_hyphen = para_1_text and para_1_text[-1] in hyphen
  197. para_1_end_with_end_punc = para_1_text and para_1_text[-1] in end_puncs
  198. para_1_end_with_space = para_1_text and para_1_text[-1] == " "
  199. para_1_not_end_with_end_punc = para_1_text and para_1_text[-1] not in end_puncs
  200. # print_yellow(f" para_1_end_with_hyphen: {para_1_end_with_hyphen}")
  201. # print_yellow(f" para_1_end_with_end_punc: {para_1_end_with_end_punc}")
  202. # print_yellow(f" para_1_not_end_with_end_punc: {para_1_not_end_with_end_punc}")
  203. # print_yellow(f" para_1_end_with_space: {para_1_end_with_space}")
  204. if para_1_end_with_hyphen: # If para_text1 ends with hyphen
  205. # print_red(f"para_1 is end with hyphen.")
  206. para_2_is_consistent = para_2_text and (
  207. para_2_text[0] in hyphen
  208. or (self._is_alphabet_char(para_2_text[0]) and para_2_text[0].islower())
  209. or (self._is_chinese_char(para_2_text[0]))
  210. or (self._is_other_letter_char(para_2_text[0]))
  211. )
  212. if para_2_is_consistent:
  213. # print(f"para_2 is consistent.\n")
  214. return True
  215. else:
  216. # print(f"para_2 is not consistent.\n")
  217. pass
  218. elif para_1_end_with_end_punc: # If para_text1 ends with ending punctuations
  219. # print_red(f"para_1 is end with end_punc.")
  220. para_2_is_consistent = (
  221. para_2_text
  222. and (
  223. para_2_text[0] == " "
  224. or (self._is_alphabet_char(para_2_text[0]) and para_2_text[0].isupper())
  225. or (self._is_chinese_char(para_2_text[0]))
  226. or (self._is_other_letter_char(para_2_text[0]))
  227. )
  228. and not is_para2_left_indent_than_papa1
  229. )
  230. if para_2_is_consistent:
  231. # print(f"para_2 is consistent.\n")
  232. return True
  233. else:
  234. # print(f"para_2 is not consistent.\n")
  235. pass
  236. elif para_1_not_end_with_end_punc: # If para_text1 is not end with ending punctuations
  237. # print_red(f"para_1 is NOT end with end_punc.")
  238. para_2_is_consistent = para_2_text and (
  239. para_2_text[0] == " "
  240. or (self._is_alphabet_char(para_2_text[0]) and para_2_text[0].islower())
  241. or (self._is_alphabet_char(para_2_text[0]))
  242. or (self._is_year(para_2_text[0:4]))
  243. or (are_two_paras_right_aligned or is_para1_left_indent_than_papa2)
  244. or (self._is_chinese_char(para_2_text[0]))
  245. or (self._is_other_letter_char(para_2_text[0]))
  246. )
  247. if para_2_is_consistent:
  248. # print(f"para_2 is consistent.\n")
  249. return True
  250. else:
  251. # print(f"para_2 is not consistent.\n")
  252. pass
  253. elif para_1_end_with_space: # If para_text1 ends with space
  254. # print_red(f"para_1 is end with space.")
  255. para_2_is_consistent = para_2_text and (
  256. para_2_text[0] == " "
  257. or (self._is_alphabet_char(para_2_text[0]) and para_2_text[0].islower())
  258. or (self._is_chinese_char(para_2_text[0]))
  259. or (self._is_other_letter_char(para_2_text[0]))
  260. )
  261. if para_2_is_consistent:
  262. # print(f"para_2 is consistent.\n")
  263. return True
  264. else:
  265. pass
  266. # print(f"para_2 is not consistent.\n")
  267. return False
  268. def _is_block_consistent(self, block1, block2):
  269. """
  270. This function determines whether block1 and block2 are originally from the same block
  271. Parameters
  272. ----------
  273. block1 : dict
  274. block1s
  275. block2 : dict
  276. block2
  277. Returns
  278. -------
  279. is_same : bool
  280. True if block1 and block2 are from the same block, else False
  281. """
  282. return self.__is_same_block_font(block1, block2)
  283. def _is_para_continued(self, para1, para2):
  284. """
  285. This function determines whether para1 and para2 are originally from the same paragraph
  286. Parameters
  287. ----------
  288. para1 : dict
  289. para1
  290. para2 : dict
  291. para2
  292. Returns
  293. -------
  294. is_same : bool
  295. True if para1 and para2 are from the same paragraph, else False
  296. """
  297. is_para_font_consistent = self.__is_para_font_consistent(para1, para2)
  298. is_para_puncs_consistent = self._is_para_puncs_consistent(para1, para2)
  299. return is_para_font_consistent and is_para_puncs_consistent
  300. def _are_boundaries_of_block_consistent(self, block1, block2):
  301. """
  302. This function checks if the boundaries of block1 and block2 are consistent
  303. Parameters
  304. ----------
  305. block1 : dict
  306. block1
  307. block2 : dict
  308. block2
  309. Returns
  310. -------
  311. is_consistent : bool
  312. True if the boundaries of block1 and block2 are consistent, else False
  313. """
  314. last_line_of_block1 = block1["lines"][-1]
  315. first_line_of_block2 = block2["lines"][0]
  316. spans_of_last_line_of_block1 = last_line_of_block1["spans"]
  317. spans_of_first_line_of_block2 = first_line_of_block2["spans"]
  318. font_type_of_last_line_of_block1 = spans_of_last_line_of_block1[0]["font"].lower()
  319. font_size_of_last_line_of_block1 = spans_of_last_line_of_block1[0]["size"]
  320. font_color_of_last_line_of_block1 = spans_of_last_line_of_block1[0]["color"]
  321. font_flags_of_last_line_of_block1 = spans_of_last_line_of_block1[0]["flags"]
  322. font_type_of_first_line_of_block2 = spans_of_first_line_of_block2[0]["font"].lower()
  323. font_size_of_first_line_of_block2 = spans_of_first_line_of_block2[0]["size"]
  324. font_color_of_first_line_of_block2 = spans_of_first_line_of_block2[0]["color"]
  325. font_flags_of_first_line_of_block2 = spans_of_first_line_of_block2[0]["flags"]
  326. return (
  327. self.__is_similar_font_type(font_type_of_last_line_of_block1, font_type_of_first_line_of_block2)
  328. and abs(font_size_of_last_line_of_block1 - font_size_of_first_line_of_block2) < 1
  329. # and font_color_of_last_line_of_block1 == font_color_of_first_line_of_block2
  330. and font_flags_of_last_line_of_block1 == font_flags_of_first_line_of_block2
  331. )
  332. def _get_last_paragraph(self, block):
  333. """
  334. Retrieves the last paragraph from a block.
  335. Parameters
  336. ----------
  337. block : dict
  338. The block from which to retrieve the paragraph.
  339. Returns
  340. -------
  341. dict
  342. The last paragraph of the block.
  343. """
  344. if block["paras"]:
  345. last_para_key = list(block["paras"].keys())[-1]
  346. return block["paras"][last_para_key]
  347. else:
  348. return None
  349. def _get_first_paragraph(self, block):
  350. """
  351. Retrieves the first paragraph from a block.
  352. Parameters
  353. ----------
  354. block : dict
  355. The block from which to retrieve the paragraph.
  356. Returns
  357. -------
  358. dict
  359. The first paragraph of the block.
  360. """
  361. if block["paras"]:
  362. first_para_key = list(block["paras"].keys())[0]
  363. return block["paras"][first_para_key]
  364. else:
  365. return None
  366. def should_merge_next_para(self, curr_para, next_para):
  367. if self._is_para_continued(curr_para, next_para):
  368. return True
  369. else:
  370. return False
  371. def batch_tag_paras(self, pdf_dict):
  372. the_last_page_id = len(pdf_dict) - 1
  373. for curr_page_idx, (curr_page_id, curr_page_content) in enumerate(pdf_dict.items()):
  374. if curr_page_id.startswith("page_") and curr_page_content.get("para_blocks", []):
  375. para_blocks_of_curr_page = curr_page_content["para_blocks"]
  376. next_page_idx = curr_page_idx + 1
  377. next_page_id = f"page_{next_page_idx}"
  378. next_page_content = pdf_dict.get(next_page_id, {})
  379. for i, current_block in enumerate(para_blocks_of_curr_page):
  380. for para_id, curr_para in current_block["paras"].items():
  381. curr_para["curr_para_location"] = [
  382. curr_page_idx,
  383. current_block["block_id"],
  384. int(para_id.split("_")[-1]),
  385. ]
  386. curr_para["next_para_location"] = None # 默认设置为None
  387. curr_para["merge_next_para"] = False # 默认设置为False
  388. next_block = para_blocks_of_curr_page[i + 1] if i < len(para_blocks_of_curr_page) - 1 else None
  389. if next_block:
  390. curr_block_last_para_key = list(current_block["paras"].keys())[-1]
  391. curr_blk_last_para = current_block["paras"][curr_block_last_para_key]
  392. next_block_first_para_key = list(next_block["paras"].keys())[0]
  393. next_blk_first_para = next_block["paras"][next_block_first_para_key]
  394. if self.should_merge_next_para(curr_blk_last_para, next_blk_first_para):
  395. curr_blk_last_para["next_para_location"] = [
  396. curr_page_idx,
  397. next_block["block_id"],
  398. int(next_block_first_para_key.split("_")[-1]),
  399. ]
  400. curr_blk_last_para["merge_next_para"] = True
  401. else:
  402. # Handle the case where the next block is in a different page
  403. curr_block_last_para_key = list(current_block["paras"].keys())[-1]
  404. curr_blk_last_para = current_block["paras"][curr_block_last_para_key]
  405. while not next_page_content.get("para_blocks", []) and next_page_idx <= the_last_page_id:
  406. next_page_idx += 1
  407. next_page_id = f"page_{next_page_idx}"
  408. next_page_content = pdf_dict.get(next_page_id, {})
  409. if next_page_content.get("para_blocks", []):
  410. next_blk_first_para_key = list(next_page_content["para_blocks"][0]["paras"].keys())[0]
  411. next_blk_first_para = next_page_content["para_blocks"][0]["paras"][next_blk_first_para_key]
  412. if self.should_merge_next_para(curr_blk_last_para, next_blk_first_para):
  413. curr_blk_last_para["next_para_location"] = [
  414. next_page_idx,
  415. next_page_content["para_blocks"][0]["block_id"],
  416. int(next_blk_first_para_key.split("_")[-1]),
  417. ]
  418. curr_blk_last_para["merge_next_para"] = True
  419. return pdf_dict
  420. def find_block_by_id(self, para_blocks, block_id):
  421. for block in para_blocks:
  422. if block.get("block_id") == block_id:
  423. return block
  424. return None
  425. def batch_merge_paras(self, pdf_dict):
  426. for page_id, page_content in pdf_dict.items():
  427. if page_id.startswith("page_") and page_content.get("para_blocks", []):
  428. para_blocks_of_page = page_content["para_blocks"]
  429. for i in range(len(para_blocks_of_page)):
  430. current_block = para_blocks_of_page[i]
  431. paras = current_block["paras"]
  432. for para_id, curr_para in list(paras.items()):
  433. # 跳过标题段落
  434. if curr_para.get("is_para_title"):
  435. continue
  436. while curr_para.get("merge_next_para"):
  437. next_para_location = curr_para.get("next_para_location")
  438. if not next_para_location:
  439. break
  440. next_page_idx, next_block_id, next_para_id = next_para_location
  441. next_page_id = f"page_{next_page_idx}"
  442. next_page_content = pdf_dict.get(next_page_id)
  443. if not next_page_content:
  444. break
  445. next_block = self.find_block_by_id(next_page_content.get("para_blocks", []), next_block_id)
  446. if not next_block:
  447. break
  448. next_para = next_block["paras"].get(f"para_{next_para_id}")
  449. if not next_para or next_para.get("is_para_title"):
  450. break
  451. # 合并段落文本
  452. curr_para_text = curr_para.get("para_text", "")
  453. next_para_text = next_para.get("para_text", "")
  454. curr_para["para_text"] = curr_para_text + " " + next_para_text
  455. # 更新 next_para_location
  456. curr_para["next_para_location"] = next_para.get("next_para_location")
  457. # 将下一个段落文本置为空,表示已被合并
  458. next_para["para_text"] = ""
  459. # 更新 merge_next_para 标记
  460. curr_para["merge_next_para"] = next_para.get("merge_next_para", False)
  461. return pdf_dict