# title_processor.py - title detection and title-level assignment for parsed PDF paragraph blocks.
import os
import re
import sys

import numpy as np

from magic_pdf.libs.nlp_utils import NLPModels
from magic_pdf.para.commons import *
  6. if sys.version_info[0] >= 3:
  7. sys.stdout.reconfigure(encoding="utf-8") # type: ignore
  8. class TitleProcessor:
  9. def __init__(self, *doc_statistics) -> None:
  10. if len(doc_statistics) > 0:
  11. self.doc_statistics = doc_statistics[0]
  12. self.nlp_model = NLPModels()
  13. self.MAX_TITLE_LEVEL = 3
  14. self.numbered_title_pattern = r"""
  15. ^ # 行首
  16. ( # 开始捕获组
  17. [\(\(]\d+[\)\)] # 括号内数字,支持中文和英文括号,例如:(1) 或 (1)
  18. |\d+[\)\)]\s # 数字后跟右括号和空格,支持中文和英文括号,例如:2) 或 2)
  19. |[\(\(][A-Z][\)\)] # 括号内大写字母,支持中文和英文括号,例如:(A) 或 (A)
  20. |[A-Z][\)\)]\s # 大写字母后跟右括号和空格,例如:A) 或 A)
  21. |[\(\(][IVXLCDM]+[\)\)] # 括号内罗马数字,支持中文和英文括号,例如:(I) 或 (I)
  22. |[IVXLCDM]+[\)\)]\s # 罗马数字后跟右括号和空格,例如:I) 或 I)
  23. |\d+(\.\d+)*\s # 数字或复合数字编号后跟空格,例如:1. 或 3.2.1
  24. |[一二三四五六七八九十百千]+[、\s] # 中文序号后跟顿号和空格,例如:一、
  25. |[\(|\(][一二三四五六七八九十百千]+[\)|\)]\s* # 中文括号内中文序号后跟空格,例如:(一)
  26. |[A-Z]\.\d+(\.\d+)?\s # 大写字母后跟点和数字,例如:A.1 或 A.1.1
  27. |[\(\(][a-z][\)\)] # 括号内小写字母,支持中文和英文括号,例如:(a) 或 (a)
  28. |[a-z]\)\s # 小写字母后跟右括号和空格,例如:a)
  29. |[A-Z]-\s # 大写字母后跟短横线和空格,例如:A-
  30. |\w+:\s # 英文序号词后跟冒号和空格,例如:First:
  31. |第[一二三四五六七八九十百千]+[章节部分条款]\s # 以“第”开头的中文标题后跟空格
  32. |[IVXLCDM]+\. # 罗马数字后跟点,例如:I.
  33. |\d+\.\s # 单个数字后跟点和空格,例如:1.
  34. ) # 结束捕获组
  35. .+ # 标题的其余部分
  36. """
  37. def _is_potential_title(
  38. self,
  39. curr_line,
  40. prev_line,
  41. prev_line_is_title,
  42. next_line,
  43. avg_char_width,
  44. avg_char_height,
  45. median_font_size,
  46. ):
  47. """
  48. This function checks if the line is a potential title.
  49. Parameters
  50. ----------
  51. curr_line : dict
  52. current line
  53. prev_line : dict
  54. previous line
  55. next_line : dict
  56. next line
  57. avg_char_width : float
  58. average of char widths
  59. avg_char_height : float
  60. average of line heights
  61. Returns
  62. -------
  63. bool
  64. True if the line is a potential title, False otherwise.
  65. """
  66. def __is_line_centered(line_bbox, page_bbox, avg_char_width):
  67. """
  68. This function checks if the line is centered on the page
  69. Parameters
  70. ----------
  71. line_bbox : list
  72. bbox of the line
  73. page_bbox : list
  74. bbox of the page
  75. avg_char_width : float
  76. average of char widths
  77. Returns
  78. -------
  79. bool
  80. True if the line is centered on the page, False otherwise.
  81. """
  82. horizontal_ratio = 0.5
  83. horizontal_thres = horizontal_ratio * avg_char_width
  84. x0, _, x1, _ = line_bbox
  85. _, _, page_x1, _ = page_bbox
  86. return abs((x0 + x1) / 2 - page_x1 / 2) < horizontal_thres
  87. def __is_bold_font_line(line):
  88. """
  89. Check if a line contains any bold font style.
  90. """
  91. def _is_bold_span(span):
  92. # if span text is empty or only contains space, return False
  93. if not span["text"].strip():
  94. return False
  95. return bool(span["flags"] & 2**4) # Check if the font is bold
  96. for span in line["spans"]:
  97. if not _is_bold_span(span):
  98. return False
  99. return True
  100. def __is_italic_font_line(line):
  101. """
  102. Check if a line contains any italic font style.
  103. """
  104. def __is_italic_span(span):
  105. return bool(span["flags"] & 2**1) # Check if the font is italic
  106. for span in line["spans"]:
  107. if not __is_italic_span(span):
  108. return False
  109. return True
  110. def __is_punctuation_heavy(line_text):
  111. """
  112. Check if the line contains a high ratio of punctuation marks, which may indicate
  113. that the line is not a title.
  114. Parameters:
  115. line_text (str): Text of the line.
  116. Returns:
  117. bool: True if the line is heavy with punctuation, False otherwise.
  118. """
  119. # Pattern for common title format like "X.Y. Title"
  120. pattern = r"\b\d+\.\d+\..*\b"
  121. # If the line matches the title format, return False
  122. if re.match(pattern, line_text.strip()):
  123. return False
  124. # Find all punctuation marks in the line
  125. punctuation_marks = re.findall(r"[^\w\s]", line_text)
  126. number_of_punctuation_marks = len(punctuation_marks)
  127. text_length = len(line_text)
  128. if text_length == 0:
  129. return False
  130. punctuation_ratio = number_of_punctuation_marks / text_length
  131. if punctuation_ratio >= 0.1:
  132. return True
  133. return False
  134. def __has_mixed_font_styles(spans, strict_mode=False):
  135. """
  136. This function checks if the line has mixed font styles, the strict mode will compare the font types
  137. Parameters
  138. ----------
  139. spans : list
  140. spans of the line
  141. strict_mode : bool
  142. True for strict mode, the font types will be fully compared
  143. False for non-strict mode, the font types will be compared by the most longest common prefix
  144. Returns
  145. -------
  146. bool
  147. True if the line has mixed font styles, False otherwise.
  148. """
  149. if strict_mode:
  150. font_styles = set()
  151. for span in spans:
  152. font_style = span["font"].lower()
  153. font_styles.add(font_style)
  154. return len(font_styles) > 1
  155. else: # non-strict mode
  156. font_styles = []
  157. for span in spans:
  158. font_style = span["font"].lower()
  159. font_styles.append(font_style)
  160. if len(font_styles) > 1:
  161. longest_common_prefix = os.path.commonprefix(font_styles)
  162. if len(longest_common_prefix) > 0:
  163. return False
  164. else:
  165. return True
  166. else:
  167. return False
  168. def __is_different_font_type_from_neighbors(curr_line_font_type, prev_line_font_type, next_line_font_type):
  169. """
  170. This function checks if the current line has a different font type from the previous and next lines
  171. Parameters
  172. ----------
  173. curr_line_font_type : str
  174. font type of the current line
  175. prev_line_font_type : str
  176. font type of the previous line
  177. next_line_font_type : str
  178. font type of the next line
  179. Returns
  180. -------
  181. bool
  182. True if the current line has a different font type from the previous and next lines, False otherwise.
  183. """
  184. return all(
  185. curr_line_font_type != other_font_type.lower()
  186. for other_font_type in [prev_line_font_type, next_line_font_type]
  187. if other_font_type is not None
  188. )
  189. def __is_larger_font_size_from_neighbors(curr_line_font_size, prev_line_font_size, next_line_font_size):
  190. """
  191. This function checks if the current line has a larger font size than the previous and next lines
  192. Parameters
  193. ----------
  194. curr_line_font_size : float
  195. font size of the current line
  196. prev_line_font_size : float
  197. font size of the previous line
  198. next_line_font_size : float
  199. font size of the next line
  200. Returns
  201. -------
  202. bool
  203. True if the current line has a larger font size than the previous and next lines, False otherwise.
  204. """
  205. return all(
  206. curr_line_font_size > other_font_size * 1.2
  207. for other_font_size in [prev_line_font_size, next_line_font_size]
  208. if other_font_size is not None
  209. )
  210. def __is_similar_to_pre_line(curr_line_font_type, prev_line_font_type, curr_line_font_size, prev_line_font_size):
  211. """
  212. This function checks if the current line is similar to the previous line
  213. Parameters
  214. ----------
  215. curr_line : dict
  216. current line
  217. prev_line : dict
  218. previous line
  219. Returns
  220. -------
  221. bool
  222. True if the current line is similar to the previous line, False otherwise.
  223. """
  224. if curr_line_font_type == prev_line_font_type and curr_line_font_size == prev_line_font_size:
  225. return True
  226. else:
  227. return False
  228. def __is_same_font_type_of_docAvg(curr_line_font_type):
  229. """
  230. This function checks if the current line has the same font type as the document average font type
  231. Parameters
  232. ----------
  233. curr_line_font_type : str
  234. font type of the current line
  235. Returns
  236. -------
  237. bool
  238. True if the current line has the same font type as the document average font type, False otherwise.
  239. """
  240. doc_most_common_font_type = safe_get(self.doc_statistics, "most_common_font_type", "").lower()
  241. doc_second_most_common_font_type = safe_get(self.doc_statistics, "second_most_common_font_type", "").lower()
  242. return curr_line_font_type.lower() in [doc_most_common_font_type, doc_second_most_common_font_type]
  243. def __is_font_size_not_less_than_docAvg(curr_line_font_size, ratio: float = 1):
  244. """
  245. This function checks if the current line has a large enough font size
  246. Parameters
  247. ----------
  248. curr_line_font_size : float
  249. font size of the current line
  250. ratio : float
  251. ratio of the current line font size to the document average font size
  252. Returns
  253. -------
  254. bool
  255. True if the current line has a large enough font size, False otherwise.
  256. """
  257. doc_most_common_font_size = safe_get(self.doc_statistics, "most_common_font_size", 0)
  258. doc_second_most_common_font_size = safe_get(self.doc_statistics, "second_most_common_font_size", 0)
  259. doc_avg_font_size = min(doc_most_common_font_size, doc_second_most_common_font_size)
  260. return curr_line_font_size >= doc_avg_font_size * ratio
  261. def __is_sufficient_spacing_above_and_below(
  262. curr_line_bbox,
  263. prev_line_bbox,
  264. next_line_bbox,
  265. avg_char_height,
  266. median_font_size,
  267. ):
  268. """
  269. This function checks if the current line has sufficient spacing above and below
  270. Parameters
  271. ----------
  272. curr_line_bbox : list
  273. bbox of the current line
  274. prev_line_bbox : list
  275. bbox of the previous line
  276. next_line_bbox : list
  277. bbox of the next line
  278. avg_char_width : float
  279. average of char widths
  280. avg_char_height : float
  281. average of line heights
  282. Returns
  283. -------
  284. bool
  285. True if the current line has sufficient spacing above and below, False otherwise.
  286. """
  287. vertical_ratio = 1.25
  288. vertical_thres = vertical_ratio * median_font_size
  289. _, y0, _, y1 = curr_line_bbox
  290. sufficient_spacing_above = False
  291. if prev_line_bbox:
  292. vertical_spacing_above = min(y0 - prev_line_bbox[1], y1 - prev_line_bbox[3])
  293. sufficient_spacing_above = vertical_spacing_above > vertical_thres
  294. else:
  295. sufficient_spacing_above = True
  296. sufficient_spacing_below = False
  297. if next_line_bbox:
  298. vertical_spacing_below = min(next_line_bbox[1] - y0, next_line_bbox[3] - y1)
  299. sufficient_spacing_below = vertical_spacing_below > vertical_thres
  300. else:
  301. sufficient_spacing_below = True
  302. return (sufficient_spacing_above, sufficient_spacing_below)
  303. def __is_word_list_line_by_rules(curr_line_text):
  304. """
  305. This function checks if the current line is a word list
  306. Parameters
  307. ----------
  308. curr_line_text : str
  309. text of the current line
  310. Returns
  311. -------
  312. bool
  313. True if the current line is a name list, False otherwise.
  314. """
  315. # name_list_pattern = r"([a-zA-Z][a-zA-Z\s]{0,20}[a-zA-Z]|[\u4e00-\u9fa5·]{2,16})(?=[,,;;\s]|$)"
  316. name_list_pattern = r"(?<![\u4e00-\u9fa5])([A-Z][a-z]{0,19}\s[A-Z][a-z]{0,19}|[\u4e00-\u9fa5]{2,6})(?=[,,;;\s]|$)"
  317. compiled_pattern = re.compile(name_list_pattern)
  318. if compiled_pattern.search(curr_line_text):
  319. return True
  320. else:
  321. return False
  322. # """
  323. def __get_text_catgr_by_nlp(curr_line_text):
  324. """
  325. This function checks if the current line is a name list using nlp model, such as spacy
  326. Parameters
  327. ----------
  328. curr_line_text : str
  329. text of the current line
  330. Returns
  331. -------
  332. bool
  333. True if the current line is a name list, False otherwise.
  334. """
  335. result = self.nlp_model.detect_entity_catgr_using_nlp(curr_line_text)
  336. return result
  337. # """
  338. def __is_numbered_title(curr_line_text):
  339. """
  340. This function checks if the current line is a numbered list
  341. Parameters
  342. ----------
  343. curr_line_text : str
  344. text of the current line
  345. Returns
  346. -------
  347. bool
  348. True if the current line is a numbered list, False otherwise.
  349. """
  350. compiled_pattern = re.compile(self.numbered_title_pattern, re.VERBOSE)
  351. if compiled_pattern.search(curr_line_text):
  352. return True
  353. else:
  354. return False
  355. def __is_end_with_ending_puncs(line_text):
  356. """
  357. This function checks if the current line ends with a ending punctuation mark
  358. Parameters
  359. ----------
  360. line_text : str
  361. text of the current line
  362. Returns
  363. -------
  364. bool
  365. True if the current line ends with a punctuation mark, False otherwise.
  366. """
  367. end_puncs = [".", "?", "!", "。", "?", "!", "…"]
  368. line_text = line_text.rstrip()
  369. if line_text[-1] in end_puncs:
  370. return True
  371. return False
  372. def __contains_only_no_meaning_symbols(line_text):
  373. """
  374. This function checks if the current line contains only symbols that have no meaning, if so, it is not a title.
  375. Situation contains:
  376. 1. Only have punctuation marks
  377. 2. Only have other non-meaning symbols
  378. Parameters
  379. ----------
  380. line_text : str
  381. text of the current line
  382. Returns
  383. -------
  384. bool
  385. True if the current line contains only symbols that have no meaning, False otherwise.
  386. """
  387. punctuation_marks = re.findall(r"[^\w\s]", line_text) # find all punctuation marks
  388. number_of_punctuation_marks = len(punctuation_marks)
  389. text_length = len(line_text)
  390. if text_length == 0:
  391. return False
  392. punctuation_ratio = number_of_punctuation_marks / text_length
  393. if punctuation_ratio >= 0.9:
  394. return True
  395. return False
  396. def __is_equation(line_text):
  397. """
  398. This function checks if the current line is an equation.
  399. Parameters
  400. ----------
  401. line_text : str
  402. Returns
  403. -------
  404. bool
  405. True if the current line is an equation, False otherwise.
  406. """
  407. equation_reg = r"\$.*?\\overline.*?\$" # to match interline equations
  408. if re.search(equation_reg, line_text):
  409. return True
  410. else:
  411. return False
  412. def __is_title_by_len(text, max_length=200):
  413. """
  414. This function checks if the current line is a title by length.
  415. Parameters
  416. ----------
  417. text : str
  418. text of the current line
  419. max_length : int
  420. max length of the title
  421. Returns
  422. -------
  423. bool
  424. True if the current line is a title, False otherwise.
  425. """
  426. text = text.strip()
  427. return len(text) <= max_length
  428. def __compute_line_font_type_and_size(curr_line):
  429. """
  430. This function computes the font type and font size of the line.
  431. Parameters
  432. ----------
  433. line : dict
  434. line
  435. Returns
  436. -------
  437. font_type : str
  438. font type of the line
  439. font_size : float
  440. font size of the line
  441. """
  442. spans = curr_line["spans"]
  443. max_accumulated_length = 0
  444. max_span_font_size = curr_line["spans"][0]["size"] # default value, float type
  445. max_span_font_type = curr_line["spans"][0]["font"].lower() # default value, string type
  446. for span in spans:
  447. if span["text"].isspace():
  448. continue
  449. span_length = span["bbox"][2] - span["bbox"][0]
  450. if span_length > max_accumulated_length:
  451. max_accumulated_length = span_length
  452. max_span_font_size = span["size"]
  453. max_span_font_type = span["font"].lower()
  454. return max_span_font_type, max_span_font_size
  455. """
  456. Title detecting main Process.
  457. """
  458. """
  459. Basic features about the current line.
  460. """
  461. curr_line_bbox = curr_line["bbox"]
  462. curr_line_text = curr_line["text"]
  463. curr_line_font_type, curr_line_font_size = __compute_line_font_type_and_size(curr_line)
  464. if len(curr_line_text.strip()) == 0: # skip empty lines
  465. return False
  466. prev_line_bbox = prev_line["bbox"] if prev_line else None
  467. if prev_line:
  468. prev_line_font_type, prev_line_font_size = __compute_line_font_type_and_size(prev_line)
  469. else:
  470. prev_line_font_type, prev_line_font_size = None, None
  471. next_line_bbox = next_line["bbox"] if next_line else None
  472. if next_line:
  473. next_line_font_type, next_line_font_size = __compute_line_font_type_and_size(next_line)
  474. else:
  475. next_line_font_type, next_line_font_size = None, None
  476. """
  477. Aggregated features about the current line.
  478. """
  479. is_italc_font = __is_italic_font_line(curr_line)
  480. is_bold_font = __is_bold_font_line(curr_line)
  481. is_font_size_little_less_than_doc_avg = __is_font_size_not_less_than_docAvg(curr_line_font_size, ratio=0.8)
  482. is_font_size_not_less_than_doc_avg = __is_font_size_not_less_than_docAvg(curr_line_font_size, ratio=1)
  483. is_much_larger_font_than_doc_avg = __is_font_size_not_less_than_docAvg(curr_line_font_size, ratio=1.6)
  484. is_not_same_font_type_of_docAvg = not __is_same_font_type_of_docAvg(curr_line_font_type)
  485. is_potential_title_font = is_bold_font or is_font_size_not_less_than_doc_avg or is_not_same_font_type_of_docAvg
  486. is_mix_font_styles_strict = __has_mixed_font_styles(curr_line["spans"], strict_mode=True)
  487. is_mix_font_styles_loose = __has_mixed_font_styles(curr_line["spans"], strict_mode=False)
  488. is_punctuation_heavy = __is_punctuation_heavy(curr_line_text)
  489. is_word_list_line_by_rules = __is_word_list_line_by_rules(curr_line_text)
  490. is_person_or_org_list_line_by_nlp = __get_text_catgr_by_nlp(curr_line_text) in ["PERSON", "GPE", "ORG"]
  491. is_font_size_larger_than_neighbors = __is_larger_font_size_from_neighbors(
  492. curr_line_font_size, prev_line_font_size, next_line_font_size
  493. )
  494. is_font_type_diff_from_neighbors = __is_different_font_type_from_neighbors(
  495. curr_line_font_type, prev_line_font_type, next_line_font_type
  496. )
  497. has_sufficient_spaces_above, has_sufficient_spaces_below = __is_sufficient_spacing_above_and_below(
  498. curr_line_bbox, prev_line_bbox, next_line_bbox, avg_char_height, median_font_size
  499. )
  500. is_similar_to_pre_line = __is_similar_to_pre_line(
  501. curr_line_font_type, prev_line_font_type, curr_line_font_size, prev_line_font_size
  502. )
  503. """
  504. Further aggregated features about the current line.
  505. Attention:
  506. Features that start with __ are for internal use.
  507. """
  508. __is_line_left_aligned_from_neighbors = is_line_left_aligned_from_neighbors(
  509. curr_line_bbox, prev_line_bbox, next_line_bbox, avg_char_width
  510. )
  511. __is_font_diff_from_neighbors = is_font_size_larger_than_neighbors or is_font_type_diff_from_neighbors
  512. is_a_left_inline_title = (
  513. is_mix_font_styles_strict and __is_line_left_aligned_from_neighbors and __is_font_diff_from_neighbors
  514. )
  515. is_title_by_check_prev_line = prev_line is None and has_sufficient_spaces_above and is_potential_title_font
  516. is_title_by_check_next_line = next_line is None and has_sufficient_spaces_below and is_potential_title_font
  517. is_title_by_check_pre_and_next_line = (
  518. (prev_line is not None or next_line is not None)
  519. and has_sufficient_spaces_above
  520. and has_sufficient_spaces_below
  521. and is_potential_title_font
  522. )
  523. is_numbered_title = __is_numbered_title(curr_line_text) and (
  524. (has_sufficient_spaces_above or prev_line is None) and (has_sufficient_spaces_below or next_line is None)
  525. )
  526. is_not_end_with_ending_puncs = not __is_end_with_ending_puncs(curr_line_text)
  527. is_not_only_no_meaning_symbols = not __contains_only_no_meaning_symbols(curr_line_text)
  528. is_equation = __is_equation(curr_line_text)
  529. is_title_by_len = __is_title_by_len(curr_line_text)
  530. """
  531. Decide if the line is a title.
  532. """
  533. # is_title = False
  534. # if prev_line_is_title:
  535. is_title = (
  536. is_not_end_with_ending_puncs # not end with ending punctuation marks
  537. and is_not_only_no_meaning_symbols # not only have no meaning symbols
  538. and is_title_by_len # is a title by length, default max length is 200
  539. and not is_equation # an interline equation should never be a title
  540. and is_potential_title_font # is a potential title font, which is bold or larger than the document average font size or not the same font type as the document average font type
  541. and (
  542. (is_not_same_font_type_of_docAvg and is_font_size_not_less_than_doc_avg)
  543. or (is_bold_font and is_much_larger_font_than_doc_avg and is_not_same_font_type_of_docAvg)
  544. or (
  545. is_much_larger_font_than_doc_avg
  546. and (is_title_by_check_prev_line or is_title_by_check_next_line or is_title_by_check_pre_and_next_line)
  547. )
  548. or (
  549. is_font_size_little_less_than_doc_avg
  550. and is_bold_font
  551. and (is_title_by_check_prev_line or is_title_by_check_next_line or is_title_by_check_pre_and_next_line)
  552. )
  553. ) # not the same font type as the document average font type, which includes the most common font type and the second most common font type
  554. and (
  555. (
  556. not is_person_or_org_list_line_by_nlp
  557. and (
  558. is_much_larger_font_than_doc_avg
  559. or (is_not_same_font_type_of_docAvg and is_font_size_not_less_than_doc_avg)
  560. )
  561. )
  562. or (
  563. not (is_word_list_line_by_rules and is_person_or_org_list_line_by_nlp)
  564. and not is_a_left_inline_title
  565. and not is_punctuation_heavy
  566. and (is_title_by_check_prev_line or is_title_by_check_next_line or is_title_by_check_pre_and_next_line)
  567. )
  568. or (
  569. is_person_or_org_list_line_by_nlp
  570. and (is_bold_font and is_much_larger_font_than_doc_avg and is_not_same_font_type_of_docAvg)
  571. and (is_bold_font and is_much_larger_font_than_doc_avg and is_not_same_font_type_of_docAvg)
  572. )
  573. or (is_numbered_title and not is_a_left_inline_title)
  574. )
  575. )
  576. # ) or (is_similar_to_pre_line and prev_line_is_title)
  577. is_name_or_org_list_to_be_removed = (
  578. (is_person_or_org_list_line_by_nlp)
  579. and is_punctuation_heavy
  580. and (is_title_by_check_prev_line or is_title_by_check_next_line or is_title_by_check_pre_and_next_line)
  581. ) and not is_title
  582. if is_name_or_org_list_to_be_removed:
  583. is_author_or_org_list = True
  584. # print curr_line_text to check
  585. # print_yellow(f"Text of is_author_or_org_list: {curr_line_text}")
  586. else:
  587. is_author_or_org_list = False
  588. """
  589. # print reason why the line is a title
  590. if is_title:
  591. print_green("This line is a title.")
  592. print_green("↓" * 10)
  593. print()
  594. print("curr_line_text: ", curr_line_text)
  595. print()
  596. # print reason why the line is not a title
  597. line_text = curr_line_text.strip()
  598. test_text = "Career/Personal Life"
  599. text_content_condition = line_text == test_text
  600. if not is_title and text_content_condition: # Print specific line
  601. # if not is_title: # Print each line
  602. print_red("This line is not a title.")
  603. print_red("↓" * 10)
  604. print()
  605. print("curr_line_text: ", curr_line_text)
  606. print()
  607. if is_not_end_with_ending_puncs:
  608. print_green(f"is_not_end_with_ending_puncs")
  609. else:
  610. print_red(f"is_end_with_ending_puncs")
  611. if is_not_only_no_meaning_symbols:
  612. print_green(f"is_not_only_no_meaning_symbols")
  613. else:
  614. print_red(f"is_only_no_meaning_symbols")
  615. if is_title_by_len:
  616. print_green(f"is_title_by_len: {is_title_by_len}")
  617. else:
  618. print_red(f"is_not_title_by_len: {is_title_by_len}")
  619. if is_equation:
  620. print_red(f"is_equation")
  621. else:
  622. print_green(f"is_not_equation")
  623. if is_potential_title_font:
  624. print_green(f"is_potential_title_font")
  625. else:
  626. print_red(f"is_not_potential_title_font")
  627. if is_punctuation_heavy:
  628. print_red("is_punctuation_heavy")
  629. else:
  630. print_green("is_not_punctuation_heavy")
  631. if is_bold_font:
  632. print_green(f"is_bold_font")
  633. else:
  634. print_red(f"is_not_bold_font")
  635. if is_font_size_not_less_than_doc_avg:
  636. print_green(f"is_larger_font_than_doc_avg")
  637. else:
  638. print_red(f"is_not_larger_font_than_doc_avg")
  639. if is_much_larger_font_than_doc_avg:
  640. print_green(f"is_much_larger_font_than_doc_avg")
  641. else:
  642. print_red(f"is_not_much_larger_font_than_doc_avg")
  643. if is_not_same_font_type_of_docAvg:
  644. print_green(f"is_not_same_font_type_of_docAvg")
  645. else:
  646. print_red(f"is_same_font_type_of_docAvg")
  647. if is_word_list_line_by_rules:
  648. print_red("is_word_list_line_by_rules")
  649. else:
  650. print_green("is_not_name_list_by_rules")
  651. if is_person_or_org_list_line_by_nlp:
  652. print_red("is_person_or_org_list_line_by_nlp")
  653. else:
  654. print_green("is_not_person_or_org_list_line_by_nlp")
  655. if not is_numbered_title:
  656. print_red("is_not_numbered_title")
  657. else:
  658. print_green("is_numbered_title")
  659. if is_a_left_inline_title:
  660. print_red("is_a_left_inline_title")
  661. else:
  662. print_green("is_not_a_left_inline_title")
  663. if not is_title_by_check_prev_line:
  664. print_red("is_not_title_by_check_prev_line")
  665. else:
  666. print_green("is_title_by_check_prev_line")
  667. if not is_title_by_check_next_line:
  668. print_red("is_not_title_by_check_next_line")
  669. else:
  670. print_green("is_title_by_check_next_line")
  671. if not is_title_by_check_pre_and_next_line:
  672. print_red("is_not_title_by_check_pre_and_next_line")
  673. else:
  674. print_green("is_title_by_check_pre_and_next_line")
  675. # print_green("Common features:")
  676. # print_green("↓" * 10)
  677. # print(f" curr_line_font_type: {curr_line_font_type}")
  678. # print(f" curr_line_font_size: {curr_line_font_size}")
  679. # print()
  680. """
  681. return is_title, is_author_or_org_list
  682. def _detect_block_title(self, input_block):
  683. """
  684. Use the functions 'is_potential_title' to detect titles of each paragraph block.
  685. If a line is a title, then the value of key 'is_title' of the line will be set to True.
  686. """
  687. raw_lines = input_block["lines"]
  688. prev_line_is_title_flag = False
  689. for i, curr_line in enumerate(raw_lines):
  690. prev_line = raw_lines[i - 1] if i > 0 else None
  691. next_line = raw_lines[i + 1] if i < len(raw_lines) - 1 else None
  692. blk_avg_char_width = input_block["avg_char_width"]
  693. blk_avg_char_height = input_block["avg_char_height"]
  694. blk_media_font_size = input_block["median_font_size"]
  695. is_title, is_author_or_org_list = self._is_potential_title(
  696. curr_line,
  697. prev_line,
  698. prev_line_is_title_flag,
  699. next_line,
  700. blk_avg_char_width,
  701. blk_avg_char_height,
  702. blk_media_font_size,
  703. )
  704. if is_title:
  705. curr_line["is_title"] = is_title
  706. prev_line_is_title_flag = True
  707. else:
  708. curr_line["is_title"] = False
  709. prev_line_is_title_flag = False
  710. if is_author_or_org_list:
  711. curr_line["is_author_or_org_list"] = is_author_or_org_list
  712. else:
  713. curr_line["is_author_or_org_list"] = False
  714. return input_block
  715. def batch_process_blocks_detect_titles(self, pdf_dic):
  716. """
  717. This function batch process the blocks to detect titles.
  718. Parameters
  719. ----------
  720. pdf_dict : dict
  721. result dictionary
  722. Returns
  723. -------
  724. pdf_dict : dict
  725. result dictionary
  726. """
  727. num_titles = 0
  728. for page_id, blocks in pdf_dic.items():
  729. if page_id.startswith("page_"):
  730. para_blocks = []
  731. if "para_blocks" in blocks.keys():
  732. para_blocks = blocks["para_blocks"]
  733. all_single_line_blocks = []
  734. for block in para_blocks:
  735. if len(block["lines"]) == 1:
  736. all_single_line_blocks.append(block)
  737. new_para_blocks = []
  738. if not len(all_single_line_blocks) == len(para_blocks): # Not all blocks are single line blocks.
  739. for para_block in para_blocks:
  740. new_block = self._detect_block_title(para_block)
  741. new_para_blocks.append(new_block)
  742. num_titles += sum([line.get("is_title", 0) for line in new_block["lines"]])
  743. else: # All blocks are single line blocks.
  744. for para_block in para_blocks:
  745. new_para_blocks.append(para_block)
  746. num_titles += sum([line.get("is_title", 0) for line in para_block["lines"]])
  747. para_blocks = new_para_blocks
  748. blocks["para_blocks"] = para_blocks
  749. for para_block in para_blocks:
  750. all_titles = all(safe_get(line, "is_title", False) for line in para_block["lines"])
  751. para_text_len = sum([len(line["text"]) for line in para_block["lines"]])
  752. if (
  753. all_titles and para_text_len < 200
  754. ): # total length of the paragraph is less than 200, more than this should not be a title
  755. para_block["is_block_title"] = 1
  756. else:
  757. para_block["is_block_title"] = 0
  758. all_name_or_org_list_to_be_removed = all(
  759. safe_get(line, "is_author_or_org_list", False) for line in para_block["lines"]
  760. )
  761. if all_name_or_org_list_to_be_removed and page_id == "page_0":
  762. para_block["is_block_an_author_or_org_list"] = 1
  763. else:
  764. para_block["is_block_an_author_or_org_list"] = 0
  765. pdf_dic["statistics"]["num_titles"] = num_titles
  766. return pdf_dic
  767. def __determine_size_based_level(self, title_blocks):
  768. """
  769. This function determines the title level based on the font size of the title.
  770. Parameters
  771. ----------
  772. title_blocks : list
  773. Returns
  774. -------
  775. title_blocks : list
  776. """
  777. font_sizes = np.array([safe_get(tb["block"], "block_font_size", 0) for tb in title_blocks])
  778. # Use the mean and std of font sizes to remove extreme values
  779. mean_font_size = np.mean(font_sizes)
  780. std_font_size = np.std(font_sizes)
  781. min_extreme_font_size = mean_font_size - std_font_size # type: ignore
  782. max_extreme_font_size = mean_font_size + std_font_size # type: ignore
  783. # Compute the threshold for title level
  784. middle_font_sizes = font_sizes[(font_sizes > min_extreme_font_size) & (font_sizes < max_extreme_font_size)]
  785. if middle_font_sizes.size > 0:
  786. middle_mean_font_size = np.mean(middle_font_sizes)
  787. level_threshold = middle_mean_font_size
  788. else:
  789. level_threshold = mean_font_size
  790. for tb in title_blocks:
  791. title_block = tb["block"]
  792. title_font_size = safe_get(title_block, "block_font_size", 0)
  793. current_level = 1 # Initialize title level, the biggest level is 1
  794. # print(f"Before adjustment by font size, {current_level}")
  795. if title_font_size >= max_extreme_font_size:
  796. current_level = 1
  797. elif title_font_size <= min_extreme_font_size:
  798. current_level = 3
  799. elif float(title_font_size) >= float(level_threshold):
  800. current_level = 2
  801. else:
  802. current_level = 3
  803. # print(f"After adjustment by font size, {current_level}")
  804. title_block["block_title_level"] = current_level
  805. return title_blocks
  806. def batch_process_blocks_recog_title_level(self, pdf_dic):
  807. title_blocks = []
  808. # Collect all titles
  809. for page_id, blocks in pdf_dic.items():
  810. if page_id.startswith("page_"):
  811. para_blocks = blocks.get("para_blocks", [])
  812. for block in para_blocks:
  813. if block.get("is_block_title"):
  814. title_obj = {"page_id": page_id, "block": block}
  815. title_blocks.append(title_obj)
  816. # Determine title level
  817. if title_blocks:
  818. # Determine title level based on font size
  819. title_blocks = self.__determine_size_based_level(title_blocks)
  820. return pdf_dic