exceptions.py 4.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198
  1. class DenseSingleLineBlockException(Exception):
  2. """
  3. This class defines the exception type for dense single line-block.
  4. """
  5. def __init__(self, message="DenseSingleLineBlockException"):
  6. self.message = message
  7. super().__init__(self.message)
  8. def __str__(self):
  9. return f"{self.message}"
  10. def __repr__(self):
  11. return f"{self.message}"
  12. class TitleDetectionException(Exception):
  13. """
  14. This class defines the exception type for title detection.
  15. """
  16. def __init__(self, message="TitleDetectionException"):
  17. self.message = message
  18. super().__init__(self.message)
  19. def __str__(self):
  20. return f"{self.message}"
  21. def __repr__(self):
  22. return f"{self.message}"
  23. class TitleLevelException(Exception):
  24. """
  25. This class defines the exception type for title level.
  26. """
  27. def __init__(self, message="TitleLevelException"):
  28. self.message = message
  29. super().__init__(self.message)
  30. def __str__(self):
  31. return f"{self.message}"
  32. def __repr__(self):
  33. return f"{self.message}"
  34. class ParaSplitException(Exception):
  35. """
  36. This class defines the exception type for paragraph splitting.
  37. """
  38. def __init__(self, message="ParaSplitException"):
  39. self.message = message
  40. super().__init__(self.message)
  41. def __str__(self):
  42. return f"{self.message}"
  43. def __repr__(self):
  44. return f"{self.message}"
  45. class ParaMergeException(Exception):
  46. """
  47. This class defines the exception type for paragraph merging.
  48. """
  49. def __init__(self, message="ParaMergeException"):
  50. self.message = message
  51. super().__init__(self.message)
  52. def __str__(self):
  53. return f"{self.message}"
  54. def __repr__(self):
  55. return f"{self.message}"
  56. class DiscardByException:
  57. """
  58. This class discards pdf files by exception
  59. """
  60. def __init__(self) -> None:
  61. pass
  62. def discard_by_single_line_block(self, pdf_dic, exception: DenseSingleLineBlockException):
  63. """
  64. This function discards pdf files by single line block exception
  65. Parameters
  66. ----------
  67. pdf_dic : dict
  68. pdf dictionary
  69. exception : str
  70. exception message
  71. Returns
  72. -------
  73. error_message : str
  74. """
  75. exception_page_nums = 0
  76. page_num = 0
  77. for page_id, page in pdf_dic.items():
  78. if page_id.startswith("page_"):
  79. page_num += 1
  80. if "preproc_blocks" in page.keys():
  81. preproc_blocks = page["preproc_blocks"]
  82. all_single_line_blocks = []
  83. for block in preproc_blocks:
  84. if len(block["lines"]) == 1:
  85. all_single_line_blocks.append(block)
  86. if len(preproc_blocks) > 0 and len(all_single_line_blocks) / len(preproc_blocks) > 0.9:
  87. exception_page_nums += 1
  88. if page_num == 0:
  89. return None
  90. if exception_page_nums / page_num > 0.1: # Low ratio means basically, whenever this is the case, it is discarded
  91. return exception.message
  92. return None
  93. def discard_by_title_detection(self, pdf_dic, exception: TitleDetectionException):
  94. """
  95. This function discards pdf files by title detection exception
  96. Parameters
  97. ----------
  98. pdf_dic : dict
  99. pdf dictionary
  100. exception : str
  101. exception message
  102. Returns
  103. -------
  104. error_message : str
  105. """
  106. # return exception.message
  107. return None
  108. def discard_by_title_level(self, pdf_dic, exception: TitleLevelException):
  109. """
  110. This function discards pdf files by title level exception
  111. Parameters
  112. ----------
  113. pdf_dic : dict
  114. pdf dictionary
  115. exception : str
  116. exception message
  117. Returns
  118. -------
  119. error_message : str
  120. """
  121. # return exception.message
  122. return None
  123. def discard_by_split_para(self, pdf_dic, exception: ParaSplitException):
  124. """
  125. This function discards pdf files by split para exception
  126. Parameters
  127. ----------
  128. pdf_dic : dict
  129. pdf dictionary
  130. exception : str
  131. exception message
  132. Returns
  133. -------
  134. error_message : str
  135. """
  136. # return exception.message
  137. return None
  138. def discard_by_merge_para(self, pdf_dic, exception: ParaMergeException):
  139. """
  140. This function discards pdf files by merge para exception
  141. Parameters
  142. ----------
  143. pdf_dic : dict
  144. pdf dictionary
  145. exception : str
  146. exception message
  147. Returns
  148. -------
  149. error_message : str
  150. """
  151. # return exception.message
  152. return None