| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198 |
- class DenseSingleLineBlockException(Exception):
- """
- This class defines the exception type for dense single line-block.
- """
- def __init__(self, message="DenseSingleLineBlockException"):
- self.message = message
- super().__init__(self.message)
- def __str__(self):
- return f"{self.message}"
- def __repr__(self):
- return f"{self.message}"
- class TitleDetectionException(Exception):
- """
- This class defines the exception type for title detection.
- """
- def __init__(self, message="TitleDetectionException"):
- self.message = message
- super().__init__(self.message)
- def __str__(self):
- return f"{self.message}"
- def __repr__(self):
- return f"{self.message}"
- class TitleLevelException(Exception):
- """
- This class defines the exception type for title level.
- """
- def __init__(self, message="TitleLevelException"):
- self.message = message
- super().__init__(self.message)
- def __str__(self):
- return f"{self.message}"
- def __repr__(self):
- return f"{self.message}"
- class ParaSplitException(Exception):
- """
- This class defines the exception type for paragraph splitting.
- """
- def __init__(self, message="ParaSplitException"):
- self.message = message
- super().__init__(self.message)
- def __str__(self):
- return f"{self.message}"
- def __repr__(self):
- return f"{self.message}"
- class ParaMergeException(Exception):
- """
- This class defines the exception type for paragraph merging.
- """
- def __init__(self, message="ParaMergeException"):
- self.message = message
- super().__init__(self.message)
- def __str__(self):
- return f"{self.message}"
- def __repr__(self):
- return f"{self.message}"
- class DiscardByException:
- """
- This class discards pdf files by exception
- """
- def __init__(self) -> None:
- pass
- def discard_by_single_line_block(self, pdf_dic, exception: DenseSingleLineBlockException):
- """
- This function discards pdf files by single line block exception
- Parameters
- ----------
- pdf_dic : dict
- pdf dictionary
- exception : str
- exception message
- Returns
- -------
- error_message : str
- """
- exception_page_nums = 0
- page_num = 0
- for page_id, page in pdf_dic.items():
- if page_id.startswith("page_"):
- page_num += 1
- if "preproc_blocks" in page.keys():
- preproc_blocks = page["preproc_blocks"]
- all_single_line_blocks = []
- for block in preproc_blocks:
- if len(block["lines"]) == 1:
- all_single_line_blocks.append(block)
- if len(preproc_blocks) > 0 and len(all_single_line_blocks) / len(preproc_blocks) > 0.9:
- exception_page_nums += 1
- if page_num == 0:
- return None
- if exception_page_nums / page_num > 0.1: # Low ratio means basically, whenever this is the case, it is discarded
- return exception.message
- return None
- def discard_by_title_detection(self, pdf_dic, exception: TitleDetectionException):
- """
- This function discards pdf files by title detection exception
- Parameters
- ----------
- pdf_dic : dict
- pdf dictionary
- exception : str
- exception message
- Returns
- -------
- error_message : str
- """
- # return exception.message
- return None
- def discard_by_title_level(self, pdf_dic, exception: TitleLevelException):
- """
- This function discards pdf files by title level exception
- Parameters
- ----------
- pdf_dic : dict
- pdf dictionary
- exception : str
- exception message
- Returns
- -------
- error_message : str
- """
- # return exception.message
- return None
- def discard_by_split_para(self, pdf_dic, exception: ParaSplitException):
- """
- This function discards pdf files by split para exception
- Parameters
- ----------
- pdf_dic : dict
- pdf dictionary
- exception : str
- exception message
- Returns
- -------
- error_message : str
- """
- # return exception.message
- return None
- def discard_by_merge_para(self, pdf_dic, exception: ParaMergeException):
- """
- This function discards pdf files by merge para exception
- Parameters
- ----------
- pdf_dic : dict
- pdf dictionary
- exception : str
- exception message
- Returns
- -------
- error_message : str
- """
- # return exception.message
- return None
|