test_pdf2text_recogPara_TitleProcessor.py 2.0 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859
  1. import json
  2. import unittest
  3. from utils_for_test_para import UtilsForTestPara
  4. from magic_pdf.post_proc.detect_para import TitleProcessor
  5. # from ... pdf2text_recogPara import * # another way to import
  6. """
  7. Run this test from the code-clean directory with either of the following commands:
  8. python -m tests.test_para.test_pdf2text_recogPara_TitleProcessor
  9. or
  10. pytest -v -s app/pdf_toolbox/tests/test_para/test_pdf2text_recogPara_TitleProcessor.py
  11. """
  12. class TestTitleProcessor(unittest.TestCase):
  13. def setUp(self):
  14. self.title_processor = TitleProcessor()
  15. self.utils = UtilsForTestPara()
  16. self.preproc_out_jsons = self.utils.read_preproc_out_jfiles()
  17. def test_batch_process_blocks_detect_titles(self):
  18. """
  19. Test the function detect_titles with preprocessed output JSON
  20. """
  21. for preproc_out_json in self.preproc_out_jsons:
  22. with open(preproc_out_json, "r", encoding="utf-8") as f:
  23. preproc_dict = json.load(f)
  24. preproc_dict["statistics"] = {}
  25. result = self.title_processor.batch_detect_titles(preproc_dict)
  26. for page_id, blocks in preproc_dict.items():
  27. if page_id.startswith("page_"):
  28. pass
  29. else:
  30. continue
  31. def test_batch_process_blocks_recog_title_level(self):
  32. """
  33. Test the function batch_process_blocks_recog_title_level with preprocessed output JSON
  34. """
  35. for preproc_out_json in self.preproc_out_jsons:
  36. with open(preproc_out_json, "r", encoding="utf-8") as f:
  37. preproc_dict = json.load(f)
  38. preproc_dict["statistics"] = {}
  39. result = self.title_processor.batch_recog_title_level(preproc_dict)
  40. for page_id, blocks in preproc_dict.items():
  41. if page_id.startswith("page_"):
  42. pass
  43. else:
  44. continue
  45. if __name__ == "__main__":
  46. unittest.main()