types.py 3.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122
  1. import copy
  2. import json
  3. import os
  4. from typing import Callable
  5. from magic_pdf.config.enums import SupportedPdfParseMethod
  6. from magic_pdf.data.data_reader_writer import DataWriter
  7. from magic_pdf.data.dataset import Dataset
  8. from magic_pdf.filter import classify
  9. from magic_pdf.libs.draw_bbox import draw_model_bbox
  10. from magic_pdf.pdf_parse_union_core_v2 import pdf_parse_union
  11. from magic_pdf.pipe.types import PipeResult
  12. class InferenceResult:
  13. def __init__(self, inference_results: list, dataset: Dataset):
  14. self._infer_res = inference_results
  15. self._dataset = dataset
  16. def draw_model(self, file_path: str) -> None:
  17. dir_name = os.path.dirname(file_path)
  18. base_name = os.path.basename(file_path)
  19. if not os.path.exists(dir_name):
  20. os.makedirs(dir_name, exist_ok=True)
  21. draw_model_bbox(
  22. copy.deepcopy(self._infer_res), self._dataset, dir_name, base_name
  23. )
  24. def dump_model(self, writer: DataWriter, file_path: str):
  25. writer.write_string(
  26. file_path, json.dumps(self._infer_res, ensure_ascii=False, indent=4)
  27. )
  28. def get_infer_res(self):
  29. return self._infer_res
  30. def apply(self, proc: Callable, *args, **kwargs):
  31. return proc(copy.deepcopy(self._infer_res), *args, **kwargs)
  32. def pipe_auto_mode(
  33. self,
  34. imageWriter: DataWriter,
  35. start_page_id=0,
  36. end_page_id=None,
  37. debug_mode=False,
  38. lang=None,
  39. ) -> PipeResult:
  40. def proc(*args, **kwargs) -> PipeResult:
  41. res = pdf_parse_union(*args, **kwargs)
  42. return PipeResult(res, self._dataset)
  43. pdf_proc_method = classify(self._dataset.data_bits())
  44. if pdf_proc_method == SupportedPdfParseMethod.TXT:
  45. return self.apply(
  46. proc,
  47. self._dataset,
  48. imageWriter,
  49. SupportedPdfParseMethod.TXT,
  50. start_page_id=0,
  51. end_page_id=None,
  52. debug_mode=False,
  53. lang=None,
  54. )
  55. else:
  56. return self.apply(
  57. proc,
  58. self._dataset,
  59. imageWriter,
  60. SupportedPdfParseMethod.OCR,
  61. start_page_id=0,
  62. end_page_id=None,
  63. debug_mode=False,
  64. lang=None,
  65. )
  66. def pipe_txt_mode(
  67. self,
  68. imageWriter: DataWriter,
  69. start_page_id=0,
  70. end_page_id=None,
  71. debug_mode=False,
  72. lang=None,
  73. ) -> PipeResult:
  74. def proc(*args, **kwargs) -> PipeResult:
  75. res = pdf_parse_union(*args, **kwargs)
  76. return PipeResult(res, self._dataset)
  77. return self.apply(
  78. proc,
  79. self._dataset,
  80. imageWriter,
  81. SupportedPdfParseMethod.TXT,
  82. start_page_id=0,
  83. end_page_id=None,
  84. debug_mode=False,
  85. lang=None,
  86. )
  87. def pipe_ocr_mode(
  88. self,
  89. imageWriter: DataWriter,
  90. start_page_id=0,
  91. end_page_id=None,
  92. debug_mode=False,
  93. lang=None,
  94. ) -> PipeResult:
  95. def proc(*args, **kwargs) -> PipeResult:
  96. res = pdf_parse_union(*args, **kwargs)
  97. return PipeResult(res, self._dataset)
  98. return self.apply(
  99. proc,
  100. self._dataset,
  101. imageWriter,
  102. SupportedPdfParseMethod.TXT,
  103. start_page_id=0,
  104. end_page_id=None,
  105. debug_mode=False,
  106. lang=None,
  107. )