user_api.py 3.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118
  1. """用户输入: model数组,每个元素代表一个页面 pdf在s3的路径 截图保存的s3位置.
  2. 然后:
  3. 1)根据s3路径,调用spark集群的api,拿到ak,sk,endpoint,构造出s3PDFReader
  4. 2)根据用户输入的s3地址,调用spark集群的api,拿到ak,sk,endpoint,构造出s3ImageWriter
  5. 其余部分至于构造s3cli, 获取ak,sk都在code-clean里写代码完成。不要反向依赖!!!
  6. """
  7. from loguru import logger
  8. from magic_pdf.data.data_reader_writer import DataWriter
  9. from magic_pdf.libs.version import __version__
  10. from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
  11. from magic_pdf.pdf_parse_by_ocr import parse_pdf_by_ocr
  12. from magic_pdf.pdf_parse_by_txt import parse_pdf_by_txt
  # Markers written under the '_parse_type' key of a parse result so that
  # downstream code can tell which parser produced it.
  PARSE_TYPE_TXT = 'txt'  # result came from parse_pdf_by_txt
  PARSE_TYPE_OCR = 'ocr'  # result came from parse_pdf_by_ocr
  def parse_txt_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: DataWriter, is_debug=False,
                    start_page_id=0, end_page_id=None, lang=None,
                    *args, **kwargs):
      """Parse a text-based PDF with ``parse_pdf_by_txt`` and tag the result.

      Args:
          pdf_bytes: Raw bytes of the PDF document.
          pdf_models: Model output list forwarded unchanged to the parser.
          imageWriter: Writer forwarded unchanged to the parser.
          is_debug: Passed to the parser as ``debug_mode``.
          start_page_id: First page to parse, forwarded to the parser.
          end_page_id: Last page to parse, forwarded to the parser.
          lang: Optional language tag; when not ``None`` it is stored in the
              result under ``'_lang'``. It is not passed to the parser.
          *args, **kwargs: Accepted but not used by this function.

      Returns:
          The object returned by ``parse_pdf_by_txt`` with ``'_parse_type'``,
          ``'_version_name'`` and (optionally) ``'_lang'`` set on it.
      """
      parser_options = {
          'start_page_id': start_page_id,
          'end_page_id': end_page_id,
          'debug_mode': is_debug,
      }
      parsed = parse_pdf_by_txt(pdf_bytes, pdf_models, imageWriter, **parser_options)

      # Metadata stamped onto the result, in the order it is written.
      tags = [
          ('_parse_type', PARSE_TYPE_TXT),
          ('_version_name', __version__),
      ]
      if lang is not None:
          tags.append(('_lang', lang))
      for key, value in tags:
          parsed[key] = value

      return parsed
  def parse_ocr_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: DataWriter, is_debug=False,
                    start_page_id=0, end_page_id=None, lang=None,
                    *args, **kwargs):
      """Parse an OCR-type PDF with ``parse_pdf_by_ocr`` and tag the result.

      Args:
          pdf_bytes: Raw bytes of the PDF document.
          pdf_models: Model output list forwarded unchanged to the parser.
          imageWriter: Writer forwarded unchanged to the parser.
          is_debug: Passed to the parser as ``debug_mode``.
          start_page_id: First page to parse, forwarded to the parser.
          end_page_id: Last page to parse, forwarded to the parser.
          lang: Optional language tag; when not ``None`` it is stored in the
              result under ``'_lang'``. It is not passed to the parser.
          *args, **kwargs: Accepted but not used by this function.

      Returns:
          The object returned by ``parse_pdf_by_ocr`` with ``'_parse_type'``,
          ``'_version_name'`` and (optionally) ``'_lang'`` set on it.
      """
      parser_options = {
          'start_page_id': start_page_id,
          'end_page_id': end_page_id,
          'debug_mode': is_debug,
      }
      parsed = parse_pdf_by_ocr(pdf_bytes, pdf_models, imageWriter, **parser_options)

      # Metadata stamped onto the result, in the order it is written.
      tags = [
          ('_parse_type', PARSE_TYPE_OCR),
          ('_version_name', __version__),
      ]
      if lang is not None:
          tags.append(('_lang', lang))
      for key, value in tags:
          parsed[key] = value

      return parsed
  def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: DataWriter, is_debug=False,
                      input_model_is_empty: bool = False,
                      start_page_id=0, end_page_id=None, lang=None,
                      *args, **kwargs):
      """Parse a PDF that may mix text and OCR content, extracting everything.

      The text parser is tried first. If it raises, or its result has a truthy
      ``'_need_drop'`` entry, the OCR parser is used instead. When
      ``input_model_is_empty`` is true, the models are regenerated with
      ``doc_analyze(..., ocr=True)`` before the OCR attempt.

      Args:
          pdf_bytes: Raw bytes of the PDF document.
          pdf_models: Model output list forwarded to the parsers.
          imageWriter: Writer forwarded unchanged to the parsers.
          is_debug: Passed to the parsers as ``debug_mode``.
          input_model_is_empty: If true, ``pdf_models`` is replaced by a fresh
              ``doc_analyze`` result before falling back to OCR.
          start_page_id: First page to parse, forwarded to the parsers and to
              ``doc_analyze``.
          end_page_id: Last page to parse, forwarded likewise.
          lang: Optional language tag; forwarded to ``doc_analyze`` and, when
              not ``None``, stored in the result under ``'_lang'``.
          *args: Accepted but not used.
          **kwargs: Only ``layout_model``, ``formula_enable`` and
              ``table_enable`` are read (defaulting to ``None``) and forwarded
              to ``doc_analyze``.

      Returns:
          The parser result with ``'_parse_type'``, ``'_version_name'`` and
          (optionally) ``'_lang'`` set on it.

      Raises:
          Exception: If the OCR fallback also fails. Errors raised by
              ``doc_analyze`` itself are not caught here.
      """

      def _attempt(parser, models):
          # Parser failures are logged and reported as None so the caller can
          # decide whether to fall back.
          try:
              outcome = parser(
                  pdf_bytes,
                  models,
                  imageWriter,
                  start_page_id=start_page_id,
                  end_page_id=end_page_id,
                  debug_mode=is_debug,
              )
          except Exception as err:
              logger.exception(err)
              return None
          return outcome

      parsed = _attempt(parse_pdf_by_txt, pdf_models)
      txt_usable = parsed is not None and not parsed.get('_need_drop', False)

      if txt_usable:
          parsed['_parse_type'] = PARSE_TYPE_TXT
      else:
          logger.warning('parse_pdf_by_txt drop or error, switch to parse_pdf_by_ocr')

          ocr_models = pdf_models
          if input_model_is_empty:
              # No usable models were supplied: run the analysis in OCR mode.
              ocr_models = doc_analyze(
                  pdf_bytes,
                  ocr=True,
                  start_page_id=start_page_id,
                  end_page_id=end_page_id,
                  lang=lang,
                  layout_model=kwargs.get('layout_model'),
                  formula_enable=kwargs.get('formula_enable'),
                  table_enable=kwargs.get('table_enable'),
              )

          parsed = _attempt(parse_pdf_by_ocr, ocr_models)
          if parsed is None:
              raise Exception('Both parse_pdf_by_txt and parse_pdf_by_ocr failed.')
          parsed['_parse_type'] = PARSE_TYPE_OCR

      parsed['_version_name'] = __version__
      if lang is not None:
          parsed['_lang'] = lang
      return parsed