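# spark_api.py
"""
User inputs:
    - an array of models, where each element represents one page
    - the S3 path of the PDF
    - the S3 location where the screenshots are saved

Then:
    1) Based on the S3 path, call the Spark cluster API to obtain the ak, sk
       and endpoint, and construct the s3PDFReader.
    2) Based on the S3 address supplied by the user, call the Spark cluster
       API to obtain the ak, sk and endpoint, and construct the s3ImageWriter.

Everything else (constructing the s3cli, obtaining the ak/sk) is implemented
in code-clean. Do not introduce a reverse dependency!!!
"""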
from magic_pdf.io import AbsReaderWriter
  12. def parse_txt_pdf(pdf_bytes:bytes, pdf_models:list, imageWriter: AbsReaderWriter, is_debug=False, start_page=0, *args, **kwargs):
  13. """
  14. 解析文本类pdf
  15. """
  16. pass
  17. def parse_ocr_pdf(pdf_bytes:bytes, pdf_models:list, imageWriter: AbsReaderWriter, is_debug=False, start_page=0, *args, **kwargs):
  18. """
  19. 解析ocr类pdf
  20. """
  21. pass
  22. def parse_union_pdf(pdf_bytes:bytes, pdf_models:list, imageWriter: AbsReaderWriter, is_debug=False, start_page=0, *args, **kwargs):
  23. """
  24. ocr和文本混合的pdf,全部解析出来
  25. """
  26. pass