read_api.rst 3.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106
  1. read_api
  2. ==========
  3. Read the content from a file or directory to create a ``Dataset``. Currently we provide several functions that cover some scenarios.
  4. If you have new scenarios that are common to most users, you can post them on the official GitHub issues with detailed descriptions.
  5. It is also easy to implement your own read-related functions.
  6. Important Functions
  7. -------------------
  8. read_jsonl
  9. ^^^^^^^^^^^^^^^^
  10. Read the content from a jsonl file, which may be located on the local machine or on remote s3. If you want to know more about jsonl, please go to :doc:`../../additional_notes/glossary`
  11. .. code:: python
  12. from magic_pdf.data.read_api import *
  13. from magic_pdf.data.data_reader_writer import MultiBucketS3DataReader
  14. from magic_pdf.data.schemas import S3Config
  15. # read jsonl from local machine
  16. datasets = read_jsonl("tt.jsonl", None) # replace with real jsonl file
  17. # read jsonl from remote s3
  18. bucket = "bucket_1" # replace with real s3 bucket
  19. ak = "access_key_1" # replace with real s3 access key
  20. sk = "secret_key_1" # replace with real s3 secret key
  21. endpoint_url = "endpoint_url_1" # replace with real s3 endpoint url
  22. bucket_2 = "bucket_2" # replace with real s3 bucket
  23. ak_2 = "access_key_2" # replace with real s3 access key
  24. sk_2 = "secret_key_2" # replace with real s3 secret key
  25. endpoint_url_2 = "endpoint_url_2" # replace with real s3 endpoint url
  26. s3configs = [
  27. S3Config(
  28. bucket_name=bucket, access_key=ak, secret_key=sk, endpoint_url=endpoint_url
  29. ),
  30. S3Config(
  31. bucket_name=bucket_2,
  32. access_key=ak_2,
  33. secret_key=sk_2,
  34. endpoint_url=endpoint_url_2,
  35. ),
  36. ]
  37. s3_reader = MultiBucketS3DataReader(bucket, s3configs)
  38. datasets = read_jsonl(f"s3://bucket_1/tt.jsonl", s3_reader) # replace with real s3 jsonl file
  39. read_local_pdfs
  40. ^^^^^^^^^^^^^^^^^
  41. Read pdf from path or directory.
  42. .. code:: python
  43. from magic_pdf.data.read_api import *
  44. # read pdf path
  45. datasets = read_local_pdfs("tt.pdf")
  46. # read pdfs under directory
  47. datasets = read_local_pdfs("pdfs/")
  48. read_local_images
  49. ^^^^^^^^^^^^^^^^^^^
  50. Read images from path or directory
  51. .. code:: python
  52. from magic_pdf.data.read_api import *
  53. # read from image path
  54. datasets = read_local_images("tt.png") # replace with real file path
  55. # read files from directory that endswith suffix in suffixes array
  56. datasets = read_local_images("images/", suffixes=[".png", ".jpg"]) # replace with real directory
  57. read_local_office
  58. ^^^^^^^^^^^^^^^^^^^^
  59. Read MS-Office files from path or directory
  60. .. code:: python
  61. from magic_pdf.data.read_api import *
  62. # read from office file path
  63. datasets = read_local_office("tt.doc") # replace with real file path
  64. # read files from directory that endswith suffix in suffixes array
  65. datasets = read_local_office("docs/") # replace with real directory
  66. Check :doc:`../../api/read_api` for more details