read_api.py 4.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130
  1. import json
  2. import os
  3. import tempfile
  4. import shutil
  5. from pathlib import Path
  6. from magic_pdf.config.exceptions import EmptyData, InvalidParams
  7. from magic_pdf.data.data_reader_writer import (FileBasedDataReader,
  8. MultiBucketS3DataReader)
  9. from magic_pdf.data.dataset import ImageDataset, PymuDocDataset
  10. from magic_pdf.utils.office_to_pdf import convert_file_to_pdf, ConvertToPdfError
  11. def read_jsonl(
  12. s3_path_or_local: str, s3_client: MultiBucketS3DataReader | None = None
  13. ) -> list[PymuDocDataset]:
  14. """Read the jsonl file and return the list of PymuDocDataset.
  15. Args:
  16. s3_path_or_local (str): local file or s3 path
  17. s3_client (MultiBucketS3DataReader | None, optional): s3 client that support multiple bucket. Defaults to None.
  18. Raises:
  19. InvalidParams: if s3_path_or_local is s3 path but s3_client is not provided.
  20. EmptyData: if no pdf file location is provided in some line of jsonl file.
  21. InvalidParams: if the file location is s3 path but s3_client is not provided
  22. Returns:
  23. list[PymuDocDataset]: each line in the jsonl file will be converted to a PymuDocDataset
  24. """
  25. bits_arr = []
  26. if s3_path_or_local.startswith('s3://'):
  27. if s3_client is None:
  28. raise InvalidParams('s3_client is required when s3_path is provided')
  29. jsonl_bits = s3_client.read(s3_path_or_local)
  30. else:
  31. jsonl_bits = FileBasedDataReader('').read(s3_path_or_local)
  32. jsonl_d = [
  33. json.loads(line) for line in jsonl_bits.decode().split('\n') if line.strip()
  34. ]
  35. for d in jsonl_d:
  36. pdf_path = d.get('file_location', '') or d.get('path', '')
  37. if len(pdf_path) == 0:
  38. raise EmptyData('pdf file location is empty')
  39. if pdf_path.startswith('s3://'):
  40. if s3_client is None:
  41. raise InvalidParams('s3_client is required when s3_path is provided')
  42. bits_arr.append(s3_client.read(pdf_path))
  43. else:
  44. bits_arr.append(FileBasedDataReader('').read(pdf_path))
  45. return [PymuDocDataset(bits) for bits in bits_arr]
  46. def read_local_pdfs(path: str) -> list[PymuDocDataset]:
  47. """Read pdf from path or directory.
  48. Args:
  49. path (str): pdf file path or directory that contains pdf files
  50. Returns:
  51. list[PymuDocDataset]: each pdf file will converted to a PymuDocDataset
  52. """
  53. if os.path.isdir(path):
  54. reader = FileBasedDataReader()
  55. ret = []
  56. for root, _, files in os.walk(path):
  57. for file in files:
  58. suffix = file.split('.')
  59. if suffix[-1] == 'pdf':
  60. ret.append( PymuDocDataset(reader.read(os.path.join(root, file))))
  61. return ret
  62. else:
  63. reader = FileBasedDataReader()
  64. bits = reader.read(path)
  65. return [PymuDocDataset(bits)]
  66. def read_local_office(path: str) -> list[PymuDocDataset]:
  67. """Read ms-office file (ppt, pptx, doc, docx) from path or directory.
  68. Args:
  69. path (str): ms-office file or directory that contains ms-office files
  70. Returns:
  71. list[PymuDocDataset]: each ms-office file will converted to a PymuDocDataset
  72. """
  73. suffixes = ['ppt', 'pptx', 'doc', 'docx']
  74. fns = []
  75. ret = []
  76. if os.path.isdir(path):
  77. for root, _, files in os.walk(path):
  78. for file in files:
  79. suffix = file.split('.')
  80. if suffix[-1] in suffixes:
  81. fns.append((os.path.join(root, file)))
  82. else:
  83. fns.append(path)
  84. reader = FileBasedDataReader()
  85. temp_dir = tempfile.mkdtemp()
  86. for fn in fns:
  87. convert_file_to_pdf(fn, temp_dir)
  88. fn_path = Path(fn)
  89. pdf_fn = f"{temp_dir}/{fn_path.stem}.pdf"
  90. ret.append(PymuDocDataset(reader.read(pdf_fn)))
  91. shutil.rmtree(temp_dir)
  92. return ret
  93. def read_local_images(path: str, suffixes: list[str]=[]) -> list[ImageDataset]:
  94. """Read images from path or directory.
  95. Args:
  96. path (str): image file path or directory that contains image files
  97. suffixes (list[str]): the suffixes of the image files used to filter the files. Example: ['jpg', 'png']
  98. Returns:
  99. list[ImageDataset]: each image file will converted to a ImageDataset
  100. """
  101. if os.path.isdir(path):
  102. imgs_bits = []
  103. s_suffixes = set(suffixes)
  104. reader = FileBasedDataReader()
  105. for root, _, files in os.walk(path):
  106. for file in files:
  107. suffix = file.split('.')
  108. if suffix[-1] in s_suffixes:
  109. imgs_bits.append(reader.read(os.path.join(root, file)))
  110. return [ImageDataset(bits) for bits in imgs_bits]
  111. else:
  112. reader = FileBasedDataReader()
  113. bits = reader.read(path)
  114. return [ImageDataset(bits)]