read_api.py 5.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142
  1. import json
  2. import os
  3. import tempfile
  4. import shutil
  5. from pathlib import Path
  6. from magic_pdf.config.exceptions import EmptyData, InvalidParams
  7. from magic_pdf.data.data_reader_writer import (FileBasedDataReader,
  8. MultiBucketS3DataReader)
  9. from magic_pdf.data.dataset import ImageDataset, PymuDocDataset
  10. from magic_pdf.utils.office_to_pdf import convert_file_to_pdf, ConvertToPdfError
  11. def read_jsonl(
  12. s3_path_or_local: str, s3_client: MultiBucketS3DataReader | None = None
  13. ) -> list[PymuDocDataset]:
  14. """Read the jsonl file and return the list of PymuDocDataset.
  15. Args:
  16. s3_path_or_local (str): local file or s3 path
  17. s3_client (MultiBucketS3DataReader | None, optional): s3 client that support multiple bucket. Defaults to None.
  18. Raises:
  19. InvalidParams: if s3_path_or_local is s3 path but s3_client is not provided.
  20. EmptyData: if no pdf file location is provided in some line of jsonl file.
  21. InvalidParams: if the file location is s3 path but s3_client is not provided
  22. Returns:
  23. list[PymuDocDataset]: each line in the jsonl file will be converted to a PymuDocDataset
  24. """
  25. bits_arr = []
  26. if s3_path_or_local.startswith('s3://'):
  27. if s3_client is None:
  28. raise InvalidParams('s3_client is required when s3_path is provided')
  29. jsonl_bits = s3_client.read(s3_path_or_local)
  30. else:
  31. jsonl_bits = FileBasedDataReader('').read(s3_path_or_local)
  32. jsonl_d = [
  33. json.loads(line) for line in jsonl_bits.decode().split('\n') if line.strip()
  34. ]
  35. for d in jsonl_d:
  36. pdf_path = d.get('file_location', '') or d.get('path', '')
  37. if len(pdf_path) == 0:
  38. raise EmptyData('pdf file location is empty')
  39. if pdf_path.startswith('s3://'):
  40. if s3_client is None:
  41. raise InvalidParams('s3_client is required when s3_path is provided')
  42. bits_arr.append(s3_client.read(pdf_path))
  43. else:
  44. bits_arr.append(FileBasedDataReader('').read(pdf_path))
  45. return [PymuDocDataset(bits) for bits in bits_arr]
  46. def read_local_pdfs(path: str) -> list[PymuDocDataset]:
  47. """Read pdf from path or directory.
  48. Args:
  49. path (str): pdf file path or directory that contains pdf files
  50. Returns:
  51. list[PymuDocDataset]: each pdf file will converted to a PymuDocDataset
  52. """
  53. if os.path.isdir(path):
  54. reader = FileBasedDataReader()
  55. ret = []
  56. for root, _, files in os.walk(path):
  57. for file in files:
  58. suffix = file.split('.')
  59. if suffix[-1] == 'pdf':
  60. ret.append( PymuDocDataset(reader.read(os.path.join(root, file))))
  61. return ret
  62. else:
  63. reader = FileBasedDataReader()
  64. bits = reader.read(path)
  65. return [PymuDocDataset(bits)]
  66. def read_local_office(path: str) -> list[PymuDocDataset]:
  67. """Read ms-office file (ppt, pptx, doc, docx) from path or directory.
  68. Args:
  69. path (str): ms-office file or directory that contains ms-office files
  70. Returns:
  71. list[PymuDocDataset]: each ms-office file will converted to a PymuDocDataset
  72. Raises:
  73. ConvertToPdfError: Failed to convert ms-office file to pdf via libreoffice
  74. FileNotFoundError: File not Found
  75. Exception: Unknown Exception raised
  76. """
  77. suffixes = ['.ppt', '.pptx', '.doc', '.docx']
  78. fns = []
  79. ret = []
  80. if os.path.isdir(path):
  81. for root, _, files in os.walk(path):
  82. for file in files:
  83. suffix = Path(file).suffix
  84. if suffix in suffixes:
  85. fns.append((os.path.join(root, file)))
  86. else:
  87. fns.append(path)
  88. reader = FileBasedDataReader()
  89. temp_dir = tempfile.mkdtemp()
  90. for fn in fns:
  91. try:
  92. convert_file_to_pdf(fn, temp_dir)
  93. except ConvertToPdfError as e:
  94. raise e
  95. except FileNotFoundError as e:
  96. raise e
  97. except Exception as e:
  98. raise e
  99. fn_path = Path(fn)
  100. pdf_fn = f"{temp_dir}/{fn_path.stem}.pdf"
  101. ret.append(PymuDocDataset(reader.read(pdf_fn)))
  102. shutil.rmtree(temp_dir)
  103. return ret
  104. def read_local_images(path: str, suffixes: list[str]=['.png', '.jpg']) -> list[ImageDataset]:
  105. """Read images from path or directory.
  106. Args:
  107. path (str): image file path or directory that contains image files
  108. suffixes (list[str]): the suffixes of the image files used to filter the files. Example: ['.jpg', '.png']
  109. Returns:
  110. list[ImageDataset]: each image file will converted to a ImageDataset
  111. """
  112. if os.path.isdir(path):
  113. imgs_bits = []
  114. s_suffixes = set(suffixes)
  115. reader = FileBasedDataReader()
  116. for root, _, files in os.walk(path):
  117. for file in files:
  118. suffix = Path(file).suffix
  119. if suffix in s_suffixes:
  120. imgs_bits.append(reader.read(os.path.join(root, file)))
  121. return [ImageDataset(bits) for bits in imgs_bits]
  122. else:
  123. reader = FileBasedDataReader()
  124. bits = reader.read(path)
  125. return [ImageDataset(bits)]