read_api.py 3.6 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798
  1. import json
  2. import os
  3. from pathlib import Path
  4. from magic_pdf.config.exceptions import EmptyData, InvalidParams
  5. from magic_pdf.data.data_reader_writer import (FileBasedDataReader,
  6. MultiBucketS3DataReader)
  7. from magic_pdf.data.dataset import ImageDataset, PymuDocDataset
  8. def read_jsonl(
  9. s3_path_or_local: str, s3_client: MultiBucketS3DataReader | None = None
  10. ) -> list[PymuDocDataset]:
  11. """Read the jsonl file and return the list of PymuDocDataset.
  12. Args:
  13. s3_path_or_local (str): local file or s3 path
  14. s3_client (MultiBucketS3DataReader | None, optional): s3 client that support multiple bucket. Defaults to None.
  15. Raises:
  16. InvalidParams: if s3_path_or_local is s3 path but s3_client is not provided.
  17. EmptyData: if no pdf file location is provided in some line of jsonl file.
  18. InvalidParams: if the file location is s3 path but s3_client is not provided
  19. Returns:
  20. list[PymuDocDataset]: each line in the jsonl file will be converted to a PymuDocDataset
  21. """
  22. bits_arr = []
  23. if s3_path_or_local.startswith('s3://'):
  24. if s3_client is None:
  25. raise InvalidParams('s3_client is required when s3_path is provided')
  26. jsonl_bits = s3_client.read(s3_path_or_local)
  27. else:
  28. jsonl_bits = FileBasedDataReader('').read(s3_path_or_local)
  29. jsonl_d = [
  30. json.loads(line) for line in jsonl_bits.decode().split('\n') if line.strip()
  31. ]
  32. for d in jsonl_d:
  33. pdf_path = d.get('file_location', '') or d.get('path', '')
  34. if len(pdf_path) == 0:
  35. raise EmptyData('pdf file location is empty')
  36. if pdf_path.startswith('s3://'):
  37. if s3_client is None:
  38. raise InvalidParams('s3_client is required when s3_path is provided')
  39. bits_arr.append(s3_client.read(pdf_path))
  40. else:
  41. bits_arr.append(FileBasedDataReader('').read(pdf_path))
  42. return [PymuDocDataset(bits) for bits in bits_arr]
  43. def read_local_pdfs(path: str) -> list[PymuDocDataset]:
  44. """Read pdf from path or directory.
  45. Args:
  46. path (str): pdf file path or directory that contains pdf files
  47. Returns:
  48. list[PymuDocDataset]: each pdf file will converted to a PymuDocDataset
  49. """
  50. if os.path.isdir(path):
  51. reader = FileBasedDataReader()
  52. ret = []
  53. for root, _, files in os.walk(path):
  54. for file in files:
  55. suffix = file.split('.')
  56. if suffix[-1] == 'pdf':
  57. ret.append( PymuDocDataset(reader.read(os.path.join(root, file))))
  58. return ret
  59. else:
  60. reader = FileBasedDataReader()
  61. bits = reader.read(path)
  62. return [PymuDocDataset(bits)]
  63. def read_local_images(path: str, suffixes: list[str]=[]) -> list[ImageDataset]:
  64. """Read images from path or directory.
  65. Args:
  66. path (str): image file path or directory that contains image files
  67. suffixes (list[str]): the suffixes of the image files used to filter the files. Example: ['jpg', 'png']
  68. Returns:
  69. list[ImageDataset]: each image file will converted to a ImageDataset
  70. """
  71. if os.path.isdir(path):
  72. imgs_bits = []
  73. s_suffixes = set(suffixes)
  74. reader = FileBasedDataReader()
  75. for root, _, files in os.walk(path):
  76. for file in files:
  77. suffix = file.split('.')
  78. if suffix[-1] in s_suffixes:
  79. imgs_bits.append(reader.read(os.path.join(root, file)))
  80. return [ImageDataset(bits) for bits in imgs_bits]
  81. else:
  82. reader = FileBasedDataReader()
  83. bits = reader.read(path)
  84. return [ImageDataset(bits)]