| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142 |
- import json
- import os
- import tempfile
- import shutil
- from pathlib import Path
- from magic_pdf.config.exceptions import EmptyData, InvalidParams
- from magic_pdf.data.data_reader_writer import (FileBasedDataReader,
- MultiBucketS3DataReader)
- from magic_pdf.data.dataset import ImageDataset, PymuDocDataset
- from magic_pdf.utils.office_to_pdf import convert_file_to_pdf, ConvertToPdfError
def read_jsonl(
    s3_path_or_local: str, s3_client: MultiBucketS3DataReader | None = None
) -> list[PymuDocDataset]:
    """Read a jsonl manifest and load every pdf it references.

    Each non-blank line of the jsonl file is parsed as a JSON object whose
    ``file_location`` value (or, when that is missing/empty, ``path`` value)
    names a pdf. Locations starting with ``s3://`` are read through
    ``s3_client``; anything else is read from the local file system.

    Args:
        s3_path_or_local (str): local file or s3 path of the jsonl file
        s3_client (MultiBucketS3DataReader | None, optional): s3 client that
            supports multiple buckets. Defaults to None.

    Raises:
        InvalidParams: if s3_path_or_local is an s3 path but s3_client is not provided.
        EmptyData: if some line of the jsonl file provides no pdf file
            location (key missing, empty string or JSON null).
        InvalidParams: if a pdf file location is an s3 path but s3_client is not provided.

    Returns:
        list[PymuDocDataset]: each non-blank line in the jsonl file is
            converted to a PymuDocDataset, in file order
    """

    def _read_bits(location: str) -> bytes:
        # Single place that chooses between the s3 client and the local reader,
        # so the manifest itself and every pdf follow the same rules.
        if location.startswith('s3://'):
            if s3_client is None:
                raise InvalidParams('s3_client is required when s3_path is provided')
            return s3_client.read(location)
        return FileBasedDataReader('').read(location)

    jsonl_text = _read_bits(s3_path_or_local).decode()
    # Split on '\n' only: str.splitlines() would also break on characters such
    # as U+2028, which may appear unescaped inside a JSON string.
    # All lines are parsed before any pdf is read, so a malformed manifest
    # fails fast.
    records = [json.loads(line) for line in jsonl_text.split('\n') if line.strip()]

    bits_arr = []
    for record in records:
        pdf_path = record.get('file_location') or record.get('path')
        # Truthiness also covers a JSON null, which previously crashed in len().
        if not pdf_path:
            raise EmptyData('pdf file location is empty')
        bits_arr.append(_read_bits(pdf_path))

    # Datasets are only built once every pdf has been read successfully.
    return [PymuDocDataset(bits) for bits in bits_arr]
def read_local_pdfs(path: str) -> list[PymuDocDataset]:
    """Read pdf from path or directory.

    When ``path`` is a directory it is walked recursively and every file whose
    extension is ``.pdf`` (compared case-insensitively) is read. When ``path``
    is not a directory it is read as a single pdf, whatever its name is.

    Args:
        path (str): pdf file path or directory that contains pdf files

    Returns:
        list[PymuDocDataset]: each pdf file will be converted to a
            PymuDocDataset; empty if a directory contains no pdf files
    """
    if os.path.isdir(path):
        # Use the real file extension: splitting the name on '.' would also
        # accept a file that is literally named "pdf" with no extension.
        pdf_paths = [
            os.path.join(root, name)
            for root, _, names in os.walk(path)
            for name in names
            if Path(name).suffix.lower() == '.pdf'
        ]
    else:
        pdf_paths = [path]

    if not pdf_paths:
        # Nothing matched, so there is nothing to read.
        return []

    reader = FileBasedDataReader()
    return [PymuDocDataset(reader.read(pdf_path)) for pdf_path in pdf_paths]
def read_local_office(path: str) -> list[PymuDocDataset]:
    """Read ms-office file (ppt, pptx, doc, docx) from path or directory.

    Every selected file is converted to pdf with ``convert_file_to_pdf`` inside
    a temporary directory, the resulting pdf is read back, and the temporary
    directory is removed again, including when a conversion fails.

    When ``path`` is a directory it is walked recursively and files are
    selected by extension (compared case-insensitively). When ``path`` is not
    a directory it is converted as a single file without an extension check.

    Args:
        path (str): ms-office file or directory that contains ms-office files

    Returns:
        list[PymuDocDataset]: each ms-office file will be converted to a
            PymuDocDataset; empty if a directory contains no ms-office files

    Raises:
        ConvertToPdfError: Failed to convert ms-office file to pdf via libreoffice
        FileNotFoundError: File not Found
        Exception: Unknown Exception raised
    """
    suffixes = {'.ppt', '.pptx', '.doc', '.docx'}
    if os.path.isdir(path):
        fns = [
            os.path.join(root, file)
            for root, _, files in os.walk(path)
            for file in files
            if Path(file).suffix.lower() in suffixes
        ]
    else:
        fns = [path]

    if not fns:
        # Nothing to convert: do not bother creating a temporary directory.
        return []

    reader = FileBasedDataReader()
    ret = []
    # The context manager removes the directory even when conversion or
    # reading raises, so failed runs no longer leave temp directories behind.
    with tempfile.TemporaryDirectory() as temp_dir:
        for fn in fns:
            # Errors propagate unchanged to the caller (see Raises).
            convert_file_to_pdf(fn, temp_dir)
            # The converted pdf is read immediately, so a later input with the
            # same stem may safely overwrite it.
            pdf_fn = os.path.join(temp_dir, f'{Path(fn).stem}.pdf')
            ret.append(PymuDocDataset(reader.read(pdf_fn)))
    return ret
def read_local_images(path: str, suffixes: list[str] | None = None) -> list[ImageDataset]:
    """Read images from path or directory.

    When ``path`` is a directory it is walked recursively and files are
    selected by extension, compared case-insensitively (so ``photo.JPG``
    matches ``'.jpg'``). When ``path`` is not a directory it is read as a
    single image and ``suffixes`` is not consulted.

    Args:
        path (str): image file path or directory that contains image files
        suffixes (list[str] | None): the suffixes of the image files used to
            filter the files. Example: ['.jpg', '.png']. Defaults to
            ['.png', '.jpg', '.jpeg'] when omitted or None.

    Returns:
        list[ImageDataset]: each image file will be converted to an
            ImageDataset; empty if a directory contains no matching files
    """
    # None sentinel instead of a mutable list default shared across calls.
    if suffixes is None:
        suffixes = ['.png', '.jpg', '.jpeg']

    if os.path.isdir(path):
        wanted = {suffix.lower() for suffix in suffixes}
        img_paths = [
            os.path.join(root, file)
            for root, _, files in os.walk(path)
            for file in files
            if Path(file).suffix.lower() in wanted
        ]
    else:
        img_paths = [path]

    if not img_paths:
        # Nothing matched, so there is nothing to read.
        return []

    reader = FileBasedDataReader()
    return [ImageDataset(reader.read(img_path)) for img_path in img_paths]
|