# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
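
"""Dataset checker for text detection data: verifies the dataset directory,
the `train.txt`/`val.txt` file lists, the referenced images, and the JSON
detection labels, and collects up to `sample_num` sample image paths."""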

import json
import os
import os.path as osp
from collections import defaultdict

import numpy as np
from PIL import Image

from .....utils.errors import DatasetFileNotFoundError


def check(dataset_dir, output, sample_num=10):
    """Check the dataset and return a dict of its basic attributes."""
    dataset_dir = osp.abspath(dataset_dir)
    if not osp.exists(dataset_dir) or not osp.isdir(dataset_dir):
        raise DatasetFileNotFoundError(file_path=dataset_dir)

    sample_cnts = dict()
    sample_paths = defaultdict(list)
    delim = '\t'
    valid_num_parts = 2
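
    # Each non-empty line in {train,val}.txt is expected to be a tab-separated
    # image path and JSON label (as enforced below), e.g.:
    #   images/img_001.jpg\t[{"points": [[x1, y1], [x2, y2], ...], "transcription": "text"}, ...]
    # The file name `images/img_001.jpg` is only an illustrative example.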
    tags = ['train', 'val']
    for tag in tags:
        file_list = osp.join(dataset_dir, f'{tag}.txt')
        if not osp.exists(file_list):
            if tag in ('train', 'val'):
                # the train and val file lists must exist
                raise DatasetFileNotFoundError(
                    file_path=file_list,
                    solution=f"Ensure that both `train.txt` and `val.txt` exist in {dataset_dir}")
            else:
                continue
        else:
            with open(file_list, 'r', encoding='utf-8') as f:
                all_lines = f.readlines()
                sample_cnts[tag] = len(all_lines)
                for idx, line in enumerate(all_lines):
                    line = line.strip("\n")
                    if len(line) < 1:
                        continue
                    substr = line.split(delim)
                    assert len(substr) == valid_num_parts, (
                        f"Error in {line}: the number of delimiter-separated items "
                        f"in each row in {file_list} should be {valid_num_parts} "
                        f"(current delimiter is '{delim}').")
                    file_name = substr[0]
                    label = substr[1]
                    img_path = osp.join(dataset_dir, file_name)
                    if len(sample_paths[tag]) < sample_num:
                        sample_paths[tag].append(
                            os.path.relpath(img_path, output))
                    if not osp.exists(img_path):
                        raise DatasetFileNotFoundError(file_path=img_path)
                    # check the detection label
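                    # each item describes one text instance: `points` is a list
                    # of [x, y] polygon vertices and `transcription` is the text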
                    label = json.loads(label)
                    for item in label:
                        assert "points" in item and "transcription" in item, \
                            f"line {idx} is not in the correct format."
                        box = np.array(item['points'])
                        assert box.shape[1] == 2, \
                            f"{box} in line {idx} is not in the correct format."
                        txt = item['transcription']
                        assert isinstance(txt, str), \
                            f"{txt} in line {idx} is not in the correct format."

    attrs = {}
    attrs['train_samples'] = sample_cnts['train']
    attrs['train_sample_paths'] = sample_paths['train'][:sample_num]
    attrs['val_samples'] = sample_cnts['val']
    attrs['val_sample_paths'] = sample_paths['val'][:sample_num]
    return attrs
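

# Example usage (the paths below are hypothetical and for illustration only):
#
#   attrs = check("path/to/ocr_det_dataset", output="path/to/output")
#   print(attrs["train_samples"], attrs["val_samples"])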