# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json
import os
import os.path as osp
from collections import defaultdict, Counter
from pathlib import Path

from PIL import Image, ImageOps
from pycocotools.coco import COCO

from .utils.visualizer import draw_bbox, draw_mask
from .....utils.errors import DatasetFileNotFoundError
from .....utils.logging import info


def check(dataset_dir, output, sample_num=10):
    """Check a COCO-format instance segmentation dataset.

    Visualizes up to `sample_num` samples per split into `output` and returns
    a dict of dataset attributes (class count, sample counts, sample paths).
    """
    info(dataset_dir)
    dataset_dir = osp.abspath(dataset_dir)
    if not osp.exists(dataset_dir) or not osp.isdir(dataset_dir):
        raise DatasetFileNotFoundError(file_path=dataset_dir)
    sample_cnts = dict()
    sample_paths = defaultdict(list)
    im_sizes = defaultdict(Counter)
    tags = ['instance_train', 'instance_val']
    for tag in tags:
        file_list = osp.join(dataset_dir, f'annotations/{tag}.json')
        if not osp.exists(file_list):
            if tag in ('instance_train', 'instance_val'):
                # Train and val annotation files must exist.
                raise DatasetFileNotFoundError(
                    file_path=file_list,
                    solution="Ensure that both `instance_train.json` and "
                    f"`instance_val.json` exist in {dataset_dir}/annotations",
                )
            else:
                continue
        else:
            with open(file_list, 'r', encoding='utf-8') as f:
                jsondata = json.load(f)
            datanno = jsondata['annotations']
            sample_cnts[tag] = len(datanno)
            coco = COCO(file_list)
            num_class = len(coco.getCatIds())

            vis_save_dir = osp.join(output, 'demo_img')
            image_info = jsondata['images']
            # Visualize at most `sample_num` images; guard against splits that
            # contain fewer images than requested.
            for i in range(min(sample_num, len(image_info))):
                file_name = image_info[i]['file_name']
                img_id = image_info[i]['id']
                img_path = osp.join(dataset_dir, 'images', file_name)
                if not osp.exists(img_path):
                    raise DatasetFileNotFoundError(file_path=img_path)
                img = Image.open(img_path)
                # Apply EXIF orientation so boxes and masks align with pixels.
                img = ImageOps.exif_transpose(img)
                vis_im = draw_bbox(img, coco, img_id)
                vis_im = draw_mask(vis_im, coco, img_id)
                vis_path = osp.join(vis_save_dir, file_name)
                Path(vis_path).parent.mkdir(parents=True, exist_ok=True)
                vis_im.save(vis_path)
                sample_path = osp.join('check_dataset',
                                       osp.relpath(vis_path, output))
                sample_paths[tag].append(sample_path)

    attrs = {}
    attrs['num_classes'] = num_class
    attrs['train_samples'] = sample_cnts['instance_train']
    attrs['train_sample_paths'] = sample_paths['instance_train']
    attrs['val_samples'] = sample_cnts['instance_val']
    attrs['val_sample_paths'] = sample_paths['instance_val']
    return attrs
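

# Usage sketch (illustrative, not part of the original module): `check` assumes a
# COCO-style dataset layout such as
#   <dataset_dir>/annotations/instance_train.json
#   <dataset_dir>/annotations/instance_val.json
#   <dataset_dir>/images/<file_name>
# and is typically driven by the dataset checker, roughly as
#   attrs = check('/path/to/coco_dataset', '/path/to/output_dir', sample_num=10)
# where `attrs` then carries `num_classes`, `train_samples`, `train_sample_paths`,
# `val_samples`, and `val_sample_paths`; the paths above are placeholders only.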