coco_split.py 2.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869
  1. # copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. import os.path as osp
  15. import random
  16. import json
  17. from .utils import MyEncoder
  18. def split_coco_dataset(dataset_dir, val_percent, test_percent, save_dir):
  19. if not osp.exists(osp.join(dataset_dir, "annotations.json")):
  20. raise ValueError("\'annotations.json\' is not found in {}!".format(
  21. dataset_dir))
  22. try:
  23. from pycocotools.coco import COCO
  24. except:
  25. print(
  26. "pycococotools is not installed, follow this doc install pycocotools: https://paddlex.readthedocs.io/zh_CN/develop/install.html#pycocotools"
  27. )
  28. return
  29. annotation_file = osp.join(dataset_dir, "annotations.json")
  30. coco = COCO(annotation_file)
  31. img_ids = coco.getImgIds()
  32. cat_ids = coco.getCatIds()
  33. anno_ids = coco.getAnnIds()
  34. val_num = int(len(img_ids) * val_percent)
  35. test_num = int(len(img_ids) * test_percent)
  36. train_num = len(img_ids) - val_num - test_num
  37. random.shuffle(img_ids)
  38. train_files_ids = img_ids[:train_num]
  39. val_files_ids = img_ids[train_num:train_num + val_num]
  40. test_files_ids = img_ids[train_num + val_num:]
  41. for img_id_list in [train_files_ids, val_files_ids, test_files_ids]:
  42. img_anno_ids = coco.getAnnIds(imgIds=img_id_list, iscrowd=0)
  43. imgs = coco.loadImgs(img_id_list)
  44. instances = coco.loadAnns(img_anno_ids)
  45. categories = coco.loadCats(cat_ids)
  46. img_dict = {
  47. "annotations": instances,
  48. "images": imgs,
  49. "categories": categories
  50. }
  51. if img_id_list == train_files_ids:
  52. json_file = open(osp.join(save_dir, 'train.json'), 'w+')
  53. json.dump(img_dict, json_file, cls=MyEncoder)
  54. elif img_id_list == val_files_ids:
  55. json_file = open(osp.join(save_dir, 'val.json'), 'w+')
  56. json.dump(img_dict, json_file, cls=MyEncoder)
  57. elif img_id_list == test_files_ids and len(test_files_ids):
  58. json_file = open(osp.join(save_dir, 'test.json'), 'w+')
  59. json.dump(img_dict, json_file, cls=MyEncoder)
  60. return train_num, val_num, test_num