coco_split.py 2.7 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970
  1. # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. import os.path as osp
  15. import random
  16. import json
  17. from .utils import MyEncoder
  18. import paddlex.utils.logging as logging
  19. def split_coco_dataset(dataset_dir, val_percent, test_percent, save_dir):
  20. # matplotlib.use() must be called *before* pylab, matplotlib.pyplot,
  21. # or matplotlib.backends is imported for the first time
  22. # pycocotools import matplotlib
  23. import matplotlib
  24. matplotlib.use('Agg')
  25. from pycocotools.coco import COCO
  26. if not osp.exists(osp.join(dataset_dir, "annotations.json")):
  27. logging.error("\'annotations.json\' is not found in {}!".format(
  28. dataset_dir))
  29. annotation_file = osp.join(dataset_dir, "annotations.json")
  30. coco = COCO(annotation_file)
  31. img_ids = coco.getImgIds()
  32. cat_ids = coco.getCatIds()
  33. anno_ids = coco.getAnnIds()
  34. val_num = int(len(img_ids) * val_percent)
  35. test_num = int(len(img_ids) * test_percent)
  36. train_num = len(img_ids) - val_num - test_num
  37. random.shuffle(img_ids)
  38. train_files_ids = img_ids[:train_num]
  39. val_files_ids = img_ids[train_num:train_num + val_num]
  40. test_files_ids = img_ids[train_num + val_num:]
  41. for img_id_list in [train_files_ids, val_files_ids, test_files_ids]:
  42. img_anno_ids = coco.getAnnIds(imgIds=img_id_list, iscrowd=0)
  43. imgs = coco.loadImgs(img_id_list)
  44. instances = coco.loadAnns(img_anno_ids)
  45. categories = coco.loadCats(cat_ids)
  46. img_dict = {
  47. "annotations": instances,
  48. "images": imgs,
  49. "categories": categories
  50. }
  51. if img_id_list == train_files_ids:
  52. json_file = open(osp.join(save_dir, 'train.json'), 'w+')
  53. json.dump(img_dict, json_file, cls=MyEncoder)
  54. elif img_id_list == val_files_ids:
  55. json_file = open(osp.join(save_dir, 'val.json'), 'w+')
  56. json.dump(img_dict, json_file, cls=MyEncoder)
  57. elif img_id_list == test_files_ids and len(test_files_ids):
  58. json_file = open(osp.join(save_dir, 'test.json'), 'w+')
  59. json.dump(img_dict, json_file, cls=MyEncoder)
  60. return train_num, val_num, test_num