analyse_dataset.py 3.0 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182
  1. # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. import os
  15. import platform
  16. from collections import defaultdict
  17. import numpy as np
  18. from .....utils.deps import function_requires_deps, is_dep_available
  19. from .....utils.fonts import PINGFANG_FONT_FILE_PATH
  20. if is_dep_available("matplotlib"):
  21. import matplotlib.pyplot as plt
  22. from matplotlib import font_manager
  23. if is_dep_available("pycocotools"):
  24. from pycocotools.coco import COCO
  25. @function_requires_deps("pycocotools", "matplotlib")
  26. def deep_analyse(dataset_dir, output):
  27. """class analysis for dataset"""
  28. tags = ["train", "val"]
  29. all_instances = 0
  30. for tag in tags:
  31. annotations_path = os.path.abspath(
  32. os.path.join(dataset_dir, f"annotations/instance_{tag}.json")
  33. )
  34. labels_cnt = defaultdict(list)
  35. coco = COCO(annotations_path)
  36. cat_ids = coco.getCatIds()
  37. for cat_id in cat_ids:
  38. cat_name = coco.loadCats(ids=cat_id)[0]["name"]
  39. labels_cnt[cat_name] = labels_cnt[cat_name] + coco.getAnnIds(catIds=cat_id)
  40. all_instances += len(labels_cnt[cat_name])
  41. if tag == "train":
  42. cnts_train = [len(cat_ids) for cat_name, cat_ids in labels_cnt.items()]
  43. elif tag == "val":
  44. cnts_val = [len(cat_ids) for cat_name, cat_ids in labels_cnt.items()]
  45. classes = [cat_name for cat_name, cat_ids in labels_cnt.items()]
  46. sorted_id = sorted(
  47. range(len(cnts_train)), key=lambda k: cnts_train[k], reverse=True
  48. )
  49. cnts_train_sorted = sorted(cnts_train, reverse=True)
  50. cnts_val_sorted = [cnts_val[index] for index in sorted_id]
  51. classes_sorted = [classes[index] for index in sorted_id]
  52. x = np.arange(len(classes))
  53. width = 0.5
  54. # bar
  55. os_system = platform.system().lower()
  56. if os_system == "windows":
  57. plt.rcParams["font.sans-serif"] = "FangSong"
  58. else:
  59. font = font_manager.FontProperties(fname=PINGFANG_FONT_FILE_PATH)
  60. fig, ax = plt.subplots(figsize=(max(8, int(len(classes) / 5)), 5), dpi=120)
  61. ax.bar(x, cnts_train_sorted, width=0.5, label="train")
  62. ax.bar(x + width, cnts_val_sorted, width=0.5, label="val")
  63. plt.xticks(
  64. x + width / 2,
  65. classes_sorted,
  66. rotation=90,
  67. fontproperties=None if os_system == "windows" else font,
  68. )
  69. ax.set_ylabel("Counts")
  70. plt.legend()
  71. fig.tight_layout()
  72. fig_path = os.path.join(output, "histogram.png")
  73. fig.savefig(fig_path)
  74. return {"histogram": os.path.join("check_dataset", "histogram.png")}