analyse_dataset.py 3.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100
  1. # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. import os
  15. import platform
  16. from collections import defaultdict
  17. import numpy as np
  18. from .....utils.deps import function_requires_deps, is_dep_available
  19. from .....utils.file_interface import custom_open
  20. from .....utils.fonts import PINGFANG_FONT
  21. if is_dep_available("matplotlib"):
  22. import matplotlib.pyplot as plt
  23. from matplotlib import font_manager
  24. @function_requires_deps("matplotlib")
  25. def deep_analyse(dataset_path, output):
  26. """class analysis for dataset"""
  27. tags = ["train", "val"]
  28. labels_cnt = defaultdict(str)
  29. label_path = os.path.join(dataset_path, "label_map.txt")
  30. with custom_open(label_path, "r") as f:
  31. lines = f.readlines()
  32. for line in lines:
  33. line = line.strip().split()
  34. labels_cnt[line[1]] = " ".join(line[0])
  35. for tag in tags:
  36. anno_path = os.path.join(dataset_path, f"{tag}.txt")
  37. classes_num = defaultdict(int)
  38. for i in range(len(labels_cnt)):
  39. classes_num[labels_cnt[str(i)]] = 0
  40. with custom_open(anno_path, "r") as f:
  41. lines = f.readlines()
  42. for line in lines:
  43. line = line.strip().split()
  44. label_file_path = os.path.join(dataset_path, line[0])
  45. with custom_open(label_file_path, "r") as f:
  46. label_lines = f.readlines()
  47. for label_line in label_lines:
  48. label_info = label_line.strip().split(" ")
  49. classes_num[labels_cnt[label_info[0]]] += 1
  50. if tag == "train":
  51. cnts_train = [cat_ids for cat_name, cat_ids in classes_num.items()]
  52. elif tag == "val":
  53. cnts_val = [cat_ids for cat_name, cat_ids in classes_num.items()]
  54. classes = [cat_name for cat_name, cat_ids in classes_num.items()]
  55. sorted_id = sorted(
  56. range(len(cnts_train)), key=lambda k: cnts_train[k], reverse=True
  57. )
  58. cnts_train_sorted = [cnts_train[index] for index in sorted_id]
  59. cnts_val_sorted = [cnts_val[index] for index in sorted_id]
  60. classes_sorted = [classes[index] for index in sorted_id]
  61. x = np.arange(len(classes))
  62. width = 0.5
  63. # bar
  64. os_system = platform.system().lower()
  65. if os_system == "windows":
  66. plt.rcParams["font.sans-serif"] = "FangSong"
  67. else:
  68. font = font_manager.FontProperties(fname=PINGFANG_FONT.path, size=10)
  69. fig, ax = plt.subplots(figsize=(max(8, int(len(classes) / 5)), 5), dpi=300)
  70. ax.bar(x, cnts_train_sorted, width=0.5, label="train")
  71. ax.bar(x + width, cnts_val_sorted, width=0.5, label="val")
  72. plt.xticks(
  73. x + width / 2,
  74. classes_sorted,
  75. rotation=90,
  76. fontproperties=None if os_system == "windows" else font,
  77. )
  78. ax.set_xlabel(
  79. "类别名称", fontproperties=None if os_system == "windows" else font, fontsize=12
  80. )
  81. ax.set_ylabel(
  82. "样本框数量",
  83. fontproperties=None if os_system == "windows" else font,
  84. fontsize=12,
  85. )
  86. plt.legend(loc=1)
  87. fig.tight_layout()
  88. file_path = os.path.join(output, "histogram.png")
  89. fig.savefig(file_path, dpi=300)
  90. return {"histogram": os.path.join("check_dataset", "histogram.png")}