# dataset_checker.py
  1. # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. import os
  15. from abc import ABC, abstractmethod
  16. from .utils import build_res_dict
  17. from ....utils.misc import AutoRegisterABCMetaClass
  18. from ....utils.config import AttrDict
  19. from ....utils.logging import info
  20. def build_dataset_checker(config: AttrDict) -> "BaseDatasetChecker":
  21. """build dataset checker
  22. Args:
  23. config (AttrDict): PaddleX pipeline config, which is loaded from pipeline yaml file.
  24. Returns:
  25. BaseDatasetChecker: the dataset checker, which is subclass of BaseDatasetChecker.
  26. """
  27. model_name = config.Global.model
  28. return BaseDatasetChecker.get(model_name)(config)
  29. class BaseDatasetChecker(ABC, metaclass=AutoRegisterABCMetaClass):
  30. """ Base Dataset Checker """
  31. __is_base = True
  32. def __init__(self, config):
  33. """Initialize the instance.
  34. Args:
  35. config (AttrDict): PaddleX pipeline config, which is loaded from pipeline yaml file.
  36. """
  37. super().__init__()
  38. self.global_config = config.Global
  39. self.check_dataset_config = config.CheckDataset
  40. self.output = os.path.join(self.global_config.output, "check_dataset")
  41. def check(self) -> dict:
  42. """execute dataset checking
  43. Returns:
  44. dict: the dataset checking result.
  45. """
  46. dataset_dir = self.get_dataset_root(self.global_config.dataset_dir)
  47. if not os.path.exists(self.output):
  48. os.makedirs(self.output)
  49. if self.check_dataset_config.get("convert", None):
  50. if self.check_dataset_config.convert.get("enable", False):
  51. self.convert_dataset(dataset_dir)
  52. info("Convert dataset successfully !")
  53. if self.check_dataset_config.get("split", None):
  54. if self.check_dataset_config.split.get("enable", False):
  55. self.split_dataset(dataset_dir)
  56. info("Split dataset successfully !")
  57. attrs = self.check_dataset(dataset_dir)
  58. analysis = self.analyse(dataset_dir)
  59. check_result = build_res_dict(True)
  60. check_result["attributes"] = attrs
  61. check_result["analysis"] = analysis
  62. check_result["dataset_path"] = self.global_config.dataset_dir
  63. check_result["show_type"] = self.get_show_type()
  64. check_result["dataset_type"] = self.get_dataset_type()
  65. info("Check dataset passed !")
  66. return check_result
  67. def get_dataset_root(self, dataset_dir: str) -> str:
  68. """find the dataset root dir
  69. Args:
  70. dataset_dir (str): the directory that contain dataset.
  71. Returns:
  72. str: the root directory of dataset.
  73. """
  74. # XXX: forward compatible
  75. # dataset_dir = [d for d in Path(dataset_dir).iterdir() if d.is_dir()]
  76. # assert len(dataset_dir) == 1
  77. # return dataset_dir[0].as_posix()
  78. return dataset_dir
  79. @abstractmethod
  80. def check_dataset(self, dataset_dir: str):
  81. """check if the dataset meets the specifications and get dataset summary
  82. Args:
  83. dataset_dir (str): the root directory of dataset.
  84. Raises:
  85. NotImplementedError
  86. """
  87. raise NotImplementedError
  88. def convert_dataset(self, src_dataset_dir: str) -> str:
  89. """convert the dataset from other type to specified type
  90. Args:
  91. src_dataset_dir (str): the root directory of dataset.
  92. Returns:
  93. str: the root directory of converted dataset.
  94. """
  95. dst_dataset_dir = src_dataset_dir
  96. return dst_dataset_dir
  97. def split_dataset(self, src_dataset_dir: str) -> str:
  98. """repartition the train and validation dataset
  99. Args:
  100. src_dataset_dir (str): the root directory of dataset.
  101. Returns:
  102. str: the root directory of splited dataset.
  103. """
  104. dst_dataset_dir = src_dataset_dir
  105. return dst_dataset_dir
  106. def analyse(self, dataset_dir: str) -> dict:
  107. """deep analyse dataset
  108. Args:
  109. dataset_dir (str): the root directory of dataset.
  110. Returns:
  111. dict: the deep analysis results.
  112. """
  113. return {}
  114. @abstractmethod
  115. def get_show_type(self):
  116. """return the dataset show type
  117. Raises:
  118. NotImplementedError
  119. """
  120. raise NotImplementedError
  121. @abstractmethod
  122. def get_dataset_type(self):
  123. """ return the dataset type
  124. Raises:
  125. NotImplementedError
  126. """
  127. raise NotImplementedError