# dataset_checker.py

# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from abc import ABC, abstractmethod

from .utils import build_res_dict
from ....utils.misc import AutoRegisterABCMetaClass
from ....utils.config import AttrDict
from ....utils.logging import info


def build_dataset_checker(config: AttrDict) -> "BaseDatasetChecker":
    """Build the dataset checker.

    Args:
        config (AttrDict): PaddleX pipeline config, which is loaded from the pipeline YAML file.

    Returns:
        BaseDatasetChecker: the dataset checker, which is a subclass of BaseDatasetChecker.
    """
    model_name = config.Global.model
    try:
        # The plugin module is imported only for its side effects; nothing
        # from it is used directly here.
        import feature_line_modules
    except ModuleNotFoundError:
        info(
            "The PaddleX Feature Line plugin is not installed, but continuing execution."
        )
    return BaseDatasetChecker.get(model_name)(config)
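

# A minimal usage sketch, assuming a pipeline config has already been loaded
# into an AttrDict (the `load_config` helper and the YAML path are
# hypothetical, not part of this module):
#
#   config = load_config("pipeline.yaml")
#   checker = build_dataset_checker(config)
#   result = checker.check()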


class BaseDatasetChecker(ABC, metaclass=AutoRegisterABCMetaClass):
    """Base Dataset Checker"""

    __is_base = True

    def __init__(self, config):
        """Initialize the instance.

        Args:
            config (AttrDict): PaddleX pipeline config, which is loaded from the pipeline YAML file.
        """
        super().__init__()
        self.global_config = config.Global
        self.check_dataset_config = config.CheckDataset
        self.output = os.path.join(self.global_config.output, "check_dataset")

    def check(self) -> dict:
        """Execute dataset checking.

        Returns:
            dict: the dataset checking result.
        """
        dataset_dir = self.get_dataset_root(self.global_config.dataset_dir)

        if not os.path.exists(self.output):
            os.makedirs(self.output)

        if self.check_dataset_config.get("convert", None):
            if self.check_dataset_config.convert.get("enable", False):
                self.convert_dataset(dataset_dir)
                info("Convert dataset successfully!")

        if self.check_dataset_config.get("split", None):
            if self.check_dataset_config.split.get("enable", False):
                self.split_dataset(dataset_dir)
                info("Split dataset successfully!")

        attrs = self.check_dataset(dataset_dir)
        analysis = self.analyse(dataset_dir)

        check_result = build_res_dict(True)
        check_result["attributes"] = attrs
        check_result["analysis"] = analysis
        check_result["dataset_path"] = os.path.basename(dataset_dir)
        check_result["show_type"] = self.get_show_type()
        check_result["dataset_type"] = self.get_dataset_type()
        info("Check dataset passed!")
        return check_result
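
    # The `check` flow above reads these keys from the pipeline config. A
    # minimal sketch of the corresponding YAML section (only keys accessed in
    # this file are shown; the concrete values are placeholders):
    #
    #   Global:
    #     model: <model name>        # selects the registered checker subclass
    #     dataset_dir: <path to dataset>
    #     output: <output directory>
    #   CheckDataset:
    #     convert:
    #       enable: False
    #     split:
    #       enable: False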

    def get_dataset_root(self, dataset_dir: str) -> str:
        """Find the dataset root directory.

        Args:
            dataset_dir (str): the directory that contains the dataset.

        Returns:
            str: the root directory of the dataset.
        """
        # XXX: forward compatible
        # dataset_dir = [d for d in Path(dataset_dir).iterdir() if d.is_dir()]
        # assert len(dataset_dir) == 1
        # return dataset_dir[0].as_posix()
        return dataset_dir

    @abstractmethod
    def check_dataset(self, dataset_dir: str):
        """Check whether the dataset meets the specifications and get a dataset summary.

        Args:
            dataset_dir (str): the root directory of the dataset.

        Raises:
            NotImplementedError
        """
        raise NotImplementedError

    def convert_dataset(self, src_dataset_dir: str) -> str:
        """Convert the dataset from another format to the specified format.

        Args:
            src_dataset_dir (str): the root directory of the source dataset.

        Returns:
            str: the root directory of the converted dataset.
        """
        # Default: no conversion; subclasses override this when needed.
        dst_dataset_dir = src_dataset_dir
        return dst_dataset_dir

    def split_dataset(self, src_dataset_dir: str) -> str:
        """Repartition the dataset into train and validation subsets.

        Args:
            src_dataset_dir (str): the root directory of the dataset.

        Returns:
            str: the root directory of the split dataset.
        """
        # Default: no splitting; subclasses override this when needed.
        dst_dataset_dir = src_dataset_dir
        return dst_dataset_dir

    def analyse(self, dataset_dir: str) -> dict:
        """Perform a deep analysis of the dataset.

        Args:
            dataset_dir (str): the root directory of the dataset.

        Returns:
            dict: the deep analysis results.
        """
        return {}

    @abstractmethod
    def get_show_type(self):
        """Return the dataset show type.

        Raises:
            NotImplementedError
        """
        raise NotImplementedError

    @abstractmethod
    def get_dataset_type(self):
        """Return the dataset type.

        Raises:
            NotImplementedError
        """
        raise NotImplementedError
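

# A minimal sketch of a concrete checker, to illustrate which methods a
# subclass must implement. The model name, the `entities` registration
# attribute, the annotation-file layout, and the returned field names are all
# assumptions for illustration, not part of this module:
#
#   class ExampleDatasetChecker(BaseDatasetChecker):
#       entities = ["example_model"]  # hypothetical registration key
#
#       def check_dataset(self, dataset_dir: str) -> dict:
#           # Summarize dataset attributes, e.g. the number of training samples.
#           train_list = os.path.join(dataset_dir, "train.txt")  # assumed layout
#           with open(train_list) as f:
#               num_train = sum(1 for _ in f)
#           return {"train_samples": num_train}
#
#       def get_show_type(self) -> str:
#           return "image"
#
#       def get_dataset_type(self) -> str:
#           return "ExampleDataset"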