  1. # copyright (c) 2024 PaddlePaddle Authors. All Rights Reserve.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. import os
  15. from abc import ABC, abstractmethod
  16. from .utils import build_res_dict
  17. from ....utils.misc import AutoRegisterABCMetaClass
  18. from ....utils.config import AttrDict
  19. from ....utils.logging import info
  20. def build_dataset_checker(config: AttrDict) -> "BaseDatasetChecker":
  21. """build dataset checker
  22. Args:
  23. config (AttrDict): PaddleX pipeline config, which is loaded from pipeline yaml file.
  24. Returns:
  25. BaseDatasetChecker: the dataset checker, which is subclass of BaseDatasetChecker.
  26. """
  27. model_name = config.Global.model
  28. return BaseDatasetChecker.get(model_name)(config)
  29. class BaseDatasetChecker(ABC, metaclass=AutoRegisterABCMetaClass):
  30. """ Base Dataset Checker """
  31. __is_base = True
  32. def __init__(self, config):
  33. """Initialize the instance.
  34. Args:
  35. config (AttrDict): PaddleX pipeline config, which is loaded from pipeline yaml file.
  36. """
  37. super().__init__()
  38. self.global_config = config.Global
  39. self.check_dataset_config = config.CheckDataset
  40. self.output_dir = os.path.join(self.global_config.output,
  41. "check_dataset")
  42. def __call__(self) -> dict:
  43. """execute dataset checking
  44. Returns:
  45. dict: the dataset checking result.
  46. """
  47. dataset_dir = self.get_dataset_root(self.global_config.dataset_dir)
  48. if not os.path.exists(self.output_dir):
  49. os.makedirs(self.output_dir)
  50. if self.check_dataset_config.get("convert", None):
  51. if self.check_dataset_config.convert.get("enable", False):
  52. self.convert_dataset(dataset_dir)
  53. info("Convert dataset successfully !")
  54. if self.check_dataset_config.get("split", None):
  55. if self.check_dataset_config.split.get("enable", False):
  56. self.split_dataset(dataset_dir)
  57. info("Split dataset successfully !")
  58. attrs = self.check_dataset(dataset_dir)
  59. analysis = self.analyse(dataset_dir)
  60. check_result = build_res_dict(True)
  61. check_result["attributes"] = attrs
  62. check_result["analysis"] = analysis
  63. check_result["dataset_path"] = self.global_config.dataset_dir
  64. check_result["show_type"] = self.get_show_type()
  65. check_result["dataset_type"] = self.get_dataset_type()
  66. info("Check dataset passed !")
  67. return check_result
  68. def get_dataset_root(self, dataset_dir: str) -> str:
  69. """find the dataset root dir
  70. Args:
  71. dataset_dir (str): the directory that contain dataset.
  72. Returns:
  73. str: the root directory of dataset.
  74. """
  75. # XXX: forward compatible
  76. # dataset_dir = [d for d in Path(dataset_dir).iterdir() if d.is_dir()]
  77. # assert len(dataset_dir) == 1
  78. # return dataset_dir[0].as_posix()
  79. return dataset_dir
  80. @abstractmethod
  81. def check_dataset(self, dataset_dir: str):
  82. """check if the dataset meets the specifications and get dataset summary
  83. Args:
  84. dataset_dir (str): the root directory of dataset.
  85. Raises:
  86. NotImplementedError
  87. """
  88. raise NotImplementedError
  89. def convert_dataset(self, src_dataset_dir: str) -> str:
  90. """convert the dataset from other type to specified type
  91. Args:
  92. src_dataset_dir (str): the root directory of dataset.
  93. Returns:
  94. str: the root directory of converted dataset.
  95. """
  96. dst_dataset_dir = src_dataset_dir
  97. return dst_dataset_dir
  98. def split_dataset(self, src_dataset_dir: str) -> str:
  99. """repartition the train and validation dataset
  100. Args:
  101. src_dataset_dir (str): the root directory of dataset.
  102. Returns:
  103. str: the root directory of splited dataset.
  104. """
  105. dst_dataset_dir = src_dataset_dir
  106. return dst_dataset_dir
  107. def analyse(self, dataset_dir: str) -> dict:
  108. """deep analyse dataset
  109. Args:
  110. dataset_dir (str): the root directory of dataset.
  111. Returns:
  112. dict: the deep analysis results.
  113. """
  114. return {}
  115. @abstractmethod
  116. def get_show_type(self):
  117. """return the dataset show type
  118. Raises:
  119. NotImplementedError
  120. """
  121. raise NotImplementedError
  122. @abstractmethod
  123. def get_dataset_type(self):
  124. """ return the dataset type
  125. Raises:
  126. NotImplementedError
  127. """
  128. raise NotImplementedError