file_interface.py 6.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211
  1. # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. import logging
  15. import os
  16. import chardet
  17. import ruamel.yaml
  18. import yaml
  19. from filelock import FileLock
  20. try:
  21. import ujson as json
  22. except:
  23. logging.warning("failed to import ujson, using json instead")
  24. import json
  25. from contextlib import contextmanager
  26. @contextmanager
  27. def custom_open(file_path, mode):
  28. """
  29. 自定义打开文件函数
  30. Args:
  31. file_path (str): 文件路径
  32. mode (str): 文件打开模式,'r','w' 或 'a'
  33. Returns:
  34. Any: 返回文件对象
  35. Raises:
  36. FileNotFoundError: 当文件不存在时,raise FileNotFoundError
  37. ValueError: 当 mode 参数不是 'r', 'w' 和 'a' 时,raise ValueError
  38. """
  39. if mode == "r":
  40. if not os.path.exists(file_path):
  41. raise FileNotFoundError("file {} not found".format(file_path))
  42. file = open(file_path, "r", encoding="utf-8")
  43. try:
  44. file.read()
  45. file.seek(0)
  46. yield file
  47. except UnicodeDecodeError:
  48. file = open(file_path, "r", encoding="gbk")
  49. try:
  50. file.read()
  51. file.seek(0)
  52. yield file
  53. except UnicodeDecodeError:
  54. with open(file_path, "rb") as f:
  55. encoding = chardet.detect(f.read())["encoding"]
  56. file = open(file_path, "r", encoding=encoding)
  57. yield file
  58. finally:
  59. file.close()
  60. elif mode == "w":
  61. file = open(file_path, "w", encoding="utf-8")
  62. yield file
  63. file.close()
  64. elif mode == "a":
  65. encoding = "utf-8"
  66. if os.path.exists(file_path):
  67. file = open(file_path, "r", encoding=encoding)
  68. try:
  69. file.read()
  70. file.seek(0)
  71. except UnicodeDecodeError:
  72. encoding = "gbk"
  73. file = open(file_path, "r", encoding=encoding)
  74. try:
  75. file.read()
  76. file.seek(0)
  77. except UnicodeDecodeError:
  78. with open(file_path, "rb") as f:
  79. encoding = chardet.detect(f.read())["encoding"]
  80. finally:
  81. file.close()
  82. file = open(file_path, "a", encoding=encoding)
  83. yield file
  84. file.close()
  85. else:
  86. raise ValueError("mode must be 'r', 'w' or 'a', but got {}".format(mode))
  87. # --------------- yaml ---------------
  88. def read_yaml_file(yaml_path: str, to_dict=True):
  89. """read from yaml file"""
  90. try:
  91. with open(yaml_path, "r", encoding="utf-8") as file:
  92. yaml_content = yaml.full_load(file)
  93. except UnicodeDecodeError:
  94. with open(yaml_path, "r", encoding="gbk") as file:
  95. yaml_content = yaml.full_load(file)
  96. yaml_content = dict(yaml_content) if to_dict else yaml_content
  97. return yaml_content
  98. def write_config_file(yaml_dict: dict, yaml_path: str):
  99. """write to config yaml file"""
  100. yaml = ruamel.yaml.YAML()
  101. lock = FileLock(yaml_path + ".lock")
  102. with lock:
  103. with open(yaml_path, "w", encoding="utf-8") as file:
  104. # yaml.safe_dump(yaml_dict, file, sort_keys=False)
  105. yaml.dump(yaml_dict, file)
  106. def update_yaml_file_with_dict(yaml_path, key_values: dict):
  107. """update yaml file with key_values
  108. key_values is a dict
  109. """
  110. yaml_dict = read_yaml_file(yaml_path)
  111. yaml_dict.update(key_values)
  112. write_config_file(yaml_dict, yaml_path)
  113. def get_yaml_keys(yaml_path):
  114. """get all keys of yaml file"""
  115. yaml_dict = read_yaml_file(yaml_path)
  116. return yaml_dict.keys()
  117. # --------------- markdown ---------------
  118. def generate_markdown_from_dict(metrics):
  119. """generate_markdown_from_dict"""
  120. mk = ""
  121. keys = metrics.keys()
  122. mk += "| ".join(keys())
  123. mk += os.linesep
  124. mk += "|".join([" :----: "])
  125. # ------------------- jsonl ---------------------
  126. def read_jsonl_file(jsonl_path: str):
  127. """read from jsonl file"""
  128. with custom_open(jsonl_path, "r") as file:
  129. jsonl_content = [json.loads(line) for line in file]
  130. return jsonl_content
  131. def write_json_file(content, jsonl_path: str, ensure_ascii=False, **kwargs):
  132. """write to json file"""
  133. with custom_open(jsonl_path, "w") as file:
  134. json.dump(content, file, ensure_ascii=ensure_ascii, **kwargs)
  135. # --------------- check webui yaml -----------------
  136. def check_dict_keys(to_checked_dict, standard_dict, escape_list=None):
  137. """check if all keys of to_checked_dict is the same as standard_dict, and the value is the same type
  138. Args:
  139. escape_list: if set, will not check the keys in white_list
  140. """
  141. escape_list = [] if escape_list is None else escape_list
  142. for key in standard_dict.keys():
  143. if key not in to_checked_dict:
  144. logging.error(f"key {key} not in yaml file")
  145. return False
  146. if not isinstance(standard_dict[key], type(to_checked_dict[key])):
  147. logging.error(
  148. f"value type of key {key} is not the same as standard: "
  149. f"{type(standard_dict[key])}, {type(to_checked_dict[key])}"
  150. )
  151. return False
  152. if (
  153. isinstance(standard_dict[key], dict)
  154. and isinstance(to_checked_dict[key], dict)
  155. and key not in escape_list
  156. ):
  157. return check_dict_keys(
  158. to_checked_dict[key], standard_dict[key], escape_list
  159. )
  160. if len(to_checked_dict.keys()) != len(standard_dict.keys()):
  161. logging.error(f"yaml file has extra keys")
  162. return False
  163. return True
  164. def check_dataset_valid(path_list):
  165. """check if dataset valid in path_list for datset_ui"""
  166. if path_list is not None and len(path_list) > 0:
  167. for path in path_list:
  168. if not os.path.exists(path):
  169. return False
  170. return True
  171. else:
  172. return False