readers.py 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481
  1. # copyright (c) 2024 PaddlePaddle Authors. All Rights Reserve.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. import enum
  15. import itertools
  16. import cv2
  17. import fitz
  18. from PIL import Image, ImageOps
  19. import pandas as pd
  20. import numpy as np
  21. import yaml
  22. import soundfile
  23. import decord
  24. import random
  25. import platform
  26. from ....utils import logging
# Import `decord` only when the machine string does not start with "arm";
# otherwise log a warning asking the user to install it manually.
# NOTE(review): `import decord` also appears unconditionally in the import list
# above and runs before this guard — confirm whether that earlier import
# should be dropped so the ARM fallback can actually take effect.
if not platform.machine().startswith("arm"):
    import decord
else:
    logging.warning(
        "Please install `decord` manually on ARM machine. Otherwise, the related model cannot work."
    )
# Names exported by a star-import of this module.
__all__ = [
    "ReaderType",
    "ImageReader",
    "VideoReader",
    "CSVReader",
    "PDFReader",
    "YAMLReader",
    "AudioReader",
]
class ReaderType(enum.Enum):
    """Kinds of reader; each reader reports its kind through ``get_type()``."""

    IMAGE = 1  # returned by ImageReader.get_type()
    GENERATIVE = 2  # returned by _GenerativeReader.get_type() (inherited by VideoReader)
    POINT_CLOUD = 3
    JSON = 4
    TS = 5  # returned by CSVReader.get_type()
    PDF = 6  # returned by PDFReader.get_type()
    YAML = 8  # returned by YAMLReader.get_type(); NOTE(review): value 7 is unused — confirm it is intentional
  51. class _BaseReader(object):
  52. """_BaseReader"""
  53. def __init__(self, backend, **bk_args):
  54. super().__init__()
  55. if len(bk_args) == 0:
  56. bk_args = self.get_default_backend_args()
  57. self.bk_type = backend
  58. self.bk_args = bk_args
  59. self._backend = self.get_backend()
  60. def read(self, in_path):
  61. """read file from path"""
  62. raise NotImplementedError
  63. def get_backend(self, bk_args=None):
  64. """get the backend"""
  65. if bk_args is None:
  66. bk_args = self.bk_args
  67. return self._init_backend(self.bk_type, bk_args)
  68. def set_backend(self, backend, **bk_args):
  69. self.bk_type = backend
  70. self.bk_args = bk_args
  71. self._backend = self.get_backend()
  72. def _init_backend(self, bk_type, bk_args):
  73. """init backend"""
  74. raise NotImplementedError
  75. def get_type(self):
  76. """get type"""
  77. raise NotImplementedError
  78. def get_default_backend_args(self):
  79. """get default backend arguments"""
  80. return {}
  81. class PDFReader(_BaseReader):
  82. """PDFReader"""
  83. def __init__(self, backend="fitz", **bk_args):
  84. super().__init__(backend, **bk_args)
  85. def read(self, in_path):
  86. yield from self._backend.read_file(str(in_path))
  87. def _init_backend(self, bk_type, bk_args):
  88. return PDFReaderBackend(**bk_args)
  89. def get_type(self):
  90. return ReaderType.PDF
  91. class ImageReader(_BaseReader):
  92. """ImageReader"""
  93. def __init__(self, backend="opencv", **bk_args):
  94. super().__init__(backend=backend, **bk_args)
  95. def read(self, in_path):
  96. """read the image file from path"""
  97. arr = self._backend.read_file(str(in_path))
  98. return arr
  99. def _init_backend(self, bk_type, bk_args):
  100. """init backend"""
  101. if bk_type == "opencv":
  102. return OpenCVImageReaderBackend(**bk_args)
  103. elif bk_type == "pil" or bk_type == "pillow":
  104. return PILImageReaderBackend(**bk_args)
  105. else:
  106. raise ValueError("Unsupported backend type")
  107. def get_type(self):
  108. """get type"""
  109. return ReaderType.IMAGE
  110. class _GenerativeReader(_BaseReader):
  111. """_GenerativeReader"""
  112. def get_type(self):
  113. """get type"""
  114. return ReaderType.GENERATIVE
  115. def is_generative_reader(reader):
  116. """is_generative_reader"""
  117. return isinstance(reader, _GenerativeReader)
  118. class VideoReader(_GenerativeReader):
  119. """VideoReader"""
  120. def __init__(
  121. self,
  122. backend="opencv",
  123. st_frame_id=0,
  124. max_num_frames=None,
  125. auto_close=True,
  126. **bk_args,
  127. ):
  128. super().__init__(backend=backend, **bk_args)
  129. self.st_frame_id = st_frame_id
  130. self.max_num_frames = max_num_frames
  131. self.auto_close = auto_close
  132. self._fps = 0
  133. def read(self, in_path):
  134. """read vide file from path"""
  135. self._backend.set_pos(self.st_frame_id)
  136. gen = self._backend.read_file(str(in_path))
  137. if self.max_num_frames is not None:
  138. gen = itertools.islice(gen, self.num_frames)
  139. yield from gen
  140. if self.auto_close:
  141. self._backend.close()
  142. def get_fps(self):
  143. """get fps"""
  144. return self._backend.get_fps()
  145. def _init_backend(self, bk_type, bk_args):
  146. """init backend"""
  147. if bk_type == "opencv":
  148. return OpenCVVideoReaderBackend(**bk_args)
  149. elif bk_type == "decord":
  150. return DecordVideoReaderBackend(**bk_args)
  151. else:
  152. raise ValueError("Unsupported backend type")
  153. class YAMLReader(_BaseReader):
  154. def __init__(self, backend="PyYAML", **bk_args):
  155. super().__init__(backend, **bk_args)
  156. def read(self, in_path):
  157. return self._backend.read_file(str(in_path))
  158. def _init_backend(self, bk_type, bk_args):
  159. if bk_type == "PyYAML":
  160. return YAMLReaderBackend(**bk_args)
  161. else:
  162. raise ValueError("Unsupported backend type")
  163. def get_type(self):
  164. return ReaderType.YAML
  165. class _BaseReaderBackend(object):
  166. """_BaseReaderBackend"""
  167. def read_file(self, in_path):
  168. """read file from path"""
  169. raise NotImplementedError
  170. class _ImageReaderBackend(_BaseReaderBackend):
  171. """_ImageReaderBackend"""
  172. pass
  173. class OpenCVImageReaderBackend(_ImageReaderBackend):
  174. """OpenCVImageReaderBackend"""
  175. def __init__(self, flags=cv2.IMREAD_COLOR):
  176. super().__init__()
  177. self.flags = flags
  178. def read_file(self, in_path):
  179. """read image file from path by OpenCV"""
  180. return cv2.imread(in_path, flags=self.flags)
  181. class PILImageReaderBackend(_ImageReaderBackend):
  182. """PILImageReaderBackend"""
  183. def __init__(self):
  184. super().__init__()
  185. def read_file(self, in_path):
  186. """read image file from path by PIL"""
  187. return ImageOps.exif_transpose(Image.open(in_path))
  188. class PDFReaderBackend(_BaseReaderBackend):
  189. def __init__(self, rotate=0, zoom_x=2.0, zoom_y=2.0):
  190. super().__init__()
  191. self.mat = fitz.Matrix(zoom_x, zoom_y).prerotate(rotate)
  192. def read_file(self, in_path):
  193. for page in fitz.open(in_path):
  194. pix = page.get_pixmap(matrix=self.mat, alpha=False)
  195. getpngdata = pix.tobytes(output="png")
  196. # decode as np.uint8
  197. image_array = np.frombuffer(getpngdata, dtype=np.uint8)
  198. img_cv = cv2.imdecode(image_array, cv2.IMREAD_ANYCOLOR)
  199. yield img_cv
  200. class _VideoReaderBackend(_BaseReaderBackend):
  201. """_VideoReaderBackend"""
  202. def set_pos(self, pos):
  203. """set pos"""
  204. raise NotImplementedError
  205. def close(self):
  206. """close io"""
  207. raise NotImplementedError
  208. class OpenCVVideoReaderBackend(_VideoReaderBackend):
  209. """OpenCVVideoReaderBackend"""
  210. def __init__(self, **bk_args):
  211. super().__init__()
  212. self.cap_init_args = bk_args
  213. self._cap = None
  214. self._pos = 0
  215. self._max_num_frames = None
  216. def get_fps(self):
  217. return self._cap.get(cv2.CAP_PROP_FPS)
  218. def read_file(self, in_path):
  219. """read vidio file from path"""
  220. if self._cap is not None:
  221. self._cap_release()
  222. self._cap = self._cap_open(in_path)
  223. if self._pos is not None:
  224. self._cap_set_pos()
  225. return self._read_frames(self._cap)
  226. def _read_frames(self, cap):
  227. """read frames"""
  228. while True:
  229. ret, frame = cap.read()
  230. if not ret:
  231. break
  232. yield frame
  233. self._cap_release()
  234. def _cap_open(self, video_path):
  235. self.cap_init_args.pop("num_seg")
  236. self.cap_init_args.pop("seg_len")
  237. self.cap_init_args.pop("sample_type")
  238. self._cap = cv2.VideoCapture(video_path, **self.cap_init_args)
  239. if not self._cap.isOpened():
  240. raise RuntimeError(f"Failed to open {video_path}")
  241. return self._cap
  242. def _cap_release(self):
  243. self._cap.release()
  244. def _cap_set_pos(self):
  245. self._cap.set(cv2.CAP_PROP_POS_FRAMES, self._pos)
  246. def set_pos(self, pos):
  247. self._pos = pos
  248. def close(self):
  249. if self._cap is not None:
  250. self._cap_release()
  251. self._cap = None
  252. class DecordVideoReaderBackend(_VideoReaderBackend):
  253. """DecordVideoReaderBackend"""
  254. def __init__(self, **bk_args):
  255. super().__init__()
  256. self.cap_init_args = bk_args
  257. self._cap = None
  258. self._pos = 0
  259. self._max_num_frames = None
  260. self.num_seg = bk_args.get("num_seg", 8)
  261. self.seg_len = bk_args.get("seg_len", 1)
  262. self.sample_type = bk_args.get("sample_type", 1)
  263. self.valid_mode = True
  264. self._fps = 0
  265. def set_pos(self, pos):
  266. self._pos = pos
  267. def sample(self, frames_len, video_object):
  268. frames_idx = []
  269. average_dur = int(frames_len / self.num_seg)
  270. for i in range(self.num_seg):
  271. idx = 0
  272. if not self.valid_mode:
  273. if average_dur >= self.seg_len:
  274. idx = random.randint(0, average_dur - self.seg_len)
  275. idx += i * average_dur
  276. elif average_dur >= 1:
  277. idx += i * average_dur
  278. else:
  279. idx = i
  280. else:
  281. if average_dur >= self.seg_len:
  282. idx = (average_dur - 1) // 2
  283. idx += i * average_dur
  284. elif average_dur >= 1:
  285. idx += i * average_dur
  286. else:
  287. idx = i
  288. for jj in range(idx, idx + self.seg_len):
  289. frames_idx.append(int(jj % frames_len))
  290. frames_select = video_object.get_batch(frames_idx)
  291. # dearray_to_img
  292. np_frames = frames_select.asnumpy()
  293. imgs = []
  294. for i in range(np_frames.shape[0]):
  295. imgbuf = np_frames[i]
  296. imgs.append(imgbuf)
  297. return imgs
  298. def get_fps(self):
  299. return self._cap.get_avg_fps()
  300. def read_file(self, in_path):
  301. """read vidio file from path"""
  302. self._cap = decord.VideoReader(in_path)
  303. frame_len = len(self._cap)
  304. if self.sample_type == "uniform":
  305. sample_video = self.sample(frame_len, self._cap)
  306. return sample_video
  307. else:
  308. return self._cap
  309. def close(self):
  310. pass
  311. class CSVReader(_BaseReader):
  312. """CSVReader"""
  313. def __init__(self, backend="pandas", **bk_args):
  314. super().__init__(backend=backend, **bk_args)
  315. def read(self, in_path):
  316. """read the image file from path"""
  317. arr = self._backend.read_file(str(in_path))
  318. return arr
  319. def _init_backend(self, bk_type, bk_args):
  320. """init backend"""
  321. if bk_type == "pandas":
  322. return PandasCSVReaderBackend(**bk_args)
  323. else:
  324. raise ValueError("Unsupported backend type")
  325. def get_type(self):
  326. """get type"""
  327. return ReaderType.TS
  328. class _CSVReaderBackend(_BaseReaderBackend):
  329. """_CSVReaderBackend"""
  330. pass
  331. class PandasCSVReaderBackend(_CSVReaderBackend):
  332. """PandasCSVReaderBackend"""
  333. def __init__(self):
  334. super().__init__()
  335. def read_file(self, in_path):
  336. """read image file from path by OpenCV"""
  337. return pd.read_csv(in_path)
  338. class YAMLReaderBackend(_BaseReaderBackend):
  339. def read_file(self, in_path, **kwargs):
  340. with open(in_path, "r", encoding="utf-8", **kwargs) as yaml_file:
  341. data = yaml.load(yaml_file, Loader=yaml.FullLoader)
  342. return data
  343. class AudioReader(_BaseReader):
  344. def __init__(self, backend="wav", **bk_args):
  345. super().__init__(backend="wav", **bk_args)
  346. def _init_backend(self, bk_type, bk_args):
  347. """init backend"""
  348. if bk_type == "wav":
  349. return WAVReaderBackend(**bk_args)
  350. else:
  351. raise ValueError("Unsupported backend type")
  352. def read(self, in_path):
  353. audio, audio_sample_rate = self._backend.read_file(str(in_path))
  354. return audio, audio_sample_rate
  355. class _AudioReaderBackend(_BaseReaderBackend):
  356. """_AudioReaderBackend"""
  357. pass
  358. class WAVReaderBackend(_AudioReaderBackend):
  359. """PandasCSVReaderBackend"""
  360. def __init__(self):
  361. super().__init__()
  362. def read_file(self, in_path):
  363. """read wav file from path"""
  364. audio, audio_sample_rate = soundfile.read(
  365. in_path, dtype="float32", always_2d=True
  366. )
  367. return audio, audio_sample_rate