model_outputs.py 82 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511
6611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612
  1. # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. import functools
  15. from collections import OrderedDict
  16. from dataclasses import dataclass, fields
  17. from typing import Any, Optional, Tuple
  18. import numpy as np
  19. import paddle
  20. from paddle import Tensor
  21. from paddle.distributed.fleet.utils import recompute
  22. from paddle.nn import MultiHeadAttention
  23. from paddle.nn.layer.transformer import _convert_attention_mask
  24. from ...tokenizer.tokenizer_utils import adapt_stale_fwd_patch
  25. def tuple_output(outputs: Tuple[Tensor], loss: Optional[Tensor] = None):
  26. """re-construct the outputs with one method which contains the simple logic
  27. Args:
  28. outputs (Tuple[Tensor]): the source of the outputs
  29. loss (Optional[Tensor], optional): the loss of the model. Defaults to None.
  30. """
  31. if loss is not None:
  32. outputs = (loss,) + outputs
  33. if len(outputs) == 1:
  34. return outputs[0]
  35. return outputs
  36. def convert_encoder_output(encoder_output):
  37. """
  38. Convert encoder_output from tuple to class:`~paddlenlp.transformers.model_outputs.BaseModelOutput`.
  39. Args:
  40. encoder_output (tuple or ModelOutput):
  41. The output of the encoder, a tuple consists `last_hidden_state`, `hidden_states`(optional), `attentions`(optional).
  42. The data type of `last_hidden_state` is float32 and its shape is [batch_size, sequence_length, hidden_size].
  43. """
  44. return BaseModelOutput(
  45. last_hidden_state=encoder_output[0],
  46. hidden_states=encoder_output[1] if len(encoder_output) > 1 else None,
  47. attentions=encoder_output[2] if len(encoder_output) > 2 else None,
  48. )
  49. def layer_init_wrapper(func):
  50. @functools.wraps(func)
  51. def _impl(self, *args, **kwargs):
  52. enable_recompute = kwargs.pop("enable_recompute", False)
  53. func(self, *args, **kwargs)
  54. if paddle.in_dynamic_mode():
  55. self.enable_recompute = enable_recompute
  56. else:
  57. self.enable_recompute = False
  58. return _impl
  59. @paddle.jit.not_to_static
  60. def _transformer_encoder_layer_fwd(
  61. self, src, src_mask=None, cache=None, output_attentions=False
  62. ):
  63. self.self_attn.need_weights = output_attentions
  64. src_mask = _convert_attention_mask(src_mask, src.dtype)
  65. residual = src
  66. if self.normalize_before:
  67. src = self.norm1(src)
  68. attn_outputs = self.self_attn(src, src, src, src_mask, cache)
  69. if isinstance(attn_outputs, tuple):
  70. src = attn_outputs[0]
  71. outputs = attn_outputs[1:]
  72. else:
  73. src = attn_outputs
  74. outputs = None
  75. src = residual + self.dropout1(src)
  76. if not self.normalize_before:
  77. src = self.norm1(src)
  78. residual = src
  79. if self.normalize_before:
  80. src = self.norm2(src)
  81. src = self.linear2(self.dropout(self.activation(self.linear1(src))))
  82. src = residual + self.dropout2(src)
  83. if not self.normalize_before:
  84. src = self.norm2(src)
  85. return (
  86. src if outputs is None else ((src,) + outputs[::-1])
  87. ) # hidden_states, cache, attentions
  88. @paddle.jit.not_to_static
  89. def _transformer_decoder_layer_fwd(
  90. self,
  91. tgt,
  92. memory,
  93. tgt_mask=None,
  94. memory_mask=None,
  95. cache=None,
  96. output_attentions=False,
  97. ):
  98. residual = tgt
  99. # self attention
  100. self.self_attn.need_weights = output_attentions
  101. tgt_mask = _convert_attention_mask(tgt_mask, tgt.dtype)
  102. if self.normalize_before:
  103. tgt = self.norm1(tgt)
  104. self_attn_outputs = self.self_attn(
  105. tgt, tgt, tgt, tgt_mask, cache[0] if cache else None
  106. )
  107. # self_attn_outputs = (tgt, attn_weights, incremental_cache) or only tgt
  108. if isinstance(self_attn_outputs, type(tgt)):
  109. tgt = self_attn_outputs
  110. else:
  111. tgt = self_attn_outputs[0]
  112. if output_attentions:
  113. self_attn_weights = self_attn_outputs[1]
  114. if cache:
  115. incremental_cache = self_attn_outputs[-1]
  116. tgt = residual + self.dropout1(tgt)
  117. if not self.normalize_before:
  118. tgt = self.norm1(tgt)
  119. residual = tgt
  120. # cross attention
  121. if memory is not None:
  122. self.cross_attn.need_weights = output_attentions
  123. memory_mask = _convert_attention_mask(memory_mask, memory.dtype)
  124. if self.normalize_before:
  125. tgt = self.norm2(tgt)
  126. cross_attn_outputs = self.cross_attn(
  127. tgt, memory, memory, memory_mask, cache[1] if cache else None
  128. )
  129. if isinstance(cross_attn_outputs, type(tgt)):
  130. tgt = cross_attn_outputs
  131. else:
  132. tgt = cross_attn_outputs[0]
  133. if output_attentions:
  134. cross_attn_weights = cross_attn_outputs[1]
  135. if cache:
  136. static_cache = cross_attn_outputs[-1]
  137. tgt = residual + self.dropout2(tgt)
  138. if not self.normalize_before:
  139. tgt = self.norm2(tgt)
  140. residual = tgt
  141. if self.normalize_before:
  142. tgt = self.norm3(tgt)
  143. tgt = self.linear2(self.dropout(self.activation(self.linear1(tgt))))
  144. tgt = residual + self.dropout3(tgt)
  145. if not self.normalize_before:
  146. tgt = self.norm3(tgt)
  147. if not output_attentions and cache is None:
  148. return tgt
  149. else:
  150. outputs = (tgt,)
  151. if output_attentions:
  152. outputs += (
  153. self_attn_weights,
  154. cross_attn_weights if memory is not None else None,
  155. )
  156. if cache:
  157. outputs += (
  158. (incremental_cache, static_cache if memory is not None else None),
  159. )
  160. return outputs
  161. @paddle.jit.not_to_static
  162. def _transformer_decoder_fwd(
  163. self,
  164. tgt,
  165. memory=None,
  166. tgt_mask=None,
  167. memory_mask=None,
  168. cache=None,
  169. output_attentions=False,
  170. output_hidden_states=False,
  171. return_dict=False,
  172. ):
  173. tgt_mask = _convert_attention_mask(tgt_mask, tgt.dtype)
  174. if memory is not None:
  175. memory_mask = _convert_attention_mask(memory_mask, memory.dtype)
  176. new_caches = [] if cache else None
  177. all_hidden_states = [tgt] if output_hidden_states else None
  178. all_self_attns = [] if output_attentions else None
  179. all_cross_attns = [] if output_attentions else None
  180. for i, mod in enumerate(self.layers):
  181. if cache is None:
  182. # if output has no gradient, recompute is unnecessary
  183. memory_stop_gradient = memory is not None and memory.stop_gradient
  184. has_gradient = (not tgt.stop_gradient) or (not memory_stop_gradient)
  185. if self.enable_recompute and has_gradient:
  186. outputs = recompute(
  187. mod, tgt, memory, tgt_mask, memory_mask, None, output_attentions
  188. )
  189. else:
  190. outputs = mod(
  191. tgt,
  192. memory,
  193. tgt_mask=tgt_mask,
  194. memory_mask=memory_mask,
  195. cache=None,
  196. output_attentions=output_attentions,
  197. )
  198. else:
  199. outputs = mod(
  200. tgt,
  201. memory,
  202. tgt_mask=tgt_mask,
  203. memory_mask=memory_mask,
  204. cache=cache[i] if cache else None,
  205. output_attentions=output_attentions,
  206. )
  207. if isinstance(outputs, type(tgt)):
  208. tgt = outputs
  209. else:
  210. tgt = outputs[0]
  211. if cache:
  212. new_caches.append(outputs[-1])
  213. if output_attentions:
  214. all_self_attns.append(outputs[1])
  215. all_cross_attns.append(outputs[2])
  216. if output_hidden_states:
  217. all_hidden_states.append(tgt)
  218. if self.norm is not None:
  219. tgt = self.norm(tgt)
  220. if output_hidden_states:
  221. all_hidden_states[-1] = tgt
  222. if not return_dict:
  223. if isinstance(outputs, type(tgt)):
  224. return tgt
  225. temp_list = [
  226. tgt,
  227. new_caches if cache else None,
  228. all_hidden_states,
  229. all_self_attns,
  230. all_cross_attns,
  231. ]
  232. return tuple(v for v in temp_list if v is not None)
  233. return BaseModelOutputWithPastAndCrossAttentions(
  234. last_hidden_state=tgt,
  235. past_key_values=new_caches,
  236. hidden_states=all_hidden_states,
  237. attentions=all_self_attns,
  238. cross_attentions=all_cross_attns,
  239. )
  240. @paddle.jit.not_to_static
  241. def _transformer_encoder_fwd(
  242. self,
  243. src,
  244. src_mask=None,
  245. cache=None,
  246. output_attentions=False,
  247. output_hidden_states=False,
  248. return_dict=False,
  249. ):
  250. src_mask = _convert_attention_mask(src_mask, src.dtype)
  251. output = src
  252. # To get cache from None when use_cache is True, which is compatible with HF
  253. # while HF requires decoder. The implementation here uses cache update in the
  254. # MultiHeadAttention not so efficiently, and maybe optimize it later.
  255. if cache is None and getattr(self, "_use_cache", False):
  256. cache = [tuple(self.layers[0].gen_cache(src))] * len(self.layers)
  257. # To be compatible with `TransformerEncoder.forward`, `_use_cache` defaults
  258. # to True when cache is not None.
  259. new_caches = [] if cache is not None and getattr(self, "_use_cache", True) else None
  260. all_attentions = [] if output_attentions else None
  261. # NOTE: Also includes embedding output which is same as HF.
  262. all_hidden_states = [output] if output_hidden_states else None
  263. for i, mod in enumerate(self.layers):
  264. # if output has no gradient, recompute is unnecessary
  265. has_gradient = not output.stop_gradient
  266. if self.enable_recompute and has_gradient:
  267. # Note: recompute do not support pass as **kwargs yet.
  268. layer_outputs = recompute(
  269. mod,
  270. output,
  271. src_mask,
  272. (
  273. None
  274. if cache is None
  275. else (
  276. cache[i]
  277. if isinstance(cache[i], MultiHeadAttention.Cache)
  278. else MultiHeadAttention.Cache(*cache[i])
  279. )
  280. ),
  281. output_attentions,
  282. )
  283. else:
  284. layer_outputs = mod(
  285. output,
  286. src_mask=src_mask,
  287. cache=(
  288. None
  289. if cache is None
  290. else (
  291. cache[i]
  292. if isinstance(cache[i], MultiHeadAttention.Cache)
  293. else MultiHeadAttention.Cache(*cache[i])
  294. )
  295. ),
  296. output_attentions=output_attentions,
  297. )
  298. if isinstance(layer_outputs, tuple):
  299. output = layer_outputs[0]
  300. outputs = layer_outputs[1:]
  301. else:
  302. output = layer_outputs
  303. outputs = None
  304. if output_hidden_states:
  305. all_hidden_states.append(output)
  306. if output_attentions:
  307. all_attentions.append(outputs[-1])
  308. if new_caches is not None:
  309. new_caches.append(
  310. outputs[0]
  311. if isinstance(cache[i], MultiHeadAttention.Cache)
  312. else (tuple(outputs[0]))
  313. )
  314. if self.norm is not None:
  315. output = self.norm(output)
  316. if output_hidden_states:
  317. all_hidden_states[-1] = output
  318. if not return_dict:
  319. outputs = tuple(
  320. tuple(v) if isinstance(v, list) else v
  321. for v in [
  322. output,
  323. new_caches,
  324. all_hidden_states,
  325. all_attentions,
  326. ]
  327. if v is not None
  328. )
  329. if len(outputs) == 1:
  330. return output
  331. else:
  332. return outputs
  333. return BaseModelOutputWithPastAndCrossAttentions(
  334. last_hidden_state=output,
  335. past_key_values=new_caches,
  336. hidden_states=all_hidden_states,
  337. attentions=all_attentions,
  338. )
  339. _transformer_encoder_fwd.__name__ = "forward"
  340. _transformer_encoder_layer_fwd.__name__ = "forward"
  341. # patches of paddle.nn.Transformer to get all hidden_states and attentions
  342. paddle.nn.TransformerEncoderLayer.forward = _transformer_encoder_layer_fwd
  343. paddle.nn.TransformerDecoderLayer.forward = _transformer_decoder_layer_fwd
  344. paddle.nn.TransformerEncoder.forward = _transformer_encoder_fwd
  345. paddle.nn.TransformerDecoder.forward = _transformer_decoder_fwd
  346. _encoder_init = paddle.nn.TransformerEncoder.__init__
  347. _decoder_init = paddle.nn.TransformerDecoder.__init__
  348. paddle.nn.TransformerEncoder.__init__ = layer_init_wrapper(_encoder_init)
  349. paddle.nn.TransformerDecoder.__init__ = layer_init_wrapper(_decoder_init)
  350. def _get_wrap_setattr(cls):
  351. def _wrap_setattr(self, name, value):
  352. value = adapt_stale_fwd_patch(self, name, value)
  353. return super(cls, self).__setattr__(name, value)
  354. return _wrap_setattr
  355. paddle.nn.TransformerEncoderLayer.__setattr__ = functools.wraps(
  356. paddle.nn.TransformerEncoderLayer.__setattr__
  357. )(_get_wrap_setattr(paddle.nn.TransformerEncoderLayer))
  358. paddle.nn.TransformerEncoder.__setattr__ = functools.wraps(
  359. paddle.nn.TransformerEncoder.__setattr__
  360. )(_get_wrap_setattr(paddle.nn.TransformerEncoder))
  361. paddle.nn.TransformerDecoder.__setattr__ = functools.wraps(
  362. paddle.nn.TransformerDecoder.__setattr__
  363. )(_get_wrap_setattr(paddle.nn.TransformerDecoder))
  364. def is_tensor(x):
  365. if isinstance(x, paddle.Tensor):
  366. return True
  367. return isinstance(x, np.ndarray)
  368. class ModelOutput(OrderedDict):
  369. """
  370. Base class for all model outputs as dataclass. Has a `__getitem__` that allows indexing by integer or slice (like a
  371. tuple) or strings (like a dictionary) that will ignore the `None` attributes. Otherwise behaves like a regular
  372. python dictionary.
  373. <Tip warning={true}>
  374. You can't unpack a `ModelOutput` directly. Use the [`~utils.ModelOutput.to_tuple`] method to convert it to a tuple
  375. before.
  376. </Tip>
  377. """
  378. def __post_init__(self):
  379. class_fields = fields(self)
  380. # note(guosheng): Convert list to tuple automatically, and better to
  381. # check if it is frozen.
  382. # assert not getattr(self, dataclasses._PARAMS).frozen
  383. for f in class_fields:
  384. value = getattr(self, f.name)
  385. if isinstance(value, list):
  386. setattr(self, f.name, tuple(value))
  387. # Safety and consistency checks
  388. if not len(class_fields):
  389. raise ValueError(f"{self.__class__.__name__} has no fields.")
  390. if not all(field.default is None for field in class_fields[1:]):
  391. raise ValueError(
  392. f"{self.__class__.__name__} should not have more than one required field."
  393. )
  394. first_field = getattr(self, class_fields[0].name)
  395. other_fields_are_none = all(
  396. getattr(self, field.name) is None for field in class_fields[1:]
  397. )
  398. if other_fields_are_none and not is_tensor(first_field):
  399. if isinstance(first_field, dict):
  400. iterator = first_field.items()
  401. first_field_iterator = True
  402. else:
  403. try:
  404. iterator = iter(first_field)
  405. first_field_iterator = True
  406. except TypeError:
  407. first_field_iterator = False
  408. # if we provided an iterator as first field and the iterator is a (key, value) iterator
  409. # set the associated fields
  410. if first_field_iterator:
  411. for element in iterator:
  412. if (
  413. not isinstance(element, (list, tuple))
  414. or not len(element) == 2
  415. or not isinstance(element[0], str)
  416. ):
  417. break
  418. setattr(self, element[0], element[1])
  419. if element[1] is not None:
  420. self[element[0]] = element[1]
  421. elif first_field is not None:
  422. self[class_fields[0].name] = first_field
  423. else:
  424. for field in class_fields:
  425. v = getattr(self, field.name)
  426. if v is not None:
  427. self[field.name] = v
  428. def __delitem__(self, *args, **kwargs):
  429. raise Exception(
  430. f"You cannot use ``__delitem__`` on a {self.__class__.__name__} instance."
  431. )
  432. def setdefault(self, *args, **kwargs):
  433. raise Exception(
  434. f"You cannot use ``setdefault`` on a {self.__class__.__name__} instance."
  435. )
  436. def pop(self, *args, **kwargs):
  437. raise Exception(
  438. f"You cannot use ``pop`` on a {self.__class__.__name__} instance."
  439. )
  440. def update(self, *args, **kwargs):
  441. raise Exception(
  442. f"You cannot use ``update`` on a {self.__class__.__name__} instance."
  443. )
  444. def __getitem__(self, k):
  445. if isinstance(k, str):
  446. inner_dict = {k: v for (k, v) in self.items()}
  447. return inner_dict[k]
  448. else:
  449. return self.to_tuple()[k]
  450. def __setattr__(self, name, value):
  451. if name in self.keys() and value is not None:
  452. # Don't call self.__setitem__ to avoid recursion errors
  453. super().__setitem__(name, value)
  454. super().__setattr__(name, value)
  455. def __setitem__(self, key, value):
  456. # Will raise a KeyException if needed
  457. super().__setitem__(key, value)
  458. # Don't call self.__setattr__ to avoid recursion errors
  459. super().__setattr__(key, value)
  460. def to_tuple(self) -> Tuple[Any]:
  461. """
  462. Convert self to a tuple containing all the attributes/keys that are not `None`.
  463. """
  464. # try to fix: https://github.com/PaddlePaddle/PaddleNLP/issues/3355
  465. # when trying to get the keys of `OrderedDict`, `keys` method return empty values.
  466. # TODO(wj-Mcat): this bug should be fixed in Paddle framework
  467. tuples = ()
  468. for field in fields(self):
  469. if getattr(self, field.name, None) is None:
  470. continue
  471. tuples = tuples + (getattr(self, field.name),)
  472. return tuples
  473. @dataclass
  474. class BaseModelOutput(ModelOutput):
  475. """
  476. Base class for model's outputs, with potential hidden states and attentions.
  477. Args:
  478. last_hidden_state (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
  479. Sequence of hidden-states at the output of the last layer of the model.
  480. hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
  481. Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, +
  482. one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
  483. Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
  484. attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
  485. Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
  486. sequence_length)`.
  487. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
  488. heads.
  489. """
  490. last_hidden_state: paddle.Tensor = None
  491. hidden_states: Optional[Tuple[paddle.Tensor]] = None
  492. attentions: Optional[Tuple[paddle.Tensor]] = None
  493. @dataclass
  494. class BaseModelOutputWithNoAttention(ModelOutput):
  495. """
  496. Base class for model's outputs, with potential hidden states.
  497. Args:
  498. last_hidden_state (`paddle.Tensor` of shape `(batch_size, num_channels, height, width)`):
  499. Sequence of hidden-states at the output of the last layer of the model.
  500. hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
  501. Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, +
  502. one for the output of each layer) of shape `(batch_size, num_channels, height, width)`.
  503. Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
  504. """
  505. last_hidden_state: paddle.Tensor = None
  506. hidden_states: Optional[Tuple[paddle.Tensor]] = None
  507. @dataclass
  508. class BaseModelOutputWithPooling(ModelOutput):
  509. """
  510. Base class for model's outputs that also contains a pooling of the last hidden states.
  511. Args:
  512. last_hidden_state (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
  513. Sequence of hidden-states at the output of the last layer of the model.
  514. pooler_output (`paddle.Tensor` of shape `(batch_size, hidden_size)`):
  515. Last layer hidden-state of the first token of the sequence (classification token) after further processing
  516. through the layers used for the auxiliary pretraining task. E.g. for BERT-family of models, this returns
  517. the classification token after processing through a linear layer and a tanh activation function. The linear
  518. layer weights are trained from the next sentence prediction (classification) objective during pretraining.
  519. hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
  520. Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, +
  521. one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
  522. Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
  523. attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
  524. Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
  525. sequence_length)`.
  526. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
  527. heads.
  528. """
  529. last_hidden_state: paddle.Tensor = None
  530. pooler_output: paddle.Tensor = None
  531. hidden_states: Optional[Tuple[paddle.Tensor]] = None
  532. attentions: Optional[Tuple[paddle.Tensor]] = None
  533. @dataclass
  534. class BaseModelOutputWithPast(ModelOutput):
  535. """
  536. Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding).
  537. Args:
  538. last_hidden_state (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
  539. Sequence of hidden-states at the output of the last layer of the model.
  540. If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
  541. hidden_size)` is output.
  542. past_key_values (`tuple(tuple(paddle.Tensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
  543. Tuple of `tuple(paddle.Tensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
  544. `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if
  545. `config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads,
  546. encoder_sequence_length, embed_size_per_head)`.
  547. Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
  548. `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
  549. input) to speed up sequential decoding.
  550. hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
  551. Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, +
  552. one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
  553. Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
  554. attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
  555. Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
  556. sequence_length)`.
  557. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
  558. heads.
  559. """
  560. last_hidden_state: paddle.Tensor = None
  561. past_key_values: Optional[Tuple[Tuple[paddle.Tensor]]] = None
  562. hidden_states: Optional[Tuple[paddle.Tensor]] = None
  563. attentions: Optional[Tuple[paddle.Tensor]] = None
  564. @dataclass
  565. class BaseModelOutputWithPastAndCrossAttentions(ModelOutput):
  566. """
  567. Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding).
  568. Args:
  569. last_hidden_state (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
  570. Sequence of hidden-states at the output of the last layer of the model.
  571. If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
  572. hidden_size)` is output.
  573. past_key_values (`tuple(tuple(paddle.Tensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
  574. Tuple of `tuple(paddle.Tensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
  575. `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if
  576. `config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads,
  577. encoder_sequence_length, embed_size_per_head)`.
  578. Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
  579. `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
  580. input) to speed up sequential decoding.
  581. hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
  582. Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, +
  583. one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
  584. Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
  585. attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
  586. Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
  587. sequence_length)`.
  588. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
  589. heads.
  590. cross_attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`):
  591. Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
  592. sequence_length)`.
  593. Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
  594. weighted average in the cross-attention heads.
  595. cum_offsets (`tuple(paddle.Tensor)`, *optional*, needed when `return_full_hidden_states=True`:
  596. Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, 1)`.
  597. Offset of the current batch.
  598. """
  599. last_hidden_state: paddle.Tensor = None
  600. past_key_values: Optional[Tuple[Tuple[paddle.Tensor]]] = None
  601. hidden_states: Optional[Tuple[paddle.Tensor]] = None
  602. attentions: Optional[Tuple[paddle.Tensor]] = None
  603. cross_attentions: Optional[Tuple[paddle.Tensor]] = None
  604. cum_offsets: Optional[Tuple[paddle.Tensor]] = None
  605. @dataclass
  606. class BaseModelOutputWithPastAndMTP(ModelOutput):
  607. """
  608. Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding).
  609. Args:
  610. last_hidden_state (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
  611. Sequence of hidden-states at the output of the last layer of the model.
  612. If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
  613. hidden_size)` is output.
  614. past_key_values (`tuple(tuple(paddle.Tensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
  615. Tuple of `tuple(paddle.Tensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
  616. `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if
  617. `config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads,
  618. encoder_sequence_length, embed_size_per_head)`.
  619. Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
  620. `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
  621. input) to speed up sequential decoding.
  622. hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
  623. Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, +
  624. one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
  625. Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
  626. attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
  627. Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
  628. sequence_length)`.
  629. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
  630. heads.
  631. mtp_outputs (`tuple(paddle.Tensor)`, *optional*):
  632. MTP Layers outputs, used to compute the mtp loss.
  633. heads.
  634. """
  635. last_hidden_state: paddle.Tensor = None
  636. past_key_values: Optional[Tuple[Tuple[paddle.Tensor]]] = None
  637. hidden_states: Optional[Tuple[paddle.Tensor]] = None
  638. attentions: Optional[Tuple[paddle.Tensor]] = None
  639. mtp_outputs: Optional[Tuple[paddle.Tensor]] = None
  640. @dataclass
  641. class BaseModelOutputWithPoolingAndCrossAttentions(ModelOutput):
  642. """
  643. Base class for model's outputs that also contains a pooling of the last hidden states.
  644. Args:
  645. last_hidden_state (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
  646. Sequence of hidden-states at the output of the last layer of the model.
  647. pooler_output (`paddle.Tensor` of shape `(batch_size, hidden_size)`):
  648. Last layer hidden-state of the first token of the sequence (classification token) after further processing
  649. through the layers used for the auxiliary pretraining task. E.g. for BERT-family of models, this returns
  650. the classification token after processing through a linear layer and a tanh activation function. The linear
  651. layer weights are trained from the next sentence prediction (classification) objective during pretraining.
  652. hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
  653. Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, +
  654. one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
  655. Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
  656. attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
  657. Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
  658. sequence_length)`.
  659. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
  660. heads.
  661. cross_attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`):
  662. Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
  663. sequence_length)`.
  664. Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
  665. weighted average in the cross-attention heads.
  666. past_key_values (`tuple(tuple(paddle.Tensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
  667. Tuple of `tuple(paddle.Tensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
  668. `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if
  669. `config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads,
  670. encoder_sequence_length, embed_size_per_head)`.
  671. Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
  672. `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
  673. input) to speed up sequential decoding.
  674. """
  675. last_hidden_state: paddle.Tensor = None
  676. pooler_output: paddle.Tensor = None
  677. past_key_values: Optional[Tuple[Tuple[paddle.Tensor]]] = None
  678. hidden_states: Optional[Tuple[paddle.Tensor]] = None
  679. attentions: Optional[Tuple[paddle.Tensor]] = None
  680. cross_attentions: Optional[Tuple[paddle.Tensor]] = None
  681. @dataclass
  682. class SequenceClassifierOutput(ModelOutput):
  683. """
  684. Base class for outputs of sentence classification models.
  685. Args:
  686. loss (`paddle.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
  687. Classification (or regression if config.num_labels==1) loss.
  688. logits (`paddle.Tensor` of shape `(batch_size, config.num_labels)`):
  689. Classification (or regression if config.num_labels==1) scores (before SoftMax).
  690. hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
  691. Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, +
  692. one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
  693. Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
  694. attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
  695. Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
  696. sequence_length)`.
  697. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
  698. heads.
  699. """
  700. loss: Optional[paddle.Tensor] = None
  701. logits: paddle.Tensor = None
  702. hidden_states: Optional[Tuple[paddle.Tensor]] = None
  703. attentions: Optional[Tuple[paddle.Tensor]] = None
  704. @dataclass
  705. class TokenClassifierOutput(ModelOutput):
  706. """
  707. Base class for outputs of token classification models.
  708. Args:
  709. loss (`paddle.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided) :
  710. Classification loss.
  711. logits (`paddle.Tensor` of shape `(batch_size, sequence_length, config.num_labels)`):
  712. Classification scores (before SoftMax).
  713. hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
  714. Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, +
  715. one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
  716. Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
  717. attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
  718. Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
  719. sequence_length)`.
  720. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
  721. heads.
  722. """
  723. loss: Optional[paddle.Tensor] = None
  724. logits: paddle.Tensor = None
  725. hidden_states: Optional[Tuple[paddle.Tensor]] = None
  726. attentions: Optional[Tuple[paddle.Tensor]] = None
  727. @dataclass
  728. class QuestionAnsweringModelOutput(ModelOutput):
  729. """
  730. Base class for outputs of question answering models.
  731. Args:
  732. loss (`paddle.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
  733. Total span extraction loss is the sum of a Cross-Entropy for the start and end positions.
  734. start_logits (`paddle.Tensor` of shape `(batch_size, sequence_length)`):
  735. Span-start scores (before SoftMax).
  736. end_logits (`paddle.Tensor` of shape `(batch_size, sequence_length)`):
  737. Span-end scores (before SoftMax).
  738. hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
  739. Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, +
  740. one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
  741. Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
  742. attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
  743. Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
  744. sequence_length)`.
  745. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
  746. heads.
  747. """
  748. loss: Optional[paddle.Tensor] = None
  749. start_logits: paddle.Tensor = None
  750. end_logits: paddle.Tensor = None
  751. hidden_states: Optional[Tuple[paddle.Tensor]] = None
  752. attentions: Optional[Tuple[paddle.Tensor]] = None
  753. @dataclass
  754. class MultipleChoiceModelOutput(ModelOutput):
  755. """
  756. Base class for outputs of multiple choice models.
  757. Args:
  758. loss (`paddle.Tensor` of shape *(1,)*, *optional*, returned when `labels` is provided):
  759. Classification loss.
  760. logits (`paddle.Tensor` of shape `(batch_size, num_choices)`):
  761. *num_choices* is the second dimension of the input tensors. (see *input_ids* above).
  762. Classification scores (before SoftMax).
  763. hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
  764. Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, +
  765. one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
  766. Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
  767. attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
  768. Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
  769. sequence_length)`.
  770. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
  771. heads.
  772. """
  773. loss: Optional[paddle.Tensor] = None
  774. logits: paddle.Tensor = None
  775. hidden_states: Optional[Tuple[paddle.Tensor]] = None
  776. attentions: Optional[Tuple[paddle.Tensor]] = None
  777. @dataclass
  778. class MaskedLMOutput(ModelOutput):
  779. """
  780. Base class for masked language models outputs.
  781. Args:
  782. loss (`paddle.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
  783. Masked language modeling (MLM) loss.
  784. logits (`paddle.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
  785. Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
  786. hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
  787. Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, +
  788. one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
  789. Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
  790. attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
  791. Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
  792. sequence_length)`.
  793. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
  794. heads.
  795. """
  796. loss: Optional[paddle.Tensor] = None
  797. logits: paddle.Tensor = None
  798. hidden_states: Optional[Tuple[paddle.Tensor]] = None
  799. attentions: Optional[Tuple[paddle.Tensor]] = None
  800. @dataclass
  801. class CausalLMOutputWithPast(ModelOutput):
  802. """
  803. Base class for causal language model (or autoregressive) outputs.
  804. Args:
  805. loss (`paddle.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
  806. Language modeling loss (for next-token prediction).
  807. logits (`paddle.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
  808. Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
  809. past_key_values (`tuple(tuple(paddle.Tensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
  810. Tuple of `paddle.Tensor` tuples of length `config.n_layers`, with each tuple containing the cached key,
  811. value states of the self-attention and the cross-attention layers if model is used in encoder-decoder
  812. setting. Only relevant if `config.is_decoder = True`.
  813. Contains pre-computed hidden-states (key and values in the attention blocks) that can be used (see
  814. `past_key_values` input) to speed up sequential decoding.
  815. hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
  816. Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, +
  817. one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
  818. Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
  819. attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
  820. Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
  821. sequence_length)`.
  822. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
  823. heads.
  824. """
  825. loss: Optional[paddle.Tensor] = None
  826. logits: paddle.Tensor = None
  827. past_key_values: Optional[Tuple[Tuple[paddle.Tensor]]] = None
  828. hidden_states: Optional[Tuple[paddle.Tensor]] = None
  829. attentions: Optional[Tuple[paddle.Tensor]] = None
  830. @dataclass
  831. class CausalLMOutputWithCrossAttentions(ModelOutput):
  832. """
  833. Base class for causal language model (or autoregressive) outputs.
  834. Args:
  835. loss (`paddle.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
  836. Language modeling loss (for next-token prediction).
  837. logits (`paddle.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
  838. Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
  839. hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
  840. Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, +
  841. one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
  842. Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
  843. attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
  844. Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
  845. sequence_length)`.
  846. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
  847. heads.
  848. cross_attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
  849. Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
  850. sequence_length)`.
  851. Cross attentions weights after the attention softmax, used to compute the weighted average in the
  852. cross-attention heads.
  853. past_key_values (`tuple(tuple(paddle.Tensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
  854. Tuple of `paddle.Tensor` tuples of length `config.n_layers`, with each tuple containing the cached key,
  855. value states of the self-attention and the cross-attention layers if model is used in encoder-decoder
  856. setting. Only relevant if `config.is_decoder = True`.
  857. Contains pre-computed hidden-states (key and values in the attention blocks) that can be used (see
  858. `past_key_values` input) to speed up sequential decoding.
  859. """
  860. loss: Optional[paddle.Tensor] = None
  861. logits: paddle.Tensor = None
  862. past_key_values: Optional[Tuple[Tuple[paddle.Tensor]]] = None
  863. hidden_states: Optional[Tuple[paddle.Tensor]] = None
  864. attentions: Optional[Tuple[paddle.Tensor]] = None
  865. cross_attentions: Optional[Tuple[paddle.Tensor]] = None
  866. @dataclass
  867. class Seq2SeqModelOutput(ModelOutput):
  868. """
  869. Base class for model encoder's outputs that also contains : pre-computed hidden states that can speed up sequential
  870. decoding.
  871. Args:
  872. last_hidden_state (`paddle.Tensor`):
  873. Sequence of hidden-states at the output of the last layer of the decoder of the model, whose shape is `(batch_size, Sequence_length, hidden_size)`.
  874. If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
  875. hidden_size)` is output.
  876. past_key_values (`tuple(tuple(paddle.Tensor))`, optional):
  877. Tuple of `tuple(paddle.Tensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
  878. `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
  879. `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
  880. Returned when `use_cache=True` is passed or when `config.use_cache=True`.
  881. Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
  882. blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
  883. decoder_hidden_states (`tuple(paddle.Tensor)`, optional):
  884. Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, +
  885. one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
  886. Returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`.
  887. Hidden-states of the decoder at the output of each layer plus the optional initial embedding outputs.
  888. decoder_attentions (`tuple(paddle.Tensor)`, optional):
  889. Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
  890. sequence_length)`.
  891. Returned when `output_attentions=True` is passed or when `config.output_attentions=True`.
  892. Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
  893. self-attention heads.
  894. cross_attentions (`tuple(paddle.Tensor)`, optional):
  895. Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
  896. sequence_length)`.
  897. Returned when `output_attentions=True` is passed or when `config.output_attentions=True`.
  898. Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
  899. weighted average in the cross-attention heads.
  900. encoder_last_hidden_state (`paddle.Tensor`, optional):
  901. Sequence of hidden-states at the output of the last layer of the encoder of the model whose shape is `(batch_size, sequence_length, hidden_size)`,
  902. encoder_hidden_states (`tuple(paddle.Tensor)`, optional):
  903. Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, +
  904. one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
  905. Returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`.
  906. Hidden-states of the encoder at the output of each layer plus the optional initial embedding outputs.
  907. encoder_attentions (`tuple(paddle.Tensor)`, optional):
  908. Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
  909. sequence_length)`.
  910. Returned when `output_attentions=True` is passed or when `config.output_attentions=True`.
  911. Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
  912. self-attention heads.
  913. """
  914. last_hidden_state: paddle.Tensor = None
  915. past_key_values: Optional[Tuple[Tuple[paddle.Tensor]]] = None
  916. decoder_hidden_states: Optional[Tuple[paddle.Tensor]] = None
  917. decoder_attentions: Optional[Tuple[paddle.Tensor]] = None
  918. cross_attentions: Optional[Tuple[paddle.Tensor]] = None
  919. encoder_last_hidden_state: Optional[paddle.Tensor] = None
  920. encoder_hidden_states: Optional[Tuple[paddle.Tensor]] = None
  921. encoder_attentions: Optional[Tuple[paddle.Tensor]] = None

@dataclass
class Seq2SeqLMOutput(ModelOutput):
    """
    Base class for sequence-to-sequence language models outputs.

    Args:
        loss (`paddle.Tensor`, optional):
            Language modeling loss whose shape is `(1,)`. Returned when `labels` is provided.
        logits (`paddle.Tensor`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax) whose
            shape is `(batch_size, sequence_length, config.vocab_size)`.
        past_key_values (`tuple(tuple(paddle.Tensor))`, optional):
            Tuple of `tuple(paddle.Tensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)` and 2 additional tensors of shape
            `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
            Returned when `use_cache=True` is passed or when `config.use_cache=True`.
            Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
            blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
        decoder_hidden_states (`tuple(paddle.Tensor)`, optional):
            Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
            Returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`.
            Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
        decoder_attentions (`tuple(paddle.Tensor)`, optional):
            Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.
            Returned when `output_attentions=True` is passed or when `config.output_attentions=True`.
            Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
            self-attention heads.
        cross_attentions (`tuple(paddle.Tensor)`, optional):
            Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.
            Returned when `output_attentions=True` is passed or when `config.output_attentions=True`.
            Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
            weighted average in the cross-attention heads.
        encoder_last_hidden_state (`paddle.Tensor`, optional):
            Sequence of hidden-states at the output of the last layer of the encoder of the model whose shape is
            `(batch_size, sequence_length, hidden_size)`.
        encoder_hidden_states (`tuple(paddle.Tensor)`, optional):
            Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
            Returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`.
            Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
        encoder_attentions (`tuple(paddle.Tensor)`, optional):
            Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.
            Returned when `output_attentions=True` is passed or when `config.output_attentions=True`.
            Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
            self-attention heads.
    """

    loss: Optional[paddle.Tensor] = None
    logits: paddle.Tensor = None
    past_key_values: Optional[Tuple[Tuple[paddle.Tensor]]] = None
    decoder_hidden_states: Optional[Tuple[paddle.Tensor]] = None
    decoder_attentions: Optional[Tuple[paddle.Tensor]] = None
    cross_attentions: Optional[Tuple[paddle.Tensor]] = None
    encoder_last_hidden_state: Optional[paddle.Tensor] = None
    encoder_hidden_states: Optional[Tuple[paddle.Tensor]] = None
    encoder_attentions: Optional[Tuple[paddle.Tensor]] = None
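
# Illustrative sketch (not part of the original file): typical consumption of a
# Seq2SeqLMOutput. `model`, `input_ids`, and `labels` are hypothetical placeholders.
#
#     out = model(input_ids=input_ids, labels=labels, use_cache=True)
#     training_loss = out.loss                    # scalar loss, shape (1,)
#     next_token_scores = out.logits[:, -1, :]    # (batch_size, vocab_size) for the last position
#     cache = out.past_key_values                 # pass back in to speed up the next decoding step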

@dataclass
class Seq2SeqQuestionAnsweringModelOutput(ModelOutput):
    """
    Base class for outputs of sequence-to-sequence question answering models.

    Args:
        loss (`paddle.Tensor`, optional):
            Total span extraction loss is the sum of a Cross-Entropy for the start and end positions.
            A Tensor of shape `(1,)`, returned when `labels` is provided.
        start_logits (`paddle.Tensor`):
            Span-start scores (before SoftMax). Tensor of shape `(batch_size, sequence_length)`.
        end_logits (`paddle.Tensor`):
            Span-end scores (before SoftMax). Tensor of shape `(batch_size, sequence_length)`.
        past_key_values (`tuple(tuple(paddle.Tensor))`, optional):
            Tuple of `tuple(paddle.Tensor)` of length `n_layers`, with each tuple having 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)` and 2 additional tensors of shape
            `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
            Returned when `use_cache=True` is passed.
            Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
            blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
        decoder_hidden_states (`tuple(paddle.Tensor)`, optional):
            Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
            Returned when `output_hidden_states=True` is passed.
            Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
        decoder_attentions (`tuple(paddle.Tensor)`, optional):
            Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`. Returned when `output_attentions=True` is passed.
            Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
            self-attention heads.
        cross_attentions (`tuple(paddle.Tensor)`, optional):
            Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`. Returned when `output_attentions=True` is passed.
            Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
            weighted average in the cross-attention heads.
        encoder_last_hidden_state (`paddle.Tensor`, optional):
            Sequence of hidden-states at the output of the last layer of the encoder of the model.
            Tensor of shape `(batch_size, sequence_length, hidden_size)`.
        encoder_hidden_states (`tuple(paddle.Tensor)`, optional):
            Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
            Returned when `output_hidden_states=True` is passed.
            Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
        encoder_attentions (`tuple(paddle.Tensor)`, optional):
            Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`. Returned when `output_attentions=True` is passed.
            Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
            self-attention heads.
    """

    loss: Optional[paddle.Tensor] = None
    start_logits: paddle.Tensor = None
    end_logits: paddle.Tensor = None
    past_key_values: Optional[Tuple[Tuple[paddle.Tensor]]] = None
    decoder_hidden_states: Optional[Tuple[paddle.Tensor]] = None
    decoder_attentions: Optional[Tuple[paddle.Tensor]] = None
    cross_attentions: Optional[Tuple[paddle.Tensor]] = None
    encoder_last_hidden_state: Optional[paddle.Tensor] = None
    encoder_hidden_states: Optional[Tuple[paddle.Tensor]] = None
    encoder_attentions: Optional[Tuple[paddle.Tensor]] = None
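
# Illustrative sketch (not part of the original file): naive greedy span selection from a
# Seq2SeqQuestionAnsweringModelOutput. `out` is assumed to come from a model forward pass;
# `input_ids` and `tokenizer` are hypothetical placeholders.
#
#     start = int(paddle.argmax(out.start_logits, axis=-1)[0])
#     end = int(paddle.argmax(out.end_logits, axis=-1)[0])
#     answer_ids = input_ids[0, start : end + 1]
#     answer = tokenizer.decode(answer_ids)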

@dataclass
class Seq2SeqSequenceClassifierOutput(ModelOutput):
    """
    Base class for outputs of sequence-to-sequence sentence classification models.

    Args:
        loss (`paddle.Tensor`, optional):
            Classification (or regression if config.num_labels==1) loss of shape `(1,)`. Returned when `labels` is provided.
        logits (`paddle.Tensor`):
            Classification (or regression if config.num_labels==1) scores (before SoftMax) of shape
            `(batch_size, config.num_labels)`.
        past_key_values (`tuple(tuple(paddle.Tensor))`, optional):
            Tuple of `tuple(paddle.Tensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)` and 2 additional tensors of shape
            `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
            Returned when `use_cache=True` is passed.
            Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
            blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
        decoder_hidden_states (`tuple(paddle.Tensor)`, optional):
            Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
            Returned when `output_hidden_states=True` is passed.
            Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
        decoder_attentions (`tuple(paddle.Tensor)`, optional):
            Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`. Returned when `output_attentions=True` is passed.
            Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
            self-attention heads.
        cross_attentions (`tuple(paddle.Tensor)`, optional):
            Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`. Returned when `output_attentions=True` is passed.
            Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
            weighted average in the cross-attention heads.
        encoder_last_hidden_state (`paddle.Tensor`, optional):
            Sequence of hidden-states at the output of the last layer of the encoder of the model.
            Tensor of shape `(batch_size, sequence_length, hidden_size)`.
        encoder_hidden_states (`tuple(paddle.Tensor)`, optional):
            Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
            Returned when `output_hidden_states=True` is passed.
            Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
        encoder_attentions (`tuple(paddle.Tensor)`, optional):
            Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.
            Returned when `output_attentions=True` is passed.
            Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
            self-attention heads.
    """

    loss: Optional[paddle.Tensor] = None
    logits: paddle.Tensor = None
    past_key_values: Optional[Tuple[Tuple[paddle.Tensor]]] = None
    decoder_hidden_states: Optional[Tuple[paddle.Tensor]] = None
    decoder_attentions: Optional[Tuple[paddle.Tensor]] = None
    cross_attentions: Optional[Tuple[paddle.Tensor]] = None
    encoder_last_hidden_state: Optional[paddle.Tensor] = None
    encoder_hidden_states: Optional[Tuple[paddle.Tensor]] = None
    encoder_attentions: Optional[Tuple[paddle.Tensor]] = None
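
# Illustrative sketch (not part of the original file): turning the classification logits of a
# Seq2SeqSequenceClassifierOutput into probabilities and predicted label ids.
#
#     probs = paddle.nn.functional.softmax(out.logits, axis=-1)   # (batch_size, num_labels)
#     preds = paddle.argmax(out.logits, axis=-1)                  # (batch_size,)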

@dataclass
class SequenceClassifierOutputWithPast(ModelOutput):
    """
    Base class for outputs of sentence classification models.

    Args:
        loss (`paddle.Tensor`, optional):
            Classification (or regression if config.num_labels==1) loss whose shape is `(1,)`.
            Returned when `labels` is provided.
        logits (`paddle.Tensor`):
            Classification (or regression if config.num_labels==1) scores (before SoftMax)
            whose shape is `(batch_size, num_labels)`.
        past_key_values (`tuple(tuple(paddle.Tensor))`, optional):
            Tuple of `tuple(paddle.Tensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)`.
            Returned when `use_cache=True` is passed or when `config.use_cache=True`.
            Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
            `past_key_values` input) to speed up sequential decoding.
        hidden_states (`tuple(paddle.Tensor)`, optional):
            Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
            Returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`.
            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(paddle.Tensor)`, optional):
            Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`. Returned when `output_attentions=True` is passed or when `config.output_attentions=True`.
            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    loss: Optional[paddle.Tensor] = None
    logits: paddle.Tensor = None
    past_key_values: Optional[Tuple[Tuple[paddle.Tensor]]] = None
    hidden_states: Optional[Tuple[paddle.Tensor]] = None
    attentions: Optional[Tuple[paddle.Tensor]] = None
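
# Illustrative sketch (not part of the original file), assuming the ModelOutput base class
# defined earlier in this file behaves like the usual dict/tuple hybrid (attribute access,
# key access, and a `to_tuple()` that drops None fields):
#
#     out = SequenceClassifierOutputWithPast(logits=paddle.zeros([2, 3]))
#     assert out.logits is out["logits"]   # attribute and key access return the same tensor
#     (logits,) = out.to_tuple()           # None-valued fields are omitted from the tuple form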

@dataclass
class BackboneOutput(ModelOutput):
    """
    Base class for outputs of backbones.

    Args:
        feature_maps (`tuple(paddle.Tensor)` of shape `(batch_size, num_channels, height, width)`):
            Feature maps of the stages.
        hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `paddle.Tensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)` or `(batch_size, num_channels, height, width)`,
            depending on the backbone.
            Hidden-states of the model at the output of each stage plus the initial embedding outputs.
        attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`. Only applicable if the backbone uses attention.
            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    feature_maps: Tuple[paddle.Tensor] = None
    hidden_states: Optional[Tuple[paddle.Tensor]] = None
    attentions: Optional[Tuple[paddle.Tensor]] = None
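
# Illustrative sketch (not part of the original file): iterating over the multi-scale feature
# maps of a BackboneOutput, e.g. to feed an FPN-style neck. `backbone` and `pixel_values`
# are hypothetical placeholders.
#
#     out = backbone(pixel_values)
#     for level, fmap in enumerate(out.feature_maps):
#         print(level, fmap.shape)   # (batch_size, num_channels, height, width) per stage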

@dataclass
class BaseModelOutputWithPoolingAndNoAttention(ModelOutput):
    """
    Base class for model's outputs that also contains a pooling of the last hidden states.

    Args:
        last_hidden_state (`paddle.Tensor` of shape `(batch_size, num_channels, height, width)`):
            Sequence of hidden-states at the output of the last layer of the model.
        pooler_output (`paddle.Tensor` of shape `(batch_size, hidden_size)`):
            Last layer hidden-state after a pooling operation on the spatial dimensions.
        hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, num_channels, height, width)`.
            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
    """

    last_hidden_state: paddle.Tensor = None
    pooler_output: paddle.Tensor = None
    hidden_states: Optional[Tuple[paddle.Tensor]] = None
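
# Illustrative sketch (not part of the original file): for convolutional encoders, the pooled
# representation in BaseModelOutputWithPoolingAndNoAttention is what a classification head
# typically consumes. `classifier_head` is a hypothetical layer.
#
#     pooled = out.pooler_output         # (batch_size, hidden_size)
#     logits = classifier_head(pooled)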

@dataclass
class ImageClassifierOutputWithNoAttention(ModelOutput):
    """
    Base class for outputs of image classification models.

    Args:
        loss (`paddle.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Classification (or regression if config.num_labels==1) loss.
        logits (`paddle.Tensor` of shape `(batch_size, config.num_labels)`):
            Classification (or regression if config.num_labels==1) scores (before SoftMax).
        hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each stage) of shape `(batch_size, num_channels, height, width)`. Hidden-states (also
            called feature maps) of the model at the output of each stage.
    """

    loss: Optional[paddle.Tensor] = None
    logits: paddle.Tensor = None
    hidden_states: Optional[Tuple[paddle.Tensor]] = None
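
# Illustrative sketch (not part of the original file): reading the top-1 class from an
# ImageClassifierOutputWithNoAttention. `id2label` is a hypothetical id-to-name mapping.
#
#     pred_id = int(paddle.argmax(out.logits, axis=-1)[0])
#     label = id2label[pred_id]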

@dataclass
class DepthEstimatorOutput(ModelOutput):
    """
    Base class for outputs of depth estimation models.

    Args:
        loss (`paddle.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Classification (or regression if config.num_labels==1) loss.
        predicted_depth (`paddle.Tensor` of shape `(batch_size, height, width)`):
            Predicted depth for each pixel.
        hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, num_channels, height, width)`.
            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, patch_size,
            sequence_length)`.
            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    loss: Optional[paddle.Tensor] = None
    predicted_depth: paddle.Tensor = None
    hidden_states: Optional[Tuple[paddle.Tensor]] = None
    attentions: Optional[Tuple[paddle.Tensor]] = None
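
# Illustrative sketch (not part of the original file): resizing the predicted depth map back to
# the input resolution with paddle's standard `paddle.nn.functional.interpolate`.
# `image_height` and `image_width` are hypothetical placeholders.
#
#     depth = out.predicted_depth.unsqueeze(1)                      # (batch_size, 1, h, w)
#     depth = paddle.nn.functional.interpolate(
#         depth, size=(image_height, image_width), mode="bilinear", align_corners=False
#     ).squeeze(1)                                                  # (batch_size, image_height, image_width)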

@dataclass
class SemanticSegmenterOutput(ModelOutput):
    """
    Base class for outputs of semantic segmentation models.

    Args:
        loss (`paddle.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Classification (or regression if config.num_labels==1) loss.
        logits (`paddle.Tensor` of shape `(batch_size, config.num_labels, logits_height, logits_width)`):
            Classification scores for each pixel.

            <Tip warning={true}>

            The logits returned do not necessarily have the same size as the `pixel_values` passed as inputs. This is
            to avoid doing two interpolations and losing some quality when a user needs to resize the logits to the
            original image size as post-processing. You should always check your logits shape and resize as needed.

            </Tip>

        hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, patch_size, hidden_size)`.
            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, patch_size,
            sequence_length)`.
            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    loss: Optional[paddle.Tensor] = None
    logits: paddle.Tensor = None
    hidden_states: Optional[Tuple[paddle.Tensor]] = None
    attentions: Optional[Tuple[paddle.Tensor]] = None
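
# Illustrative sketch (not part of the original file): upsampling the segmentation logits to the
# original image size before taking the per-pixel argmax, as the Tip above recommends.
# `image_height` and `image_width` are hypothetical placeholders.
#
#     logits = paddle.nn.functional.interpolate(
#         out.logits, size=(image_height, image_width), mode="bilinear", align_corners=False
#     )
#     seg_map = paddle.argmax(logits, axis=1)   # (batch_size, image_height, image_width)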

@dataclass
class Seq2SeqSpectrogramOutput(ModelOutput):
    """
    Base class for sequence-to-sequence spectrogram outputs.

    Args:
        loss (`paddle.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Spectrogram generation loss.
        spectrogram (`paddle.Tensor` of shape `(batch_size, sequence_length, num_bins)`):
            The predicted spectrogram.
        past_key_values (`tuple(tuple(paddle.Tensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Tuple of `tuple(paddle.Tensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)` and 2 additional tensors of shape
            `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
            Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
            blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
        decoder_hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
            Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
        decoder_attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.
            Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
            self-attention heads.
        cross_attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.
            Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
            weighted average in the cross-attention heads.
        encoder_last_hidden_state (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder of the model.
        encoder_hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
            Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
        encoder_attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.
            Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
            self-attention heads.
    """

    loss: Optional[paddle.Tensor] = None
    spectrogram: paddle.Tensor = None
    past_key_values: Optional[Tuple[Tuple[paddle.Tensor]]] = None
    decoder_hidden_states: Optional[Tuple[paddle.Tensor]] = None
    decoder_attentions: Optional[Tuple[paddle.Tensor]] = None
    cross_attentions: Optional[Tuple[paddle.Tensor]] = None
    encoder_last_hidden_state: Optional[paddle.Tensor] = None
    encoder_hidden_states: Optional[Tuple[paddle.Tensor]] = None
    encoder_attentions: Optional[Tuple[paddle.Tensor]] = None
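
# Illustrative sketch (not part of the original file): a predicted spectrogram is usually passed
# through a separate vocoder to obtain a waveform. `vocoder` is a hypothetical module; only the
# `spectrogram` field from Seq2SeqSpectrogramOutput is assumed here.
#
#     spec = out.spectrogram        # (batch_size, sequence_length, num_bins)
#     waveform = vocoder(spec)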

@dataclass
class MoEModelOutputWithPast(ModelOutput):
    """
    Base class for model's outputs, with potential hidden states and attentions.

    Args:
        last_hidden_state (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        past_key_values (`tuple(tuple(paddle.Tensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Tuple of `tuple(paddle.Tensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)` and optionally if
            `config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads,
            encoder_sequence_length, embed_size_per_head)`.
            Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
            `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
            input) to speed up sequential decoding.
        hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.
            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        router_logits (`tuple(paddle.Tensor)`, *optional*, returned when `output_router_probs=True` and `config.add_router_probs=True` is passed or when `config.output_router_probs=True`):
            Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, sequence_length, num_experts)`.
            Raw router logits (post-softmax) computed by MoE routers; these terms are used to compute the auxiliary
            loss for Mixture of Experts models.
    """

    last_hidden_state: paddle.Tensor = None
    past_key_values: Optional[Tuple[Tuple[paddle.Tensor]]] = None
    hidden_states: Optional[Tuple[paddle.Tensor]] = None
    attentions: Optional[Tuple[paddle.Tensor]] = None
    router_logits: Optional[Tuple[paddle.Tensor]] = None
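
# Illustrative sketch (not part of the original file): inspecting per-layer router decisions from
# a MoEModelOutputWithPast. Each entry of `router_logits` has shape
# (batch_size, sequence_length, num_experts), so the chosen expert per token is its argmax.
#
#     if out.router_logits is not None:
#         top_expert_per_layer = [paddle.argmax(rl, axis=-1) for rl in out.router_logits]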

@dataclass
class MoECausalLMOutputWithPast(ModelOutput):
    """
    Base class for causal language model (or autoregressive) with mixture of experts outputs.

    Args:
        loss (`paddle.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Language modeling loss (for next-token prediction).
        logits (`paddle.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        aux_loss (`paddle.Tensor`, *optional*, returned when `labels` is provided):
            Auxiliary loss for the sparse modules.
        router_logits (`tuple(paddle.Tensor)`, *optional*, returned when `output_router_probs=True` and `config.add_router_probs=True` is passed or when `config.output_router_probs=True`):
            Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, sequence_length, num_experts)`.
            Raw router logits (post-softmax) computed by MoE routers; these terms are used to compute the auxiliary
            loss for Mixture of Experts models.
        past_key_values (`tuple(tuple(paddle.Tensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Tuple of `tuple(paddle.Tensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)`.
            Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
            `past_key_values` input) to speed up sequential decoding.
        hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.
            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    loss: Optional[paddle.Tensor] = None
    aux_loss: Optional[paddle.Tensor] = None
    logits: paddle.Tensor = None
    past_key_values: Optional[Tuple[Tuple[paddle.Tensor]]] = None
    hidden_states: Optional[Tuple[paddle.Tensor]] = None
    attentions: Optional[Tuple[paddle.Tensor]] = None
    router_logits: Optional[Tuple[paddle.Tensor]] = None
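
# Illustrative sketch (not part of the original file): combining the language-modeling loss with
# the auxiliary load-balancing loss of a MoECausalLMOutputWithPast. Whether `aux_loss` is already
# folded into `loss` depends on the model; `router_aux_loss_coef` is a hypothetical hyperparameter.
#
#     total_loss = out.loss
#     if out.aux_loss is not None:
#         total_loss = total_loss + router_aux_loss_coef * out.aux_loss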