# rec_unimernet_head.py

import copy
import math
import re
import numpy as np
import inspect
import warnings
from collections import OrderedDict
from typing import Optional, Tuple, Union, List, Dict, Any
from dataclasses import dataclass, fields, is_dataclass

import torch
import torch.nn as nn
from torch import Tensor
import torch.nn.functional as F
from torch.nn import CrossEntropyLoss

from mineru.utils.config_reader import get_device
class ModelOutput(OrderedDict):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def __post_init__(self):
        class_fields = fields(self)
        if not len(class_fields):
            raise ValueError(f"{self.__class__.__name__} has no fields.")
        if not all(field.default is None for field in class_fields[1:]):
            raise ValueError(
                f"{self.__class__.__name__} should not have more than one required field."
            )
        first_field = getattr(self, class_fields[0].name)
        other_fields_are_none = all(
            getattr(self, field.name) is None for field in class_fields[1:]
        )
        if other_fields_are_none:
            if isinstance(first_field, dict):
                iterator = first_field.items()
                first_field_iterator = True
            else:
                try:
                    iterator = iter(first_field)
                    first_field_iterator = True
                except TypeError:
                    first_field_iterator = False
            if first_field_iterator:
                for idx, element in enumerate(iterator):
                    if (
                        not isinstance(element, (list, tuple))
                        or not len(element) == 2
                        or not isinstance(element[0], str)
                    ):
                        if idx == 0:
                            self[class_fields[0].name] = first_field
                        else:
                            raise ValueError(
                                f"Cannot set key/value for {element}. It needs to be a tuple (key, value)."
                            )
                        break
                    setattr(self, element[0], element[1])
                    if element[1] is not None:
                        self[element[0]] = element[1]
            elif first_field is not None:
                self[class_fields[0].name] = first_field
        else:
            for field in class_fields:
                v = getattr(self, field.name)
                if v is not None:
                    self[field.name] = v

    def __delitem__(self, *args, **kwargs):
        raise Exception(
            f"You cannot use ``__delitem__`` on a {self.__class__.__name__} instance."
        )

    def setdefault(self, *args, **kwargs):
        raise Exception(
            f"You cannot use ``setdefault`` on a {self.__class__.__name__} instance."
        )

    def pop(self, *args, **kwargs):
        raise Exception(
            f"You cannot use ``pop`` on a {self.__class__.__name__} instance."
        )

    def update(self, *args, **kwargs):
        raise Exception(
            f"You cannot use ``update`` on a {self.__class__.__name__} instance."
        )

    def __getitem__(self, k):
        if isinstance(k, str):
            inner_dict = dict(self.items())
            return inner_dict[k]
        else:
            return self.to_tuple()[k]

    def __setattr__(self, name, value):
        if name in self.keys() and value is not None:
            super().__setitem__(name, value)
        super().__setattr__(name, value)

    def __setitem__(self, key, value):
        super().__setitem__(key, value)
        super().__setattr__(key, value)

    def __reduce__(self):
        if not is_dataclass(self):
            return super().__reduce__()
        callable, _args, *remaining = super().__reduce__()
        args = tuple(getattr(self, field.name) for field in fields(self))
        return callable, args, *remaining

    def to_tuple(self):
        return tuple(self[k] for k in self.keys())
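
# Usage note (editorial sketch, not part of the original module): a ModelOutput
# subclass can be read either by key or by position, e.g.
#   out = BaseModelOutputWithPastAndCrossAttentions(last_hidden_state=h, past_key_values=cache)
#   out.last_hidden_state is out["last_hidden_state"] is out[0]   # same tensor
# Keys whose value is None are never stored, so integer indices only cover populated fields.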
@dataclass
class BaseModelOutputWithPastAndCrossAttentions(ModelOutput):
    last_hidden_state = None
    past_key_values = None
    hidden_states = None
    attentions = None
    cross_attentions = None

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)


@dataclass
class Seq2SeqLMOutput(ModelOutput):
    loss = None
    logits = None
    past_key_values = None
    decoder_hidden_states = None
    decoder_attentions = None
    cross_attentions = None
    encoder_last_hidden_state = None
    encoder_hidden_states = None
    encoder_attentions = None

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
class MBartConfig(object):
    model_type = "mbart"
    keys_to_ignore_at_inference = ["past_key_values"]
    attribute_map = {
        "num_attention_heads": "encoder_attention_heads",
        "hidden_size": "d_model",
    }

    def __init__(
        self,
        vocab_size=50265,
        max_position_embeddings=1024,
        encoder_layers=12,
        encoder_ffn_dim=4096,
        encoder_attention_heads=16,
        decoder_layers=12,
        decoder_ffn_dim=4096,
        decoder_attention_heads=16,
        encoder_layerdrop=0.0,
        decoder_layerdrop=0.0,
        use_cache=True,
        is_encoder_decoder=True,
        activation_function="gelu",
        d_model=1024,
        dropout=0.1,
        output_hidden_states=False,
        use_return_dict=True,
        attention_dropout=0.0,
        activation_dropout=0.0,
        init_std=0.02,
        classifier_dropout=0.0,
        scale_embedding=False,
        pad_token_id=1,
        bos_token_id=0,
        eos_token_id=2,
        forced_eos_token_id=2,
        _attn_implementation="eager",
        hidden_size=1024,
        use_parallel=False,
        parallel_step=2,
        is_export=False,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.max_position_embeddings = max_position_embeddings
        self.d_model = d_model
        self.encoder_ffn_dim = encoder_ffn_dim
        self.encoder_layers = encoder_layers
        self.encoder_attention_heads = encoder_attention_heads
        self.decoder_ffn_dim = decoder_ffn_dim
        self.decoder_layers = decoder_layers
        self.decoder_attention_heads = decoder_attention_heads
        self.dropout = dropout
        self.output_hidden_states = output_hidden_states
        # `output_attentions` is read by MBartDecoder.forward and MBartForCausalLM.forward,
        # so expose it here as well (configurable via **kwargs, defaults to False).
        self.output_attentions = kwargs.get("output_attentions", False)
        self.use_return_dict = use_return_dict
        self.attention_dropout = attention_dropout
        self.activation_dropout = activation_dropout
        self.activation_function = activation_function
        self.init_std = init_std
        self.encoder_layerdrop = encoder_layerdrop
        self.decoder_layerdrop = decoder_layerdrop
        self.classifier_dropout = classifier_dropout
        self.use_cache = use_cache
        self.num_hidden_layers = encoder_layers
        self.scale_embedding = (
            scale_embedding  # scale factor will be sqrt(d_model) if True
        )
        self.pad_token_id = pad_token_id
        self.bos_token_id = bos_token_id
        self.eos_token_id = eos_token_id
        self.is_encoder_decoder = is_encoder_decoder
        self.forced_eos_token_id = forced_eos_token_id
        self._attn_implementation = _attn_implementation
        self.use_parallel = use_parallel
        self.parallel_step = parallel_step
        self.is_export = is_export
        super().__init__()
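
# Example (illustrative sketch): a decoder-side configuration of the kind the UniMERNet
# head builds further down in this file; the values here are placeholders, not the ones
# used by the shipped model.
#   cfg = MBartConfig(vocab_size=50000, d_model=1024, decoder_layers=8,
#                     decoder_attention_heads=16, is_export=False)
#   lm = MBartForCausalLM(cfg)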
@dataclass
class AttentionMaskConverter:
    """
    A utility class for converting attention masks used in transformer models.
    This class handles the conversion of attention masks based on whether the
    attention mechanism is causal (i.e., preventing information flow from future
    tokens to past tokens) and whether a sliding window approach is used.
    Attributes:
        is_causal (bool): Indicates if the attention mechanism is causal.
        sliding_window (Optional[int]): Specifies the size of the sliding window
            for local attention, if applicable.
    Args:
        is_causal (bool): Determines if the attention mask should enforce causality.
        sliding_window (Optional[int], optional): The size of the sliding window
            for local attention. Default is None.
    """

    is_causal: bool
    sliding_window: int

    def __init__(self, is_causal: bool, sliding_window=None):
        self.is_causal = is_causal
        self.sliding_window = sliding_window
        if self.sliding_window is not None and self.sliding_window <= 0:
            raise ValueError(
                f"Make sure that when passing `sliding_window` that its value is a strictly positive integer, not `{self.sliding_window}`"
            )

    @staticmethod
    def _make_causal_mask(
        input_ids_shape,
        dtype,
        past_key_values_length=0,
        sliding_window=None,
        is_export=False,
    ):
        bsz, tgt_len = input_ids_shape
        if is_export:
            mask = torch.full(
                [tgt_len, tgt_len], fill_value=torch.finfo(dtype).min, dtype=torch.float64
            )
        else:
            mask = torch.full((tgt_len, tgt_len), torch.finfo(dtype).min)
        mask_cond = torch.arange(mask.shape[-1])
        mask = mask.masked_fill_(
            mask_cond < (mask_cond + 1).reshape([mask.shape[-1], 1]), 0
        )
        return mask[None, None, :, :].expand(
            [bsz, 1, tgt_len, tgt_len + past_key_values_length]
        )

    def to_4d_export(
        self,
        attention_mask_2d,
        query_length,
        dtype,
        key_value_length,
        is_export=False,
    ):
        input_shape = (attention_mask_2d.shape[0], query_length)
        expanded_attn_mask = self._expand_mask(
            attention_mask_2d, dtype, tgt_len=input_shape[-1]
        )
        expanded_4d_mask = expanded_attn_mask
        return expanded_4d_mask

    def to_4d(
        self,
        attention_mask_2d,
        query_length,
        dtype,
        key_value_length,
        is_export=False,
    ):
        input_shape = (attention_mask_2d.shape[0], query_length)
        causal_4d_mask = None
        if (input_shape[-1] > 1 or self.sliding_window is not None) and self.is_causal:
            if key_value_length is None:
                raise ValueError(
                    "This attention mask converter is causal. Make sure to pass `key_value_length` to correctly create a causal mask."
                )
            past_key_values_length = key_value_length - query_length
            causal_4d_mask = self._make_causal_mask(
                input_shape,
                dtype,
                past_key_values_length=past_key_values_length,
                sliding_window=self.sliding_window,
                is_export=is_export,
            )
        elif self.sliding_window is not None:
            raise NotImplementedError(
                "Sliding window is currently only implemented for causal masking"
            )
        expanded_attn_mask = self._expand_mask(
            attention_mask_2d, dtype, tgt_len=input_shape[-1]
        )
        if causal_4d_mask is not None:
            if is_export:
                expanded_attn_mask = causal_4d_mask
                return expanded_attn_mask
            else:
                expanded_attn_mask = causal_4d_mask.masked_fill_(
                    expanded_attn_mask.to(torch.bool), torch.finfo(dtype).min
                )
        expanded_4d_mask = expanded_attn_mask
        return expanded_4d_mask
    @staticmethod
    def _expand_mask(mask, dtype, tgt_len=None):
        # Defined as a staticmethod so the module-level _prepare_4d_attention_mask helper
        # below can call it as AttentionMaskConverter._expand_mask(...) without an instance.
        bsz, src_len = mask.shape
        tgt_len = tgt_len if tgt_len is not None else src_len
        expanded_mask = (
            mask[:, None, None, :].expand([bsz, 1, tgt_len, src_len]).to(dtype)
        )
        inverted_mask = 1.0 - expanded_mask
        return inverted_mask.masked_fill_(
            inverted_mask.to(torch.bool), torch.finfo(dtype).min
        )
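
# Shape sketch (assumed values, for illustration only): for a batch of 2 all-ones masks
# of length 4 in fp32,
#   conv = AttentionMaskConverter(is_causal=True)
#   m = conv.to_4d(torch.ones(2, 4, dtype=torch.int64), 4, torch.float32, key_value_length=4)
#   m.shape  # -> [2, 1, 4, 4]; allowed positions hold 0.0, future positions hold finfo(float32).min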
def _prepare_4d_attention_mask(mask, dtype, tgt_len=None):
    return AttentionMaskConverter._expand_mask(mask=mask, dtype=dtype, tgt_len=tgt_len)


def _prepare_4d_causal_attention_mask_export(
    attention_mask,
    input_shape,
    inputs_embeds,
    past_key_values_length,
    sliding_window=None,
    is_export=False,
):
    attn_mask_converter = AttentionMaskConverter(
        is_causal=True, sliding_window=sliding_window
    )
    key_value_length = input_shape[-1] + past_key_values_length
    shape = attention_mask.shape
    len_shape = len(shape)
    attention_mask = attn_mask_converter.to_4d_export(
        attention_mask,
        input_shape[-1],
        key_value_length=key_value_length,
        dtype=inputs_embeds.dtype,
        is_export=is_export,
    )
    return attention_mask


def _prepare_4d_causal_attention_mask(
    attention_mask,
    input_shape,
    inputs_embeds,
    past_key_values_length,
    sliding_window=None,
    is_export=False,
):
    attn_mask_converter = AttentionMaskConverter(
        is_causal=True, sliding_window=sliding_window
    )
    key_value_length = input_shape[-1] + past_key_values_length
    shape = attention_mask.shape
    len_shape = len(shape)
    if (attention_mask is not None) and (len_shape == 2):
        attention_mask = attn_mask_converter.to_4d(
            attention_mask,
            input_shape[-1],
            key_value_length=key_value_length,
            dtype=inputs_embeds.dtype,
            is_export=is_export,
        )
        return attention_mask
    elif attention_mask is not None and len(attention_mask.shape) == 4:
        expected_shape = (input_shape[0], 1, input_shape[1], key_value_length)
        if tuple(attention_mask.shape) != expected_shape:
            raise ValueError(
                f"Incorrect 4D attention_mask shape: {tuple(attention_mask.shape)}; expected: {expected_shape}."
            )
        else:
            inverted_mask = 1.0 - attention_mask
            attention_mask = inverted_mask.masked_fill_(
                inverted_mask.to(torch.bool), torch.finfo(inputs_embeds.dtype).min
            )
    else:
        attention_mask = attn_mask_converter.to_causal_4d(
            input_shape[0],
            input_shape[-1],
            key_value_length,
            dtype=inputs_embeds.dtype,
        )
    return attention_mask
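
# Note on mask semantics (editorial comment): the helpers above return *additive* masks,
# i.e. 0.0 where attention is allowed and the dtype's most negative value where it is
# forbidden; MBartAttention below simply adds them to the raw attention scores before
# the softmax.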
class MBartLearnedPositionalEmbedding(nn.Embedding):
    """
    This module learns positional embeddings up to a fixed maximum size.
    """

    def __init__(self, num_embeddings, embedding_dim):
        self.offset = 2
        super().__init__(num_embeddings + self.offset, embedding_dim)
        self.device = torch.device(get_device())

    def forward(self, input_ids, past_key_values_length=0):
        """`input_ids` shape is expected to be [bsz x seqlen]."""
        bsz, seq_len = input_ids.shape[:2]
        positions = torch.arange(
            past_key_values_length, past_key_values_length + seq_len, dtype=torch.int64
        ).expand([bsz, -1]).to(self.device)
        return nn.Embedding.forward(self, positions + self.offset)
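
# Editorial note: as in fairseq/HF mBART, position ids are shifted by offset=2 because the
# first two embedding slots are reserved (a fairseq convention tied to padding_idx), so the
# table is allocated with num_embeddings + 2 rows and position p is looked up at row p + 2.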
class MBartPreTrainedModel(nn.Module):
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _no_split_modules = ["MBartDecoderLayer", "MBartAttention"]
    _supports_flash_attn_2 = True

    def __init__(self, config):
        super().__init__()
        self.config = config

    def _initialize_weights(self, module):
        """
        Initialize the weights if they are not already initialized.
        """
        if getattr(module, "_is_hf_initialized", False):
            return
        self._init_weights(module)

    def post_init(self):
        self.apply(self._initialize_weights)

    def _init_weights(self, module):
        std = self.config.init_std
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=std)
            if module.bias is not None:
                torch.nn.init.constant_(module.bias, val=0.0)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=std)
            if module.padding_idx is not None:
                torch.nn.init.constant_(module.weight[module.padding_idx], val=0.0)

    @property
    def dummy_inputs(self):
        pad_token = self.config.pad_token_id
        input_ids = torch.tensor([[0, 6, 10, 4, 2], [0, 8, 12, 2, pad_token]])
        dummy_inputs = {
            "attention_mask": input_ids.ne(pad_token),
            "input_ids": input_ids,
        }
        return dummy_inputs
class MBartAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(
        self,
        embed_dim,
        num_heads,
        dropout: float = 0.0,
        is_decoder: bool = False,
        bias: bool = True,
        is_causal: bool = False,
        config=None,
    ):
        super().__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.dropout = dropout
        self.head_dim = embed_dim // num_heads
        self.config = config
        if (self.head_dim * num_heads) != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
                f" and `num_heads`: {num_heads})."
            )
        self.scaling = self.head_dim ** -0.5
        self.is_decoder = is_decoder
        self.is_causal = is_causal
        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)

    def _shape(self, tensor, seq_len, bsz):
        return tensor.reshape([bsz, seq_len, self.num_heads, self.head_dim]).permute(
            0, 2, 1, 3
        )

    def forward(
        self,
        hidden_states,
        key_value_states=None,
        past_key_value=None,
        attention_mask=None,
        layer_head_mask=None,
        output_attentions=False,
    ):
        is_cross_attention = key_value_states is not None
        bsz, tgt_len, _ = hidden_states.shape
        query_states = self.q_proj(hidden_states) * self.scaling
        if (
            is_cross_attention
            and past_key_value is not None
            and past_key_value[0].shape[2] == key_value_states.shape[1]
        ):
            key_states = past_key_value[0]
            value_states = past_key_value[1]
        elif is_cross_attention:
            key_states = self._shape(self.k_proj(key_value_states), -1, bsz)
            value_states = self._shape(self.v_proj(key_value_states), -1, bsz)
        elif past_key_value is not None:
            key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
            value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
            key_states = torch.concat([past_key_value[0], key_states], dim=2)
            value_states = torch.concat([past_key_value[1], value_states], dim=2)
        else:
            key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
            value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
        if self.is_decoder:
            past_key_value = (key_states, value_states)
        proj_shape = (bsz * self.num_heads, -1, self.head_dim)
        query_states = self._shape(query_states, tgt_len, bsz).reshape(proj_shape)
        key_states = key_states.reshape(proj_shape)
        value_states = value_states.reshape(proj_shape)
        src_len = key_states.shape[1]
        attn_weights = torch.bmm(query_states, key_states.permute([0, 2, 1]))
        if attention_mask is not None:
            attn_weights = (
                attn_weights.reshape([bsz, self.num_heads, tgt_len, src_len])
                + attention_mask
            )
            attn_weights = attn_weights.reshape(
                [bsz * self.num_heads, tgt_len, src_len]
            )
        attn_weights = nn.functional.softmax(attn_weights, dim=-1)
        if layer_head_mask is not None:
            if tuple(layer_head_mask.shape) != (self.num_heads,):
                raise ValueError(
                    f"Head mask for a single layer should be of shape {(self.num_heads,)}, but is"
                    f" {layer_head_mask.shape}"
                )
            attn_weights = layer_head_mask.reshape(
                [1, -1, 1, 1]
            ) * attn_weights.reshape([bsz, self.num_heads, tgt_len, src_len])
            attn_weights = attn_weights.reshape(
                [bsz * self.num_heads, tgt_len, src_len]
            )
        if output_attentions:
            attn_weights_reshaped = attn_weights.reshape(
                [bsz, self.num_heads, tgt_len, src_len]
            )
            attn_weights = attn_weights_reshaped.reshape(
                [bsz * self.num_heads, tgt_len, src_len]
            )
        else:
            attn_weights_reshaped = None
        attn_probs = nn.functional.dropout(
            attn_weights, p=self.dropout, training=self.training
        )
        attn_output = torch.bmm(attn_probs, value_states)
        attn_output = attn_output.reshape([bsz, self.num_heads, tgt_len, self.head_dim])
        attn_output = attn_output.permute([0, 2, 1, 3])
        attn_output = attn_output.reshape([bsz, tgt_len, self.embed_dim])
        attn_output = self.out_proj(attn_output)
        return attn_output, attn_weights_reshaped, past_key_value


MBART_ATTENTION_CLASSES = {
    "eager": MBartAttention,
}
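
# Shape walk-through (editorial sketch, assumed sizes): with batch b, target length t,
# source length s, d_model e and h heads (head_dim = e // h), MBartAttention maps
#   hidden_states [b, t, e] -> q/k/v [b*h, t_or_s, e//h] -> scores [b*h, t, s]
#   -> softmax(+additive mask) -> context [b*h, t, e//h] -> out_proj -> [b, t, e].
# When is_decoder=True, the returned past_key_value caches k/v as [b, h, s, e//h].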
class MBartDecoderLayer(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.embed_dim = config.d_model
        self.self_attn = MBART_ATTENTION_CLASSES[config._attn_implementation](
            embed_dim=self.embed_dim,
            num_heads=config.decoder_attention_heads,
            dropout=config.attention_dropout,
            is_decoder=True,
            is_causal=True,
            config=config,
        )
        self.is_export = config.is_export
        self.dropout = config.dropout
        self.activation_fn = F.gelu
        self.activation_dropout = config.activation_dropout
        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
        self.encoder_attn = MBART_ATTENTION_CLASSES[config._attn_implementation](
            self.embed_dim,
            config.decoder_attention_heads,
            dropout=config.attention_dropout,
            is_decoder=True,
            config=config,
        )
        self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim)
        self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim)
        self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim)
        self.final_layer_norm = nn.LayerNorm(self.embed_dim)
        self.device = torch.device(get_device())

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        layer_head_mask=None,
        cross_attn_layer_head_mask=None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = True,
    ) -> Tuple[torch.Tensor, ...]:
        residual = hidden_states
        hidden_states = self.self_attn_layer_norm(hidden_states)
        self_attn_past_key_value = None
        if past_key_value is not None:
            self_attn_past_key_value = tuple(
                t.to(self.device) if isinstance(t, torch.Tensor) else t
                for t in past_key_value[:2]
            )
        hidden_states, self_attn_weights, present_key_value = self.self_attn(
            hidden_states=hidden_states,
            past_key_value=self_attn_past_key_value,
            attention_mask=attention_mask,
            layer_head_mask=layer_head_mask,
            output_attentions=output_attentions,
        )
        hidden_states = nn.functional.dropout(
            hidden_states, p=self.dropout, training=self.training
        )
        hidden_states = residual + hidden_states
        cross_attn_present_key_value = None
        cross_attn_weights = None
        if encoder_hidden_states is not None:
            residual = hidden_states
            hidden_states = self.encoder_attn_layer_norm(hidden_states)
            cross_attn_past_key_value = (
                past_key_value[-2:] if past_key_value is not None else None
            )
            hidden_states, cross_attn_weights, cross_attn_present_key_value = (
                self.encoder_attn(
                    hidden_states=hidden_states,
                    key_value_states=encoder_hidden_states,
                    attention_mask=encoder_attention_mask,
                    layer_head_mask=cross_attn_layer_head_mask,
                    past_key_value=cross_attn_past_key_value,
                    output_attentions=output_attentions,
                )
            )
            hidden_states = nn.functional.dropout(
                hidden_states, p=self.dropout, training=self.training
            )
            hidden_states = residual + hidden_states
            present_key_value = present_key_value + cross_attn_present_key_value
        residual = hidden_states
        hidden_states = self.final_layer_norm(hidden_states)
        hidden_states = self.activation_fn(self.fc1(hidden_states))
        hidden_states = nn.functional.dropout(
            hidden_states, p=self.activation_dropout, training=self.training
        )
        hidden_states = self.fc2(hidden_states)
        hidden_states = nn.functional.dropout(
            hidden_states, p=self.dropout, training=self.training
        )
        hidden_states = residual + hidden_states
        outputs = (hidden_states,)
        if output_attentions:
            outputs += (self_attn_weights, cross_attn_weights)
        if self.is_export:
            outputs += (present_key_value,)
        else:
            if use_cache:
                outputs += (present_key_value,)
        return outputs
class MBartForCausalLM(MBartPreTrainedModel):
    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config):
        config = copy.deepcopy(config)
        config.is_decoder = True
        config.is_encoder_decoder = False
        super().__init__(config)
        self.model = MBartDecoderWrapper(config)
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
        self.post_init()

    def get_input_embeddings(self):
        return self.model.decoder.embed_tokens

    def set_input_embeddings(self, value):
        self.model.decoder.embed_tokens = value

    def get_output_embeddings(self):
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        self.lm_head = new_embeddings

    def set_decoder(self, decoder):
        self.model.decoder = decoder

    def get_decoder(self):
        return self.model.decoder

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        head_mask=None,
        cross_attn_head_mask=None,
        past_key_values=None,
        inputs_embeds=None,
        labels=None,
        use_cache=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        output_attentions = (
            output_attentions
            if output_attentions is not None
            else self.config.output_attentions
        )
        output_hidden_states = (
            output_hidden_states
            if output_hidden_states is not None
            else self.config.output_hidden_states
        )
        return_dict = (
            return_dict if return_dict is not None else self.config.use_return_dict
        )
        outputs = self.model.decoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            head_mask=head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        logits = self.lm_head(outputs[0])
        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(
                logits.reshape([-1, self.config.vocab_size]), labels.reshape([-1])
            )
        if not return_dict:
            output = (logits,) + outputs[1:]
            return (loss,) + output if loss is not None else output
        return CausalLMOutputWithCrossAttentions(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            cross_attentions=outputs.cross_attentions,
        )

    def prepare_inputs_for_generation(
        self,
        input_ids,
        past_key_values=None,
        attention_mask=None,
        use_cache=None,
        **kwargs,
    ):
        if attention_mask is None:
            attention_mask = input_ids.new_ones(input_ids.shape)
        if past_key_values:
            past_length = past_key_values[0][0].shape[2]
            if input_ids.shape[1] > past_length:
                remove_prefix_length = past_length
            else:
                remove_prefix_length = input_ids.shape[1] - 1
            input_ids = input_ids[:, remove_prefix_length:]
        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "past_key_values": past_key_values,
            "use_cache": use_cache,
        }

    @staticmethod
    def _reorder_cache(past_key_values, beam_idx):
        reordered_past = ()
        for layer_past in past_key_values:
            reordered_past += (
                tuple(
                    past_state.index_select(0, beam_idx) for past_state in layer_past
                ),
            )
        return reordered_past
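
# Decoding sketch (editorial, not part of the module): prepare_inputs_for_generation trims
# the prompt to the tokens the cache has not seen yet, so a minimal greedy step over this
# head could look like
#   inputs = lm.prepare_inputs_for_generation(ids, past_key_values=cache)
#   out = lm(**inputs, encoder_hidden_states=img_feats, return_dict=True)
#   next_id = out.logits[:, -1, :].argmax(dim=-1, keepdim=True)
#   ids, cache = torch.cat([ids, next_id], dim=1), out.past_key_values
# where `ids`, `cache` and `img_feats` are hypothetical names for the running token ids,
# the key/value cache and the visual encoder features.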
class myLayerNorm(nn.LayerNorm):
    """
    Custom implementation of Layer Normalization, with additional options.
    This class extends the standard LayerNorm to include optional features,
    such as drop block regularization, which might be used for improving
    model generalization.
    Args:
        num_channels (int): The number of features or channels in the input.
        eps (float, optional): A small value added to the denominator for numerical stability. Default is 1e-5.
        affine (bool, optional): If True, this module has learnable affine parameters (gamma and beta). Default is True.
        drop_block (optional): Additional regularization technique that might be applied. Default is None.
    """

    def __init__(
        self,
        num_channels,
        eps=1e-5,
        affine=True,
        drop_block=None,
    ):
        super(nn.LayerNorm, self).__init__()
        self._epsilon = eps
        self.num_channels = num_channels
        if affine:
            self.weight = torch.nn.Parameter(torch.randn([num_channels]) * 0.01)
            self.bias = torch.nn.Parameter(torch.randn([num_channels]) * 0.01)
            torch.nn.init.ones_(self.weight)
            torch.nn.init.zeros_(self.bias)

    def forward(self, x):
        x = F.layer_norm(
            x,
            [self.num_channels],
            weight=self.weight,
            bias=self.bias,
            eps=self._epsilon,
        )
        return x
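
# Editorial note: because __init__ calls super(nn.LayerNorm, self).__init__(), the stock
# LayerNorm constructor is bypassed and the affine parameters are created by hand; the
# forward pass is still the standard F.layer_norm over the last (channel) dimension.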
class MBartDecoder(MBartPreTrainedModel):
    """
    Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`MBartDecoderLayer`]
    Args:
        config
        embed_tokens (nn.Embedding): output embedding
    """

    def __init__(self, config, embed_tokens=None):
        super().__init__(config)
        self.dropout = config.dropout
        self.layerdrop = config.decoder_layerdrop
        self.padding_idx = config.pad_token_id
        self.max_target_positions = config.max_position_embeddings
        self.embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0
        self.embed_tokens = nn.Embedding(
            config.vocab_size, config.d_model, self.padding_idx
        )
        if embed_tokens is not None:
            self.embed_tokens.weight = embed_tokens.weight
        self.embed_positions = MBartLearnedPositionalEmbedding(
            config.max_position_embeddings,
            config.d_model,
        )
        self.layers = nn.ModuleList(
            [MBartDecoderLayer(config) for _ in range(config.decoder_layers)]
        )
        self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
        self.layernorm_embedding = myLayerNorm(config.d_model, affine=True)
        self.layer_norm = nn.LayerNorm(config.d_model)
        self.gradient_checkpointing = False
        # Initialize weights and apply final processing
        self.post_init()
        self.is_export = config.is_export

    def get_input_embeddings(self):
        return self.embed_tokens

    def set_input_embeddings(self, value):
        self.embed_tokens = value

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        head_mask=None,
        cross_attn_head_mask=None,
        past_key_values=None,
        inputs_embeds=None,
        use_cache=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        output_attentions = (
            output_attentions
            if output_attentions is not None
            else self.config.output_attentions
        )
        output_hidden_states = (
            output_hidden_states
            if output_hidden_states is not None
            else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = (
            return_dict if return_dict is not None else self.config.use_return_dict
        )
        if input_ids is not None and inputs_embeds is not None:
            raise ValueError(
                "You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time"
            )
        elif input_ids is not None:
            input = input_ids
            input_shape = input.shape
            input_ids = input_ids.reshape([-1, input_shape[-1]])
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.shape[:-1]
            input = inputs_embeds[:, :, -1]
        else:
            raise ValueError(
                "You have to specify either decoder_input_ids or decoder_inputs_embeds"
            )
        past_key_values_length = (
            past_key_values[0][0].shape[2] if past_key_values is not None else 0
        )
        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale
        if self._use_flash_attention_2:
            attention_mask = (
                attention_mask
                if (attention_mask is not None and 0 in attention_mask)
                else None
            )
        else:
            attention_mask = _prepare_4d_causal_attention_mask(
                attention_mask,
                input_shape,
                inputs_embeds,
                past_key_values_length,
                is_export=self.is_export,
            )
        if encoder_hidden_states is not None and encoder_attention_mask is not None:
            if self._use_flash_attention_2:
                encoder_attention_mask = (
                    encoder_attention_mask if 0 in encoder_attention_mask else None
                )
            else:
                encoder_attention_mask = _prepare_4d_attention_mask(
                    encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]
                )
        # embed positions
        positions = self.embed_positions(input, past_key_values_length)
        hidden_states = inputs_embeds + positions
        hidden_states = self.layernorm_embedding(hidden_states)
        hidden_states = nn.functional.dropout(
            hidden_states, p=self.dropout, training=self.training
        )
        if self.gradient_checkpointing and self.training:
            if use_cache:
                print(
                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
                )
                use_cache = False
        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None
        all_cross_attentions = (
            () if (output_attentions and encoder_hidden_states is not None) else None
        )
        next_decoder_cache = () if use_cache else None
        for attn_mask, mask_name in zip(
            [head_mask, cross_attn_head_mask], ["head_mask", "cross_attn_head_mask"]
        ):
            if attn_mask is not None:
                if attn_mask.shape[0] != len(self.layers):
                    raise ValueError(
                        f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for"
                        f" {attn_mask.shape[0]}."
                    )
        for idx, decoder_layer in enumerate(self.layers):
            if output_hidden_states:
                all_hidden_states += (hidden_states,)
            if self.training:
                dropout_probability = torch.rand([])
                if dropout_probability < self.layerdrop:
                    continue
            past_key_value = (
                past_key_values[idx] if past_key_values is not None else None
            )
            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    decoder_layer.__call__,
                    hidden_states,
                    attention_mask,
                    encoder_hidden_states,
                    encoder_attention_mask,
                    head_mask[idx] if head_mask is not None else None,
                    (
                        cross_attn_head_mask[idx]
                        if cross_attn_head_mask is not None
                        else None
                    ),
                    None,
                    output_attentions,
                    use_cache,
                )
            else:
                layer_outputs = decoder_layer(
                    hidden_states,
                    attention_mask=attention_mask,
                    encoder_hidden_states=encoder_hidden_states,
                    encoder_attention_mask=encoder_attention_mask,
                    layer_head_mask=(head_mask[idx] if head_mask is not None else None),
                    cross_attn_layer_head_mask=(
                        cross_attn_head_mask[idx]
                        if cross_attn_head_mask is not None
                        else None
                    ),
                    past_key_value=past_key_value,
                    output_attentions=output_attentions,
                    use_cache=use_cache,
                )
            hidden_states = layer_outputs[0]
            if use_cache:
                next_decoder_cache += (layer_outputs[3 if output_attentions else 1],)
            if output_attentions:
                all_self_attns += (layer_outputs[1],)
                if encoder_hidden_states is not None:
                    all_cross_attentions += (layer_outputs[2],)
        hidden_states = self.layer_norm(hidden_states)
        if output_hidden_states:
            all_hidden_states += (hidden_states,)
        next_cache = next_decoder_cache if use_cache else None
        if not return_dict:
            return tuple(
                v
                for v in [
                    hidden_states,
                    next_cache,
                    all_hidden_states,
                    all_self_attns,
                    all_cross_attentions,
                ]
                if v is not None
            )
        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=next_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
            cross_attentions=all_cross_attentions,
        )
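
# Cache layout (editorial note): with use_cache=True the decoder returns one tuple per
# layer; each layer tuple holds (self_attn_key, self_attn_value, cross_attn_key,
# cross_attn_value), which is exactly what MBartDecoderLayer.forward slices back out via
# past_key_value[:2] and past_key_value[-2:] on the next decoding step.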
class MBartDecoderWrapper(MBartPreTrainedModel):
    """
    This wrapper class is a helper class to correctly load pretrained checkpoints when the causal language model is
    used in combination with the [`EncoderDecoderModel`] framework.
    """

    def __init__(self, config):
        super().__init__(config)
        self.decoder = MBartDecoder(config)

    def forward(self, *args, **kwargs):
        return self.decoder(*args, **kwargs)
def _in_projection(
    q: torch.Tensor,
    k: torch.Tensor,
    v: torch.Tensor,
    w_q: torch.Tensor,
    w_k: torch.Tensor,
    w_v: torch.Tensor,
    b_q: Optional[torch.Tensor] = None,
    b_k: Optional[torch.Tensor] = None,
    b_v: Optional[torch.Tensor] = None,
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    Eq, Ek, Ev = q.shape[-1], k.shape[-1], v.shape[-1]
    assert w_q.shape == (
        Eq,
        Eq,
    ), f"expecting query weights shape of {(Eq, Eq)}, but got {w_q.shape}"
    assert w_k.shape == (
        Eq,
        Ek,
    ), f"expecting key weights shape of {(Eq, Ek)}, but got {w_k.shape}"
    assert w_v.shape == (
        Eq,
        Ev,
    ), f"expecting value weights shape of {(Eq, Ev)}, but got {w_v.shape}"
    assert b_q is None or b_q.shape == (
        Eq,
    ), f"expecting query bias shape of {(Eq,)}, but got {b_q.shape}"
    assert b_k is None or b_k.shape == (
        Eq,
    ), f"expecting key bias shape of {(Eq,)}, but got {b_k.shape}"
    assert b_v is None or b_v.shape == (
        Eq,
    ), f"expecting value bias shape of {(Eq,)}, but got {b_v.shape}"
    return linear(q, w_q.T, b_q), linear(k, w_k.T, b_k), linear(v, w_v.T, b_v)
def _scaled_dot_product_attention(
    q: torch.Tensor,
    k: torch.Tensor,
    v: torch.Tensor,
    attn_mask: Optional[torch.Tensor] = None,
    dropout_p: float = 0.0,
) -> Tuple[torch.Tensor, torch.Tensor]:
    B, Nt, E = q.shape
    q = q / math.sqrt(E)
    attn = torch.bmm(q, k.permute([0, 2, 1]))
    if attn_mask is not None:
        attn += attn_mask
    attn = F.softmax(attn, dim=-1)
    if dropout_p > 0.0:
        attn = F.dropout(attn, p=dropout_p)
    output = torch.bmm(attn, v)
    return output, attn
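
# In formula terms this computes, per batch element,
#   Attention(Q, K, V) = softmax(Q K^T / sqrt(E) + attn_mask) V
# with Q of shape [B, Nt, E], K/V of shape [B, Ns, E] and an additive (0 / -inf style) mask.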
def linear(x, w, b, is_transpose=False):
    # `is_transpose` now defaults to False so callers that pass an already-transposed
    # weight (e.g. _in_projection above) can omit the flag.
    if is_transpose:
        w = w.T
    if b is not None:
        return torch.matmul(x, w) + b
    else:
        return torch.matmul(x, w)
def _in_projection_packed(
    q: Tensor,
    k: Tensor,
    v: Tensor,
    w: Tensor,
    b: Optional[Tensor] = None,
    is_export=False,
) -> List[Tensor]:
    E = q.shape[-1]
    if k is v:
        if q is k:
            # self-attention: one packed projection, then split into q, k, v
            proj = linear(q, w, b, is_transpose=True)
            if is_export:
                B, D, L = proj.shape
                proj = proj.reshape([B, D, 3, E])
                proj = (
                    proj.unsqueeze(0)
                    .permute([3, 1, 2, 0, 4])
                    .squeeze(-2)
                    .contiguous()
                )
            else:
                proj = (
                    proj.unflatten(-1, (3, E))
                    .unsqueeze(0)
                    .permute([3, 1, 2, 0, 4])
                    .squeeze(-2)
                    .contiguous()
                )
            return proj[0], proj[1], proj[2]
    else:
        w_q, w_k, w_v = w.chunk(3)
        if b is None:
            b_q = b_k = b_v = None
        else:
            b_q, b_k, b_v = b.chunk(3)
        # the packed weight is stored in (out_features, in_features) layout, so each
        # chunk must be transposed before the matmul
        return (
            linear(q, w_q, b_q, is_transpose=True),
            linear(k, w_k, b_k, is_transpose=True),
            linear(v, w_v, b_v, is_transpose=True),
        )
def multi_head_attention_forward(
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    embed_dim_to_check: int,
    num_heads: int,
    in_proj_weight: torch.Tensor,
    in_proj_bias: Optional[torch.Tensor],
    bias_k: Optional[torch.Tensor],
    bias_v: Optional[torch.Tensor],
    add_zero_attn: bool,
    dropout_p: float,
    out_proj_weight: torch.Tensor,
    out_proj_bias: Optional[torch.Tensor],
    training: bool = True,
    key_padding_mask: Optional[torch.Tensor] = None,
    need_weights: bool = True,
    attn_mask: Optional[torch.Tensor] = None,
    use_separate_proj_weight: bool = False,
    q_proj_weight: Optional[torch.Tensor] = None,
    k_proj_weight: Optional[torch.Tensor] = None,
    v_proj_weight: Optional[torch.Tensor] = None,
    static_k: Optional[torch.Tensor] = None,
    static_v: Optional[torch.Tensor] = None,
    is_export=False,
):
    tgt_len, bsz, embed_dim = query.shape
    src_len, _, _ = key.shape
    if isinstance(embed_dim, torch.Tensor):
        head_dim = embed_dim.div(num_heads, rounding_mode="trunc")
    else:
        head_dim = embed_dim // num_heads
    q, k, v = _in_projection_packed(
        query, key, value, in_proj_weight, in_proj_bias, is_export
    )
    if key_padding_mask is not None and key_padding_mask.dtype == torch.uint8:
        warnings.warn(
            "Byte tensor for key_padding_mask in nn.MultiheadAttention is deprecated. Use bool tensor instead."
        )
        key_padding_mask = key_padding_mask.to(torch.bool)
    if bias_k is not None and bias_v is not None:  # False
        assert static_k is None, "bias cannot be added to static key."
        assert static_v is None, "bias cannot be added to static value."
        k = torch.concat([k, bias_k.repeat(1, bsz, 1)])
        v = torch.concat([v, bias_v.repeat(1, bsz, 1)])
    else:
        assert bias_k is None
        assert bias_v is None
    q = q.reshape([tgt_len, bsz * num_heads, head_dim]).permute([1, 0, 2])
    if static_k is None:  # True
        k = k.reshape([k.shape[0], bsz * num_heads, head_dim]).permute([1, 0, 2])
    else:
        assert (
            static_k.shape[0] == bsz * num_heads
        ), f"expecting static_k.size(0) of {bsz * num_heads}, but got {static_k.shape[0]}"
        assert (
            static_k.shape[2] == head_dim
        ), f"expecting static_k.size(2) of {head_dim}, but got {static_k.shape[2]}"
        k = static_k
    if static_v is None:  # True
        # use permute (not transpose) here, mirroring the handling of q and k above
        v = v.reshape([v.shape[0], bsz * num_heads, head_dim]).permute([1, 0, 2])
    else:
        assert (
            static_v.shape[0] == bsz * num_heads
        ), f"expecting static_v.size(0) of {bsz * num_heads}, but got {static_v.shape[0]}"
        assert (
            static_v.shape[2] == head_dim
        ), f"expecting static_v.size(2) of {head_dim}, but got {static_v.shape[2]}"
        v = static_v
    src_len = k.shape[1]
    if not training:
        dropout_p = 0.0
    attn_output, attn_output_weights = _scaled_dot_product_attention(
        q, k, v, attn_mask, dropout_p
    )
    attn_output = attn_output.permute([1, 0, 2]).reshape([tgt_len, bsz, embed_dim])
    attn_output = linear(
        attn_output, out_proj_weight, out_proj_bias, is_transpose=False
    )
    if need_weights:
        attn_output_weights = attn_output_weights.reshape(
            [bsz, num_heads, tgt_len, src_len]
        )
        return attn_output, attn_output_weights.sum(dim=1) / num_heads
    else:
        return attn_output, None
  1199. class MyMultiheadAttention(nn.Module):
  1200. """
  1201. Custom implementation of a multi-head attention layer.
  1202. Attributes:
  1203. __constants__ (list): List of constant attributes.
  1204. bias_k (Optional[paddle.Tensor]): Optional tensor for key bias.
  1205. bias_v (Optional[paddle.Tensor]): Optional tensor for value bias.
  1206. Args:
  1207. embed_dim (int): Total dimension of the model. This is the size of the input feature vectors.
  1208. num_heads (int): Number of parallel attention heads. The input dimension must be divisible by the number of heads.
  1209. dropout (float, optional): Dropout probability on the attention weights. Default is 0.0.
  1210. bias (bool, optional): If True, adds a learnable bias to the output. Default is True.
  1211. add_bias_kv (bool, optional): If True, adds bias to the key and value sequences. Default is False.
  1212. add_zero_attn (bool, optional): If True, adds a zero attention head. Default is False.
  1213. kdim (int, optional): Total number of features for keys. If None, defaults to embed_dim.
  1214. vdim (int, optional): Total number of features for values. If None, defaults to embed_dim.
  1215. batch_first (bool, optional): If True, the input and output tensors are provided as (batch, seq, feature). Default is False.
  1216. device (optional): The device on which the layer's parameters should be initialized. Default is None.
  1217. dtype (optional): The data type for the parameters. Default is None.
  1218. is_export (bool, optional): If True, the layer is set up for export, potentially changing behavior for compatibility. Default is False.
  1219. """
  1220. __constants__ = ["batch_first"]
  1221. bias_k: Optional[torch.Tensor]
  1222. bias_v: Optional[torch.Tensor]
  1223. def __init__(
  1224. self,
  1225. embed_dim,
  1226. num_heads,
  1227. dropout=0.0,
  1228. bias=True,
  1229. add_bias_kv=False,
  1230. add_zero_attn=False,
  1231. kdim=None,
  1232. vdim=None,
  1233. batch_first=False,
  1234. device=None,
  1235. dtype=None,
  1236. is_export=False,
  1237. ) -> None:
  1238. super(MyMultiheadAttention, self).__init__()
  1239. self.embed_dim = embed_dim
  1240. self.kdim = kdim if kdim is not None else embed_dim
  1241. self.vdim = vdim if vdim is not None else embed_dim
  1242. self._qkv_same_embed_dim = self.kdim == embed_dim and self.vdim == embed_dim
  1243. self.num_heads = num_heads
  1244. self.dropout = dropout
  1245. self.batch_first = batch_first
  1246. self.head_dim = embed_dim // num_heads
  1247. self.is_export = is_export
  1248. assert (
  1249. self.head_dim * num_heads == self.embed_dim
  1250. ), "embed_dim must be divisible by num_heads"
if self._qkv_same_embed_dim is False:
# Separate q/k/v projection weights (kdim/vdim != embed_dim) are not implemented in this port.
pass
else:
if dtype is None:
dtype = torch.float32
self.in_proj_weight = torch.nn.Parameter(torch.empty(3 * embed_dim, embed_dim, dtype=dtype))
self.q_proj_weight = None
self.k_proj_weight = None
self.v_proj_weight = None
if bias:
self.in_proj_bias = torch.nn.Parameter(torch.zeros(3 * embed_dim, dtype=dtype))
else:
self.in_proj_bias = None
  1265. self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
  1266. if add_bias_kv:
  1267. pass
  1268. else:
  1269. self.bias_k = self.bias_v = None
  1270. self.add_zero_attn = add_zero_attn
  1271. self._reset_parameters()
  1272. def _reset_parameters(self):
  1273. if self._qkv_same_embed_dim:
  1274. torch.nn.init.xavier_normal_(self.in_proj_weight)
  1275. else:
  1276. torch.nn.init.xavier_normal_(self.q_proj_weight)
  1277. torch.nn.init.xavier_normal_(self.k_proj_weight)
  1278. torch.nn.init.xavier_normal_(self.v_proj_weight)
  1279. if self.in_proj_bias is not None:
  1280. torch.nn.init.zeros_(self.in_proj_bias)
  1281. torch.nn.init.zeros_(self.out_proj.bias)
  1282. if self.bias_k is not None:
  1283. torch.nn.init.xavier_normal_(self.bias_k)
  1284. if self.bias_v is not None:
  1285. torch.nn.init.xavier_normal_(self.bias_v)
  1286. def forward(
  1287. self,
  1288. query: torch.Tensor,
  1289. key: torch.Tensor,
  1290. value: torch.Tensor,
  1291. key_padding_mask: Optional[torch.Tensor] = None,
  1292. need_weights: bool = True,
  1293. attn_mask: Optional[torch.Tensor] = None,
  1294. ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
  1295. attn_output, attn_output_weights = multi_head_attention_forward(
  1296. query,
  1297. key,
  1298. value,
  1299. self.embed_dim,
  1300. self.num_heads,
  1301. self.in_proj_weight,
  1302. self.in_proj_bias,
  1303. self.bias_k,
  1304. self.bias_v,
  1305. self.add_zero_attn,
  1306. self.dropout,
  1307. self.out_proj.weight,
  1308. self.out_proj.bias,
  1309. training=self.training,
  1310. key_padding_mask=key_padding_mask,
  1311. need_weights=need_weights,
  1312. attn_mask=attn_mask,
  1313. is_export=self.is_export,
  1314. )
  1315. return attn_output, attn_output_weights
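# Minimal usage sketch for MyMultiheadAttention (illustrative only; the 256/8 sizes below
# are assumptions, not values taken from this file). With the default batch_first=False,
# inputs are laid out as (seq_len, bsz, embed_dim):
#
#   attn = MyMultiheadAttention(embed_dim=256, num_heads=8)
#   x = torch.randn(10, 2, 256)              # (seq_len, bsz, embed_dim)
#   out, avg_weights = attn(x, x, x)         # out: (10, 2, 256); avg_weights: (2, 10, 10)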
  1316. class LogitsProcessorList(list):
  1317. """
  1318. A list of logits processors that can be applied sequentially.
  1319. Methods:
  1320. __call__(input_ids, scores, **kwargs): Apply all processors to the given inputs.
  1321. """
  1322. def __call__(self, input_ids, scores, **kwargs):
  1323. for processor in self:
  1324. function_args = inspect.signature(processor.__call__).parameters
  1325. if len(function_args) > 2:
  1326. if not all(arg in kwargs for arg in list(function_args.keys())[2:]):
  1327. raise ValueError(
  1328. f"Make sure that all the required parameters: {list(function_args.keys())} for "
  1329. f"{processor.__class__} are passed to the logits processor."
  1330. )
  1331. scores = processor(input_ids, scores, **kwargs)
  1332. else:
  1333. scores = processor(input_ids, scores)
  1334. return scores
  1335. class ForcedEOSTokenLogitsProcessor(object):
  1336. """
  1337. A processor that forces the generation of an end-of-sequence (EOS) token
  1338. at a specified position in the sequence.
  1339. This is typically used in language generation tasks to ensure that the
  1340. generated sequence ends properly when it reaches a certain length.
  1341. Args:
  1342. max_length (int): The maximum length of the sequence. Forces EOS when this length is reached.
  1343. eos_token_id (Union[int, List[int]]): The ID(s) of the EOS token(s) to be forced in the sequence.
  1344. """
  1345. def __init__(self, max_length: int, eos_token_id: Union[int, List[int]]):
  1346. self.max_length = max_length
  1347. if isinstance(eos_token_id, int):
  1348. eos_token_id = [eos_token_id]
  1349. self.eos_token_id = eos_token_id
  1350. def __call__(self, input_ids, scores):
  1351. cur_len = input_ids.shape[-1]
  1352. scores_processed = scores
  1353. if cur_len == self.max_length - 1:
  1354. scores_processed = torch.full_like(scores, -math.inf)
  1355. scores_processed[:, self.eos_token_id] = 0
  1356. return scores_processed
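# Sketch of how the two classes above cooperate during decoding (the 1537/2 values mirror
# the generation_config set up in UniMERNetHead.__init__ below):
#
#   processors = LogitsProcessorList()
#   processors.append(ForcedEOSTokenLogitsProcessor(max_length=1537, eos_token_id=2))
#   next_token_scores = processors(input_ids, next_token_logits)
#
# Once input_ids reaches max_length - 1, every logit except the EOS token's is set to -inf,
# so the argmax in the generation loop can only emit EOS and decoding terminates.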
  1357. @dataclass
  1358. class CausalLMOutputWithCrossAttentions(ModelOutput):
  1359. loss = None
  1360. logits = None
  1361. past_key_values = None
  1362. hidden_states = None
  1363. attentions = None
  1364. cross_attentions = None
  1365. def __init__(self, *args, **kwargs):
  1366. super().__init__(*args, **kwargs)
  1367. @dataclass
  1368. class CausalLMOutputWithCrossAttentionsAndCounting(ModelOutput):
  1369. """
  1370. Base class for causal language model (or autoregressive) outputs.
  1371. """
  1372. logits = None
  1373. counting = None
  1374. past_key_values = None
  1375. hidden_states = None
  1376. attentions = None
  1377. cross_attentions = None
  1378. def __init__(self, *args, **kwargs):
  1379. super().__init__(*args, **kwargs)
  1380. class CustomMBartDecoder(MBartDecoder):
  1381. """
  1382. A custom MBartDecoder that includes additional processing layers.
  1383. This class extends the MBartDecoder by adding a customizable neural network
  1384. component called `counting_context_weight`, which applies a series of linear
  1385. transformations followed by ReLU activations. This can be used to modify or
  1386. enhance the decoder's behavior for specific tasks.
  1387. Args:
  1388. config: The configuration object containing model parameters.
  1389. """
  1390. def __init__(self, config):
  1391. super().__init__(config)
  1392. hidden_size = config.d_model
  1393. self.is_export = config.is_export
  1394. self.counting_context_weight = nn.Sequential(
  1395. nn.Linear(config.vocab_size, hidden_size),
  1396. nn.ReLU(),
  1397. nn.Linear(hidden_size, hidden_size),
  1398. nn.ReLU(),
  1399. nn.Linear(hidden_size, config.d_model),
  1400. )
  1401. def forward(
  1402. self,
  1403. input_ids=None,
  1404. attention_mask=None,
  1405. count_pred=None,
  1406. encoder_hidden_states=None,
  1407. encoder_attention_mask=None,
  1408. head_mask=None,
  1409. cross_attn_head_mask=None,
  1410. past_key_values=None,
  1411. inputs_embeds=None,
  1412. use_cache=None,
  1413. output_attentions=None,
  1414. output_hidden_states=None,
  1415. return_dict=None,
  1416. ):
self.is_export = not self.training
  1418. output_attentions = (
  1419. output_attentions
  1420. if output_attentions is not None
  1421. else self.config.output_attentions
  1422. )
  1423. output_hidden_states = (
  1424. output_hidden_states
  1425. if output_hidden_states is not None
  1426. else self.config.output_hidden_states
  1427. )
  1428. use_cache = use_cache if use_cache is not None else self.config.use_cache
  1429. return_dict = (
  1430. return_dict if return_dict is not None else self.config.use_return_dict
  1431. )
  1432. if input_ids is not None and inputs_embeds is not None:
  1433. raise ValueError(
  1434. "You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time"
  1435. )
  1436. elif input_ids is not None:
  1437. input = input_ids
  1438. input_shape = input.shape
  1439. input_ids = input_ids.reshape([-1, input_shape[-1]])
  1440. elif inputs_embeds is not None:
  1441. input_shape = inputs_embeds.shape[:-1]
  1442. input = inputs_embeds[:, :, -1]
  1443. else:
  1444. raise ValueError(
  1445. "You have to specify either decoder_input_ids or decoder_inputs_embeds"
  1446. )
  1447. past_key_values_length = (
  1448. past_key_values[0][0].shape[2] if past_key_values is not None else 0
  1449. )
  1450. if inputs_embeds is None:
  1451. inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale
  1452. if self._use_flash_attention_2:
  1453. attention_mask = (
  1454. attention_mask
  1455. if (attention_mask is not None and 0 in attention_mask)
  1456. else None
  1457. )
  1458. else:
  1459. if self.is_export:
  1460. attention_mask = _prepare_4d_causal_attention_mask_export(
  1461. attention_mask,
  1462. input_shape,
  1463. inputs_embeds,
  1464. past_key_values_length,
  1465. is_export=self.is_export,
  1466. ).to(torch.float32)
  1467. else:
  1468. attention_mask = _prepare_4d_causal_attention_mask(
  1469. attention_mask,
  1470. input_shape,
  1471. inputs_embeds,
  1472. past_key_values_length,
  1473. is_export=self.is_export,
  1474. )
  1475. if encoder_hidden_states is not None and encoder_attention_mask is not None:
  1476. if self._use_flash_attention_2:
  1477. encoder_attention_mask = (
  1478. encoder_attention_mask if 0 in encoder_attention_mask else None
  1479. )
  1480. else:
  1481. encoder_attention_mask = _prepare_4d_attention_mask(
  1482. encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]
  1483. )
  1484. # embed positions
  1485. positions = self.embed_positions(input, past_key_values_length)
  1486. hidden_states = inputs_embeds + positions
  1487. # TODO: add counting context weight to hidden_states
  1488. if count_pred is not None:
  1489. count_context_weight = self.counting_context_weight(count_pred)
  1490. hidden_states = hidden_states + 0.5 * count_context_weight.unsqueeze(1)
  1491. hidden_states = self.layernorm_embedding(hidden_states)
  1492. hidden_states = nn.functional.dropout(
  1493. hidden_states, p=self.dropout, training=self.training
  1494. )
  1495. if self.gradient_checkpointing and self.training:
  1496. if use_cache:
print(
"`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
)
  1500. use_cache = False
  1501. # decoder layers
  1502. all_hidden_states = () if output_hidden_states else None
  1503. all_self_attns = () if output_attentions else None
  1504. all_cross_attentions = (
  1505. () if (output_attentions and encoder_hidden_states is not None) else None
  1506. )
  1507. next_decoder_cache = () if use_cache else None
  1508. # check if head_mask/cross_attn_head_mask has a correct number of layers specified if desired
  1509. for attn_mask, mask_name in zip(
  1510. [head_mask, cross_attn_head_mask], ["head_mask", "cross_attn_head_mask"]
  1511. ):
  1512. if attn_mask is not None:
  1513. if attn_mask.size()[0] != len(self.layers):
  1514. raise ValueError(
  1515. f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for"
  1516. f" {attn_mask.size()[0]}."
  1517. )
  1518. for idx, decoder_layer in enumerate(self.layers):
  1519. if output_hidden_states:
  1520. all_hidden_states += (hidden_states,)
  1521. if self.training:
dropout_probability = torch.rand([])
  1523. if dropout_probability < self.layerdrop:
  1524. continue
  1525. past_key_value = (
  1526. past_key_values[idx] if past_key_values is not None else None
  1527. )
  1528. if self.gradient_checkpointing and self.training:
  1529. layer_outputs = self._gradient_checkpointing_func(
  1530. decoder_layer.__call__,
  1531. hidden_states,
  1532. attention_mask,
  1533. encoder_hidden_states,
  1534. encoder_attention_mask,
  1535. head_mask[idx] if head_mask is not None else None,
  1536. (
  1537. cross_attn_head_mask[idx]
  1538. if cross_attn_head_mask is not None
  1539. else None
  1540. ),
  1541. None,
  1542. output_attentions,
  1543. use_cache,
  1544. )
  1545. else:
  1546. layer_outputs = decoder_layer(
  1547. hidden_states,
  1548. attention_mask=attention_mask,
  1549. encoder_hidden_states=encoder_hidden_states,
  1550. encoder_attention_mask=encoder_attention_mask,
  1551. layer_head_mask=(head_mask[idx] if head_mask is not None else None),
  1552. cross_attn_layer_head_mask=(
  1553. cross_attn_head_mask[idx]
  1554. if cross_attn_head_mask is not None
  1555. else None
  1556. ),
  1557. past_key_value=past_key_value,
  1558. output_attentions=output_attentions,
  1559. use_cache=use_cache,
  1560. )
  1561. hidden_states = layer_outputs[0]
  1562. if self.is_export:
  1563. next_decoder_cache += (layer_outputs[3 if output_attentions else 1],)
  1564. else:
  1565. if use_cache:
  1566. next_decoder_cache += (
  1567. layer_outputs[3 if output_attentions else 1],
  1568. )
  1569. if output_attentions:
  1570. all_self_attns += (layer_outputs[1],)
  1571. if encoder_hidden_states is not None:
  1572. all_cross_attentions += (layer_outputs[2],)
  1573. hidden_states = self.layer_norm(hidden_states)
  1574. if output_hidden_states:
  1575. all_hidden_states += (hidden_states,)
  1576. if self.is_export:
  1577. next_cache = next_decoder_cache
  1578. else:
  1579. next_cache = next_decoder_cache if use_cache else None
  1580. if not self.is_export:
  1581. if not return_dict:
  1582. return tuple(
  1583. v
  1584. for v in [
  1585. hidden_states,
  1586. next_cache,
  1587. all_hidden_states,
  1588. all_self_attns,
  1589. all_cross_attentions,
  1590. ]
  1591. if v is not None
  1592. )
  1593. return BaseModelOutputWithPastAndCrossAttentions(
  1594. last_hidden_state=hidden_states,
  1595. past_key_values=next_cache,
  1596. hidden_states=all_hidden_states,
  1597. attentions=all_self_attns,
  1598. cross_attentions=all_cross_attentions,
  1599. )
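# Length-aware addition made by CustomMBartDecoder above: when count_pred (a per-sample
# vocabulary-count prediction of shape (bsz, vocab_size)) is given, it is projected to
# d_model by counting_context_weight and added to every decoder position with a fixed
# 0.5 weight before layernorm_embedding:
#
#   hidden_states = inputs_embeds + positions + 0.5 * counting_context_weight(count_pred).unsqueeze(1)
#
# Apart from this (and the export-specific attention-mask branch), the forward pass
# follows the stock MBartDecoder.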
  1600. class SelfAttentionBlock(nn.Module):
  1601. """
  1602. A self-attention block that implements multi-head self-attention
  1603. followed by a feed-forward network, typically used in transformer architectures.
  1604. Args:
  1605. embed_size (int): The size of the embedding vector.
  1606. num_heads (int): The number of attention heads.
  1607. is_export (bool): Flag indicating whether to configure the layer for export.
  1608. """
  1609. def __init__(self, embed_size, num_heads, is_export):
  1610. super(SelfAttentionBlock, self).__init__()
  1611. self.self_attention = MyMultiheadAttention(
  1612. embed_dim=embed_size, num_heads=num_heads, is_export=is_export
  1613. )
  1614. self.norm = nn.LayerNorm(embed_size)
  1615. def forward(self, x):
  1616. attn_output, _ = self.self_attention(x, x, x)
  1617. x = self.norm(attn_output + x)
  1618. return x
  1619. class SeqCountingDecoder(nn.Module):
  1620. """
  1621. A custom sequence counting decoder that incorporates multi-head attention layers
and feed-forward networks to process sequences, e.g. for predicting LaTeX token counts.
  1623. Args:
  1624. in_features (int): The number of input features.
  1625. out_features (int): The number of output features.
  1626. num_heads (int): The number of attention heads. Defaults to 8.
  1627. num_layers (int): The number of attention layers. Defaults to 4.
  1628. is_export (bool): Flag indicating whether to configure the layer for export.
  1629. """
  1630. def __init__(
  1631. self, in_features, out_features, num_heads=8, num_layers=4, is_export=False
  1632. ):
  1633. super(SeqCountingDecoder, self).__init__()
  1634. self.attention_blocks = nn.ModuleList(
  1635. [
  1636. SelfAttentionBlock(
  1637. embed_size=in_features, num_heads=num_heads, is_export=is_export
  1638. )
  1639. for i in range(num_layers)
  1640. ]
  1641. )
  1642. self.fc1 = nn.Linear(in_features, in_features // 2)
  1643. self.relu = nn.ReLU()
  1644. self.global_avg_pool = nn.AdaptiveAvgPool1d(1)
  1645. self.fc2 = nn.Linear(in_features // 2, out_features)
  1646. def forward(self, x):
  1647. for block in self.attention_blocks:
  1648. x = block(x)
  1649. x = self.fc1(x)
  1650. x = self.relu(x)
x = x.permute(0, 2, 1)
  1652. x = self.global_avg_pool(x)
  1653. x = x.squeeze(-1)
  1654. x = self.fc2(x)
  1655. return x
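# Shape walk-through for SeqCountingDecoder.forward above (in_features/out_features are
# whatever the caller passes; CustomMBartForCausalLM below uses d_model and vocab_size):
#   x: (bsz, seq_len, in_features)            -> unchanged by the attention blocks
#   fc1 + relu: (bsz, seq_len, in_features // 2)
#   permute + AdaptiveAvgPool1d(1) + squeeze:  (bsz, in_features // 2)   # average over seq_len
#   fc2: (bsz, out_features)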
  1656. class CustomMBartForCausalLM(MBartForCausalLM):
  1657. """
  1658. Custom MBart model for causal language modeling with a custom decoder.
  1659. This class extends the MBartForCausalLM by replacing its decoder with a
  1660. custom decoder, allowing for additional flexibility and features in the
  1661. decoding process.
  1662. Args:
  1663. config: The configuration object containing model parameters.
  1664. length_aware (bool): A flag to enable or configure length-aware mechanisms.
  1665. """
  1666. def __init__(self, config, length_aware=True):
  1667. super().__init__(config)
  1668. self.model.decoder = CustomMBartDecoder(config)
  1669. self.counting_decoder = SeqCountingDecoder(
  1670. config.d_model, config.vocab_size, is_export=config.is_export
  1671. )
  1672. self.length_aware = length_aware
  1673. def forward(
  1674. self,
  1675. input_ids=None,
  1676. attention_mask=None,
  1677. encoder_hidden_states=None,
  1678. encoder_attention_mask=None,
  1679. head_mask=None,
  1680. cross_attn_head_mask=None,
  1681. past_key_values=None,
  1682. inputs_embeds=None,
  1683. labels=None,
  1684. use_cache=None,
  1685. output_attentions=None,
  1686. output_hidden_states=None,
  1687. return_dict=None,
  1688. count_gt=None,
  1689. ):
  1690. output_attentions = (
  1691. output_attentions
  1692. if output_attentions is not None
  1693. else self.config.output_attentions
  1694. )
  1695. output_hidden_states = (
  1696. output_hidden_states
  1697. if output_hidden_states is not None
  1698. else self.config.output_hidden_states
  1699. )
  1700. return_dict = (
  1701. return_dict if return_dict is not None else self.config.use_return_dict
  1702. )
  1703. if self.length_aware:
  1704. count_pred = self.counting_decoder(encoder_hidden_states)
  1705. else:
  1706. count_pred = None
  1707. outputs = self.model.decoder(
  1708. input_ids=input_ids,
  1709. attention_mask=attention_mask,
  1710. count_pred=count_pred,
  1711. encoder_hidden_states=encoder_hidden_states,
  1712. encoder_attention_mask=encoder_attention_mask,
  1713. head_mask=head_mask,
  1714. cross_attn_head_mask=cross_attn_head_mask,
  1715. past_key_values=past_key_values,
  1716. inputs_embeds=inputs_embeds,
  1717. use_cache=use_cache,
  1718. output_attentions=output_attentions,
  1719. output_hidden_states=output_hidden_states,
  1720. return_dict=return_dict,
  1721. )
  1722. logits = self.lm_head(outputs[0])
  1723. return CausalLMOutputWithCrossAttentionsAndCounting(
  1724. logits=logits,
  1725. counting=count_pred,
  1726. past_key_values=outputs.past_key_values,
  1727. hidden_states=outputs.hidden_states,
  1728. attentions=outputs.attentions,
  1729. cross_attentions=outputs.cross_attentions,
  1730. )
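# Callers of CustomMBartForCausalLM.forward receive both token logits and the auxiliary
# count prediction, e.g. (sketch; the variable names are illustrative):
#
#   out = decoder_lm(input_ids=ids, encoder_hidden_states=enc_states, return_dict=True)
#   logits = out.logits          # (bsz, tgt_len, vocab_size)
#   count_pred = out.counting    # (bsz, vocab_size), or None when length_aware=False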
  1731. class UniMERNetHead(nn.Module):
  1732. """Implementation of UniMERNetHead decoder.
  1733. Args:
  1734. max_new_tokens (int): Maximum number of new tokens to generate.
  1735. decoder_start_token_id (int): ID of the token that starts the decoding.
  1736. temperature (float): Sampling temperature for generation.
  1737. do_sample (bool): Whether to use sampling; if False, uses greedy decoding.
  1738. top_p (float): Top-p (nucleus) sampling parameter.
  1739. in_channels (int): Number of input channels/features.
  1740. encoder_hidden_size (int): Hidden size of the encoder.
  1741. decoder_hidden_size (int): Hidden size of the decoder.
  1742. decoder_ffn_dim (int): Dimension of the decoder's feed-forward network.
  1743. decoder_layers (int): Number of layers in the decoder.
  1744. is_export (bool): Flag indicating if the model is being prepared for export.
  1745. length_aware (bool): Flag to enable length-aware mechanisms.
  1746. """
  1747. def __init__(
  1748. self,
  1749. max_new_tokens=1536,
  1750. decoder_start_token_id=0,
  1751. temperature=0.2,
  1752. do_sample=False,
  1753. top_p=0.95,
  1754. in_channels=1024,
  1755. encoder_hidden_size=1024,
  1756. decoder_hidden_size=1024,
  1757. decoder_ffn_dim=4096,
  1758. decoder_layers=8,
  1759. is_export=False,
  1760. length_aware=True,
  1761. ):
  1762. super().__init__()
  1763. mbart_config_dict = {
  1764. "activation_dropout": 0.0,
  1765. "activation_function": "gelu",
  1766. "add_cross_attention": True,
  1767. "add_final_layer_norm": True,
  1768. "attention_dropout": 0.0,
  1769. "bos_token_id": 0,
  1770. "classifier_dropout": 0.0,
  1771. "d_model": decoder_hidden_size,
  1772. "decoder_attention_heads": 16,
  1773. "decoder_ffn_dim": decoder_ffn_dim,
  1774. "decoder_layerdrop": 0.0,
  1775. "decoder_layers": decoder_layers,
  1776. "dropout": 0.1,
  1777. "encoder_attention_heads": 16,
  1778. "encoder_ffn_dim": 4096,
  1779. "encoder_layerdrop": 0.0,
  1780. "encoder_layers": 12,
  1781. "eos_token_id": 2,
  1782. "forced_eos_token_id": 2,
  1783. "init_std": 0.02,
  1784. "is_decoder": True,
  1785. "is_encoder_decoder": False,
  1786. "output_hidden_states": False,
  1787. "max_position_embeddings": max_new_tokens,
  1788. "model_type": "mbart",
  1789. "num_hidden_layers": 12,
  1790. "pad_token_id": 1,
  1791. "scale_embedding": True,
  1792. "tie_word_embeddings": False,
  1793. "transformers_version": "4.40.0",
  1794. "use_cache": True,
  1795. "use_return_dict": True,
  1796. "vocab_size": 50000,
  1797. "_attn_implementation": "eager",
  1798. "hidden_size": decoder_hidden_size,
  1799. "is_export": is_export,
  1800. }
  1801. self.max_new_tokens = max_new_tokens
  1802. self.decoder_start_token_id = decoder_start_token_id
  1803. self.temperature = temperature
  1804. self.do_sample = do_sample
  1805. self.top_p = top_p
  1806. self.max_seq_len = max_new_tokens
  1807. self.config_decoder = MBartConfig(**mbart_config_dict)
  1808. self.encoder_hidden_size = encoder_hidden_size
  1809. self.is_export = self.config_decoder.is_export
  1810. self.decoder = CustomMBartForCausalLM(
  1811. self.config_decoder, length_aware=length_aware
  1812. )
  1813. if self.config_decoder.hidden_size != self.encoder_hidden_size:
  1814. self.enc_to_dec_proj = nn.Linear(
  1815. self.encoder_hidden_size, self.config_decoder.hidden_size
  1816. )
  1817. generation_config = {
  1818. "max_length": 1537,
  1819. "forced_eos_token_id": 2,
  1820. }
  1821. self.eos_token_id = generation_config["forced_eos_token_id"]
  1822. self.pad_token_id = self.config_decoder.pad_token_id
  1823. self.logits_processor = LogitsProcessorList()
  1824. self.logits_processor.append(
  1825. ForcedEOSTokenLogitsProcessor(
  1826. generation_config["max_length"],
  1827. generation_config["forced_eos_token_id"],
  1828. )
  1829. )
  1830. def _get_decoder_start_token_id(
  1831. self, decoder_start_token_id=None, bos_token_id=None
  1832. ) -> int:
  1833. decoder_start_token_id = (
  1834. decoder_start_token_id
  1835. if decoder_start_token_id is not None
  1836. else self.generation_config.decoder_start_token_id
  1837. )
  1838. bos_token_id = (
  1839. bos_token_id
  1840. if bos_token_id is not None
  1841. else self.generation_config.bos_token_id
  1842. )
  1843. if decoder_start_token_id is not None:
  1844. return decoder_start_token_id
  1845. elif bos_token_id is not None:
  1846. return bos_token_id
  1847. raise ValueError(
  1848. "`decoder_start_token_id` or `bos_token_id` has to be defined for encoder-decoder generation."
  1849. )
  1850. def _prepare_decoder_input_ids_for_generation(
  1851. self,
  1852. batch_size,
  1853. model_kwargs,
  1854. decoder_start_token_id=None,
  1855. bos_token_id=None,
  1856. ):
  1857. if model_kwargs is not None and "decoder_input_ids" in model_kwargs:
  1858. decoder_input_ids = model_kwargs.pop("decoder_input_ids")
  1859. elif "input_ids" in model_kwargs:
  1860. decoder_input_ids = model_kwargs.pop("input_ids")
  1861. else:
  1862. decoder_input_ids = None
  1863. decoder_start_token_id = self._get_decoder_start_token_id(
  1864. decoder_start_token_id, bos_token_id
  1865. )
  1866. if isinstance(decoder_start_token_id, list):
  1867. if len(decoder_start_token_id) != batch_size:
  1868. raise ValueError(
  1869. f"`decoder_start_token_id` expected to have length {batch_size} but got {len(decoder_start_token_id)}"
  1870. )
  1871. decoder_input_ids_start = torch.LongTensor(decoder_start_token_id)
  1872. decoder_input_ids_start = decoder_input_ids_start.view(-1, 1)
  1873. else:
  1874. decoder_input_ids_start = (
  1875. torch.ones(
  1876. (batch_size, 1),
  1877. dtype=torch.int64,
  1878. )
  1879. * decoder_start_token_id
  1880. )
  1881. if decoder_input_ids is None:
  1882. decoder_input_ids = decoder_input_ids_start
  1883. elif (
  1884. self.config.model_type == "vision-encoder-decoder"
  1885. and "donut" in self.name_or_path.lower()
  1886. ):
  1887. pass
  1888. elif self.config.model_type in ["whisper"]:
  1889. pass
  1890. elif (
  1891. isinstance(decoder_start_token_id, int)
  1892. and (decoder_input_ids[:, 0] != decoder_start_token_id).all().item()
  1893. ) or (
  1894. isinstance(decoder_start_token_id, torch.Tensor)
  1895. and (decoder_input_ids[:, 0] != decoder_start_token_id[:, 0]).all().item()
  1896. ):
  1897. decoder_input_ids = torch.concat(
  1898. [decoder_input_ids_start, decoder_input_ids], dim=-1
  1899. )
  1900. if "decoder_attention_mask" in model_kwargs:
  1901. decoder_attention_mask = model_kwargs["decoder_attention_mask"]
  1902. decoder_attention_mask = torch.cat(
  1903. (
  1904. torch.ones_like(decoder_attention_mask)[:, :1],
  1905. decoder_attention_mask,
  1906. ),
  1907. dim=-1,
  1908. )
  1909. model_kwargs["decoder_attention_mask"] = decoder_attention_mask
  1910. return decoder_input_ids, model_kwargs
  1911. def prepare_inputs_for_generation_mbart(
  1912. self,
  1913. input_ids,
  1914. past_key_values=None,
  1915. attention_mask=None,
  1916. use_cache=None,
  1917. **kwargs,
  1918. ):
  1919. if attention_mask is None:
  1920. attention_mask = torch.ones(input_ids.shape)
  1921. if past_key_values:
  1922. past_length = past_key_values[0][0].shape[2]
  1923. if input_ids.shape[1] > past_length:
  1924. remove_prefix_length = past_length
  1925. else:
  1926. remove_prefix_length = input_ids.shape[1] - 1
  1927. input_ids = input_ids[:, remove_prefix_length:]
  1928. return {
  1929. "input_ids": input_ids,
  1930. "attention_mask": attention_mask,
  1931. "past_key_values": past_key_values,
  1932. "use_cache": use_cache,
  1933. }
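# Cache-aware trimming in prepare_inputs_for_generation_mbart above: once past_key_values
# exist, only the tokens not yet consumed by the decoder are kept (normally just the last
# one), e.g. input_ids of shape (bsz, 5) with past_length 4 is cut to (bsz, 1).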
  1934. def prepare_inputs_for_generation(
  1935. self,
  1936. input_ids,
  1937. past_key_values=None,
  1938. attention_mask=None,
  1939. use_cache=None,
  1940. encoder_outputs=None,
  1941. **kwargs,
  1942. ):
  1943. decoder_inputs = self.prepare_inputs_for_generation_mbart(
  1944. input_ids, past_key_values=past_key_values
  1945. )
  1946. decoder_attention_mask = (
  1947. decoder_inputs["attention_mask"]
  1948. if "attention_mask" in decoder_inputs
  1949. else None
  1950. )
  1951. input_dict = {
  1952. "attention_mask": attention_mask,
  1953. "decoder_attention_mask": decoder_attention_mask,
  1954. "decoder_input_ids": decoder_inputs["input_ids"],
  1955. "encoder_outputs": encoder_outputs,
  1956. "past_key_values": decoder_inputs["past_key_values"],
  1957. "use_cache": use_cache,
  1958. }
  1959. return input_dict
  1960. def prepare_inputs_for_generation_export(
  1961. self,
  1962. past_key_values=None,
  1963. attention_mask=None,
  1964. use_cache=None,
  1965. encoder_outputs=None,
  1966. **kwargs,
  1967. ):
  1968. input_dict = {
  1969. "decoder_attention_mask": None,
  1970. "use_cache": use_cache,
  1971. }
  1972. return input_dict
  1973. def _extract_past_from_model_output(
  1974. self, outputs: ModelOutput, standardize_cache_format: bool = False
  1975. ):
  1976. past_key_values = None
  1977. if "past_key_values" in outputs:
  1978. past_key_values = outputs.past_key_values
  1979. elif "mems" in outputs:
  1980. past_key_values = outputs.mems
  1981. elif "past_buckets_states" in outputs:
  1982. past_key_values = outputs.past_buckets_states
  1983. return past_key_values
  1984. def _update_model_kwargs_for_generation(
  1985. self,
  1986. outputs: ModelOutput,
  1987. model_kwargs: Dict[str, Any],
  1988. is_encoder_decoder: bool = False,
  1989. standardize_cache_format: bool = False,
  1990. ) -> Dict[str, Any]:
  1991. model_kwargs["past_key_values"] = self._extract_past_from_model_output(
  1992. outputs, standardize_cache_format=standardize_cache_format
  1993. )
  1994. if getattr(outputs, "state", None) is not None:
  1995. model_kwargs["state"] = outputs.state
  1996. if "token_type_ids" in model_kwargs:
  1997. token_type_ids = model_kwargs["token_type_ids"]
  1998. model_kwargs["token_type_ids"] = torch.concat(
  1999. [token_type_ids, token_type_ids[:, -1].unsqueeze(-1)], dim=-1
  2000. )
  2001. if not is_encoder_decoder:
  2002. if "attention_mask" in model_kwargs:
  2003. attention_mask = model_kwargs["attention_mask"]
  2004. model_kwargs["attention_mask"] = torch.concat(
  2005. [
  2006. attention_mask,
  2007. attention_mask.new_ones((attention_mask.shape[0], 1)),
  2008. ],
  2009. dim=-1,
  2010. )
  2011. else:
  2012. if "decoder_attention_mask" in model_kwargs:
  2013. decoder_attention_mask = model_kwargs["decoder_attention_mask"]
  2014. model_kwargs["decoder_attention_mask"] = torch.concat(
  2015. [
  2016. decoder_attention_mask,
  2017. decoder_attention_mask.new_ones(
  2018. (decoder_attention_mask.shape[0], 1)
  2019. ),
  2020. ],
  2021. dim=-1,
  2022. )
  2023. if (
  2024. "cache_position" in model_kwargs
  2025. and model_kwargs["cache_position"] is not None
  2026. ):
  2027. model_kwargs["cache_position"] = model_kwargs["cache_position"][-1:] + 1
  2028. return model_kwargs
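# After every decoding step the helper above re-attaches the cache returned by the model
# and grows the relevant attention mask by one column of ones, e.g. a (bsz, t)
# decoder_attention_mask becomes (bsz, t + 1), matching the token appended to input_ids.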
def stopping_criteria(self, input_ids):
eos = torch.tensor([self.eos_token_id], device=input_ids.device)
if self.is_export:
return input_ids[:, -1] == eos
is_done = torch.isin(input_ids[:, -1], eos)
return is_done
  2034. def generate_single_iter(
  2035. self,
  2036. decoder_input_ids=None,
  2037. decoder_attention_mask=None,
  2038. encoder_outputs=None,
  2039. past_key_values=None,
  2040. decoder_inputs_embeds=None,
  2041. labels=None,
  2042. use_cache=None,
  2043. output_attentions=None,
  2044. output_hidden_states=None,
  2045. return_dict=None,
  2046. **kwargs,
  2047. ):
  2048. encoder_hidden_states = encoder_outputs[0]
  2049. if self.config_decoder.hidden_size != self.encoder_hidden_size:
  2050. encoder_hidden_states = self.enc_to_dec_proj(encoder_hidden_states)
  2051. kwargs_decoder = {}
  2052. decoder_outputs = self.decoder(
  2053. input_ids=decoder_input_ids,
  2054. attention_mask=decoder_attention_mask,
  2055. encoder_hidden_states=encoder_hidden_states,
  2056. encoder_attention_mask=None,
  2057. inputs_embeds=None,
  2058. output_attentions=False,
  2059. output_hidden_states=output_hidden_states,
  2060. use_cache=use_cache,
  2061. past_key_values=past_key_values,
  2062. return_dict=return_dict,
  2063. **kwargs_decoder,
  2064. )
  2065. return Seq2SeqLMOutput(
  2066. loss=None,
  2067. logits=decoder_outputs.logits,
  2068. past_key_values=decoder_outputs.past_key_values,
  2069. decoder_hidden_states=decoder_outputs.hidden_states,
  2070. decoder_attentions=decoder_outputs.attentions,
  2071. cross_attentions=decoder_outputs.cross_attentions,
  2072. encoder_last_hidden_state=encoder_outputs.last_hidden_state,
  2073. encoder_hidden_states=encoder_outputs.hidden_states,
  2074. encoder_attentions=encoder_outputs.attentions,
  2075. )
  2076. @torch.no_grad()
  2077. def generate(
  2078. self,
  2079. model_kwargs,
  2080. ):
  2081. """
  2082. Generate sequences using the UniMERNetHead for inference tasks.
  2083. Args:
  2084. model_kwargs (dict): A dictionary of model configurations and inputs, which typically include:
  2085. - encoder_outputs: Outputs from the encoder.
  2086. - use_cache: Boolean flag to indicate if caching should be used.
  2087. - output_attentions: Boolean flag for outputting attention scores.
  2088. - output_hidden_states: Boolean flag for outputting hidden states.
  2089. Returns:
  2090. A tensor containing the generated sequences.
  2091. """
  2092. batch_size = model_kwargs["encoder_outputs"]["last_hidden_state"].shape[0]
  2093. generation_config = {
  2094. "decoder_start_token_id": 0,
  2095. "bos_token_id": 0,
  2096. }
  2097. input_ids, model_kwargs = self._prepare_decoder_input_ids_for_generation(
  2098. batch_size=batch_size,
  2099. model_kwargs=model_kwargs,
  2100. decoder_start_token_id=generation_config["decoder_start_token_id"],
  2101. bos_token_id=generation_config["bos_token_id"],
  2102. )
  2103. model_kwargs["key use_cache"] = True
  2104. batch_size, cur_len = input_ids.shape
  2105. if "inputs_embeds" in model_kwargs:
  2106. cur_len = model_kwargs["inputs_embeds"].shape[1]
  2107. model_kwargs["cache_position"] = torch.arange(cur_len)
  2108. pad_token_id = self.pad_token_id
  2109. eos_token_id = [self.eos_token_id]
  2110. eos_token = self.eos_token_id
  2111. unfinished_sequences = torch.ones(batch_size, dtype=torch.int64)
  2112. for idx in range(self.max_seq_len):
  2113. model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)
  2114. outputs = self.generate_single_iter(
  2115. **model_inputs,
  2116. return_dict=True,
  2117. output_attentions=False,
  2118. output_hidden_states=False,
  2119. )
  2120. next_token_logits = outputs.logits[:, -1, :]
  2121. next_tokens_scores = self.logits_processor(input_ids, next_token_logits)
  2122. next_tokens = torch.argmax(next_tokens_scores, dim=-1)
  2123. if eos_token_id is not None:
  2124. if pad_token_id is None:
  2125. raise ValueError(
  2126. "If `eos_token_id` is defined, make sure that `pad_token_id` is defined."
  2127. )
  2128. next_tokens = next_tokens * unfinished_sequences + pad_token_id * (
  2129. 1 - unfinished_sequences
  2130. )
  2131. input_ids = torch.concat([input_ids, next_tokens[:, None]], dim=-1)
  2132. model_kwargs = self._update_model_kwargs_for_generation(
  2133. outputs,
  2134. model_kwargs,
  2135. is_encoder_decoder=self.config_decoder.is_encoder_decoder,
  2136. )
  2137. unfinished_sequences = unfinished_sequences & ~self.stopping_criteria(
  2138. input_ids
  2139. ).to(torch.int64)
  2140. if (
  2141. eos_token is not None
  2142. and (
  2143. torch.cumsum((input_ids == eos_token).to(torch.int64), 1)[:, -1]
  2144. >= 1
  2145. ).all()
  2146. ):
  2147. break
  2148. return input_ids
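# Greedy-decoding sketch for the loop above (names are illustrative; at inference time
# generate() is normally reached through UniMERNetHead.forward):
#
#   model_kwargs = {"encoder_outputs": encoder_outputs, "use_cache": True,
#                   "output_attentions": False, "output_hidden_states": False}
#   token_ids = head.generate(model_kwargs)   # LongTensor of generated token ids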
  2149. @torch.no_grad()
  2150. def generate_export(
  2151. self,
  2152. encoder_outputs,
  2153. model_kwargs,
  2154. ):
  2155. batch_size = encoder_outputs["last_hidden_state"].shape[0]
  2156. generation_config = {
  2157. "decoder_start_token_id": 0,
  2158. "bos_token_id": 0,
  2159. }
  2160. input_ids, model_kwargs = self._prepare_decoder_input_ids_for_generation(
  2161. batch_size=batch_size,
  2162. model_kwargs=model_kwargs,
  2163. decoder_start_token_id=generation_config["decoder_start_token_id"],
  2164. bos_token_id=generation_config["bos_token_id"],
  2165. )
  2166. input_ids = input_ids.reshape([-1, 1])
  2167. decoder_input_ids = input_ids
  2168. model_kwargs["key use_cache"] = True
  2169. batch_size, cur_len = input_ids.shape
  2170. if "inputs_embeds" in model_kwargs:
  2171. cur_len = model_kwargs["inputs_embeds"].shape[1]
  2172. cache_position = torch.arange(cur_len)
  2173. pad_token_id = self.pad_token_id
  2174. eos_token_id = [self.eos_token_id]
  2175. eos_token = self.eos_token_id
  2176. unfinished_sequences = torch.ones([batch_size], dtype=torch.int64)
  2177. i_idx = torch.full([], 0)
  2178. past_key_values = []
  2179. for i in range(8):
  2180. init_arr = torch.zeros([batch_size, 16, 0, 64])
  2181. cache = (init_arr, init_arr, init_arr, init_arr)
  2182. past_key_values.append(cache)
  2183. idx = 0
while i_idx < torch.tensor(self.max_seq_len):
  2185. model_inputs = self.prepare_inputs_for_generation_export(
  2186. past_key_values=past_key_values, **model_kwargs
  2187. )
# model_inputs["decoder_attention_mask"] is always None in the export path; use a full mask of ones.
decoder_attention_mask = torch.ones(input_ids.shape)
  2190. outputs = self.generate_single_iter(
  2191. decoder_input_ids=decoder_input_ids,
  2192. decoder_attention_mask=decoder_attention_mask,
  2193. encoder_outputs=encoder_outputs,
  2194. past_key_values=past_key_values,
  2195. return_dict=True,
  2196. output_attentions=False,
  2197. output_hidden_states=False,
  2198. )
  2199. next_token_logits = outputs.logits[:, -1, :]
  2200. next_tokens_scores = self.logits_processor(input_ids, next_token_logits)
  2201. next_tokens = torch.argmax(next_tokens_scores, dim=-1)
  2202. if eos_token_id is not None:
  2203. next_tokens = next_tokens * unfinished_sequences + pad_token_id * (
  2204. 1 - unfinished_sequences
  2205. )
  2206. input_ids = torch.concat([input_ids, next_tokens.unsqueeze(1)], dim=-1)
  2207. past_length = past_key_values[0][0].shape[2]
  2208. decoder_input_ids = next_tokens.unsqueeze(1)
  2209. past_key_values = outputs.past_key_values
  2210. cache_position = cache_position[-1:] + 1
  2211. unfinished_sequences = unfinished_sequences & ~self.stopping_criteria(
  2212. input_ids
  2213. ).to(torch.int64)
  2214. if (
  2215. eos_token is not None
  2216. and (
  2217. torch.cumsum((input_ids == eos_token).to(torch.int64), 1)[:, -1]
  2218. >= 1
  2219. ).all()
  2220. ):
  2221. break
  2222. i_idx += 1
  2223. return input_ids
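# generate_export mirrors generate() but is shaped for static-graph export: the KV cache
# is pre-seeded with zero-length (bsz, 16, 0, 64) tensors per layer (16 heads x 64 head_dim
# for the default 1024-d decoder), the loop counter is a 0-dim tensor, and only the last
# predicted token is fed back as decoder_input_ids at each step.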
def forward_train(
  2225. self,
  2226. encoder_outputs,
  2227. decoder_input_ids,
  2228. decoder_attention_mask,
  2229. past_key_values=None,
  2230. decoder_inputs_embeds=None,
  2231. labels=None,
  2232. use_cache=None,
  2233. output_attentions=None,
  2234. output_hidden_states=None,
  2235. return_dict=None,
  2236. **kwargs,
  2237. ):
  2238. """
  2239. Training for the UniMERNetHead.
  2240. Args:
  2241. encoder_outputs: Outputs from the encoder, used as input to the decoder.
  2242. decoder_input_ids: Input IDs for the decoder.
  2243. decoder_attention_mask: Attention mask for the decoder inputs.
  2244. past_key_values: Cached key/values for faster decoding.
  2245. decoder_inputs_embeds: Optional embeddings for the decoder inputs.
  2246. labels: Target labels for calculating loss.
  2247. use_cache: Whether to use cache during decoding.
  2248. output_attentions: Whether to return attention scores.
  2249. output_hidden_states: Whether to return hidden states.
  2250. return_dict: Whether to return a dictionary of outputs.
  2251. **kwargs: Additional keyword arguments.
  2252. Returns:
  2253. logits: The raw, unnormalized predictions from the model.
  2254. count_pred: Optional prediction related to sequence length or other counts.
  2255. masked_labels: The labels used during training, possibly masked.
  2256. """
labels = decoder_input_ids.clone()
labels = labels.masked_fill(labels == self.pad_token_id, -100)
  2259. input_decoder_input_ids = decoder_input_ids[:, :-1]
  2260. input_decoder_attention_mask = decoder_attention_mask[:, :-1]
  2261. encoder_hidden_states = encoder_outputs[0]
  2262. if self.config_decoder.hidden_size != self.encoder_hidden_size:
  2263. encoder_hidden_states = self.enc_to_dec_proj(encoder_hidden_states)
  2264. kwargs_decoder = {}
  2265. decoder_outputs = self.decoder(
  2266. input_ids=input_decoder_input_ids,
  2267. attention_mask=input_decoder_attention_mask,
  2268. encoder_hidden_states=encoder_hidden_states,
  2269. encoder_attention_mask=None,
  2270. inputs_embeds=None,
  2271. output_attentions=False,
  2272. output_hidden_states=output_hidden_states,
  2273. use_cache=use_cache,
  2274. past_key_values=past_key_values,
  2275. return_dict=return_dict,
  2276. **kwargs_decoder,
  2277. )
  2278. logits = decoder_outputs.logits
  2279. count_pred = decoder_outputs.counting
  2280. return logits, count_pred, labels
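# Teacher forcing in forward_train above: the decoder consumes decoder_input_ids[:, :-1]
# while labels keep the full sequence (pad positions masked to -100), so the loss computed
# downstream can align the prediction at step t with the ground-truth token at step t + 1.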
  2281. def forward(self, inputs, targets=None):
  2282. """
  2283. Forward pass for the UniMERNetHead, handling both training and inference.
  2284. Args:
  2285. inputs: The input data, which can vary based on training or inference.
  2286. targets: The target labels, used only during training.
  2287. Returns:
  2288. During inference: Returns predicted latex code.
  2289. During training: Returns logits, predicted counts, and masked labels.
  2290. """
self.is_export = not self.training
  2292. if not self.training:
  2293. encoder_outputs = inputs
  2294. if self.is_export:
  2295. model_kwargs = {
  2296. "output_attentions": False,
  2297. "output_hidden_states": False,
  2298. "use_cache": True,
  2299. }
  2300. word_pred = self.generate_export(encoder_outputs, model_kwargs)
  2301. else:
  2302. model_kwargs = {
  2303. "output_attentions": False,
  2304. "output_hidden_states": False,
  2305. "use_cache": True,
  2306. "encoder_outputs": encoder_outputs,
  2307. }
  2308. word_pred = self.generate(model_kwargs)
  2309. return word_pred
  2310. encoder_outputs, tgt_seq, mask = inputs
logits, count_pred, masked_labels = self.forward_train(
encoder_outputs, tgt_seq, mask
)
  2314. return logits, count_pred, masked_labels
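# Minimal inference sketch for UniMERNetHead (illustrative; `encoder_outputs` is whatever
# the surrounding vision encoder produces, with last_hidden_state of shape
# (bsz, seq_len, encoder_hidden_size)):
#
#   head = UniMERNetHead(is_export=False)
#   head.eval()
#   with torch.no_grad():
#       token_ids = head(encoder_outputs)   # greedy-decoded LaTeX token ids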