Transformer貌似也是可以使用state递归解码和训练的
import paddle
import numpy as npclass HeadLoss(paddle.nn.Layer):def __init__(self):super(HeadLoss, self).__init__()
import paddle
import numpy as npclass HeadLoss(paddle.nn.Layer):def __init__(self):super(HeadLoss, self).__init__()