特征交叉 - Deep & Cross Network(DCN)学习
一. TensorFlow 官方实现
TensorFlow 的官方实现(TensorFlow Recommenders,即 tfrs)已经是 DCN-V2 版本。
class Cross(tf.keras.layers.Layer):
    """Cross layer of the Deep & Cross Network for explicit feature interactions.

    For inputs ``x0`` and ``x`` the layer returns
    ``x0 * (f(W x + b) + diag_scale * x) + x``, where ``f`` is the optional
    ``preactivation``. When ``projection_dim`` is set, ``W`` is replaced by
    the product of two smaller dense layers (``V(U x)``).

    Args:
        projection_dim: int, dimension of the low-rank projection. It should
            be smaller than ``input_dim / 2``; the official recommendation is
            ``input_dim / 4``. ``None`` selects the full-rank mode.
        diag_scale: float, non-negative scale added to the diagonal of the
            interaction weight matrix, mainly to improve the stability of the
            low-rank decomposition. ``None`` is treated like ``0.0``.
        use_bias: whether to add a bias term in the interaction computation.
        preactivation: activation applied to ``W x + b`` before the
            element-wise product with ``x0``.
        kernel_initializer: Initializer to use on the kernel matrix.
        bias_initializer: Initializer to use on the bias vector.
        kernel_regularizer: Regularizer to use on the kernel matrix.
        bias_regularizer: Regularizer to use on bias vector.

    Input shape: A tuple of 2 (batch_size, `input_dim`) dimensional inputs.
    Output shape: A single (batch_size, `input_dim`) dimensional output.

    Raises:
        ValueError: if ``diag_scale`` is negative.
    """

    def __init__(
        self,
        projection_dim: Optional[int] = None,
        diag_scale: Optional[float] = 0.0,
        use_bias: bool = True,
        preactivation: Optional[Union[str, tf.keras.layers.Activation]] = None,
        kernel_initializer: Union[Text, tf.keras.initializers.Initializer] = "truncated_normal",
        bias_initializer: Union[Text, tf.keras.initializers.Initializer] = "zeros",
        kernel_regularizer: Union[Text, None, tf.keras.regularizers.Regularizer] = None,
        bias_regularizer: Union[Text, None, tf.keras.regularizers.Regularizer] = None,
        **kwargs
    ):
        super(Cross, self).__init__(**kwargs)

        self._projection_dim = projection_dim
        self._diag_scale = diag_scale
        self._use_bias = use_bias
        self._preactivation = tf.keras.activations.get(preactivation)
        self._kernel_initializer = tf.keras.initializers.get(kernel_initializer)
        self._bias_initializer = tf.keras.initializers.get(bias_initializer)
        self._kernel_regularizer = tf.keras.regularizers.get(kernel_regularizer)
        self._bias_regularizer = tf.keras.regularizers.get(bias_regularizer)
        self._input_dim = None

        self._supports_masking = True

        # `diag_scale` is annotated Optional, so guard the comparison: a bare
        # `None < 0` would raise TypeError. `call` already skips a falsy scale.
        if self._diag_scale is not None and self._diag_scale < 0:
            raise ValueError(
                "`diag_scale` should be non-negative. "
                "Got `diag_scale` = {}".format(self._diag_scale))

    def build(self, input_shape):
        """Create the dense sub-layers, sized from the last input dimension."""
        last_dim = input_shape[-1]  # input feature dimension (input_dim)

        if self._projection_dim is None:
            # Full-rank mode: a single Dense layer computes W x (+ b).
            self._dense = tf.keras.layers.Dense(
                last_dim,  # output width equals input width
                kernel_initializer=_clone_initializer(self._kernel_initializer),
                bias_initializer=self._bias_initializer,
                kernel_regularizer=self._kernel_regularizer,
                bias_regularizer=self._bias_regularizer,
                use_bias=self._use_bias,
                dtype=self.dtype,
                activation=self._preactivation,
            )
        else:
            # Low-rank mode: U projects the input down to `projection_dim`,
            # V projects the result back to the original dimension.
            self._dense_u = tf.keras.layers.Dense(
                self._projection_dim,
                kernel_initializer=_clone_initializer(self._kernel_initializer),
                kernel_regularizer=self._kernel_regularizer,
                use_bias=False,
                dtype=self.dtype,
            )
            self._dense_v = tf.keras.layers.Dense(
                last_dim,
                kernel_initializer=_clone_initializer(self._kernel_initializer),
                bias_initializer=self._bias_initializer,
                kernel_regularizer=self._kernel_regularizer,
                bias_regularizer=self._bias_regularizer,
                use_bias=self._use_bias,
                dtype=self.dtype,
                activation=self._preactivation,
            )
        self.built = True

    def call(self, x0: tf.Tensor, x: Optional[tf.Tensor] = None) -> tf.Tensor:
        """Computes the feature cross.

        Args:
            x0: The input tensor
            x: Optional second input tensor. If provided, the layer will
                compute crosses between x0 and x; if not provided, the layer
                will compute crosses between x0 and itself.

        Returns:
            Tensor of crosses.

        Raises:
            ValueError: if the last dimensions of ``x0`` and ``x`` differ.
        """
        if not self.built:
            self.build(x0.shape)

        if x is None:  # no second input: cross x0 with itself
            x = x0

        if x0.shape[-1] != x.shape[-1]:
            raise ValueError(
                "`x0` and `x` dimension mismatch! Got `x0` dimension {}, and x "
                "dimension {}. This case is not supported yet.".format(
                    x0.shape[-1], x.shape[-1]))

        # W * x
        if self._projection_dim is None:
            prod_output = self._dense(x)
        else:
            prod_output = self._dense_v(self._dense_u(x))

        # Keep the result in the layer's compute dtype.
        prod_output = tf.cast(prod_output, self.compute_dtype)

        # Optional diagonal scaling.
        if self._diag_scale:
            prod_output = prod_output + self._diag_scale * x

        return x0 * prod_output + x


class DCN(tfrs.Model):
    """Ranking model that stacks an optional Cross layer and a deep network.

    Every feature is embedded, the embeddings are concatenated, optionally
    passed through one Cross layer, then through the Dense layers, and a
    final Dense(1) produces the predicted rating.

    Args:
        use_cross_layer: whether to insert the Cross layer before the DNN.
        deep_layer_sizes: iterable of unit counts, one per hidden Dense layer.
        projection_dim: forwarded to the Cross layer (``None`` = full rank).
    """

    def __init__(self, use_cross_layer, deep_layer_sizes, projection_dim=None):
        super().__init__()

        self.embedding_dimension = 32  # embedding size shared by all features

        str_features = ["movie_id", "user_id", "user_zip_code",
                        "user_occupation_text"]
        int_features = ["user_gender", "bucketized_user_age"]

        self._all_features = str_features + int_features
        self._embeddings = {}

        # Compute embeddings for string features (categorical features).
        # `vocabularies` is a module-level mapping defined outside this class.
        for feature_name in str_features:
            vocabulary = vocabularies[feature_name]
            self._embeddings[feature_name] = tf.keras.Sequential([
                tf.keras.layers.StringLookup(
                    vocabulary=vocabulary, mask_token=None),
                tf.keras.layers.Embedding(len(vocabulary) + 1,
                                          self.embedding_dimension),
            ])

        # Compute embeddings for int features. They are integer-valued, so an
        # embedding lookup works as well; this would not be appropriate for
        # float features.
        for feature_name in int_features:
            vocabulary = vocabularies[feature_name]
            self._embeddings[feature_name] = tf.keras.Sequential([
                tf.keras.layers.IntegerLookup(
                    vocabulary=vocabulary, mask_value=None),
                tf.keras.layers.Embedding(len(vocabulary) + 1,
                                          self.embedding_dimension),
            ])

        # The cross module from the paper.
        if use_cross_layer:
            self._cross_layer = tfrs.layers.dcn.Cross(
                projection_dim=projection_dim,
                kernel_initializer="glorot_uniform")
        else:
            self._cross_layer = None

        # The DNN module.
        self._deep_layers = [
            tf.keras.layers.Dense(layer_size, activation="relu")
            for layer_size in deep_layer_sizes
        ]

        self._logit_layer = tf.keras.layers.Dense(1)

        self.task = tfrs.tasks.Ranking(
            loss=tf.keras.losses.MeanSquaredError(),
            metrics=[tf.keras.metrics.RootMeanSquaredError("RMSE")],
        )

    def call(self, features):
        """Official implementation, following DCN-V2.

        DCN-V2 discusses stacked, parallel and mixed structures; this model
        uses the stacked one (cross output feeds the deep network).
        """
        # Concatenate embeddings
        embeddings = []
        for feature_name in self._all_features:
            embedding_fn = self._embeddings[feature_name]
            embeddings.append(embedding_fn(features[feature_name]))

        x = tf.concat(embeddings, axis=1)

        # Build Cross Network
        if self._cross_layer is not None:
            x = self._cross_layer(x)

        # Build Deep Network (stacked structure)
        for deep_layer in self._deep_layers:
            x = deep_layer(x)

        return self._logit_layer(x)

    def compute_loss(self, features, training=False):
        """Return the ranking task loss for one batch.

        The label is read from the ``"user_rating"`` entry. The incoming dict
        is shallow-copied first so the caller's batch is left untouched and
        can be passed in again.
        """
        inputs = dict(features)
        labels = inputs.pop("user_rating")
        scores = self(inputs)
        return self.task(
            labels=labels,
            predictions=scores,
        )


# Usage
# Batched and cached train/test pipelines; run_models reads these
# module-level datasets.
cached_train = train.shuffle(100_000).batch(8192).cache()
cached_test = test.batch(4096).cache()


def run_models(use_cross_layer, deep_layer_sizes, projection_dim=None, num_runs=5):
    """Train and evaluate ``num_runs`` DCN models with the same settings.

    Each model is compiled with Adam, fitted on ``cached_train`` and
    evaluated on ``cached_test``. ``learning_rate`` and ``epochs`` are read
    from module scope.

    Returns:
        A dict with the trained models under ``"model"`` and the average and
        standard deviation of their test RMSE under ``"mean"`` / ``"stdv"``.
    """
    trained_models = []
    rmse_scores = []

    for _ in range(num_runs):
        dcn = DCN(
            use_cross_layer=use_cross_layer,
            deep_layer_sizes=deep_layer_sizes,
            projection_dim=projection_dim,
        )
        dcn.compile(optimizer=tf.keras.optimizers.Adam(learning_rate))
        trained_models.append(dcn)

        dcn.fit(cached_train, epochs=epochs, verbose=False)
        eval_metrics = dcn.evaluate(cached_test, return_dict=True)
        rmse_scores.append(eval_metrics["RMSE"])

    return {
        "model": trained_models,
        "mean": np.average(rmse_scores),
        "stdv": np.std(rmse_scores),
    }
二. torch实现
代码摘录于deepctr-torch。
import torch
import torch.nn as nn
from .basemodel import BaseModel
from ..inputs import combined_dnn_input
from ..layers import CrossNet, DNN


class CrossNet(nn.Module):
    """The Cross Network part of Deep&Cross Network model,
    which learns both low and high degree cross feature.

      Input shape
        - 2D tensor with shape: ``(batch_size, units)``.
      Output shape
        - 2D tensor with shape: ``(batch_size, units)``.
      Arguments
        - **in_features** : Positive integer, dimensionality of input features.
        - **layer_num**: Positive integer, the cross layer number
        - **parameterization**: string, ``"vector"`` or ``"matrix"``, way to
          parameterize the cross network.
        - **seed**: A Python integer; accepted but not used inside this class.
        - **device**: device the module is moved to after initialization.
      Raises
        - ValueError if ``parameterization`` is neither ``"vector"`` nor
          ``"matrix"``.
    """

    def __init__(self, in_features, layer_num=2, parameterization='vector', seed=1024, device='cpu'):
        super(CrossNet, self).__init__()
        self.layer_num = layer_num
        self.parameterization = parameterization
        if self.parameterization == 'vector':
            # weight in DCN.  (in_features, 1)
            self.kernels = nn.Parameter(torch.Tensor(self.layer_num, in_features, 1))
        elif self.parameterization == 'matrix':
            # weight matrix in DCN-M.  (in_features, in_features)
            self.kernels = nn.Parameter(torch.Tensor(self.layer_num, in_features, in_features))
        else:  # error
            raise ValueError("parameterization should be 'vector' or 'matrix'")

        self.bias = nn.Parameter(torch.Tensor(self.layer_num, in_features, 1))

        # torch.Tensor(...) allocates uninitialized storage, so every layer's
        # kernel and bias slice is initialized explicitly here.
        for i in range(self.kernels.shape[0]):
            nn.init.xavier_normal_(self.kernels[i])
        for i in range(self.bias.shape[0]):
            nn.init.zeros_(self.bias[i])

        self.to(device)

    def forward(self, inputs):
        """Apply ``layer_num`` cross layers to ``inputs`` (batch_size, units)."""
        x_0 = inputs.unsqueeze(2)  # (bs, in_features, 1)
        x_l = x_0
        for i in range(self.layer_num):
            if self.parameterization == 'vector':
                # x_l^T w -> (bs, 1, 1), then x_0 * (x_l^T w) + b + x_l
                xl_w = torch.tensordot(x_l, self.kernels[i], dims=([1], [0]))
                dot_ = torch.matmul(x_0, xl_w)
                x_l = dot_ + self.bias[i] + x_l
            elif self.parameterization == 'matrix':
                xl_w = torch.matmul(self.kernels[i], x_l)  # W * xi  (bs, in_features, 1)
                dot_ = xl_w + self.bias[i]  # W * xi + b
                x_l = x_0 * dot_ + x_l  # x0 · (W * xi + b) + xl  Hadamard-product
            else:  # error
                raise ValueError("parameterization should be 'vector' or 'matrix'")
        x_l = torch.squeeze(x_l, dim=2)
        return x_l


class DCN(BaseModel):
    """Instantiates the Deep&Cross Network architecture. Including DCN-V (parameterization='vector')
    and DCN-M (parameterization='matrix').

    :param linear_feature_columns: An iterable containing all the features used by linear part of the model.
    :param dnn_feature_columns: An iterable containing all the features used by deep part of the model.
    :param cross_num: positive integer, cross layer number
    :param cross_parameterization: str, ``"vector"`` or ``"matrix"``, how to parameterize the cross network.
    :param dnn_hidden_units: list, list of positive integer or empty list, the layer number and units in each layer of DNN
    :param l2_reg_linear: float. L2 regularizer strength applied to the weight of the final ``dnn_linear`` layer
    :param l2_reg_embedding: float. L2 regularizer strength applied to embedding vector
    :param l2_reg_cross: float. L2 regularizer strength applied to cross net
    :param l2_reg_dnn: float. L2 regularizer strength applied to DNN
    :param init_std: float, to use as the initialize std of embedding vector
    :param seed: integer, to use as random seed.
    :param dnn_dropout: float in [0,1), the probability we will drop out a given DNN coordinate.
    :param dnn_use_bn: bool. Whether to use BatchNormalization before activation in the DNN
    :param dnn_activation: Activation function to use in DNN
    :param task: str, ``"binary"`` for binary logloss or ``"regression"`` for regression loss
    :param device: str, ``"cpu"`` or ``"cuda:0"``
    :param gpus: list of int or torch.device for multiple gpus. If None, run on `device`. `gpus[0]` should be the same gpu with `device`.
    :raises ValueError: if ``dnn_hidden_units`` is empty and ``cross_num`` is not positive.
    :return: A PyTorch model instance.
    """

    def __init__(self, linear_feature_columns, dnn_feature_columns, cross_num=2, cross_parameterization='vector',
                 dnn_hidden_units=(128, 128), l2_reg_linear=0.00001, l2_reg_embedding=0.00001, l2_reg_cross=0.00001,
                 l2_reg_dnn=0, init_std=0.0001, seed=1024, dnn_dropout=0, dnn_activation='relu', dnn_use_bn=False,
                 task='binary', device='cpu', gpus=None):

        super(DCN, self).__init__(linear_feature_columns=linear_feature_columns,
                                  dnn_feature_columns=dnn_feature_columns, l2_reg_embedding=l2_reg_embedding,
                                  init_std=init_std, seed=seed, task=task, device=device, gpus=gpus)
        self.dnn_hidden_units = dnn_hidden_units
        self.cross_num = cross_num
        self.dnn = DNN(self.compute_input_dim(dnn_feature_columns), dnn_hidden_units,
                       activation=dnn_activation, use_bn=dnn_use_bn, l2_reg=l2_reg_dnn, dropout_rate=dnn_dropout,
                       init_std=init_std, device=device)

        # Width of the tensor fed to the final linear layer: cross output
        # and/or last DNN layer, depending on which parts are enabled.
        if len(self.dnn_hidden_units) > 0 and self.cross_num > 0:
            dnn_linear_in_feature = self.compute_input_dim(dnn_feature_columns) + dnn_hidden_units[-1]
        elif len(self.dnn_hidden_units) > 0:
            dnn_linear_in_feature = dnn_hidden_units[-1]
        elif self.cross_num > 0:
            dnn_linear_in_feature = self.compute_input_dim(dnn_feature_columns)
        else:
            # Previously this fell through and crashed below with an
            # UnboundLocalError on `dnn_linear_in_feature`.
            raise ValueError(
                "DCN needs at least one of the deep or cross parts: "
                "dnn_hidden_units must be non-empty or cross_num must be > 0")

        self.dnn_linear = nn.Linear(dnn_linear_in_feature, 1, bias=False).to(
            device)
        self.crossnet = CrossNet(in_features=self.compute_input_dim(dnn_feature_columns),
                                 layer_num=cross_num, parameterization=cross_parameterization, device=device)
        self.add_regularization_weight(
            filter(lambda x: 'weight' in x[0] and 'bn' not in x[0], self.dnn.named_parameters()), l2=l2_reg_dnn)
        self.add_regularization_weight(self.dnn_linear.weight, l2=l2_reg_linear)
        self.add_regularization_weight(self.crossnet.kernels, l2=l2_reg_cross)
        self.to(device)

    def forward(self, X):
        """Return predictions: linear logit plus the deep/cross logit, passed through ``self.out``."""
        logit = self.linear_model(X)
        sparse_embedding_list, dense_value_list = self.input_from_feature_columns(X, self.dnn_feature_columns,
                                                                                  self.embedding_dict)

        dnn_input = combined_dnn_input(sparse_embedding_list, dense_value_list)

        if len(self.dnn_hidden_units) > 0 and self.cross_num > 0:  # Deep & Cross
            # Parallel structure: both parts read the same input and their
            # outputs are concatenated.
            deep_out = self.dnn(dnn_input)
            cross_out = self.crossnet(dnn_input)
            stack_out = torch.cat((cross_out, deep_out), dim=-1)
            logit += self.dnn_linear(stack_out)
        elif len(self.dnn_hidden_units) > 0:  # Only Deep
            deep_out = self.dnn(dnn_input)
            logit += self.dnn_linear(deep_out)
        elif self.cross_num > 0:  # Only Cross
            cross_out = self.crossnet(dnn_input)
            logit += self.dnn_linear(cross_out)
        y_pred = self.out(logit)
        return y_pred
Cross 与 Deep 的串联、并联结构比较
| 对比维度 | 串联结构 | 并联结构 |
|---|---|---|
| 设计复杂度 | 简单,直观,易实现 | 较复杂,需要对特征维度对齐和拼接有更多设计 |
| 特征交互建模 | 逐层提取,显式交互优先 | 并行建模,显式和隐式交互并重 |
| 计算效率 | 更高效,计算开销小 | 计算开销大,特别是高维稀疏特征 |
| 特征信息保留 | 特征在 Cross Network 后可能丢失部分信息 | 输入特征直接进入两条路径,信息无损 |
| 模型表现 | 适合低阶显式交互为主的任务 | 适合需要复杂高阶交互的任务 |
| 适用数据规模 | 小规模特征或低维度特征 | 大规模高维稀疏特征 |
| 鲁棒性 | 难以避免特征交互部分对后续网络的影响 | 路径独立,干扰小,更鲁棒 |
Reference:
1. DCN-V2论文
2. DCN论文地址
3. 视频介绍-wangshusheng
4. tensorflow实现-官方
5. tensorflow实现-官方
6. pytorch实现,deepctr-torch
7. torchrec实现