当前位置: 首页 > article >正文

Kaggle-Predict Calorie Expenditure-(回归+xgb+cat+lgb+模型融合+预测结果)

Predict Calorie Expenditure

题意:

给出每个人的基本信息,预测运动后的卡路里消耗值。

数据处理:

1.构造出人体机能、运动相关的特征值。
2.所有特征值进行从新组合,注意唯独爆炸
3.对连续信息分箱变成离散

建立模型:

1.xgb模型,lgb模型,cat模型
2.使用stack堆叠融合,使用3折交叉验证
3.对xgb、lgb、cat进行K折交叉验证,最终和stack进行结果融合。

代码:
import os
import sys
import warnings
import numpy as np
import pandas as pd
import seaborn
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from matplotlib import pyplot as plt
import lightgbm
from mlxtend.regressor import StackingCVRegressor
from sklearn import clone
from sklearn.ensemble import VotingRegressor, StackingClassifier, StackingRegressor
from sklearn.linear_model import Lasso, LogisticRegression, RidgeCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, make_scorer, mean_squared_log_error
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import KFold
from sklearn.linear_model import Ridgedef init():os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # 仅输出错误日志warnings.simplefilter('ignore')  # 忽略警告日志pd.set_option('display.width', 1000)pd.set_option('display.max_colwidth', 1000)pd.set_option("display.max_rows", 1000)pd.set_option("display.max_columns", 1000)def show_dataframe(df):print("查看特征值和特征值类型\n" + str(df.dtypes) + "\n" + "-" * 100)print("查看前10行信息\n" + str(df.head()) + "\n" + "-" * 100)print("查看每个特征值的各种数据统计信息\n" + str(df.describe()) + "\n" + "-" * 100)print("输出重复行的个数\n" + str(df.duplicated().sum()) + "\n" + "-" * 100)print("查看每列的缺失值个数\n" + str(df.isnull().sum()) + "\n" + "-" * 100)print("查看缺失值的具体信息\n" + str(df.info()) + "\n" + "-" * 100)#print("输出X所有值出现的是什么,还有对应出现的次数\n" + str(df['X'].value_counts()) + "\n" + "-" * 100)def show_relation(data, colx, coly):  # 输出某一特征值与目标值的关系if data[colx].dtype == 'object' or data[colx].dtype == 'category' or len(data[colx].unique()) < 20:seaborn.boxplot(x=colx, y=coly, data=data)else:plt.scatter(data[colx], data[coly])plt.xlabel(colx)plt.ylabel(coly)plt.show()# 自定义RMSLE评分函数(GridSearchCV需要最大化评分,因此返回负RMSLE)
def rmsle_scorer(y_true, y_pred):y_pred = np.clip(y_pred, 1e-15, None)  # 防止对0取对数y_true = np.clip(y_true, 1e-15, None)log_error = np.log(y_pred + 1) - np.log(y_true + 1)rmsle = np.sqrt(np.mean(log_error ** 2))return -rmsle  # 返回负值,因为GridSearchCV默认最大化评分if __name__ == '__main__':init()df_train = pd.read_csv('/kaggle/input/playground-series-s5e5/train.csv')df_test = pd.read_csv('/kaggle/input/playground-series-s5e5/test.csv')#for col in df_train.columns:#   show_relation(df_train, col, 'Calories')#特征工程df_all = pd.concat([df_train.drop(['id', 'Calories'], axis=1), df_test.drop(['id'], axis=1)], axis=0)df_all['Sex'] = df_all['Sex'].map({'male': 0, 'female': 1})df_all = df_all.reset_index(drop=True)#构造BMIdf_all['BMI'] = df_all['Weight'] / (df_all['Height'] / 100) ** 2#Harris-Benedict公式df_all['BMR'] = 0df_all.loc[df_all['Sex'] == 0, 'BMR'] = 88.362 + (13.397 * df_all['Weight']) + (4.799 * df_all['Height']) - (5.677 * df_all['Age'])df_all.loc[df_all['Sex'] == 1, 'BMR'] = 447.593 + (9.247 * df_all['Weight']) + (3.098 * df_all['Height']) - (4.330 * df_all['Age'])# 数值特征标准化#numeric_features = ['Age', 'Height', 'Weight', 'Duration', 'Heart_Rate', 'Body_Temp']#scaler = StandardScaler()#df_all[numeric_features] = scaler.fit_transform(df_all[numeric_features])#运动强度特征df_all['Max_HR'] = 220 - df_all['Age']  # 最大心率df_all['HR_Reserve_Ratio'] = df_all['Heart_Rate'] / df_all['Max_HR']#交互特征df_all['Weight_Duration'] = df_all['Weight'] * df_all['Duration']df_all['Sex_Weight'] = df_all['Sex'] * df_all['Weight']# 构造运动功率特征df_all['workload'] = df_all['Weight'] * df_all['Duration'] * df_all['Heart_Rate'] / 1000# 构造生理特征交互项df_all['age_heart_ratio'] = df_all['Age'] / df_all['Heart_Rate']# 时间维度特征(如有时间戳)df_all['hour_of_day'] = df_all['Duration']/60/24# 组合特征numeric_cols = df_all.columnsfor i in range(len(numeric_cols)):feature_1 = numeric_cols[i]for j in range(i + 1, len(numeric_cols)):feature_2 = numeric_cols[j]df_all[f'{feature_1}_x_{feature_2}'] = df_all[feature_1] * df_all[feature_2]#数值归一化#scaler = RobustScaler()#df_all = scaler.fit_transform(df_all)now_col = ['Age', 'Height', 'Weight', 'Duration', 'Heart_Rate', 'Body_Temp', 'BMI']for i in now_col:df_all[i + "_box"] = pd.cut(df_all[i], bins=10, labels=False, right=False)X_train = df_all[:df_train.shape[0]]Y_train = np.log1p(df_train['Calories'])x_test = df_all[df_train.shape[0]:]#xgbmodel_xgb =estimator=XGBRegressor(random_state=42,n_estimators=8000,objective='reg:squarederror',eval_metric='rmse',device='cuda',learning_rate=0.05,max_depth=8,colsample_bytree=0.75,subsample=0.9,#reg_lambda = 1,#reg_alpha = 0.5,early_stopping_rounds=500,)#lgbmodel_lgb = lightgbm.LGBMRegressor(n_estimators=3000,  # 增加迭代次数配合早停learning_rate=0.03,  # 减小学习率num_leaves=15,  # 限制模型复杂度min_child_samples=25,  # 增加最小叶子样本数reg_alpha=0.1,  # L1正则化reg_lambda=0.1,  # L2正则化objective='regression_l1',  # 改用MAE损失early_stopping_rounds=500,)#catmodel_cat = CatBoostRegressor(iterations=3500,learning_rate=0.02,depth=12,loss_function='RMSE',l2_leaf_reg=3,random_seed=42,eval_metric='RMSE',early_stopping_rounds=200,verbose=1000,task_type='GPU',)#融合#创建基模型列表(需禁用早停以生成完整预测)base_models = [('xgb', XGBRegressor(early_stopping_rounds=None,  # 禁用早停**{k: v for k, v in model_xgb.get_params().items() if k != 'early_stopping_rounds'})),('lgb', LGBMRegressor(early_stopping_rounds=None,  # 禁用早停**{k: v for k, v in model_lgb.get_params().items() if k != 'early_stopping_rounds'})),('cat', CatBoostRegressor(early_stopping_rounds=None,  # 禁用早停**{k: v for k, v in model_cat.get_params().items() if k != 'early_stopping_rounds'}))]meta_model = RidgeCV()model_stack = StackingRegressor(estimators=base_models,final_estimator=meta_model,cv=3,  # 使用3折交叉验证生成元特征passthrough=False,  # 不使用原始特征verbose=1)FOLDS = 20KF = KFold(n_splits=FOLDS, shuffle=True, random_state=42)cat_features = ['Sex']oof_cat = np.zeros(len(df_train))pred_cat = np.zeros(len(df_test))oof_xgb = np.zeros(len(df_train))pred_xgb = np.zeros(len(df_test))oof_lgb = np.zeros(len(df_train))pred_lgb = np.zeros(len(df_test))for i, (train_idx, valid_idx) in enumerate(KF.split(X_train, Y_train)):print('#' * 15, i + 1, '#' * 15)## SPLIT DSx_train, y_train = X_train.iloc[train_idx], Y_train.iloc[train_idx]x_valid, y_valid = X_train.iloc[valid_idx], Y_train.iloc[valid_idx]## CATBOOST fitmodel_cat.fit(x_train, y_train, eval_set=[(x_valid, y_valid)], cat_features=cat_features,use_best_model=True, verbose=0)## XGB FIRmodel_xgb.fit(x_train, y_train, eval_set=[(x_valid, y_valid)], verbose=0)## LGB MODELmodel_lgb.fit(x_train, y_train, eval_set=[(x_valid, y_valid)])## PREDICTION CATBOOSToof_cat[valid_idx] = model_cat.predict(x_valid)pred_cat += model_cat.predict(x_test)## PREDICTION XGBoof_xgb[valid_idx] = model_xgb.predict(x_valid)pred_xgb += model_xgb.predict(x_test)## PREDICTION LGBoof_lgb[valid_idx] = model_lgb.predict(x_valid)pred_lgb += model_lgb.predict(x_test)cat_rmse = mean_squared_error(y_valid, oof_cat[valid_idx]) ** 0.5xgb_rmse = mean_squared_error(y_valid, oof_xgb[valid_idx]) ** 0.5lgb_rmse = mean_squared_error(y_valid, oof_lgb[valid_idx]) ** 0.5print(f'FOLD {i + 1} CATBOOST_RMSE = {cat_rmse:.4f} <=> XGB_RMSE = {xgb_rmse:.4f} <=> LGB_RMSE = {lgb_rmse:.4f}')#预测pred_cat /= FOLDSpred_xgb /= FOLDSpred_lgb /= FOLDSpred_stack = model_stack.predict(df_test)pred_all = np.expm1(pred_xgb) * 0.1 + np.expm1(pred_stack) * 0.80 + np.expm1(pred_cat) * 0.1submission = pd.DataFrame({'id': df_test['id'],'Calories': pred_all})submission['Calories'] = np.clip(submission['Calories'], a_min=1, a_max=20*df_test['Duration'])submission.to_csv('/kaggle/working/submission.csv', index=False)
代码

使用k折交叉验证,对预测结果再进行训练预测。

import os
import sys
import warnings
import numpy as np
import pandas as pd
import seaborn
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from matplotlib import pyplot as plt
import lightgbm
from mlxtend.regressor import StackingCVRegressor
from sklearn import clone
from sklearn.ensemble import VotingRegressor, StackingClassifier, StackingRegressor
from sklearn.linear_model import Lasso, LogisticRegression, RidgeCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, make_scorer, mean_squared_log_error
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import KFold
from sklearn.linear_model import Ridgedef init():os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # 仅输出错误日志warnings.simplefilter('ignore')  # 忽略警告日志pd.set_option('display.width', 1000)pd.set_option('display.max_colwidth', 1000)pd.set_option("display.max_rows", 1000)pd.set_option("display.max_columns", 1000)def show_dataframe(df):print("查看特征值和特征值类型\n" + str(df.dtypes) + "\n" + "-" * 100)print("查看前10行信息\n" + str(df.head()) + "\n" + "-" * 100)print("查看每个特征值的各种数据统计信息\n" + str(df.describe()) + "\n" + "-" * 100)print("输出重复行的个数\n" + str(df.duplicated().sum()) + "\n" + "-" * 100)print("查看每列的缺失值个数\n" + str(df.isnull().sum()) + "\n" + "-" * 100)print("查看缺失值的具体信息\n" + str(df.info()) + "\n" + "-" * 100)#print("输出X所有值出现的是什么,还有对应出现的次数\n" + str(df['X'].value_counts()) + "\n" + "-" * 100)def show_relation(data, colx, coly):  # 输出某一特征值与目标值的关系if data[colx].dtype == 'object' or data[colx].dtype == 'category' or len(data[colx].unique()) < 20:seaborn.boxplot(x=colx, y=coly, data=data)else:plt.scatter(data[colx], data[coly])plt.xlabel(colx)plt.ylabel(coly)plt.show()def show_score(model_name, pred):mse = mean_squared_error(y_val, pred)mae = mean_absolute_error(y_val, pred)score = r2_score(y_val, pred)print(model_name)print(f"{'MSE':<10}{mse:<15.4f}")print(f"{'MAE':<10}{mae:<15.4f}")print(f"{'R²':<10}{score:<15.4f}")print("-" * 100)# Function to calculate RMSLE
def rmsle(y_true, y_pred):return np.sqrt(np.mean(np.power(np.log1p(y_true) - np.log1p(y_pred), 2)))if __name__ == '__main__':init()df_train = pd.read_csv('train.csv')df_test = pd.read_csv('test.csv')#for col in df_train.columns:#   show_relation(df_train, col, 'Calories')#特征工程df_all = pd.concat([df_train.drop(['id', 'Calories'], axis=1), df_test.drop(['id'], axis=1)], axis=0)df_all['Sex_encoded'] = df_all['Sex'].map({'male': 0, 'female': 1})df_all.drop(['Sex'], axis=1, inplace=True)df_all = df_all.reset_index(drop=True)#构造BMIdf_all['BMI'] = df_all['Weight'] / (df_all['Height'] / 100) ** 2#Harris-Benedict公式df_all['BMR'] = 0df_all.loc[df_all['Sex_encoded'] == 0, 'BMR'] = 88.362 + (13.397 * df_all['Weight']) + (4.799 * df_all['Height']) - (5.677 * df_all['Age'])df_all.loc[df_all['Sex_encoded'] == 1, 'BMR'] = 447.593 + (9.247 * df_all['Weight']) + (3.098 * df_all['Height']) - (4.330 * df_all['Age'])# 数值特征标准化#numeric_features = ['Age', 'Height', 'Weight', 'Duration', 'Heart_Rate', 'Body_Temp']#scaler = StandardScaler()#df_all[numeric_features] = scaler.fit_transform(df_all[numeric_features])#运动强度特征df_all['Max_HR'] = 220 - df_all['Age']  # 最大心率df_all['HR_Reserve_Ratio'] = df_all['Heart_Rate'] / df_all['Max_HR']#交互特征df_all['Weight_Duration'] = df_all['Weight'] * df_all['Duration']df_all['Sex_Weight'] = df_all['Sex_encoded'] * df_all['Weight']# 构造运动功率特征df_all['workload'] = df_all['Weight'] * df_all['Duration'] * df_all['Heart_Rate'] / 1000# 构造生理特征交互项df_all['age_heart_ratio'] = df_all['Age'] / df_all['Heart_Rate']# 时间维度特征(如有时间戳)df_all['hour_of_day'] = df_all['Duration']/60/24# 组合特征numeric_cols = df_all.columnsfor i in range(len(numeric_cols)):feature_1 = numeric_cols[i]for j in range(i + 1, len(numeric_cols)):feature_2 = numeric_cols[j]df_all[f'{feature_1}_x_{feature_2}'] = df_all[feature_1] * df_all[feature_2]#数值归一化#scaler = RobustScaler()#df_all = scaler.fit_transform(df_all)# 分箱,把连续变成离散的,看你在哪一类now_col = ['Age', 'Height', 'Weight', 'Duration', 'Heart_Rate', 'Body_Temp', 'BMI']for i in now_col:df_all[i + "_box"] = pd.cut(df_all[i], bins=10, labels=False, right=False)baseline_temp = 37.0# Calculate 'Temp_Change' for the training datadf_all['Temp_Change'] = df_all['Body_Temp'] - baseline_temp# Calculate 'Intensity' for the training datadf_all['Intensity'] = df_all['Heart_Rate'] / df_all['Duration']# Calculate 'Heart_Rate_Ratio' for the training datadf_all['Heart_Rate_Ratio'] = df_all['Heart_Rate'] / df_all['Age']# Calculate 'Duration_x_HeartRate' for the training datadf_all['Duration_x_HeartRate'] = df_all['Duration'] * df_all['Heart_Rate']# Calculate 'Weight_x_Duration' for the training datadf_all['Weight_x_Duration'] = df_all['Weight'] * df_all['Duration']# Calculate 'Height_x_Duration' for the training datadf_all['Height_x_Duration'] = df_all['Height'] * df_all['Duration']# Calculate 'Weight_x_Height' for the training datadf_all['Weight_x_Height'] = df_all['Weight'] * df_all['Height']# Calculate 'Weight_x_Intensity' for the training datadf_all['Weight_x_Intensity'] = df_all['Weight'] * df_all['Intensity']# Calculate 'Height_x_Intensity' for the training datadf_all['Height_x_Intensity'] = df_all['Height'] * df_all['Intensity']X_train = df_all[:df_train.shape[0]]Y_train = np.log1p(df_train['Calories'])x_test = df_all[df_train.shape[0]:]#xgbmodel_xgb =estimator=XGBRegressor(random_state=42,n_estimators=8000,objective='reg:squarederror',eval_metric='rmse',device='cuda',learning_rate=0.05,max_depth=8,colsample_bytree=0.75,subsample=0.9,#reg_lambda = 1,#reg_alpha = 0.5,early_stopping_rounds=200,)#lgbmodel_lgb = lightgbm.LGBMRegressor(n_estimators=3000,  # 增加迭代次数配合早停learning_rate=0.03,  # 减小学习率num_leaves=15,  # 限制模型复杂度min_child_samples=25,  # 增加最小叶子样本数reg_alpha=0.1,  # L1正则化reg_lambda=0.1,  # L2正则化objective='regression_l1',  # 改用MAE损失early_stopping_rounds=200,eval_metric='RMSE',)#catmodel_cat = CatBoostRegressor(iterations=3500,learning_rate=0.02,depth=12,loss_function='RMSE',l2_leaf_reg=3,random_seed=42,eval_metric='RMSE',verbose=1000,task_type='GPU',early_stopping_rounds=200,)#k折交叉print("🔄Generating Out-of-Fold (OOF) predictions and Test predictions for Base Models...\n" + "-"*70 + "\n")# --- Prediction Storage ---# Arrays to store out-of-fold (OOF)add_pred_val_cat = np.zeros(len(X_train))add_pred_val_xgb = np.zeros(len(X_train))add_pred_val_lgb = np.zeros(len(X_train))# Arrays to store test predictions (accumulated across folds for averaging)add_pred_test_cat = np.zeros(len(x_test))add_pred_test_xgb = np.zeros(len(x_test))add_pred_test_lgb = np.zeros(len(x_test))kf = KFold(n_splits=5, shuffle=True, random_state=42)for fold, (train_index, val_index) in enumerate(kf.split(X_train, Y_train)):print(f"\n---Fold {fold + 1}/{kf.n_splits} ---")x_train, x_val = X_train.iloc[train_index], X_train.iloc[val_index]y_train, y_val = Y_train.iloc[train_index], Y_train.iloc[val_index]#Apply log1p transformation to target for trainingy_train_log1p = np.log1p(y_train)y_val_log1p = np.log1p(y_val)# --- CatBoost Training and Prediction ---print("  ➡ Training CatBoost...")model_cat.fit(x_train, y_train_log1p,eval_set=[(x_val, y_val_log1p)],verbose=0  # Set to 100 if you want to see progress)pred_val_cat = model_cat.predict(x_val)pred_test_cat = model_cat.predict(x_test)# --- XGBoost Training and Prediction ---print("  ➡ Training XGBoost...")model_xgb.fit(x_train, y_train_log1p,eval_set=[(x_val, y_val_log1p)],verbose=0  # Set to 100 if you want to see progress)pred_val_xgb = model_xgb.predict(x_val)pred_test_xgb = model_xgb.predict(x_test)# --- LGBM Training and Prediction ---print("  ➡ Training LGBM...")model_lgb.fit(x_train, y_train_log1p,eval_set=[(x_val, y_val_log1p)],)pred_val_lgb = model_lgb.predict(x_val)pred_test_lgb = model_lgb.predict(x_test)# --- Store OOF and Test Predictions (transformed back to original scale) ---add_pred_val_cat[val_index] = np.expm1(pred_val_cat)add_pred_val_xgb[val_index] = np.expm1(pred_val_xgb)add_pred_val_lgb[val_index] = np.expm1(pred_val_lgb)add_pred_test_cat += np.expm1(pred_test_cat) / kf.n_splitsadd_pred_test_xgb += np.expm1(pred_test_xgb) / kf.n_splitsadd_pred_test_lgb += np.expm1(pred_test_lgb) / kf.n_splits# Ensure all predictions are non-negativeadd_pred_val_cat[add_pred_val_cat < 0] = 0add_pred_val_xgb[add_pred_val_xgb < 0] = 0add_pred_val_lgb[add_pred_val_lgb < 0] = 0# Note: test predictions will also be non-negative after final prediction step# Calculate and print RMSLE for individual models on this foldprint(f"  CatBoost RMSLE (Fold {fold + 1}): {rmsle(y_val, add_pred_val_cat[val_index]):.4f}")print(f"  XGBoost RMSLE (Fold {fold + 1}): {rmsle(y_val, add_pred_val_xgb[val_index]):.4f}")print(f"  LGBM RMSLE (Fold {fold + 1}): {rmsle(y_val, add_pred_val_lgb[val_index]):.4f}")x_meta_train = pd.DataFrame({'cat_pred': add_pred_val_cat,'xgb_pred': add_pred_val_xgb,'lgbm_pred': add_pred_val_lgb,})y_meta_train = Y_trainx_meta_test = pd.DataFrame({'cat_pred': add_pred_test_cat,'xgb_pred': add_pred_test_xgb,'lgbm_pred': add_pred_test_lgb,})model_meta = Ridge(random_state=42)model_meta.fit(x_meta_train, y_meta_train)print(f" meta RMSLE :{rmsle(y_meta_train, model_meta.predict(x_meta_train)):.4f}")pred_all = np.expm1(model_meta.predict(x_meta_test))submission = pd.DataFrame({'id': df_test['id'],'Calories': pred_all})submission['Calories'] = np.clip(submission['Calories'], a_min=df_test['Duration'], a_max=20*df_test['Duration'])submission.to_csv('submission.csv', index=False)
http://www.lryc.cn/news/2392788.html

相关文章:

  • 【解决办法】Git报错error: src refspec main does not match any.
  • React与Vue的内置指令对比
  • 2025年5月24号高项综合知识真题以及答案解析(第1批次)
  • 【NATURE氮化镓】GaN超晶格多沟道场效应晶体管的“闩锁效应”
  • Ubuntu24.04换源方法(新版源更换方式,包含Arm64)
  • 26 C 语言函数深度解析:定义与调用、返回值要点、参数机制(值传递)、原型声明、文档注释
  • 彻底理解一个知识点的具体步骤
  • FFmpeg 时间戳回绕处理:保障流媒体时间连续性的核心机制
  • yolov8改进模型
  • PostgreSQL日常运维
  • << C程序设计语言第2版 >> 练习 1-23 删除C语言程序中所有的注释语句
  • Fluence (FLT) 2026愿景:RWA代币化加速布局AI算力市场
  • 如何撰写一篇优质 Python 相关的技术文档 进阶指南
  • 选择if day5
  • MiniMax V-Triune让强化学习(RL)既擅长推理也精通视觉感知
  • Hash 的工程优势: port range 匹配
  • 同为.net/C#的跨平台运行时的mono和.net Core有什么区别?
  • 前端安全直传MinIO方案
  • HackMyVM-Dejavu
  • LeetCode Hot100(动态规划)
  • Opencv实用操作5 图像腐蚀膨胀
  • 【赵渝强老师】OceanBase的部署架构
  • (18)混合云架构部署
  • c/c++的opencv霍夫变换
  • AAOS系列之(七) --- AudioRecord录音逻辑分析(一)
  • MySQL大表结构变更利器:pt-online-schema-change原理与实战指南
  • LangChain【3】之进阶内容
  • 大规模JSON反序列化性能优化实战:Jackson vs FastJSON深度对比与定制化改造
  • 【OpenSearch】高性能 OpenSearch 数据导入
  • HTML5有那些更新