当前位置：首页 > news >正文
【XGBoost】两个单任务的模型 MAP - Charting Student Math Misunderstandings

news 2025/7/25 9:38:54
# !pip install xgboost scikit-learn matplotlib pandas scipy
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, RandomizedSearchCV  # RandomizedSearchCV drives the hyper-parameter search
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.feature_extraction.text import TfidfVectorizer
import xgboost as xgb
from scipy.stats import uniform, randint  # distributions used to describe the random-search space

# --- 1. Data loading and preprocessing ---
# Load the dataset
file_path = '/kaggle/input/map-charting-student-math-misunderstandings/train.csv'
df_train = pd.read_csv(file_path)

# Fill NaN in 'Misconception' with 'No_Misconception'
df_train['Misconception'] = df_train['Misconception'].fillna('No_Misconception')

# Combine QuestionText, MC_Answer, and StudentExplanation into one text field.
# Missing cells become '' and every cell is coerced to str, so a single NaN
# (or a numerically-parsed column) cannot turn the combined text into NaN.
text_parts = df_train[['QuestionText', 'MC_Answer', 'StudentExplanation']].fillna('').astype(str)
df_train['CombinedText'] = (
    text_parts['QuestionText'] + " " + text_parts['MC_Answer'] + " " + text_parts['StudentExplanation']
)

# --- 1.1. Load GloVe word vectors ---
print("--- 正在加载 GloVe 词向量 ---")
# Adjust this path if your GloVe file is located elsewhere, e.g.
# '/kaggle/input/glove-global-vectors-for-word-representation/glove.6B.100d.txt'
glove_file_path = '/kaggle/input/dataword/glove.6B.100d.txt'
word_embeddings = {}
embedding_dim = 100  # Using 100-dimensional GloVe vectors

try:
    glove_file = open(glove_file_path, 'r', encoding='utf-8')
except FileNotFoundError:
    print(f"错误: GloVe 文件 '{glove_file_path}' 未找到。请确保文件已上传或路径正确。")
    print("将跳过词嵌入，使用一个简化的特征提取器进行演示。")
    # Fallback: a placeholder vocabulary so the rest of the pipeline still runs;
    # embedding features will be all zeros for practically every text.
    word_embeddings = {"dummy": np.zeros(embedding_dim)}
else:
    with glove_file:
        for line in glove_file:
            values = line.split()
            # Each row is "<token> v1 ... vN"; skip blank or malformed rows so
            # every stored vector has exactly `embedding_dim` components.
            if len(values) != embedding_dim + 1:
                continue
            word_embeddings[values[0]] = np.asarray(values[1:], dtype='float32')
    print(f"已加载 {len(word_embeddings)} 个词的 GloVe 词向量 (维度: {embedding_dim})。")

# --- 1.2. Word-embedding feature extractor ---
def get_embedding_features(texts, word_embeddings, embedding_dim):
    """Convert a list of texts into a matrix of averaged word embeddings.

    Each text is lower-cased and split on whitespace; its feature vector is
    the mean of the vectors of the tokens found in ``word_embeddings``.

    Args:
        texts: Sequence of ``str`` documents.
        word_embeddings: Mapping from token to a 1-D vector of length
            ``embedding_dim``.
        embedding_dim: Number of components in every embedding vector.

    Returns:
        ``np.ndarray`` of shape ``(len(texts), embedding_dim)``. Rows for
        empty texts, or texts with no in-vocabulary token, are all zeros.
    """
    features_matrix = np.zeros((len(texts), embedding_dim))
    for i, text in enumerate(texts):
        word_vectors = [
            word_embeddings[word]
            for word in text.lower().split()
            if word in word_embeddings
        ]
        if word_vectors:  # otherwise the row stays all zeros
            features_matrix[i] = np.mean(word_vectors, axis=0)
    return features_matrix


# --- 1.3. Extract TF-IDF and word-embedding features and concatenate them ---
print("--- 正在使用 TF-IDF 和词嵌入提取文本特征并拼接 ---")
# TF-IDF vectorization (unigrams + bigrams, capped at 5000 features)
tfidf_vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
X_tfidf = tfidf_vectorizer.fit_transform(df_train['CombinedText'])

# Word-embedding features (mean GloVe vector per text)
X_embeddings = get_embedding_features(
    df_train['CombinedText'].astype(str).tolist(), word_embeddings, embedding_dim
)

# Concatenate TF-IDF and embedding features. The sparse TF-IDF matrix must be
# densified before hstack. Both halves are cast to float32: casting only the
# TF-IDF half is undone by hstack's promotion to float64.
X_text = np.hstack((
    X_tfidf.toarray().astype(np.float32),
    X_embeddings.astype(np.float32),
))
print(f"组合后的文本特征矩阵形状: {X_text.shape}")

# Label encoding for 'Category'
le_category = LabelEncoder()
y_category = le_category.fit_transform(df_train['Category'])

# Label encoding for 'Misconception'
le_misconception = LabelEncoder()
y_misconception = le_misconception.fit_transform(df_train['Misconception'])

# --- 2. MAP@K metric ---
def mean_average_precision_at_k(y_true, y_pred_proba, k=3):
    """Compute Mean Average Precision @ K for single-label classification.

    Every sample has exactly one true label, so its average precision is
    ``1 / rank`` when the true label is among the ``k`` highest-probability
    classes (rank is 1-based) and ``0`` otherwise.

    Args:
        y_true: Sequence of integer-encoded true labels, one per sample.
        y_pred_proba: Array-like of shape ``(n_samples, n_classes)`` with the
            predicted class probabilities.
        k: Number of top predictions considered per sample.

    Returns:
        float: The MAP@K value; ``0.0`` when there are no samples.
    """
    if len(y_true) == 0:
        # np.mean([]) would emit a RuntimeWarning and return NaN.
        return 0.0

    average_precisions = []
    # zip pairs rows positionally, so a pandas Series with a non-default
    # index is handled the same way as a plain array.
    for true_label, probas_for_sample in zip(y_true, y_pred_proba):
        top_k_indices = np.argsort(probas_for_sample)[::-1][:k]
        hit_positions = np.flatnonzero(top_k_indices == true_label)
        if hit_positions.size:
            average_precisions.append(1.0 / (hit_positions[0] + 1))
        else:
            average_precisions.append(0.0)
    return float(np.mean(average_precisions))


# --- 3. XGBoost parameter distributions for the random search ---
# A general-purpose search space; adjust the ranges as needed.
# scipy's uniform(loc, scale) samples from [loc, loc + scale];
# randint(low, high) samples integers from [low, high).
param_dist = {
    'n_estimators': randint(100, 500),      # number of trees
    'learning_rate': uniform(0.01, 0.2),    # learning rate
    'max_depth': randint(3, 10),            # maximum tree depth
    'subsample': uniform(0.6, 0.4),         # fraction of training rows sampled per tree
    'colsample_bytree': uniform(0.6, 0.4),  # fraction of features sampled per tree
    'gamma': uniform(0, 0.5),               # minimum loss reduction required to split
    'reg_alpha': uniform(0, 1),             # L1 regularization
    'reg_lambda': uniform(0, 1),            # L2 regularization
}

# --- 4. Category model: training and evaluation (XGBoost with random search) ---
print("--- 正在对 Category 模型执行 Random Search ---")
# Hold out 20% of the rows (stratified by category) for evaluation.
X_train_cat, X_test_cat, y_train_cat, y_test_cat = train_test_split(
    X_text, y_category, test_size=0.2, random_state=42, stratify=y_category
)

# NOTE(review): both the estimator and the search below use n_jobs=-1 while
# training on a single CUDA device; confirm this does not oversubscribe the GPU.
xgb_cat = xgb.XGBClassifier(
    objective='multi:softmax',
    num_class=len(le_category.classes_),
    eval_metric='mlogloss',
    use_label_encoder=False,
    random_state=42,
    n_jobs=-1,
    device='cuda',  # GPU acceleration; replaces tree_method='gpu_hist' and predictor='gpu_predictor'
)

# RandomizedSearchCV for the Category model
random_search_cat = RandomizedSearchCV(
    estimator=xgb_cat,
    param_distributions=param_dist,
    n_iter=50,              # number of sampled configurations; tune to the compute budget
    cv=3,                   # cross-validation folds
    scoring='f1_weighted',  # weighted F1 to account for class imbalance
    verbose=1,              # print search progress
    random_state=42,
    n_jobs=-1,
)
random_search_cat.fit(X_train_cat, y_train_cat)

print("\nCategory 模型最佳参数:", random_search_cat.best_params_)
# best_score_ is the mean cross-validated score of the best configuration.
print("Category 模型最佳F1分数 (训练集):", random_search_cat.best_score_)

# Use the best estimator found by the random search for evaluation
model_category_eval = random_search_cat.best_estimator_

# Predict and evaluate the Category model on the held-out test split
y_pred_cat_test = model_category_eval.predict(X_test_cat)
y_pred_proba_cat_test = model_category_eval.predict_proba(X_test_cat)

# Pin the label set so the report and confusion matrix always have one row per
# encoded class, even if a class is absent from this split.
cat_labels = np.arange(len(le_category.classes_))

print(f"\nCategory 模型测试集准确率: {accuracy_score(y_test_cat, y_pred_cat_test):.4f}")
print("\nCategory 模型测试集分类报告:")
print(classification_report(
    y_test_cat, y_pred_cat_test,
    labels=cat_labels, target_names=le_category.classes_, zero_division=0,
))

# Category model MAP@3 on the test split
map_at_3_cat_test = mean_average_precision_at_k(y_test_cat, y_pred_proba_cat_test, k=3)
print(f"\nCategory 模型测试集 Mean Average Precision @3 (MAP@3): {map_at_3_cat_test:.4f}")

# Confusion matrix for the Category model on the test split
print("\nCategory 模型测试集混淆矩阵:")
cm_category_test = confusion_matrix(y_test_cat, y_pred_cat_test, labels=cat_labels)
disp_cat_test = ConfusionMatrixDisplay(confusion_matrix=cm_category_test, display_labels=le_category.classes_)
fig_cat_test, ax_cat_test = plt.subplots(figsize=(8, 6))
disp_cat_test.plot(cmap=plt.cm.Blues, ax=ax_cat_test)
ax_cat_test.set_title('Category 模型测试集混淆矩阵 (XGBoost - Random Search)')
plt.show()

# --- 5. Misconception model: training and evaluation (XGBoost with random search) ---
print("\n--- 正在对 Misconception 模型执行 Random Search ---")
# Hold out 20% of the rows (stratified by misconception) for evaluation.
X_train_mis, X_test_mis, y_train_mis, y_test_mis = train_test_split(
    X_text, y_misconception, test_size=0.2, random_state=42, stratify=y_misconception
)

# NOTE(review): both the estimator and the search below use n_jobs=-1 while
# training on a single CUDA device; confirm this does not oversubscribe the GPU.
xgb_mis = xgb.XGBClassifier(
    objective='multi:softmax',
    num_class=len(le_misconception.classes_),
    eval_metric='mlogloss',
    use_label_encoder=False,
    random_state=42,
    n_jobs=-1,
    device='cuda',  # GPU acceleration; replaces tree_method='gpu_hist' and predictor='gpu_predictor'
)

# RandomizedSearchCV for the Misconception model
random_search_mis = RandomizedSearchCV(
    estimator=xgb_mis,
    param_distributions=param_dist,
    n_iter=50,              # number of sampled configurations; tune to the compute budget
    cv=3,                   # cross-validation folds
    scoring='f1_weighted',  # evaluation metric
    verbose=1,              # print search progress
    random_state=42,
    n_jobs=-1,
)
random_search_mis.fit(X_train_mis, y_train_mis)

print("\nMisconception 模型最佳参数:", random_search_mis.best_params_)
# best_score_ is the mean cross-validated score of the best configuration.
print("Misconception 模型最佳F1分数 (训练集):", random_search_mis.best_score_)

# Use the best estimator found by the random search for evaluation
model_misconception_eval = random_search_mis.best_estimator_

# Predict and evaluate the Misconception model on the held-out test split
y_pred_mis_test = model_misconception_eval.predict(X_test_mis)
y_pred_proba_mis_test = model_misconception_eval.predict_proba(X_test_mis)

# Pin the label set: without it, a class absent from both y_test and y_pred
# makes classification_report / ConfusionMatrixDisplay fail because the
# number of names no longer matches the number of observed labels.
mis_labels = np.arange(len(le_misconception.classes_))

print(f"\nMisconception 模型测试集准确率: {accuracy_score(y_test_mis, y_pred_mis_test):.4f}")
print("\nMisconception 模型测试集分类报告:")
print(classification_report(
    y_test_mis, y_pred_mis_test,
    labels=mis_labels, target_names=le_misconception.classes_, zero_division=0,
))

# Misconception model MAP@3 on the test split
map_at_3_mis_test = mean_average_precision_at_k(y_test_mis, y_pred_proba_mis_test, k=3)
print(f"\nMisconception 模型测试集 Mean Average Precision @3 (MAP@3): {map_at_3_mis_test:.4f}")

# Confusion matrix for the Misconception model on the test split
print("\nMisconception 模型测试集混淆矩阵:")
cm_misconception_test = confusion_matrix(y_test_mis, y_pred_mis_test, labels=mis_labels)
disp_mis_test = ConfusionMatrixDisplay(confusion_matrix=cm_misconception_test, display_labels=le_misconception.classes_)
fig_mis_test, ax_mis_test = plt.subplots(figsize=(15, 12))
disp_mis_test.plot(cmap=plt.cm.Blues, ax=ax_mis_test)
ax_mis_test.set_title('Misconception 模型测试集混淆矩阵 (XGBoost - Random Search)')
plt.show()

print("\n模型评估完成。")

# --- 6. Retrain the final models on the full training set for test.csv prediction ---
print("\n--- 正在整个训练集上重新训练最终模型，用于对 test.csv 的预测 ---")
# Rebuild each classifier with the best hyper-parameters found by the random
# search and fit it on the FULL training data (no hold-out).
final_model_category = xgb.XGBClassifier(
    objective='multi:softmax',
    num_class=len(le_category.classes_),
    eval_metric='mlogloss',
    use_label_encoder=False,
    random_state=42,
    n_jobs=-1,
    device='cuda',  # GPU acceleration; replaces tree_method='gpu_hist' and predictor='gpu_predictor'
    **random_search_cat.best_params_,  # best parameters from the Category search
)
final_model_category.fit(X_text, y_category)  # Train on FULL df_train data

final_model_misconception = xgb.XGBClassifier(
    objective='multi:softmax',
    num_class=len(le_misconception.classes_),
    eval_metric='mlogloss',
    use_label_encoder=False,
    random_state=42,
    n_jobs=-1,
    device='cuda',  # GPU acceleration; replaces tree_method='gpu_hist' and predictor='gpu_predictor'
    **random_search_mis.best_params_,  # best parameters from the Misconception search
)
final_model_misconception.fit(X_text, y_misconception)  # Train on FULL df_train data
print("最终模型已在整个训练集上重新训练完毕。")

# --- 7. Top-K combined prediction helper (uses the final models) ---
def get_top_k_combined_predictions(question_text, mc_answer, student_explanation, k=3):
    """Return the top-K ``Category:Misconception`` combinations for one sample.

    The three text fields are joined with single spaces, transformed with the
    module-level TF-IDF vectorizer and embedding extractor, and scored by the
    two final models. Each (category, misconception) pair is scored as the
    product of the two models' class probabilities.

    Args:
        question_text (str): Question text.
        mc_answer (str): Multiple-choice answer.
        student_explanation (str): Student explanation text.
        k (int): Number of highest-probability combinations to return.

    Returns:
        list: Up to ``k`` ``("Category:Misconception", probability)`` tuples,
        sorted by probability in descending order. Ties keep
        category-major, misconception-minor order.
    """
    # Missing values become '' and non-string values are stringified, so a NaN
    # or numeric cell does not raise on concatenation.
    combined_text = " ".join(
        "" if pd.isna(part) else str(part)
        for part in (question_text, mc_answer, student_explanation)
    )

    # Same feature layout as training: dense TF-IDF followed by the mean embedding.
    X_new_tfidf = tfidf_vectorizer.transform([combined_text])
    X_new_embeddings = get_embedding_features([combined_text], word_embeddings, embedding_dim)
    X_new_transformed = np.hstack((X_new_tfidf.toarray(), X_new_embeddings))

    # Class probabilities from the final models
    proba_category = final_model_category.predict_proba(X_new_transformed)[0]
    proba_misconception = final_model_misconception.predict_proba(X_new_transformed)[0]

    category_names = le_category.classes_
    misconception_names = le_misconception.classes_

    # Joint score for every pair; row-major flattening keeps category-major order.
    joint = np.outer(proba_category, proba_misconception).ravel()
    # A stable sort on the negated scores gives descending order and keeps
    # equal scores in their original pair order.
    top_flat = np.argsort(-joint, kind='stable')[:k]
    n_misconceptions = len(proba_misconception)

    # NOTE(review): pairs use the training-time placeholder 'No_Misconception';
    # confirm this is the label string the competition's scorer expects.
    return [
        (
            f"{category_names[idx // n_misconceptions]}:{misconception_names[idx % n_misconceptions]}",
            joint[idx],
        )
        for idx in top_flat
    ]


# --- 8. Load test.csv and predict ---
print("\n--- 正在加载 test.csv 并进行预测 ---")

testfile_path = '/kaggle/input/map-charting-student-math-misunderstandings/test.csv'
try:
    df_test = pd.read_csv(testfile_path)
except FileNotFoundError:
    print("错误: 'test.csv' 文件未找到。请确保文件已上传。")
    # Small stand-in frame with the same columns so the pipeline can be demonstrated.
    df_test = pd.DataFrame({
        'row_id': [100000, 100001, 100002],
        'QuestionId': [99999, 88888, 77777],
        'QuestionText': [
            "What is 5 + 3?",
            "If x = 10, what is 2x?",
            "What is the capital of France?",
        ],
        'MC_Answer': [
            "8",
            "20",
            "Paris",
        ],
        'StudentExplanation': [
            "5 and 3 makes 8.",
            "2 times 10 is 20.",
            "It is the city of lights.",
        ],
    })
    print("已创建示例测试数据进行演示。")

# For every test row, predict the top 3 "Category:Misconception" labels and
# store them as one space-separated string.
df_test['Category:Misconception'] = [
    " ".join(
        label
        for label, _ in get_top_k_combined_predictions(
            question_text, mc_answer, student_explanation, k=3
        )
    )
    for question_text, mc_answer, student_explanation in zip(
        df_test['QuestionText'], df_test['MC_Answer'], df_test['StudentExplanation']
    )
]

print("\n测试数据预测完成。")

# Display the first few rows of the test DataFrame with predictions
print("\n预测结果示例 (前5行):")
print(df_test[['row_id', 'Category:Misconception']].head().to_markdown(index=False, numalign="left", stralign="left"))

# Save the predictions as the submission file
df_test.to_csv('submission.csv', index=False, columns=['row_id', 'Category:Misconception'])
print("\n预测结果已保存到 'submission.csv'。")