import polars as pl
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.decomposition import TruncatedSVD
import dill
import gc
import time
current_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
print("this notebook training time is ", current_time)
class Config:
    seed = 2024
    num_folds = 10
    TARGET_NAME = 'target'
    batch_size = 1000

import random
def seed_everything(seed):
    np.random.seed(seed)
    random.seed(seed)
seed_everything(Config.seed)
colname2dtype=pd.read_csv("/kaggle/input/home-credit-inconsistent-data-types/colname2dtype.csv")
colname=colname2dtype['Column'].values
dtype = colname2dtype['DataType'].values
dtype2pl = {}
dtype2pl['Int64']=pl.Int64
dtype2pl['Float64']=pl.Float64
dtype2pl['String']=pl.String
dtype2pl['Boolean'] = pl.String  # Boolean columns are read as strings (dtypes are inconsistent across files)
colname2dtype = {}
for idx in range(len(colname)):
    colname2dtype[colname[idx]] = dtype2pl[dtype[idx]]
def find_df_null_col(df, margin=0.975):
    # list columns whose null fraction exceeds `margin` (pandas API; unused below)
    cols = []
    for col in df.columns:
        if df[col].isna().mean() > margin:
            cols.append(col)
    return cols
def find_last_case_id(df, id='case_id'):
    # keep only the last row of each contiguous case_id block
    df_copy = df.clone()
    df_tail = df.tail(1)
    df_copy = df_copy.with_columns(pl.col(id).shift(-1).alias(f"{id}_shift_-1"))
    df_last = df_copy.filter(pl.col(id) - pl.col(f'{id}_shift_-1') != 0).drop(f'{id}_shift_-1')
    df_last = pl.concat([df_last, df_tail])
    del df_copy, df_tail
    gc.collect()
    return df_last
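# A minimal sanity check of find_last_case_id on toy data (hypothetical values,
# not from the competition files): rows sharing a case_id are assumed contiguous,
# and only the last row of each block should survive.
_demo = pl.DataFrame({'case_id': [1, 1, 2, 2, 2, 3], 'v': [10, 11, 20, 21, 22, 30]})
assert find_last_case_id(_demo)['v'].to_list() == [11, 22, 30]
del _demo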
def df_fillna(df, col, method=None):
    # method=None: no-op; "forward": forward fill; anything else: constant fill value.
    # Uses fill_null(strategy='forward') and with_columns so the other columns are kept.
    if method is None:
        return df
    if method == "forward":
        df = df.with_columns(pl.col(col).fill_null(strategy='forward'))
    else:
        df = df.with_columns(pl.col(col).fill_null(method).alias(col))
    return df
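# df_fillna sanity check (toy data, hypothetical values): strategy-based forward
# fill vs a constant fill.
_d = pl.DataFrame({'x': [1, None, 3]})
assert df_fillna(_d, 'x', method='forward')['x'].to_list() == [1, 1, 3]
assert df_fillna(_d, 'x', method=0)['x'].to_list() == [1, 0, 3]
del _d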
def one_hot_encoder(df, col, unique):
    if len(unique) == 2:
        # binary column: one indicator is enough
        df = df.with_columns((pl.col(col) == unique[0]).cast(pl.Int8).alias(f"{col}_{unique[0]}"))
    else:
        for idx in range(len(unique)):
            df = df.with_columns((pl.col(col) == unique[idx]).cast(pl.Int8).alias(f"{col}_{unique[idx]}"))
    return df.drop(col)
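# one_hot_encoder sanity check (toy categories, hypothetical): with exactly two
# categories only the first gets an indicator column; otherwise one per category.
_c = pl.DataFrame({'c': ['a', 'b', 'a']})
_enc = one_hot_encoder(_c, 'c', ['a', 'b'])
assert _enc.columns == ['c_a'] and _enc['c_a'].to_list() == [1, 0, 1]
del _c, _enc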
def last_features_merge(feats, last_df, last_features=[]):
    # join the latest-row features onto feats, filling nulls per (col, fill) pair
    last_df = last_df.select(['case_id'] + [last[0] for last in last_features])
    for last in last_features:
        col, fill = last
        last_df = df_fillna(last_df, col, method=fill)
    feats = feats.join(last_df, on='case_id', how='left')
    return feats
def group_features_merge(feats, group_df, group_features=[], group_name='applprev2'):
    group_df = group_df.select(['case_id'] + [g[0] for g in group_features])
    for group in group_features:
        if group_df[group[0]].dtype == pl.String:
            col, fill, one_hot = group
            group_df = df_fillna(group_df, col, method=fill)
            if one_hot == None:
                group_df = group_df.drop(col)
            else:
                group_df = one_hot_encoder(group_df, col, one_hot)
                for value in one_hot:
                    new_col = f"{col}_{value}"
                    feat = group_df.group_by('case_id').agg(
                        pl.mean(new_col).alias(f"mean_{group_name}_{new_col}"),
                        pl.std(new_col).alias(f"std_{group_name}_{new_col}"),
                        pl.count(new_col).alias(f"count_{group_name}_{new_col}"),
                    )
                    feats = feats.join(feat, on='case_id', how='left')
        else:
            col, fill = group
            group_df = df_fillna(group_df, col, method=fill)
            feat = group_df.group_by('case_id').agg(
                pl.max(col).alias(f"max_{group_name}_{col}"),
                pl.mean(col).alias(f"mean_{group_name}_{col}"),
                pl.median(col).alias(f"median_{group_name}_{col}"),
                pl.std(col).alias(f"std_{group_name}_{col}"),
                pl.min(col).alias(f"min_{group_name}_{col}"),
                pl.count(col).alias(f"count_{group_name}_{col}"),
                pl.sum(col).alias(f"sum_{group_name}_{col}"),
                pl.n_unique(col).alias(f"n_unique_{group_name}_{col}"),
                pl.first(col).alias(f"first_{group_name}_{col}"),
                pl.last(col).alias(f"last_{group_name}_{col}"),
            )
            feats = feats.join(feat, on='case_id', how='left')
    return feats

def set_table_dtypes(df):
    for col in df.columns:
        df = df.with_columns(pl.col(col).cast(colname2dtype[col]).alias(col))
    return df
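# group_features_merge sanity check (toy data, hypothetical): numeric columns are
# aggregated per case_id under names like "<agg>_<group_name>_<col>".
_g = pl.DataFrame({'case_id': [1, 1, 2], 'num_group1': [0, 1, 0]})
_f = group_features_merge(pl.DataFrame({'case_id': [1, 2]}), _g, [['num_group1', 0]], group_name='demo')
assert 'mean_demo_num_group1' in _f.columns and _f['sum_demo_num_group1'].to_list() == [1, 0]
del _g, _f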
def preprocessor(mode='train'):
    print(f"{mode} base file after break. number: 1")
    feats = pl.read_csv(f"/kaggle/input/home-credit-credit-risk-model-stability/csv_files/{mode}/{mode}_base.csv").pipe(set_table_dtypes)
    feats = feats.drop(['date_decision', 'MONTH', 'WEEK_NUM'])

    print("-" * 30)
    print(f"{mode} applprev_2 file after break. number: 1")
    applprev2 = pl.read_csv(f"/kaggle/input/home-credit-credit-risk-model-stability/csv_files/{mode}/{mode}_applprev_2.csv").pipe(set_table_dtypes)
    # (col != col) is intended as a missing-value check (NaN != NaN):
    # no block reason and no contact type -> no credit card at all
    applprev2 = applprev2.with_columns(
        ((pl.col('cacccardblochreas_147M') != pl.col('cacccardblochreas_147M'))
         & (pl.col('conts_type_509L') != pl.col('conts_type_509L'))).alias("no_credit"))
    applprev2 = applprev2.with_columns(
        ((pl.col('cacccardblochreas_147M') != pl.col('cacccardblochreas_147M'))
         & (pl.col('conts_type_509L') == pl.col('conts_type_509L'))).alias("no_frozen_credit").cast(pl.Int8))
    applprev2 = applprev2.with_columns(
        (pl.col('cacccardblochreas_147M') == pl.col('cacccardblochreas_147M')).alias("frozen_credit").cast(pl.Int8))
    applprev2_last = find_last_case_id(applprev2)
    """
    Some of these columns should take the latest value as the feature; others need
    a groupby. Contact type uses the latest row, to see whether a person's most
    recent state still has no credit card. The frozen-card flag also uses the
    latest state (it is only one feature anyway), and it can be constructed from
    the block-reason column.
    """
    last_features = [['conts_type_509L', 'WHATSAPP'], ['no_credit', 0],
                     ['no_frozen_credit', 0], ['frozen_credit', 0]]
    feats = last_features_merge(feats, applprev2_last, last_features)
    group_features = [
        ['cacccardblochreas_147M', 'a55475b1',
         ["P19_60_110", "P17_56_144", "a55475b1", "P201_63_60", "P127_74_114",
          "P133_119_56", "P41_107_150", "P23_105_103", "P33_145_161"]],
        ['credacc_cards_status_52L', 'UNCONFIRMED',
         ['BLOCKED', 'UNCONFIRMED', 'RENEWED', 'CANCELLED', 'INACTIVE', 'ACTIVE']],
        ['num_group1', 0],
        ['num_group2', 0],
    ]
    feats = group_features_merge(feats, applprev2, group_features, group_name='applprev2')
    del applprev2, applprev2_last
    gc.collect()

    print("-" * 30)
    print("credit bureau b num 2")
    bureau_b_1 = pl.read_csv(f"/kaggle/input/home-credit-credit-risk-model-stability/csv_files/{mode}/{mode}_credit_bureau_b_1.csv").pipe(set_table_dtypes)
    bureau_b_2 = pl.read_csv(f"/kaggle/input/home-credit-credit-risk-model-stability/csv_files/{mode}/{mode}_credit_bureau_b_2.csv").pipe(set_table_dtypes)
    bureau_b_1_last = find_last_case_id(bureau_b_1, id='case_id')
    bureau_b_2_last = find_last_case_id(bureau_b_2, id='case_id')
    feats = feats.join(bureau_b_1_last, on='case_id', how='left')
    feats = feats.join(bureau_b_2_last, on='case_id', how='left')
    del bureau_b_1, bureau_b_1_last, bureau_b_2, bureau_b_2_last
    gc.collect()

    print(f"{mode} debitcard file after break num 1")
    debitcard = pl.read_csv(f"/kaggle/input/home-credit-credit-risk-model-stability/csv_files/{mode}/{mode}_debitcard_1.csv").pipe(set_table_dtypes)
    debitcard_last = find_last_case_id(debitcard, id='case_id')
    last_features = [['last180dayaveragebalance_704A', 0], ['last180dayturnover_1134A', 30000],
                     ['last30dayturnover_651A', 0]]
    feats = last_features_merge(feats, debitcard_last, last_features)
    group_features = [['num_group1', 0]]
    feats = group_features_merge(feats, debitcard, group_features, group_name='debitcard')
    del debitcard, debitcard_last
    gc.collect()

    print(f"{mode} deposit file num 1")
    deposit = pl.read_csv(f"/kaggle/input/home-credit-credit-risk-model-stability/csv_files/{mode}/{mode}_deposit_1.csv").pipe(set_table_dtypes)
    for idx in range(1, len(deposit.columns)):
        col = deposit.columns[idx]
        column_type = deposit[col].dtype
        is_numeric = (column_type == pl.datatypes.Int64) or (column_type == pl.datatypes.Float64)
        if is_numeric:
            feat = deposit.group_by('case_id').agg(
                pl.max(col).alias(f"max_deposit_{col}"),
                pl.mean(col).alias(f"mean_deposit_{col}"),
                pl.median(col).alias(f"median_deposit_{col}"),
                pl.std(col).alias(f"std_deposit_{col}"),
                pl.min(col).alias(f"min_deposit_{col}"),
                pl.count(col).alias(f"count_deposit_{col}"),
                pl.sum(col).alias(f"sum_deposit_{col}"),
                pl.n_unique(col).alias(f"n_unique_deposit_{col}"),
                pl.first(col).alias(f"first_deposit_{col}"),
                pl.last(col).alias(f"last_deposit_{col}"),
            )
            feats = feats.join(feat, on='case_id', how='left')
    del deposit
    gc.collect()

    print(f"{mode} other file after break number 1")
    other = pl.read_csv(f"/kaggle/input/home-credit-credit-risk-model-stability/csv_files/{mode}/{mode}_other_1.csv").pipe(set_table_dtypes)
    other_last = find_last_case_id(other)
    last_features = [['amtdepositbalance_4809441A', 0]]
    feats = last_features_merge(feats, other_last, last_features)
    group_features = [['amtdebitincoming_4809443A', 0], ['amtdebitoutgoing_4809440A', 0],
                      ['amtdepositincoming_4809444A', 0], ['amtdepositoutgoing_4809442A', 0]]
    feats = group_features_merge(feats, other, group_features, group_name='other')
    del other, other_last
    gc.collect()

    print("person 1 num 1")
    person1 = pl.read_csv(f"/kaggle/input/home-credit-credit-risk-model-stability/csv_files/{mode}/{mode}_person_1.csv").pipe(set_table_dtypes)
    person1 = person1.drop(['birthdate_87D', 'childnum_185L', 'gender_992L', 'housingtype_772L',
                            'isreference_387L', 'maritalst_703L', 'role_993L'])
    person1 = person1.select(['case_id', 'contaddr_matchlist_1032L', 'contaddr_smempladdr_334L',
                              'empl_employedtotal_800L', 'language1_981M', 'persontype_1072L',
                              'persontype_792L', 'remitter_829L', 'role_1084L',
                              'safeguarantyflag_411L', 'sex_738L'])
    person1_last = find_last_case_id(person1)
    feats = feats.join(person1_last, on='case_id', how='left')
    del person1, person1_last
    gc.collect()

    print(f"{mode} person2 file after break number 1")
    person2 = pl.read_csv(f"/kaggle/input/home-credit-credit-risk-model-stability/csv_files/{mode}/{mode}_person_2.csv").pipe(set_table_dtypes)
    person2 = person2.drop(['addres_role_871L', 'empls_employedfrom_796D', 'relatedpersons_role_762T'])
    person2 = person2.drop(['addres_district_368M', 'addres_zip_823M', 'empls_employer_name_740M'])
    # note: these group_features are defined but never merged for person2
    group_features = [
        ['conts_role_79M', 'a55475b1',
         ['a55475b1', 'P38_92_157', 'P7_147_157', 'P177_137_98', 'P125_14_176',
          'P125_105_50', 'P115_147_77', 'P58_79_51', 'P124_137_181', 'P206_38_166', 'P42_134_91']],
        ['empls_economicalst_849M', 'a55475b1',
         ['a55475b1', 'P164_110_33', 'P22_131_138', 'P28_32_178', 'P148_57_109',
          'P7_47_145', 'P164_122_65', 'P112_86_147', 'P82_144_169', 'P191_80_124']],
        ['num_group1', 0],
        ['num_group2', 0],
    ]
    del person2
    gc.collect()

    print("static_0 file num 2(3)")
    static_0_0 = pl.read_csv(f"/kaggle/input/home-credit-credit-risk-model-stability/csv_files/{mode}/{mode}_static_0_0.csv").pipe(set_table_dtypes)
    static_0_1 = pl.read_csv(f"/kaggle/input/home-credit-credit-risk-model-stability/csv_files/{mode}/{mode}_static_0_1.csv").pipe(set_table_dtypes)
    static = pl.concat([static_0_0, static_0_1], how="vertical_relaxed")
    if mode == 'test':
        static_0_2 = pl.read_csv(f"/kaggle/input/home-credit-credit-risk-model-stability/csv_files/{mode}/{mode}_static_0_2.csv").pipe(set_table_dtypes)
        static = pl.concat([static, static_0_2], how="vertical_relaxed")
    feats = feats.join(static, on='case_id', how='left')
    del static, static_0_0, static_0_1
    gc.collect()

    print(f"{mode} static_cb file after break num 1")
    static_cb = pl.read_csv(f"/kaggle/input/home-credit-credit-risk-model-stability/csv_files/{mode}/{mode}_static_cb_0.csv").pipe(set_table_dtypes)
    static_cb = static_cb.drop(['assignmentdate_4955616D', 'dateofbirth_342D', 'for3years_128L',
                                'for3years_504L', 'for3years_584L', 'formonth_118L', 'formonth_206L',
                                'formonth_535L', 'forquarter_1017L', 'forquarter_462L', 'forquarter_634L',
                                'fortoday_1092L', 'forweek_1077L', 'forweek_528L', 'forweek_601L',
                                'foryear_618L', 'foryear_818L', 'foryear_850L', 'pmtaverage_4955615A',
                                'pmtcount_4955617L', 'riskassesment_302T', 'riskassesment_940T'])
    static_cb = static_cb.drop(['birthdate_574D', 'dateofbirth_337D', 'assignmentdate_238D',
                                'assignmentdate_4527235D', 'responsedate_1012D',
                                'responsedate_4527233D', 'responsedate_4917613D'])
    last_features = [
        ['contractssum_5085716L', 0], ['days120_123L', 0], ['days180_256L', 0],
        ['days30_165L', 0], ['days360_512L', 1], ['days90_310L', 0],
        ['description_5085714M', 'a55475b1'], ['education_88M', 'a55475b1'],
        ['firstquarter_103L', 0], ['secondquarter_766L', 0], ['thirdquarter_1082L', 0],
        ['fourthquarter_440L', 0], ['maritalst_385M', 'a55475b1'], ['numberofqueries_373L', 1],
        ['pmtaverage_3A', 0], ['pmtcount_693L', 6], ['pmtscount_423L', 6.0],
        ['pmtssum_45A', 0], ['requesttype_4525192L', 'DEDUCTION_6'],
    ]
    feats = last_features_merge(feats, static_cb, last_features)
    feats = feats.with_columns((pl.col('days180_256L') - pl.col('days120_123L')).alias("daysgap60"))
    feats = feats.with_columns((pl.col('days180_256L') - pl.col('days30_165L')).alias("daysgap150"))
    feats = feats.with_columns((pl.col('days120_123L') - pl.col('days30_165L')).alias("daysgap90"))
    feats = feats.with_columns((pl.col('firstquarter_103L') + pl.col('secondquarter_766L')
                                + pl.col('thirdquarter_1082L') + pl.col('fourthquarter_440L')).alias("totalyear_result"))
    del static_cb
    gc.collect()

    print("-" * 30)
    print(f"{mode} tax_a file after break num 1")
    tax_a = pl.read_csv(f"/kaggle/input/home-credit-credit-risk-model-stability/csv_files/{mode}/{mode}_tax_registry_a_1.csv").pipe(set_table_dtypes)
    group_features = [['amount_4527230A', 850], ['num_group1', 0]]
    feats = group_features_merge(feats, tax_a, group_features, group_name='tax_a')
    del tax_a
    gc.collect()

    print("-" * 30)
    print(f"{mode} tax_b file after break num 1")
    tax_b = pl.read_csv(f"/kaggle/input/home-credit-credit-risk-model-stability/csv_files/{mode}/{mode}_tax_registry_b_1.csv").pipe(set_table_dtypes)
    group_features = [['amount_4917619A', 6885], ['num_group1', 0]]
    feats = group_features_merge(feats, tax_b, group_features, group_name='tax_b')
    del tax_b
    gc.collect()

    print("-" * 30)
    print(f"{mode} tax_c file after break num 1")
    tax_c = pl.read_csv(f"/kaggle/input/home-credit-credit-risk-model-stability/csv_files/{mode}/{mode}_tax_registry_c_1.csv").pipe(set_table_dtypes)
    if len(tax_c) == 0:
        # if the file is empty, fall back to the train file so the aggregations still produce columns
        tax_c = pl.read_csv("/kaggle/input/home-credit-credit-risk-model-stability/csv_files/train/train_tax_registry_c_1.csv").pipe(set_table_dtypes)
    tax_c = tax_c.drop(['employername_160M', 'processingdate_168D'])
    group_features = [['pmtamount_36A', 850], ['num_group1', 0]]
    feats = group_features_merge(feats, tax_c, group_features, group_name='tax_c')
    del tax_c
    gc.collect()
    print("-" * 30)
    return feats
train_feats=preprocessor(mode='train')
test_feats = preprocessor(mode='test')
train_feats = train_feats.to_pandas()
test_feats=test_feats.to_pandas()
mode_values = train_feats.mode().iloc[0]
train_feats = train_feats.fillna(mode_values)
test_feats = test_feats.fillna(mode_values)
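# Mode imputation illustration (toy frame, hypothetical values): the per-column
# training-set mode fills NaNs in both train and test, so no test statistics leak
# into the pipeline.
_m = pd.DataFrame({'a': [1.0, np.nan, 1.0, 2.0]})
assert _m.fillna(_m.mode().iloc[0])['a'].to_list() == [1.0, 1.0, 1.0, 2.0]
del _m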
print("----------string one hot encoder ****")
for col in test_feats.columns:
    n_unique = train_feats[col].nunique()
    if n_unique == 2 and train_feats[col].dtype == 'object':
        print(f"one_hot_2:{col}")
        unique = train_feats[col].unique()
        train_feats[col] = (train_feats[col] == unique[0]).astype(int)
        test_feats[col] = (test_feats[col] == unique[0]).astype(int)
    elif (n_unique < 10) and train_feats[col].dtype == 'object':
        print(f"one_hot_10:{col}")
        unique = train_feats[col].unique()
        for idx in range(len(unique)):
            if unique[idx] == unique[idx]:  # NaN != NaN, so this skips missing categories
                train_feats[col + "_" + str(idx)] = (train_feats[col] == unique[idx]).astype(int)
                test_feats[col + "_" + str(idx)] = (test_feats[col] == unique[idx]).astype(int)
        train_feats.drop([col], axis=1, inplace=True)
        test_feats.drop([col], axis=1, inplace=True)
print("----------drop other string or unique value or full null value ****")
drop_cols=[]
for col in test_feats.columns:
    if (train_feats[col].dtype == 'object') or (test_feats[col].dtype == 'object') \
            or (train_feats[col].nunique() == 1) or train_feats[col].isna().mean() > 0.99:
        drop_cols += [col]
drop_cols+=['case_id']
train_feats.drop(drop_cols,axis=1,inplace=True)
test_feats.drop(drop_cols,axis=1,inplace=True)
print(f"len(train_feats):{len(train_feats)},total_features_counts:{len(test_feats.columns)}")
train_feats.head()
def reduce_mem_usage(df, float16_as32=True):
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    for col in df.columns:
        col_type = df[col].dtype
        if col_type != object:
            c_min, c_max = df[col].min(), df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    # float16 loses precision, so optionally promote to float32 instead
                    if float16_as32:
                        df[col] = df[col].astype(np.float32)
                    else:
                        df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    return df
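# reduce_mem_usage sanity check (toy frame, hypothetical values): small ints
# downcast to int8, and small floats become float32 under the default float16_as32=True.
_t = pd.DataFrame({'i': [1, 2, 3], 'f': [0.5, 1.5, 2.5]})
_t = reduce_mem_usage(_t)
assert _t['i'].dtype == np.int8 and _t['f'].dtype == np.float32
del _t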
train_feats = reduce_mem_usage(train_feats)
test_feats = reduce_mem_usage(test_feats)

def pearson_corr(x1, x2):
    """x1, x2: np.array; population Pearson correlation coefficient."""
    mean_x1 = np.mean(x1)
    mean_x2 = np.mean(x2)
    std_x1 = np.std(x1)
    std_x2 = np.std(x2)
    pearson = np.mean((x1 - mean_x1) * (x2 - mean_x2)) / (std_x1 * std_x2)
    return pearson
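# pearson_corr uses population statistics (np.std with ddof=0), which is exactly
# the Pearson correlation; a quick check against np.corrcoef on toy vectors
# (hypothetical values):
_a = np.array([1.0, 2.0, 3.0, 4.0])
_b = np.array([1.0, 3.0, 2.0, 5.0])
assert abs(pearson_corr(_a, _b) - np.corrcoef(_a, _b)[0, 1]) < 1e-9
del _a, _b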
choose_cols=[]
for col in train_feats.columns:
    if col != 'target':
        pearson = pearson_corr(train_feats[col].values, train_feats['target'].values)
        if abs(pearson) > 0.0025:
            choose_cols.append(col)
print(f"len(choose_cols):{len(choose_cols)},choose_cols:{choose_cols}")
from sklearn.linear_model import LinearRegression

X = train_feats[choose_cols].copy()
y=train_feats[Config.TARGET_NAME].copy()
test_X=test_feats[choose_cols].copy()
oof_pred_pro=np.zeros((len(X)))
test_pred_pro=np.zeros((Config.num_folds,len(test_X)))
del train_feats,test_feats
gc.collect()
skf = StratifiedKFold(n_splits=Config.num_folds, random_state=Config.seed, shuffle=True)
for fold, (train_index, valid_index) in enumerate(skf.split(X, y.astype(str))):
    print(f"fold:{fold}")
    X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]
    model = LinearRegression()
    model.fit(X_train, y_train)
    oof_pred_pro[valid_index] = model.predict(X_valid)
    # predict the test set in batches to keep peak memory low
    for idx in range(0, len(test_X), Config.batch_size):
        test_pred_pro[fold][idx:idx + Config.batch_size] = model.predict(test_X[idx:idx + Config.batch_size])
    del model, X_train, X_valid, y_train, y_valid
    gc.collect()
gini=2*roc_auc_score(y.values,oof_pred_pro)-1
print(f"mean_gini:{gini}")test_preds=test_pred_pro.mean(axis=0)
submission=pd.read_csv("/kaggle/input/home-credit-credit-risk-model-stability/sample_submission.csv")
submission['score']=np.clip(np.nan_to_num(test_preds,nan=0.3),0,1)
submission.to_csv("submission.csv",index=None)
submission.head()