FTTransformer Beats LightGBM in Prediction Accuracy - Paper Reading


FTTransformer is a model for classification and regression on structured (tabular) data. FT stands for Feature Tokenizer: both the categorical and the numerical features of a tabular dataset are encoded into vectors, much like word tokens, so a Transformer can extract features from tabular data the same way it does from text. Notably, the architecture makes a few subtle changes to the standard Transformer to suit tabular data. For example, the LayerNorm is removed from the input of the first Transformer block, and, following BERT's design, an output token (the CLS token) is appended to the feature tokens and participates in the attention computation together with them.
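
The tokenization idea can be sketched in a few lines of plain PyTorch. This is only an illustration of the mechanism, not the torchkeras implementation; the dimensions, feature counts, and cardinalities below are made up:

```python
import torch
from torch import nn

d_token = 32              # per-feature token dimension (illustrative)
n_num, n_cat = 10, 2      # numbers of numerical / categorical features (illustrative)
cat_cards = [4, 40]       # cardinality of each categorical feature (illustrative)
bs = 8                    # batch size

# numerical feature i -> x_i * W_i + b_i, one learned vector pair per feature
W_num = nn.Parameter(torch.randn(n_num, d_token))
b_num = nn.Parameter(torch.randn(n_num, d_token))
# categorical feature -> ordinary embedding lookup
cat_emb = nn.ModuleList([nn.Embedding(c, d_token) for c in cat_cards])
# CLS token appended to the feature tokens, as in BERT
cls = nn.Parameter(torch.randn(1, 1, d_token))

x_num = torch.randn(bs, n_num)
x_cat = torch.stack([torch.randint(0, c, (bs,)) for c in cat_cards], dim=1)

num_tokens = x_num.unsqueeze(-1) * W_num + b_num                       # (bs, n_num, d_token)
cat_tokens = torch.stack([e(x_cat[:, i]) for i, e in enumerate(cat_emb)], dim=1)
tokens = torch.cat([num_tokens, cat_tokens, cls.expand(bs, -1, -1)], dim=1)
print(tokens.shape)       # (bs, n_num + n_cat + 1, d_token), fed to the Transformer blocks
```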

1. Prepare the Data

import numpy as np 
import pandas as pd 
from sklearn.model_selection import train_test_split

file_path = "covertype.parquet"
dfdata = pd.read_parquet(file_path)

cat_cols = ['Wilderness_Area', 'Soil_Type']
num_cols = ['Elevation','Aspect','Slope','Horizontal_Distance_To_Hydrology',
 'Vertical_Distance_To_Hydrology','Horizontal_Distance_To_Roadways',
 'Hillshade_9am','Hillshade_Noon','Hillshade_3pm','Horizontal_Distance_To_Fire_Points']
target_col = 'Cover_Type'

print("dfdata.shape = ",dfdata.shape)
print("target_col = ", target_col)
print('cat_cols = ', cat_cols)  
print('num_cols = ', num_cols[:3]+['...'])

dftmp, dftest_raw = train_test_split(dfdata, random_state=42, test_size=0.2)
dftrain_raw, dfval_raw = train_test_split(dftmp, random_state=42, test_size=0.2)

print("len(dftrain) = ",len(dftrain_raw))
print("len(dfval) = ",len(dfval_raw))
print("len(dftest) = ",len(dftest_raw))

dfdata.shape =  (581012, 13)
target_col =  Cover_Type
cat_cols =  ['Wilderness_Area', 'Soil_Type']
num_cols =  ['Elevation', 'Aspect', 'Slope', '...']
len(dftrain) =  371847
len(dfval) =  92962
len(dftest) =  116203

from torchkeras.tabular import TabularPreprocessor
from sklearn.preprocessing import OrdinalEncoder

# feature engineering
pipe = TabularPreprocessor(cat_features = cat_cols, 
                           embedding_features=cat_cols)
encoder = OrdinalEncoder()

dftrain = pipe.fit_transform(dftrain_raw.drop(target_col,axis=1))
dftrain[target_col] = encoder.fit_transform(
    dftrain_raw[target_col].values.reshape(-1,1)).astype(np.int32)

dfval = pipe.transform(dfval_raw.drop(target_col,axis=1))
dfval[target_col] = encoder.transform(
    dfval_raw[target_col].values.reshape(-1,1)).astype(np.int32)

dftest = pipe.transform(dftest_raw.drop(target_col,axis=1))
dftest[target_col] = encoder.transform(
    dftest_raw[target_col].values.reshape(-1,1)).astype(np.int32)
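
As a quick sanity check of the label encoding, you can inspect the mapping learned by the OrdinalEncoder (a small sketch; the exact label values depend on the dataset):

```python
print(encoder.categories_[0])         # original Cover_Type labels, in encoded order
print(dftrain[target_col].nunique())  # number of encoded classes, expected to be 7
```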



from torchkeras.tabular import TabularDataset
from torch.utils.data import Dataset,DataLoader 

def get_dataset(dfdata):
    return TabularDataset(
                data = dfdata,
                task = 'classification',
                target = [target_col],
                continuous_cols = pipe.get_numeric_features(),
                categorical_cols = pipe.get_embedding_features()
        )

def get_dataloader(ds,batch_size=1024,num_workers=0,shuffle=False):
    dl = DataLoader(
            ds,
            batch_size=batch_size,
            shuffle=shuffle,
            num_workers=num_workers,
            pin_memory=False,
        )
    return dl 
    
ds_train = get_dataset(dftrain)
ds_val = get_dataset(dfval)
ds_test = get_dataset(dftest)

dl_train = get_dataloader(ds_train,shuffle=True)
dl_val = get_dataloader(ds_val,shuffle=False)
dl_test = get_dataloader(ds_test,shuffle=False)


for batch in dl_train:
    break
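
The loop above only grabs a single batch as a smoke test. Printing its contents shows how the dataset packages features and target; the exact key names come from torchkeras, so treat this as an inspection sketch:

```python
# each batch is a dict of tensors; print keys, shapes and dtypes
for key, value in batch.items():
    print(key, value.shape, value.dtype)
```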

2. Define the Model


from torchkeras.tabular.models import FTTransformerConfig,FTTransformerModel

model_config = FTTransformerConfig(
    task="classification",
    num_attn_blocks=3
)

config = model_config.merge_dataset_config(ds_train)
net = FTTransformerModel(config = config)

# initialize parameters
net.reset_weights()
net.data_aware_initialization(dl_train)

print(net.backbone.output_dim)
print(net.hparams.output_dim)

32
7

The backbone outputs a 32-dimensional representation (the pooled CLS embedding), which the classification head maps to the 7 Cover_Type classes.


output = net.forward(batch)
loss = net.compute_loss(output,batch['target'])
print(loss)

tensor(1.8233, grad_fn=)

3. Train the Model


from torchkeras import KerasModel 
from torchkeras.tabular import StepRunner 
KerasModel.StepRunner = StepRunner 


import torch 
from torch import nn 
class Accuracy(nn.Module):
    def __init__(self):
        super().__init__()

        self.correct = nn.Parameter(torch.tensor(0.0),requires_grad=False)
        self.total = nn.Parameter(torch.tensor(0.0),requires_grad=False)

    def forward(self, preds: torch.Tensor, targets: torch.Tensor):
        preds = preds.argmax(dim=-1)
        targets = targets.reshape(-1)
        m = (preds == targets).sum()
        n = targets.shape[0] 
        self.correct += m 
        self.total += n
        
        return m/n

    def compute(self):
        return self.correct.float() / self.total 
    
    def reset(self):
        self.correct -= self.correct
        self.total -= self.total
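
Before wiring the metric into training, a quick check on dummy data confirms it behaves as expected (a small sketch with made-up logits):

```python
acc = Accuracy()
logits = torch.tensor([[0.1, 0.9], [0.8, 0.2]])   # two samples, two classes
targets = torch.tensor([1, 0])
print(acc(logits, targets))   # batch accuracy -> tensor(1.)
print(acc.compute())          # accumulated accuracy -> tensor(1.)
acc.reset()
```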


keras_model = KerasModel(net,
                   loss_fn=None,
                   optimizer = torch.optim.AdamW(net.parameters(),lr = 1e-3),
                   metrics_dict = {"acc":Accuracy()}
                   )


keras_model.fit(
    train_data = dl_train,
    val_data= dl_val,
    ckpt_path='checkpoint',
    epochs=20,
    patience=10,
    monitor="val_acc", 
    mode="max",
    plot = True,
    wandb = False
)

4. Evaluate the Model

keras_model.evaluate(dl_train)
keras_model.evaluate(dl_val)
keras_model.evaluate(dl_test)

5. Use the Model

from tqdm import tqdm 
net = net.cpu()
net.eval()
preds = []
with torch.no_grad():
    for batch in tqdm(dl_test):
        preds.append(net.predict(batch))
yhat_list = [yd.argmax(dim=-1).tolist() for yd in preds]
yhat = []
for yd in yhat_list:
    yhat.extend(yd)
yhat = encoder.inverse_transform(np.array(yhat).reshape(-1,1))
dftest_raw = dftest_raw.rename(columns = {target_col: 'y'})
dftest_raw['yhat'] = yhat
from sklearn.metrics import classification_report
print(classification_report(y_true = dftest_raw['y'],y_pred = dftest_raw['yhat']))
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix


# compute the confusion matrix
cm = confusion_matrix(dftest_raw['y'], dftest_raw['yhat'])

# convert the confusion matrix to a DataFrame
df_cm = pd.DataFrame(cm, index=['Actual {}'.format(i) for i in range(cm.shape[0])],
                     columns=['Predicted {}'.format(i) for i in range(cm.shape[1])])

# plot the confusion matrix with seaborn
plt.figure(figsize=(10,7))
sns.heatmap(df_cm, annot=True, fmt='d', cmap='Blues', cbar=False)
plt.title('Confusion Matrix')
plt.xlabel('Predicted labels')
plt.ylabel('True labels')
plt.show()
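
The same confusion matrix also gives per-class recall directly (a small numpy sketch; labelling the rows with encoder.categories_ assumes all seven classes appear in the test split):

```python
# per-class recall = diagonal / row sums of the confusion matrix
per_class_recall = cm.diagonal() / cm.sum(axis=1)
for label, recall in zip(encoder.categories_[0], per_class_recall):
    print(f"{label}: recall = {recall:.3f}")
```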

6. Save the Model

The best model weights have already been saved at ckpt_path = 'checkpoint'.

net.load_state_dict(torch.load('checkpoint'))
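
If you want a standalone copy of the weights outside the training checkpoint, the usual PyTorch idiom works (the file name below is just an example):

```python
# save a separate copy of the trained weights (illustrative file name)
torch.save(net.state_dict(), 'fttransformer_covertype.pt')
```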

7. Comparison with LightGBM

```python
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import accuracy_score

dftmp, dftest_raw = train_test_split(dfdata, random_state=42, test_size=0.2)
dftrain_raw, dfval_raw = train_test_split(dftmp, random_state=42, test_size=0.2)

dftrain = dftrain_raw.copy()
dfval = dfval_raw.copy()
dftest = dftest_raw.copy()

target_col = 'Cover_Type'
cat_cols = ['Wilderness_Area', 'Soil_Type']

encoder = OrdinalEncoder()

dftrain[target_col] = encoder.fit_transform(dftrain[target_col].values.reshape(-1,1))
dfval[target_col] = encoder.transform(dfval[target_col].values.reshape(-1,1))
dftest[target_col] = encoder.transform(dftest[target_col].values.reshape(-1,1))

for col in cat_cols:
    dftrain[col] = dftrain[col].astype(int)
    dfval[col] = dfval[col].astype(int)
    dftest[col] = dftest[col].astype(int)

ds_train = lgb.Dataset(dftrain.drop(columns=[target_col]), label=dftrain[target_col],
                       categorical_feature=cat_cols)
ds_val = lgb.Dataset(dfval.drop(columns=[target_col]), label=dfval[target_col],
                     categorical_feature=cat_cols)
ds_test = lgb.Dataset(dftest.drop(columns=[target_col]), label=dftest[target_col],
                      categorical_feature=cat_cols)

params = {
    'n_estimators': 500,
    'boosting_type': 'gbdt',
    'objective': 'multiclass',
    'num_class': 7,               # number of classes
    'metric': 'multi_logloss',
    'learning_rate': 0.01,
    'verbose': 1,
    'early_stopping_round': 50
}
model = lgb.train(params, ds_train,
                  valid_sets=[ds_val],
                  valid_names=['validate'])

y_pred_val = model.predict(dfval.drop(target_col, axis=1), num_iteration=model.best_iteration)
y_pred_val = np.argmax(y_pred_val, axis=1)

y_pred_test = model.predict(dftest.drop(target_col, axis=1), num_iteration=model.best_iteration)
y_pred_test = np.argmax(y_pred_test, axis=1)

val_score = accuracy_score(dfval[target_col], y_pred_val)
test_score = accuracy_score(dftest[target_col], y_pred_test)

print('val_score = ', val_score)
print('test_score = ', test_score)
```

[LightGBM] [Info] Total Bins 2219
[LightGBM] [Info] Number of data points in the train set: 371847, number of used features: 12
[LightGBM] [Info] Start training from score -1.009334
[LightGBM] [Info] Start training from score -0.717530
[LightGBM] [Info] Start training from score -2.789050
[LightGBM] [Info] Start training from score -5.354306
[LightGBM] [Info] Start training from score -4.127223
[LightGBM] [Info] Start training from score -3.510637
[LightGBM] [Info] Start training from score -3.341909
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[500]	validate's multi_logloss: 0.408343
val_score =  0.8321464684494739
test_score =  0.8329389086340284

# test-set accuracy comparison
{'FTTransformer': 0.91481286,
 'AutoInt': 0.8217,
 'CategoryEmbeddingModel': 0.85238,
 'GANDALFModel': 0.897670,
 'GatedAdditiveTreeEnsembleModel': 0.9008,
 'LGB': 0.8329}
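
For a quick visual comparison, the accuracies above can be drawn as a bar chart (a minimal matplotlib sketch; the numbers are copied from the dictionary above):

```python
import matplotlib.pyplot as plt

scores = {'FTTransformer': 0.91481286, 'AutoInt': 0.8217,
          'CategoryEmbeddingModel': 0.85238, 'GANDALFModel': 0.897670,
          'GatedAdditiveTreeEnsembleModel': 0.9008, 'LGB': 0.8329}

plt.figure(figsize=(8, 4))
plt.bar(scores.keys(), scores.values())
plt.ylabel('Test accuracy')
plt.ylim(0.80, 0.95)
plt.xticks(rotation=30, ha='right')
plt.title('Test accuracy on covertype')
plt.tight_layout()
plt.show()
```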