# train-TabTransformer.py
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_absolute_percentage_error
from tab_transformer_pytorch import TabTransformer
import shap
import matplotlib.pyplot as plt
# 1. Load the data
data = pd.read_excel('data3.xlsx')  # replace with your own file path
# 2. Define the target and the feature columns
target = '利用回数'  # number of uses (ride count)
features = [
    '年度', '月', '曜日', 'バスとの距離', '駅との距離',
    '人口_総数_300m以内', '男性割合', '15_64人口割合',
    '就業者_通学者割合', '就業者_通学者利用交通手段_自転車割合',
    'ポート数_300mBuffer', 'ポート数_500mBuffer', 'ポート数_1000mBuffer',
    '平均気温', '降水量の合計(mm)'
]
X = data[features].copy()  # .copy() creates an independent DataFrame
y = data[target].copy()
categorical_features = ['年度', '月', '曜日']
numerical_features = [feat for feat in features if feat not in categorical_features]
# Encode the categorical features as integers
label_encoders = {}
for col in categorical_features:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col].astype(str))
    label_encoders[col] = le  # keep the encoders for later reuse
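# The stored encoders can map the integer codes back to the original labels,
# e.g. when reporting results (a usage sketch, not part of the pipeline):
# label_encoders['曜日'].inverse_transform([0, 1])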
# Split the data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
# Standardize the numerical features (fit on the training set only, to avoid leakage)
scaler = StandardScaler()
X_train[numerical_features] = scaler.fit_transform(X_train[numerical_features])
X_test[numerical_features] = scaler.transform(X_test[numerical_features])
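# At inference time the same fitted transformers would be reused on new data,
# e.g. (sketch): new_X[numerical_features] = scaler.transform(new_X[numerical_features])
# and new_X[col] = label_encoders[col].transform(new_X[col].astype(str)) per column.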
# 3. Define the Dataset and DataLoader
class TabularDataset(Dataset):
    def __init__(self, X, y, categorical_features, numerical_features):
        self.X_categ = X[categorical_features].values.astype(int)
        self.X_cont = X[numerical_features].values.astype(float)
        self.y = y.values.astype(float)

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return {
            'categorical': torch.tensor(self.X_categ[idx], dtype=torch.long),
            'continuous': torch.tensor(self.X_cont[idx], dtype=torch.float),
            'target': torch.tensor(self.y[idx], dtype=torch.float)
        }
train_dataset = TabularDataset(X_train, y_train, categorical_features, numerical_features)
test_dataset = TabularDataset(X_test, y_test, categorical_features, numerical_features)
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
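# Optional sanity check (not in the original pipeline): pull one batch and
# confirm the tensor shapes before training.
sample_batch = next(iter(train_loader))
print('categorical:', sample_batch['categorical'].shape)  # (batch_size, 3)
print('continuous: ', sample_batch['continuous'].shape)   # (batch_size, 12)
print('target:     ', sample_batch['target'].shape)       # (batch_size,)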
# 4. Define the model
category_sizes = [
    X['年度'].nunique(),
    X['月'].nunique(),
    X['曜日'].nunique(),
]
num_continuous = len(numerical_features)
model = TabTransformer(
    categories=tuple(category_sizes),
    num_continuous=num_continuous,
    dim=32,
    dim_out=1,
    depth=6,
    heads=8,
    attn_dropout=0.1,
    ff_dropout=0.1,
    mlp_hidden_mults=(4, 2),
    mlp_act=nn.ReLU(),
)
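# Optional: report the model size; handy when tuning dim/depth/heads.
n_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'TabTransformer trainable parameters: {n_params:,}')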
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
# 5. Train the model
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)
epochs = 300
for epoch in range(epochs):
    model.train()
    epoch_losses = []
    for batch in train_loader:
        categ = batch['categorical'].to(device)
        cont = batch['continuous'].to(device)
        target_batch = batch['target'].to(device).unsqueeze(1)
        # Forward pass
        output = model(categ, cont)
        # Compute the loss
        loss = criterion(output, target_batch)
        # Backward pass and optimization step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        epoch_losses.append(loss.item())
    # Print the average loss every 10 epochs
    if (epoch + 1) % 10 == 0 or epoch == 0:
        avg_loss = sum(epoch_losses) / len(epoch_losses)
        print(f"Epoch {epoch + 1}/{epochs}, Loss: {avg_loss:.4f}")
# 6. Predict and evaluate
model.eval()
predictions = []
actuals = []
with torch.no_grad():
    for batch in test_loader:
        categ = batch['categorical'].to(device)
        cont = batch['continuous'].to(device)
        target_batch = batch['target'].to(device).unsqueeze(1)
        output = model(categ, cont)
        predictions.extend(output.cpu().numpy())
        actuals.extend(target_batch.cpu().numpy())
# Flatten to 1-D lists
predictions = [pred[0] for pred in predictions]
actuals = [act[0] for act in actuals]
# Compute the evaluation metrics
# (note: MAPE is unstable when actual values are at or near zero)
mae = mean_absolute_error(actuals, predictions)
mse = mean_squared_error(actuals, predictions)
rmse = mse ** 0.5
mape = mean_absolute_percentage_error(actuals, predictions) * 100
r2 = r2_score(actuals, predictions)
print(f"TabTransformer MSE: {mse:.4f}")
print(f"TabTransformer RMSE: {rmse:.4f}")
print(f"TabTransformer MAE: {mae:.4f}")
print(f"TabTransformer MAPE: {mape:.2f}%")
print(f"TabTransformer R² score: {r2:.2f}")
# 7. Feature importance with SHAP
# Note: TabTransformer is a deep model, so computing SHAP values can be slow.
# 7.1 DeepExplainer expects an nn.Module rather than a plain function; since the
#     model already takes (categorical, continuous) as separate inputs, it can be
#     passed directly together with a list of background tensors.
# 7.2 Background data: the first 100 training samples
categ_background = torch.tensor(X_train[categorical_features].values[:100], dtype=torch.long).to(device)
cont_background = torch.tensor(X_train[numerical_features].values[:100], dtype=torch.float).to(device)
# 7.3 Initialize SHAP's DeepExplainer. Caveat: gradients cannot flow through the
#     integer categorical inputs, so DeepExplainer may not attribute them reliably;
#     a model-agnostic KernelExplainer fallback is sketched at the end of the script.
explainer = shap.DeepExplainer(model, [categ_background, cont_background])
# 7.4 Data to explain
categ_test = torch.tensor(X_test[categorical_features].values, dtype=torch.long).to(device)
cont_test = torch.tensor(X_test[numerical_features].values, dtype=torch.float).to(device)
# 7.5 Compute the SHAP values. With multiple inputs, shap_values returns one numpy
#     array per input; concatenate them in (categorical, numerical) column order,
#     which matches the order of `features`.
shap_values = explainer.shap_values([categ_test, cont_test])
shap_matrix = np.concatenate([shap_values[0], shap_values[1]], axis=1)
# 7.6 Put the SHAP values into a pandas DataFrame
shap_df = pd.DataFrame(shap_matrix, columns=categorical_features + numerical_features)
# 7.7 Mean absolute SHAP value per feature as a global importance score
feature_importance = shap_df.abs().mean().sort_values(ascending=False)
print("\nFeature importance (mean absolute SHAP value):")
print(feature_importance)
# 7.8 Visualize the feature importance
plt.figure(figsize=(10, 8))
feature_importance.plot(kind='barh')
plt.title('Feature Importance based on SHAP values')
plt.xlabel('Mean |SHAP value|')
plt.gca().invert_yaxis()  # most important feature at the top
plt.show()
# 7.9 Optional: SHAP summary plot
shap.summary_plot(shap_matrix, X_test[categorical_features + numerical_features],
                  feature_names=categorical_features + numerical_features, plot_type="bar")
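# Model-agnostic fallback (a sketch, not the original approach): if DeepExplainer
# fails on the integer embedding inputs, shap.KernelExplainer works with any
# black-box prediction function at the cost of speed, so only a small sample of
# rows and a limited number of coalition samples are used here.
def predict_numpy(X_np):
    # Split the flat numpy matrix back into categorical / continuous blocks.
    n_cat = len(categorical_features)
    categ = torch.tensor(X_np[:, :n_cat], dtype=torch.long).to(device)
    cont = torch.tensor(X_np[:, n_cat:], dtype=torch.float).to(device)
    with torch.no_grad():
        return model(categ, cont).cpu().numpy().ravel()

X_test_np = X_test[categorical_features + numerical_features].values
kernel_explainer = shap.KernelExplainer(predict_numpy, X_test_np[:50])
kernel_shap = kernel_explainer.shap_values(X_test_np[:50], nsamples=100)
shap.summary_plot(kernel_shap, X_test_np[:50],
                  feature_names=categorical_features + numerical_features, plot_type="bar")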