-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathkaggle_notebook_template.py
More file actions
487 lines (380 loc) · 19 KB
/
kaggle_notebook_template.py
File metadata and controls
487 lines (380 loc) · 19 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
# MITSUI&CO. Commodity Prediction Challenge - Enhanced ML Template
# This template implements sophisticated ML models with feature engineering and ensemble methods
import pandas as pd
import numpy as np
import os
import sys
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')
# ML Libraries
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import lightgbm as lgb
print("๐ Starting MITSUI Commodity Prediction Challenge")
print(f"โฐ Started at: {datetime.now()}")
# ============================================================================
# PATH SETUP AND DATA LOADING
# ============================================================================
# Candidate locations for the competition data, checked in order; the first
# one that exists wins.
possible_paths = [
    '/kaggle/input/mitsui-commodity-prediction-challenge',
    '/kaggle/input/mitsui-commodity-prediction-challenge/data',
    './data',
]
competition_path = next(
    (path for path in possible_paths if os.path.exists(path)),
    None,
)
if not competition_path:
    print("โ Competition data not found in expected locations")
    raise FileNotFoundError("Competition data not found")
print(f"✅ Found competition data at: {competition_path}")

# Locate the evaluation API: first next to the data, then in fixed fallbacks.
evaluation_path = f'{competition_path}/kaggle_evaluation'
alt_paths = [
    '/kaggle/input/mitsui-commodity-prediction-challenge/kaggle_evaluation',
    '/kaggle/input/mitsui-commodity-prediction-challenge/data/kaggle_evaluation',
    './kaggle_evaluation',
]
if os.path.exists(evaluation_path):
    found_evaluation_path = evaluation_path
    print(f"✅ Added evaluation path: {evaluation_path}")
else:
    print(f"โ Evaluation path not found: {evaluation_path}")
    found_evaluation_path = next(
        (alt_path for alt_path in alt_paths if os.path.exists(alt_path)),
        None,
    )
    if found_evaluation_path:
        print(f"✅ Added alternative evaluation path: {found_evaluation_path}")
    else:
        print("โ No evaluation API found in any expected location")

if found_evaluation_path:
    # Two import styles are used further down:
    #   * `from mitsui_gateway import ...` needs the package directory itself;
    #   * `from kaggle_evaluation.core.templates import ...` needs the
    #     directory that CONTAINS `kaggle_evaluation`.
    # Register both so neither import depends on the environment's defaults.
    _evaluation_parent = os.path.dirname(os.path.abspath(found_evaluation_path))
    for _import_dir in (found_evaluation_path, _evaluation_parent):
        if _import_dir not in sys.path:
            sys.path.append(_import_dir)
# Import the evaluation API. Kept in its own try block so that only genuine
# API import failures produce the "not available in this environment" hint.
try:
    from mitsui_gateway import MitsuiGateway
    from kaggle_evaluation.core.templates import InferenceServer
except ImportError as e:
    print(f"โ Import error: {e}")
    print("๐ก The evaluation API is not available in this environment.")
    print("๐ This notebook needs to be run on Kaggle's competition platform.")
    raise
print("✅ Successfully imported evaluation API")

# Load the three CSV inputs. Any failure is reported and re-raised unchanged,
# so the script stops rather than continuing without its inputs.
try:
    # Load the target pairs information
    target_pairs_path = f'{competition_path}/target_pairs.csv'
    target_pairs = pd.read_csv(target_pairs_path)
    print(f"๐ Loaded target pairs: {len(target_pairs)} targets")
    # Load training data for feature engineering
    train_data_path = f'{competition_path}/train.csv'
    print("๐ Loading training data...")
    train_data = pd.read_csv(train_data_path)
    print(f"๐ Training data shape: {train_data.shape}")
    # Load training labels
    train_labels_path = f'{competition_path}/train_labels.csv'
    print("๐ฏ Loading training labels...")
    train_labels = pd.read_csv(train_labels_path)
    print(f"๐ฏ Training labels shape: {train_labels.shape}")
except Exception as e:
    print(f"โ Error loading data: {e}")
    print("๐ก Check that the competition data is properly loaded.")
    raise
# ============================================================================
# FEATURE ENGINEERING CLASS
# ============================================================================
class FeatureEngineer:
    """Derive model features from a frame of market columns.

    Columns are selected purely by name: anything containing ``Close`` or
    ``close`` is treated as a price series, ``Volume``/``volume`` as a volume
    series, and a ``FX_`` prefix as a currency series.
    """

    def __init__(self):
        # Kept as a public attribute; no method of this class fits or applies it.
        self.scaler = RobustScaler()
        # Filled in by engineer_features(): every output column except 'date_id'.
        self.feature_columns = []

    def create_technical_indicators(self, df):
        """Return a copy of ``df`` with per-column technical indicators added.

        For each price column: percent change, log return, moving averages and
        price/MA ratios (windows 5, 10, 20, 50), rolling standard deviation
        (windows 5, 10, 20) and absolute/relative momentum (periods 5, 10, 20).
        For each volume column: percent change and moving averages (5, 10, 20).
        For each ``FX_`` column: percent change and log return.
        """
        features = df.copy()
        # Get price columns (close prices)
        price_cols = [col for col in df.columns if 'Close' in col or 'close' in col]
        for col in price_cols:
            # Price-based features
            features[f'{col}_pct_change'] = df[col].pct_change()
            features[f'{col}_log_return'] = np.log(df[col] / df[col].shift(1))
            # Moving averages
            for window in [5, 10, 20, 50]:
                features[f'{col}_ma_{window}'] = df[col].rolling(window=window).mean()
                features[f'{col}_ma_ratio_{window}'] = df[col] / features[f'{col}_ma_{window}']
            # Volatility features
            for window in [5, 10, 20]:
                features[f'{col}_volatility_{window}'] = df[col].rolling(window=window).std()
            # Momentum features
            for period in [5, 10, 20]:
                features[f'{col}_momentum_{period}'] = df[col] - df[col].shift(period)
                features[f'{col}_momentum_pct_{period}'] = (df[col] - df[col].shift(period)) / df[col].shift(period)
        # Volume-based features (if available)
        volume_cols = [col for col in df.columns if 'Volume' in col or 'volume' in col]
        for col in volume_cols:
            features[f'{col}_pct_change'] = df[col].pct_change()
            for window in [5, 10, 20]:
                features[f'{col}_ma_{window}'] = df[col].rolling(window=window).mean()
        # FX-specific features
        fx_cols = [col for col in df.columns if col.startswith('FX_')]
        for col in fx_cols:
            features[f'{col}_pct_change'] = df[col].pct_change()
            features[f'{col}_log_return'] = np.log(df[col] / df[col].shift(1))
        return features

    def create_cross_asset_features(self, df):
        """Return a copy of ``df`` with spread/ratio and USD-strength columns.

        Spreads and ratios are added only when both legs are present.
        ``USD_strength`` is the row-wise mean of every ``FX_`` column whose
        name contains ``USD``, except the column named exactly ``FX_USDJPY``.
        Because selection is by name, pass a frame of raw columns: derived
        columns such as ``FX_..._pct_change`` would otherwise be averaged in.
        """
        features = df.copy()
        # Commodity spreads
        if 'LME_AH_Close' in df.columns and 'LME_ZS_Close' in df.columns:
            features['LME_AH_ZS_spread'] = df['LME_AH_Close'] - df['LME_ZS_Close']
            features['LME_AH_ZS_ratio'] = df['LME_AH_Close'] / df['LME_ZS_Close']
        if 'LME_PB_Close' in df.columns and 'LME_CA_Close' in df.columns:
            features['LME_PB_CA_spread'] = df['LME_PB_Close'] - df['LME_CA_Close']
            features['LME_PB_CA_ratio'] = df['LME_PB_Close'] / df['LME_CA_Close']
        # Gold-Platinum spread
        if 'JPX_Gold_Standard_Futures_Close' in df.columns and 'JPX_Platinum_Standard_Futures_Close' in df.columns:
            features['Gold_Platinum_spread'] = df['JPX_Gold_Standard_Futures_Close'] - df['JPX_Platinum_Standard_Futures_Close']
            features['Gold_Platinum_ratio'] = df['JPX_Gold_Standard_Futures_Close'] / df['JPX_Platinum_Standard_Futures_Close']
        # Currency strength indices
        fx_cols = [col for col in df.columns if col.startswith('FX_')]
        if len(fx_cols) > 0:
            # USD strength index (average of USD pairs)
            usd_pairs = [col for col in fx_cols if 'USD' in col and col != 'FX_USDJPY']
            if usd_pairs:
                features['USD_strength'] = df[usd_pairs].mean(axis=1)
        return features

    def create_lag_features(self, df, max_lag=5):
        """Return a copy of ``df`` with lags 1..``max_lag`` of selected columns.

        Every column whose name contains ``Close``/``close`` or starts with
        ``FX_`` gets ``<col>_lag_<k>`` columns; this includes derived columns
        already present in ``df`` that match those name patterns.
        """
        features = df.copy()
        # Create lags for key price columns
        key_cols = [col for col in df.columns if 'Close' in col or 'close' in col]
        key_cols.extend([col for col in df.columns if col.startswith('FX_')])
        for col in key_cols:
            for lag in range(1, max_lag + 1):
                features[f'{col}_lag_{lag}'] = df[col].shift(lag)
        return features

    def engineer_features(self, df):
        """Run the full feature pipeline and return the engineered frame.

        Steps: cross-asset features, technical indicators, lag features, then
        cleanup (±inf -> NaN, forward fill, backward fill, remaining NaN -> 0).
        Also records ``self.feature_columns`` (all columns except ``date_id``).
        """
        print("๐ง Starting feature engineering...")
        # Cross-asset features go first, on the raw columns. Running them after
        # the technical indicators made USD_strength average raw FX levels with
        # their *_pct_change / *_log_return columns. None of the cross-asset
        # output names match the name patterns used by the later steps, so the
        # rest of the feature set is unaffected by this ordering.
        features = self.create_cross_asset_features(df)
        features = self.create_technical_indicators(features)
        features = self.create_lag_features(features)
        # Remove infinite and NaN values
        features = features.replace([np.inf, -np.inf], np.nan)
        # .ffill()/.bfill() replace the deprecated fillna(method=...) form.
        features = features.ffill().bfill().fillna(0)
        # Store feature columns for later use
        self.feature_columns = [col for col in features.columns if col != 'date_id']
        print(f"✅ Feature engineering completed. Total features: {len(self.feature_columns)}")
        return features
# ============================================================================
# ENSEMBLE MODEL CLASS
# ============================================================================
class EnsembleModel:
    """Equal-weight ensemble of regressors, each paired with its own scaler."""

    def __init__(self, n_models=5):
        # Stored only: create_models() always builds the same fixed list of
        # six estimators and never reads this value.
        self.n_models = n_models
        self.models = []
        self.scalers = []
        self.is_trained = False

    def create_models(self):
        """Populate ``self.models`` and a matching list of ``StandardScaler``s."""
        self.models = [
            # Linear models
            Ridge(alpha=1.0, random_state=42),
            Lasso(alpha=0.01, random_state=42),
            # Tree-based models
            RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42),
            GradientBoostingRegressor(n_estimators=100, max_depth=6, random_state=42),
            # Gradient boosting
            xgb.XGBRegressor(n_estimators=100, max_depth=6, learning_rate=0.1, random_state=42),
            lgb.LGBMRegressor(n_estimators=100, max_depth=6, learning_rate=0.1, random_state=42),
        ]
        # Create scalers for each model
        self.scalers = [StandardScaler() for _ in self.models]
        print(f"✅ Created {len(self.models)} models for ensemble")

    def train_models(self, X, y):
        """Fit every scaler/model pair on ``X``/``y`` and mark the ensemble trained.

        Models are created first if the list is still empty. ``is_trained`` is
        set only after every model has been fitted, so an exception from any
        ``fit`` leaves it unchanged.
        """
        print("๐๏ธ Training ensemble models...")
        # Create models if not already created
        if not self.models:
            self.create_models()
        total = len(self.models)
        for position, (model, scaler) in enumerate(zip(self.models, self.scalers), start=1):
            print(f" Training model {position}/{total}: {type(model).__name__}")
            # Each model is trained on features scaled by its own scaler.
            model.fit(scaler.fit_transform(X), y)
        self.is_trained = True
        print("✅ All models trained successfully")

    def predict(self, X):
        """Return the element-wise mean of all models' predictions for ``X``.

        Raises:
            ValueError: if ``train_models`` has not completed yet.
        """
        if not self.is_trained:
            raise ValueError("Models must be trained before making predictions")
        # Apply each model's own fitted scaler before predicting.
        predictions = [
            model.predict(scaler.transform(X))
            for model, scaler in zip(self.models, self.scalers)
        ]
        # Average predictions (simple ensemble)
        return np.mean(predictions, axis=0)
# ============================================================================
# TARGET-SPECIFIC MODELING
# ============================================================================
class TargetModelManager:
    """Train and query one ensemble per target column.

    ``target_pairs`` is indexed with the columns ``'target'`` and ``'pair'``;
    the pair string decides which feature columns each target's model sees.
    """

    def __init__(self, target_pairs):
        self.target_pairs = target_pairs
        self.feature_engineer = FeatureEngineer()
        # target name -> {'model': EnsembleModel, 'features': [column names]},
        # or None when no model could be trained for that target.
        self.target_models = {}
        self.is_trained = False

    def get_target_features(self, target_name, df):
        """Return the feature column names of ``df`` to use for ``target_name``.

        Columns are matched by substring: first each asset named in the
        target's pair (legs separated by ``' - '``), then the general market
        markers ``FX_``, ``LME_`` and ``JPX_``. The result is de-duplicated in
        first-seen order (so it is the same on every run) and never contains
        ``date_id``.

        Raises:
            IndexError: if ``target_name`` has no row in ``target_pairs``.
        """
        target_info = self.target_pairs[self.target_pairs['target'] == target_name].iloc[0]
        pair = target_info['pair']
        relevant_features = []
        # A single-asset pair has no ' - ' and splits to one element, so this
        # loop handles single assets, spreads, and pairs with more legs.
        for asset in pair.split(' - '):
            relevant_features.extend(col for col in df.columns if asset in col)
        # Add general market features
        for marker in ('FX_', 'LME_', 'JPX_'):
            relevant_features.extend(col for col in df.columns if marker in col)
        # dict.fromkeys removes duplicates while keeping a deterministic order
        # (a set would reorder the columns differently between runs).
        return [col for col in dict.fromkeys(relevant_features) if col != 'date_id']

    def train_target_model(self, target_name, train_data, train_labels):
        """Train and store an ensemble for one target.

        Rows containing NaN in any selected feature or in the label are
        dropped. If no rows remain, ``None`` is stored for the target instead
        of a model.

        NOTE(review): features and labels are paired by row position, which
        assumes ``train_data`` and ``train_labels`` have the same length and
        row order — confirm against the data files.
        """
        print(f"๐ฏ Training model for {target_name}")
        # Get target values
        y = train_labels[target_name].values
        # Get relevant features for this target
        relevant_features = self.get_target_features(target_name, train_data)
        if len(relevant_features) == 0:
            print(f"โ ๏ธ No relevant features found for {target_name}, using all features")
            relevant_features = [col for col in train_data.columns if col != 'date_id']
        # Prepare training data
        X = train_data[relevant_features].values
        # Remove rows with NaN values
        valid_mask = ~(np.isnan(X).any(axis=1) | np.isnan(y))
        X = X[valid_mask]
        y = y[valid_mask]
        if len(X) == 0:
            print(f"โ ๏ธ No valid training data for {target_name}, using simple baseline")
            self.target_models[target_name] = None
            return
        # Create and train ensemble model
        ensemble = EnsembleModel()
        ensemble.train_models(X, y)
        # Store model and feature list
        self.target_models[target_name] = {
            'model': ensemble,
            'features': relevant_features,
        }

    def train_all_models(self, train_data, train_labels):
        """Engineer features once, then train a model per ``target_*`` column.

        A failure for one target is logged and recorded as ``None`` so the
        remaining targets still get trained.
        """
        print("๐๏ธ Training models for all targets...")
        # Engineer features for training data
        engineered_data = self.feature_engineer.engineer_features(train_data)
        # Train models for each target
        target_columns = [col for col in train_labels.columns if col.startswith('target_')]
        for target_name in target_columns:
            try:
                self.train_target_model(target_name, engineered_data, train_labels)
            except Exception as e:
                # Deliberate best-effort: one bad target must not abort the rest.
                print(f"โ Error training model for {target_name}: {str(e)}")
                self.target_models[target_name] = None
        self.is_trained = True
        print(f"✅ Training completed for {len(self.target_models)} targets")

    def predict_target(self, target_name, test_data):
        """Return a single predicted value for ``target_name``.

        The prediction is made from the LAST row of ``test_data``; NaNs in that
        row are replaced with 0. Targets without a trained model get a random
        baseline drawn from ``N(0, 0.01)``.
        """
        model_info = self.target_models.get(target_name)
        if model_info is None:
            # Return baseline prediction
            return np.random.normal(0, 0.01)
        model = model_info['model']
        features = model_info['features']
        # Use only the most recent row. Flattening the whole batch with
        # reshape(1, -1) produced a row of the wrong width whenever the batch
        # held more than one row; for one-row batches this is identical.
        X = test_data[features].iloc[[-1]].values
        # Handle NaN values
        if np.isnan(X).any():
            X = np.nan_to_num(X, nan=0.0)
        # Make prediction
        return model.predict(X)[0]
# ============================================================================
# MAIN PREDICTION FUNCTION
# ============================================================================
# Build the per-target model manager from the loaded target/pair table.
print("๐ง Initializing target model manager...")
target_manager = TargetModelManager(target_pairs=target_pairs)
# Fit every target's model at import time, before the inference server starts,
# so predict() only has to look trained models up.
print("๐ Training models on historical data...")
target_manager.train_all_models(
    train_data=train_data,
    train_labels=train_labels,
)
def predict(data_batch):
    """Generate one row of predictions for all 424 targets.

    Args:
        data_batch: Indexable batch whose first element is the test frame
            (test_batch, label_lags_1_batch, label_lags_2_batch,
            label_lags_3_batch, label_lags_4_batch). Only element 0 is used.

    Returns:
        A one-row DataFrame with columns ``target_0`` .. ``target_423``.

    A target whose prediction raises is given a random baseline value drawn
    from ``N(0, 0.01)`` instead, so a single failure never aborts the batch.

    NOTE(review): features are engineered from ``test_batch`` alone. With
    fewer rows than a rolling window or lag, those features are NaN and end up
    filled with 0, unlike during training — consider keeping recent history.
    """
    # Extract the test data from the batch
    test_batch = data_batch[0]
    print(f"๐ฎ Making predictions for batch with {len(test_batch)} samples")
    # Engineer features for test data
    engineered_test_data = target_manager.feature_engineer.engineer_features(test_batch)
    # Generate predictions for all 424 targets
    predictions = {}
    for i in range(424):
        target_name = f'target_{i}'
        try:
            # Use trained model for prediction
            predictions[target_name] = target_manager.predict_target(target_name, engineered_test_data)
        except Exception as e:
            print(f"โ ๏ธ Error predicting {target_name}: {str(e)}, using baseline")
            # Fallback to baseline prediction
            predictions[target_name] = np.random.normal(0, 0.01)
    # Convert to DataFrame (required format)
    pred_df = pd.DataFrame([predictions])
    print(f"✅ Generated predictions for {len(predictions)} targets")
    return pred_df


print("๐ง Prediction function defined")
# ============================================================================
# INFERENCE SERVER SETUP
# ============================================================================
# Create a proper implementation of InferenceServer
class MitsuiInferenceServer(InferenceServer):
    """InferenceServer subclass that supplies the Mitsui gateway for test runs."""

    def _get_gateway_for_test(self, data_paths, file_share_dir=None, *args, **kwargs):
        """Build and return a ``MitsuiGateway`` for ``data_paths``.

        Only ``data_paths`` is forwarded; ``file_share_dir`` and any extra
        positional/keyword arguments are accepted and ignored.
        """
        gateway = MitsuiGateway(data_paths)
        return gateway
# Create the inference server, handing it the module-level predict() callback.
print("๐ง Creating inference server...")
inference_server = MitsuiInferenceServer(predict)
print("✅ Inference server created successfully")
# Start the server.
# NOTE(review): serve() is called unconditionally; presumably it blocks until
# the evaluation run finishes — confirm how this behaves outside a Kaggle
# competition rerun.
print("๐ Starting inference server...")
inference_server.serve()
print("๐ Inference server completed!")
print(f"โฐ Completed at: {datetime.now()}")