# IP-Bot/random_forest_study.py (GitHub: Tripxp1/IP-Bot, branch main)
# random_forest_study.py
import os
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.utils import class_weight
import joblib
from skimage import feature, transform

# CHANGE THIS PATH! Point it at your local data folder; the CSV tables, the
# label map and the image files are all resolved relative to it.
DATA_PATH = "EuroSAT-master"

class RandomForestClassifierModel:
    """Random Forest classifier for EuroSAT images.

    Every image is described by a HOG descriptor concatenated with simple
    colour features (per-channel histograms, means and standard deviations).
    The combined vectors are standardised and fed to a scikit-learn
    ``RandomForestClassifier``.
    """

    # Layout of the colour part of the feature vector produced by
    # extract_color_features(): one histogram per RGB channel, followed by the
    # per-channel means and the per-channel standard deviations.
    COLOR_CHANNELS = 3
    COLOR_HIST_BINS = 16
    COLOR_HIST_DIM = COLOR_CHANNELS * COLOR_HIST_BINS
    COLOR_STAT_DIM = 2 * COLOR_CHANNELS

    # Default file used by save_model() / load_model().
    MODEL_FILENAME = 'random_forest_model.pkl'

    def __init__(self):
        """Create the untrained forest, the scaler and the HOG settings."""
        self.rf = RandomForestClassifier(
            n_estimators=100,
            max_depth=20,
            min_samples_split=5,
            min_samples_leaf=2,
            max_features='sqrt',
            bootstrap=True,
            class_weight='balanced',
            random_state=42,
            n_jobs=-1
        )

        self.scaler = StandardScaler()
        self.is_trained = False
        self.label_map = None
        self.reverse_label_map = None

        self.hog_orientations = 9
        self.hog_pixels_per_cell = (8, 8)
        self.hog_cells_per_block = (2, 2)
        self.hog_block_norm = 'L2-Hys'

    def load_data(self):
        """Read the train/validation tables and the label map from DATA_PATH.

        Populates ``self.label_map`` and its inverse ``self.reverse_label_map``.

        Returns:
            tuple: ``(train_df, val_df)`` as pandas DataFrames.

        Raises:
            OSError: if one of the files cannot be opened.
            ValueError: if a CSV table or the JSON label map cannot be parsed.
        """
        train_df = pd.read_csv(os.path.join(DATA_PATH, "train.csv"))
        val_df = pd.read_csv(os.path.join(DATA_PATH, "validation.csv"))

        # Explicit encoding so the result does not depend on the OS locale.
        with open(os.path.join(DATA_PATH, "label_map.json"), 'r', encoding='utf-8') as f:
            self.label_map = json.load(f)
        self.reverse_label_map = {v: k for k, v in self.label_map.items()}

        print(f"Загружено {len(train_df)} тренировочных и {len(val_df)} валидационных образцов")
        return train_df, val_df

    def _select_samples(self, df, sample_size, balance_classes):
        """Choose which rows of ``df`` will be loaded from disk.

        Args:
            df: table with a ``Label`` column.
            sample_size: total number of rows wanted; ``None`` means all rows.
            balance_classes: when true, cap every class at
                ``sample_size // number_of_classes`` rows; otherwise draw a
                plain random sample of ``sample_size`` rows.

        Returns:
            DataFrame with the selected rows (sampling is seeded, so the
            selection is reproducible).
        """
        if sample_size is None:
            sample_size = len(df)

        if not balance_classes:
            return df.sample(min(sample_size, len(df)), random_state=42)

        class_labels = df['Label'].unique()
        if len(class_labels) == 0:
            # Nothing to balance; avoids a division by zero on an empty table.
            return df.reset_index(drop=True)

        samples_per_class = sample_size // len(class_labels)
        per_class_frames = []
        for label in class_labels:
            class_samples = df[df['Label'] == label]
            if len(class_samples) > samples_per_class:
                class_samples = class_samples.sample(samples_per_class, random_state=42)
            per_class_frames.append(class_samples)

        return pd.concat(per_class_frames, ignore_index=True)

    def prepare_data(self, df, sample_size=None, balance_classes=True):
        """Sample rows from ``df`` and load the referenced images.

        Args:
            df: table with ``Filename`` (path relative to DATA_PATH) and
                ``Label`` columns.
            sample_size: number of rows to sample; ``None`` uses every row.
            balance_classes: see ``_select_samples``.

        Returns:
            tuple: ``(images, labels)`` - a list of image arrays and the list
            of matching labels. Unreadable files and arrays that are empty or
            not 3-dimensional are skipped and counted as errors.
        """
        print(f"\n Подготовка данных...")

        sample_df = self._select_samples(df, sample_size, balance_classes)

        images = []
        labels = []
        failed_count = 0

        for img_filename, label in zip(sample_df['Filename'], sample_df['Label']):
            img_path = os.path.join(DATA_PATH, img_filename)

            try:
                # The context manager closes the file handle; np.array() forces
                # the pixel data to be read while the file is still open.
                with Image.open(img_path) as img:
                    img_array = np.array(img)
            except Exception as e:  # deliberate best effort: skip unreadable files
                failed_count += 1
                print(f"Ошибка загрузки {img_filename}: {e}")
                continue

            if img_array.size == 0 or img_array.ndim != 3:
                failed_count += 1
                continue

            images.append(img_array)
            labels.append(label)

        print(f"✅ Загружено {len(images)} изображений, {len(set(labels))} классов, ошибок: {failed_count}")

        return images, labels

    def _hog_feature_length(self, target_size):
        """Return the HOG vector length expected for an image of ``target_size``.

        Follows the layout of ``skimage.feature.hog``: blocks slide one cell at
        a time, and every block contributes
        ``cells_per_block[0] * cells_per_block[1] * orientations`` values.
        """
        n_cells = [size // px for size, px in zip(target_size, self.hog_pixels_per_cell)]
        n_blocks = [max(cells - block + 1, 0)
                    for cells, block in zip(n_cells, self.hog_cells_per_block)]
        values_per_block = (self.hog_cells_per_block[0]
                            * self.hog_cells_per_block[1]
                            * self.hog_orientations)
        return n_blocks[0] * n_blocks[1] * values_per_block

    def extract_hog_features(self, images, target_size=(64, 64)):
        """Compute one HOG descriptor per image.

        Args:
            images: sequence of image arrays.
            target_size: size every image is resized to before HOG.

        Returns:
            2-D array with exactly one row per input image. An image that
            cannot be processed yields an all-zero row, so the rows stay
            aligned with the labels and with ``extract_color_features``.
        """
        print("Извлечение HOG-признаков...")

        features = []
        processed_count = 0
        failed_count = 0

        for i, img_array in enumerate(images):
            try:
                img_resized = transform.resize(img_array, target_size, anti_aliasing=True)

                if img_resized.ndim == 3:
                    # Weighted sum of the first three channels -> grayscale.
                    gray = np.dot(img_resized[..., :3], [0.2989, 0.5870, 0.1140])
                else:
                    gray = img_resized

                hog_feat = feature.hog(
                    gray,
                    orientations=self.hog_orientations,
                    pixels_per_cell=self.hog_pixels_per_cell,
                    cells_per_block=self.hog_cells_per_block,
                    block_norm=self.hog_block_norm,
                    visualize=False,
                    channel_axis=None
                )
            except Exception as e:  # deliberate best effort: keep going
                failed_count += 1
                print(f"Ошибка обработки изображения {i}: {e}")
                # Placeholder keeps this row's position; filled with zeros below.
                features.append(None)
                continue

            features.append(hog_feat)
            processed_count += 1

            if processed_count % 500 == 0:
                print(f"Обработано {processed_count}/{len(images)} изображений...")

        print(f"✅ Успешно обработано: {processed_count}, ошибок: {failed_count}")

        # Prefer the length actually produced; fall back to the computed one
        # when no image could be processed at all.
        feature_length = next(
            (len(feat) for feat in features if feat is not None),
            self._hog_feature_length(target_size),
        )
        if not features:
            return np.zeros((0, feature_length))

        return np.array([
            np.zeros(feature_length) if feat is None else feat
            for feat in features
        ])

    def extract_color_features(self, images, target_size=(64, 64)):
        """Compute colour features for every image.

        Per image: a 16-bin histogram over [0, 1] for each of the first three
        channels, then the three channel means, then the three channel
        standard deviations.

        Args:
            images: sequence of image arrays.
            target_size: size every image is resized to first.

        Returns:
            2-D array with one row per input image; an image that cannot be
            processed yields an all-zero row.
        """
        print("Извлечение цветовых признаков...")

        feature_length = self.COLOR_HIST_DIM + self.COLOR_STAT_DIM
        color_features = []

        for i, img_array in enumerate(images):
            try:
                img_resized = transform.resize(img_array, target_size, anti_aliasing=True)
                channels = [img_resized[:, :, c] for c in range(self.COLOR_CHANNELS)]

                histograms = [
                    np.histogram(channel, bins=self.COLOR_HIST_BINS, range=(0, 1))[0]
                    for channel in channels
                ]
                means = [np.mean(channel) for channel in channels]
                stds = [np.std(channel) for channel in channels]

                # Order matters: plot_feature_importance() relies on
                # histograms first, statistics last.
                color_features.append(np.concatenate([*histograms, means, stds]))

                if (i + 1) % 500 == 0:
                    print(f"Обработано {i+1}/{len(images)} изображений...")

            except Exception as e:  # deliberate best effort: keep going
                print(f"Ошибка обработки цвета изображения {i}: {e}")
                color_features.append(np.zeros(feature_length))  # zeros on failure
                continue

        if not color_features:
            return np.zeros((0, feature_length))

        return np.array(color_features)

    def extract_combined_features(self, images):
        """Return HOG and colour features stacked side by side (HOG first)."""
        print("Извлечение комбинированных признаков...")

        hog_features = self.extract_hog_features(images)
        color_features = self.extract_color_features(images)

        combined_features = np.hstack([hog_features, color_features])

        print(f"Общая размерность признаков: {combined_features.shape[1]}")
        print(f" - HOG: {hog_features.shape[1]}")
        print(f" - Color: {color_features.shape[1]}")

        return combined_features

    def train(self):
        """Run the full pipeline: load, featurise, fit, evaluate and plot.

        Returns:
            float: accuracy on the validation sample, or ``0.0`` when the data
            could not be loaded.
        """
        print("\n ОБУЧЕНИЕ RANDOM FOREST CLASSIFIER")
        print("=" * 50)

        # load_data() raises rather than returning None, so a failed load has
        # to be detected through the exception.
        try:
            train_df, val_df = self.load_data()
        except (OSError, ValueError) as e:
            print("❌ Не удалось загрузить данные!")
            print(f"Ошибка: {e}")
            return 0.0

        # Lower sample_size if you run into memory problems.
        X_train, y_train = self.prepare_data(train_df, sample_size=3000, balance_classes=True)
        X_val, y_val = self.prepare_data(val_df, sample_size=800, balance_classes=False)

        if not X_train or not X_val:
            # No readable images: nothing to fit or to evaluate on.
            print("❌ Не удалось загрузить данные!")
            return 0.0

        print("\nИзвлечение признаков для обучения...")
        X_train_features = self.extract_combined_features(X_train)
        X_val_features = self.extract_combined_features(X_val)

        print("Масштабирование признаков...")
        X_train_scaled = self.scaler.fit_transform(X_train_features)
        X_val_scaled = self.scaler.transform(X_val_features)

        print("Обучение Random Forest...")
        self.rf.fit(X_train_scaled, y_train)
        self.is_trained = True

        print("Анализ важности признаков...")
        self.plot_feature_importance(X_train_features.shape[1])

        print("\n ОЦЕНКА МОДЕЛИ:")
        print("-" * 40)

        y_train_pred = self.rf.predict(X_train_scaled)
        y_val_pred = self.rf.predict(X_val_scaled)

        train_accuracy = accuracy_score(y_train, y_train_pred)
        val_accuracy = accuracy_score(y_val, y_val_pred)
        overfitting_gap = train_accuracy - val_accuracy

        print(f"Точность на обучающей выборке: {train_accuracy:.4f}")
        print(f"Точность на валидационной выборке: {val_accuracy:.4f}")
        print(f"Разрыв (переобучение): {overfitting_gap:.4f}")

        print("\n Детальный отчет по классификации:")
        # Assumes the label map assigns the ids 0..n-1 (as the lookup below
        # requires). Passing the ids explicitly keeps the report and the
        # confusion matrix aligned with class_names even when a class is
        # absent from the validation sample.
        label_ids = list(range(len(self.label_map)))
        class_names = [self.reverse_label_map[i] for i in label_ids]
        print(classification_report(y_val, y_val_pred, labels=label_ids,
                                    target_names=class_names, zero_division=0))

        self.plot_confusion_matrix(y_val, y_val_pred, class_names, labels=label_ids)

        return val_accuracy

    def _group_feature_importance(self, importance, num_features):
        """Sum importances per feature group.

        The combined vector is ``[HOG | colour histograms | colour stats]``;
        the colour part has a fixed width, so the HOG width is whatever is
        left of ``num_features``.

        Returns:
            tuple: ``(hog, color_histograms, color_stats)`` importance sums.
        """
        importance = np.asarray(importance)
        hog_dim = max(num_features - self.COLOR_HIST_DIM - self.COLOR_STAT_DIM, 0)
        hist_end = hog_dim + self.COLOR_HIST_DIM

        return (
            np.sum(importance[:hog_dim]),
            np.sum(importance[hog_dim:hist_end]),
            np.sum(importance[hist_end:]),
        )

    def plot_feature_importance(self, num_features):
        """Plot and print the summed importance of each feature group.

        Does nothing when the model is not trained. The chart is saved to
        ``rf_feature_importance.png`` and shown.

        Args:
            num_features: total length of the combined feature vector.
        """
        if not self.is_trained:
            return

        hog_importance, color_hist_importance, color_stat_importance = (
            self._group_feature_importance(self.rf.feature_importances_, num_features)
        )

        labels = ['HOG', 'Color Histograms', 'Color Stats']
        values = [hog_importance, color_hist_importance, color_stat_importance]

        plt.figure(figsize=(10, 6))
        bars = plt.bar(labels, values, color=['skyblue', 'lightcoral', 'lightgreen'])
        plt.title('Важность типов признаков в Random Forest')
        plt.ylabel('Суммарная важность')

        # Numeric value just above every bar.
        for bar, value in zip(bars, values):
            plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.01,
                    f'{value:.3f}', ha='center', va='bottom')

        plt.tight_layout()
        plt.savefig('rf_feature_importance.png', dpi=100, bbox_inches='tight')
        plt.show()

        print(f"\n АНАЛИЗ ВАЖНОСТИ ПРИЗНАКОВ:")
        print(f"HOG: {hog_importance:.3f}")
        print(f"Color Histograms: {color_hist_importance:.3f}")
        print(f"Color Statistics: {color_stat_importance:.3f}")

    def plot_confusion_matrix(self, y_true, y_pred, class_names, labels=None):
        """Draw, save and analyse the confusion matrix.

        The heatmap is saved to ``rf_confusion_matrix.png`` and shown.

        Args:
            y_true: true labels.
            y_pred: predicted labels.
            class_names: tick labels, one per matrix row/column.
            labels: label values in the same order as ``class_names``. When
                omitted they are looked up in ``self.label_map`` if every
                class name is present there; otherwise scikit-learn infers
                them from the data.
        """
        if labels is None and self.label_map and all(
                name in self.label_map for name in class_names):
            labels = [self.label_map[name] for name in class_names]

        plt.figure(figsize=(10, 8))
        # Explicit labels keep the matrix the same size as class_names even if
        # some class never occurs in y_true / y_pred.
        cm = confusion_matrix(y_true, y_pred, labels=labels)

        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                   xticklabels=class_names,
                   yticklabels=class_names)
        plt.title('Матрица ошибок - Random Forest Classifier')
        plt.xlabel('Предсказанные метки')
        plt.ylabel('Истинные метки')
        plt.xticks(rotation=45)
        plt.yticks(rotation=0)
        plt.tight_layout()
        plt.savefig('rf_confusion_matrix.png', dpi=100, bbox_inches='tight')
        plt.show()

        self.analyze_confusion_matrix(cm, class_names)

    def analyze_confusion_matrix(self, cm, class_names):
        """Print the classes whose per-class accuracy is below 60 %.

        Args:
            cm: square confusion matrix (rows are true classes).
            class_names: names matching the rows/columns of ``cm``.
        """
        print("\n🔍 АНАЛИЗ МАТРИЦЫ ОШИБОК:")
        print("-" * 40)

        for i, class_name in enumerate(class_names):
            correct = cm[i, i]
            total = cm[i].sum()
            accuracy = correct / total if total > 0 else 0

            if accuracy >= 0.6:
                continue

            print(f"❌ Проблемный класс: {class_name}")
            print(f"Точность: {accuracy:.2%} ({correct}/{total})")

            # Off-diagonal cells of this row, most frequent confusion first.
            errors = [(other_name, cm[i, j])
                      for j, other_name in enumerate(class_names)
                      if j != i and cm[i, j] > 0]
            errors.sort(key=lambda x: x[1], reverse=True)

            if errors:
                print(f"Чаще путается с: {errors[:3]}")

    def save_model(self, path=MODEL_FILENAME):
        """Persist the forest, the scaler, the label map and the HOG settings.

        Args:
            path: destination file; defaults to ``random_forest_model.pkl``.
        """
        if not self.is_trained:
            print("❌ Модель не обучена!")
            return

        model_data = {
            'rf': self.rf,
            'scaler': self.scaler,
            'label_map': self.label_map,
            'hog_params': {
                'orientations': self.hog_orientations,
                'pixels_per_cell': self.hog_pixels_per_cell,
                'cells_per_block': self.hog_cells_per_block,
                'block_norm': self.hog_block_norm
            }
        }

        joblib.dump(model_data, path)
        print(f"✅ Random Forest модель сохранена в '{path}'")

    def load_model(self, path=MODEL_FILENAME):
        """Restore a model previously written by ``save_model``.

        A missing file is reported with a message instead of an exception.

        Args:
            path: source file; defaults to ``random_forest_model.pkl``.
        """
        try:
            # SECURITY: joblib.load unpickles arbitrary objects - only load
            # model files that come from a trusted source.
            model_data = joblib.load(path)
        except FileNotFoundError:
            print("❌ Файл модели не найден")
            return

        self.rf = model_data['rf']
        self.scaler = model_data['scaler']
        self.label_map = model_data['label_map']

        # Files without stored HOG settings fall back to the defaults.
        hog_params = model_data.get('hog_params', {})
        self.hog_orientations = hog_params.get('orientations', 9)
        self.hog_pixels_per_cell = hog_params.get('pixels_per_cell', (8, 8))
        self.hog_cells_per_block = hog_params.get('cells_per_block', (2, 2))
        self.hog_block_norm = hog_params.get('block_norm', 'L2-Hys')

        self.reverse_label_map = {v: k for k, v in self.label_map.items()}
        self.is_trained = True
        print("✅ Random Forest модель загружена")

def main():
    """Train the Random Forest classifier, save it and print the final accuracy."""
    model = RandomForestClassifierModel()
    final_accuracy = model.train()
    model.save_model()

    # Closing banner followed by the accuracy returned by train().
    summary_lines = (
        "\n🎉 ОБУЧЕНИЕ RANDOM FOREST ЗАВЕРШЕНО!",
        f"📈 Итоговая точность Random Forest: {final_accuracy:.4f}",
    )
    for line in summary_lines:
        print(line)

# Run the training pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()