-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathSimulating crime prediction in nigeria using south african dataset.R
102 lines (87 loc) · 4.02 KB
/
Simulating crime prediction in nigeria using south african dataset.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
# Package setup ----
# Install only the packages that are not already available. Re-installing
# everything on each run is slow, requires network access, and can silently
# change package versions between runs.
required_packages <- c(
  "dplyr", "caret", "tidyverse", "readr", "e1071", "randomForest"
)
is_installed <- vapply(
  required_packages,
  requireNamespace,
  logical(1),
  quietly = TRUE
)
if (any(!is_installed)) {
  install.packages(required_packages[!is_installed])
}

# Attach the packages used by the rest of the script.
library(dplyr)
library(caret)
library(tidyverse)
library(readr)
library(e1071)
library(randomForest)
library(ggplot2)
# Load the dataset ----
# header = TRUE: treat the first row as column names. With header = FALSE a
# column-name row is read as an ordinary record, which forces every column to
# character and adds a bogus all-NA row once the counts are converted to
# numeric.
# NOTE(review): assumes the CSV's first row holds column names (the script's
# comments refer to a '2005-2006' column) - confirm against the actual file.
SouthAfricaCrimes <- read.csv("~/SouthAfricaCrimeStats_v2.csv", header = TRUE)
# Draw a reproducible random sample of up to 30000 rows ----
# Rows are sampled WITHOUT replacement: sampling with replacement duplicates
# records, and after the later train/test split copies of the same record can
# end up in both sets, which inflates the test-set metrics.
# The size is capped at the number of available rows so the call cannot fail
# when the file holds fewer than 30000 records.
set.seed(123)
sample_size <- min(30000, nrow(SouthAfricaCrimes))
data_sampled <- SouthAfricaCrimes %>%
  sample_n(sample_size, replace = FALSE)
# Build the binary outcome from the crime-count columns ----
# Columns 4 onward are treated as crime counts.
stopifnot(ncol(data_sampled) >= 4)
count_cols <- 4:ncol(data_sampled)

# Convert the count columns to numeric. Going through as.character() keeps the
# conversion correct when a column is a factor (as.numeric() on a factor
# returns the internal level codes, not the values). Values that cannot be
# parsed become NA.
data_sampled[count_cols] <- lapply(
  data_sampled[count_cols],
  function(column) as.numeric(as.character(column))
)

# Total number of crimes per row across all count columns (NA counted as 0).
# List-style indexing keeps a data frame even when there is a single column.
total_crime <- rowSums(data_sampled[count_cols], na.rm = TRUE)

# Binary outcome: "criminal activity" when the row total is above zero.
# NOTE(review): the outcome is derived from the same count columns that stay
# in data_sampled and are later used as predictors (outcome ~ .), so the model
# can recover the label directly from its inputs - confirm this is intended.
outcome <- factor(
  ifelse(total_crime > 0, 1, 0),
  levels = c(0, 1),
  labels = c("non criminal activity", "criminal activity")
)
# Remove near-zero-variance variables ----
vars_to_remove <- nearZeroVar(data_sampled, saveMetrics = TRUE)

# drop = FALSE keeps data_sampled a data frame even when exactly one column
# survives; without it the result collapses to a vector, ncol() returns NULL
# and the check below fails with "argument is of length zero".
data_sampled <- data_sampled[, !vars_to_remove$nzv, drop = FALSE]

# Abort early if the filter removed every variable.
if (ncol(data_sampled) == 0) {
  stop("No variables left in the data set after removing low variance variables.")
}
# Train/test split ----
# createDataPartition() draws a single partition on `outcome`, putting 70% of
# the rows in the training set; the remaining rows form the test set.
trainIndex <- createDataPartition(
  y = outcome,
  times = 1,
  p = 0.7,
  list = FALSE
)

# Training rows, with the matching outcome values attached as a column.
train <- data_sampled[trainIndex, ]
train[["outcome"]] <- outcome[trainIndex]

# Every row not selected for training goes to the test set.
test <- data_sampled[-trainIndex, ]
test[["outcome"]] <- outcome[-trainIndex]
# Random forest: fit and predict ----
# Fit a 500-tree forest on the training set using every other column as a
# predictor; the seed is reset so the fit is reproducible.
set.seed(123)
rf_model <- randomForest(
  outcome ~ .,
  data = train,
  ntree = 500,
  importance = TRUE
)

# Predicted classes for the rows the model was trained on ...
model_training <- predict(rf_model, newdata = train)
# ... and for the held-out test rows.
model_testing <- predict(rf_model, newdata = test)
# Model performance ----
# Confusion matrix and summary statistics: predictions vs. observed outcome,
# once for the training set and once for the test set.
model_training_confusion <- confusionMatrix(
  data = model_training,
  reference = train[["outcome"]]
)
model_testing_confusion <- confusionMatrix(
  data = model_testing,
  reference = test[["outcome"]]
)

# Print both results, training first.
print(model_training_confusion)
print(model_testing_confusion)
# Confusion-matrix heatmap for the testing set ----
# caret stores the table with predicted classes in rows ("Prediction") and
# observed classes in columns ("Reference").
conf_mat <- model_testing_confusion$table

# as.data.frame() on a table returns one row per cell, with the dimension
# names (Prediction, Reference) as columns plus the cell count. Using those
# columns directly keeps the axes consistent with their labels; the previous
# wide-to-long reshape put the reference classes in a column named
# "Prediction" and the predicted classes on the axis labelled "Actual".
conf_mat_df_long <- as.data.frame(conf_mat, responseName = "Count")

# Plot the heatmap: predicted class on x, actual class on y.
ggplot(conf_mat_df_long, aes(x = Prediction, y = Reference, fill = Count)) +
  geom_tile() +
  geom_text(aes(label = Count)) +
  scale_fill_gradient(low = "white", high = "steelblue") +
  labs(
    x = "Predicted",
    y = "Actual",
    title = "Confusion Matrix Heatmap for Testing Set"
  ) +
  theme_minimal()
# Summary metrics ----
# Accuracy and F1 are read from the confusion matrices computed earlier in the
# script (same predictions and reference values, so the numbers are the same
# as recomputing them).
# NOTE(review): no `positive` class was passed to confusionMatrix(); caret
# then uses the first factor level for two-class problems, so F1 presumably
# describes the 'non criminal activity' class - confirm that is intended.

# Training set
train_accuracy <- model_training_confusion$overall["Accuracy"]
train_f1_score <- model_training_confusion$byClass["F1"]

# Testing set
test_accuracy <- model_testing_confusion$overall["Accuracy"]
test_f1_score <- model_testing_confusion$byClass["F1"]

# One row per metric, one column per data set.
df_metrics <- data.frame(
  Metric = c("Accuracy", "F1 Score"),
  Training = c(train_accuracy, train_f1_score),
  Testing = c(test_accuracy, test_f1_score)
)
# Bar chart of the metrics ----
# Long format: one row per (Metric, Set) pair with its Value.
df_metrics_long <- pivot_longer(
  df_metrics,
  cols = all_of(c("Training", "Testing")),
  names_to = "Set",
  values_to = "Value"
)

# Side-by-side bars per metric, coloured by data set. The fill colours are
# unnamed, so ggplot2 assigns them in the order of the scale's limits
# (usually alphabetical).
ggplot(df_metrics_long, aes(x = Metric, y = Value, fill = Set)) +
  geom_col(position = position_dodge()) +
  scale_fill_manual(values = c("blue", "red")) +
  labs(title = "Model Performance Metrics") +
  theme_minimal()