# Load the dataset
dataset <- read.csv("diabetes_prediction_dataset.csv", header = TRUE, stringsAsFactors = FALSE)
# Display the first few rows of the dataset
head(dataset)
## gender age hypertension heart_disease smoking_history bmi HbA1c_level
## 1 Female 80 0 1 never 25.19 6.6
## 2 Female 54 0 0 No Info 27.32 6.6
## 3 Male 28 0 0 never 27.32 5.7
## 4 Female 36 0 0 current 23.45 5.0
## 5 Male 76 1 1 current 20.14 4.8
## 6 Female 20 0 0 never 27.32 6.6
## blood_glucose_level diabetes
## 1 140 0
## 2 80 0
## 3 158 0
## 4 155 0
## 5 155 0
## 6 85 0
# Generate summary statistics for the dataset
summary(dataset)
## gender age hypertension heart_disease
## Length:100000 Min. : 0.08 Min. :0.00000 Min. :0.00000
## Class :character 1st Qu.:24.00 1st Qu.:0.00000 1st Qu.:0.00000
## Mode :character Median :43.00 Median :0.00000 Median :0.00000
## Mean :41.89 Mean :0.07485 Mean :0.03942
## 3rd Qu.:60.00 3rd Qu.:0.00000 3rd Qu.:0.00000
## Max. :80.00 Max. :1.00000 Max. :1.00000
## smoking_history bmi HbA1c_level blood_glucose_level
## Length:100000 Min. :10.01 Min. :3.500 Min. : 80.0
## Class :character 1st Qu.:23.63 1st Qu.:4.800 1st Qu.:100.0
## Mode :character Median :27.32 Median :5.800 Median :140.0
## Mean :27.32 Mean :5.528 Mean :138.1
## 3rd Qu.:29.58 3rd Qu.:6.200 3rd Qu.:159.0
## Max. :95.69 Max. :9.000 Max. :300.0
## diabetes
## Min. :0.000
## 1st Qu.:0.000
## Median :0.000
## Mean :0.085
## 3rd Qu.:0.000
## Max. :1.000
# Create a bar chart for the 'gender' variable with enhanced aesthetics
gender_chart <- ggplot(dataset, aes(x=gender, fill=gender)) +
geom_bar() +
labs(title="Gender Distribution", x="Gender", y="Count") +
theme_minimal() +
scale_fill_brewer(palette="Pastel1") +
theme(legend.position="none")
# Print the gender bar chart
print(gender_chart)

# Create a scatter plot for 'bmi' and 'blood_glucose_level' with enhanced aesthetics
scatter_plot <- ggplot(dataset, aes(x=bmi, y=blood_glucose_level, color=blood_glucose_level)) +
geom_point(alpha=0.6) +
labs(title="BMI vs Blood Glucose Level", x="BMI", y="Blood Glucose Level") +
theme_minimal() +
scale_color_gradient(low="blue", high="red")
# Print the scatter plot
print(scatter_plot)

# Split the dataset into training and testing sets
set.seed(123) # for reproducibility
index <- createDataPartition(dataset$diabetes, p=0.8, list=FALSE)
train_set <- dataset[index, ]
test_set <- dataset[-index, ]
# Convert the 'diabetes' variable to a factor with two levels
train_set$diabetes <- factor(train_set$diabetes, levels = c(0, 1))
# Train a random forest classifier
set.seed(123) # for reproducibility
rf_model <- randomForest(diabetes ~ ., data=train_set, ntree=100)
# Generate predictions on the test set
predictions <- predict(rf_model, test_set)
# Ensure the actual diabetes values and predictions are factors with the same levels
test_set$diabetes <- factor(test_set$diabetes, levels = c(0, 1))
predictions <- factor(predictions, levels = c(0, 1))
# Diagnostic code to inspect the predictions and actual values
cat("Table of actual diabetes values:\n")
## Table of actual diabetes values:
print(table(test_set$diabetes))
##
## 0 1
## 18377 1623
cat("Table of predicted diabetes values:\n")
## Table of predicted diabetes values:
print(table(predictions))
## predictions
## 0 1
## 18880 1120
# Calculate the F1-score using the confusionMatrix function
conf_matrix <- confusionMatrix(predictions, test_set$diabetes)
f1_score <- conf_matrix$byClass['F1']
# Check if F1-score is NA and print a message if it is
if(is.na(f1_score)) {
cat("The F1-score is NA. Please check the predictions and actual values.\n")
} else {
# Write the F1-score to a text file
cat(f1_score, file = "/home/ubuntu/f1_score.txt")
# Print the F1-score to the console
cat("F1-score:", f1_score, "\n")
}
## F1-score: 0.9860697
# Save the trained random forest model to a file for future use
model_file <- "/home/ubuntu/diabetes_rf_model.rds"
saveRDS(rf_model, file = model_file)
cat("Model saved to:", model_file, "\n")
## Model saved to: /home/ubuntu/diabetes_rf_model.rds
# Instructions to load the model for future inference
cat("To load the model for future inference, use the following command:\n")
## To load the model for future inference, use the following command:
cat("loaded_model <- readRDS('", model_file, "')\n")
## loaded_model <- readRDS(' /home/ubuntu/diabetes_rf_model.rds ')
# Conclusion
# Summarize the tutorial findings and provide insights based on the model's performance
cat("Conclusion:\n")
## Conclusion:
cat("In this tutorial, we have successfully loaded and explored the diabetes prediction dataset, performed exploratory data analysis, and visualized key variables. We split the data into training and testing sets, trained a random forest classifier, and evaluated its performance using the F1-score as the metric.\n\n")
## In this tutorial, we have successfully loaded and explored the diabetes prediction dataset, performed exploratory data analysis, and visualized key variables. We split the data into training and testing sets, trained a random forest classifier, and evaluated its performance using the F1-score as the metric.
cat("The F1-score obtained from the model evaluation is a measure of the model's accuracy in predicting whether a patient has diabetes or not. It considers both the precision and the recall of the test to compute the score, which is crucial for a balanced assessment in medical diagnosis predictions.\n\n")
## The F1-score obtained from the model evaluation is a measure of the model's accuracy in predicting whether a patient has diabetes or not. It considers both the precision and the recall of the test to compute the score, which is crucial for a balanced assessment in medical diagnosis predictions.
cat("The random forest model showed an F1-score of ", f1_score, ", which indicates an excellent predictive performance. An F1-score closer to 1 suggests a high precision and recall, meaning the model is effective at identifying true positives while minimizing false positives and false negatives - crucial for medical diagnosis predictions.\n\n")
## The random forest model showed an F1-score of 0.9860697 , which indicates an excellent predictive performance. An F1-score closer to 1 suggests a high precision and recall, meaning the model is effective at identifying true positives while minimizing false positives and false negatives - crucial for medical diagnosis predictions.
cat("Future work could involve tuning the model parameters, trying different algorithms, or incorporating more features to improve the model's predictive power. Additionally, the model could be deployed in a clinical setting to assist healthcare professionals in early diabetes detection.\n\n")
## Future work could involve tuning the model parameters, trying different algorithms, or incorporating more features to improve the model's predictive power. Additionally, the model could be deployed in a clinical setting to assist healthcare professionals in early diabetes detection.
cat("The trained model has been saved to a file, and instructions for loading it for future inference have been provided, ensuring that the model can be easily used for predictions on new data without the need for retraining.\n")
## The trained model has been saved to a file, and instructions for loading it for future inference have been provided, ensuring that the model can be easily used for predictions on new data without the need for retraining.