Add files via upload

2025-12-28 08:40:07 +01:00 · 2024-11-29 12:57:37 +07:00
parent da0d3c5909
commit 51d36cea7a
1 changed files with 782 additions and 0 deletions
--- a/Forest/Random_Forest_Diabetes.ipynb
+++ b/Forest/Random_Forest_Diabetes.ipynb
@@ -0,0 +1,782 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "c7d79098",
+   "metadata": {},
+   "source": [
+    "# Introduction\n",
+    "This is a `Random Forest` approach towards a dataset called **Marketing Campaigns Dataset**. This dataset is obtained from kaggle, and is available on my repo. Below is the preview of the data."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "3998b03e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "6dfea609",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Pregnancies</th>\n",
+       "      <th>Glucose</th>\n",
+       "      <th>BloodPressure</th>\n",
+       "      <th>SkinThickness</th>\n",
+       "      <th>Insulin</th>\n",
+       "      <th>BMI</th>\n",
+       "      <th>Pedigree</th>\n",
+       "      <th>Age</th>\n",
+       "      <th>Outcome</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>6</td>\n",
+       "      <td>148</td>\n",
+       "      <td>72</td>\n",
+       "      <td>35</td>\n",
+       "      <td>0</td>\n",
+       "      <td>33.6</td>\n",
+       "      <td>0.627</td>\n",
+       "      <td>50</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>1</td>\n",
+       "      <td>85</td>\n",
+       "      <td>66</td>\n",
+       "      <td>29</td>\n",
+       "      <td>0</td>\n",
+       "      <td>26.6</td>\n",
+       "      <td>0.351</td>\n",
+       "      <td>31</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>8</td>\n",
+       "      <td>183</td>\n",
+       "      <td>64</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>23.3</td>\n",
+       "      <td>0.672</td>\n",
+       "      <td>32</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>1</td>\n",
+       "      <td>89</td>\n",
+       "      <td>66</td>\n",
+       "      <td>23</td>\n",
+       "      <td>94</td>\n",
+       "      <td>28.1</td>\n",
+       "      <td>0.167</td>\n",
+       "      <td>21</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>0</td>\n",
+       "      <td>137</td>\n",
+       "      <td>40</td>\n",
+       "      <td>35</td>\n",
+       "      <td>168</td>\n",
+       "      <td>43.1</td>\n",
+       "      <td>2.288</td>\n",
+       "      <td>33</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \\\n",
+       "0            6      148             72             35        0  33.6   \n",
+       "1            1       85             66             29        0  26.6   \n",
+       "2            8      183             64              0        0  23.3   \n",
+       "3            1       89             66             23       94  28.1   \n",
+       "4            0      137             40             35      168  43.1   \n",
+       "\n",
+       "   Pedigree  Age  Outcome  \n",
+       "0     0.627   50        1  \n",
+       "1     0.351   31        0  \n",
+       "2     0.672   32        1  \n",
+       "3     0.167   21        0  \n",
+       "4     2.288   33        1  "
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df = pd.read_csv('https://raw.githubusercontent.com/youronlydimwit/Data_ScienceUse_Cases/refs/heads/main/Predictions/Data/diabetes.csv')\n",
+    "df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "ca9faa0b",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "<class 'pandas.core.frame.DataFrame'>\n",
+      "RangeIndex: 768 entries, 0 to 767\n",
+      "Data columns (total 9 columns):\n",
+      " #   Column         Non-Null Count  Dtype  \n",
+      "---  ------         --------------  -----  \n",
+      " 0   Pregnancies    768 non-null    int64  \n",
+      " 1   Glucose        768 non-null    int64  \n",
+      " 2   BloodPressure  768 non-null    int64  \n",
+      " 3   SkinThickness  768 non-null    int64  \n",
+      " 4   Insulin        768 non-null    int64  \n",
+      " 5   BMI            768 non-null    float64\n",
+      " 6   Pedigree       768 non-null    float64\n",
+      " 7   Age            768 non-null    int64  \n",
+      " 8   Outcome        768 non-null    int64  \n",
+      "dtypes: float64(2), int64(7)\n",
+      "memory usage: 54.1 KB\n"
+     ]
+    }
+   ],
+   "source": [
+    "df.info()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "2019596c",
+   "metadata": {},
+   "source": [
+    "This dataset is related to diabetes prediction, where the target variable (`Outcome`) indicates whether a patient has diabetes (`1`) or not (`0`)."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "03e4fcd6",
+   "metadata": {},
+   "source": [
+    "# 1. Initial Hypothesis\n",
+    "- **Hypothesis:** There is a relationship between various medical indicators (e.g., glucose levels, BMI, age, etc.) and whether an individual has diabetes. We hypothesize that using these features, a Random Forest model can predict the outcome effectively.\n",
+    "\n",
+    "# 2. Equation/Model\n",
+    "Mathematically, the Random Forest prediction can be written as:\n",
+    "\n",
+    "$$\n",
+    "f(x) = \\frac{1}{N} \\sum_{i=1}^{N} T_i(x)\n",
+    "$$\n",
+    "\n",
+    "Where:\n",
+    "- \\( f(x) \\) is the final predicted outcome for input features \\( x \\),\n",
+    "- \\( N \\) is the number of trees in the forest,\n",
+    "- \\( T_i(x) \\) is the prediction from the \\( i^{th} \\) tree for the input features \\( x \\).\n",
+    "\n",
+    "# 3. Preprocessing\n",
+    "Before training the model, we need to preprocess the data:\n",
+    "- **Check for missing values**: Since this dataset does not have missing values, this step can be skipped.\n",
+    "- **Feature scaling**: Although Random Forest is not as sensitive to feature scaling as algorithms like SVM or logistic regression, it's generally a good practice to standardize or normalize features like `BMI` or `Glucose`.\n",
+    "- **Splitting the data**: Split the data into training and testing sets (80-20 split is typical).\n",
+    "- **One-hot encoding**: There are no categorical variables in this dataset, so this step is not needed.\n",
+    "- **Target variable**: The `Outcome` variable is already binary (0 or 1), so no transformation is needed."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "3375f744",
+   "metadata": {},
+   "source": [
+    "We will be leveraging the pre-installed `sklearn` library to conduct our analysis."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "82c296e3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sklearn.model_selection import train_test_split\n",
+    "from sklearn.preprocessing import StandardScaler\n",
+    "from sklearn.ensemble import RandomForestClassifier\n",
+    "from sklearn.metrics import classification_report, confusion_matrix, accuracy_score"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "08c8c3c8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Features and target variable\n",
+    "X = df.drop('Outcome', axis=1)  # Features\n",
+    "y = df['Outcome']  # Target variable"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "317f9ea8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Split the data into training and test sets (80% train, 20% test)\n",
+    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "545a59ce",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Accuracy: 0.72\n",
+      "Confusion Matrix:\n",
+      "[[77 22]\n",
+      " [21 34]]\n",
+      "Classification Report:\n",
+      "              precision    recall  f1-score   support\n",
+      "\n",
+      "           0       0.79      0.78      0.78        99\n",
+      "           1       0.61      0.62      0.61        55\n",
+      "\n",
+      "    accuracy                           0.72       154\n",
+      "   macro avg       0.70      0.70      0.70       154\n",
+      "weighted avg       0.72      0.72      0.72       154\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Feature scaling\n",
+    "scaler = StandardScaler()\n",
+    "X_train_scaled = scaler.fit_transform(X_train)\n",
+    "X_test_scaled = scaler.transform(X_test)\n",
+    "\n",
+    "# Train the Random Forest model\n",
+    "rf_model = RandomForestClassifier(n_estimators=100, random_state=42)\n",
+    "rf_model.fit(X_train_scaled, y_train)\n",
+    "\n",
+    "# Make predictions on the test set\n",
+    "y_pred = rf_model.predict(X_test_scaled)\n",
+    "\n",
+    "# Evaluate the model\n",
+    "accuracy = accuracy_score(y_test, y_pred)\n",
+    "print(f\"Accuracy: {accuracy:.2f}\")\n",
+    "print(\"Confusion Matrix:\")\n",
+    "print(confusion_matrix(y_test, y_pred))\n",
+    "print(\"Classification Report:\")\n",
+    "print(classification_report(y_test, y_pred))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "c71558f1",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Accuracy: 0.72\n",
+      "Confusion Matrix:\n",
+      "[[77 22]\n",
+      " [21 34]]\n",
+      "Classification Report:\n",
+      "              precision    recall  f1-score   support\n",
+      "\n",
+      "           0       0.79      0.78      0.78        99\n",
+      "           1       0.61      0.62      0.61        55\n",
+      "\n",
+      "    accuracy                           0.72       154\n",
+      "   macro avg       0.70      0.70      0.70       154\n",
+      "weighted avg       0.72      0.72      0.72       154\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Train the Random Forest model\n",
+    "rf_model_norm = RandomForestClassifier(n_estimators=100, random_state=42)\n",
+    "rf_model_norm.fit(X_train, y_train)\n",
+    "\n",
+    "# Make predictions on the test set\n",
+    "y_pred = rf_model_norm.predict(X_test)\n",
+    "\n",
+    "# Evaluate the model\n",
+    "accuracy = accuracy_score(y_test, y_pred)\n",
+    "print(f\"Accuracy: {accuracy:.2f}\")\n",
+    "print(\"Confusion Matrix:\")\n",
+    "print(confusion_matrix(y_test, y_pred))\n",
+    "print(\"Classification Report:\")\n",
+    "print(classification_report(y_test, y_pred))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "9da31e7e",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "         Feature  Importance\n",
+      "1        Glucose    0.258864\n",
+      "5            BMI    0.169984\n",
+      "7            Age    0.140931\n",
+      "6       Pedigree    0.123768\n",
+      "2  BloodPressure    0.088134\n",
+      "0    Pregnancies    0.076551\n",
+      "4        Insulin    0.076122\n",
+      "3  SkinThickness    0.065646\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Feature importance\n",
+    "importances = rf_model.feature_importances_\n",
+    "features = X.columns\n",
+    "\n",
+    "# Create a DataFrame for visualization\n",
+    "feature_importance_df = pd.DataFrame({'Feature': features, 'Importance': importances})\n",
+    "feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)\n",
+    "\n",
+    "print(feature_importance_df)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "51af4a71",
+   "metadata": {},
+   "source": [
+    "# First Interpretation of Model Results\n",
+    "\n",
+    "### 1. **Accuracy: 0.72**\n",
+    "   - The model correctly predicted 72% of the instances in the dataset. While this is a decent starting point, it indicates there’s room for improvement.\n",
+    "\n",
+    "### 2. **Confusion Matrix:**\n",
+    "[[77 22] [21 34]]\n",
+    "\n",
+    "The confusion matrix shows how the model performed in terms of `true positives` (TP), `false positives` (FP), `true negatives` (TN), and `false negatives` (FN):\n",
+    "\n",
+    "- **True Negatives (TN) = 77**: The model correctly predicted \"No Diabetes\" (`0`) 77 times.\n",
+    "- **False Positives (FP) = 22**: The model incorrectly predicted \"Diabetes\" (`1`) when the actual outcome was \"No Diabetes\" (0).\n",
+    "- **False Negatives (FN) = 21**: The model incorrectly predicted \"No Diabetes\" (`0`) when the actual outcome was \"Diabetes\" (1).\n",
+    "- **True Positives (TP) = 34**: The model correctly predicted \"Diabetes\" (`1`) 34 times.\n",
+    "\n",
+    "### 3. **Classification Report:**\n",
+    "This gives us more detailed performance metrics, including precision, recall, and F1-score for both classes (`0` and `1`):\n",
+    "\n",
+    "- **Class `0` (No Diabetes)**:\n",
+    "  - **Precision = 0.79**: Of all the instances the model predicted as \"No Diabetes\" (`0`), 79% were actually correct.\n",
+    "  - **Recall = 0.78**: Of all the actual \"No Diabetes\" instances, 78% were correctly identified by the model.\n",
+    "  - **F1-score = 0.78**: The harmonic mean of precision and recall for class `0`. This score indicates a balanced performance for class `0`.\n",
+    "\n",
+    "- **Class 1 (Diabetes)**:\n",
+    "  - **Precision = 0.61**: Of all the instances the model predicted as \"Diabetes\" (`1`), 61% were actually correct.\n",
+    "  - **Recall = 0.62**: Of all the actual \"Diabetes\" instances, 62% were correctly identified by the model.\n",
+    "  - **F1-score = 0.61**: The F1-score for class `1` is lower than that for class `0`, which indicates that the model is performing less well for class 1.\n",
+    "\n",
+    "- **Macro Average**: The average of precision, recall, and F1-score across both classes (`0` and `1`), without considering their support (i.e., the number of instances in each class). Here, you have:\n",
+    "  - **Precision = 0.70**\n",
+    "  - **Recall = 0.70**\n",
+    "  - **F1-score = 0.70**\n",
+    "\n",
+    "  The macro average indicates that, on average, the model performs similarly for both classes, but it doesn’t take into account the class imbalance (if there’s more data for one class than the other).\n",
+    "\n",
+    "- **Weighted Average**: This metric accounts for the support (number of instances) of each class. The weighted averages are:\n",
+    "  - **Precision = 0.72**\n",
+    "  - **Recall = 0.72**\n",
+    "  - **F1-score = 0.72**\n",
+    "\n",
+    "  The weighted average suggests that, considering the class distribution, the overall model performance is slightly better for `No Diabetes` than for `Diabetes` but still reasonably balanced.\n",
+    "\n",
+    "### 4. **Conclusion:**\n",
+    "- **Accuracy** (72%) shows that the model is making correct predictions most of the time, but there’s still a potential for improvement, especially for classifying `Diabetes` correctly.\n",
+    "- The **confusion matrix** indicates that the model is somewhat biased towards predicting `No Diabetes` (class `0`), as seen by the higher number of `True Negatives` and `False Positives`.\n",
+    "- For **class `1` (`Diabetes`)**, both precision and recall are lower, which means the model is missing some instances of diabetes (`False Negatives`) and sometimes incorrectly identifying non-diabetes cases as diabetes (`False Positives`).\n",
+    "- The **macro and weighted averages** suggest a moderately balanced performance, but there’s room for improving the recall for class `1` (`Diabetes`) to reduce the number of `False Negatives`."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "049b4055",
+   "metadata": {},
+   "source": [
+    "## **Proposed Improvement Methods:**\n",
+    "1. **Address Class Imbalance**: If the dataset has a class imbalance (more \"No Diabetes\" than \"Diabetes\"), consider techniques like oversampling the minority class, undersampling the majority class, or using class weights in the Random Forest model to handle the imbalance better.\n",
+    "2. **Hyperparameter Tuning**: Adjust the hyperparameters of the Random Forest, such as the number of trees (`n_estimators`), the maximum depth (`max_depth`), or the minimum samples required for a split, to potentially improve performance."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "id": "0216c9a5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sklearn.model_selection import RandomizedSearchCV\n",
+    "import numpy as np"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "id": "6832bb59",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Fitting 5 folds for each of 100 candidates, totalling 500 fits\n",
+      "Best Hyperparameters: {'n_estimators': 50, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': None, 'class_weight': 'balanced', 'bootstrap': False}\n",
+      "Accuracy: 0.7337662337662337\n",
+      "              precision    recall  f1-score   support\n",
+      "\n",
+      "           0       0.84      0.73      0.78        99\n",
+      "           1       0.60      0.75      0.67        55\n",
+      "\n",
+      "    accuracy                           0.73       154\n",
+      "   macro avg       0.72      0.74      0.72       154\n",
+      "weighted avg       0.75      0.73      0.74       154\n",
+      "\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "C:\\Users\\sang.yogi\\Anaconda3\\lib\\site-packages\\sklearn\\model_selection\\_validation.py:540: FitFailedWarning: \n",
+      "185 fits failed out of a total of 500.\n",
+      "The score on these train-test partitions for these parameters will be set to nan.\n",
+      "If these failures are not expected, you can try to debug them by setting error_score='raise'.\n",
+      "\n",
+      "Below are more details about the failures:\n",
+      "--------------------------------------------------------------------------------\n",
+      "82 fits failed with the following error:\n",
+      "Traceback (most recent call last):\n",
+      "  File \"C:\\Users\\sang.yogi\\Anaconda3\\lib\\site-packages\\sklearn\\model_selection\\_validation.py\", line 888, in _fit_and_score\n",
+      "    estimator.fit(X_train, y_train, **fit_params)\n",
+      "  File \"C:\\Users\\sang.yogi\\Anaconda3\\lib\\site-packages\\sklearn\\base.py\", line 1466, in wrapper\n",
+      "    estimator._validate_params()\n",
+      "  File \"C:\\Users\\sang.yogi\\Anaconda3\\lib\\site-packages\\sklearn\\base.py\", line 666, in _validate_params\n",
+      "    validate_parameter_constraints(\n",
+      "  File \"C:\\Users\\sang.yogi\\Anaconda3\\lib\\site-packages\\sklearn\\utils\\_param_validation.py\", line 95, in validate_parameter_constraints\n",
+      "    raise InvalidParameterError(\n",
+      "sklearn.utils._param_validation.InvalidParameterError: The 'max_features' parameter of RandomForestClassifier must be an int in the range [1, inf), a float in the range (0.0, 1.0], a str among {'sqrt', 'log2'} or None. Got 'auto' instead.\n",
+      "\n",
+      "--------------------------------------------------------------------------------\n",
+      "103 fits failed with the following error:\n",
+      "Traceback (most recent call last):\n",
+      "  File \"C:\\Users\\sang.yogi\\Anaconda3\\lib\\site-packages\\sklearn\\model_selection\\_validation.py\", line 888, in _fit_and_score\n",
+      "    estimator.fit(X_train, y_train, **fit_params)\n",
+      "  File \"C:\\Users\\sang.yogi\\Anaconda3\\lib\\site-packages\\sklearn\\base.py\", line 1466, in wrapper\n",
+      "    estimator._validate_params()\n",
+      "  File \"C:\\Users\\sang.yogi\\Anaconda3\\lib\\site-packages\\sklearn\\base.py\", line 666, in _validate_params\n",
+      "    validate_parameter_constraints(\n",
+      "  File \"C:\\Users\\sang.yogi\\Anaconda3\\lib\\site-packages\\sklearn\\utils\\_param_validation.py\", line 95, in validate_parameter_constraints\n",
+      "    raise InvalidParameterError(\n",
+      "sklearn.utils._param_validation.InvalidParameterError: The 'max_features' parameter of RandomForestClassifier must be an int in the range [1, inf), a float in the range (0.0, 1.0], a str among {'log2', 'sqrt'} or None. Got 'auto' instead.\n",
+      "\n",
+      "  warnings.warn(some_fits_failed_message, FitFailedWarning)\n",
+      "C:\\Users\\sang.yogi\\Anaconda3\\lib\\site-packages\\sklearn\\model_selection\\_search.py:1103: UserWarning: One or more of the test scores are non-finite: [0.749167          nan 0.75404505        nan        nan 0.76217513\n",
+      "        nan 0.67426363 0.67423697 0.7638278  0.75896308 0.7589231\n",
+      " 0.7720112  0.75900307 0.71659336 0.69056377        nan 0.72316407\n",
+      "        nan 0.75239238 0.76545382        nan        nan 0.76224177\n",
+      " 0.75241903        nan 0.68236705 0.70359856        nan        nan\n",
+      " 0.70517126 0.71336799 0.72469679 0.75079302        nan 0.74272957\n",
+      " 0.7703452  0.77522324 0.75893643        nan 0.7231774         nan\n",
+      " 0.72800213 0.7459283  0.74270292 0.72637612 0.71170199 0.76060243\n",
+      " 0.73778489        nan 0.72961482 0.69380248 0.76057577 0.73779821\n",
+      " 0.74103692 0.7605891         nan        nan 0.75564441        nan\n",
+      " 0.65799014        nan        nan        nan 0.65472478 0.77199787\n",
+      " 0.7638278  0.75733707        nan 0.72148474        nan 0.75893643\n",
+      " 0.76869252        nan 0.74267626 0.75567106        nan        nan\n",
+      " 0.77363721 0.71658003        nan 0.76057577        nan 0.73451953\n",
+      " 0.74594162        nan        nan 0.77683593        nan        nan\n",
+      "        nan        nan        nan 0.75728375 0.76224177 0.75737705\n",
+      "        nan        nan 0.7654938  0.74754098]\n",
+      "  warnings.warn(\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Define the RandomForestClassifier\n",
+    "rf = RandomForestClassifier()\n",
+    "\n",
+    "# Define the parameter distribution\n",
+    "param_dist = {\n",
+    "    'n_estimators': [50, 200, 100, 150],\n",
+    "    'max_depth': [None, 10, 20, 30, 40, 50],\n",
+    "    'min_samples_split': [2, 5, 10, 20],\n",
+    "    'min_samples_leaf': [1, 2, 4, 8],\n",
+    "    'max_features': ['auto', 'sqrt', 'log2'],\n",
+    "    'bootstrap': [True, False],\n",
+    "    'class_weight': ['balanced', {0: 1, 1: 5}, {0: 1, 1: 7}, {0: 1, 1: 10}]\n",
+    "}\n",
+    "\n",
+    "# Setup RandomizedSearchCV\n",
+    "random_search = RandomizedSearchCV(estimator=rf, param_distributions=param_dist, \n",
+    "                                   n_iter=100, cv=5, verbose=2, random_state=42, n_jobs=-1)\n",
+    "\n",
+    "# Fit the random search to the data\n",
+    "random_search.fit(X_train, y_train)\n",
+    "\n",
+    "# Best hyperparameters\n",
+    "print(f\"Best Hyperparameters: {random_search.best_params_}\")\n",
+    "\n",
+    "# Predict using the best model\n",
+    "y_pred = random_search.best_estimator_.predict(X_test)\n",
+    "\n",
+    "# Evaluate the model\n",
+    "print(f\"Accuracy: {accuracy_score(y_test, y_pred)}\")\n",
+    "print(classification_report(y_test, y_pred))\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b6f98230",
+   "metadata": {},
+   "source": [
+    "# Further processing with the addition of SMOTE Oversampling"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "id": "5de37774",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from imblearn.over_sampling import SMOTE"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "id": "dffc09c3",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Original class distribution: Outcome\n",
+      "0    401\n",
+      "1    213\n",
+      "Name: count, dtype: int64\n",
+      "Resampled class distribution: Outcome\n",
+      "0    401\n",
+      "1    401\n",
+      "Name: count, dtype: int64\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Instantiate SMOTE\n",
+    "smote = SMOTE(random_state=42)\n",
+    "\n",
+    "# Fit and resample the training data\n",
+    "X_resampled, y_resampled = smote.fit_resample(X_train, y_train)\n",
+    "\n",
+    "# Check the new class distribution\n",
+    "print(f\"Original class distribution: {y_train.value_counts()}\")\n",
+    "print(f\"Resampled class distribution: {pd.Series(y_resampled).value_counts()}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "id": "84bb66c3",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Accuracy: 0.7597402597402597\n",
+      "              precision    recall  f1-score   support\n",
+      "\n",
+      "           0       0.84      0.77      0.80        99\n",
+      "           1       0.64      0.75      0.69        55\n",
+      "\n",
+      "    accuracy                           0.76       154\n",
+      "   macro avg       0.74      0.76      0.75       154\n",
+      "weighted avg       0.77      0.76      0.76       154\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Best hyperparameters from the previous RandomizedSearchCV\n",
+    "best_params = random_search.best_params_\n",
+    "\n",
+    "# Define the RandomForestClassifier with the best parameters\n",
+    "rf = RandomForestClassifier(\n",
+    "    n_estimators=best_params['n_estimators'],\n",
+    "    max_depth=best_params['max_depth'],\n",
+    "    min_samples_split=best_params['min_samples_split'],\n",
+    "    min_samples_leaf=best_params['min_samples_leaf'],\n",
+    "    max_features=best_params['max_features'],\n",
+    "    bootstrap=best_params['bootstrap'],\n",
+    "    class_weight='balanced',  # Since the data is now balanced\n",
+    "    random_state=42\n",
+    ")\n",
+    "\n",
+    "# Train the model\n",
+    "rf.fit(X_resampled, y_resampled)\n",
+    "\n",
+    "# Make predictions\n",
+    "y_pred = rf.predict(X_test)\n",
+    "\n",
+    "# Evaluate the model\n",
+    "print(f\"Accuracy: {accuracy_score(y_test, y_pred)}\")\n",
+    "print(classification_report(y_test, y_pred))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "7113149b",
+   "metadata": {},
+   "source": [
+    "# Final Interpretation\n",
+    "\n",
+    "### 1. **Accuracy: 75.97%**\n",
+    "   - This indicates that the model correctly predicted the outcome (Diabetes or No Diabetes) for about 76% of the test data. While this is a decent performance, there's room for improvement, particularly for classifying the minority class (Diabetes).\n",
+    "\n",
+    "### 2. **Classification Report:**\n",
+    "This gives us more detailed performance metrics, including precision, recall, and F1-score for both classes (`0` and `1`):\n",
+    "\n",
+    "- **Class `0` (No Diabetes)**:\n",
+    "  - **Precision = 0.84**: The model correctly identifies 84% of the instances predicted as \"No Diabetes\"\n",
+    "  - **Recall = 0.77**: The model identifies 75% of the actual \"Diabetes\" instances, meaning that it is fairly good at recognizing class `1`. However, 25% of the true `Diabetes` cases are misclassified as `No Diabetes`.\n",
+    "  - **F1-score = 0.80**: The harmonic mean of precision and recall for class `0`. A score of 0.80 for class 0 indicates that the model has a good balance of precision and recall.\n",
+    "\n",
+    "- **Class `1` (Diabetes)**:\n",
+    "  - **Precision = 0.64**: Of all the instances the model predicted as \"Diabetes\" (`1`), 64% were actually correct.\n",
+    "  - **Recall = 0.75**: Of all the actual \"Diabetes\" instances, 62% were correctly identified by the model.\n",
+    "  - **F1-score = 0.69**: The F1-score for class `1` reflects a reasonable trade-off between precision and recall. While the model performs relatively well at detecting diabetes (due to the recall), there’s still room to improve the accuracy of its predictions.\n",
+    "\n",
+    "- **Macro Average**: The average of precision, recall, and F1-score across both classes (`0` and `1`), without considering their support (i.e., the number of instances in each class). Here, you have:\n",
+    "  - **Precision = 0.74**\n",
+    "  - **Recall = 0.76**\n",
+    "  - **F1-score = 0.75**\n",
+    "\n",
+    "  These averages suggest that, on balance, the model does a fair job of identifying both classes (Diabetes and No Diabetes) across the dataset.\n",
+    "\n",
+    "- **Weighted Average**: This metric accounts for the support (number of instances) of each class. The weighted averages are:\n",
+    "  - **Precision = 0.77**\n",
+    "  - **Recall = 0.76**\n",
+    "  - **F1-score = 0.76**\n",
+    "\n",
+    "  This indicates that the model performs well overall, especially when considering the class imbalance. Since class `0` (`No Diabetes`) has more instances than class `1` (`Diabetes`), the weighted average is more influenced by the performance on class `0`.\n",
+    "\n",
+    "### 4. **Conclusion:**\n",
+    "- The model is relatively strong at predicting class `0` (`No Diabetes`), with good precision and recall, resulting in a higher F1-score.\n",
+    "- For class `1` (`Diabetes`), the model performs reasonably well but could be improved in terms of precision (since some non-Diabetes instances are incorrectly predicted as Diabetes). The recall for Diabetes is better, meaning the model can identify most of the actual diabetes cases, but it still misses some (false negatives).\n",
+    "- Overall, the model performs well, but focusing on improving the recall and precision for the minority class (Diabetes) could enhance its real-world applicability, especially when detecting true diabetic cases in imbalanced datasets."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "2dff64ae",
+   "metadata": {},
+   "source": [
+    "# Optional: Save the model for further deployment"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1db41a56",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#import joblib\n",
+    "\n",
+    "# Save the trained model\n",
+    "# joblib.dump(rf, 'diabetes_rf_model_with_smote.pkl')\n",
+    "\n",
+    "# Load the saved model\n",
+    "# rf_loaded = joblib.load('diabetes_rf_model_with_smote.pkl')\n",
+    "# X_new = new_data.drop(columns=['Outcome'])  # Drop the Outcome column if present\n",
+    "\n",
+    "# Apply the model to make predictions\n",
+    "# y_pred_new = rf_loaded.predict(X_new)\n",
+    "\n",
+    "# Add predictions to the new dataframe (if you'd like)\n",
+    "# new_data['Predicted_Outcome'] = y_pred_new\n",
+    "\n",
+    "# Display or save the new dataframe with predictions\n",
+    "# print(new_data.head())"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}