{
"cells": [
{
"cell_type": "markdown",
"id": "bd00ebdc",
"metadata": {},
"source": [
"# Load Basic Libraries\n",
"Load the libraries needed to read and display the data."
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "375f1a0c",
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "b8760991",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" id | \n",
" diagnosis | \n",
" radius_mean | \n",
" texture_mean | \n",
" perimeter_mean | \n",
" area_mean | \n",
" smoothness_mean | \n",
" compactness_mean | \n",
" concavity_mean | \n",
" concave points_mean | \n",
" ... | \n",
" texture_worst | \n",
" perimeter_worst | \n",
" area_worst | \n",
" smoothness_worst | \n",
" compactness_worst | \n",
" concavity_worst | \n",
" concave points_worst | \n",
" symmetry_worst | \n",
" fractal_dimension_worst | \n",
" Unnamed: 32 | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 842302 | \n",
" M | \n",
" 17.99 | \n",
" 10.38 | \n",
" 122.80 | \n",
" 1001.0 | \n",
" 0.11840 | \n",
" 0.27760 | \n",
" 0.3001 | \n",
" 0.14710 | \n",
" ... | \n",
" 17.33 | \n",
" 184.60 | \n",
" 2019.0 | \n",
" 0.1622 | \n",
" 0.6656 | \n",
" 0.7119 | \n",
" 0.2654 | \n",
" 0.4601 | \n",
" 0.11890 | \n",
" NaN | \n",
"
\n",
" \n",
" | 1 | \n",
" 842517 | \n",
" M | \n",
" 20.57 | \n",
" 17.77 | \n",
" 132.90 | \n",
" 1326.0 | \n",
" 0.08474 | \n",
" 0.07864 | \n",
" 0.0869 | \n",
" 0.07017 | \n",
" ... | \n",
" 23.41 | \n",
" 158.80 | \n",
" 1956.0 | \n",
" 0.1238 | \n",
" 0.1866 | \n",
" 0.2416 | \n",
" 0.1860 | \n",
" 0.2750 | \n",
" 0.08902 | \n",
" NaN | \n",
"
\n",
" \n",
" | 2 | \n",
" 84300903 | \n",
" M | \n",
" 19.69 | \n",
" 21.25 | \n",
" 130.00 | \n",
" 1203.0 | \n",
" 0.10960 | \n",
" 0.15990 | \n",
" 0.1974 | \n",
" 0.12790 | \n",
" ... | \n",
" 25.53 | \n",
" 152.50 | \n",
" 1709.0 | \n",
" 0.1444 | \n",
" 0.4245 | \n",
" 0.4504 | \n",
" 0.2430 | \n",
" 0.3613 | \n",
" 0.08758 | \n",
" NaN | \n",
"
\n",
" \n",
" | 3 | \n",
" 84348301 | \n",
" M | \n",
" 11.42 | \n",
" 20.38 | \n",
" 77.58 | \n",
" 386.1 | \n",
" 0.14250 | \n",
" 0.28390 | \n",
" 0.2414 | \n",
" 0.10520 | \n",
" ... | \n",
" 26.50 | \n",
" 98.87 | \n",
" 567.7 | \n",
" 0.2098 | \n",
" 0.8663 | \n",
" 0.6869 | \n",
" 0.2575 | \n",
" 0.6638 | \n",
" 0.17300 | \n",
" NaN | \n",
"
\n",
" \n",
" | 4 | \n",
" 84358402 | \n",
" M | \n",
" 20.29 | \n",
" 14.34 | \n",
" 135.10 | \n",
" 1297.0 | \n",
" 0.10030 | \n",
" 0.13280 | \n",
" 0.1980 | \n",
" 0.10430 | \n",
" ... | \n",
" 16.67 | \n",
" 152.20 | \n",
" 1575.0 | \n",
" 0.1374 | \n",
" 0.2050 | \n",
" 0.4000 | \n",
" 0.1625 | \n",
" 0.2364 | \n",
" 0.07678 | \n",
" NaN | \n",
"
\n",
" \n",
"
\n",
"
5 rows × 33 columns
\n",
"
"
],
"text/plain": [
" id diagnosis radius_mean texture_mean perimeter_mean area_mean \\\n",
"0 842302 M 17.99 10.38 122.80 1001.0 \n",
"1 842517 M 20.57 17.77 132.90 1326.0 \n",
"2 84300903 M 19.69 21.25 130.00 1203.0 \n",
"3 84348301 M 11.42 20.38 77.58 386.1 \n",
"4 84358402 M 20.29 14.34 135.10 1297.0 \n",
"\n",
" smoothness_mean compactness_mean concavity_mean concave points_mean \\\n",
"0 0.11840 0.27760 0.3001 0.14710 \n",
"1 0.08474 0.07864 0.0869 0.07017 \n",
"2 0.10960 0.15990 0.1974 0.12790 \n",
"3 0.14250 0.28390 0.2414 0.10520 \n",
"4 0.10030 0.13280 0.1980 0.10430 \n",
"\n",
" ... texture_worst perimeter_worst area_worst smoothness_worst \\\n",
"0 ... 17.33 184.60 2019.0 0.1622 \n",
"1 ... 23.41 158.80 1956.0 0.1238 \n",
"2 ... 25.53 152.50 1709.0 0.1444 \n",
"3 ... 26.50 98.87 567.7 0.2098 \n",
"4 ... 16.67 152.20 1575.0 0.1374 \n",
"\n",
" compactness_worst concavity_worst concave points_worst symmetry_worst \\\n",
"0 0.6656 0.7119 0.2654 0.4601 \n",
"1 0.1866 0.2416 0.1860 0.2750 \n",
"2 0.4245 0.4504 0.2430 0.3613 \n",
"3 0.8663 0.6869 0.2575 0.6638 \n",
"4 0.2050 0.4000 0.1625 0.2364 \n",
"\n",
" fractal_dimension_worst Unnamed: 32 \n",
"0 0.11890 NaN \n",
"1 0.08902 NaN \n",
"2 0.08758 NaN \n",
"3 0.17300 NaN \n",
"4 0.07678 NaN \n",
"\n",
"[5 rows x 33 columns]"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Location of the raw Breast Cancer Wisconsin CSV\n",
"DATA_URL = \"https://raw.githubusercontent.com/youronlydimwit/Data_ScienceUse_Cases/main/Classification/Data/Breast_Cancer_Wisconsin.csv\"\n",
"\n",
"df = pd.read_csv(DATA_URL)\n",
"df.head()"
]
},
{
"cell_type": "markdown",
"id": "6eb25bbe",
"metadata": {},
"source": [
"# Preliminary check for missing values\n",
"The function below returns a DataFrame with one row per column of the input, containing:\n",
"- Column Name\n",
"- The Data type\n",
"- Count of missing data (Nulls)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "72bb3dcc",
"metadata": {},
"outputs": [],
"source": [
"# Missing data check function\n",
"def completeness_check(input_df):\n",
"    \"\"\"Summarise each column of ``input_df``: its name, dtype and null count.\n",
"\n",
"    Returns a new DataFrame with one row per column of ``input_df`` and the\n",
"    columns ``Column_Name``, ``Data_Type`` and ``Missing_Data``.\n",
"    \"\"\"\n",
"    null_counts = input_df.isnull().sum()\n",
"\n",
"    # Assemble the whole report in a single constructor call\n",
"    return pd.DataFrame({\n",
"        'Column_Name': input_df.columns,\n",
"        'Data_Type': input_df.dtypes.values,\n",
"        'Missing_Data': null_counts.values,\n",
"    })"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "dd4b5d0f",
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Column_Name | \n",
" Data_Type | \n",
" Missing_Data | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" id | \n",
" int64 | \n",
" 0 | \n",
"
\n",
" \n",
" | 1 | \n",
" diagnosis | \n",
" object | \n",
" 0 | \n",
"
\n",
" \n",
" | 2 | \n",
" radius_mean | \n",
" float64 | \n",
" 0 | \n",
"
\n",
" \n",
" | 3 | \n",
" texture_mean | \n",
" float64 | \n",
" 0 | \n",
"
\n",
" \n",
" | 4 | \n",
" perimeter_mean | \n",
" float64 | \n",
" 0 | \n",
"
\n",
" \n",
" | 5 | \n",
" area_mean | \n",
" float64 | \n",
" 0 | \n",
"
\n",
" \n",
" | 6 | \n",
" smoothness_mean | \n",
" float64 | \n",
" 0 | \n",
"
\n",
" \n",
" | 7 | \n",
" compactness_mean | \n",
" float64 | \n",
" 0 | \n",
"
\n",
" \n",
" | 8 | \n",
" concavity_mean | \n",
" float64 | \n",
" 0 | \n",
"
\n",
" \n",
" | 9 | \n",
" concave points_mean | \n",
" float64 | \n",
" 0 | \n",
"
\n",
" \n",
" | 10 | \n",
" symmetry_mean | \n",
" float64 | \n",
" 0 | \n",
"
\n",
" \n",
" | 11 | \n",
" fractal_dimension_mean | \n",
" float64 | \n",
" 0 | \n",
"
\n",
" \n",
" | 12 | \n",
" radius_se | \n",
" float64 | \n",
" 0 | \n",
"
\n",
" \n",
" | 13 | \n",
" texture_se | \n",
" float64 | \n",
" 0 | \n",
"
\n",
" \n",
" | 14 | \n",
" perimeter_se | \n",
" float64 | \n",
" 0 | \n",
"
\n",
" \n",
" | 15 | \n",
" area_se | \n",
" float64 | \n",
" 0 | \n",
"
\n",
" \n",
" | 16 | \n",
" smoothness_se | \n",
" float64 | \n",
" 0 | \n",
"
\n",
" \n",
" | 17 | \n",
" compactness_se | \n",
" float64 | \n",
" 0 | \n",
"
\n",
" \n",
" | 18 | \n",
" concavity_se | \n",
" float64 | \n",
" 0 | \n",
"
\n",
" \n",
" | 19 | \n",
" concave points_se | \n",
" float64 | \n",
" 0 | \n",
"
\n",
" \n",
" | 20 | \n",
" symmetry_se | \n",
" float64 | \n",
" 0 | \n",
"
\n",
" \n",
" | 21 | \n",
" fractal_dimension_se | \n",
" float64 | \n",
" 0 | \n",
"
\n",
" \n",
" | 22 | \n",
" radius_worst | \n",
" float64 | \n",
" 0 | \n",
"
\n",
" \n",
" | 23 | \n",
" texture_worst | \n",
" float64 | \n",
" 0 | \n",
"
\n",
" \n",
" | 24 | \n",
" perimeter_worst | \n",
" float64 | \n",
" 0 | \n",
"
\n",
" \n",
" | 25 | \n",
" area_worst | \n",
" float64 | \n",
" 0 | \n",
"
\n",
" \n",
" | 26 | \n",
" smoothness_worst | \n",
" float64 | \n",
" 0 | \n",
"
\n",
" \n",
" | 27 | \n",
" compactness_worst | \n",
" float64 | \n",
" 0 | \n",
"
\n",
" \n",
" | 28 | \n",
" concavity_worst | \n",
" float64 | \n",
" 0 | \n",
"
\n",
" \n",
" | 29 | \n",
" concave points_worst | \n",
" float64 | \n",
" 0 | \n",
"
\n",
" \n",
" | 30 | \n",
" symmetry_worst | \n",
" float64 | \n",
" 0 | \n",
"
\n",
" \n",
" | 31 | \n",
" fractal_dimension_worst | \n",
" float64 | \n",
" 0 | \n",
"
\n",
" \n",
" | 32 | \n",
" Unnamed: 32 | \n",
" float64 | \n",
" 569 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Column_Name Data_Type Missing_Data\n",
"0 id int64 0\n",
"1 diagnosis object 0\n",
"2 radius_mean float64 0\n",
"3 texture_mean float64 0\n",
"4 perimeter_mean float64 0\n",
"5 area_mean float64 0\n",
"6 smoothness_mean float64 0\n",
"7 compactness_mean float64 0\n",
"8 concavity_mean float64 0\n",
"9 concave points_mean float64 0\n",
"10 symmetry_mean float64 0\n",
"11 fractal_dimension_mean float64 0\n",
"12 radius_se float64 0\n",
"13 texture_se float64 0\n",
"14 perimeter_se float64 0\n",
"15 area_se float64 0\n",
"16 smoothness_se float64 0\n",
"17 compactness_se float64 0\n",
"18 concavity_se float64 0\n",
"19 concave points_se float64 0\n",
"20 symmetry_se float64 0\n",
"21 fractal_dimension_se float64 0\n",
"22 radius_worst float64 0\n",
"23 texture_worst float64 0\n",
"24 perimeter_worst float64 0\n",
"25 area_worst float64 0\n",
"26 smoothness_worst float64 0\n",
"27 compactness_worst float64 0\n",
"28 concavity_worst float64 0\n",
"29 concave points_worst float64 0\n",
"30 symmetry_worst float64 0\n",
"31 fractal_dimension_worst float64 0\n",
"32 Unnamed: 32 float64 569"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"completeness_check(df)"
]
},
{
"cell_type": "markdown",
"id": "d91cd816",
"metadata": {},
"source": [
"Drop the columns that are not needed for this analysis: **'id'** and **'Unnamed: 32'** (the latter is entirely empty: all 569 values are missing)."
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "6ce4e125",
"metadata": {},
"outputs": [],
"source": [
"# These two columns are not needed. errors='ignore' keeps the cell safe to re-run\n",
"# after the columns have already been dropped (otherwise drop raises KeyError).\n",
"df = df.drop(columns=['id', 'Unnamed: 32'], errors='ignore')"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "35e3462f",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" diagnosis | \n",
" radius_mean | \n",
" texture_mean | \n",
" perimeter_mean | \n",
" area_mean | \n",
" smoothness_mean | \n",
" compactness_mean | \n",
" concavity_mean | \n",
" concave points_mean | \n",
" symmetry_mean | \n",
" ... | \n",
" radius_worst | \n",
" texture_worst | \n",
" perimeter_worst | \n",
" area_worst | \n",
" smoothness_worst | \n",
" compactness_worst | \n",
" concavity_worst | \n",
" concave points_worst | \n",
" symmetry_worst | \n",
" fractal_dimension_worst | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" M | \n",
" 17.99 | \n",
" 10.38 | \n",
" 122.80 | \n",
" 1001.0 | \n",
" 0.11840 | \n",
" 0.27760 | \n",
" 0.3001 | \n",
" 0.14710 | \n",
" 0.2419 | \n",
" ... | \n",
" 25.38 | \n",
" 17.33 | \n",
" 184.60 | \n",
" 2019.0 | \n",
" 0.1622 | \n",
" 0.6656 | \n",
" 0.7119 | \n",
" 0.2654 | \n",
" 0.4601 | \n",
" 0.11890 | \n",
"
\n",
" \n",
" | 1 | \n",
" M | \n",
" 20.57 | \n",
" 17.77 | \n",
" 132.90 | \n",
" 1326.0 | \n",
" 0.08474 | \n",
" 0.07864 | \n",
" 0.0869 | \n",
" 0.07017 | \n",
" 0.1812 | \n",
" ... | \n",
" 24.99 | \n",
" 23.41 | \n",
" 158.80 | \n",
" 1956.0 | \n",
" 0.1238 | \n",
" 0.1866 | \n",
" 0.2416 | \n",
" 0.1860 | \n",
" 0.2750 | \n",
" 0.08902 | \n",
"
\n",
" \n",
" | 2 | \n",
" M | \n",
" 19.69 | \n",
" 21.25 | \n",
" 130.00 | \n",
" 1203.0 | \n",
" 0.10960 | \n",
" 0.15990 | \n",
" 0.1974 | \n",
" 0.12790 | \n",
" 0.2069 | \n",
" ... | \n",
" 23.57 | \n",
" 25.53 | \n",
" 152.50 | \n",
" 1709.0 | \n",
" 0.1444 | \n",
" 0.4245 | \n",
" 0.4504 | \n",
" 0.2430 | \n",
" 0.3613 | \n",
" 0.08758 | \n",
"
\n",
" \n",
" | 3 | \n",
" M | \n",
" 11.42 | \n",
" 20.38 | \n",
" 77.58 | \n",
" 386.1 | \n",
" 0.14250 | \n",
" 0.28390 | \n",
" 0.2414 | \n",
" 0.10520 | \n",
" 0.2597 | \n",
" ... | \n",
" 14.91 | \n",
" 26.50 | \n",
" 98.87 | \n",
" 567.7 | \n",
" 0.2098 | \n",
" 0.8663 | \n",
" 0.6869 | \n",
" 0.2575 | \n",
" 0.6638 | \n",
" 0.17300 | \n",
"
\n",
" \n",
" | 4 | \n",
" M | \n",
" 20.29 | \n",
" 14.34 | \n",
" 135.10 | \n",
" 1297.0 | \n",
" 0.10030 | \n",
" 0.13280 | \n",
" 0.1980 | \n",
" 0.10430 | \n",
" 0.1809 | \n",
" ... | \n",
" 22.54 | \n",
" 16.67 | \n",
" 152.20 | \n",
" 1575.0 | \n",
" 0.1374 | \n",
" 0.2050 | \n",
" 0.4000 | \n",
" 0.1625 | \n",
" 0.2364 | \n",
" 0.07678 | \n",
"
\n",
" \n",
"
\n",
"
5 rows × 31 columns
\n",
"
"
],
"text/plain": [
" diagnosis radius_mean texture_mean perimeter_mean area_mean \\\n",
"0 M 17.99 10.38 122.80 1001.0 \n",
"1 M 20.57 17.77 132.90 1326.0 \n",
"2 M 19.69 21.25 130.00 1203.0 \n",
"3 M 11.42 20.38 77.58 386.1 \n",
"4 M 20.29 14.34 135.10 1297.0 \n",
"\n",
" smoothness_mean compactness_mean concavity_mean concave points_mean \\\n",
"0 0.11840 0.27760 0.3001 0.14710 \n",
"1 0.08474 0.07864 0.0869 0.07017 \n",
"2 0.10960 0.15990 0.1974 0.12790 \n",
"3 0.14250 0.28390 0.2414 0.10520 \n",
"4 0.10030 0.13280 0.1980 0.10430 \n",
"\n",
" symmetry_mean ... radius_worst texture_worst perimeter_worst \\\n",
"0 0.2419 ... 25.38 17.33 184.60 \n",
"1 0.1812 ... 24.99 23.41 158.80 \n",
"2 0.2069 ... 23.57 25.53 152.50 \n",
"3 0.2597 ... 14.91 26.50 98.87 \n",
"4 0.1809 ... 22.54 16.67 152.20 \n",
"\n",
" area_worst smoothness_worst compactness_worst concavity_worst \\\n",
"0 2019.0 0.1622 0.6656 0.7119 \n",
"1 1956.0 0.1238 0.1866 0.2416 \n",
"2 1709.0 0.1444 0.4245 0.4504 \n",
"3 567.7 0.2098 0.8663 0.6869 \n",
"4 1575.0 0.1374 0.2050 0.4000 \n",
"\n",
" concave points_worst symmetry_worst fractal_dimension_worst \n",
"0 0.2654 0.4601 0.11890 \n",
"1 0.1860 0.2750 0.08902 \n",
"2 0.2430 0.3613 0.08758 \n",
"3 0.2575 0.6638 0.17300 \n",
"4 0.1625 0.2364 0.07678 \n",
"\n",
"[5 rows x 31 columns]"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head()"
]
},
{
"cell_type": "markdown",
"id": "4b2e562f",
"metadata": {},
"source": [
"# Reformatting\n",
"The source data is assumed to be already clean. The _float_ columns are rounded to two decimal places (**#.##**) for easier reading. Note that this changes the stored values, not just how they are displayed."
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "b7eca327",
"metadata": {},
"outputs": [],
"source": [
"def format_float_columns(df, decimals=2):\n",
"    \"\"\"Round every ``float64`` column of ``df`` in place and return ``df``.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    df : pandas.DataFrame\n",
"        Frame to modify. It is mutated in place; columns that are not float64\n",
"        are left untouched.\n",
"    decimals : int, default 2\n",
"        Number of decimal places to round to.\n",
"\n",
"    Returns\n",
"    -------\n",
"    pandas.DataFrame\n",
"        The same object that was passed in, so the call can be rebound or chained.\n",
"\n",
"    Notes\n",
"    -----\n",
"    This changes the stored values (precision is lost), not only how they are\n",
"    displayed.\n",
"    \"\"\"\n",
"    # Only float64 columns are selected; rounding keeps their float64 dtype\n",
"    for col in df.select_dtypes(include='float64').columns:\n",
"        df[col] = df[col].round(decimals)\n",
"\n",
"    return df"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "adc5f2f3",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" diagnosis | \n",
" radius_mean | \n",
" texture_mean | \n",
" perimeter_mean | \n",
" area_mean | \n",
" smoothness_mean | \n",
" compactness_mean | \n",
" concavity_mean | \n",
" concave points_mean | \n",
" symmetry_mean | \n",
" ... | \n",
" radius_worst | \n",
" texture_worst | \n",
" perimeter_worst | \n",
" area_worst | \n",
" smoothness_worst | \n",
" compactness_worst | \n",
" concavity_worst | \n",
" concave points_worst | \n",
" symmetry_worst | \n",
" fractal_dimension_worst | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" M | \n",
" 17.99 | \n",
" 10.38 | \n",
" 122.80 | \n",
" 1001.0 | \n",
" 0.12 | \n",
" 0.28 | \n",
" 0.30 | \n",
" 0.15 | \n",
" 0.24 | \n",
" ... | \n",
" 25.38 | \n",
" 17.33 | \n",
" 184.60 | \n",
" 2019.0 | \n",
" 0.16 | \n",
" 0.67 | \n",
" 0.71 | \n",
" 0.27 | \n",
" 0.46 | \n",
" 0.12 | \n",
"
\n",
" \n",
" | 1 | \n",
" M | \n",
" 20.57 | \n",
" 17.77 | \n",
" 132.90 | \n",
" 1326.0 | \n",
" 0.08 | \n",
" 0.08 | \n",
" 0.09 | \n",
" 0.07 | \n",
" 0.18 | \n",
" ... | \n",
" 24.99 | \n",
" 23.41 | \n",
" 158.80 | \n",
" 1956.0 | \n",
" 0.12 | \n",
" 0.19 | \n",
" 0.24 | \n",
" 0.19 | \n",
" 0.28 | \n",
" 0.09 | \n",
"
\n",
" \n",
" | 2 | \n",
" M | \n",
" 19.69 | \n",
" 21.25 | \n",
" 130.00 | \n",
" 1203.0 | \n",
" 0.11 | \n",
" 0.16 | \n",
" 0.20 | \n",
" 0.13 | \n",
" 0.21 | \n",
" ... | \n",
" 23.57 | \n",
" 25.53 | \n",
" 152.50 | \n",
" 1709.0 | \n",
" 0.14 | \n",
" 0.42 | \n",
" 0.45 | \n",
" 0.24 | \n",
" 0.36 | \n",
" 0.09 | \n",
"
\n",
" \n",
" | 3 | \n",
" M | \n",
" 11.42 | \n",
" 20.38 | \n",
" 77.58 | \n",
" 386.1 | \n",
" 0.14 | \n",
" 0.28 | \n",
" 0.24 | \n",
" 0.11 | \n",
" 0.26 | \n",
" ... | \n",
" 14.91 | \n",
" 26.50 | \n",
" 98.87 | \n",
" 567.7 | \n",
" 0.21 | \n",
" 0.87 | \n",
" 0.69 | \n",
" 0.26 | \n",
" 0.66 | \n",
" 0.17 | \n",
"
\n",
" \n",
" | 4 | \n",
" M | \n",
" 20.29 | \n",
" 14.34 | \n",
" 135.10 | \n",
" 1297.0 | \n",
" 0.10 | \n",
" 0.13 | \n",
" 0.20 | \n",
" 0.10 | \n",
" 0.18 | \n",
" ... | \n",
" 22.54 | \n",
" 16.67 | \n",
" 152.20 | \n",
" 1575.0 | \n",
" 0.14 | \n",
" 0.20 | \n",
" 0.40 | \n",
" 0.16 | \n",
" 0.24 | \n",
" 0.08 | \n",
"
\n",
" \n",
" | ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" | 564 | \n",
" M | \n",
" 21.56 | \n",
" 22.39 | \n",
" 142.00 | \n",
" 1479.0 | \n",
" 0.11 | \n",
" 0.12 | \n",
" 0.24 | \n",
" 0.14 | \n",
" 0.17 | \n",
" ... | \n",
" 25.45 | \n",
" 26.40 | \n",
" 166.10 | \n",
" 2027.0 | \n",
" 0.14 | \n",
" 0.21 | \n",
" 0.41 | \n",
" 0.22 | \n",
" 0.21 | \n",
" 0.07 | \n",
"
\n",
" \n",
" | 565 | \n",
" M | \n",
" 20.13 | \n",
" 28.25 | \n",
" 131.20 | \n",
" 1261.0 | \n",
" 0.10 | \n",
" 0.10 | \n",
" 0.14 | \n",
" 0.10 | \n",
" 0.18 | \n",
" ... | \n",
" 23.69 | \n",
" 38.25 | \n",
" 155.00 | \n",
" 1731.0 | \n",
" 0.12 | \n",
" 0.19 | \n",
" 0.32 | \n",
" 0.16 | \n",
" 0.26 | \n",
" 0.07 | \n",
"
\n",
" \n",
" | 566 | \n",
" M | \n",
" 16.60 | \n",
" 28.08 | \n",
" 108.30 | \n",
" 858.1 | \n",
" 0.08 | \n",
" 0.10 | \n",
" 0.09 | \n",
" 0.05 | \n",
" 0.16 | \n",
" ... | \n",
" 18.98 | \n",
" 34.12 | \n",
" 126.70 | \n",
" 1124.0 | \n",
" 0.11 | \n",
" 0.31 | \n",
" 0.34 | \n",
" 0.14 | \n",
" 0.22 | \n",
" 0.08 | \n",
"
\n",
" \n",
" | 567 | \n",
" M | \n",
" 20.60 | \n",
" 29.33 | \n",
" 140.10 | \n",
" 1265.0 | \n",
" 0.12 | \n",
" 0.28 | \n",
" 0.35 | \n",
" 0.15 | \n",
" 0.24 | \n",
" ... | \n",
" 25.74 | \n",
" 39.42 | \n",
" 184.60 | \n",
" 1821.0 | \n",
" 0.16 | \n",
" 0.87 | \n",
" 0.94 | \n",
" 0.26 | \n",
" 0.41 | \n",
" 0.12 | \n",
"
\n",
" \n",
" | 568 | \n",
" B | \n",
" 7.76 | \n",
" 24.54 | \n",
" 47.92 | \n",
" 181.0 | \n",
" 0.05 | \n",
" 0.04 | \n",
" 0.00 | \n",
" 0.00 | \n",
" 0.16 | \n",
" ... | \n",
" 9.46 | \n",
" 30.37 | \n",
" 59.16 | \n",
" 268.6 | \n",
" 0.09 | \n",
" 0.06 | \n",
" 0.00 | \n",
" 0.00 | \n",
" 0.29 | \n",
" 0.07 | \n",
"
\n",
" \n",
"
\n",
"
569 rows × 31 columns
\n",
"
"
],
"text/plain": [
" diagnosis radius_mean texture_mean perimeter_mean area_mean \\\n",
"0 M 17.99 10.38 122.80 1001.0 \n",
"1 M 20.57 17.77 132.90 1326.0 \n",
"2 M 19.69 21.25 130.00 1203.0 \n",
"3 M 11.42 20.38 77.58 386.1 \n",
"4 M 20.29 14.34 135.10 1297.0 \n",
".. ... ... ... ... ... \n",
"564 M 21.56 22.39 142.00 1479.0 \n",
"565 M 20.13 28.25 131.20 1261.0 \n",
"566 M 16.60 28.08 108.30 858.1 \n",
"567 M 20.60 29.33 140.10 1265.0 \n",
"568 B 7.76 24.54 47.92 181.0 \n",
"\n",
" smoothness_mean compactness_mean concavity_mean concave points_mean \\\n",
"0 0.12 0.28 0.30 0.15 \n",
"1 0.08 0.08 0.09 0.07 \n",
"2 0.11 0.16 0.20 0.13 \n",
"3 0.14 0.28 0.24 0.11 \n",
"4 0.10 0.13 0.20 0.10 \n",
".. ... ... ... ... \n",
"564 0.11 0.12 0.24 0.14 \n",
"565 0.10 0.10 0.14 0.10 \n",
"566 0.08 0.10 0.09 0.05 \n",
"567 0.12 0.28 0.35 0.15 \n",
"568 0.05 0.04 0.00 0.00 \n",
"\n",
" symmetry_mean ... radius_worst texture_worst perimeter_worst \\\n",
"0 0.24 ... 25.38 17.33 184.60 \n",
"1 0.18 ... 24.99 23.41 158.80 \n",
"2 0.21 ... 23.57 25.53 152.50 \n",
"3 0.26 ... 14.91 26.50 98.87 \n",
"4 0.18 ... 22.54 16.67 152.20 \n",
".. ... ... ... ... ... \n",
"564 0.17 ... 25.45 26.40 166.10 \n",
"565 0.18 ... 23.69 38.25 155.00 \n",
"566 0.16 ... 18.98 34.12 126.70 \n",
"567 0.24 ... 25.74 39.42 184.60 \n",
"568 0.16 ... 9.46 30.37 59.16 \n",
"\n",
" area_worst smoothness_worst compactness_worst concavity_worst \\\n",
"0 2019.0 0.16 0.67 0.71 \n",
"1 1956.0 0.12 0.19 0.24 \n",
"2 1709.0 0.14 0.42 0.45 \n",
"3 567.7 0.21 0.87 0.69 \n",
"4 1575.0 0.14 0.20 0.40 \n",
".. ... ... ... ... \n",
"564 2027.0 0.14 0.21 0.41 \n",
"565 1731.0 0.12 0.19 0.32 \n",
"566 1124.0 0.11 0.31 0.34 \n",
"567 1821.0 0.16 0.87 0.94 \n",
"568 268.6 0.09 0.06 0.00 \n",
"\n",
" concave points_worst symmetry_worst fractal_dimension_worst \n",
"0 0.27 0.46 0.12 \n",
"1 0.19 0.28 0.09 \n",
"2 0.24 0.36 0.09 \n",
"3 0.26 0.66 0.17 \n",
"4 0.16 0.24 0.08 \n",
".. ... ... ... \n",
"564 0.22 0.21 0.07 \n",
"565 0.16 0.26 0.07 \n",
"566 0.14 0.22 0.08 \n",
"567 0.26 0.41 0.12 \n",
"568 0.00 0.29 0.07 \n",
"\n",
"[569 rows x 31 columns]"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# format_float_columns rounds df in place and also returns it; rebind explicitly so\n",
"# the data lineage does not rely on the discarded return value and a hidden side effect.\n",
"df = format_float_columns(df)\n",
"df"
]
},
{
"cell_type": "markdown",
"id": "c9aaef3a",
"metadata": {},
"source": [
"# Encoding\n",
"Since the label to be predicted has only two values, **M for Malignant** and **B for Benign**, we can encode them as 0 and 1 respectively.\n",
"The column is also cast to float so that every column has the same data type."
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "60870d9a",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" diagnosis | \n",
" radius_mean | \n",
" texture_mean | \n",
" perimeter_mean | \n",
" area_mean | \n",
" smoothness_mean | \n",
" compactness_mean | \n",
" concavity_mean | \n",
" concave points_mean | \n",
" symmetry_mean | \n",
" ... | \n",
" radius_worst | \n",
" texture_worst | \n",
" perimeter_worst | \n",
" area_worst | \n",
" smoothness_worst | \n",
" compactness_worst | \n",
" concavity_worst | \n",
" concave points_worst | \n",
" symmetry_worst | \n",
" fractal_dimension_worst | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 0.0 | \n",
" 17.99 | \n",
" 10.38 | \n",
" 122.8 | \n",
" 1001.0 | \n",
" 0.12 | \n",
" 0.28 | \n",
" 0.30 | \n",
" 0.15 | \n",
" 0.24 | \n",
" ... | \n",
" 25.38 | \n",
" 17.33 | \n",
" 184.6 | \n",
" 2019.0 | \n",
" 0.16 | \n",
" 0.67 | \n",
" 0.71 | \n",
" 0.27 | \n",
" 0.46 | \n",
" 0.12 | \n",
"
\n",
" \n",
" | 1 | \n",
" 0.0 | \n",
" 20.57 | \n",
" 17.77 | \n",
" 132.9 | \n",
" 1326.0 | \n",
" 0.08 | \n",
" 0.08 | \n",
" 0.09 | \n",
" 0.07 | \n",
" 0.18 | \n",
" ... | \n",
" 24.99 | \n",
" 23.41 | \n",
" 158.8 | \n",
" 1956.0 | \n",
" 0.12 | \n",
" 0.19 | \n",
" 0.24 | \n",
" 0.19 | \n",
" 0.28 | \n",
" 0.09 | \n",
"
\n",
" \n",
" | 2 | \n",
" 0.0 | \n",
" 19.69 | \n",
" 21.25 | \n",
" 130.0 | \n",
" 1203.0 | \n",
" 0.11 | \n",
" 0.16 | \n",
" 0.20 | \n",
" 0.13 | \n",
" 0.21 | \n",
" ... | \n",
" 23.57 | \n",
" 25.53 | \n",
" 152.5 | \n",
" 1709.0 | \n",
" 0.14 | \n",
" 0.42 | \n",
" 0.45 | \n",
" 0.24 | \n",
" 0.36 | \n",
" 0.09 | \n",
"
\n",
" \n",
"
\n",
"
3 rows × 31 columns
\n",
"
"
],
"text/plain": [
" diagnosis radius_mean texture_mean perimeter_mean area_mean \\\n",
"0 0.0 17.99 10.38 122.8 1001.0 \n",
"1 0.0 20.57 17.77 132.9 1326.0 \n",
"2 0.0 19.69 21.25 130.0 1203.0 \n",
"\n",
" smoothness_mean compactness_mean concavity_mean concave points_mean \\\n",
"0 0.12 0.28 0.30 0.15 \n",
"1 0.08 0.08 0.09 0.07 \n",
"2 0.11 0.16 0.20 0.13 \n",
"\n",
" symmetry_mean ... radius_worst texture_worst perimeter_worst \\\n",
"0 0.24 ... 25.38 17.33 184.6 \n",
"1 0.18 ... 24.99 23.41 158.8 \n",
"2 0.21 ... 23.57 25.53 152.5 \n",
"\n",
" area_worst smoothness_worst compactness_worst concavity_worst \\\n",
"0 2019.0 0.16 0.67 0.71 \n",
"1 1956.0 0.12 0.19 0.24 \n",
"2 1709.0 0.14 0.42 0.45 \n",
"\n",
" concave points_worst symmetry_worst fractal_dimension_worst \n",
"0 0.27 0.46 0.12 \n",
"1 0.19 0.28 0.09 \n",
"2 0.24 0.36 0.09 \n",
"\n",
"[3 rows x 31 columns]"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Encode the 'diagnosis' label as floats: 'M' -> 0.0, 'B' -> 1.0\n",
"diagnosis_codes = {'M': 0, 'B': 1}\n",
"df['diagnosis'] = df['diagnosis'].replace(diagnosis_codes).astype(float)\n",
"df.head(3)"
]
},
{
"cell_type": "markdown",
"id": "ecbe4b22",
"metadata": {},
"source": [
"Confirm that the `diagnosis` column now has the expected float data type."
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "7e437274",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"dtype('float64')"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['diagnosis'].dtypes"
]
},
{
"cell_type": "markdown",
"id": "5b18b919",
"metadata": {},
"source": [
"Compute a correlation matrix to get a first look at which features are most strongly correlated with each other and with the diagnosis."
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "2c3b26b6",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" diagnosis | \n",
" radius_mean | \n",
" texture_mean | \n",
" perimeter_mean | \n",
" area_mean | \n",
" smoothness_mean | \n",
" compactness_mean | \n",
" concavity_mean | \n",
" concave points_mean | \n",
" symmetry_mean | \n",
" ... | \n",
" radius_worst | \n",
" texture_worst | \n",
" perimeter_worst | \n",
" area_worst | \n",
" smoothness_worst | \n",
" compactness_worst | \n",
" concavity_worst | \n",
" concave points_worst | \n",
" symmetry_worst | \n",
" fractal_dimension_worst | \n",
"
\n",
" \n",
" \n",
" \n",
" | diagnosis | \n",
" 1.000000 | \n",
" -0.730032 | \n",
" -0.415185 | \n",
" -0.742636 | \n",
" -0.708984 | \n",
" -0.330624 | \n",
" -0.597576 | \n",
" -0.695750 | \n",
" -0.774410 | \n",
" -0.333477 | \n",
" ... | \n",
" -0.776453 | \n",
" -0.456903 | \n",
" -0.782914 | \n",
" -0.733825 | \n",
" -0.419661 | \n",
" -0.590477 | \n",
" -0.659345 | \n",
" -0.791976 | \n",
" -0.417123 | \n",
" -0.322046 | \n",
"
\n",
" \n",
" | radius_mean | \n",
" -0.730032 | \n",
" 1.000000 | \n",
" 0.323777 | \n",
" 0.997856 | \n",
" 0.987361 | \n",
" 0.147104 | \n",
" 0.505622 | \n",
" 0.677041 | \n",
" 0.821646 | \n",
" 0.151195 | \n",
" ... | \n",
" 0.969541 | \n",
" 0.297000 | \n",
" 0.965139 | \n",
" 0.941087 | \n",
" 0.119031 | \n",
" 0.413065 | \n",
" 0.527379 | \n",
" 0.741494 | \n",
" 0.164409 | \n",
" 0.009169 | \n",
"
\n",
" \n",
" | texture_mean | \n",
" -0.415185 | \n",
" 0.323777 | \n",
" 1.000000 | \n",
" 0.329533 | \n",
" 0.321086 | \n",
" -0.040419 | \n",
" 0.237019 | \n",
" 0.300536 | \n",
" 0.287618 | \n",
" 0.067193 | \n",
" ... | \n",
" 0.352578 | \n",
" 0.912045 | \n",
" 0.358040 | \n",
" 0.343546 | \n",
" 0.075885 | \n",
" 0.278112 | \n",
" 0.300041 | \n",
" 0.293490 | \n",
" 0.105824 | \n",
" 0.113425 | \n",
"
\n",
" \n",
" | perimeter_mean | \n",
" -0.742636 | \n",
" 0.997856 | \n",
" 0.329533 | \n",
" 1.000000 | \n",
" 0.986507 | \n",
" 0.183356 | \n",
" 0.556485 | \n",
" 0.716448 | \n",
" 0.849875 | \n",
" 0.186306 | \n",
" ... | \n",
" 0.969476 | \n",
" 0.303038 | \n",
" 0.970387 | \n",
" 0.941550 | \n",
" 0.149530 | \n",
" 0.455370 | \n",
" 0.564315 | \n",
" 0.768429 | \n",
" 0.189385 | \n",
" 0.052794 | \n",
"
\n",
" \n",
" | area_mean | \n",
" -0.708984 | \n",
" 0.987361 | \n",
" 0.321086 | \n",
" 0.986507 | \n",
" 1.000000 | \n",
" 0.151671 | \n",
" 0.498038 | \n",
" 0.686308 | \n",
" 0.822500 | \n",
" 0.154603 | \n",
" ... | \n",
" 0.962745 | \n",
" 0.287489 | \n",
" 0.959120 | \n",
" 0.959213 | \n",
" 0.122984 | \n",
" 0.389874 | \n",
" 0.512943 | \n",
" 0.719419 | \n",
" 0.143762 | \n",
" 0.006854 | \n",
"
\n",
" \n",
" | smoothness_mean | \n",
" -0.330624 | \n",
" 0.147104 | \n",
" -0.040419 | \n",
" 0.183356 | \n",
" 0.151671 | \n",
" 1.000000 | \n",
" 0.640292 | \n",
" 0.501956 | \n",
" 0.528881 | \n",
" 0.544531 | \n",
" ... | \n",
" 0.189984 | \n",
" 0.022669 | \n",
" 0.215574 | \n",
" 0.182905 | \n",
" 0.777084 | \n",
" 0.461971 | \n",
" 0.420549 | \n",
" 0.483773 | \n",
" 0.391210 | \n",
" 0.498783 | \n",
"
\n",
" \n",
" | compactness_mean | \n",
" -0.597576 | \n",
" 0.505622 | \n",
" 0.237019 | \n",
" 0.556485 | \n",
" 0.498038 | \n",
" 0.640292 | \n",
" 1.000000 | \n",
" 0.883729 | \n",
" 0.829022 | \n",
" 0.602728 | \n",
" ... | \n",
" 0.534939 | \n",
" 0.247564 | \n",
" 0.589944 | \n",
" 0.509449 | \n",
" 0.559318 | \n",
" 0.864209 | \n",
" 0.815163 | \n",
" 0.813038 | \n",
" 0.508330 | \n",
" 0.681343 | \n",
"
\n",
" \n",
" | concavity_mean | \n",
" -0.695750 | \n",
" 0.677041 | \n",
" 0.300536 | \n",
" 0.716448 | \n",
" 0.686308 | \n",
" 0.501956 | \n",
" 0.883729 | \n",
" 1.000000 | \n",
" 0.918687 | \n",
" 0.500739 | \n",
" ... | \n",
" 0.688703 | \n",
" 0.298983 | \n",
" 0.729963 | \n",
" 0.676656 | \n",
" 0.446247 | \n",
" 0.755167 | \n",
" 0.883088 | \n",
" 0.859451 | \n",
" 0.409733 | \n",
" 0.513497 | \n",
"
\n",
" \n",
" | concave points_mean | \n",
" -0.774410 | \n",
" 0.821646 | \n",
" 0.287618 | \n",
" 0.849875 | \n",
" 0.822500 | \n",
" 0.528881 | \n",
" 0.829022 | \n",
" 0.918687 | \n",
" 1.000000 | \n",
" 0.460745 | \n",
" ... | \n",
" 0.828401 | \n",
" 0.286828 | \n",
" 0.853642 | \n",
" 0.807978 | \n",
" 0.446157 | \n",
" 0.665741 | \n",
" 0.748687 | \n",
" 0.905058 | \n",
" 0.370446 | \n",
" 0.366058 | \n",
"
\n",
" \n",
" | symmetry_mean | \n",
" -0.333477 | \n",
" 0.151195 | \n",
" 0.067193 | \n",
" 0.186306 | \n",
" 0.154603 | \n",
" 0.544531 | \n",
" 0.602728 | \n",
" 0.500739 | \n",
" 0.460745 | \n",
" 1.000000 | \n",
" ... | \n",
" 0.189763 | \n",
" 0.084924 | \n",
" 0.222908 | \n",
" 0.181731 | \n",
" 0.424135 | \n",
" 0.474981 | \n",
" 0.432004 | \n",
" 0.428627 | \n",
" 0.698086 | \n",
" 0.427079 | \n",
"
\n",
" \n",
" | fractal_dimension_mean | \n",
" 0.027269 | \n",
" -0.290001 | \n",
" -0.080337 | \n",
" -0.245096 | \n",
" -0.260436 | \n",
" 0.556840 | \n",
" 0.506156 | \n",
" 0.301176 | \n",
" 0.142307 | \n",
" 0.411393 | \n",
" ... | \n",
" -0.242125 | \n",
" -0.051211 | \n",
" -0.199454 | \n",
" -0.218566 | \n",
" 0.451897 | \n",
" 0.410627 | \n",
" 0.305228 | \n",
" 0.148275 | \n",
" 0.283061 | \n",
" 0.701592 | \n",
"
\n",
" \n",
" | radius_se | \n",
" -0.567328 | \n",
" 0.678590 | \n",
" 0.275973 | \n",
" 0.691268 | \n",
" 0.732125 | \n",
" 0.277600 | \n",
" 0.497381 | \n",
" 0.631258 | \n",
" 0.699764 | \n",
" 0.304353 | \n",
" ... | \n",
" 0.714872 | \n",
" 0.195092 | \n",
" 0.719492 | \n",
" 0.751431 | \n",
" 0.140187 | \n",
" 0.286572 | \n",
" 0.380110 | \n",
" 0.528842 | \n",
" 0.094549 | \n",
" 0.049351 | \n",
"
\n",
" \n",
" | texture_se | \n",
" 0.008299 | \n",
" -0.097230 | \n",
" 0.386443 | \n",
" -0.086669 | \n",
" -0.066119 | \n",
" 0.059149 | \n",
" 0.044875 | \n",
" 0.075893 | \n",
" 0.025260 | \n",
" 0.126154 | \n",
" ... | \n",
" -0.111592 | \n",
" 0.409071 | \n",
" -0.102161 | \n",
" -0.083078 | \n",
" -0.073042 | \n",
" -0.092846 | \n",
" -0.070533 | \n",
" -0.120938 | \n",
" -0.127607 | \n",
" -0.044761 | \n",
"
\n",
" \n",
" | perimeter_se | \n",
" -0.556055 | \n",
" 0.674109 | \n",
" 0.281590 | \n",
" 0.693069 | \n",
" 0.726564 | \n",
" 0.272183 | \n",
" 0.548665 | \n",
" 0.659685 | \n",
" 0.712066 | \n",
" 0.313482 | \n",
" ... | \n",
" 0.697133 | \n",
" 0.200270 | \n",
" 0.720966 | \n",
" 0.730647 | \n",
" 0.125791 | \n",
" 0.340951 | \n",
" 0.418269 | \n",
" 0.552873 | \n",
" 0.108877 | \n",
" 0.083512 | \n",
"
\n",
" \n",
" | area_se | \n",
" -0.548236 | \n",
" 0.735868 | \n",
" 0.259844 | \n",
" 0.744983 | \n",
" 0.800086 | \n",
" 0.222595 | \n",
" 0.456053 | \n",
" 0.617299 | \n",
" 0.690642 | \n",
" 0.226088 | \n",
" ... | \n",
" 0.757372 | \n",
" 0.196496 | \n",
" 0.761213 | \n",
" 0.811408 | \n",
" 0.124392 | \n",
" 0.282682 | \n",
" 0.384820 | \n",
" 0.535417 | \n",
" 0.072572 | \n",
" 0.016228 | \n",
"
\n",
" \n",
" | smoothness_se | \n",
" 0.012117 | \n",
" -0.167456 | \n",
" 0.019777 | \n",
" -0.149510 | \n",
" -0.130192 | \n",
" 0.268150 | \n",
" 0.133392 | \n",
" 0.110590 | \n",
" 0.051658 | \n",
" 0.153937 | \n",
" ... | \n",
" -0.152307 | \n",
" -0.008034 | \n",
" -0.136532 | \n",
" -0.115951 | \n",
" 0.309111 | \n",
" 0.033085 | \n",
" 0.018581 | \n",
" -0.027878 | \n",
" -0.032829 | \n",
" 0.119089 | \n",
"
\n",
" \n",
" | compactness_se | \n",
" -0.288713 | \n",
" 0.204341 | \n",
" 0.192029 | \n",
" 0.248683 | \n",
" 0.211126 | \n",
" 0.313043 | \n",
" 0.729164 | \n",
" 0.665217 | \n",
" 0.490719 | \n",
" 0.412566 | \n",
" ... | \n",
" 0.202027 | \n",
" 0.146121 | \n",
" 0.256901 | \n",
" 0.196258 | \n",
" 0.221708 | \n",
" 0.672119 | \n",
" 0.633328 | \n",
" 0.477739 | \n",
" 0.271770 | \n",
" 0.574867 | \n",
"
\n",
" \n",
" | concavity_se | \n",
" -0.255041 | \n",
" 0.194151 | \n",
" 0.145536 | \n",
" 0.228160 | \n",
" 0.206659 | \n",
" 0.249132 | \n",
" 0.571996 | \n",
" 0.688883 | \n",
" 0.441037 | \n",
" 0.337796 | \n",
" ... | \n",
" 0.187371 | \n",
" 0.102586 | \n",
" 0.227007 | \n",
" 0.187836 | \n",
" 0.172332 | \n",
" 0.491365 | \n",
" 0.662437 | \n",
" 0.443167 | \n",
" 0.199593 | \n",
" 0.445392 | \n",
"
\n",
" \n",
" | concave points_se | \n",
" -0.329287 | \n",
" 0.325042 | \n",
" 0.153373 | \n",
" 0.353151 | \n",
" 0.322610 | \n",
" 0.282948 | \n",
" 0.558232 | \n",
" 0.582869 | \n",
" 0.522044 | \n",
" 0.312608 | \n",
" ... | \n",
" 0.307288 | \n",
" 0.079999 | \n",
" 0.342227 | \n",
" 0.297057 | \n",
" 0.144157 | \n",
" 0.389143 | \n",
" 0.463453 | \n",
" 0.502092 | \n",
" 0.126508 | \n",
" 0.237510 | \n",
"
\n",
" \n",
" | symmetry_se | \n",
" 0.025127 | \n",
" -0.117235 | \n",
" -0.017239 | \n",
" -0.093748 | \n",
" -0.084347 | \n",
" 0.216286 | \n",
" 0.236474 | \n",
" 0.178725 | \n",
" 0.098510 | \n",
" 0.446116 | \n",
" ... | \n",
" -0.142188 | \n",
" -0.094773 | \n",
" -0.117920 | \n",
" -0.123926 | \n",
" -0.007047 | \n",
" 0.070591 | \n",
" 0.035520 | \n",
" -0.032924 | \n",
" 0.382952 | \n",
" 0.084610 | \n",
"
\n",
" \n",
" | fractal_dimension_se | \n",
" -0.091669 | \n",
" -0.009516 | \n",
" 0.079210 | \n",
" 0.027147 | \n",
" 0.002046 | \n",
" 0.278176 | \n",
" 0.496677 | \n",
" 0.427309 | \n",
" 0.260200 | \n",
" 0.333318 | \n",
" ... | \n",
" -0.005759 | \n",
" 0.008842 | \n",
" 0.033505 | \n",
" -0.000367 | \n",
" 0.167714 | \n",
" 0.381239 | \n",
" 0.363578 | \n",
" 0.203468 | \n",
" 0.098792 | \n",
" 0.522940 | \n",
"
\n",
" \n",
" | radius_worst | \n",
" -0.776453 | \n",
" 0.969541 | \n",
" 0.352578 | \n",
" 0.969476 | \n",
" 0.962745 | \n",
" 0.189984 | \n",
" 0.534939 | \n",
" 0.688703 | \n",
" 0.828401 | \n",
" 0.189763 | \n",
" ... | \n",
" 1.000000 | \n",
" 0.359925 | \n",
" 0.993707 | \n",
" 0.984014 | \n",
" 0.215895 | \n",
" 0.475348 | \n",
" 0.574562 | \n",
" 0.784946 | \n",
" 0.244034 | \n",
" 0.092952 | \n",
"
\n",
" \n",
" | texture_worst | \n",
" -0.456903 | \n",
" 0.297000 | \n",
" 0.912045 | \n",
" 0.303038 | \n",
" 0.287489 | \n",
" 0.022669 | \n",
" 0.247564 | \n",
" 0.298983 | \n",
" 0.286828 | \n",
" 0.084924 | \n",
" ... | \n",
" 0.359925 | \n",
" 1.000000 | \n",
" 0.365098 | \n",
" 0.345842 | \n",
" 0.225808 | \n",
" 0.361123 | \n",
" 0.367625 | \n",
" 0.358467 | \n",
" 0.234337 | \n",
" 0.214237 | \n",
"
\n",
" \n",
" | perimeter_worst | \n",
" -0.782914 | \n",
" 0.965139 | \n",
" 0.358040 | \n",
" 0.970387 | \n",
" 0.959120 | \n",
" 0.215574 | \n",
" 0.589944 | \n",
" 0.729963 | \n",
" 0.853642 | \n",
" 0.222908 | \n",
" ... | \n",
" 0.993707 | \n",
" 0.365098 | \n",
" 1.000000 | \n",
" 0.977578 | \n",
" 0.235168 | \n",
" 0.528876 | \n",
" 0.618906 | \n",
" 0.813826 | \n",
" 0.269788 | \n",
" 0.137973 | \n",
"
\n",
" \n",
" | area_worst | \n",
" -0.733825 | \n",
" 0.941087 | \n",
" 0.343546 | \n",
" 0.941550 | \n",
" 0.959213 | \n",
" 0.182905 | \n",
" 0.509449 | \n",
" 0.676656 | \n",
" 0.807978 | \n",
" 0.181731 | \n",
" ... | \n",
" 0.984014 | \n",
" 0.345842 | \n",
" 0.977578 | \n",
" 1.000000 | \n",
" 0.209064 | \n",
" 0.437727 | \n",
" 0.543774 | \n",
" 0.745090 | \n",
" 0.209443 | \n",
" 0.079535 | \n",
"
\n",
" \n",
" | smoothness_worst | \n",
" -0.419661 | \n",
" 0.119031 | \n",
" 0.075885 | \n",
" 0.149530 | \n",
" 0.122984 | \n",
" 0.777084 | \n",
" 0.559318 | \n",
" 0.446247 | \n",
" 0.446157 | \n",
" 0.424135 | \n",
" ... | \n",
" 0.215895 | \n",
" 0.225808 | \n",
" 0.235168 | \n",
" 0.209064 | \n",
" 1.000000 | \n",
" 0.563176 | \n",
" 0.513958 | \n",
" 0.542461 | \n",
" 0.486361 | \n",
" 0.608113 | \n",
"
\n",
" \n",
" | compactness_worst | \n",
" -0.590477 | \n",
" 0.413065 | \n",
" 0.278112 | \n",
" 0.455370 | \n",
" 0.389874 | \n",
" 0.461971 | \n",
" 0.864209 | \n",
" 0.755167 | \n",
" 0.665741 | \n",
" 0.474981 | \n",
" ... | \n",
" 0.475348 | \n",
" 0.361123 | \n",
" 0.528876 | \n",
" 0.437727 | \n",
" 0.563176 | \n",
" 1.000000 | \n",
" 0.892859 | \n",
" 0.800307 | \n",
" 0.614717 | \n",
" 0.800823 | \n",
"
\n",
" \n",
" | concavity_worst | \n",
" -0.659345 | \n",
" 0.527379 | \n",
" 0.300041 | \n",
" 0.564315 | \n",
" 0.512943 | \n",
" 0.420549 | \n",
" 0.815163 | \n",
" 0.883088 | \n",
" 0.748687 | \n",
" 0.432004 | \n",
" ... | \n",
" 0.574562 | \n",
" 0.367625 | \n",
" 0.618906 | \n",
" 0.543774 | \n",
" 0.513958 | \n",
" 0.892859 | \n",
" 1.000000 | \n",
" 0.855035 | \n",
" 0.530509 | \n",
" 0.682274 | \n",
"
\n",
" \n",
" | concave points_worst | \n",
" -0.791976 | \n",
" 0.741494 | \n",
" 0.293490 | \n",
" 0.768429 | \n",
" 0.719419 | \n",
" 0.483773 | \n",
" 0.813038 | \n",
" 0.859451 | \n",
" 0.905058 | \n",
" 0.428627 | \n",
" ... | \n",
" 0.784946 | \n",
" 0.358467 | \n",
" 0.813826 | \n",
" 0.745090 | \n",
" 0.542461 | \n",
" 0.800307 | \n",
" 0.855035 | \n",
" 1.000000 | \n",
" 0.502487 | \n",
" 0.510454 | \n",
"
\n",
" \n",
" | symmetry_worst | \n",
" -0.417123 | \n",
" 0.164409 | \n",
" 0.105824 | \n",
" 0.189385 | \n",
" 0.143762 | \n",
" 0.391210 | \n",
" 0.508330 | \n",
" 0.409733 | \n",
" 0.370446 | \n",
" 0.698086 | \n",
" ... | \n",
" 0.244034 | \n",
" 0.234337 | \n",
" 0.269788 | \n",
" 0.209443 | \n",
" 0.486361 | \n",
" 0.614717 | \n",
" 0.530509 | \n",
" 0.502487 | \n",
" 1.000000 | \n",
" 0.527177 | \n",
"
\n",
" \n",
" | fractal_dimension_worst | \n",
" -0.322046 | \n",
" 0.009169 | \n",
" 0.113425 | \n",
" 0.052794 | \n",
" 0.006854 | \n",
" 0.498783 | \n",
" 0.681343 | \n",
" 0.513497 | \n",
" 0.366058 | \n",
" 0.427079 | \n",
" ... | \n",
" 0.092952 | \n",
" 0.214237 | \n",
" 0.137973 | \n",
" 0.079535 | \n",
" 0.608113 | \n",
" 0.800823 | \n",
" 0.682274 | \n",
" 0.510454 | \n",
" 0.527177 | \n",
" 1.000000 | \n",
"
\n",
" \n",
"
\n",
"
31 rows × 31 columns
\n",
"
"
],
"text/plain": [
" diagnosis radius_mean texture_mean perimeter_mean \\\n",
"diagnosis 1.000000 -0.730032 -0.415185 -0.742636 \n",
"radius_mean -0.730032 1.000000 0.323777 0.997856 \n",
"texture_mean -0.415185 0.323777 1.000000 0.329533 \n",
"perimeter_mean -0.742636 0.997856 0.329533 1.000000 \n",
"area_mean -0.708984 0.987361 0.321086 0.986507 \n",
"smoothness_mean -0.330624 0.147104 -0.040419 0.183356 \n",
"compactness_mean -0.597576 0.505622 0.237019 0.556485 \n",
"concavity_mean -0.695750 0.677041 0.300536 0.716448 \n",
"concave points_mean -0.774410 0.821646 0.287618 0.849875 \n",
"symmetry_mean -0.333477 0.151195 0.067193 0.186306 \n",
"fractal_dimension_mean 0.027269 -0.290001 -0.080337 -0.245096 \n",
"radius_se -0.567328 0.678590 0.275973 0.691268 \n",
"texture_se 0.008299 -0.097230 0.386443 -0.086669 \n",
"perimeter_se -0.556055 0.674109 0.281590 0.693069 \n",
"area_se -0.548236 0.735868 0.259844 0.744983 \n",
"smoothness_se 0.012117 -0.167456 0.019777 -0.149510 \n",
"compactness_se -0.288713 0.204341 0.192029 0.248683 \n",
"concavity_se -0.255041 0.194151 0.145536 0.228160 \n",
"concave points_se -0.329287 0.325042 0.153373 0.353151 \n",
"symmetry_se 0.025127 -0.117235 -0.017239 -0.093748 \n",
"fractal_dimension_se -0.091669 -0.009516 0.079210 0.027147 \n",
"radius_worst -0.776453 0.969541 0.352578 0.969476 \n",
"texture_worst -0.456903 0.297000 0.912045 0.303038 \n",
"perimeter_worst -0.782914 0.965139 0.358040 0.970387 \n",
"area_worst -0.733825 0.941087 0.343546 0.941550 \n",
"smoothness_worst -0.419661 0.119031 0.075885 0.149530 \n",
"compactness_worst -0.590477 0.413065 0.278112 0.455370 \n",
"concavity_worst -0.659345 0.527379 0.300041 0.564315 \n",
"concave points_worst -0.791976 0.741494 0.293490 0.768429 \n",
"symmetry_worst -0.417123 0.164409 0.105824 0.189385 \n",
"fractal_dimension_worst -0.322046 0.009169 0.113425 0.052794 \n",
"\n",
" area_mean smoothness_mean compactness_mean \\\n",
"diagnosis -0.708984 -0.330624 -0.597576 \n",
"radius_mean 0.987361 0.147104 0.505622 \n",
"texture_mean 0.321086 -0.040419 0.237019 \n",
"perimeter_mean 0.986507 0.183356 0.556485 \n",
"area_mean 1.000000 0.151671 0.498038 \n",
"smoothness_mean 0.151671 1.000000 0.640292 \n",
"compactness_mean 0.498038 0.640292 1.000000 \n",
"concavity_mean 0.686308 0.501956 0.883729 \n",
"concave points_mean 0.822500 0.528881 0.829022 \n",
"symmetry_mean 0.154603 0.544531 0.602728 \n",
"fractal_dimension_mean -0.260436 0.556840 0.506156 \n",
"radius_se 0.732125 0.277600 0.497381 \n",
"texture_se -0.066119 0.059149 0.044875 \n",
"perimeter_se 0.726564 0.272183 0.548665 \n",
"area_se 0.800086 0.222595 0.456053 \n",
"smoothness_se -0.130192 0.268150 0.133392 \n",
"compactness_se 0.211126 0.313043 0.729164 \n",
"concavity_se 0.206659 0.249132 0.571996 \n",
"concave points_se 0.322610 0.282948 0.558232 \n",
"symmetry_se -0.084347 0.216286 0.236474 \n",
"fractal_dimension_se 0.002046 0.278176 0.496677 \n",
"radius_worst 0.962745 0.189984 0.534939 \n",
"texture_worst 0.287489 0.022669 0.247564 \n",
"perimeter_worst 0.959120 0.215574 0.589944 \n",
"area_worst 0.959213 0.182905 0.509449 \n",
"smoothness_worst 0.122984 0.777084 0.559318 \n",
"compactness_worst 0.389874 0.461971 0.864209 \n",
"concavity_worst 0.512943 0.420549 0.815163 \n",
"concave points_worst 0.719419 0.483773 0.813038 \n",
"symmetry_worst 0.143762 0.391210 0.508330 \n",
"fractal_dimension_worst 0.006854 0.498783 0.681343 \n",
"\n",
" concavity_mean concave points_mean symmetry_mean \\\n",
"diagnosis -0.695750 -0.774410 -0.333477 \n",
"radius_mean 0.677041 0.821646 0.151195 \n",
"texture_mean 0.300536 0.287618 0.067193 \n",
"perimeter_mean 0.716448 0.849875 0.186306 \n",
"area_mean 0.686308 0.822500 0.154603 \n",
"smoothness_mean 0.501956 0.528881 0.544531 \n",
"compactness_mean 0.883729 0.829022 0.602728 \n",
"concavity_mean 1.000000 0.918687 0.500739 \n",
"concave points_mean 0.918687 1.000000 0.460745 \n",
"symmetry_mean 0.500739 0.460745 1.000000 \n",
"fractal_dimension_mean 0.301176 0.142307 0.411393 \n",
"radius_se 0.631258 0.699764 0.304353 \n",
"texture_se 0.075893 0.025260 0.126154 \n",
"perimeter_se 0.659685 0.712066 0.313482 \n",
"area_se 0.617299 0.690642 0.226088 \n",
"smoothness_se 0.110590 0.051658 0.153937 \n",
"compactness_se 0.665217 0.490719 0.412566 \n",
"concavity_se 0.688883 0.441037 0.337796 \n",
"concave points_se 0.582869 0.522044 0.312608 \n",
"symmetry_se 0.178725 0.098510 0.446116 \n",
"fractal_dimension_se 0.427309 0.260200 0.333318 \n",
"radius_worst 0.688703 0.828401 0.189763 \n",
"texture_worst 0.298983 0.286828 0.084924 \n",
"perimeter_worst 0.729963 0.853642 0.222908 \n",
"area_worst 0.676656 0.807978 0.181731 \n",
"smoothness_worst 0.446247 0.446157 0.424135 \n",
"compactness_worst 0.755167 0.665741 0.474981 \n",
"concavity_worst 0.883088 0.748687 0.432004 \n",
"concave points_worst 0.859451 0.905058 0.428627 \n",
"symmetry_worst 0.409733 0.370446 0.698086 \n",
"fractal_dimension_worst 0.513497 0.366058 0.427079 \n",
"\n",
" ... radius_worst texture_worst perimeter_worst \\\n",
"diagnosis ... -0.776453 -0.456903 -0.782914 \n",
"radius_mean ... 0.969541 0.297000 0.965139 \n",
"texture_mean ... 0.352578 0.912045 0.358040 \n",
"perimeter_mean ... 0.969476 0.303038 0.970387 \n",
"area_mean ... 0.962745 0.287489 0.959120 \n",
"smoothness_mean ... 0.189984 0.022669 0.215574 \n",
"compactness_mean ... 0.534939 0.247564 0.589944 \n",
"concavity_mean ... 0.688703 0.298983 0.729963 \n",
"concave points_mean ... 0.828401 0.286828 0.853642 \n",
"symmetry_mean ... 0.189763 0.084924 0.222908 \n",
"fractal_dimension_mean ... -0.242125 -0.051211 -0.199454 \n",
"radius_se ... 0.714872 0.195092 0.719492 \n",
"texture_se ... -0.111592 0.409071 -0.102161 \n",
"perimeter_se ... 0.697133 0.200270 0.720966 \n",
"area_se ... 0.757372 0.196496 0.761213 \n",
"smoothness_se ... -0.152307 -0.008034 -0.136532 \n",
"compactness_se ... 0.202027 0.146121 0.256901 \n",
"concavity_se ... 0.187371 0.102586 0.227007 \n",
"concave points_se ... 0.307288 0.079999 0.342227 \n",
"symmetry_se ... -0.142188 -0.094773 -0.117920 \n",
"fractal_dimension_se ... -0.005759 0.008842 0.033505 \n",
"radius_worst ... 1.000000 0.359925 0.993707 \n",
"texture_worst ... 0.359925 1.000000 0.365098 \n",
"perimeter_worst ... 0.993707 0.365098 1.000000 \n",
"area_worst ... 0.984014 0.345842 0.977578 \n",
"smoothness_worst ... 0.215895 0.225808 0.235168 \n",
"compactness_worst ... 0.475348 0.361123 0.528876 \n",
"concavity_worst ... 0.574562 0.367625 0.618906 \n",
"concave points_worst ... 0.784946 0.358467 0.813826 \n",
"symmetry_worst ... 0.244034 0.234337 0.269788 \n",
"fractal_dimension_worst ... 0.092952 0.214237 0.137973 \n",
"\n",
" area_worst smoothness_worst compactness_worst \\\n",
"diagnosis -0.733825 -0.419661 -0.590477 \n",
"radius_mean 0.941087 0.119031 0.413065 \n",
"texture_mean 0.343546 0.075885 0.278112 \n",
"perimeter_mean 0.941550 0.149530 0.455370 \n",
"area_mean 0.959213 0.122984 0.389874 \n",
"smoothness_mean 0.182905 0.777084 0.461971 \n",
"compactness_mean 0.509449 0.559318 0.864209 \n",
"concavity_mean 0.676656 0.446247 0.755167 \n",
"concave points_mean 0.807978 0.446157 0.665741 \n",
"symmetry_mean 0.181731 0.424135 0.474981 \n",
"fractal_dimension_mean -0.218566 0.451897 0.410627 \n",
"radius_se 0.751431 0.140187 0.286572 \n",
"texture_se -0.083078 -0.073042 -0.092846 \n",
"perimeter_se 0.730647 0.125791 0.340951 \n",
"area_se 0.811408 0.124392 0.282682 \n",
"smoothness_se -0.115951 0.309111 0.033085 \n",
"compactness_se 0.196258 0.221708 0.672119 \n",
"concavity_se 0.187836 0.172332 0.491365 \n",
"concave points_se 0.297057 0.144157 0.389143 \n",
"symmetry_se -0.123926 -0.007047 0.070591 \n",
"fractal_dimension_se -0.000367 0.167714 0.381239 \n",
"radius_worst 0.984014 0.215895 0.475348 \n",
"texture_worst 0.345842 0.225808 0.361123 \n",
"perimeter_worst 0.977578 0.235168 0.528876 \n",
"area_worst 1.000000 0.209064 0.437727 \n",
"smoothness_worst 0.209064 1.000000 0.563176 \n",
"compactness_worst 0.437727 0.563176 1.000000 \n",
"concavity_worst 0.543774 0.513958 0.892859 \n",
"concave points_worst 0.745090 0.542461 0.800307 \n",
"symmetry_worst 0.209443 0.486361 0.614717 \n",
"fractal_dimension_worst 0.079535 0.608113 0.800823 \n",
"\n",
" concavity_worst concave points_worst \\\n",
"diagnosis -0.659345 -0.791976 \n",
"radius_mean 0.527379 0.741494 \n",
"texture_mean 0.300041 0.293490 \n",
"perimeter_mean 0.564315 0.768429 \n",
"area_mean 0.512943 0.719419 \n",
"smoothness_mean 0.420549 0.483773 \n",
"compactness_mean 0.815163 0.813038 \n",
"concavity_mean 0.883088 0.859451 \n",
"concave points_mean 0.748687 0.905058 \n",
"symmetry_mean 0.432004 0.428627 \n",
"fractal_dimension_mean 0.305228 0.148275 \n",
"radius_se 0.380110 0.528842 \n",
"texture_se -0.070533 -0.120938 \n",
"perimeter_se 0.418269 0.552873 \n",
"area_se 0.384820 0.535417 \n",
"smoothness_se 0.018581 -0.027878 \n",
"compactness_se 0.633328 0.477739 \n",
"concavity_se 0.662437 0.443167 \n",
"concave points_se 0.463453 0.502092 \n",
"symmetry_se 0.035520 -0.032924 \n",
"fractal_dimension_se 0.363578 0.203468 \n",
"radius_worst 0.574562 0.784946 \n",
"texture_worst 0.367625 0.358467 \n",
"perimeter_worst 0.618906 0.813826 \n",
"area_worst 0.543774 0.745090 \n",
"smoothness_worst 0.513958 0.542461 \n",
"compactness_worst 0.892859 0.800307 \n",
"concavity_worst 1.000000 0.855035 \n",
"concave points_worst 0.855035 1.000000 \n",
"symmetry_worst 0.530509 0.502487 \n",
"fractal_dimension_worst 0.682274 0.510454 \n",
"\n",
" symmetry_worst fractal_dimension_worst \n",
"diagnosis -0.417123 -0.322046 \n",
"radius_mean 0.164409 0.009169 \n",
"texture_mean 0.105824 0.113425 \n",
"perimeter_mean 0.189385 0.052794 \n",
"area_mean 0.143762 0.006854 \n",
"smoothness_mean 0.391210 0.498783 \n",
"compactness_mean 0.508330 0.681343 \n",
"concavity_mean 0.409733 0.513497 \n",
"concave points_mean 0.370446 0.366058 \n",
"symmetry_mean 0.698086 0.427079 \n",
"fractal_dimension_mean 0.283061 0.701592 \n",
"radius_se 0.094549 0.049351 \n",
"texture_se -0.127607 -0.044761 \n",
"perimeter_se 0.108877 0.083512 \n",
"area_se 0.072572 0.016228 \n",
"smoothness_se -0.032829 0.119089 \n",
"compactness_se 0.271770 0.574867 \n",
"concavity_se 0.199593 0.445392 \n",
"concave points_se 0.126508 0.237510 \n",
"symmetry_se 0.382952 0.084610 \n",
"fractal_dimension_se 0.098792 0.522940 \n",
"radius_worst 0.244034 0.092952 \n",
"texture_worst 0.234337 0.214237 \n",
"perimeter_worst 0.269788 0.137973 \n",
"area_worst 0.209443 0.079535 \n",
"smoothness_worst 0.486361 0.608113 \n",
"compactness_worst 0.614717 0.800823 \n",
"concavity_worst 0.530509 0.682274 \n",
"concave points_worst 0.502487 0.510454 \n",
"symmetry_worst 1.000000 0.527177 \n",
"fractal_dimension_worst 0.527177 1.000000 \n",
"\n",
"[31 rows x 31 columns]"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.corr()"
]
},
{
"cell_type": "markdown",
"id": "ee2f0278",
"metadata": {},
"source": [
"# Model Loading\n",
"In this classification task, we use scikit-learn's algorithms to predict the diagnosis labels M and B (now encoded as 0 and 1).\n",
" \n",
"Because the library provides many models, we can train several of them and compare their results later."
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "4b48f208",
"metadata": {},
"outputs": [],
"source": [
"from sklearn.model_selection import train_test_split\n",
"from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "d7b5908a",
"metadata": {},
"outputs": [],
"source": [
"from sklearn.ensemble import RandomForestClassifier\n",
"from sklearn.svm import SVC\n",
"from sklearn.neighbors import KNeighborsClassifier\n",
"from sklearn.linear_model import LogisticRegression\n",
"from sklearn.tree import DecisionTreeClassifier\n",
"from sklearn.naive_bayes import GaussianNB\n",
"from sklearn.ensemble import AdaBoostClassifier\n",
"from sklearn.ensemble import GradientBoostingClassifier"
]
},
{
"cell_type": "markdown",
"id": "3d50df8d",
"metadata": {},
"source": [
"Splitting the dataset into Training and Test Sets"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "e32ec855",
"metadata": {},
"outputs": [],
"source": [
"# Separate the target column (y) from the predictor columns (X)\n",
"y = df['diagnosis']\n",
"X = df.drop('diagnosis', axis=1)\n",
"\n",
"# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible\n",
"X_train, X_test, y_train, y_test = train_test_split(\n",
"    X,\n",
"    y,\n",
"    random_state=42,\n",
"    test_size=0.2,\n",
")\n",
"\n",
"# One empty list per reported column; the evaluation loop appends one entry per model\n",
"results = {column: [] for column in ('Model', 'F1_score', 'Accuracy', 'Precision', 'Recall')}"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "28ebe0cf",
"metadata": {},
"outputs": [],
"source": [
"# Shared seed for every estimator that has internal randomness, so the reported\n",
"# scores are the same on each Restart & Run All.\n",
"RANDOM_STATE = 42\n",
"\n",
"# Candidate classifiers keyed by the display name used in the results table.\n",
"models = {\n",
"    'Random Forest': RandomForestClassifier(random_state=RANDOM_STATE),\n",
"    'Support Vector Machine': SVC(),\n",
"    'K-Nearest Neighbors': KNeighborsClassifier(),\n",
"    # The default iteration limit made lbfgs stop early with a ConvergenceWarning;\n",
"    # a higher max_iter lets the solver run to convergence.\n",
"    'Logistic Regression': LogisticRegression(max_iter=10000),\n",
"    'Decision Tree': DecisionTreeClassifier(random_state=RANDOM_STATE),\n",
"    'Naive Bayes': GaussianNB(),\n",
"    'AdaBoost': AdaBoostClassifier(random_state=RANDOM_STATE),\n",
"    'Gradient Boosting': GradientBoostingClassifier(random_state=RANDOM_STATE)\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "17781adc",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\sang.yogi\\Anaconda3\\lib\\site-packages\\sklearn\\linear_model\\_logistic.py:814: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
"STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n",
"\n",
"Increase the number of iterations (max_iter) or scale the data as shown in:\n",
" https://scikit-learn.org/stable/modules/preprocessing.html\n",
"Please also refer to the documentation for alternative solver options:\n",
" https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
" n_iter_i = _check_optimize_result(\n"
]
}
],
"source": [
"# Start from an empty container on every run so that re-executing this cell\n",
"# replaces the previous scores instead of appending duplicate rows.\n",
"results = {'Model': [], 'F1_score': [], 'Accuracy': [], 'Precision': [], 'Recall': []}\n",
"\n",
"for model_name, model in models.items():\n",
"    # Train on the training split, then predict the held-out test split\n",
"    model.fit(X_train, y_train)\n",
"    y_pred = model.predict(X_test)\n",
"\n",
"    # NOTE(review): f1/precision/recall score sklearn's default positive class\n",
"    # (label 1); confirm that this is the diagnosis class you intend to report.\n",
"    results['Model'].append(model_name)\n",
"    results['F1_score'].append(f1_score(y_test, y_pred))\n",
"    results['Accuracy'].append(accuracy_score(y_test, y_pred))\n",
"    results['Precision'].append(precision_score(y_test, y_pred))\n",
"    results['Recall'].append(recall_score(y_test, y_pred))\n",
"\n",
"# One row per model, one column per metric\n",
"results_df = pd.DataFrame(results)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "c98e51c8",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Model | \n",
" F1_score | \n",
" Accuracy | \n",
" Precision | \n",
" Recall | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" Random Forest | \n",
" 0.972222 | \n",
" 0.964912 | \n",
" 0.958904 | \n",
" 0.985915 | \n",
"
\n",
" \n",
" | 1 | \n",
" Support Vector Machine | \n",
" 0.959459 | \n",
" 0.947368 | \n",
" 0.922078 | \n",
" 1.000000 | \n",
"
\n",
" \n",
" | 2 | \n",
" K-Nearest Neighbors | \n",
" 0.965986 | \n",
" 0.956140 | \n",
" 0.934211 | \n",
" 1.000000 | \n",
"
\n",
" \n",
" | 3 | \n",
" Random Forest | \n",
" 0.972222 | \n",
" 0.964912 | \n",
" 0.958904 | \n",
" 0.985915 | \n",
"
\n",
" \n",
" | 4 | \n",
" Support Vector Machine | \n",
" 0.959459 | \n",
" 0.947368 | \n",
" 0.922078 | \n",
" 1.000000 | \n",
"
\n",
" \n",
" | 5 | \n",
" K-Nearest Neighbors | \n",
" 0.965986 | \n",
" 0.956140 | \n",
" 0.934211 | \n",
" 1.000000 | \n",
"
\n",
" \n",
" | 6 | \n",
" Logistic Regression | \n",
" 0.972222 | \n",
" 0.964912 | \n",
" 0.958904 | \n",
" 0.985915 | \n",
"
\n",
" \n",
" | 7 | \n",
" Decision Tree | \n",
" 0.951049 | \n",
" 0.938596 | \n",
" 0.944444 | \n",
" 0.957746 | \n",
"
\n",
" \n",
" | 8 | \n",
" Naive Bayes | \n",
" 0.979310 | \n",
" 0.973684 | \n",
" 0.959459 | \n",
" 1.000000 | \n",
"
\n",
" \n",
" | 9 | \n",
" AdaBoost | \n",
" 0.957746 | \n",
" 0.947368 | \n",
" 0.957746 | \n",
" 0.957746 | \n",
"
\n",
" \n",
" | 10 | \n",
" Gradient Boosting | \n",
" 0.972222 | \n",
" 0.964912 | \n",
" 0.958904 | \n",
" 0.985915 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Model F1_score Accuracy Precision Recall\n",
"0 Random Forest 0.972222 0.964912 0.958904 0.985915\n",
"1 Support Vector Machine 0.959459 0.947368 0.922078 1.000000\n",
"2 K-Nearest Neighbors 0.965986 0.956140 0.934211 1.000000\n",
"3 Random Forest 0.972222 0.964912 0.958904 0.985915\n",
"4 Support Vector Machine 0.959459 0.947368 0.922078 1.000000\n",
"5 K-Nearest Neighbors 0.965986 0.956140 0.934211 1.000000\n",
"6 Logistic Regression 0.972222 0.964912 0.958904 0.985915\n",
"7 Decision Tree 0.951049 0.938596 0.944444 0.957746\n",
"8 Naive Bayes 0.979310 0.973684 0.959459 1.000000\n",
"9 AdaBoost 0.957746 0.947368 0.957746 0.957746\n",
"10 Gradient Boosting 0.972222 0.964912 0.958904 0.985915"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"results_df"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "212e9b94",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}