diff --git a/Classification/Breast_Cancer_Wisconsin.ipynb b/Classification/Breast_Cancer_Wisconsin.ipynb new file mode 100644 index 0000000..6f7b4db --- /dev/null +++ b/Classification/Breast_Cancer_Wisconsin.ipynb @@ -0,0 +1,2802 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "bd00ebdc", + "metadata": {}, + "source": [ + "# Load Basic Libraries\n", + " Load some libraries to read and display the data" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "375f1a0c", + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "b8760991", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
iddiagnosisradius_meantexture_meanperimeter_meanarea_meansmoothness_meancompactness_meanconcavity_meanconcave points_mean...texture_worstperimeter_worstarea_worstsmoothness_worstcompactness_worstconcavity_worstconcave points_worstsymmetry_worstfractal_dimension_worstUnnamed: 32
0842302M17.9910.38122.801001.00.118400.277600.30010.14710...17.33184.602019.00.16220.66560.71190.26540.46010.11890NaN
1842517M20.5717.77132.901326.00.084740.078640.08690.07017...23.41158.801956.00.12380.18660.24160.18600.27500.08902NaN
284300903M19.6921.25130.001203.00.109600.159900.19740.12790...25.53152.501709.00.14440.42450.45040.24300.36130.08758NaN
384348301M11.4220.3877.58386.10.142500.283900.24140.10520...26.5098.87567.70.20980.86630.68690.25750.66380.17300NaN
484358402M20.2914.34135.101297.00.100300.132800.19800.10430...16.67152.201575.00.13740.20500.40000.16250.23640.07678NaN
\n", + "

5 rows × 33 columns

\n", + "
" + ], + "text/plain": [ + " id diagnosis radius_mean texture_mean perimeter_mean area_mean \\\n", + "0 842302 M 17.99 10.38 122.80 1001.0 \n", + "1 842517 M 20.57 17.77 132.90 1326.0 \n", + "2 84300903 M 19.69 21.25 130.00 1203.0 \n", + "3 84348301 M 11.42 20.38 77.58 386.1 \n", + "4 84358402 M 20.29 14.34 135.10 1297.0 \n", + "\n", + " smoothness_mean compactness_mean concavity_mean concave points_mean \\\n", + "0 0.11840 0.27760 0.3001 0.14710 \n", + "1 0.08474 0.07864 0.0869 0.07017 \n", + "2 0.10960 0.15990 0.1974 0.12790 \n", + "3 0.14250 0.28390 0.2414 0.10520 \n", + "4 0.10030 0.13280 0.1980 0.10430 \n", + "\n", + " ... texture_worst perimeter_worst area_worst smoothness_worst \\\n", + "0 ... 17.33 184.60 2019.0 0.1622 \n", + "1 ... 23.41 158.80 1956.0 0.1238 \n", + "2 ... 25.53 152.50 1709.0 0.1444 \n", + "3 ... 26.50 98.87 567.7 0.2098 \n", + "4 ... 16.67 152.20 1575.0 0.1374 \n", + "\n", + " compactness_worst concavity_worst concave points_worst symmetry_worst \\\n", + "0 0.6656 0.7119 0.2654 0.4601 \n", + "1 0.1866 0.2416 0.1860 0.2750 \n", + "2 0.4245 0.4504 0.2430 0.3613 \n", + "3 0.8663 0.6869 0.2575 0.6638 \n", + "4 0.2050 0.4000 0.1625 0.2364 \n", + "\n", + " fractal_dimension_worst Unnamed: 32 \n", + "0 0.11890 NaN \n", + "1 0.08902 NaN \n", + "2 0.08758 NaN \n", + "3 0.17300 NaN \n", + "4 0.07678 NaN \n", + "\n", + "[5 rows x 33 columns]" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = pd.read_csv(\"https://raw.githubusercontent.com/youronlydimwit/Data_ScienceUse_Cases/main/Classification/Data/Breast_Cancer_Wisconsin.csv\")\n", + "df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "6eb25bbe", + "metadata": {}, + "source": [ + "# Preliminary check for missing values\n", + "Define a function that outputs a DataFrame with the following columns:\n", + "- Column Name\n", + "- The Data type\n", + "- Count of missing data (Nulls)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + 
"id": "72bb3dcc", + "metadata": {}, + "outputs": [], + "source": [ + "# Missing data check function\n", + "def completeness_check(input_df):\n", + " # Create a new DataFrame\n", + " summary_df = pd.DataFrame(columns=['Column_Name', 'Data_Type', 'Missing_Data'])\n", + "\n", + " # Fill in the data\n", + " summary_df['Column_Name'] = input_df.columns\n", + " summary_df['Data_Type'] = input_df.dtypes.values\n", + " summary_df['Missing_Data'] = input_df.isnull().sum().values\n", + "\n", + " return summary_df" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "dd4b5d0f", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Column_NameData_TypeMissing_Data
0idint640
1diagnosisobject0
2radius_meanfloat640
3texture_meanfloat640
4perimeter_meanfloat640
5area_meanfloat640
6smoothness_meanfloat640
7compactness_meanfloat640
8concavity_meanfloat640
9concave points_meanfloat640
10symmetry_meanfloat640
11fractal_dimension_meanfloat640
12radius_sefloat640
13texture_sefloat640
14perimeter_sefloat640
15area_sefloat640
16smoothness_sefloat640
17compactness_sefloat640
18concavity_sefloat640
19concave points_sefloat640
20symmetry_sefloat640
21fractal_dimension_sefloat640
22radius_worstfloat640
23texture_worstfloat640
24perimeter_worstfloat640
25area_worstfloat640
26smoothness_worstfloat640
27compactness_worstfloat640
28concavity_worstfloat640
29concave points_worstfloat640
30symmetry_worstfloat640
31fractal_dimension_worstfloat640
32Unnamed: 32float64569
\n", + "
" + ], + "text/plain": [ + " Column_Name Data_Type Missing_Data\n", + "0 id int64 0\n", + "1 diagnosis object 0\n", + "2 radius_mean float64 0\n", + "3 texture_mean float64 0\n", + "4 perimeter_mean float64 0\n", + "5 area_mean float64 0\n", + "6 smoothness_mean float64 0\n", + "7 compactness_mean float64 0\n", + "8 concavity_mean float64 0\n", + "9 concave points_mean float64 0\n", + "10 symmetry_mean float64 0\n", + "11 fractal_dimension_mean float64 0\n", + "12 radius_se float64 0\n", + "13 texture_se float64 0\n", + "14 perimeter_se float64 0\n", + "15 area_se float64 0\n", + "16 smoothness_se float64 0\n", + "17 compactness_se float64 0\n", + "18 concavity_se float64 0\n", + "19 concave points_se float64 0\n", + "20 symmetry_se float64 0\n", + "21 fractal_dimension_se float64 0\n", + "22 radius_worst float64 0\n", + "23 texture_worst float64 0\n", + "24 perimeter_worst float64 0\n", + "25 area_worst float64 0\n", + "26 smoothness_worst float64 0\n", + "27 compactness_worst float64 0\n", + "28 concavity_worst float64 0\n", + "29 concave points_worst float64 0\n", + "30 symmetry_worst float64 0\n", + "31 fractal_dimension_worst float64 0\n", + "32 Unnamed: 32 float64 569" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "completeness_check(df)" + ] + }, + { + "cell_type": "markdown", + "id": "d91cd816", + "metadata": {}, + "source": [ + "Drop the columns that are not needed in this context: **'id'** and **'Unnamed: 32'** (the latter is missing in all 569 rows)." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "6ce4e125", + "metadata": {}, + "outputs": [], + "source": [ + "# We don't need these 2 columns\n", + "df = df.drop(['id','Unnamed: 32'], axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "35e3462f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
diagnosisradius_meantexture_meanperimeter_meanarea_meansmoothness_meancompactness_meanconcavity_meanconcave points_meansymmetry_mean...radius_worsttexture_worstperimeter_worstarea_worstsmoothness_worstcompactness_worstconcavity_worstconcave points_worstsymmetry_worstfractal_dimension_worst
0M17.9910.38122.801001.00.118400.277600.30010.147100.2419...25.3817.33184.602019.00.16220.66560.71190.26540.46010.11890
1M20.5717.77132.901326.00.084740.078640.08690.070170.1812...24.9923.41158.801956.00.12380.18660.24160.18600.27500.08902
2M19.6921.25130.001203.00.109600.159900.19740.127900.2069...23.5725.53152.501709.00.14440.42450.45040.24300.36130.08758
3M11.4220.3877.58386.10.142500.283900.24140.105200.2597...14.9126.5098.87567.70.20980.86630.68690.25750.66380.17300
4M20.2914.34135.101297.00.100300.132800.19800.104300.1809...22.5416.67152.201575.00.13740.20500.40000.16250.23640.07678
\n", + "

5 rows × 31 columns

\n", + "
" + ], + "text/plain": [ + " diagnosis radius_mean texture_mean perimeter_mean area_mean \\\n", + "0 M 17.99 10.38 122.80 1001.0 \n", + "1 M 20.57 17.77 132.90 1326.0 \n", + "2 M 19.69 21.25 130.00 1203.0 \n", + "3 M 11.42 20.38 77.58 386.1 \n", + "4 M 20.29 14.34 135.10 1297.0 \n", + "\n", + " smoothness_mean compactness_mean concavity_mean concave points_mean \\\n", + "0 0.11840 0.27760 0.3001 0.14710 \n", + "1 0.08474 0.07864 0.0869 0.07017 \n", + "2 0.10960 0.15990 0.1974 0.12790 \n", + "3 0.14250 0.28390 0.2414 0.10520 \n", + "4 0.10030 0.13280 0.1980 0.10430 \n", + "\n", + " symmetry_mean ... radius_worst texture_worst perimeter_worst \\\n", + "0 0.2419 ... 25.38 17.33 184.60 \n", + "1 0.1812 ... 24.99 23.41 158.80 \n", + "2 0.2069 ... 23.57 25.53 152.50 \n", + "3 0.2597 ... 14.91 26.50 98.87 \n", + "4 0.1809 ... 22.54 16.67 152.20 \n", + "\n", + " area_worst smoothness_worst compactness_worst concavity_worst \\\n", + "0 2019.0 0.1622 0.6656 0.7119 \n", + "1 1956.0 0.1238 0.1866 0.2416 \n", + "2 1709.0 0.1444 0.4245 0.4504 \n", + "3 567.7 0.2098 0.8663 0.6869 \n", + "4 1575.0 0.1374 0.2050 0.4000 \n", + "\n", + " concave points_worst symmetry_worst fractal_dimension_worst \n", + "0 0.2654 0.4601 0.11890 \n", + "1 0.1860 0.2750 0.08902 \n", + "2 0.2430 0.3613 0.08758 \n", + "3 0.2575 0.6638 0.17300 \n", + "4 0.1625 0.2364 0.07678 \n", + "\n", + "[5 rows x 31 columns]" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "4b2e562f", + "metadata": {}, + "source": [ + "# Reformatting\n", + "According to the source, the data is assumed to be already cleaned. The _float_ columns in this dataset are then rounded to two decimal places (**#.##**) for easier reading; note that this rounds the stored values themselves, not only how they are displayed." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "b7eca327", + "metadata": {}, + "outputs": [], + "source": [ + "def format_float_columns(df):\n", + " # Get columns with float64 dtype\n", + " float_columns = df.select_dtypes(include='float64').columns\n", + " \n", + " # Format each column without changing the data type\n", + " for col in float_columns:\n", + " df[col] = df[col].round(2)\n", + " \n", + " return df" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "adc5f2f3", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
diagnosisradius_meantexture_meanperimeter_meanarea_meansmoothness_meancompactness_meanconcavity_meanconcave points_meansymmetry_mean...radius_worsttexture_worstperimeter_worstarea_worstsmoothness_worstcompactness_worstconcavity_worstconcave points_worstsymmetry_worstfractal_dimension_worst
0M17.9910.38122.801001.00.120.280.300.150.24...25.3817.33184.602019.00.160.670.710.270.460.12
1M20.5717.77132.901326.00.080.080.090.070.18...24.9923.41158.801956.00.120.190.240.190.280.09
2M19.6921.25130.001203.00.110.160.200.130.21...23.5725.53152.501709.00.140.420.450.240.360.09
3M11.4220.3877.58386.10.140.280.240.110.26...14.9126.5098.87567.70.210.870.690.260.660.17
4M20.2914.34135.101297.00.100.130.200.100.18...22.5416.67152.201575.00.140.200.400.160.240.08
..................................................................
564M21.5622.39142.001479.00.110.120.240.140.17...25.4526.40166.102027.00.140.210.410.220.210.07
565M20.1328.25131.201261.00.100.100.140.100.18...23.6938.25155.001731.00.120.190.320.160.260.07
566M16.6028.08108.30858.10.080.100.090.050.16...18.9834.12126.701124.00.110.310.340.140.220.08
567M20.6029.33140.101265.00.120.280.350.150.24...25.7439.42184.601821.00.160.870.940.260.410.12
568B7.7624.5447.92181.00.050.040.000.000.16...9.4630.3759.16268.60.090.060.000.000.290.07
\n", + "

569 rows × 31 columns

\n", + "
" + ], + "text/plain": [ + " diagnosis radius_mean texture_mean perimeter_mean area_mean \\\n", + "0 M 17.99 10.38 122.80 1001.0 \n", + "1 M 20.57 17.77 132.90 1326.0 \n", + "2 M 19.69 21.25 130.00 1203.0 \n", + "3 M 11.42 20.38 77.58 386.1 \n", + "4 M 20.29 14.34 135.10 1297.0 \n", + ".. ... ... ... ... ... \n", + "564 M 21.56 22.39 142.00 1479.0 \n", + "565 M 20.13 28.25 131.20 1261.0 \n", + "566 M 16.60 28.08 108.30 858.1 \n", + "567 M 20.60 29.33 140.10 1265.0 \n", + "568 B 7.76 24.54 47.92 181.0 \n", + "\n", + " smoothness_mean compactness_mean concavity_mean concave points_mean \\\n", + "0 0.12 0.28 0.30 0.15 \n", + "1 0.08 0.08 0.09 0.07 \n", + "2 0.11 0.16 0.20 0.13 \n", + "3 0.14 0.28 0.24 0.11 \n", + "4 0.10 0.13 0.20 0.10 \n", + ".. ... ... ... ... \n", + "564 0.11 0.12 0.24 0.14 \n", + "565 0.10 0.10 0.14 0.10 \n", + "566 0.08 0.10 0.09 0.05 \n", + "567 0.12 0.28 0.35 0.15 \n", + "568 0.05 0.04 0.00 0.00 \n", + "\n", + " symmetry_mean ... radius_worst texture_worst perimeter_worst \\\n", + "0 0.24 ... 25.38 17.33 184.60 \n", + "1 0.18 ... 24.99 23.41 158.80 \n", + "2 0.21 ... 23.57 25.53 152.50 \n", + "3 0.26 ... 14.91 26.50 98.87 \n", + "4 0.18 ... 22.54 16.67 152.20 \n", + ".. ... ... ... ... ... \n", + "564 0.17 ... 25.45 26.40 166.10 \n", + "565 0.18 ... 23.69 38.25 155.00 \n", + "566 0.16 ... 18.98 34.12 126.70 \n", + "567 0.24 ... 25.74 39.42 184.60 \n", + "568 0.16 ... 9.46 30.37 59.16 \n", + "\n", + " area_worst smoothness_worst compactness_worst concavity_worst \\\n", + "0 2019.0 0.16 0.67 0.71 \n", + "1 1956.0 0.12 0.19 0.24 \n", + "2 1709.0 0.14 0.42 0.45 \n", + "3 567.7 0.21 0.87 0.69 \n", + "4 1575.0 0.14 0.20 0.40 \n", + ".. ... ... ... ... 
\n", + "564 2027.0 0.14 0.21 0.41 \n", + "565 1731.0 0.12 0.19 0.32 \n", + "566 1124.0 0.11 0.31 0.34 \n", + "567 1821.0 0.16 0.87 0.94 \n", + "568 268.6 0.09 0.06 0.00 \n", + "\n", + " concave points_worst symmetry_worst fractal_dimension_worst \n", + "0 0.27 0.46 0.12 \n", + "1 0.19 0.28 0.09 \n", + "2 0.24 0.36 0.09 \n", + "3 0.26 0.66 0.17 \n", + "4 0.16 0.24 0.08 \n", + ".. ... ... ... \n", + "564 0.22 0.21 0.07 \n", + "565 0.16 0.26 0.07 \n", + "566 0.14 0.22 0.08 \n", + "567 0.26 0.41 0.12 \n", + "568 0.00 0.29 0.07 \n", + "\n", + "[569 rows x 31 columns]" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "format_float_columns(df)" + ] + }, + { + "cell_type": "markdown", + "id": "c9aaef3a", + "metadata": {}, + "source": [ + "# Encoding\n", + "Since the label to be predicted only takes two values, **M for Malignant** and **B for Benign**, we can encode them as 0 and 1 respectively.\n", + "The encoded column is also cast to float, so that every column has the same data type." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "60870d9a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
diagnosisradius_meantexture_meanperimeter_meanarea_meansmoothness_meancompactness_meanconcavity_meanconcave points_meansymmetry_mean...radius_worsttexture_worstperimeter_worstarea_worstsmoothness_worstcompactness_worstconcavity_worstconcave points_worstsymmetry_worstfractal_dimension_worst
00.017.9910.38122.81001.00.120.280.300.150.24...25.3817.33184.62019.00.160.670.710.270.460.12
10.020.5717.77132.91326.00.080.080.090.070.18...24.9923.41158.81956.00.120.190.240.190.280.09
20.019.6921.25130.01203.00.110.160.200.130.21...23.5725.53152.51709.00.140.420.450.240.360.09
\n", + "

3 rows × 31 columns

\n", + "
" + ], + "text/plain": [ + " diagnosis radius_mean texture_mean perimeter_mean area_mean \\\n", + "0 0.0 17.99 10.38 122.8 1001.0 \n", + "1 0.0 20.57 17.77 132.9 1326.0 \n", + "2 0.0 19.69 21.25 130.0 1203.0 \n", + "\n", + " smoothness_mean compactness_mean concavity_mean concave points_mean \\\n", + "0 0.12 0.28 0.30 0.15 \n", + "1 0.08 0.08 0.09 0.07 \n", + "2 0.11 0.16 0.20 0.13 \n", + "\n", + " symmetry_mean ... radius_worst texture_worst perimeter_worst \\\n", + "0 0.24 ... 25.38 17.33 184.6 \n", + "1 0.18 ... 24.99 23.41 158.8 \n", + "2 0.21 ... 23.57 25.53 152.5 \n", + "\n", + " area_worst smoothness_worst compactness_worst concavity_worst \\\n", + "0 2019.0 0.16 0.67 0.71 \n", + "1 1956.0 0.12 0.19 0.24 \n", + "2 1709.0 0.14 0.42 0.45 \n", + "\n", + " concave points_worst symmetry_worst fractal_dimension_worst \n", + "0 0.27 0.46 0.12 \n", + "1 0.19 0.28 0.09 \n", + "2 0.24 0.36 0.09 \n", + "\n", + "[3 rows x 31 columns]" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Replace 'M' with 0 and 'B' with 1 in the 'diagnosis' column\n", + "df['diagnosis'] = df['diagnosis'].replace({'M': 0, 'B': 1})\n", + "df['diagnosis'] = df['diagnosis'].astype(float)\n", + "df.head(3)" + ] + }, + { + "cell_type": "markdown", + "id": "ecbe4b22", + "metadata": {}, + "source": [ + "Confirm that the data type is now correct (float64)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "7e437274", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dtype('float64')" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['diagnosis'].dtypes" + ] + }, + { + "cell_type": "markdown", + "id": "5b18b919", + "metadata": {}, + "source": [ + "Running a correlation matrix to get a glimpse of which features are most strongly correlated with each other." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "2c3b26b6", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
diagnosisradius_meantexture_meanperimeter_meanarea_meansmoothness_meancompactness_meanconcavity_meanconcave points_meansymmetry_mean...radius_worsttexture_worstperimeter_worstarea_worstsmoothness_worstcompactness_worstconcavity_worstconcave points_worstsymmetry_worstfractal_dimension_worst
diagnosis1.000000-0.730032-0.415185-0.742636-0.708984-0.330624-0.597576-0.695750-0.774410-0.333477...-0.776453-0.456903-0.782914-0.733825-0.419661-0.590477-0.659345-0.791976-0.417123-0.322046
radius_mean-0.7300321.0000000.3237770.9978560.9873610.1471040.5056220.6770410.8216460.151195...0.9695410.2970000.9651390.9410870.1190310.4130650.5273790.7414940.1644090.009169
texture_mean-0.4151850.3237771.0000000.3295330.321086-0.0404190.2370190.3005360.2876180.067193...0.3525780.9120450.3580400.3435460.0758850.2781120.3000410.2934900.1058240.113425
perimeter_mean-0.7426360.9978560.3295331.0000000.9865070.1833560.5564850.7164480.8498750.186306...0.9694760.3030380.9703870.9415500.1495300.4553700.5643150.7684290.1893850.052794
area_mean-0.7089840.9873610.3210860.9865071.0000000.1516710.4980380.6863080.8225000.154603...0.9627450.2874890.9591200.9592130.1229840.3898740.5129430.7194190.1437620.006854
smoothness_mean-0.3306240.147104-0.0404190.1833560.1516711.0000000.6402920.5019560.5288810.544531...0.1899840.0226690.2155740.1829050.7770840.4619710.4205490.4837730.3912100.498783
compactness_mean-0.5975760.5056220.2370190.5564850.4980380.6402921.0000000.8837290.8290220.602728...0.5349390.2475640.5899440.5094490.5593180.8642090.8151630.8130380.5083300.681343
concavity_mean-0.6957500.6770410.3005360.7164480.6863080.5019560.8837291.0000000.9186870.500739...0.6887030.2989830.7299630.6766560.4462470.7551670.8830880.8594510.4097330.513497
concave points_mean-0.7744100.8216460.2876180.8498750.8225000.5288810.8290220.9186871.0000000.460745...0.8284010.2868280.8536420.8079780.4461570.6657410.7486870.9050580.3704460.366058
symmetry_mean-0.3334770.1511950.0671930.1863060.1546030.5445310.6027280.5007390.4607451.000000...0.1897630.0849240.2229080.1817310.4241350.4749810.4320040.4286270.6980860.427079
fractal_dimension_mean0.027269-0.290001-0.080337-0.245096-0.2604360.5568400.5061560.3011760.1423070.411393...-0.242125-0.051211-0.199454-0.2185660.4518970.4106270.3052280.1482750.2830610.701592
radius_se-0.5673280.6785900.2759730.6912680.7321250.2776000.4973810.6312580.6997640.304353...0.7148720.1950920.7194920.7514310.1401870.2865720.3801100.5288420.0945490.049351
texture_se0.008299-0.0972300.386443-0.086669-0.0661190.0591490.0448750.0758930.0252600.126154...-0.1115920.409071-0.102161-0.083078-0.073042-0.092846-0.070533-0.120938-0.127607-0.044761
perimeter_se-0.5560550.6741090.2815900.6930690.7265640.2721830.5486650.6596850.7120660.313482...0.6971330.2002700.7209660.7306470.1257910.3409510.4182690.5528730.1088770.083512
area_se-0.5482360.7358680.2598440.7449830.8000860.2225950.4560530.6172990.6906420.226088...0.7573720.1964960.7612130.8114080.1243920.2826820.3848200.5354170.0725720.016228
smoothness_se0.012117-0.1674560.019777-0.149510-0.1301920.2681500.1333920.1105900.0516580.153937...-0.152307-0.008034-0.136532-0.1159510.3091110.0330850.018581-0.027878-0.0328290.119089
compactness_se-0.2887130.2043410.1920290.2486830.2111260.3130430.7291640.6652170.4907190.412566...0.2020270.1461210.2569010.1962580.2217080.6721190.6333280.4777390.2717700.574867
concavity_se-0.2550410.1941510.1455360.2281600.2066590.2491320.5719960.6888830.4410370.337796...0.1873710.1025860.2270070.1878360.1723320.4913650.6624370.4431670.1995930.445392
concave points_se-0.3292870.3250420.1533730.3531510.3226100.2829480.5582320.5828690.5220440.312608...0.3072880.0799990.3422270.2970570.1441570.3891430.4634530.5020920.1265080.237510
symmetry_se0.025127-0.117235-0.017239-0.093748-0.0843470.2162860.2364740.1787250.0985100.446116...-0.142188-0.094773-0.117920-0.123926-0.0070470.0705910.035520-0.0329240.3829520.084610
fractal_dimension_se-0.091669-0.0095160.0792100.0271470.0020460.2781760.4966770.4273090.2602000.333318...-0.0057590.0088420.033505-0.0003670.1677140.3812390.3635780.2034680.0987920.522940
radius_worst-0.7764530.9695410.3525780.9694760.9627450.1899840.5349390.6887030.8284010.189763...1.0000000.3599250.9937070.9840140.2158950.4753480.5745620.7849460.2440340.092952
texture_worst-0.4569030.2970000.9120450.3030380.2874890.0226690.2475640.2989830.2868280.084924...0.3599251.0000000.3650980.3458420.2258080.3611230.3676250.3584670.2343370.214237
perimeter_worst-0.7829140.9651390.3580400.9703870.9591200.2155740.5899440.7299630.8536420.222908...0.9937070.3650981.0000000.9775780.2351680.5288760.6189060.8138260.2697880.137973
area_worst-0.7338250.9410870.3435460.9415500.9592130.1829050.5094490.6766560.8079780.181731...0.9840140.3458420.9775781.0000000.2090640.4377270.5437740.7450900.2094430.079535
smoothness_worst-0.4196610.1190310.0758850.1495300.1229840.7770840.5593180.4462470.4461570.424135...0.2158950.2258080.2351680.2090641.0000000.5631760.5139580.5424610.4863610.608113
compactness_worst-0.5904770.4130650.2781120.4553700.3898740.4619710.8642090.7551670.6657410.474981...0.4753480.3611230.5288760.4377270.5631761.0000000.8928590.8003070.6147170.800823
concavity_worst-0.6593450.5273790.3000410.5643150.5129430.4205490.8151630.8830880.7486870.432004...0.5745620.3676250.6189060.5437740.5139580.8928591.0000000.8550350.5305090.682274
concave points_worst-0.7919760.7414940.2934900.7684290.7194190.4837730.8130380.8594510.9050580.428627...0.7849460.3584670.8138260.7450900.5424610.8003070.8550351.0000000.5024870.510454
symmetry_worst-0.4171230.1644090.1058240.1893850.1437620.3912100.5083300.4097330.3704460.698086...0.2440340.2343370.2697880.2094430.4863610.6147170.5305090.5024871.0000000.527177
fractal_dimension_worst-0.3220460.0091690.1134250.0527940.0068540.4987830.6813430.5134970.3660580.427079...0.0929520.2142370.1379730.0795350.6081130.8008230.6822740.5104540.5271771.000000
\n", + "

31 rows × 31 columns

\n", + "
" + ], + "text/plain": [ + " diagnosis radius_mean texture_mean perimeter_mean \\\n", + "diagnosis 1.000000 -0.730032 -0.415185 -0.742636 \n", + "radius_mean -0.730032 1.000000 0.323777 0.997856 \n", + "texture_mean -0.415185 0.323777 1.000000 0.329533 \n", + "perimeter_mean -0.742636 0.997856 0.329533 1.000000 \n", + "area_mean -0.708984 0.987361 0.321086 0.986507 \n", + "smoothness_mean -0.330624 0.147104 -0.040419 0.183356 \n", + "compactness_mean -0.597576 0.505622 0.237019 0.556485 \n", + "concavity_mean -0.695750 0.677041 0.300536 0.716448 \n", + "concave points_mean -0.774410 0.821646 0.287618 0.849875 \n", + "symmetry_mean -0.333477 0.151195 0.067193 0.186306 \n", + "fractal_dimension_mean 0.027269 -0.290001 -0.080337 -0.245096 \n", + "radius_se -0.567328 0.678590 0.275973 0.691268 \n", + "texture_se 0.008299 -0.097230 0.386443 -0.086669 \n", + "perimeter_se -0.556055 0.674109 0.281590 0.693069 \n", + "area_se -0.548236 0.735868 0.259844 0.744983 \n", + "smoothness_se 0.012117 -0.167456 0.019777 -0.149510 \n", + "compactness_se -0.288713 0.204341 0.192029 0.248683 \n", + "concavity_se -0.255041 0.194151 0.145536 0.228160 \n", + "concave points_se -0.329287 0.325042 0.153373 0.353151 \n", + "symmetry_se 0.025127 -0.117235 -0.017239 -0.093748 \n", + "fractal_dimension_se -0.091669 -0.009516 0.079210 0.027147 \n", + "radius_worst -0.776453 0.969541 0.352578 0.969476 \n", + "texture_worst -0.456903 0.297000 0.912045 0.303038 \n", + "perimeter_worst -0.782914 0.965139 0.358040 0.970387 \n", + "area_worst -0.733825 0.941087 0.343546 0.941550 \n", + "smoothness_worst -0.419661 0.119031 0.075885 0.149530 \n", + "compactness_worst -0.590477 0.413065 0.278112 0.455370 \n", + "concavity_worst -0.659345 0.527379 0.300041 0.564315 \n", + "concave points_worst -0.791976 0.741494 0.293490 0.768429 \n", + "symmetry_worst -0.417123 0.164409 0.105824 0.189385 \n", + "fractal_dimension_worst -0.322046 0.009169 0.113425 0.052794 \n", + "\n", + " area_mean smoothness_mean 
compactness_mean \\\n", + "diagnosis -0.708984 -0.330624 -0.597576 \n", + "radius_mean 0.987361 0.147104 0.505622 \n", + "texture_mean 0.321086 -0.040419 0.237019 \n", + "perimeter_mean 0.986507 0.183356 0.556485 \n", + "area_mean 1.000000 0.151671 0.498038 \n", + "smoothness_mean 0.151671 1.000000 0.640292 \n", + "compactness_mean 0.498038 0.640292 1.000000 \n", + "concavity_mean 0.686308 0.501956 0.883729 \n", + "concave points_mean 0.822500 0.528881 0.829022 \n", + "symmetry_mean 0.154603 0.544531 0.602728 \n", + "fractal_dimension_mean -0.260436 0.556840 0.506156 \n", + "radius_se 0.732125 0.277600 0.497381 \n", + "texture_se -0.066119 0.059149 0.044875 \n", + "perimeter_se 0.726564 0.272183 0.548665 \n", + "area_se 0.800086 0.222595 0.456053 \n", + "smoothness_se -0.130192 0.268150 0.133392 \n", + "compactness_se 0.211126 0.313043 0.729164 \n", + "concavity_se 0.206659 0.249132 0.571996 \n", + "concave points_se 0.322610 0.282948 0.558232 \n", + "symmetry_se -0.084347 0.216286 0.236474 \n", + "fractal_dimension_se 0.002046 0.278176 0.496677 \n", + "radius_worst 0.962745 0.189984 0.534939 \n", + "texture_worst 0.287489 0.022669 0.247564 \n", + "perimeter_worst 0.959120 0.215574 0.589944 \n", + "area_worst 0.959213 0.182905 0.509449 \n", + "smoothness_worst 0.122984 0.777084 0.559318 \n", + "compactness_worst 0.389874 0.461971 0.864209 \n", + "concavity_worst 0.512943 0.420549 0.815163 \n", + "concave points_worst 0.719419 0.483773 0.813038 \n", + "symmetry_worst 0.143762 0.391210 0.508330 \n", + "fractal_dimension_worst 0.006854 0.498783 0.681343 \n", + "\n", + " concavity_mean concave points_mean symmetry_mean \\\n", + "diagnosis -0.695750 -0.774410 -0.333477 \n", + "radius_mean 0.677041 0.821646 0.151195 \n", + "texture_mean 0.300536 0.287618 0.067193 \n", + "perimeter_mean 0.716448 0.849875 0.186306 \n", + "area_mean 0.686308 0.822500 0.154603 \n", + "smoothness_mean 0.501956 0.528881 0.544531 \n", + "compactness_mean 0.883729 0.829022 0.602728 \n", + 
"concavity_mean 1.000000 0.918687 0.500739 \n", + "concave points_mean 0.918687 1.000000 0.460745 \n", + "symmetry_mean 0.500739 0.460745 1.000000 \n", + "fractal_dimension_mean 0.301176 0.142307 0.411393 \n", + "radius_se 0.631258 0.699764 0.304353 \n", + "texture_se 0.075893 0.025260 0.126154 \n", + "perimeter_se 0.659685 0.712066 0.313482 \n", + "area_se 0.617299 0.690642 0.226088 \n", + "smoothness_se 0.110590 0.051658 0.153937 \n", + "compactness_se 0.665217 0.490719 0.412566 \n", + "concavity_se 0.688883 0.441037 0.337796 \n", + "concave points_se 0.582869 0.522044 0.312608 \n", + "symmetry_se 0.178725 0.098510 0.446116 \n", + "fractal_dimension_se 0.427309 0.260200 0.333318 \n", + "radius_worst 0.688703 0.828401 0.189763 \n", + "texture_worst 0.298983 0.286828 0.084924 \n", + "perimeter_worst 0.729963 0.853642 0.222908 \n", + "area_worst 0.676656 0.807978 0.181731 \n", + "smoothness_worst 0.446247 0.446157 0.424135 \n", + "compactness_worst 0.755167 0.665741 0.474981 \n", + "concavity_worst 0.883088 0.748687 0.432004 \n", + "concave points_worst 0.859451 0.905058 0.428627 \n", + "symmetry_worst 0.409733 0.370446 0.698086 \n", + "fractal_dimension_worst 0.513497 0.366058 0.427079 \n", + "\n", + " ... radius_worst texture_worst perimeter_worst \\\n", + "diagnosis ... -0.776453 -0.456903 -0.782914 \n", + "radius_mean ... 0.969541 0.297000 0.965139 \n", + "texture_mean ... 0.352578 0.912045 0.358040 \n", + "perimeter_mean ... 0.969476 0.303038 0.970387 \n", + "area_mean ... 0.962745 0.287489 0.959120 \n", + "smoothness_mean ... 0.189984 0.022669 0.215574 \n", + "compactness_mean ... 0.534939 0.247564 0.589944 \n", + "concavity_mean ... 0.688703 0.298983 0.729963 \n", + "concave points_mean ... 0.828401 0.286828 0.853642 \n", + "symmetry_mean ... 0.189763 0.084924 0.222908 \n", + "fractal_dimension_mean ... -0.242125 -0.051211 -0.199454 \n", + "radius_se ... 0.714872 0.195092 0.719492 \n", + "texture_se ... -0.111592 0.409071 -0.102161 \n", + "perimeter_se ... 
0.697133 0.200270 0.720966 \n", + "area_se ... 0.757372 0.196496 0.761213 \n", + "smoothness_se ... -0.152307 -0.008034 -0.136532 \n", + "compactness_se ... 0.202027 0.146121 0.256901 \n", + "concavity_se ... 0.187371 0.102586 0.227007 \n", + "concave points_se ... 0.307288 0.079999 0.342227 \n", + "symmetry_se ... -0.142188 -0.094773 -0.117920 \n", + "fractal_dimension_se ... -0.005759 0.008842 0.033505 \n", + "radius_worst ... 1.000000 0.359925 0.993707 \n", + "texture_worst ... 0.359925 1.000000 0.365098 \n", + "perimeter_worst ... 0.993707 0.365098 1.000000 \n", + "area_worst ... 0.984014 0.345842 0.977578 \n", + "smoothness_worst ... 0.215895 0.225808 0.235168 \n", + "compactness_worst ... 0.475348 0.361123 0.528876 \n", + "concavity_worst ... 0.574562 0.367625 0.618906 \n", + "concave points_worst ... 0.784946 0.358467 0.813826 \n", + "symmetry_worst ... 0.244034 0.234337 0.269788 \n", + "fractal_dimension_worst ... 0.092952 0.214237 0.137973 \n", + "\n", + " area_worst smoothness_worst compactness_worst \\\n", + "diagnosis -0.733825 -0.419661 -0.590477 \n", + "radius_mean 0.941087 0.119031 0.413065 \n", + "texture_mean 0.343546 0.075885 0.278112 \n", + "perimeter_mean 0.941550 0.149530 0.455370 \n", + "area_mean 0.959213 0.122984 0.389874 \n", + "smoothness_mean 0.182905 0.777084 0.461971 \n", + "compactness_mean 0.509449 0.559318 0.864209 \n", + "concavity_mean 0.676656 0.446247 0.755167 \n", + "concave points_mean 0.807978 0.446157 0.665741 \n", + "symmetry_mean 0.181731 0.424135 0.474981 \n", + "fractal_dimension_mean -0.218566 0.451897 0.410627 \n", + "radius_se 0.751431 0.140187 0.286572 \n", + "texture_se -0.083078 -0.073042 -0.092846 \n", + "perimeter_se 0.730647 0.125791 0.340951 \n", + "area_se 0.811408 0.124392 0.282682 \n", + "smoothness_se -0.115951 0.309111 0.033085 \n", + "compactness_se 0.196258 0.221708 0.672119 \n", + "concavity_se 0.187836 0.172332 0.491365 \n", + "concave points_se 0.297057 0.144157 0.389143 \n", + "symmetry_se -0.123926 
-0.007047 0.070591 \n", + "fractal_dimension_se -0.000367 0.167714 0.381239 \n", + "radius_worst 0.984014 0.215895 0.475348 \n", + "texture_worst 0.345842 0.225808 0.361123 \n", + "perimeter_worst 0.977578 0.235168 0.528876 \n", + "area_worst 1.000000 0.209064 0.437727 \n", + "smoothness_worst 0.209064 1.000000 0.563176 \n", + "compactness_worst 0.437727 0.563176 1.000000 \n", + "concavity_worst 0.543774 0.513958 0.892859 \n", + "concave points_worst 0.745090 0.542461 0.800307 \n", + "symmetry_worst 0.209443 0.486361 0.614717 \n", + "fractal_dimension_worst 0.079535 0.608113 0.800823 \n", + "\n", + " concavity_worst concave points_worst \\\n", + "diagnosis -0.659345 -0.791976 \n", + "radius_mean 0.527379 0.741494 \n", + "texture_mean 0.300041 0.293490 \n", + "perimeter_mean 0.564315 0.768429 \n", + "area_mean 0.512943 0.719419 \n", + "smoothness_mean 0.420549 0.483773 \n", + "compactness_mean 0.815163 0.813038 \n", + "concavity_mean 0.883088 0.859451 \n", + "concave points_mean 0.748687 0.905058 \n", + "symmetry_mean 0.432004 0.428627 \n", + "fractal_dimension_mean 0.305228 0.148275 \n", + "radius_se 0.380110 0.528842 \n", + "texture_se -0.070533 -0.120938 \n", + "perimeter_se 0.418269 0.552873 \n", + "area_se 0.384820 0.535417 \n", + "smoothness_se 0.018581 -0.027878 \n", + "compactness_se 0.633328 0.477739 \n", + "concavity_se 0.662437 0.443167 \n", + "concave points_se 0.463453 0.502092 \n", + "symmetry_se 0.035520 -0.032924 \n", + "fractal_dimension_se 0.363578 0.203468 \n", + "radius_worst 0.574562 0.784946 \n", + "texture_worst 0.367625 0.358467 \n", + "perimeter_worst 0.618906 0.813826 \n", + "area_worst 0.543774 0.745090 \n", + "smoothness_worst 0.513958 0.542461 \n", + "compactness_worst 0.892859 0.800307 \n", + "concavity_worst 1.000000 0.855035 \n", + "concave points_worst 0.855035 1.000000 \n", + "symmetry_worst 0.530509 0.502487 \n", + "fractal_dimension_worst 0.682274 0.510454 \n", + "\n", + " symmetry_worst fractal_dimension_worst \n", + "diagnosis 
-0.417123 -0.322046 \n", + "radius_mean 0.164409 0.009169 \n", + "texture_mean 0.105824 0.113425 \n", + "perimeter_mean 0.189385 0.052794 \n", + "area_mean 0.143762 0.006854 \n", + "smoothness_mean 0.391210 0.498783 \n", + "compactness_mean 0.508330 0.681343 \n", + "concavity_mean 0.409733 0.513497 \n", + "concave points_mean 0.370446 0.366058 \n", + "symmetry_mean 0.698086 0.427079 \n", + "fractal_dimension_mean 0.283061 0.701592 \n", + "radius_se 0.094549 0.049351 \n", + "texture_se -0.127607 -0.044761 \n", + "perimeter_se 0.108877 0.083512 \n", + "area_se 0.072572 0.016228 \n", + "smoothness_se -0.032829 0.119089 \n", + "compactness_se 0.271770 0.574867 \n", + "concavity_se 0.199593 0.445392 \n", + "concave points_se 0.126508 0.237510 \n", + "symmetry_se 0.382952 0.084610 \n", + "fractal_dimension_se 0.098792 0.522940 \n", + "radius_worst 0.244034 0.092952 \n", + "texture_worst 0.234337 0.214237 \n", + "perimeter_worst 0.269788 0.137973 \n", + "area_worst 0.209443 0.079535 \n", + "smoothness_worst 0.486361 0.608113 \n", + "compactness_worst 0.614717 0.800823 \n", + "concavity_worst 0.530509 0.682274 \n", + "concave points_worst 0.502487 0.510454 \n", + "symmetry_worst 1.000000 0.527177 \n", + "fractal_dimension_worst 0.527177 1.000000 \n", + "\n", + "[31 rows x 31 columns]" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.corr()" + ] + }, + { + "cell_type": "markdown", + "id": "ee2f0278", + "metadata": {}, + "source": [ + "# Model Loading\n", + "In this classification task, we use scikit-learn's algorithms to predict the labels M and B (now encoded as 0 and 1).\n", + " \n", + "Because its library offers many models, we can train several of them and compare their results later." 
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "4b48f208",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sklearn.model_selection import train_test_split\n",
+    "from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "id": "d7b5908a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sklearn.ensemble import RandomForestClassifier\n",
+    "from sklearn.svm import SVC\n",
+    "from sklearn.neighbors import KNeighborsClassifier\n",
+    "from sklearn.linear_model import LogisticRegression\n",
+    "from sklearn.tree import DecisionTreeClassifier\n",
+    "from sklearn.naive_bayes import GaussianNB\n",
+    "from sklearn.ensemble import AdaBoostClassifier\n",
+    "from sklearn.ensemble import GradientBoostingClassifier"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "3d50df8d",
+   "metadata": {},
+   "source": [
+    "Splitting the dataset into Training and Test Sets"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e32ec855",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Single seed shared by the split and by every stochastic estimator below,\n",
+    "# so that Restart & Run All reproduces the same results table.\n",
+    "RANDOM_STATE = 42\n",
+    "\n",
+    "# Split the data into features (X) and labels (y)\n",
+    "X = df.drop(columns=['diagnosis'])\n",
+    "y = df['diagnosis']\n",
+    "\n",
+    "# Hold out 20% of the rows as the test set\n",
+    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE)\n",
+    "assert len(X_train) + len(X_test) == len(X)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "28ebe0cf",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Display name -> unfitted estimator. Estimators that draw random numbers are\n",
+    "# seeded; the remaining ones are left at their defaults.\n",
+    "models = {\n",
+    "    'Random Forest': RandomForestClassifier(random_state=RANDOM_STATE),\n",
+    "    'Support Vector Machine': SVC(),\n",
+    "    'K-Nearest Neighbors': KNeighborsClassifier(),\n",
+    "    # max_iter raised from the default of 100: lbfgs stopped at the iteration\n",
+    "    # limit (ConvergenceWarning) on these unscaled features.\n",
+    "    'Logistic Regression': LogisticRegression(max_iter=10000),\n",
+    "    'Decision Tree': DecisionTreeClassifier(random_state=RANDOM_STATE),\n",
+    "    'Naive Bayes': GaussianNB(),\n",
+    "    'AdaBoost': AdaBoostClassifier(random_state=RANDOM_STATE),\n",
+    "    'Gradient Boosting': GradientBoostingClassifier(random_state=RANDOM_STATE)\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "17781adc",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Start from an empty table on every run. When `results` was created in an\n",
+    "# earlier cell, re-executing this loop appended to the old lists and the\n",
+    "# same model showed up more than once in `results_df`.\n",
+    "results = {'Model': [], 'F1_score': [], 'Accuracy': [], 'Precision': [], 'Recall': []}\n",
+    "\n",
+    "for model_name, model in models.items():\n",
+    "    # Fit on the training split, then predict the held-out test split\n",
+    "    model.fit(X_train, y_train)\n",
+    "    y_pred = model.predict(X_test)\n",
+    "\n",
+    "    # NOTE(review): f1/precision/recall use the default pos_label=1. If\n",
+    "    # malignant is encoded as 0, these scores describe the benign class - confirm.\n",
+    "    results['Model'].append(model_name)\n",
+    "    results['F1_score'].append(f1_score(y_test, y_pred))\n",
+    "    results['Accuracy'].append(accuracy_score(y_test, y_pred))\n",
+    "    results['Precision'].append(precision_score(y_test, y_pred))\n",
+    "    results['Recall'].append(recall_score(y_test, y_pred))\n",
+    "\n",
+    "# One row per model, in the order the models were defined\n",
+    "results_df = pd.DataFrame(results)\n",
+    "assert results_df['Model'].is_unique, 'each model must be reported exactly once'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "id": "c98e51c8",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ModelF1_scoreAccuracyPrecisionRecall
0Random Forest0.9722220.9649120.9589040.985915
1Support Vector Machine0.9594590.9473680.9220781.000000
2K-Nearest Neighbors0.9659860.9561400.9342111.000000
3Random Forest0.9722220.9649120.9589040.985915
4Support Vector Machine0.9594590.9473680.9220781.000000
5K-Nearest Neighbors0.9659860.9561400.9342111.000000
6Logistic Regression0.9722220.9649120.9589040.985915
7Decision Tree0.9510490.9385960.9444440.957746
8Naive Bayes0.9793100.9736840.9594591.000000
9AdaBoost0.9577460.9473680.9577460.957746
10Gradient Boosting0.9722220.9649120.9589040.985915
\n", + "
" + ], + "text/plain": [ + " Model F1_score Accuracy Precision Recall\n", + "0 Random Forest 0.972222 0.964912 0.958904 0.985915\n", + "1 Support Vector Machine 0.959459 0.947368 0.922078 1.000000\n", + "2 K-Nearest Neighbors 0.965986 0.956140 0.934211 1.000000\n", + "3 Random Forest 0.972222 0.964912 0.958904 0.985915\n", + "4 Support Vector Machine 0.959459 0.947368 0.922078 1.000000\n", + "5 K-Nearest Neighbors 0.965986 0.956140 0.934211 1.000000\n", + "6 Logistic Regression 0.972222 0.964912 0.958904 0.985915\n", + "7 Decision Tree 0.951049 0.938596 0.944444 0.957746\n", + "8 Naive Bayes 0.979310 0.973684 0.959459 1.000000\n", + "9 AdaBoost 0.957746 0.947368 0.957746 0.957746\n", + "10 Gradient Boosting 0.972222 0.964912 0.958904 0.985915" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "results_df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "212e9b94", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}