{ "cells": [ { "cell_type": "markdown", "id": "bd00ebdc", "metadata": {}, "source": [ "# Load Basic Libraries\n", " Load some libraries to read and display the data" ] }, { "cell_type": "code", "execution_count": 1, "id": "375f1a0c", "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd" ] }, { "cell_type": "code", "execution_count": 2, "id": "b8760991", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
iddiagnosisradius_meantexture_meanperimeter_meanarea_meansmoothness_meancompactness_meanconcavity_meanconcave points_mean...texture_worstperimeter_worstarea_worstsmoothness_worstcompactness_worstconcavity_worstconcave points_worstsymmetry_worstfractal_dimension_worstUnnamed: 32
0842302M17.9910.38122.801001.00.118400.277600.30010.14710...17.33184.602019.00.16220.66560.71190.26540.46010.11890NaN
1842517M20.5717.77132.901326.00.084740.078640.08690.07017...23.41158.801956.00.12380.18660.24160.18600.27500.08902NaN
284300903M19.6921.25130.001203.00.109600.159900.19740.12790...25.53152.501709.00.14440.42450.45040.24300.36130.08758NaN
384348301M11.4220.3877.58386.10.142500.283900.24140.10520...26.5098.87567.70.20980.86630.68690.25750.66380.17300NaN
484358402M20.2914.34135.101297.00.100300.132800.19800.10430...16.67152.201575.00.13740.20500.40000.16250.23640.07678NaN
\n", "

5 rows × 33 columns

\n", "
" ], "text/plain": [ " id diagnosis radius_mean texture_mean perimeter_mean area_mean \\\n", "0 842302 M 17.99 10.38 122.80 1001.0 \n", "1 842517 M 20.57 17.77 132.90 1326.0 \n", "2 84300903 M 19.69 21.25 130.00 1203.0 \n", "3 84348301 M 11.42 20.38 77.58 386.1 \n", "4 84358402 M 20.29 14.34 135.10 1297.0 \n", "\n", " smoothness_mean compactness_mean concavity_mean concave points_mean \\\n", "0 0.11840 0.27760 0.3001 0.14710 \n", "1 0.08474 0.07864 0.0869 0.07017 \n", "2 0.10960 0.15990 0.1974 0.12790 \n", "3 0.14250 0.28390 0.2414 0.10520 \n", "4 0.10030 0.13280 0.1980 0.10430 \n", "\n", " ... texture_worst perimeter_worst area_worst smoothness_worst \\\n", "0 ... 17.33 184.60 2019.0 0.1622 \n", "1 ... 23.41 158.80 1956.0 0.1238 \n", "2 ... 25.53 152.50 1709.0 0.1444 \n", "3 ... 26.50 98.87 567.7 0.2098 \n", "4 ... 16.67 152.20 1575.0 0.1374 \n", "\n", " compactness_worst concavity_worst concave points_worst symmetry_worst \\\n", "0 0.6656 0.7119 0.2654 0.4601 \n", "1 0.1866 0.2416 0.1860 0.2750 \n", "2 0.4245 0.4504 0.2430 0.3613 \n", "3 0.8663 0.6869 0.2575 0.6638 \n", "4 0.2050 0.4000 0.1625 0.2364 \n", "\n", " fractal_dimension_worst Unnamed: 32 \n", "0 0.11890 NaN \n", "1 0.08902 NaN \n", "2 0.08758 NaN \n", "3 0.17300 NaN \n", "4 0.07678 NaN \n", "\n", "[5 rows x 33 columns]" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.read_csv(\"https://raw.githubusercontent.com/youronlydimwit/Data_ScienceUse_Cases/main/Classification/Data/Breast_Cancer_Wisconsin.csv\")\n", "df.head()" ] }, { "cell_type": "markdown", "id": "6eb25bbe", "metadata": {}, "source": [ "# Preliminary check for missing values\n", "Created a function to outputs a dataframe with columns:\n", "- Column Name\n", "- The Data type\n", "- Count of missing data (Nulls)" ] }, { "cell_type": "code", "execution_count": 3, "id": "72bb3dcc", "metadata": {}, "outputs": [], "source": [ "# Missing data check function\n", "def 
completeness_check(input_df):\n", " # Create a new DataFrame\n", " summary_df = pd.DataFrame(columns=['Column_Name', 'Data_Type', 'Missing_Data'])\n", "\n", " # Fill in the data\n", " summary_df['Column_Name'] = input_df.columns\n", " summary_df['Data_Type'] = input_df.dtypes.values\n", " summary_df['Missing_Data'] = input_df.isnull().sum().values\n", "\n", " return summary_df" ] }, { "cell_type": "code", "execution_count": 4, "id": "dd4b5d0f", "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Column_NameData_TypeMissing_Data
0idint640
1diagnosisobject0
2radius_meanfloat640
3texture_meanfloat640
4perimeter_meanfloat640
5area_meanfloat640
6smoothness_meanfloat640
7compactness_meanfloat640
8concavity_meanfloat640
9concave points_meanfloat640
10symmetry_meanfloat640
11fractal_dimension_meanfloat640
12radius_sefloat640
13texture_sefloat640
14perimeter_sefloat640
15area_sefloat640
16smoothness_sefloat640
17compactness_sefloat640
18concavity_sefloat640
19concave points_sefloat640
20symmetry_sefloat640
21fractal_dimension_sefloat640
22radius_worstfloat640
23texture_worstfloat640
24perimeter_worstfloat640
25area_worstfloat640
26smoothness_worstfloat640
27compactness_worstfloat640
28concavity_worstfloat640
29concave points_worstfloat640
30symmetry_worstfloat640
31fractal_dimension_worstfloat640
32Unnamed: 32float64569
\n", "
" ], "text/plain": [ " Column_Name Data_Type Missing_Data\n", "0 id int64 0\n", "1 diagnosis object 0\n", "2 radius_mean float64 0\n", "3 texture_mean float64 0\n", "4 perimeter_mean float64 0\n", "5 area_mean float64 0\n", "6 smoothness_mean float64 0\n", "7 compactness_mean float64 0\n", "8 concavity_mean float64 0\n", "9 concave points_mean float64 0\n", "10 symmetry_mean float64 0\n", "11 fractal_dimension_mean float64 0\n", "12 radius_se float64 0\n", "13 texture_se float64 0\n", "14 perimeter_se float64 0\n", "15 area_se float64 0\n", "16 smoothness_se float64 0\n", "17 compactness_se float64 0\n", "18 concavity_se float64 0\n", "19 concave points_se float64 0\n", "20 symmetry_se float64 0\n", "21 fractal_dimension_se float64 0\n", "22 radius_worst float64 0\n", "23 texture_worst float64 0\n", "24 perimeter_worst float64 0\n", "25 area_worst float64 0\n", "26 smoothness_worst float64 0\n", "27 compactness_worst float64 0\n", "28 concavity_worst float64 0\n", "29 concave points_worst float64 0\n", "30 symmetry_worst float64 0\n", "31 fractal_dimension_worst float64 0\n", "32 Unnamed: 32 float64 569" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "completeness_check(df)" ] }, { "cell_type": "markdown", "id": "d91cd816", "metadata": {}, "source": [ "Just dropping unnecessary columns, for this context, it is column **'id'** and **'Unnamed: 32'**" ] }, { "cell_type": "code", "execution_count": 5, "id": "6ce4e125", "metadata": {}, "outputs": [], "source": [ "# We don't need these 2 columns\n", "df = df.drop(['id','Unnamed: 32'], axis=1)" ] }, { "cell_type": "code", "execution_count": 6, "id": "35e3462f", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
diagnosisradius_meantexture_meanperimeter_meanarea_meansmoothness_meancompactness_meanconcavity_meanconcave points_meansymmetry_mean...radius_worsttexture_worstperimeter_worstarea_worstsmoothness_worstcompactness_worstconcavity_worstconcave points_worstsymmetry_worstfractal_dimension_worst
0M17.9910.38122.801001.00.118400.277600.30010.147100.2419...25.3817.33184.602019.00.16220.66560.71190.26540.46010.11890
1M20.5717.77132.901326.00.084740.078640.08690.070170.1812...24.9923.41158.801956.00.12380.18660.24160.18600.27500.08902
2M19.6921.25130.001203.00.109600.159900.19740.127900.2069...23.5725.53152.501709.00.14440.42450.45040.24300.36130.08758
3M11.4220.3877.58386.10.142500.283900.24140.105200.2597...14.9126.5098.87567.70.20980.86630.68690.25750.66380.17300
4M20.2914.34135.101297.00.100300.132800.19800.104300.1809...22.5416.67152.201575.00.13740.20500.40000.16250.23640.07678
\n", "

5 rows × 31 columns

\n", "
" ], "text/plain": [ " diagnosis radius_mean texture_mean perimeter_mean area_mean \\\n", "0 M 17.99 10.38 122.80 1001.0 \n", "1 M 20.57 17.77 132.90 1326.0 \n", "2 M 19.69 21.25 130.00 1203.0 \n", "3 M 11.42 20.38 77.58 386.1 \n", "4 M 20.29 14.34 135.10 1297.0 \n", "\n", " smoothness_mean compactness_mean concavity_mean concave points_mean \\\n", "0 0.11840 0.27760 0.3001 0.14710 \n", "1 0.08474 0.07864 0.0869 0.07017 \n", "2 0.10960 0.15990 0.1974 0.12790 \n", "3 0.14250 0.28390 0.2414 0.10520 \n", "4 0.10030 0.13280 0.1980 0.10430 \n", "\n", " symmetry_mean ... radius_worst texture_worst perimeter_worst \\\n", "0 0.2419 ... 25.38 17.33 184.60 \n", "1 0.1812 ... 24.99 23.41 158.80 \n", "2 0.2069 ... 23.57 25.53 152.50 \n", "3 0.2597 ... 14.91 26.50 98.87 \n", "4 0.1809 ... 22.54 16.67 152.20 \n", "\n", " area_worst smoothness_worst compactness_worst concavity_worst \\\n", "0 2019.0 0.1622 0.6656 0.7119 \n", "1 1956.0 0.1238 0.1866 0.2416 \n", "2 1709.0 0.1444 0.4245 0.4504 \n", "3 567.7 0.2098 0.8663 0.6869 \n", "4 1575.0 0.1374 0.2050 0.4000 \n", "\n", " concave points_worst symmetry_worst fractal_dimension_worst \n", "0 0.2654 0.4601 0.11890 \n", "1 0.1860 0.2750 0.08902 \n", "2 0.2430 0.3613 0.08758 \n", "3 0.2575 0.6638 0.17300 \n", "4 0.1625 0.2364 0.07678 \n", "\n", "[5 rows x 31 columns]" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ] }, { "cell_type": "markdown", "id": "4b2e562f", "metadata": {}, "source": [ "# Reformatting\n", "From the source, the data is assumed already cleaned. Then, the _float_ in this dataset is being reformatted as **#.##** format for easier reading." 
def format_float_columns(df, decimals=2):
    """Round every ``float64`` column of ``df`` to ``decimals`` places, in place.

    This changes the stored values, not just how they are displayed:
    digits beyond ``decimals`` are lost for everything that uses ``df``
    afterwards.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to update. It is modified in place, so callers may ignore the
        return value and keep using the object they passed in.
    decimals : int, default 2
        Number of decimal places to keep in each ``float64`` column.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in; columns of any other
        dtype are left untouched.
    """
    # Only float64 columns are selected; integer and object columns are skipped.
    float_columns = df.select_dtypes(include='float64').columns

    # Rounding keeps the float64 dtype; each column is written back onto df.
    for col in float_columns:
        df[col] = df[col].round(decimals)

    return df
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
diagnosisradius_meantexture_meanperimeter_meanarea_meansmoothness_meancompactness_meanconcavity_meanconcave points_meansymmetry_mean...radius_worsttexture_worstperimeter_worstarea_worstsmoothness_worstcompactness_worstconcavity_worstconcave points_worstsymmetry_worstfractal_dimension_worst
0M17.9910.38122.801001.00.120.280.300.150.24...25.3817.33184.602019.00.160.670.710.270.460.12
1M20.5717.77132.901326.00.080.080.090.070.18...24.9923.41158.801956.00.120.190.240.190.280.09
2M19.6921.25130.001203.00.110.160.200.130.21...23.5725.53152.501709.00.140.420.450.240.360.09
3M11.4220.3877.58386.10.140.280.240.110.26...14.9126.5098.87567.70.210.870.690.260.660.17
4M20.2914.34135.101297.00.100.130.200.100.18...22.5416.67152.201575.00.140.200.400.160.240.08
..................................................................
564M21.5622.39142.001479.00.110.120.240.140.17...25.4526.40166.102027.00.140.210.410.220.210.07
565M20.1328.25131.201261.00.100.100.140.100.18...23.6938.25155.001731.00.120.190.320.160.260.07
566M16.6028.08108.30858.10.080.100.090.050.16...18.9834.12126.701124.00.110.310.340.140.220.08
567M20.6029.33140.101265.00.120.280.350.150.24...25.7439.42184.601821.00.160.870.940.260.410.12
568B7.7624.5447.92181.00.050.040.000.000.16...9.4630.3759.16268.60.090.060.000.000.290.07
\n", "

569 rows × 31 columns

\n", "
" ], "text/plain": [ " diagnosis radius_mean texture_mean perimeter_mean area_mean \\\n", "0 M 17.99 10.38 122.80 1001.0 \n", "1 M 20.57 17.77 132.90 1326.0 \n", "2 M 19.69 21.25 130.00 1203.0 \n", "3 M 11.42 20.38 77.58 386.1 \n", "4 M 20.29 14.34 135.10 1297.0 \n", ".. ... ... ... ... ... \n", "564 M 21.56 22.39 142.00 1479.0 \n", "565 M 20.13 28.25 131.20 1261.0 \n", "566 M 16.60 28.08 108.30 858.1 \n", "567 M 20.60 29.33 140.10 1265.0 \n", "568 B 7.76 24.54 47.92 181.0 \n", "\n", " smoothness_mean compactness_mean concavity_mean concave points_mean \\\n", "0 0.12 0.28 0.30 0.15 \n", "1 0.08 0.08 0.09 0.07 \n", "2 0.11 0.16 0.20 0.13 \n", "3 0.14 0.28 0.24 0.11 \n", "4 0.10 0.13 0.20 0.10 \n", ".. ... ... ... ... \n", "564 0.11 0.12 0.24 0.14 \n", "565 0.10 0.10 0.14 0.10 \n", "566 0.08 0.10 0.09 0.05 \n", "567 0.12 0.28 0.35 0.15 \n", "568 0.05 0.04 0.00 0.00 \n", "\n", " symmetry_mean ... radius_worst texture_worst perimeter_worst \\\n", "0 0.24 ... 25.38 17.33 184.60 \n", "1 0.18 ... 24.99 23.41 158.80 \n", "2 0.21 ... 23.57 25.53 152.50 \n", "3 0.26 ... 14.91 26.50 98.87 \n", "4 0.18 ... 22.54 16.67 152.20 \n", ".. ... ... ... ... ... \n", "564 0.17 ... 25.45 26.40 166.10 \n", "565 0.18 ... 23.69 38.25 155.00 \n", "566 0.16 ... 18.98 34.12 126.70 \n", "567 0.24 ... 25.74 39.42 184.60 \n", "568 0.16 ... 9.46 30.37 59.16 \n", "\n", " area_worst smoothness_worst compactness_worst concavity_worst \\\n", "0 2019.0 0.16 0.67 0.71 \n", "1 1956.0 0.12 0.19 0.24 \n", "2 1709.0 0.14 0.42 0.45 \n", "3 567.7 0.21 0.87 0.69 \n", "4 1575.0 0.14 0.20 0.40 \n", ".. ... ... ... ... \n", "564 2027.0 0.14 0.21 0.41 \n", "565 1731.0 0.12 0.19 0.32 \n", "566 1124.0 0.11 0.31 0.34 \n", "567 1821.0 0.16 0.87 0.94 \n", "568 268.6 0.09 0.06 0.00 \n", "\n", " concave points_worst symmetry_worst fractal_dimension_worst \n", "0 0.27 0.46 0.12 \n", "1 0.19 0.28 0.09 \n", "2 0.24 0.36 0.09 \n", "3 0.26 0.66 0.17 \n", "4 0.16 0.24 0.08 \n", ".. ... ... ... 
\n", "564 0.22 0.21 0.07 \n", "565 0.16 0.26 0.07 \n", "566 0.14 0.22 0.08 \n", "567 0.26 0.41 0.12 \n", "568 0.00 0.29 0.07 \n", "\n", "[569 rows x 31 columns]" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "format_float_columns(df)" ] }, { "cell_type": "markdown", "id": "c9aaef3a", "metadata": {}, "source": [ "# Encoding\n", "Since the label to be predicted takes only two values, **M for Malignant** and **B for Benign**, we can encode them as 0 and 1 respectively.\n", "I also cast the encoded column to float, so that every column has the same data type." ] }, { "cell_type": "code", "execution_count": 9, "id": "60870d9a", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
diagnosisradius_meantexture_meanperimeter_meanarea_meansmoothness_meancompactness_meanconcavity_meanconcave points_meansymmetry_mean...radius_worsttexture_worstperimeter_worstarea_worstsmoothness_worstcompactness_worstconcavity_worstconcave points_worstsymmetry_worstfractal_dimension_worst
00.017.9910.38122.81001.00.120.280.300.150.24...25.3817.33184.62019.00.160.670.710.270.460.12
10.020.5717.77132.91326.00.080.080.090.070.18...24.9923.41158.81956.00.120.190.240.190.280.09
20.019.6921.25130.01203.00.110.160.200.130.21...23.5725.53152.51709.00.140.420.450.240.360.09
\n", "

3 rows × 31 columns

\n", "
" ], "text/plain": [ " diagnosis radius_mean texture_mean perimeter_mean area_mean \\\n", "0 0.0 17.99 10.38 122.8 1001.0 \n", "1 0.0 20.57 17.77 132.9 1326.0 \n", "2 0.0 19.69 21.25 130.0 1203.0 \n", "\n", " smoothness_mean compactness_mean concavity_mean concave points_mean \\\n", "0 0.12 0.28 0.30 0.15 \n", "1 0.08 0.08 0.09 0.07 \n", "2 0.11 0.16 0.20 0.13 \n", "\n", " symmetry_mean ... radius_worst texture_worst perimeter_worst \\\n", "0 0.24 ... 25.38 17.33 184.6 \n", "1 0.18 ... 24.99 23.41 158.8 \n", "2 0.21 ... 23.57 25.53 152.5 \n", "\n", " area_worst smoothness_worst compactness_worst concavity_worst \\\n", "0 2019.0 0.16 0.67 0.71 \n", "1 1956.0 0.12 0.19 0.24 \n", "2 1709.0 0.14 0.42 0.45 \n", "\n", " concave points_worst symmetry_worst fractal_dimension_worst \n", "0 0.27 0.46 0.12 \n", "1 0.19 0.28 0.09 \n", "2 0.24 0.36 0.09 \n", "\n", "[3 rows x 31 columns]" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Replace 'M' with 0 and 'B' with 1 in the 'Column' column\n", "df['diagnosis'] = df['diagnosis'].replace({'M': 0, 'B': 1})\n", "df['diagnosis'] = df['diagnosis'].astype(float)\n", "df.head(3)" ] }, { "cell_type": "markdown", "id": "ecbe4b22", "metadata": {}, "source": [ "Correct data type" ] }, { "cell_type": "code", "execution_count": 10, "id": "7e437274", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "dtype('float64')" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df['diagnosis'].dtypes" ] }, { "cell_type": "markdown", "id": "5b18b919", "metadata": {}, "source": [ "Running a correlation matrix to get a glimpse of who's affecting who the most." ] }, { "cell_type": "code", "execution_count": 11, "id": "2c3b26b6", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", 
" \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
diagnosisradius_meantexture_meanperimeter_meanarea_meansmoothness_meancompactness_meanconcavity_meanconcave points_meansymmetry_mean...radius_worsttexture_worstperimeter_worstarea_worstsmoothness_worstcompactness_worstconcavity_worstconcave points_worstsymmetry_worstfractal_dimension_worst
diagnosis1.000000-0.730032-0.415185-0.742636-0.708984-0.330624-0.597576-0.695750-0.774410-0.333477...-0.776453-0.456903-0.782914-0.733825-0.419661-0.590477-0.659345-0.791976-0.417123-0.322046
radius_mean-0.7300321.0000000.3237770.9978560.9873610.1471040.5056220.6770410.8216460.151195...0.9695410.2970000.9651390.9410870.1190310.4130650.5273790.7414940.1644090.009169
texture_mean-0.4151850.3237771.0000000.3295330.321086-0.0404190.2370190.3005360.2876180.067193...0.3525780.9120450.3580400.3435460.0758850.2781120.3000410.2934900.1058240.113425
perimeter_mean-0.7426360.9978560.3295331.0000000.9865070.1833560.5564850.7164480.8498750.186306...0.9694760.3030380.9703870.9415500.1495300.4553700.5643150.7684290.1893850.052794
area_mean-0.7089840.9873610.3210860.9865071.0000000.1516710.4980380.6863080.8225000.154603...0.9627450.2874890.9591200.9592130.1229840.3898740.5129430.7194190.1437620.006854
smoothness_mean-0.3306240.147104-0.0404190.1833560.1516711.0000000.6402920.5019560.5288810.544531...0.1899840.0226690.2155740.1829050.7770840.4619710.4205490.4837730.3912100.498783
compactness_mean-0.5975760.5056220.2370190.5564850.4980380.6402921.0000000.8837290.8290220.602728...0.5349390.2475640.5899440.5094490.5593180.8642090.8151630.8130380.5083300.681343
concavity_mean-0.6957500.6770410.3005360.7164480.6863080.5019560.8837291.0000000.9186870.500739...0.6887030.2989830.7299630.6766560.4462470.7551670.8830880.8594510.4097330.513497
concave points_mean-0.7744100.8216460.2876180.8498750.8225000.5288810.8290220.9186871.0000000.460745...0.8284010.2868280.8536420.8079780.4461570.6657410.7486870.9050580.3704460.366058
symmetry_mean-0.3334770.1511950.0671930.1863060.1546030.5445310.6027280.5007390.4607451.000000...0.1897630.0849240.2229080.1817310.4241350.4749810.4320040.4286270.6980860.427079
fractal_dimension_mean0.027269-0.290001-0.080337-0.245096-0.2604360.5568400.5061560.3011760.1423070.411393...-0.242125-0.051211-0.199454-0.2185660.4518970.4106270.3052280.1482750.2830610.701592
radius_se-0.5673280.6785900.2759730.6912680.7321250.2776000.4973810.6312580.6997640.304353...0.7148720.1950920.7194920.7514310.1401870.2865720.3801100.5288420.0945490.049351
texture_se0.008299-0.0972300.386443-0.086669-0.0661190.0591490.0448750.0758930.0252600.126154...-0.1115920.409071-0.102161-0.083078-0.073042-0.092846-0.070533-0.120938-0.127607-0.044761
perimeter_se-0.5560550.6741090.2815900.6930690.7265640.2721830.5486650.6596850.7120660.313482...0.6971330.2002700.7209660.7306470.1257910.3409510.4182690.5528730.1088770.083512
area_se-0.5482360.7358680.2598440.7449830.8000860.2225950.4560530.6172990.6906420.226088...0.7573720.1964960.7612130.8114080.1243920.2826820.3848200.5354170.0725720.016228
smoothness_se0.012117-0.1674560.019777-0.149510-0.1301920.2681500.1333920.1105900.0516580.153937...-0.152307-0.008034-0.136532-0.1159510.3091110.0330850.018581-0.027878-0.0328290.119089
compactness_se-0.2887130.2043410.1920290.2486830.2111260.3130430.7291640.6652170.4907190.412566...0.2020270.1461210.2569010.1962580.2217080.6721190.6333280.4777390.2717700.574867
concavity_se-0.2550410.1941510.1455360.2281600.2066590.2491320.5719960.6888830.4410370.337796...0.1873710.1025860.2270070.1878360.1723320.4913650.6624370.4431670.1995930.445392
concave points_se-0.3292870.3250420.1533730.3531510.3226100.2829480.5582320.5828690.5220440.312608...0.3072880.0799990.3422270.2970570.1441570.3891430.4634530.5020920.1265080.237510
symmetry_se0.025127-0.117235-0.017239-0.093748-0.0843470.2162860.2364740.1787250.0985100.446116...-0.142188-0.094773-0.117920-0.123926-0.0070470.0705910.035520-0.0329240.3829520.084610
fractal_dimension_se-0.091669-0.0095160.0792100.0271470.0020460.2781760.4966770.4273090.2602000.333318...-0.0057590.0088420.033505-0.0003670.1677140.3812390.3635780.2034680.0987920.522940
radius_worst-0.7764530.9695410.3525780.9694760.9627450.1899840.5349390.6887030.8284010.189763...1.0000000.3599250.9937070.9840140.2158950.4753480.5745620.7849460.2440340.092952
texture_worst-0.4569030.2970000.9120450.3030380.2874890.0226690.2475640.2989830.2868280.084924...0.3599251.0000000.3650980.3458420.2258080.3611230.3676250.3584670.2343370.214237
perimeter_worst-0.7829140.9651390.3580400.9703870.9591200.2155740.5899440.7299630.8536420.222908...0.9937070.3650981.0000000.9775780.2351680.5288760.6189060.8138260.2697880.137973
area_worst-0.7338250.9410870.3435460.9415500.9592130.1829050.5094490.6766560.8079780.181731...0.9840140.3458420.9775781.0000000.2090640.4377270.5437740.7450900.2094430.079535
smoothness_worst-0.4196610.1190310.0758850.1495300.1229840.7770840.5593180.4462470.4461570.424135...0.2158950.2258080.2351680.2090641.0000000.5631760.5139580.5424610.4863610.608113
compactness_worst-0.5904770.4130650.2781120.4553700.3898740.4619710.8642090.7551670.6657410.474981...0.4753480.3611230.5288760.4377270.5631761.0000000.8928590.8003070.6147170.800823
concavity_worst-0.6593450.5273790.3000410.5643150.5129430.4205490.8151630.8830880.7486870.432004...0.5745620.3676250.6189060.5437740.5139580.8928591.0000000.8550350.5305090.682274
concave points_worst-0.7919760.7414940.2934900.7684290.7194190.4837730.8130380.8594510.9050580.428627...0.7849460.3584670.8138260.7450900.5424610.8003070.8550351.0000000.5024870.510454
symmetry_worst-0.4171230.1644090.1058240.1893850.1437620.3912100.5083300.4097330.3704460.698086...0.2440340.2343370.2697880.2094430.4863610.6147170.5305090.5024871.0000000.527177
fractal_dimension_worst-0.3220460.0091690.1134250.0527940.0068540.4987830.6813430.5134970.3660580.427079...0.0929520.2142370.1379730.0795350.6081130.8008230.6822740.5104540.5271771.000000
\n", "

31 rows × 31 columns

\n", "
" ], "text/plain": [ " diagnosis radius_mean texture_mean perimeter_mean \\\n", "diagnosis 1.000000 -0.730032 -0.415185 -0.742636 \n", "radius_mean -0.730032 1.000000 0.323777 0.997856 \n", "texture_mean -0.415185 0.323777 1.000000 0.329533 \n", "perimeter_mean -0.742636 0.997856 0.329533 1.000000 \n", "area_mean -0.708984 0.987361 0.321086 0.986507 \n", "smoothness_mean -0.330624 0.147104 -0.040419 0.183356 \n", "compactness_mean -0.597576 0.505622 0.237019 0.556485 \n", "concavity_mean -0.695750 0.677041 0.300536 0.716448 \n", "concave points_mean -0.774410 0.821646 0.287618 0.849875 \n", "symmetry_mean -0.333477 0.151195 0.067193 0.186306 \n", "fractal_dimension_mean 0.027269 -0.290001 -0.080337 -0.245096 \n", "radius_se -0.567328 0.678590 0.275973 0.691268 \n", "texture_se 0.008299 -0.097230 0.386443 -0.086669 \n", "perimeter_se -0.556055 0.674109 0.281590 0.693069 \n", "area_se -0.548236 0.735868 0.259844 0.744983 \n", "smoothness_se 0.012117 -0.167456 0.019777 -0.149510 \n", "compactness_se -0.288713 0.204341 0.192029 0.248683 \n", "concavity_se -0.255041 0.194151 0.145536 0.228160 \n", "concave points_se -0.329287 0.325042 0.153373 0.353151 \n", "symmetry_se 0.025127 -0.117235 -0.017239 -0.093748 \n", "fractal_dimension_se -0.091669 -0.009516 0.079210 0.027147 \n", "radius_worst -0.776453 0.969541 0.352578 0.969476 \n", "texture_worst -0.456903 0.297000 0.912045 0.303038 \n", "perimeter_worst -0.782914 0.965139 0.358040 0.970387 \n", "area_worst -0.733825 0.941087 0.343546 0.941550 \n", "smoothness_worst -0.419661 0.119031 0.075885 0.149530 \n", "compactness_worst -0.590477 0.413065 0.278112 0.455370 \n", "concavity_worst -0.659345 0.527379 0.300041 0.564315 \n", "concave points_worst -0.791976 0.741494 0.293490 0.768429 \n", "symmetry_worst -0.417123 0.164409 0.105824 0.189385 \n", "fractal_dimension_worst -0.322046 0.009169 0.113425 0.052794 \n", "\n", " area_mean smoothness_mean compactness_mean \\\n", "diagnosis -0.708984 -0.330624 -0.597576 \n", 
"radius_mean 0.987361 0.147104 0.505622 \n", "texture_mean 0.321086 -0.040419 0.237019 \n", "perimeter_mean 0.986507 0.183356 0.556485 \n", "area_mean 1.000000 0.151671 0.498038 \n", "smoothness_mean 0.151671 1.000000 0.640292 \n", "compactness_mean 0.498038 0.640292 1.000000 \n", "concavity_mean 0.686308 0.501956 0.883729 \n", "concave points_mean 0.822500 0.528881 0.829022 \n", "symmetry_mean 0.154603 0.544531 0.602728 \n", "fractal_dimension_mean -0.260436 0.556840 0.506156 \n", "radius_se 0.732125 0.277600 0.497381 \n", "texture_se -0.066119 0.059149 0.044875 \n", "perimeter_se 0.726564 0.272183 0.548665 \n", "area_se 0.800086 0.222595 0.456053 \n", "smoothness_se -0.130192 0.268150 0.133392 \n", "compactness_se 0.211126 0.313043 0.729164 \n", "concavity_se 0.206659 0.249132 0.571996 \n", "concave points_se 0.322610 0.282948 0.558232 \n", "symmetry_se -0.084347 0.216286 0.236474 \n", "fractal_dimension_se 0.002046 0.278176 0.496677 \n", "radius_worst 0.962745 0.189984 0.534939 \n", "texture_worst 0.287489 0.022669 0.247564 \n", "perimeter_worst 0.959120 0.215574 0.589944 \n", "area_worst 0.959213 0.182905 0.509449 \n", "smoothness_worst 0.122984 0.777084 0.559318 \n", "compactness_worst 0.389874 0.461971 0.864209 \n", "concavity_worst 0.512943 0.420549 0.815163 \n", "concave points_worst 0.719419 0.483773 0.813038 \n", "symmetry_worst 0.143762 0.391210 0.508330 \n", "fractal_dimension_worst 0.006854 0.498783 0.681343 \n", "\n", " concavity_mean concave points_mean symmetry_mean \\\n", "diagnosis -0.695750 -0.774410 -0.333477 \n", "radius_mean 0.677041 0.821646 0.151195 \n", "texture_mean 0.300536 0.287618 0.067193 \n", "perimeter_mean 0.716448 0.849875 0.186306 \n", "area_mean 0.686308 0.822500 0.154603 \n", "smoothness_mean 0.501956 0.528881 0.544531 \n", "compactness_mean 0.883729 0.829022 0.602728 \n", "concavity_mean 1.000000 0.918687 0.500739 \n", "concave points_mean 0.918687 1.000000 0.460745 \n", "symmetry_mean 0.500739 0.460745 1.000000 \n", 
"fractal_dimension_mean 0.301176 0.142307 0.411393 \n", "radius_se 0.631258 0.699764 0.304353 \n", "texture_se 0.075893 0.025260 0.126154 \n", "perimeter_se 0.659685 0.712066 0.313482 \n", "area_se 0.617299 0.690642 0.226088 \n", "smoothness_se 0.110590 0.051658 0.153937 \n", "compactness_se 0.665217 0.490719 0.412566 \n", "concavity_se 0.688883 0.441037 0.337796 \n", "concave points_se 0.582869 0.522044 0.312608 \n", "symmetry_se 0.178725 0.098510 0.446116 \n", "fractal_dimension_se 0.427309 0.260200 0.333318 \n", "radius_worst 0.688703 0.828401 0.189763 \n", "texture_worst 0.298983 0.286828 0.084924 \n", "perimeter_worst 0.729963 0.853642 0.222908 \n", "area_worst 0.676656 0.807978 0.181731 \n", "smoothness_worst 0.446247 0.446157 0.424135 \n", "compactness_worst 0.755167 0.665741 0.474981 \n", "concavity_worst 0.883088 0.748687 0.432004 \n", "concave points_worst 0.859451 0.905058 0.428627 \n", "symmetry_worst 0.409733 0.370446 0.698086 \n", "fractal_dimension_worst 0.513497 0.366058 0.427079 \n", "\n", " ... radius_worst texture_worst perimeter_worst \\\n", "diagnosis ... -0.776453 -0.456903 -0.782914 \n", "radius_mean ... 0.969541 0.297000 0.965139 \n", "texture_mean ... 0.352578 0.912045 0.358040 \n", "perimeter_mean ... 0.969476 0.303038 0.970387 \n", "area_mean ... 0.962745 0.287489 0.959120 \n", "smoothness_mean ... 0.189984 0.022669 0.215574 \n", "compactness_mean ... 0.534939 0.247564 0.589944 \n", "concavity_mean ... 0.688703 0.298983 0.729963 \n", "concave points_mean ... 0.828401 0.286828 0.853642 \n", "symmetry_mean ... 0.189763 0.084924 0.222908 \n", "fractal_dimension_mean ... -0.242125 -0.051211 -0.199454 \n", "radius_se ... 0.714872 0.195092 0.719492 \n", "texture_se ... -0.111592 0.409071 -0.102161 \n", "perimeter_se ... 0.697133 0.200270 0.720966 \n", "area_se ... 0.757372 0.196496 0.761213 \n", "smoothness_se ... -0.152307 -0.008034 -0.136532 \n", "compactness_se ... 0.202027 0.146121 0.256901 \n", "concavity_se ... 
0.187371 0.102586 0.227007 \n", "concave points_se ... 0.307288 0.079999 0.342227 \n", "symmetry_se ... -0.142188 -0.094773 -0.117920 \n", "fractal_dimension_se ... -0.005759 0.008842 0.033505 \n", "radius_worst ... 1.000000 0.359925 0.993707 \n", "texture_worst ... 0.359925 1.000000 0.365098 \n", "perimeter_worst ... 0.993707 0.365098 1.000000 \n", "area_worst ... 0.984014 0.345842 0.977578 \n", "smoothness_worst ... 0.215895 0.225808 0.235168 \n", "compactness_worst ... 0.475348 0.361123 0.528876 \n", "concavity_worst ... 0.574562 0.367625 0.618906 \n", "concave points_worst ... 0.784946 0.358467 0.813826 \n", "symmetry_worst ... 0.244034 0.234337 0.269788 \n", "fractal_dimension_worst ... 0.092952 0.214237 0.137973 \n", "\n", " area_worst smoothness_worst compactness_worst \\\n", "diagnosis -0.733825 -0.419661 -0.590477 \n", "radius_mean 0.941087 0.119031 0.413065 \n", "texture_mean 0.343546 0.075885 0.278112 \n", "perimeter_mean 0.941550 0.149530 0.455370 \n", "area_mean 0.959213 0.122984 0.389874 \n", "smoothness_mean 0.182905 0.777084 0.461971 \n", "compactness_mean 0.509449 0.559318 0.864209 \n", "concavity_mean 0.676656 0.446247 0.755167 \n", "concave points_mean 0.807978 0.446157 0.665741 \n", "symmetry_mean 0.181731 0.424135 0.474981 \n", "fractal_dimension_mean -0.218566 0.451897 0.410627 \n", "radius_se 0.751431 0.140187 0.286572 \n", "texture_se -0.083078 -0.073042 -0.092846 \n", "perimeter_se 0.730647 0.125791 0.340951 \n", "area_se 0.811408 0.124392 0.282682 \n", "smoothness_se -0.115951 0.309111 0.033085 \n", "compactness_se 0.196258 0.221708 0.672119 \n", "concavity_se 0.187836 0.172332 0.491365 \n", "concave points_se 0.297057 0.144157 0.389143 \n", "symmetry_se -0.123926 -0.007047 0.070591 \n", "fractal_dimension_se -0.000367 0.167714 0.381239 \n", "radius_worst 0.984014 0.215895 0.475348 \n", "texture_worst 0.345842 0.225808 0.361123 \n", "perimeter_worst 0.977578 0.235168 0.528876 \n", "area_worst 1.000000 0.209064 0.437727 \n", 
"smoothness_worst 0.209064 1.000000 0.563176 \n", "compactness_worst 0.437727 0.563176 1.000000 \n", "concavity_worst 0.543774 0.513958 0.892859 \n", "concave points_worst 0.745090 0.542461 0.800307 \n", "symmetry_worst 0.209443 0.486361 0.614717 \n", "fractal_dimension_worst 0.079535 0.608113 0.800823 \n", "\n", " concavity_worst concave points_worst \\\n", "diagnosis -0.659345 -0.791976 \n", "radius_mean 0.527379 0.741494 \n", "texture_mean 0.300041 0.293490 \n", "perimeter_mean 0.564315 0.768429 \n", "area_mean 0.512943 0.719419 \n", "smoothness_mean 0.420549 0.483773 \n", "compactness_mean 0.815163 0.813038 \n", "concavity_mean 0.883088 0.859451 \n", "concave points_mean 0.748687 0.905058 \n", "symmetry_mean 0.432004 0.428627 \n", "fractal_dimension_mean 0.305228 0.148275 \n", "radius_se 0.380110 0.528842 \n", "texture_se -0.070533 -0.120938 \n", "perimeter_se 0.418269 0.552873 \n", "area_se 0.384820 0.535417 \n", "smoothness_se 0.018581 -0.027878 \n", "compactness_se 0.633328 0.477739 \n", "concavity_se 0.662437 0.443167 \n", "concave points_se 0.463453 0.502092 \n", "symmetry_se 0.035520 -0.032924 \n", "fractal_dimension_se 0.363578 0.203468 \n", "radius_worst 0.574562 0.784946 \n", "texture_worst 0.367625 0.358467 \n", "perimeter_worst 0.618906 0.813826 \n", "area_worst 0.543774 0.745090 \n", "smoothness_worst 0.513958 0.542461 \n", "compactness_worst 0.892859 0.800307 \n", "concavity_worst 1.000000 0.855035 \n", "concave points_worst 0.855035 1.000000 \n", "symmetry_worst 0.530509 0.502487 \n", "fractal_dimension_worst 0.682274 0.510454 \n", "\n", " symmetry_worst fractal_dimension_worst \n", "diagnosis -0.417123 -0.322046 \n", "radius_mean 0.164409 0.009169 \n", "texture_mean 0.105824 0.113425 \n", "perimeter_mean 0.189385 0.052794 \n", "area_mean 0.143762 0.006854 \n", "smoothness_mean 0.391210 0.498783 \n", "compactness_mean 0.508330 0.681343 \n", "concavity_mean 0.409733 0.513497 \n", "concave points_mean 0.370446 0.366058 \n", "symmetry_mean 0.698086 
0.427079 \n", "fractal_dimension_mean 0.283061 0.701592 \n", "radius_se 0.094549 0.049351 \n", "texture_se -0.127607 -0.044761 \n", "perimeter_se 0.108877 0.083512 \n", "area_se 0.072572 0.016228 \n", "smoothness_se -0.032829 0.119089 \n", "compactness_se 0.271770 0.574867 \n", "concavity_se 0.199593 0.445392 \n", "concave points_se 0.126508 0.237510 \n", "symmetry_se 0.382952 0.084610 \n", "fractal_dimension_se 0.098792 0.522940 \n", "radius_worst 0.244034 0.092952 \n", "texture_worst 0.234337 0.214237 \n", "perimeter_worst 0.269788 0.137973 \n", "area_worst 0.209443 0.079535 \n", "smoothness_worst 0.486361 0.608113 \n", "compactness_worst 0.614717 0.800823 \n", "concavity_worst 0.530509 0.682274 \n", "concave points_worst 0.502487 0.510454 \n", "symmetry_worst 1.000000 0.527177 \n", "fractal_dimension_worst 0.527177 1.000000 \n", "\n", "[31 rows x 31 columns]" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.corr()" ] }, { "cell_type": "markdown", "id": "ee2f0278", "metadata": {}, "source": [ "# Model Loading\n", "In this classification task, we use scikit-learn's algorithms to predict the diagnosis labels M and B (now encoded as 0 and 1).\n", " \n", "Because scikit-learn provides many classifiers, we can train several models and compare their results later."
] },
{ "cell_type": "code", "execution_count": 12, "id": "4b48f208", "metadata": {}, "outputs": [], "source": [
 "from sklearn.model_selection import train_test_split\n",
 "from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score"
] },
{ "cell_type": "code", "execution_count": 17, "id": "d7b5908a", "metadata": {}, "outputs": [], "source": [
 "from sklearn.ensemble import RandomForestClassifier\n",
 "from sklearn.svm import SVC\n",
 "from sklearn.neighbors import KNeighborsClassifier\n",
 "from sklearn.linear_model import LogisticRegression\n",
 "from sklearn.tree import DecisionTreeClassifier\n",
 "from sklearn.naive_bayes import GaussianNB\n",
 "from sklearn.ensemble import AdaBoostClassifier\n",
 "from sklearn.ensemble import GradientBoostingClassifier"
] },
{ "cell_type": "markdown", "id": "3d50df8d", "metadata": {}, "source": [
 "Splitting the dataset into Training and Test Sets"
] },
{ "cell_type": "code", "execution_count": 13, "id": "e32ec855", "metadata": {}, "outputs": [], "source": [
 "# Split the data into features (X) and labels (y)\n",
 "X = df.drop(columns=['diagnosis'])\n",
 "y = df['diagnosis']\n",
 "\n",
 "# Split the data into training and testing sets (fixed seed for a repeatable split)\n",
 "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)"
] },
{ "cell_type": "code", "execution_count": 18, "id": "28ebe0cf", "metadata": {}, "outputs": [], "source": [
 "# Candidate classifiers, keyed by the display name used in the results table.\n",
 "# The tree/ensemble models are seeded (same value as the split) so that their\n",
 "# scores are identical on every run.\n",
 "models = {\n",
 "    'Random Forest': RandomForestClassifier(random_state=42),\n",
 "    'Support Vector Machine': SVC(),\n",
 "    'K-Nearest Neighbors': KNeighborsClassifier(),\n",
 "    # With the default limit of 100 iterations lbfgs stopped before converging\n",
 "    # (ConvergenceWarning in the run output), so allow more iterations.\n",
 "    'Logistic Regression': LogisticRegression(max_iter=10000),\n",
 "    'Decision Tree': DecisionTreeClassifier(random_state=42),\n",
 "    'Naive Bayes': GaussianNB(),\n",
 "    'AdaBoost': AdaBoostClassifier(random_state=42),\n",
 "    'Gradient Boosting': GradientBoostingClassifier(random_state=42)\n",
 "}"
] },
{ "cell_type": "code", "execution_count": 19, "id": "17781adc", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [
 "C:\\Users\\sang.yogi\\Anaconda3\\lib\\site-packages\\sklearn\\linear_model\\_logistic.py:814: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
 "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n",
 "\n",
 "Increase the number of iterations (max_iter) or scale the data as shown in:\n",
 " https://scikit-learn.org/stable/modules/preprocessing.html\n",
 "Please also refer to the documentation for alternative solver options:\n",
 " https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
 " n_iter_i = _check_optimize_result(\n"
] } ], "source": [
 "# Create the accumulator inside this cell so the cell is idempotent: when the\n",
 "# dictionary lived in an earlier cell, re-running this loop appended every\n",
 "# model a second time and produced duplicate rows in the results table.\n",
 "results = {'Model': [], 'F1_score': [], 'Accuracy': [], 'Precision': [], 'Recall': []}\n",
 "\n",
 "# NOTE(review): f1/precision/recall use scikit-learn's default pos_label=1.\n",
 "# The correlation table suggests 1 may encode the benign class - confirm which\n",
 "# class should be treated as positive before interpreting these scores.\n",
 "for model_name, model in models.items():\n",
 "    # Train the model\n",
 "    model.fit(X_train, y_train)\n",
 "\n",
 "    # Make predictions on the held-out test set\n",
 "    y_pred = model.predict(X_test)\n",
 "\n",
 "    # Evaluate the model\n",
 "    f1 = f1_score(y_test, y_pred)\n",
 "    accuracy = accuracy_score(y_test, y_pred)\n",
 "    precision = precision_score(y_test, y_pred)\n",
 "    recall = recall_score(y_test, y_pred)\n",
 "\n",
 "    # Store one row per model\n",
 "    results['Model'].append(model_name)\n",
 "    results['F1_score'].append(f1)\n",
 "    results['Accuracy'].append(accuracy)\n",
 "    results['Precision'].append(precision)\n",
 "    results['Recall'].append(recall)\n",
 "\n",
 "# Create a DataFrame from the results dictionary\n",
 "results_df = pd.DataFrame(results)"
] },
{ "cell_type": "code", "execution_count": 20, "id": "c98e51c8", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ModelF1_scoreAccuracyPrecisionRecall
0Random Forest0.9722220.9649120.9589040.985915
1Support Vector Machine0.9594590.9473680.9220781.000000
2K-Nearest Neighbors0.9659860.9561400.9342111.000000
3Random Forest0.9722220.9649120.9589040.985915
4Support Vector Machine0.9594590.9473680.9220781.000000
5K-Nearest Neighbors0.9659860.9561400.9342111.000000
6Logistic Regression0.9722220.9649120.9589040.985915
7Decision Tree0.9510490.9385960.9444440.957746
8Naive Bayes0.9793100.9736840.9594591.000000
9AdaBoost0.9577460.9473680.9577460.957746
10Gradient Boosting0.9722220.9649120.9589040.985915
\n", "
" ], "text/plain": [ " Model F1_score Accuracy Precision Recall\n", "0 Random Forest 0.972222 0.964912 0.958904 0.985915\n", "1 Support Vector Machine 0.959459 0.947368 0.922078 1.000000\n", "2 K-Nearest Neighbors 0.965986 0.956140 0.934211 1.000000\n", "3 Random Forest 0.972222 0.964912 0.958904 0.985915\n", "4 Support Vector Machine 0.959459 0.947368 0.922078 1.000000\n", "5 K-Nearest Neighbors 0.965986 0.956140 0.934211 1.000000\n", "6 Logistic Regression 0.972222 0.964912 0.958904 0.985915\n", "7 Decision Tree 0.951049 0.938596 0.944444 0.957746\n", "8 Naive Bayes 0.979310 0.973684 0.959459 1.000000\n", "9 AdaBoost 0.957746 0.947368 0.957746 0.957746\n", "10 Gradient Boosting 0.972222 0.964912 0.958904 0.985915" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "results_df" ] }, { "cell_type": "code", "execution_count": null, "id": "212e9b94", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.12" } }, "nbformat": 4, "nbformat_minor": 5 }