{ "cells": [ { "cell_type": "markdown", "id": "ee34a7c4", "metadata": {}, "source": [ "## Import Libraries" ] }, { "cell_type": "code", "execution_count": 1, "id": "1a23a10f", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import os\n", "import sklearn\n", "from sklearn.model_selection import train_test_split" ] }, { "cell_type": "markdown", "id": "3333920d", "metadata": {}, "source": [ "## Dataset\n", "The dataset used in this notebook is available [on Kaggle](https://www.kaggle.com/datasets/elakiricoder/gender-classification-dataset)." ] }, { "cell_type": "code", "execution_count": 2, "id": "5aea2295", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
long_hairforehead_width_cmforehead_height_cmnose_widenose_longlips_thindistance_nose_to_lip_longgender
0111.86.11011Male
1014.05.40010Female
2011.86.31111Male
3014.46.10111Male
4113.55.90000Female
\n", "
" ], "text/plain": [ " long_hair forehead_width_cm forehead_height_cm nose_wide nose_long \\\n", "0 1 11.8 6.1 1 0 \n", "1 0 14.0 5.4 0 0 \n", "2 0 11.8 6.3 1 1 \n", "3 0 14.4 6.1 0 1 \n", "4 1 13.5 5.9 0 0 \n", "\n", " lips_thin distance_nose_to_lip_long gender \n", "0 1 1 Male \n", "1 1 0 Female \n", "2 1 1 Male \n", "3 1 1 Male \n", "4 0 0 Female " ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Load the training dataset.\n", "# The folder can be overridden with the GENDER_DATA_DIR environment variable;\n", "# when it is unset, the original location D:\\archive is used.\n", "DATA_DIR = os.environ.get('GENDER_DATA_DIR', r'D:\\archive')\n", "df = pd.read_csv(os.path.join(DATA_DIR, 'gender_classification_v7.csv'), encoding='utf-8')\n", "df.head()" ] }, { "cell_type": "markdown", "id": "58b8ed5e", "metadata": {}, "source": [ "## Data Pre-processing\n", "This example skips descriptive statistics and goes straight to the minor adjustments needed before modelling." ] }, { "cell_type": "code", "execution_count": 3, "id": "d93ff56d", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "long_hair int64\n", "forehead_width_cm float64\n", "forehead_height_cm float64\n", "nose_wide int64\n", "nose_long int64\n", "lips_thin int64\n", "distance_nose_to_lip_long int64\n", "gender object\n", "dtype: object" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Inspect the dtype of every column (gender is still a string column here)\n", "df.dtypes" ] }, { "cell_type": "code", "execution_count": 4, "id": "19ae1cf5", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
long_hairforehead_width_cmforehead_height_cmnose_widenose_longlips_thindistance_nose_to_lip_longgender
0111.86.110110
1014.05.400101
2011.86.311110
3014.46.101110
4113.55.900001
\n", "
" ], "text/plain": [ " long_hair forehead_width_cm forehead_height_cm nose_wide nose_long \\\n", "0 1 11.8 6.1 1 0 \n", "1 0 14.0 5.4 0 0 \n", "2 0 11.8 6.3 1 1 \n", "3 0 14.4 6.1 0 1 \n", "4 1 13.5 5.9 0 0 \n", "\n", " lips_thin distance_nose_to_lip_long gender \n", "0 1 1 0 \n", "1 1 0 1 \n", "2 1 1 0 \n", "3 1 1 0 \n", "4 0 0 1 " ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Encode the gender labels as integers for classification (Male -> 0, Female -> 1).\n", "# A single mapping keeps the encoding in one place; replace() leaves values that are\n", "# not in the mapping untouched, so re-running this cell is harmless.\n", "df['gender'] = df['gender'].replace({'Male': 0, 'Female': 1})\n", "df.head()" ] }, { "cell_type": "code", "execution_count": 5, "id": "b573f11e", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "long_hair int64\n", "forehead_width_cm float64\n", "forehead_height_cm float64\n", "nose_wide int64\n", "nose_long int64\n", "lips_thin int64\n", "distance_nose_to_lip_long int64\n", "gender int64\n", "dtype: object" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Every column is numeric now\n", "df.dtypes" ] }, { "cell_type": "code", "execution_count": 7, "id": "35388ca3", "metadata": {}, "outputs": [], "source": [ "# Split the dataset into X (features) and y (label)\n", "\n", "# The label is selected by name rather than by position, so the split stays\n", "# correct even if the column order of the CSV changes\n", "LABEL_COLUMN = 'gender'\n", "# X is every column except the label\n", "X = df.drop(columns=LABEL_COLUMN)\n", "# y is the label column (to be predicted)\n", "y = df[LABEL_COLUMN]" ] }, { "cell_type": "code", "execution_count": 8, "id": "14c3347e", "metadata": {}, "outputs": [], "source": [ "# Hold out a test split with sklearn's train_test_split (imported at the top);\n", "# random_state fixes the shuffle so the split is reproducible\n", "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)" ] }, { "cell_type": "markdown", "id": "fe832e3f", "metadata": {}, "source": [ "## Using 4 Classifiers\n", "It is suggested to take a deeper look at the parameters described in the documentation linked below, to tune the classifiers further.\n", "- [Logistic 
Regression](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html)\n", "- [Decision Tree](https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html)\n", "- [SVM](https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html)\n", "- [K-Nearest Neighbors (KNN)](https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html)" ] }, { "cell_type": "code", "execution_count": 9, "id": "f83a2e5c", "metadata": {}, "outputs": [], "source": [ "from sklearn.linear_model import LogisticRegression\n", "from sklearn.tree import DecisionTreeClassifier\n", "from sklearn.svm import SVC\n", "from sklearn.neighbors import KNeighborsClassifier" ] }, { "cell_type": "code", "execution_count": null, "id": "dc4c2062", "metadata": {}, "outputs": [], "source": [ "# Every model is fitted on the training split only and scored on the held-out\n", "# test split, so the reported accuracy is measured on rows the model has not seen.\n", "# (Scoring on the full X, y would include the training rows and inflate the numbers.)\n", "\n", "# Logistic Regression\n", "# multi_class is left at its default: the target is binary, for which the default\n", "# already resolves to one-vs-rest, and the argument is deprecated in recent scikit-learn\n", "LogR = LogisticRegression(random_state=0).fit(X_train, y_train)\n", "# Predict the test set\n", "LogR_pred = LogR.predict(X_test)\n", "\n", "# Decision Tree (random_state makes tie-breaking between equal splits reproducible)\n", "dtree = DecisionTreeClassifier(max_depth=2, random_state=0).fit(X_train, y_train)\n", "dtree_pred = dtree.predict(X_test)\n", "\n", "# SVM with a linear kernel\n", "svm = SVC(kernel='linear', C=1).fit(X_train, y_train)\n", "svm_pred = svm.predict(X_test)\n", "\n", "# K-Nearest Neighbors (KNN) -- a supervised classifier, not K-Means clustering\n", "knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)\n", "knn_pred = knn.predict(X_test)\n", "\n", "# Accuracy of each classifier on the test split\n", "print(f'Logistic Regression Accuracy: {LogR.score(X_test, y_test)}')\n", "print(f'Decision Tree Accuracy: {dtree.score(X_test, y_test)}')\n", "print(f'SVM Accuracy: {svm.score(X_test, y_test)}')\n", "print(f'KNN Accuracy: {knn.score(X_test, y_test)}')" ] }, { "cell_type": "markdown", "id": "00f72b96", 
"metadata": {}, "source": [ "## Try on a new dataset\n", "Use one (or more) of the models above to predict labels for a new dataset. Assuming it has the same feature columns but different values, we get:" ] }, { "cell_type": "code", "execution_count": 11, "id": "9c24db9a", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
long_hairforehead_width_cmforehead_height_cmnose_widenose_longlips_thindistance_nose_to_lip_long
0114.56.70111
1114.05.90000
2112.96.41001
\n", "
" ], "text/plain": [ " long_hair forehead_width_cm forehead_height_cm nose_wide nose_long \\\n", "0 1 14.5 6.7 0 1 \n", "1 1 14.0 5.9 0 0 \n", "2 1 12.9 6.4 1 0 \n", "\n", " lips_thin distance_nose_to_lip_long \n", "0 1 1 \n", "1 0 0 \n", "2 0 1 " ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# For this example we use 3 rows of data to be predicted.\n", "# The folder can be overridden with the GENDER_DATA_DIR environment variable;\n", "# when it is unset, the original location D:\\archive is used.\n", "dval = pd.read_csv(os.path.join(os.environ.get('GENDER_DATA_DIR', r'D:\\archive'), 'valgend.csv'), encoding='utf-8')\n", "dval.head()" ] }, { "cell_type": "code", "execution_count": 12, "id": "ad501b6a", "metadata": { "scrolled": false }, "outputs": [], "source": [ "# Predict with knn (swap in whichever fitted model you prefer).\n", "# Only the training feature columns are passed, in training order, so this cell\n", "# still works when re-run after the pred_gender column has been added to dval.\n", "knn_pred_new = knn.predict(dval[X.columns])" ] }, { "cell_type": "code", "execution_count": 13, "id": "8896ab72", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([0, 1, 0], dtype=int64)" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Inspect the raw predictions (integer class codes)\n", "knn_pred_new" ] }, { "cell_type": "code", "execution_count": 14, "id": "7fa9db00", "metadata": {}, "outputs": [], "source": [ "# Store the predictions from the previous step in a new column of the new dataframe\n", "dval['pred_gender'] = knn_pred_new" ] }, { "cell_type": "code", "execution_count": 15, "id": "6155a519", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
long_hairforehead_width_cmforehead_height_cmnose_widenose_longlips_thindistance_nose_to_lip_longpred_gender
0114.56.701110
1114.05.900001
2112.96.410010
\n", "
" ], "text/plain": [ " long_hair forehead_width_cm forehead_height_cm nose_wide nose_long \\\n", "0 1 14.5 6.7 0 1 \n", "1 1 14.0 5.9 0 0 \n", "2 1 12.9 6.4 1 0 \n", "\n", " lips_thin distance_nose_to_lip_long pred_gender \n", "0 1 1 0 \n", "1 0 0 1 \n", "2 0 1 0 " ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Preview the new rows together with the appended prediction (last column)\n", "dval.head(n=5)" ] }, { "cell_type": "code", "execution_count": 16, "id": "c2587a57", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
long_hairforehead_width_cmforehead_height_cmnose_widenose_longlips_thindistance_nose_to_lip_longpred_gender
0114.56.70111Male
1114.05.90000Female
2112.96.41001Male
\n", "
" ], "text/plain": [ " long_hair forehead_width_cm forehead_height_cm nose_wide nose_long \\\n", "0 1 14.5 6.7 0 1 \n", "1 1 14.0 5.9 0 0 \n", "2 1 12.9 6.4 1 0 \n", "\n", " lips_thin distance_nose_to_lip_long pred_gender \n", "0 1 1 Male \n", "1 0 0 Female \n", "2 0 1 Male " ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Translate the integer predictions back into readable labels (0 -> Male, 1 -> Female)\n", "dval['pred_gender'] = dval['pred_gender'].replace({0: 'Male', 1: 'Female'})\n", "dval.head()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.12" } }, "nbformat": 4, "nbformat_minor": 5 }