Files
Data_ScienceUse_Cases/Label Prediction (Binary Example).ipynb
2023-09-05 10:19:56 +07:00

815 lines
24 KiB
Plaintext

{
"cells": [
{
"cell_type": "markdown",
"id": "ee34a7c4",
"metadata": {},
"source": [
"## Import Libraries"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "1a23a10f",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import os\n",
"import sklearn\n",
"from sklearn.model_selection import train_test_split"
]
},
{
"cell_type": "markdown",
"id": "3333920d",
"metadata": {},
"source": [
"## Dataset\n",
"The dataset used in this notebook is available [here](https://www.kaggle.com/datasets/elakiricoder/gender-classification-dataset)."
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "5aea2295",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>long_hair</th>\n",
" <th>forehead_width_cm</th>\n",
" <th>forehead_height_cm</th>\n",
" <th>nose_wide</th>\n",
" <th>nose_long</th>\n",
" <th>lips_thin</th>\n",
" <th>distance_nose_to_lip_long</th>\n",
" <th>gender</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>11.8</td>\n",
" <td>6.1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>Male</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0</td>\n",
" <td>14.0</td>\n",
" <td>5.4</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>Female</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0</td>\n",
" <td>11.8</td>\n",
" <td>6.3</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>Male</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0</td>\n",
" <td>14.4</td>\n",
" <td>6.1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>Male</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1</td>\n",
" <td>13.5</td>\n",
" <td>5.9</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>Female</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" long_hair forehead_width_cm forehead_height_cm nose_wide nose_long \\\n",
"0 1 11.8 6.1 1 0 \n",
"1 0 14.0 5.4 0 0 \n",
"2 0 11.8 6.3 1 1 \n",
"3 0 14.4 6.1 0 1 \n",
"4 1 13.5 5.9 0 0 \n",
"\n",
" lips_thin distance_nose_to_lip_long gender \n",
"0 1 1 Male \n",
"1 1 0 Female \n",
"2 1 1 Male \n",
"3 1 1 Male \n",
"4 0 0 Female "
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Load dataset. The folder defaults to the original local path and can be\n",
"# overridden with the GENDER_DATA_DIR environment variable on other machines.\n",
"data_dir = os.environ.get('GENDER_DATA_DIR', r'D:\\archive')\n",
"df = pd.read_csv(os.path.join(data_dir, 'gender_classification_v7.csv'), encoding='utf-8')\n",
"df.head()"
]
},
{
"cell_type": "markdown",
"id": "58b8ed5e",
"metadata": {},
"source": [
"## Data Pre-processing\n",
"For this example, I skipped the descriptive statistics and went straight to a few minor adjustments."
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "d93ff56d",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"long_hair int64\n",
"forehead_width_cm float64\n",
"forehead_height_cm float64\n",
"nose_wide int64\n",
"nose_long int64\n",
"lips_thin int64\n",
"distance_nose_to_lip_long int64\n",
"gender object\n",
"dtype: object"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Check the data types of the dataframe columns\n",
"df.dtypes"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "19ae1cf5",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>long_hair</th>\n",
" <th>forehead_width_cm</th>\n",
" <th>forehead_height_cm</th>\n",
" <th>nose_wide</th>\n",
" <th>nose_long</th>\n",
" <th>lips_thin</th>\n",
" <th>distance_nose_to_lip_long</th>\n",
" <th>gender</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>11.8</td>\n",
" <td>6.1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0</td>\n",
" <td>14.0</td>\n",
" <td>5.4</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0</td>\n",
" <td>11.8</td>\n",
" <td>6.3</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0</td>\n",
" <td>14.4</td>\n",
" <td>6.1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1</td>\n",
" <td>13.5</td>\n",
" <td>5.9</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" long_hair forehead_width_cm forehead_height_cm nose_wide nose_long \\\n",
"0 1 11.8 6.1 1 0 \n",
"1 0 14.0 5.4 0 0 \n",
"2 0 11.8 6.3 1 1 \n",
"3 0 14.4 6.1 0 1 \n",
"4 1 13.5 5.9 0 0 \n",
"\n",
" lips_thin distance_nose_to_lip_long gender \n",
"0 1 1 0 \n",
"1 1 0 1 \n",
"2 1 1 0 \n",
"3 1 1 0 \n",
"4 0 0 1 "
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Encode the gender labels as integers for classification: Male -> 0, Female -> 1.\n",
"# The explicit astype keeps the column int64 on pandas versions where replace()\n",
"# no longer downcasts object columns, and fails loudly on an unexpected label.\n",
"df['gender'] = df['gender'].replace({'Male': 0, 'Female': 1}).astype('int64')\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "b573f11e",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"long_hair int64\n",
"forehead_width_cm float64\n",
"forehead_height_cm float64\n",
"nose_wide int64\n",
"nose_long int64\n",
"lips_thin int64\n",
"distance_nose_to_lip_long int64\n",
"gender int64\n",
"dtype: object"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# All columns, including 'gender', are now numeric\n",
"df.dtypes"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "35388ca3",
"metadata": {},
"outputs": [],
"source": [
"# Separate the predictors from the target.\n",
"# The label is taken to be the last column of df; every column before it is a feature.\n",
"label_pos = df.shape[1] - 1\n",
"\n",
"# X: feature matrix (all columns except the last)\n",
"X = df.iloc[:, :label_pos]\n",
"# y: label vector (the last column, to be predicted)\n",
"y = df.iloc[:, label_pos]"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "14c3347e",
"metadata": {},
"outputs": [],
"source": [
"# Split into train and test sets; test_size=0.25 is sklearn's default when neither\n",
"# size is given, and the fixed random_state keeps the shuffle reproducible\n",
"X_train, X_test, y_train, y_test = train_test_split(\n",
"    X, y, test_size=0.25, random_state=0\n",
")"
]
},
{
"cell_type": "markdown",
"id": "fe832e3f",
"metadata": {},
"source": [
"## Using 4 Classifiers\n",
"It is suggested to take a closer look at the parameters described in the documentation below to better tune the classifiers.\n",
"- [Logistic Regression](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html)\n",
"- [Decision Tree](https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html)\n",
"- [SVM](https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html)\n",
"- [K-Nearest Neighbors (KNN)](https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html) (not to be confused with K-Means, which is a clustering algorithm)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "f83a2e5c",
"metadata": {},
"outputs": [],
"source": [
"from sklearn.linear_model import LogisticRegression\n",
"from sklearn.tree import DecisionTreeClassifier\n",
"from sklearn.svm import SVC\n",
"from sklearn.neighbors import KNeighborsClassifier"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "dc4c2062",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Logistic Regression Accuracy: 0.9682063587282543\n",
"Decision Tree Accuracy: 0.8792241551689662\n",
"SVM Accuracy: 0.967006598680264\n",
"K-Means Accuracy: 0.9754049190161967\n"
]
}
],
"source": [
"# Logistic Regression\n",
"# Train the model (binary target, so the default multi-class handling is used;\n",
"# the multi_class argument is deprecated in recent scikit-learn releases)\n",
"LogR = LogisticRegression(random_state=0).fit(X_train, y_train)\n",
"# Predict the test set\n",
"LogR_pred = LogR.predict(X_test)\n",
"\n",
"# Decision Tree (seeded so tie-breaking between equally good splits is reproducible)\n",
"dtree = DecisionTreeClassifier(max_depth=2, random_state=0).fit(X_train, y_train)\n",
"dtree_pred = dtree.predict(X_test)\n",
"\n",
"# SVM\n",
"svm = SVC(kernel='linear', C=1).fit(X_train, y_train)\n",
"svm_pred = svm.predict(X_test)\n",
"\n",
"# K-Nearest Neighbors (a supervised classifier; not K-Means, which is clustering)\n",
"knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)\n",
"knn_pred = knn.predict(X_test)\n",
"\n",
"# Accuracy of each classifier on the held-out test set only.\n",
"# Scoring on the full (X, y) would include the training rows and inflate the numbers.\n",
"print(\"Logistic Regression Accuracy: \" + str(LogR.score(X_test, y_test)))\n",
"print(\"Decision Tree Accuracy: \" + str(dtree.score(X_test, y_test)))\n",
"print(\"SVM Accuracy: \" + str(svm.score(X_test, y_test)))\n",
"print(\"KNN Accuracy: \" + str(knn.score(X_test, y_test)))"
]
},
{
"cell_type": "markdown",
"id": "00f72b96",
"metadata": {},
"source": [
"## Try on a new dataset\n",
"Use one (or more) of the models above as a predictor on a new dataset. Assuming it has the same feature columns but different values, we get:"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "9c24db9a",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>long_hair</th>\n",
" <th>forehead_width_cm</th>\n",
" <th>forehead_height_cm</th>\n",
" <th>nose_wide</th>\n",
" <th>nose_long</th>\n",
" <th>lips_thin</th>\n",
" <th>distance_nose_to_lip_long</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>14.5</td>\n",
" <td>6.7</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>14.0</td>\n",
" <td>5.9</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>12.9</td>\n",
" <td>6.4</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" long_hair forehead_width_cm forehead_height_cm nose_wide nose_long \\\n",
"0 1 14.5 6.7 0 1 \n",
"1 1 14.0 5.9 0 0 \n",
"2 1 12.9 6.4 1 0 \n",
"\n",
" lips_thin distance_nose_to_lip_long \n",
"0 1 1 \n",
"1 0 0 \n",
"2 0 1 "
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# For this example we use 3 rows of data to be predicted.\n",
"# The folder defaults to the original local path; set GENDER_DATA_DIR to override it.\n",
"data_dir = os.environ.get('GENDER_DATA_DIR', r'D:\\archive')\n",
"dval = pd.read_csv(os.path.join(data_dir, 'valgend.csv'), encoding='utf-8')\n",
"dval.head()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "ad501b6a",
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"# Predict with knn (swap in whichever fitted model you prefer).\n",
"# Select the training feature columns by name so the cell still works when re-run\n",
"# after 'pred_gender' has been appended to dval.\n",
"knn_pred_new = knn.predict(dval[X.columns])"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "8896ab72",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([0, 1, 0], dtype=int64)"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Show the raw predictions (0 = Male, 1 = Female)\n",
"knn_pred_new"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "7fa9db00",
"metadata": {},
"outputs": [],
"source": [
"# Attach the predictions to the new dataframe as a 'pred_gender' column\n",
"dval = dval.assign(pred_gender=knn_pred_new)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "6155a519",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>long_hair</th>\n",
" <th>forehead_width_cm</th>\n",
" <th>forehead_height_cm</th>\n",
" <th>nose_wide</th>\n",
" <th>nose_long</th>\n",
" <th>lips_thin</th>\n",
" <th>distance_nose_to_lip_long</th>\n",
" <th>pred_gender</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>14.5</td>\n",
" <td>6.7</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>14.0</td>\n",
" <td>5.9</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>12.9</td>\n",
" <td>6.4</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" long_hair forehead_width_cm forehead_height_cm nose_wide nose_long \\\n",
"0 1 14.5 6.7 0 1 \n",
"1 1 14.0 5.9 0 0 \n",
"2 1 12.9 6.4 1 0 \n",
"\n",
" lips_thin distance_nose_to_lip_long pred_gender \n",
"0 1 1 0 \n",
"1 0 0 1 \n",
"2 0 1 0 "
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# See data with appended prediction (last column)\n",
"dval.head()"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "c2587a57",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>long_hair</th>\n",
" <th>forehead_width_cm</th>\n",
" <th>forehead_height_cm</th>\n",
" <th>nose_wide</th>\n",
" <th>nose_long</th>\n",
" <th>lips_thin</th>\n",
" <th>distance_nose_to_lip_long</th>\n",
" <th>pred_gender</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>14.5</td>\n",
" <td>6.7</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>Male</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>14.0</td>\n",
" <td>5.9</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>Female</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>12.9</td>\n",
" <td>6.4</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>Male</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" long_hair forehead_width_cm forehead_height_cm nose_wide nose_long \\\n",
"0 1 14.5 6.7 0 1 \n",
"1 1 14.0 5.9 0 0 \n",
"2 1 12.9 6.4 1 0 \n",
"\n",
" lips_thin distance_nose_to_lip_long pred_gender \n",
"0 1 1 Male \n",
"1 0 0 Female \n",
"2 0 1 Male "
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Map the integer predictions back to their text labels (0 -> Male, 1 -> Female)\n",
"dval['pred_gender'] = dval['pred_gender'].replace({0: 'Male', 1: 'Female'})\n",
"dval.head()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}