## Import Libraries

In [1]:
import pandas as pd
import numpy as np
import os
import sklearn
from sklearn.model_selection import train_test_split

## Dataset
For our dataset, you can find it [here.](https://www.kaggle.com/datasets/elakiricoder/gender-classification-dataset)

In [2]:
# Load dataset
df = pd.read_csv(r'D:\archive\gender_classification_v7.csv', encoding='utf-8')
df.head()

Unnamed: 0,long_hair,forehead_width_cm,forehead_height_cm,nose_wide,nose_long,lips_thin,distance_nose_to_lip_long,gender
0,1,11.8,6.1,1,0,1,1,Male
1,0,14.0,5.4,0,0,1,0,Female
2,0,11.8,6.3,1,1,1,1,Male
3,0,14.4,6.1,0,1,1,1,Male
4,1,13.5,5.9,0,0,0,0,Female


## Data Pre-processing
For this example I skipped the Descriptive Statistics, and went to minor adjustments.

In [3]:
# Check Data types of dataframe columns
df.dtypes

long_hair                      int64
forehead_width_cm            float64
forehead_height_cm           float64
nose_wide                      int64
nose_long                      int64
lips_thin                      int64
distance_nose_to_lip_long      int64
gender                        object
dtype: object

In [4]:
# Convert Gender labels into integer values, for classification
df['gender']=df['gender'].replace('Male',0)
df['gender']=df['gender'].replace('Female',1)
df.head()

Unnamed: 0,long_hair,forehead_width_cm,forehead_height_cm,nose_wide,nose_long,lips_thin,distance_nose_to_lip_long,gender
0,1,11.8,6.1,1,0,1,1,0
1,0,14.0,5.4,0,0,1,0,1
2,0,11.8,6.3,1,1,1,1,0
3,0,14.4,6.1,0,1,1,1,0
4,1,13.5,5.9,0,0,0,0,1


In [5]:
# Now all is numeric data
df.dtypes

long_hair                      int64
forehead_width_cm            float64
forehead_height_cm           float64
nose_wide                      int64
nose_long                      int64
lips_thin                      int64
distance_nose_to_lip_long      int64
gender                         int64
dtype: object

In [7]:
# Split dataset into X (Features) and y (Labels)

# X is ALL columns except the last column (usually the label to be predicted)
X = df.iloc[:,:-1]
# y is the LABEL column (to be predicted)
y = df.iloc[:,-1]

In [8]:
# Use sklearn's train_test_split function imported before
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=0)

## Using 4 Classifiers
It is sugggested to take a deeper look of the parameters provided in documentations below, for better tweaking of the classifiers.
- [Logistic Regression](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html)
- [Decision Tree](https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html)
- [SVM](https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html)
- [K-Means / KNN](https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html)

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

In [10]:
# Logistic Regression
# Train the model
LogR = LogisticRegression(random_state=0, multi_class='ovr').fit(X_train, y_train)
# Predict the test set
LogR_pred = LogR.predict(X_test)

# Decision Tree
dtree = DecisionTreeClassifier(max_depth = 2).fit(X_train, y_train)
dtree_pred = dtree.predict(X_test)

# SVM
svm = SVC(kernel='linear',C=1).fit(X_train, y_train)
svm_pred = svm.predict(X_test)

# K-Means
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train,y_train)
knn_pred = knn.predict(X_test)

# See Accuracy of each classifier
print("Logistic Regression Accuracy: "+ str(LogR.score(X,y)))
print("Decision Tree Accuracy: "+ str(dtree.score(X,y)))
print("SVM Accuracy: "+ str(svm.score(X,y)))
print("K-Means Accuracy: "+ str(knn.score(X,y)))

Logistic Regression Accuracy: 0.9682063587282543
Decision Tree Accuracy: 0.8792241551689662
SVM Accuracy: 0.967006598680264
K-Means Accuracy: 0.9754049190161967


## Try on a new dataset
Use one (or many) model above as predictor in a new dataset. Assuming we have the same columns but different values, we get..

In [11]:
# For this example we use 3 rows of data to be predicted
dval = pd.read_csv(r'D:\archive\valgend.csv', encoding='utf-8')
dval.head()

Unnamed: 0,long_hair,forehead_width_cm,forehead_height_cm,nose_wide,nose_long,lips_thin,distance_nose_to_lip_long
0,1,14.5,6.7,0,1,1,1
1,1,14.0,5.9,0,0,0,0
2,1,12.9,6.4,1,0,0,1


In [12]:
# predict with knn (change to which model you choose)
knn_pred_new = knn.predict(dval)

In [13]:
# See the result
knn_pred_new

array([0, 1, 0], dtype=int64)

In [14]:
# Add new column in new dataframe for placing the results, pass the "result" from before
dval["pred_gender"]=knn_pred_new

In [15]:
# See data with appended prediction (last column)
dval.head()

Unnamed: 0,long_hair,forehead_width_cm,forehead_height_cm,nose_wide,nose_long,lips_thin,distance_nose_to_lip_long,pred_gender
0,1,14.5,6.7,0,1,1,1,0
1,1,14.0,5.9,0,0,0,0,1
2,1,12.9,6.4,1,0,0,1,0


In [16]:
# Converting back to labels
dval['pred_gender']=dval['pred_gender'].replace(0,'Male')
dval['pred_gender']=dval['pred_gender'].replace(1,'Female')
dval.head()

Unnamed: 0,long_hair,forehead_width_cm,forehead_height_cm,nose_wide,nose_long,lips_thin,distance_nose_to_lip_long,pred_gender
0,1,14.5,6.7,0,1,1,1,Male
1,1,14.0,5.9,0,0,0,0,Female
2,1,12.9,6.4,1,0,0,1,Male
