{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Titanic survival prediction: Random Forest, Decision Tree and KNN\n",
    "\n",
    "Fits three classifiers on the Titanic `train.csv` data and evaluates them on one shared, seeded train/test split.\n",
    "\n",
    "Run the notebook top to bottom on a fresh kernel (Restart Kernel, then Run All). Stored outputs were cleared because the code changed; re-run to regenerate them."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Loading the required packages (all imports live in this one cell)\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from IPython.display import Image\n",
    "from pydotplus import graph_from_dot_data\n",
    "from sklearn import neighbors\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.metrics import classification_report, confusion_matrix, accuracy_score\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.tree import DecisionTreeClassifier, export_graphviz"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Configuration: everything a reader might want to tune\n",
    "DATA_PATH = 'train.csv'\n",
    "SEED = 1  # shared by the split and every model so a re-run reproduces the same numbers\n",
    "TEST_SIZE = 0.2\n",
    "DROP_COLUMNS = ['PassengerId', 'Name', 'Ticket', 'Cabin']\n",
    "SEX_CODES = {'female': 0, 'male': 1}\n",
    "EMBARKED_CODES = {'S': 0, 'C': 1, 'Q': 2}\n",
    "N_TOP_FEATURES = 5"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Loading the data; train_raw is never modified afterwards\n",
    "train_raw = pd.read_csv(DATA_PATH)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Glimpse of the dataset\n",
    "train_raw.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Dataset Information in brief\n",
    "train_raw.info()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Clean and encode"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Feature selection: remove the columns listed in DROP_COLUMNS, then drop rows with missing values.\n",
    "# A new frame is built instead of mutating in place, so this cell can be re-run safely.\n",
    "train_clean = train_raw.drop(columns=DROP_COLUMNS).dropna()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Mapping categorical features to numbers.\n",
    "# .map() yields NaN for any value missing from the code tables, and .astype(int) then raises,\n",
    "# so an unexpected category fails loudly here rather than leaking into the models.\n",
    "train_encoded = train_clean.assign(\n",
    "    Sex=train_clean['Sex'].map(SEX_CODES).astype(int),\n",
    "    Embarked=train_clean['Embarked'].map(EMBARKED_CODES).astype(int),\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Train/test split\n",
    "\n",
    "A single seeded split is used for every model below, so their metrics are computed on the same test rows."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Train test split\n",
    "X = train_encoded.drop(columns='Survived')\n",
    "y = train_encoded['Survived']\n",
    "\n",
    "X_train, X_test, y_train, y_test = train_test_split(\n",
    "    X, y, test_size=TEST_SIZE, random_state=SEED\n",
    ")\n",
    "assert len(X_train) + len(X_test) == len(X)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Random Forest"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def print_classification_summary(y_true, y_pred):\n",
    "    \"\"\"Print accuracy, confusion matrix and classification report for one set of predictions.\n",
    "\n",
    "    Both arguments are passed explicitly so every metric is computed from the same\n",
    "    predictions; this avoids accidentally scoring a stale variable from another model.\n",
    "    \"\"\"\n",
    "    print('Accuracy : ', accuracy_score(y_true, y_pred))\n",
    "    print(confusion_matrix(y_true, y_pred))\n",
    "    print(classification_report(y_true, y_pred))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fitting the Random Forest Model\n",
    "rf_model = RandomForestClassifier(random_state=SEED)\n",
    "rf_model.fit(X_train, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Accuracy, confusion matrix and classification report\n",
    "rf_preds = rf_model.predict(X_test)\n",
    "print_classification_summary(y_test, rf_preds)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The report shows the main classification metrics precision, recall and f1-score on a per-class basis. The metrics are calculated by using true and false positives, true and false negatives. Positive and negative in this case are generic names for the predicted classes. There are four ways to check if the predictions are right or wrong:\n",
    "\n",
    "* TN / True Negative: when a case was negative and predicted negative\n",
    "* TP / True Positive: when a case was positive and predicted positive\n",
    "* FN / False Negative: when a case was positive but predicted negative\n",
    "* FP / False Positive: when a case was negative but predicted positive"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**Precision** – What percent of your positive predictions were correct?\n",
    "\n",
    "Precision is the ability of a classifier not to label an instance positive that is actually negative. For each class it is defined as the ratio of true positives to the sum of true and false positives.\n",
    "\n",
    "* `Precision = TP / (TP + FP)`"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**Recall** – What percent of the positive cases did you catch?\n",
    "\n",
    "Recall is the ability of a classifier to find all positive instances. For each class it is defined as the ratio of true positives to the sum of true positives and false negatives.\n",
    "\n",
    "* `Recall = TP / (TP + FN)`"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**F1 score** – How well does the classifier balance precision and recall?\n",
    "\n",
    "The F1 score is the harmonic mean of precision and recall such that the best score is 1.0 and the worst is 0.0. Generally speaking, F1 scores are lower than accuracy measures as they embed precision and recall into their computation. As a rule of thumb, the weighted average of F1 should be used to compare classifier models, not global accuracy.\n",
    "\n",
    "* `F1 Score = 2 * (Recall * Precision) / (Recall + Precision)`"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Feature importance"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Assessing feature importance\n",
    "feat_labels = X_train.columns\n",
    "importances = rf_model.feature_importances_\n",
    "\n",
    "indices = np.argsort(importances)[::-1]  # [::-1] reverses the array: most important first\n",
    "for rank, idx in enumerate(indices, start=1):\n",
    "    print('%2d) %-*s %f' % (rank, 30, feat_labels[idx], importances[idx]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Feature importance plot\n",
    "n_features = len(indices)\n",
    "\n",
    "fig, ax = plt.subplots()\n",
    "ax.bar(range(n_features), importances[indices], align='center')\n",
    "ax.set_title('Feature Importance')\n",
    "ax.set_ylabel('Importance')\n",
    "ax.set_xticks(range(n_features))\n",
    "ax.set_xticklabels(feat_labels[indices], rotation=90)\n",
    "ax.set_xlim([-1, n_features])\n",
    "fig.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Visualising one tree from a small forest"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# A deliberately small, shallow forest so that a single tree is readable when drawn\n",
    "forest_small = RandomForestClassifier(criterion='gini',\n",
    "                                      n_estimators=10,\n",
    "                                      max_depth=3,\n",
    "                                      random_state=SEED,\n",
    "                                      n_jobs=2)\n",
    "forest_small.fit(X_train, y_train)\n",
    "\n",
    "# Pull out one tree from the forest\n",
    "sample_tree = forest_small.estimators_[5]\n",
    "\n",
    "# Export the tree to dot data\n",
    "feature_list = list(X_train.columns)\n",
    "dot_data = export_graphviz(sample_tree, out_file=None,\n",
    "                           feature_names=feature_list,\n",
    "                           rounded=True, precision=1)\n",
    "\n",
    "# Use dot data to create a graph\n",
    "graph = graph_from_dot_data(dot_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "Image(graph.create_png())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Decision tree analysis using top 5 features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Top 5 features, taken from the Random Forest importance ranking computed above\n",
    "# (derived rather than hard-coded so it always matches the fitted model)\n",
    "top5 = list(feat_labels[indices[:N_TOP_FEATURES]])\n",
    "top5"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Separate names keep the full-feature split intact for the other models\n",
    "X_train_top5 = X_train[top5]\n",
    "X_test_top5 = X_test[top5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Decision tree classifier\n",
    "tree_model = DecisionTreeClassifier(criterion='gini',\n",
    "                                    max_depth=4,\n",
    "                                    random_state=SEED)\n",
    "\n",
    "# Fitting the model\n",
    "tree_model.fit(X_train_top5, y_train)\n",
    "\n",
    "# Prediction on test set and Classification report\n",
    "tree_preds = tree_model.predict(X_test_top5)\n",
    "print_classification_summary(y_test, tree_preds)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Analysis using KNN\n",
    "\n",
    "KNN is fitted on the same split as the models above. Caveat: KNN is distance-based and no feature scaling is applied here, so features with large numeric ranges (for example `Fare`) probably dominate the distance. TODO: confirm by comparing against a scaled variant."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fitting KNN model\n",
    "knn = neighbors.KNeighborsClassifier()\n",
    "knn.fit(X_train, y_train)\n",
    "knn_preds = knn.predict(X_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Accuracy, confusion matrix and classification report (all computed from the KNN predictions)\n",
    "print_classification_summary(y_test, knn_preds)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Conclusion\n",
    "\n",
    "Comparing the Random Forest model and the KNN model, we can see that the precision, recall and f1-score of the Random Forest model are higher than those of the KNN model. Hence we can say that the Random Forest model has an advantage over the KNN model on these predictions.\n",
    "\n",
    "In the previously recorded run the weighted-average F1 was 0.81 for the Random Forest, 0.82 for the depth-4 decision tree on the top five features, and 0.68 for KNN. Those figures came from different, unseeded splits (and the KNN accuracy line was computed from the wrong predictions), so re-run the notebook to obtain comparable numbers on the shared split."
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}