{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "___\n", "\n", "\n", "___\n", "
Copyright by Pierian Data Inc.
\n", "
For more information, visit us at www.pieriandata.com
" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Model Persistence" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Imports" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "from sklearn.ensemble import RandomForestRegressor" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Data" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "df = pd.read_csv('../DATA/Advertising.csv')" ] }, { "cell_type": "code", "execution_count": 172, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
TVradionewspapersales
0230.137.869.222.1
144.539.345.110.4
217.245.969.39.3
3151.541.358.518.5
4180.810.858.412.9
...............
19538.23.713.87.6
19694.24.98.19.7
197177.09.36.412.8
198283.642.066.225.5
199232.18.68.713.4
\n", "

200 rows × 4 columns

\n", "
" ], "text/plain": [ " TV radio newspaper sales\n", "0 230.1 37.8 69.2 22.1\n", "1 44.5 39.3 45.1 10.4\n", "2 17.2 45.9 69.3 9.3\n", "3 151.5 41.3 58.5 18.5\n", "4 180.8 10.8 58.4 12.9\n", ".. ... ... ... ...\n", "195 38.2 3.7 13.8 7.6\n", "196 94.2 4.9 8.1 9.7\n", "197 177.0 9.3 6.4 12.8\n", "198 283.6 42.0 66.2 25.5\n", "199 232.1 8.6 8.7 13.4\n", "\n", "[200 rows x 4 columns]" ] }, "execution_count": 172, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "code", "execution_count": 173, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
TVradionewspapersales
count200.000000200.000000200.000000200.000000
mean147.04250023.26400030.55400014.022500
std85.85423614.84680921.7786215.217457
min0.7000000.0000000.3000001.600000
25%74.3750009.97500012.75000010.375000
50%149.75000022.90000025.75000012.900000
75%218.82500036.52500045.10000017.400000
max296.40000049.600000114.00000027.000000
\n", "
" ], "text/plain": [ " TV radio newspaper sales\n", "count 200.000000 200.000000 200.000000 200.000000\n", "mean 147.042500 23.264000 30.554000 14.022500\n", "std 85.854236 14.846809 21.778621 5.217457\n", "min 0.700000 0.000000 0.300000 1.600000\n", "25% 74.375000 9.975000 12.750000 10.375000\n", "50% 149.750000 22.900000 25.750000 12.900000\n", "75% 218.825000 36.525000 45.100000 17.400000\n", "max 296.400000 49.600000 114.000000 27.000000" ] }, "execution_count": 173, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.describe()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Data Preparation" ] }, { "cell_type": "code", "execution_count": 68, "metadata": {}, "outputs": [], "source": [ "X = df.drop('sales',axis=1)\n", "y = df['sales']" ] }, { "cell_type": "code", "execution_count": 69, "metadata": {}, "outputs": [], "source": [ "from sklearn.model_selection import train_test_split" ] }, { "cell_type": "code", "execution_count": 70, "metadata": {}, "outputs": [], "source": [ "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=101)" ] }, { "cell_type": "code", "execution_count": 71, "metadata": {}, "outputs": [], "source": [ "# HOLD OUT SET" ] }, { "cell_type": "code", "execution_count": 72, "metadata": {}, "outputs": [], "source": [ "# Further split 30% of test into validation and hold-out (15% and 15% each)\n", "X_validation, X_holdout_test, y_validation, y_holdout_test = train_test_split(X_test, y_test, test_size=0.5, random_state=101)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Model Training" ] }, { "cell_type": "code", "execution_count": 158, "metadata": {}, "outputs": [], "source": [ "model = RandomForestRegressor(n_estimators=10,random_state=101)" ] }, { "cell_type": "code", "execution_count": 159, "metadata": {}, "outputs": [ { "data": { "text/plain": [ 
"RandomForestRegressor(n_estimators=10, random_state=101)" ] }, "execution_count": 159, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model.fit(X_train,y_train)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Model Evaluation" ] }, { "cell_type": "code", "execution_count": 160, "metadata": {}, "outputs": [], "source": [ "validation_predictions = model.predict(X_validation)" ] }, { "cell_type": "code", "execution_count": 161, "metadata": {}, "outputs": [], "source": [ "from sklearn.metrics import mean_absolute_error,mean_squared_error" ] }, { "cell_type": "code", "execution_count": 162, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.6636666666666673" ] }, "execution_count": 162, "metadata": {}, "output_type": "execute_result" } ], "source": [ "mean_absolute_error(y_validation,validation_predictions)" ] }, { "cell_type": "code", "execution_count": 163, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.7831368547918899" ] }, "execution_count": 163, "metadata": {}, "output_type": "execute_result" } ], "source": [ "mean_squared_error(y_validation,validation_predictions)**0.5 #RMSE" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Hyperparameter Tuning" ] }, { "cell_type": "code", "execution_count": 164, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "RandomForestRegressor(n_estimators=35, random_state=101)" ] }, "execution_count": 164, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model = RandomForestRegressor(n_estimators=35,random_state=101)\n", "model.fit(X_train,y_train)" ] }, { "cell_type": "code", "execution_count": 165, "metadata": {}, "outputs": [], "source": [ "validation_predictions = model.predict(X_validation)" ] }, { "cell_type": "code", "execution_count": 166, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.6759047619047621" ] }, "execution_count": 166, "metadata": {}, "output_type": "execute_result" } ], "source": [ 
"mean_absolute_error(y_validation,validation_predictions)" ] }, { "cell_type": "code", "execution_count": 167, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.8585352183157281" ] }, "execution_count": 167, "metadata": {}, "output_type": "execute_result" } ], "source": [ "mean_squared_error(y_validation,validation_predictions)**0.5 #RMSE" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Final Hold Out Test Performance for Reporting" ] }, { "cell_type": "code", "execution_count": 168, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "RandomForestRegressor(n_estimators=35, random_state=101)" ] }, "execution_count": 168, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model = RandomForestRegressor(n_estimators=35,random_state=101)\n", "model.fit(X_train,y_train)" ] }, { "cell_type": "code", "execution_count": 169, "metadata": {}, "outputs": [], "source": [ "test_predictions = model.predict(X_holdout_test)" ] }, { "cell_type": "code", "execution_count": 170, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.5817142857142852" ] }, "execution_count": 170, "metadata": {}, "output_type": "execute_result" } ], "source": [ "mean_absolute_error(y_holdout_test,test_predictions)" ] }, { "cell_type": "code", "execution_count": 171, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.730550812603694" ] }, "execution_count": 171, "metadata": {}, "output_type": "execute_result" } ], "source": [ "mean_squared_error(y_holdout_test,test_predictions)**0.5" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Final Training on the Full Dataset" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [], "source": [ "final_model = RandomForestRegressor(n_estimators=35,random_state=101)" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "RandomForestRegressor()" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": 
[ "final_model.fit(X,y)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Saving the Model (and Any Other Python Object) as a Pickle File" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [], "source": [ "import joblib" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['final_model.pkl']" ] }, "execution_count": 35, "metadata": {}, "output_type": "execute_result" } ], "source": [ "joblib.dump(final_model,'final_model.pkl')" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['TV', 'radio', 'newspaper'], dtype='object')" ] }, "execution_count": 36, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X.columns" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['TV', 'radio', 'newspaper']" ] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ "list(X.columns)" ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['column_names.pkl']" ] }, "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ "joblib.dump(list(X.columns),'column_names.pkl')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Loading Model (Model Persistence)" ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [], "source": [ "col_names = joblib.load('column_names.pkl')" ] }, { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['TV', 'radio', 'newspaper']" ] }, "execution_count": 41, "metadata": {}, "output_type": "execute_result" } ], "source": [ "col_names" ] }, { "cell_type": "code", "execution_count": 42, "metadata": {}, "outputs": [], "source": [ "loaded_model = joblib.load('final_model.pkl')" ] }, { "cell_type": "code", "execution_count": 43, "metadata": {}, 
"outputs": [ { "data": { "text/plain": [ "array([21.998])" ] }, "execution_count": 43, "metadata": {}, "output_type": "execute_result" } ], "source": [ "loaded_model.predict([[230.1,37.8,69.2]])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "-----" ] } ], "metadata": { "anaconda-cloud": {}, "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.5" } }, "nbformat": 4, "nbformat_minor": 1 }