{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"___\n",
"\n",
"<a href='http://www.pieriandata.com'><img src='../Pierian_Data_Logo.png'/></a>\n",
"___\n",
"<center><em>Copyright by Pierian Data Inc.</em></center>\n",
"<center><em>For more information, visit us at <a href='http://www.pieriandata.com'>www.pieriandata.com</a></em></center>"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Model Persistence"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Imports"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"from sklearn.ensemble import RandomForestRegressor"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Data"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"df = pd.read_csv('../DATA/Advertising.csv')"
]
},
{
"cell_type": "code",
"execution_count": 172,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>TV</th>\n",
" <th>radio</th>\n",
" <th>newspaper</th>\n",
" <th>sales</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>230.1</td>\n",
" <td>37.8</td>\n",
" <td>69.2</td>\n",
" <td>22.1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>44.5</td>\n",
" <td>39.3</td>\n",
" <td>45.1</td>\n",
" <td>10.4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>17.2</td>\n",
" <td>45.9</td>\n",
" <td>69.3</td>\n",
" <td>9.3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>151.5</td>\n",
" <td>41.3</td>\n",
" <td>58.5</td>\n",
" <td>18.5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>180.8</td>\n",
" <td>10.8</td>\n",
" <td>58.4</td>\n",
" <td>12.9</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>195</th>\n",
" <td>38.2</td>\n",
" <td>3.7</td>\n",
" <td>13.8</td>\n",
" <td>7.6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>196</th>\n",
" <td>94.2</td>\n",
" <td>4.9</td>\n",
" <td>8.1</td>\n",
" <td>9.7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>197</th>\n",
" <td>177.0</td>\n",
" <td>9.3</td>\n",
" <td>6.4</td>\n",
" <td>12.8</td>\n",
" </tr>\n",
" <tr>\n",
" <th>198</th>\n",
" <td>283.6</td>\n",
" <td>42.0</td>\n",
" <td>66.2</td>\n",
" <td>25.5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>199</th>\n",
" <td>232.1</td>\n",
" <td>8.6</td>\n",
" <td>8.7</td>\n",
" <td>13.4</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>200 rows × 4 columns</p>\n",
"</div>"
],
"text/plain": [
" TV radio newspaper sales\n",
"0 230.1 37.8 69.2 22.1\n",
"1 44.5 39.3 45.1 10.4\n",
"2 17.2 45.9 69.3 9.3\n",
"3 151.5 41.3 58.5 18.5\n",
"4 180.8 10.8 58.4 12.9\n",
".. ... ... ... ...\n",
"195 38.2 3.7 13.8 7.6\n",
"196 94.2 4.9 8.1 9.7\n",
"197 177.0 9.3 6.4 12.8\n",
"198 283.6 42.0 66.2 25.5\n",
"199 232.1 8.6 8.7 13.4\n",
"\n",
"[200 rows x 4 columns]"
]
},
"execution_count": 172,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df"
]
},
{
"cell_type": "code",
"execution_count": 173,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>TV</th>\n",
" <th>radio</th>\n",
" <th>newspaper</th>\n",
" <th>sales</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td>200.000000</td>\n",
" <td>200.000000</td>\n",
" <td>200.000000</td>\n",
" <td>200.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td>147.042500</td>\n",
" <td>23.264000</td>\n",
" <td>30.554000</td>\n",
" <td>14.022500</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td>85.854236</td>\n",
" <td>14.846809</td>\n",
" <td>21.778621</td>\n",
" <td>5.217457</td>\n",
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
" <td>0.700000</td>\n",
" <td>0.000000</td>\n",
" <td>0.300000</td>\n",
" <td>1.600000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25%</th>\n",
" <td>74.375000</td>\n",
" <td>9.975000</td>\n",
" <td>12.750000</td>\n",
" <td>10.375000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50%</th>\n",
" <td>149.750000</td>\n",
" <td>22.900000</td>\n",
" <td>25.750000</td>\n",
" <td>12.900000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75%</th>\n",
" <td>218.825000</td>\n",
" <td>36.525000</td>\n",
" <td>45.100000</td>\n",
" <td>17.400000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>max</th>\n",
" <td>296.400000</td>\n",
" <td>49.600000</td>\n",
" <td>114.000000</td>\n",
" <td>27.000000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" TV radio newspaper sales\n",
"count 200.000000 200.000000 200.000000 200.000000\n",
"mean 147.042500 23.264000 30.554000 14.022500\n",
"std 85.854236 14.846809 21.778621 5.217457\n",
"min 0.700000 0.000000 0.300000 1.600000\n",
"25% 74.375000 9.975000 12.750000 10.375000\n",
"50% 149.750000 22.900000 25.750000 12.900000\n",
"75% 218.825000 36.525000 45.100000 17.400000\n",
"max 296.400000 49.600000 114.000000 27.000000"
]
},
"execution_count": 173,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.describe()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Data Preparation"
]
},
{
"cell_type": "code",
"execution_count": 68,
"metadata": {},
"outputs": [],
"source": [
"# Features are every column except the target; 'sales' is the value being predicted\n",
"X = df.drop(columns='sales')\n",
"y = df['sales']"
]
},
{
"cell_type": "code",
"execution_count": 69,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.model_selection import train_test_split"
]
},
{
"cell_type": "code",
"execution_count": 70,
"metadata": {},
"outputs": [],
"source": [
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=101)"
]
},
{
"cell_type": "code",
"execution_count": 71,
"metadata": {},
"outputs": [],
"source": [
"# HOLD OUT SET"
]
},
{
"cell_type": "code",
"execution_count": 72,
"metadata": {},
"outputs": [],
"source": [
"# Split the 30% test portion in half: 15% of the data for validation and 15% for the final hold-out test\n",
"X_validation, X_holdout_test, y_validation, y_holdout_test = train_test_split(X_test, y_test, test_size=0.5, random_state=101)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Model Training"
]
},
{
"cell_type": "code",
"execution_count": 158,
"metadata": {},
"outputs": [],
"source": [
"model = RandomForestRegressor(n_estimators=10,random_state=101)"
]
},
{
"cell_type": "code",
"execution_count": 159,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"RandomForestRegressor(n_estimators=10, random_state=101)"
]
},
"execution_count": 159,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"model.fit(X_train,y_train)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Model Evaluation"
]
},
{
"cell_type": "code",
"execution_count": 160,
"metadata": {},
"outputs": [],
"source": [
"validation_predictions = model.predict(X_validation)"
]
},
{
"cell_type": "code",
"execution_count": 161,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.metrics import mean_absolute_error,mean_squared_error"
]
},
{
"cell_type": "code",
"execution_count": 162,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.6636666666666673"
]
},
"execution_count": 162,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"mean_absolute_error(y_validation,validation_predictions)"
]
},
{
"cell_type": "code",
"execution_count": 163,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.7831368547918899"
]
},
"execution_count": 163,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"mean_squared_error(y_validation,validation_predictions)**0.5 #RMSE"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Hyperparameter Tuning"
]
},
{
"cell_type": "code",
"execution_count": 164,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"RandomForestRegressor(n_estimators=35, random_state=101)"
]
},
"execution_count": 164,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"model = RandomForestRegressor(n_estimators=35,random_state=101)\n",
"model.fit(X_train,y_train)"
]
},
{
"cell_type": "code",
"execution_count": 165,
"metadata": {},
"outputs": [],
"source": [
"validation_predictions = model.predict(X_validation)"
]
},
{
"cell_type": "code",
"execution_count": 166,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.6759047619047621"
]
},
"execution_count": 166,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"mean_absolute_error(y_validation,validation_predictions)"
]
},
{
"cell_type": "code",
"execution_count": 167,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.8585352183157281"
]
},
"execution_count": 167,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"mean_squared_error(y_validation,validation_predictions)**0.5 #RMSE"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Final Hold Out Test Performance for Reporting"
]
},
{
"cell_type": "code",
"execution_count": 168,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"RandomForestRegressor(n_estimators=35, random_state=101)"
]
},
"execution_count": 168,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"model = RandomForestRegressor(n_estimators=35,random_state=101)\n",
"model.fit(X_train,y_train)"
]
},
{
"cell_type": "code",
"execution_count": 169,
"metadata": {},
"outputs": [],
"source": [
"test_predictions = model.predict(X_holdout_test)"
]
},
{
"cell_type": "code",
"execution_count": 170,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.5817142857142852"
]
},
"execution_count": 170,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"mean_absolute_error(y_holdout_test,test_predictions)"
]
},
{
"cell_type": "code",
"execution_count": 171,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.730550812603694"
]
},
"execution_count": 171,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Root mean squared error on the hold-out set\n",
"mean_squared_error(y_holdout_test,test_predictions)**0.5 #RMSE"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Full Training"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"final_model = RandomForestRegressor(n_estimators=35,random_state=101)"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"RandomForestRegressor()"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"final_model.fit(X,y)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Saving the Model (and Any Other Python Object) as a Pickle File"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [],
"source": [
"import joblib"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['final_model.pkl']"
]
},
"execution_count": 35,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"joblib.dump(final_model,'final_model.pkl')"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Index(['TV', 'radio', 'newspaper'], dtype='object')"
]
},
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X.columns"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['TV', 'radio', 'newspaper']"
]
},
"execution_count": 37,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"list(X.columns)"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['column_names.pkl']"
]
},
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"joblib.dump(list(X.columns),'column_names.pkl')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Loading Model (Model Persistence)"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [],
"source": [
"col_names = joblib.load('column_names.pkl')"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['TV', 'radio', 'newspaper']"
]
},
"execution_count": 41,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"col_names"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [],
"source": [
"loaded_model = joblib.load('final_model.pkl')"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([21.998])"
]
},
"execution_count": 43,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Rebuild the input with the saved column names so it matches the DataFrame layout the model was fitted on\n",
"new_campaign = pd.DataFrame([[230.1,37.8,69.2]],columns=col_names)\n",
"loaded_model.predict(new_campaign)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"-----"
]
}
],
"metadata": {
"anaconda-cloud": {},
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.5"
}
},
"nbformat": 4,
"nbformat_minor": 1
}