You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

854 lines
18 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"___\n",
"\n",
"<a href='http://www.pieriandata.com'><img src='../Pierian_Data_Logo.png'/></a>\n",
"___\n",
"<center><em>Copyright by Pierian Data Inc.</em></center>\n",
"<center><em>For more information, visit us at <a href='http://www.pieriandata.com'>www.pieriandata.com</a></em></center>"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Model Persistence"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Imports"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"from sklearn.ensemble import RandomForestRegressor"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Data"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"df = pd.read_csv('../DATA/Advertising.csv')"
]
},
{
"cell_type": "code",
"execution_count": 172,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>TV</th>\n",
" <th>radio</th>\n",
" <th>newspaper</th>\n",
" <th>sales</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>230.1</td>\n",
" <td>37.8</td>\n",
" <td>69.2</td>\n",
" <td>22.1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>44.5</td>\n",
" <td>39.3</td>\n",
" <td>45.1</td>\n",
" <td>10.4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>17.2</td>\n",
" <td>45.9</td>\n",
" <td>69.3</td>\n",
" <td>9.3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>151.5</td>\n",
" <td>41.3</td>\n",
" <td>58.5</td>\n",
" <td>18.5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>180.8</td>\n",
" <td>10.8</td>\n",
" <td>58.4</td>\n",
" <td>12.9</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>195</th>\n",
" <td>38.2</td>\n",
" <td>3.7</td>\n",
" <td>13.8</td>\n",
" <td>7.6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>196</th>\n",
" <td>94.2</td>\n",
" <td>4.9</td>\n",
" <td>8.1</td>\n",
" <td>9.7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>197</th>\n",
" <td>177.0</td>\n",
" <td>9.3</td>\n",
" <td>6.4</td>\n",
" <td>12.8</td>\n",
" </tr>\n",
" <tr>\n",
" <th>198</th>\n",
" <td>283.6</td>\n",
" <td>42.0</td>\n",
" <td>66.2</td>\n",
" <td>25.5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>199</th>\n",
" <td>232.1</td>\n",
" <td>8.6</td>\n",
" <td>8.7</td>\n",
" <td>13.4</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>200 rows × 4 columns</p>\n",
"</div>"
],
"text/plain": [
" TV radio newspaper sales\n",
"0 230.1 37.8 69.2 22.1\n",
"1 44.5 39.3 45.1 10.4\n",
"2 17.2 45.9 69.3 9.3\n",
"3 151.5 41.3 58.5 18.5\n",
"4 180.8 10.8 58.4 12.9\n",
".. ... ... ... ...\n",
"195 38.2 3.7 13.8 7.6\n",
"196 94.2 4.9 8.1 9.7\n",
"197 177.0 9.3 6.4 12.8\n",
"198 283.6 42.0 66.2 25.5\n",
"199 232.1 8.6 8.7 13.4\n",
"\n",
"[200 rows x 4 columns]"
]
},
"execution_count": 172,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df"
]
},
{
"cell_type": "code",
"execution_count": 173,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>TV</th>\n",
" <th>radio</th>\n",
" <th>newspaper</th>\n",
" <th>sales</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td>200.000000</td>\n",
" <td>200.000000</td>\n",
" <td>200.000000</td>\n",
" <td>200.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td>147.042500</td>\n",
" <td>23.264000</td>\n",
" <td>30.554000</td>\n",
" <td>14.022500</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td>85.854236</td>\n",
" <td>14.846809</td>\n",
" <td>21.778621</td>\n",
" <td>5.217457</td>\n",
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
" <td>0.700000</td>\n",
" <td>0.000000</td>\n",
" <td>0.300000</td>\n",
" <td>1.600000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25%</th>\n",
" <td>74.375000</td>\n",
" <td>9.975000</td>\n",
" <td>12.750000</td>\n",
" <td>10.375000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50%</th>\n",
" <td>149.750000</td>\n",
" <td>22.900000</td>\n",
" <td>25.750000</td>\n",
" <td>12.900000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75%</th>\n",
" <td>218.825000</td>\n",
" <td>36.525000</td>\n",
" <td>45.100000</td>\n",
" <td>17.400000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>max</th>\n",
" <td>296.400000</td>\n",
" <td>49.600000</td>\n",
" <td>114.000000</td>\n",
" <td>27.000000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" TV radio newspaper sales\n",
"count 200.000000 200.000000 200.000000 200.000000\n",
"mean 147.042500 23.264000 30.554000 14.022500\n",
"std 85.854236 14.846809 21.778621 5.217457\n",
"min 0.700000 0.000000 0.300000 1.600000\n",
"25% 74.375000 9.975000 12.750000 10.375000\n",
"50% 149.750000 22.900000 25.750000 12.900000\n",
"75% 218.825000 36.525000 45.100000 17.400000\n",
"max 296.400000 49.600000 114.000000 27.000000"
]
},
"execution_count": 173,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.describe()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Data Preparation"
]
},
{
"cell_type": "code",
"execution_count": 68,
"metadata": {},
"outputs": [],
"source": [
"# Features: every column except the target. Target: the 'sales' column.\n",
"X = df.drop(columns='sales')\n",
"y = df['sales']"
]
},
{
"cell_type": "code",
"execution_count": 69,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.model_selection import train_test_split"
]
},
{
"cell_type": "code",
"execution_count": 70,
"metadata": {},
"outputs": [],
"source": [
"# Keep 70% of the rows for training; the remaining 30% is split again below.\n",
"X_train, X_test, y_train, y_test = train_test_split(\n",
"    X,\n",
"    y,\n",
"    test_size=0.30,\n",
"    random_state=101,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 71,
"metadata": {},
"outputs": [],
"source": [
"# HOLD OUT SET"
]
},
{
"cell_type": "code",
"execution_count": 72,
"metadata": {},
"outputs": [],
"source": [
"# Halve the 30% test portion: one half (15% of all rows) is used for validation,\n",
"# the other half (15%) is held out for the final performance report.\n",
"X_validation, X_holdout_test, y_validation, y_holdout_test = train_test_split(\n",
"    X_test, y_test, test_size=0.5, random_state=101\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Model Training"
]
},
{
"cell_type": "code",
"execution_count": 158,
"metadata": {},
"outputs": [],
"source": [
"model = RandomForestRegressor(n_estimators=10,random_state=101)"
]
},
{
"cell_type": "code",
"execution_count": 159,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"RandomForestRegressor(n_estimators=10, random_state=101)"
]
},
"execution_count": 159,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"model.fit(X_train,y_train)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Model Evaluation"
]
},
{
"cell_type": "code",
"execution_count": 160,
"metadata": {},
"outputs": [],
"source": [
"validation_predictions = model.predict(X_validation)"
]
},
{
"cell_type": "code",
"execution_count": 161,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.metrics import mean_absolute_error,mean_squared_error"
]
},
{
"cell_type": "code",
"execution_count": 162,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.6636666666666673"
]
},
"execution_count": 162,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"mean_absolute_error(y_validation,validation_predictions)"
]
},
{
"cell_type": "code",
"execution_count": 163,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.7831368547918899"
]
},
"execution_count": 163,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"mean_squared_error(y_validation,validation_predictions)**0.5 #RMSE"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Hyperparameter Tuning"
]
},
{
"cell_type": "code",
"execution_count": 164,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"RandomForestRegressor(n_estimators=35, random_state=101)"
]
},
"execution_count": 164,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Retrain with more trees (35 instead of 10) to compare validation error.\n",
"model = RandomForestRegressor(n_estimators=35, random_state=101)\n",
"model.fit(X_train, y_train)"
]
},
{
"cell_type": "code",
"execution_count": 165,
"metadata": {},
"outputs": [],
"source": [
"validation_predictions = model.predict(X_validation)"
]
},
{
"cell_type": "code",
"execution_count": 166,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.6759047619047621"
]
},
"execution_count": 166,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"mean_absolute_error(y_validation,validation_predictions)"
]
},
{
"cell_type": "code",
"execution_count": 167,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.8585352183157281"
]
},
"execution_count": 167,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"mean_squared_error(y_validation,validation_predictions)**0.5 #RMSE"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Final Hold Out Test Performance for Reporting"
]
},
{
"cell_type": "code",
"execution_count": 168,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"RandomForestRegressor(n_estimators=35, random_state=101)"
]
},
"execution_count": 168,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Refit the chosen configuration on the training split before scoring the hold-out set.\n",
"model = RandomForestRegressor(n_estimators=35, random_state=101)\n",
"model.fit(X_train, y_train)"
]
},
{
"cell_type": "code",
"execution_count": 169,
"metadata": {},
"outputs": [],
"source": [
"test_predictions = model.predict(X_holdout_test)"
]
},
{
"cell_type": "code",
"execution_count": 170,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.5817142857142852"
]
},
"execution_count": 170,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"mean_absolute_error(y_holdout_test,test_predictions)"
]
},
{
"cell_type": "code",
"execution_count": 171,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.730550812603694"
]
},
"execution_count": 171,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Square root of the mean squared error on the hold-out set.\n",
"mean_squared_error(y_holdout_test, test_predictions) ** 0.5  # RMSE"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Full Training"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"# Same configuration that was scored on the hold-out set above.\n",
"final_model = RandomForestRegressor(\n",
"    n_estimators=35,\n",
"    random_state=101,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"RandomForestRegressor()"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Fit on the full dataset (X, y), not just the training split.\n",
"final_model.fit(X, y)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Saving the Model (and Anything Else) as a Pickle File"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [],
"source": [
"import joblib"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['final_model.pkl']"
]
},
"execution_count": 35,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Serialize the fitted model to disk; the displayed value lists the file written.\n",
"joblib.dump(final_model, 'final_model.pkl')"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Index(['TV', 'radio', 'newspaper'], dtype='object')"
]
},
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X.columns"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['TV', 'radio', 'newspaper']"
]
},
"execution_count": 37,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"list(X.columns)"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['column_names.pkl']"
]
},
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Save the feature column order next to the model so inputs can be rebuilt later.\n",
"joblib.dump(list(X.columns), 'column_names.pkl')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Loading Model (Model Persistence)"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [],
"source": [
"col_names = joblib.load('column_names.pkl')"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['TV', 'radio', 'newspaper']"
]
},
"execution_count": 41,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"col_names"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [],
"source": [
"loaded_model = joblib.load('final_model.pkl')"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([21.998])"
]
},
"execution_count": 43,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Build a one-row frame with the saved column names so each value is tied to\n",
"# the feature it belongs to (TV, radio, newspaper) instead of relying on the\n",
"# position in a bare list. The model was fitted on a DataFrame with these columns.\n",
"# The values are the first row of the dataset, whose recorded sales is 22.1.\n",
"new_campaign = pd.DataFrame([[230.1, 37.8, 69.2]], columns=col_names)\n",
"loaded_model.predict(new_campaign)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"-----"
]
}
],
"metadata": {
"anaconda-cloud": {},
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.5"
}
},
"nbformat": 4,
"nbformat_minor": 1
}