You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

18 KiB

<html> <head> </head>

___

Copyright by Pierian Data Inc. For more information, visit us at www.pieriandata.com

Model Persistence

Imports

In [7]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor

Data

In [8]:
# Load the advertising dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('../DATA/Advertising.csv')
In [172]:
df
Out[172]:
TV radio newspaper sales
0 230.1 37.8 69.2 22.1
1 44.5 39.3 45.1 10.4
2 17.2 45.9 69.3 9.3
3 151.5 41.3 58.5 18.5
4 180.8 10.8 58.4 12.9
... ... ... ... ...
195 38.2 3.7 13.8 7.6
196 94.2 4.9 8.1 9.7
197 177.0 9.3 6.4 12.8
198 283.6 42.0 66.2 25.5
199 232.1 8.6 8.7 13.4

200 rows × 4 columns

In [173]:
df.describe()
Out[173]:
TV radio newspaper sales
count 200.000000 200.000000 200.000000 200.000000
mean 147.042500 23.264000 30.554000 14.022500
std 85.854236 14.846809 21.778621 5.217457
min 0.700000 0.000000 0.300000 1.600000
25% 74.375000 9.975000 12.750000 10.375000
50% 149.750000 22.900000 25.750000 12.900000
75% 218.825000 36.525000 45.100000 17.400000
max 296.400000 49.600000 114.000000 27.000000
In [ ]:

Data Preparation

In [68]:
# Target is the 'sales' column; every other column is used as a feature.
y = df['sales']
X = df.drop(columns='sales')
In [69]:
from sklearn.model_selection import train_test_split
In [70]:
# Keep 70% of the rows for training and set 30% aside; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=101,
)
In [71]:
# HOLD-OUT SET: the next cell splits the test portion into validation and final hold-out sets.
In [72]:
# Split the 30% test portion in half: one half for validation (tuning) and one half
# as a final hold-out set, i.e. roughly 15% of the full data each.
X_validation, X_holdout_test, y_validation, y_holdout_test = train_test_split(
    X_test,
    y_test,
    test_size=0.5,
    random_state=101,
)

Model Training

In [158]:
# Baseline random forest with 10 trees and a fixed seed.
model = RandomForestRegressor(
    n_estimators=10,
    random_state=101,
)
In [159]:
# Train the baseline on the training split only.
model.fit(X_train, y_train)
Out[159]:
RandomForestRegressor(n_estimators=10, random_state=101)

Model Evaluation

In [160]:
# Predict on the validation split so the model can be scored on rows it was not fitted on.
validation_predictions = model.predict(X_validation)
In [161]:
from sklearn.metrics import mean_absolute_error,mean_squared_error
In [162]:
# Mean absolute error on the validation split.
mean_absolute_error(y_true=y_validation, y_pred=validation_predictions)
Out[162]:
0.6636666666666673
In [163]:
# Root mean squared error on the validation split (square root of the MSE).
mean_squared_error(y_true=y_validation, y_pred=validation_predictions) ** 0.5  # RMSE
Out[163]:
0.7831368547918899

Hyperparameter Tuning

In [164]:
# Try a larger forest (35 trees) with the same seed, re-fitted on the same training split.
model = RandomForestRegressor(
    n_estimators=35,
    random_state=101,
)
model.fit(X_train, y_train)
Out[164]:
RandomForestRegressor(n_estimators=35, random_state=101)
In [165]:
# Re-predict on the validation split with the re-fitted model (overwrites the earlier predictions).
validation_predictions = model.predict(X_validation)
In [166]:
# Validation MAE for the 35-tree model.
mean_absolute_error(y_true=y_validation, y_pred=validation_predictions)
Out[166]:
0.6759047619047621
In [167]:
# Validation RMSE for the 35-tree model (square root of the MSE).
mean_squared_error(y_true=y_validation, y_pred=validation_predictions) ** 0.5  # RMSE
Out[167]:
0.8585352183157281

Final Hold Out Test Performance for Reporting

In [168]:
# Re-create and re-fit the chosen configuration on the training split before scoring the hold-out set.
# NOTE(review): per the recorded validation outputs, n_estimators=35 scored slightly worse than
# n_estimators=10 (MAE 0.676 vs 0.664, RMSE 0.859 vs 0.783) — confirm 35 is the intended choice.
model = RandomForestRegressor(n_estimators=35,random_state=101)
model.fit(X_train,y_train)
Out[168]:
RandomForestRegressor(n_estimators=35, random_state=101)
In [169]:
# Predict on the hold-out split, which was not used for fitting or for the validation metrics.
test_predictions = model.predict(X_holdout_test)
In [170]:
# Mean absolute error on the hold-out split.
mean_absolute_error(y_true=y_holdout_test, y_pred=test_predictions)
Out[170]:
0.5817142857142852
In [171]:
# Root mean squared error on the hold-out split (square root of the MSE).
mean_squared_error(y_true=y_holdout_test, y_pred=test_predictions) ** 0.5
Out[171]:
0.730550812603694

Full Training

In [28]:
# Fresh estimator with the same settings as the model scored on the hold-out split.
final_model = RandomForestRegressor(
    n_estimators=35,
    random_state=101,
)
In [30]:
# Fit the final model on the full dataset (all of X and y, with no rows held out).
# NOTE(review): the recorded output for this cell shows RandomForestRegressor() with default
# parameters, which does not match the final_model constructor above
# (n_estimators=35, random_state=101) — likely stale output; re-run to confirm.
final_model.fit(X,y)
Out[30]:
RandomForestRegressor()

Saving Model (and anything else as pickle file)

In [34]:
import joblib
In [35]:
# Serialize the fitted model to disk in the current working directory.
joblib.dump(final_model, 'final_model.pkl')
Out[35]:
['final_model.pkl']
In [36]:
X.columns
Out[36]:
Index(['TV', 'radio', 'newspaper'], dtype='object')
In [37]:
list(X.columns)
Out[37]:
['TV', 'radio', 'newspaper']
In [38]:
# Persist the feature column names (as a plain list) alongside the model.
joblib.dump(list(X.columns), 'column_names.pkl')
Out[38]:
['column_names.pkl']

Loading Model (Model Persistence)

In [40]:
# Restore the saved list of feature column names.
# joblib.load unpickles the file, so only load files from a trusted source.
col_names = joblib.load('column_names.pkl')
In [41]:
col_names
Out[41]:
['TV', 'radio', 'newspaper']
In [42]:
# Restore the saved model from disk.
# joblib.load unpickles the file, so only load files from a trusted source.
loaded_model = joblib.load('final_model.pkl')
In [43]:
# Wrap the sample in a DataFrame labelled with the persisted column names so the
# features line up with the names the model was fitted on. A bare nested list
# carries no feature names, which makes scikit-learn warn at predict time.
loaded_model.predict(pd.DataFrame([[230.1, 37.8, 69.2]], columns=col_names))
Out[43]:
array([21.998])

</html>