import pandas as pd
import matplotlib.pyplot as plt
import pydot
from IPython.display import Image
import graphviz
from sklearn import tree
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier, export_graphviz
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, confusion_matrix, classification_report, accuracy_score
from six import StringIO
import warnings
warnings.simplefilter('ignore')
%matplotlib inline
import seaborn as sns
plt.style.use('seaborn-white')
Define a function to create images of tree models using pydot.
def print_tree(estimator, features, class_names=None, filled=True):
    # Export the fitted tree to DOT format and build a pydot graph from it.
    dot_data = StringIO()
    export_graphviz(estimator, out_file=dot_data, feature_names=features,
                    class_names=class_names, filled=filled)
    graph = pydot.graph_from_dot_data(dot_data.getvalue())
    return graph
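If Graphviz is not installed, scikit-learn's built-in plotter can render a fitted tree directly with matplotlib. A minimal sketch (the helper name plot_tree_inline is hypothetical and not used in the rest of this notebook):
# Alternative: render a fitted tree with matplotlib only (no Graphviz/pydot needed).
def plot_tree_inline(estimator, features, class_names=None):
    fig, ax = plt.subplots(figsize=(12, 8))
    tree.plot_tree(estimator, feature_names=list(features), class_names=class_names,
                   filled=True, ax=ax)
    return fig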
We use the Carseats dataset from the ISLR R package, which contains sales of child car seats at 400 different stores.
carseats = pd.read_csv('Carseats.csv')
carseats.head()
 | Sales | CompPrice | Income | Advertising | Population | Price | ShelveLoc | Age | Education | Urban | US
---|---|---|---|---|---|---|---|---|---|---|---
0 | 9.50 | 138 | 73 | 11 | 276 | 120 | Bad | 42 | 17 | Yes | Yes |
1 | 11.22 | 111 | 48 | 16 | 260 | 83 | Good | 65 | 10 | Yes | Yes |
2 | 10.06 | 113 | 35 | 10 | 269 | 80 | Medium | 59 | 12 | Yes | Yes |
3 | 7.40 | 117 | 100 | 4 | 466 | 97 | Medium | 55 | 14 | Yes | Yes |
4 | 4.15 | 141 | 64 | 3 | 340 | 128 | Bad | 38 | 13 | Yes | No |
Sales: Unit sales (in thousands) at each location
CompPrice: Price charged by competitor at each location
Income: Community income level (in thousands of dollars)
Advertising: Local advertising budget for company at each location (in thousands of dollars)
Population: Population size in region (in thousands)
Price: Price company charges for car seats at each site
ShelveLoc: A factor with levels Bad, Good and Medium indicating the quality of the shelving location for the car seats at each site
Age: Average age of the local population
Education: Education level at each location
Urban: A factor with levels No and Yes to indicate whether the store is in an urban or rural location
US: A factor with levels No and Yes to indicate whether the store is in the US or not
We create the response variable High, which is True if Sales is at least 8 (that is, unit sales of at least 8,000) and False otherwise. We also convert Urban and US to booleans and map ShelveLoc to an ordered numeric code (Bad < Medium < Good).
carseats['High'] = carseats.Sales >= 8
carseats['Urban'] = carseats.Urban == 'Yes'
carseats['US'] = carseats.US == 'Yes'
carseats.ShelveLoc = carseats.ShelveLoc.map({'Bad':0, 'Medium':1, 'Good':2})
carseats.head()
 | Sales | CompPrice | Income | Advertising | Population | Price | ShelveLoc | Age | Education | Urban | US | High
---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 9.50 | 138 | 73 | 11 | 276 | 120 | 0 | 42 | 17 | True | True | True |
1 | 11.22 | 111 | 48 | 16 | 260 | 83 | 2 | 65 | 10 | True | True | True |
2 | 10.06 | 113 | 35 | 10 | 269 | 80 | 1 | 59 | 12 | True | True | True |
3 | 7.40 | 117 | 100 | 4 | 466 | 97 | 1 | 55 | 14 | True | True | False |
4 | 4.15 | 141 | 64 | 3 | 340 | 128 | 0 | 38 | 13 | True | False | False |
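As an aside, the categorical columns could also be one-hot encoded with pd.get_dummies rather than mapped by hand; a minimal sketch (the name carseats_dummies is hypothetical and not used below). Note that the manual mapping above keeps ShelveLoc ordinal (Bad < Medium < Good), which one-hot encoding would discard.
# Hypothetical alternative: one-hot encode ShelveLoc, Urban and US automatically.
# drop_first=True avoids redundant dummy columns.
carseats_dummies = pd.get_dummies(pd.read_csv('Carseats.csv'),
                                  columns=['ShelveLoc', 'Urban', 'US'],
                                  drop_first=True)
carseats_dummies.head()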
Fit a classification tree to predict High by using all variables except Sales.
model = DecisionTreeClassifier()
model.fit(X=carseats.drop(['Sales', 'High'], axis=1) , y=carseats.High)
graph, = print_tree(model, features=carseats.columns[1:-1],class_names=['Low', 'High'])
Image(graph.create_png())
Now we split the data in half and fit the classification tree on the training set only.
carseats_train, carseats_test = train_test_split(carseats, train_size=0.5, random_state=1)
model = DecisionTreeClassifier()
model.fit(X=carseats_train.drop(['Sales', 'High'], axis=1) , y=carseats_train.High)
graph, = print_tree(model, features=carseats.columns[1:-1],class_names=['Low', 'High'])
Image(graph.create_png())
Predict the class on the test dataset and construct the confusion matrix.
predict = model.predict(X=carseats_test.drop(['Sales', 'High'], axis=1))
cm = pd.DataFrame(confusion_matrix(carseats_test.High,predict), index=['Low','High'], columns=['Low','High'])
cm.index.name = 'True'
cm.columns.name = 'Predicted'
cm
Predicted | Low | High
---|---|---
True | |
Low | 81 | 38
High | 27 | 54
print(classification_report(carseats_test.High, predict, target_names=['Low','High']))
              precision    recall  f1-score   support

         Low       0.75      0.68      0.71       119
        High       0.59      0.67      0.62        81

    accuracy                           0.68       200
   macro avg       0.67      0.67      0.67       200
weighted avg       0.68      0.68      0.68       200
Misclassification error:
sum(predict != carseats_test.High) / len(predict)
0.325
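The same value can be computed as 1 minus the accuracy, using accuracy_score (imported above):
# Misclassification error is 1 - accuracy.
1 - accuracy_score(carseats_test.High, predict)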
We now consider whether restricting the size of the tree (analogous to pruning) improves the test error; we tune max_leaf_nodes by 10-fold cross-validation.
parameters = {'max_leaf_nodes':range(2, 50)}
model = DecisionTreeClassifier()
cv_tree = GridSearchCV(model, parameters, cv=10, scoring='accuracy')
cv_tree.fit(X=carseats_train[carseats_train.columns[1:-1]] , y=carseats_train.High)
cv_tree.cv_results_.keys()
fig, ax = plt.subplots()
ax.plot(range(2, 50), 1 - cv_tree.cv_results_['mean_test_score'], 'ro-')
ax.set_ylabel('Error Rate')
ax.set_xlabel('Tree size')
Text(0.5, 0, 'Tree size')
best_model = cv_tree.best_estimator_
cv_tree.best_params_
{'max_leaf_nodes': 8}
graph, = print_tree(best_model, features=carseats.columns[1:-1],class_names=['Low', 'High'])
Image(graph.create_png())
Compute the misclassification error using the pruned tree.
best_predict = best_model.predict(X=carseats_test[carseats_test.columns[1:-1]])
sum(best_predict != carseats_test.High) / len(best_predict)
0.275
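Tuning max_leaf_nodes is one way to control tree size; scikit-learn also offers cost-complexity pruning via the ccp_alpha parameter, which is closer to the pruning described in ISLR. A minimal sketch on the same training split (results not shown; the names X_train and cv_alpha are only for illustration):
# Sketch: candidate alphas from the cost-complexity pruning path, then 10-fold CV over them.
X_train = carseats_train.drop(['Sales', 'High'], axis=1)
path = DecisionTreeClassifier().cost_complexity_pruning_path(X_train, carseats_train.High)
cv_alpha = GridSearchCV(DecisionTreeClassifier(),
                        {'ccp_alpha': path.ccp_alphas.clip(min=0)},  # guard against tiny negative alphas
                        cv=10, scoring='accuracy')
cv_alpha.fit(X_train, carseats_train.High)
cv_alpha.best_params_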
We use the Boston Housing dataset; MEDV, the median home value, is the response variable.
boston = pd.read_csv('Boston.csv')
boston.head()
 | CRIM | ZN | INDUS | CHAS | NOX | RM | AGE | DIS | RAD | TAX | PTRATIO | BLACK | LSTAT | MEDV
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 0.00632 | 18.0 | 2.31 | 0 | 0.538 | 6.575 | 65.2 | 4.0900 | 1 | 296 | 15.3 | 396.90 | 4.98 | 24.0 |
1 | 0.02731 | 0.0 | 7.07 | 0 | 0.469 | 6.421 | 78.9 | 4.9671 | 2 | 242 | 17.8 | 396.90 | 9.14 | 21.6 |
2 | 0.02729 | 0.0 | 7.07 | 0 | 0.469 | 7.185 | 61.1 | 4.9671 | 2 | 242 | 17.8 | 392.83 | 4.03 | 34.7 |
3 | 0.03237 | 0.0 | 2.18 | 0 | 0.458 | 6.998 | 45.8 | 6.0622 | 3 | 222 | 18.7 | 394.63 | 2.94 | 33.4 |
4 | 0.06905 | 0.0 | 2.18 | 0 | 0.458 | 7.147 | 54.2 | 6.0622 | 3 | 222 | 18.7 | 396.90 | 5.33 | 36.2 |
CRIM: per capita crime rate by town.
ZN: proportion of residential land zoned for lots over 25,000 sq.ft.
INDUS: proportion of non-retail business acres per town.
CHAS: Charles River dummy variable (= 1 if tract bounds river; 0 otherwise).
NOX: nitrogen oxides concentration (parts per 10 million).
RM: average number of rooms per dwelling.
AGE: proportion of owner-occupied units built prior to 1940.
DIS: weighted mean of distances to five Boston employment centres.
RAD: index of accessibility to radial highways.
...
MEDV: median value of owner-occupied homes in $1000s.
Split the data into training and test datasets.
boston_train, boston_test = train_test_split(boston, train_size=0.5, random_state=1)
We fit a regression tree, using MSE (the default criterion) to choose splits and limiting the maximum depth of the tree to 3.
model = DecisionTreeRegressor(max_depth=3)
model.fit(X=boston_train.drop(['MEDV'], axis=1), y=boston_train.MEDV)
graph, = print_tree(model, features=boston.columns.drop(['MEDV']))
Image(graph.create_png())
Compare the predicted and true values.
predict = model.predict(X=boston_test.drop(['MEDV'], axis=1))
fig, ax = plt.subplots()
ax.scatter(predict, boston_test.MEDV, label='MEDV')
ax.plot([0, 1], [0, 1], '--k', transform=ax.transAxes)
ax.set_xlabel('Predict')
ax.set_ylabel('Test')
Text(0, 0.5, 'Test')
Compute the test MSE.
mean_squared_error(boston_test.MEDV,predict)
24.07874500815818
We use 10-fold cross-validation to find the best number of leaf nodes. Note that all scorer objects follow the convention that higher return values are better than lower return values, so we use the negated MSE (scoring='neg_mean_squared_error').
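As a quick check of this convention, cross_val_score (imported above) returns negative MSE values; a minimal sketch for a single, arbitrarily chosen tree size:
# Sketch: 10-fold CV MSE for one candidate tree size; note the sign flip.
scores = cross_val_score(DecisionTreeRegressor(max_leaf_nodes=8),
                         boston.drop(['MEDV'], axis=1), boston.MEDV,
                         cv=10, scoring='neg_mean_squared_error')
(-scores).mean()
The grid search below automates this over a range of tree sizes: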
parameters = {'max_leaf_nodes': range(2, 15)}
model = DecisionTreeRegressor()
cv_tree = GridSearchCV(model, parameters, cv=10, scoring='neg_mean_squared_error')
cv_tree.fit(X=boston.drop(['MEDV'], axis=1), y=boston.MEDV)
GridSearchCV(cv=10, estimator=DecisionTreeRegressor(), param_grid={'max_leaf_nodes': range(2, 15)}, scoring='neg_mean_squared_error')
fig, ax = plt.subplots()
ax.plot(range(2, 15), -cv_tree.cv_results_['mean_test_score'], 'ro-')
ax.set_xlabel('Tree size')
ax.set_ylabel('MSE')
Text(0, 0.5, 'MSE')
cv_tree.best_params_
{'max_leaf_nodes': 8}
We use the tree selected by cross-validation; GridSearchCV refits it automatically and exposes it as best_estimator_.
best_model = cv_tree.best_estimator_
graph, = print_tree(best_model, features=boston.columns.drop(['MEDV']))
Image(graph.create_png())
best_predict = best_model.predict(X=boston_test.drop(['MEDV'], axis=1))
fig, ax = plt.subplots()
ax.scatter(best_predict, boston_test.MEDV, label='MEDV')
ax.plot([0, 1], [0, 1], '--k', transform=ax.transAxes)
ax.set_xlabel('Predict')
ax.set_ylabel('Test')
Text(0, 0.5, 'Test')
mean_squared_error(boston_test.MEDV,best_predict)
15.466926404185816
X = boston_train.drop(['MEDV'], axis=1)
y = boston_train.MEDV
X.shape
(253, 13)
There are 13 features in the dataset. Recall that bagging is simply a special case of a random forest with $m = p$, so here we set max_features to use all 13 features.
bagging = RandomForestRegressor(max_features=13, random_state=1)
bagging.fit(X=X, y=y)
RandomForestRegressor(max_features=13, random_state=1)
Predict on the test data using the bagged model and compare the predictions with the true values.
predict = bagging.predict(X=boston_test.drop(['MEDV'], axis=1))
fig, ax = plt.subplots()
ax.scatter(predict, boston_test.MEDV, label='MEDV')
ax.plot([0, 1], [0, 1], '--k', transform=ax.transAxes)
ax.set_xlabel('Predict')
ax.set_ylabel('Test')
Text(0, 0.5, 'Test')
The test MSE:
mean_squared_error(boston_test.MEDV,predict)
12.02672559683794
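Since each bagged tree is fit on a bootstrap sample, the held-out (out-of-bag) observations give an internal error estimate without touching the test set. A minimal sketch using the oob_score option (the name bagging_oob is only for illustration; results not shown):
# Sketch: out-of-bag predictions for the bagged model.
bagging_oob = RandomForestRegressor(max_features=13, oob_score=True, random_state=1)
bagging_oob.fit(X=X, y=y)
# oob_score_ is the R^2 of the OOB predictions; the OOB MSE can be computed directly.
mean_squared_error(y, bagging_oob.oob_prediction_)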
Growing a random forest proceeds in exactly the same way. By default, RandomForestRegressor uses max_features $= p$; here we use max_features $= \sqrt{p}$.
randomForest = RandomForestRegressor(max_features="sqrt", random_state=11)
randomForest.fit(X=X, y=y)
RandomForestRegressor(max_features='sqrt', random_state=11)
predict = randomForest.predict(X=boston_test.drop(['MEDV'], axis=1))
fig, ax = plt.subplots()
ax.scatter(predict, boston_test.MEDV, label='MEDV')
ax.plot([0, 1], [0, 1], '--k', transform=ax.transAxes)
ax.set_xlabel('Predict')
ax.set_ylabel('Test')
print(f"The test MSE is {mean_squared_error(boston_test.MEDV, predict)}.")
The test MSE is 13.01357290513834.
Now we try max_features $ = 6$.
randomForest = RandomForestRegressor(max_features=6, random_state=1)
randomForest.fit(X=X, y=y)
RandomForestRegressor(max_features=6, random_state=1)
predict = randomForest.predict(X=boston_test.drop(['MEDV'], axis=1))
fig, ax = plt.subplots()
ax.scatter(predict, boston_test.MEDV, label='MEDV')
ax.plot([0, 1], [0, 1], '--k', transform=ax.transAxes)
ax.set_xlabel('Predict')
ax.set_ylabel('Test')
print(f"The test MSE is {mean_squared_error(boston_test.MEDV,predict)}.")
The test MSE is 11.246821798418969.
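To see how the choice of $m$ affects the test error more systematically, one could loop over max_features values and record the test MSE; a minimal sketch (the names X_test and test_mse are only for illustration; results not shown above):
# Sketch: test MSE as a function of the number of features considered at each split.
X_test = boston_test.drop(['MEDV'], axis=1)
test_mse = {}
for m in range(1, 14):
    rf = RandomForestRegressor(max_features=m, random_state=1).fit(X, y)
    test_mse[m] = mean_squared_error(boston_test.MEDV, rf.predict(X_test))
pd.Series(test_mse).plot(marker='o')
plt.xlabel('max_features (m)')
plt.ylabel('Test MSE')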
We can view the importance of each variable.
Importance = pd.DataFrame({'Importance':randomForest.feature_importances_*100}, index=X.columns)
Importance.sort_values('Importance').plot(kind='barh', color='r', legend=False)
plt.xlabel('Variable Importance')
Recall two of the variables highlighted in the plot: RM is the average number of rooms per dwelling, and CHAS is the Charles River dummy variable (= 1 if the tract bounds the river; 0 otherwise).
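Impurity-based importances can be biased toward variables with many possible split points; permutation importance evaluated on the test set is a common cross-check. A minimal sketch using sklearn.inspection.permutation_importance (not run above):
# Sketch: permutation importance evaluated on the held-out test data.
from sklearn.inspection import permutation_importance
perm = permutation_importance(randomForest,
                              boston_test.drop(['MEDV'], axis=1), boston_test.MEDV,
                              n_repeats=10, random_state=1)
pd.Series(perm.importances_mean, index=X.columns).sort_values().plot(kind='barh')
plt.xlabel('Mean decrease in R^2 when permuted')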