Muhammad Mohsin Raza (original development) & Chris Harding (code review and documentation updates)
pip install <package>
scikit-learn:
eli5 (https://eli5.readthedocs.io/en/latest/overview.html) is only available via conda-forge or pip. It is only used in one cell to print out the Variable permutation importance, so you have trouble installing it you could skip it.
graphviz and pydotplus are only used to plot a decision tree. Note that the graphviz moduls is called python-graphviz, not graphviz in anaconda. Again, if you have trouble installing it, you can skip it. You won't be able to choose a specific tree but we have included an example of a tree plot.
If you have ArcGIS (Desktop or Pro) you already have arcpy. If you don't have ArcGIS, you cannot simply install arcpy via anaconda as it is not freely available outside of ArcGIS
If you do not have ArcGIS, you can still use most of this code but you will need to somehow get the attribute table in something like xls or csv format instead.
SDS: state of quadrat: 1 = diseased with SDS, 0 = healthy
As you cannot run arcpy methods, you will need to instead have to read in the xls table into pandas (see commented out cell)
import numpy as NUM
import numpy as np
import pandas as PD
import matplotlib.pyplot as PLOT
import seaborn as SEA
from sklearn.ensemble import RandomForestClassifier # module to install is called scikit-learn
# only needed for Variable permutation importance
import eli5
# only needed to plot a node-graph of a decision tree
import graphviz
import pydotplus
import arcpy # comment out if you don't have ArcGIS
# Set the workspace environment to local file geodatabase
arcpy.env.workspace = "SDS project data.gdb"
#arcpy.env.workspace = "." # current folder, contains a shapefile
# Select the featureclass and list its fields
feature_class = 'Soybean_Quadrats_2016_zstats_cr_T20160705_120520_0c65_3B_AnalyticMS' # feature class in geoDB
#feature_class = "Soybean_Quadrats_2016_zstats_cr_T20160705_120520_0c65_3B_AnalyticMS.shp" # shapefile
fields = arcpy.ListFields(feature_class)
for field in fields:
print(field.name, " type:", field.type)
# Non-ArcGIS alternative: use a xls file and look at its first row to see the variable names
import xlrd
table_name = "Soybean_Quadrats_2016_zstats_cr_T20160705_120520_0c65_3B_AnalyticMS.xls"
workbook = xlrd.open_workbook(table_name)
sheet = workbook.sheet_by_index(0) # assumes that your table in the first worksheet
row = sheet.row(0) # assumes that the first row contains the variable names
for cell in row:
print(cell.value)
# Names of explanatory variables (used to predict the response variable)
predictVars = ['MEAN_1', 'MEAN_2', 'MEAN_3', 'MEAN_4', 'Rotation']
#predictVars = ['STD_1', 'STD_2', 'STD_3', 'STD_4', 'Rotation']
print("exploratory variables:")
for e in predictVars: print(e)
# name of response Variable
classVar = ['SDS']
print("\nrespones variable:", classVar[0])
# list with all variables
allVars = predictVars + classVar
# Convert feature class attribute table to numpy array
# also get Quadrat id as "name" for each polyon. This isn't use in the model
# but is useful for cross checking with a GIS
column_names = ["Quadrat"] + allVars
trainFC = arcpy.da.FeatureClassToNumPyArray(feature_class, column_names)
print(column_names)
print(trainFC[:5]) # show first 5 rows
# Convert numpy array to Pandas dataframe
data = PD.DataFrame(trainFC, columns=column_names)
display(data.head())
# Non-ArcGIS only:
# make dataframe from xls
#data = PD.read_excel(table_name, usecols=column_names) # use only these columns, but they may be in a different order!
#data = data.reindex(column_names, axis=1) # re-arrange so it matches the column_names list
#display(data.head())
# use better names for the band numbers (NIR = Near Infrared)
new_names = {'MEAN_1': 'Blue', 'MEAN_2': 'Green', 'MEAN_3': 'Red', 'MEAN_4': 'NIR'}
data.rename(columns=new_names, inplace=True)
display(data.head())
# Calculate NDVI and put it in a new column
ndvi = (data["NIR"] - data["Red"]) / (data["NIR"] + data["Red"])
data.insert(5, 'NDVI', ndvi)
display(data.head())
numeric_vars = ["Blue", "Green", "Red", "NIR", "NDVI"]
#numeric_vars = ["Blue", "Green", "Red", "NIR"]
series = [data[name] for name in numeric_vars]
num_df = PD.concat(series, axis=1)
num_df.describe() # summary statistics
for m in ("pearson", "spearman", "kendall"):
print("\n", m, "correlation coefficient:")
corr = num_df.corr(method=m)
# uncomment these for larger plots
#PLOT.figure(figsize=(12, 12))
#PLOT.rc('font', size=12) # kludgy way to set fontsize for plots
ax = SEA.heatmap(corr,
label=m,
cmap=SEA.diverging_palette(20, 150,center="light", as_cmap=True),
vmin=-1, vmax=1, # min/max for color ramp
square=True, # square cells
fmt=".2f", # 2 decimals
annot=True, # draw values at cell center
#linecolor='w', # white line separators
linewidths=1)
PLOT.show()
There's generally a high correlation among Blue, Green and Red and between NIR and NDVI, suggesting that Random Forest is a good choice as it is robust to multicollinearity.
# Rotation is a categorical variable with 3 different levels that encodes the type of crop rotation
# used in each quadrant. It is initally a string but it is easier if each level is encoded as an integer
def tran_Rotation(x):
if x == 'S2':
return 2
if x == 'S3':
return 3
if x == 'S4':
return 4
data['Rotation'] = data['Rotation'].apply(tran_Rotation)
data['Rotation'] = data['Rotation'].astype('category')
display(data.head(15))
# create separate data frames for Explanatory and Response variables:
expl_vars = ['Blue', 'Green', 'Red', 'NIR', 'NDVI', 'Rotation']
#expl_vars = ['Blue', 'Green', 'Red', 'NIR', 'Rotation']
resp_var = "SDS"
print("Predicting", resp_var , "from", expl_vars)
X = data[expl_vars] # Explanatory variables
y = data[resp_var] # Response variable
from sklearn.model_selection import train_test_split
# Split dataset into training set and test set
SPLIT_RND_SEED = 12345
X_train, X_test, y_train, y_test = train_test_split(X, y,
test_size=0.3,
random_state=SPLIT_RND_SEED) # 70% training and 30% test
print("Using", len(X_train), "quadrants for training,", len(y_test), "quadrants for testing")
# Import Random Forest Model
from sklearn.ensemble import RandomForestClassifier
# Create a Gaussian Classifier with 500 trees
rf_simple = RandomForestClassifier(n_estimators=500,
oob_score=True,
random_state=12345, # random number to be used, needed to reproduce the same result
verbose=False)
# Train the model using the training sets
c_simple = rf_simple.fit(X_train, y_train)
# printing rhe classifier object shows its parameters
print(c_simple)
rf_gridsearch.best_params_
, which are shown here in best_params
# classifier with optimized parameters
best_params = {'max_depth': 5, 'max_features': 3, 'min_samples_leaf': 3, 'n_estimators': 20}
rf = RandomForestClassifier(**best_params,
oob_score=True,
random_state=12345,
verbose=False)
# Train the tuned model using the training sets
c = rf.fit(X_train, y_train)
print(c)
# Accuracy of SDS prediction in the training and testing dataset
def prediction_accuracy(rf, X_train, y_train, X_test, y_test):
print('Accuracy on the training subset: {:.3f}'.format(rf.score(X_train, y_train)))
print('Accuracy on the test subset: {:.3f}'.format(rf.score(X_test, y_test)))
# Out of bag score and accuracy
from sklearn.metrics import accuracy_score
def OOB_score_and_accuracy(rf, X_train, y_train, X_test, y_test):
y_pred = rf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Out-of-bag score estimate: {rf.oob_score_:.3}')
print(f'Mean accuracy score: {accuracy:.3}')
A confusion matrix is a table that is used to evaluate the quality of the predictions made be the model from the test data set, compared the the ground truth. 0 represents healthy quadrats and 1 represents diseased quadrats.
The 2 x 2 matrix shows the number of:
from sklearn.metrics import confusion_matrix
# Confusion (error) Matrix of Prediction
def plot_confusion_matrix(X_test, y_test):
# predict y from test
y_pred = rf.predict(X_test)
cm = PD.DataFrame(confusion_matrix(y_test, y_pred))
print('Confusion (error) matrix of prediction')
print(cm)
# use seaborn to plot matrix as heatmap
PLOT.rc('font', size=16)
p = SEA.heatmap(cm,
annot=True,
cbar=False,
cmap="Oranges")
PLOT.ylabel('Ground Truth SDS')
PLOT.xlabel('Predicted SDS')
return p
from sklearn.metrics import classification_report
def print_classification_report(X_test, y_test):
y_pred = rf.predict(X_test)
stats = classification_report(y_test, y_pred,
labels=None,
target_names=["Healty", "SDS"],
sample_weight=None,
digits=2,
output_dict=False)
print("Classification report:\n")
print(stats)
A Receiver Operator Characteristic (ROC) curve is a graphical plot used to show the diagnostic ability of a classifier (model).
from sklearn.datasets import make_classification
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
def plot_ROC_curve(X_test, y_test):
print("Receiver Operator Characteristic (ROC) curve")
# predict probabilities
probs = rf.predict_proba(X_test)
# keep probabilities for the positive outcome only
probs = probs[:, 1]
# calculate AUC
auc = roc_auc_score(y_test, probs)
print('AUC: %.3f' % auc)
# calculate roc curve
fpr, tpr, thresholds = roc_curve(y_test, probs)
# plot no skill
p = PLOT.plot([0, 1], [0, 1], linestyle='--')
# plot the roc curve for the model
PLOT.plot(fpr, tpr, marker='.')
PLOT.xlabel('False positive rate (1 - Specificity)')
PLOT.ylabel('True positive rate (Sensitivity)')
PLOT.title('ROC Curve')
return(p)
Cohen's Kappa is the measure of how well the classifier performed as compared to how well it would have performed simply by chance.
from sklearn.metrics import cohen_kappa_score
def kappa_statistics(X_test, y_test):
y_pred = rf.predict(X_test)
cohen_score = cohen_kappa_score(y_test, y_pred)
print("Kappa score:", cohen_score)
# Variable importance
def feature_importance(rf, X_test):
print("Variable importance:")
fi = PD.DataFrame({'variable name': list(X_test.columns),
'importance': rf.feature_importances_})
return fi.sort_values('importance', ascending = False)
# Permutation Importance
try:
from eli5.sklearn import PermutationImportance
except:
def permutation_importance(X_test, y_test):
print ("Variable permutation importance not available, eli5 not installed")
else:
def permutation_importance(X_test, y_test):
print("Variable permutation importance:")
perm = PermutationImportance(rf).fit(X_test, y_test)
p = eli5.show_weights(perm, feature_names=X.columns.tolist())
return p
print("Predicting", resp_var , "from", expl_vars)
print("Using", len(X_train), "quadrants for training,", len(y_test), "quadrants for testing")
# classifier with optimized parameters
best_params = {'max_depth': 2 , 'max_features': 3, 'min_samples_leaf': 3, 'n_estimators': 20}
rf = RandomForestClassifier(**best_params,
oob_score=True,
random_state=12345,
verbose=False)
c = rf.fit(X_train, y_train)
print(c)
# Show quality for tuned classifier (trained on training data) in predicting the test data
%matplotlib inline
prediction_accuracy(rf, X_train, y_train, X_test, y_test)
OOB_score_and_accuracy(rf, X_train, y_train, X_test, y_test)
print()
PLOT.show(plot_confusion_matrix(X_test, y_test))
print()
print_classification_report(X_test, y_test)
print()
#The ROC curve and Area under curve (AUC) value of 0.92 indicates that our model detected SDS very accurately.
PLOT.show(plot_ROC_curve(X_test, y_test))
print()
kappa_statistics(X_test, y_test)
print()
display(feature_importance(rf, X_test))
print()
display(permutation_importance(X_test, y_test));
from IPython.display import Image
from sklearn.tree import export_graphviz
import pydotplus
from io import StringIO
# show simple inlined images for plots, etc.
%matplotlib inline
# graph tree i of rt
def graph_tree(rf, i):
print("Showing tree", i)
# Extract the tree
estimator = rf.estimators_[i]
#print(estimator)
# Create a .dot file
dot_data_buffer = StringIO() # in-memory 'file'
export_graphviz(estimator,
out_file=dot_data_buffer,
feature_names=X.columns, #
class_names = ['healthy', 'diseased'],
rounded=True,
proportion=False,
precision=2,
filled=True,
special_characters=True)
dotgraph = pydotplus.graph_from_dot_data(dot_data_buffer.getvalue())
# save the graph as a png file
#filename = "tree_graph" + str(i) + ".png"
#dotgraph.write_png(filename) # save as png file to disk
# Show and image of the tree
#img = Image(filename=filename) # use file written to disk
img = Image(dotgraph.create_png()) # or directly from buffer
display(img)
print("Total of ", c.n_estimators, "trees were created")
graph_tree(rf, 0) # plot a single tree, e.g. Tree 0, change to see other trees
# show all trees
#for i in range(0, c.n_estimators): graph_tree(rf, i) # plot range of trees
106 samples were randomly used for training, while 62 remained out-of-bag (OOB)
We are going to use this example of a decision tree, which should be very similar to a tree in the model used above.:,
[d, h]
Number of samples at that node per category with diseased (d) left and health (h) right.from sklearn.model_selection import GridSearchCV
import pandas as pd
model = RandomForestClassifier(n_jobs=-1, random_state=12345, verbose=2)
# Important parameters to tune
# n_estimators (“ntree” in R)
# max_features(“mtry” in R)
# min_sample_leaf (“nodesize” in R)
grid = {'n_estimators': [10, 20, 50, 100],
'max_features': [2, 3, 4],
'max_depth': [5, 6, 7, 8, 10],
'min_samples_leaf': [1, 3, 5, 7, 10]}
rf_gridsearch = GridSearchCV(estimator=model,
param_grid=grid,
scoring='roc_auc',
n_jobs=-1,
cv=5,
verbose=2,
return_train_score=True)
rf_gridsearch.fit(X_train, y_train)
# and after some time...
df_gridsearch = pd.DataFrame(rf_gridsearch.cv_results_)
#Best parameters
best_n_estimators_value = rf_gridsearch.best_params_['n_estimators']
best_max_features_value = rf_gridsearch.best_params_['max_features']
best_max_depth_value = rf_gridsearch.best_params_['max_depth']
best_min_samples_leaf = rf_gridsearch.best_params_['min_samples_leaf']
#Best AUC score
best_score = rf_gridsearch.best_score_
print(best_score)
print("Best parameters are:", rf_gridsearch.best_params_)
estimators_list = list(rf_gridsearch.cv_results_['param_n_estimators'].data)
max_features_list = list(rf_gridsearch.cv_results_['param_max_features'].data)
max_depth_list = list(rf_gridsearch.cv_results_['param_max_depth'].data)
min_samples_leaf_list = list(rf_gridsearch.cv_results_['param_min_samples_leaf'].data)
import seaborn as sns
import matplotlib.pyplot as plt
print("AUC values for Estimators (number of trees) vs all other types of parameters")
sns.set_style("whitegrid")
fig = plt.figure(figsize=(25, 15), dpi=300)
plt.rc('font', size=12)
fig.subplots_adjust(hspace=0.4, wspace=0.4)
# n_estimators vs Maximum depth
plt.subplot(3, 2, 1)
data = pd.DataFrame(data={'Estimators': estimators_list, 'Max Depth': max_depth_list,
'AUC': rf_gridsearch.cv_results_['mean_train_score']})
data = data.pivot_table(index='Estimators', columns='Max Depth', values='AUC')
sns.heatmap(data, fmt=".3f", annot=True, cmap="YlGnBu").set_title('AUC for Training data')
plt.subplot(3, 2, 2)
data = pd.DataFrame(data={'Estimators': estimators_list, 'Max Depth': max_depth_list,
'AUC': rf_gridsearch.cv_results_['mean_test_score']})
data = data.pivot_table(index='Estimators', columns='Max Depth', values='AUC')
sns.heatmap(data, fmt=".3f", annot=True, cmap="YlGnBu").set_title('AUC for Test data')
# n_estimators vs Maximum features
plt.subplot(3, 2, 3)
data = pd.DataFrame(data={'Estimators': estimators_list, 'Max Features': max_features_list,
'AUC': rf_gridsearch.cv_results_['mean_train_score']})
data = data.pivot_table(
index='Estimators', columns='Max Features', values='AUC')
sns.heatmap(data, fmt=".3f", annot=True, cmap="YlGnBu").set_title('AUC for Training data')
plt.subplot(3, 2, 4)
data = pd.DataFrame(data={'Estimators': estimators_list, 'Max Features': max_features_list,
'AUC': rf_gridsearch.cv_results_['mean_test_score']})
data = data.pivot_table(
index='Estimators', columns='Max Features', values='AUC')
sns.heatmap(data, fmt=".3f", annot=True, cmap="YlGnBu").set_title('AUC for Test data')
# n_estimators vs Minimum sample leaf
plt.subplot(3, 2, 5)
data = pd.DataFrame(data={'Estimators': estimators_list, 'Minimum Samples at Leaf node':
min_samples_leaf_list, 'AUC': rf_gridsearch.cv_results_['mean_train_score']})
data = data.pivot_table(
index='Estimators', columns='Minimum Samples at Leaf node', values='AUC')
sns.heatmap(data, fmt=".3f", annot=True, cmap="YlGnBu").set_title('AUC for Training data')
plt.subplot(3, 2, 6)
data = pd.DataFrame(data={'Estimators': estimators_list, 'Minimum Samples at Leaf node':
min_samples_leaf_list, 'AUC': rf_gridsearch.cv_results_['mean_test_score']})
data = data.pivot_table(
index='Estimators', columns='Minimum Samples at Leaf node', values='AUC')
sns.heatmap(data, fmt=".3f", annot=True, cmap="YlGnBu").set_title('AUC for Test data');
print("AUC values for combinations of Max Features vs all other types of parameters")
sns.set_style("whitegrid")
fig = plt.figure(figsize=(25, 15), dpi=300)
plt.rc('font', size=12)
fig.subplots_adjust(hspace=0.4, wspace=0.4)
# Maximum features vs Maximum depth
plt.subplot(3, 2, 1)
data = pd.DataFrame(data={'Max Features': max_features_list, 'Max Depth': max_depth_list,
'AUC': rf_gridsearch.cv_results_['mean_train_score']})
data = data.pivot_table(index='Max Features',
columns='Max Depth', values='AUC')
sns.heatmap(data, fmt=".3f", annot=True, cmap="YlGnBu").set_title('AUC for Training data')
plt.subplot(3, 2, 2)
data = pd.DataFrame(data={'Max Features': max_features_list,
'Max Depth': max_depth_list, 'AUC': rf_gridsearch.cv_results_['mean_test_score']})
data = data.pivot_table(index='Max Features',
columns='Max Depth', values='AUC')
sns.heatmap(data, fmt=".3f", annot=True, cmap="YlGnBu").set_title('AUC for Test data')
# Maximum features vs Minimum sample leaf
plt.subplot(3, 2, 3)
data = pd.DataFrame(data={'Max Features': max_features_list, 'Minimum Samples at Leaf node':
min_samples_leaf_list, 'AUC': rf_gridsearch.cv_results_['mean_train_score']})
data = data.pivot_table(index='Max Features',
columns='Minimum Samples at Leaf node', values='AUC')
sns.heatmap(data, fmt=".3f", annot=True, cmap="YlGnBu").set_title('AUC for Training data')
plt.subplot(3, 2, 4)
data = pd.DataFrame(data={'Max Features': max_features_list, 'Minimum Samples at Leaf node':
min_samples_leaf_list, 'AUC': rf_gridsearch.cv_results_['mean_test_score']})
data = data.pivot_table(index='Max Features',
columns='Minimum Samples at Leaf node', values='AUC')
sns.heatmap(data, fmt=".3f", annot=True, cmap="YlGnBu").set_title('AUC for Test data');
print("AUC values for Max depth vs Minimum sample leaf")
sns.set_style("whitegrid")
fig = plt.figure(figsize=(25, 15), dpi=300)
plt.rc('font', size=12)
fig.subplots_adjust(hspace=0.4, wspace=0.4)
# Maximum depth vs Minimum sample leaf
plt.subplot(2, 2, 1)
data = pd.DataFrame(data={'Max Depth': max_depth_list, 'Minimum Samples at Leaf node':
min_samples_leaf_list, 'AUC': rf_gridsearch.cv_results_['mean_train_score']})
data = data.pivot_table(
index='Max Depth', columns='Minimum Samples at Leaf node', values='AUC')
sns.heatmap(data, fmt=".3f", annot=True, cmap="YlGnBu").set_title('AUC for Training data')
plt.subplot(2, 2, 2)
data = pd.DataFrame(data={'Max Depth': max_depth_list, 'Minimum Samples at Leaf node':
min_samples_leaf_list, 'AUC': rf_gridsearch.cv_results_['mean_test_score']})
data = data.pivot_table(
index='Max Depth', columns='Minimum Samples at Leaf node', values='AUC')
sns.heatmap(data, fmt=".3f", annot=True, cmap="YlGnBu").set_title('AUC for Test data');