Nonparametric methods (for Classification)¶

In this lecture, we shall look at some popular methods for classification (classification refers to the special case of the regression problem where the response variable $Y$ is binary). The specific methods that we shall consider are: (a) Nearest Neighbour classification, (b) Decision Trees, and (c) Random Forests. These methods can be compared to logistic regression which belongs to the family of GLMs that we have seen in the past few lectures. We will apply all these methods (starting with logistic regression) to the Mroz dataset that was analyzed in the last two lectures. This dataset has data on a bunch of economic variables related to labor force participation for married women in the year 1975.

Logistic Regression¶

We shall fit models with 'inlf' as the response variable and 'kidslt6', 'age', 'educ', 'huswage', 'exper', 'expersq' as the explanatory variables (also known as features or covariates). The response variable 'inlf' is binary, equaling 1 if the individual worked (i.e., they were 'in the labor force') in the year 1975 and 0 otherwise. The logistic regression model assumes that \begin{align*} Y \mid X_1 = x_1, \dots, X_m = x_m \sim \text{Bernoulli}(\mu(x_1, \dots, x_m)) ~~ \text{ where } ~~ \log \frac{\mu(x_1, \dots, x_m)}{1 - \mu(x_1, \dots, x_m)} = \beta_0 + \beta_1 x_1 + \dots + \beta_m x_m \end{align*} In words, the log-odds of the event $Y = 1$ is modeled by the linear function $\beta_0 + \beta_1 X_1 + \dots + \beta_m X_m$. The interpretation of $\beta_j$ (for $1 \leq j \leq m$) is the change in log-odds for a unit change in $X_j$ when all other variables are held fixed. The logistic regression model (and more generally every GLM) specifies the relationship between $Y$ and $X_1, \dots, X_m$ in terms of the parameters $\beta_0, \dots, \beta_m$ (which will have to be learned from data) and is thus an example of a parametric model.
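
To make the log-odds specification concrete, here is a minimal sketch (with made-up coefficient values, not estimates from the Mroz data) showing how a linear predictor is converted into a probability for $Y = 1$.

import numpy as np

beta = np.array([0.8, -1.5, -0.1])   # made-up values for beta_0, beta_1, beta_2
x = np.array([1.0, 1.0, 30.0])       # 1 for the intercept, then x_1, x_2
log_odds = beta @ x                                # beta_0 + beta_1 x_1 + beta_2 x_2
mu = np.exp(log_odds) / (1 + np.exp(log_odds))     # P(Y = 1 | X_1 = x_1, X_2 = x_2)
print(log_odds, mu)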

We can fit the logistic regression model easily using statsmodels.api as follows.

In [3]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import statsmodels.api as sm
In [4]:
#Loading the mroz dataset: 
mroz = pd.read_csv("MROZ.csv")
print(mroz.shape)
print(mroz.head(12))
print(mroz['inlf'].value_counts()) #this is the response variable which has 428 ones and 325 zeros
(753, 22)
    inlf  hours  kidslt6  kidsge6  age  educ    wage  repwage  hushrs  husage  \
0      1   1610        1        0   32    12  3.3540     2.65    2708      34   
1      1   1656        0        2   30    12  1.3889     2.65    2310      30   
2      1   1980        1        3   35    12  4.5455     4.04    3072      40   
3      1    456        0        3   34    12  1.0965     3.25    1920      53   
4      1   1568        1        2   31    14  4.5918     3.60    2000      32   
5      1   2032        0        0   54    12  4.7421     4.70    1040      57   
6      1   1440        0        2   37    16  8.3333     5.95    2670      37   
7      1   1020        0        0   54    12  7.8431     9.98    4120      53   
8      1   1458        0        2   48    12  2.1262     0.00    1995      52   
9      1   1600        0        2   39    12  4.6875     4.15    2100      43   
10     1   1969        0        1   33    12  4.0630     4.30    2450      34   
11     1   1960        0        1   42    11  4.5918     4.58    2375      47   

    ...  faminc     mtr  motheduc  fatheduc  unem  city  exper   nwifeinc  \
0   ...   16310  0.7215        12         7   5.0     0     14  10.910060   
1   ...   21800  0.6615         7         7  11.0     1      5  19.499981   
2   ...   21040  0.6915        12         7   5.0     0     15  12.039910   
3   ...    7300  0.7815         7         7   5.0     0      6   6.799996   
4   ...   27300  0.6215        12        14   9.5     1      7  20.100058   
5   ...   19495  0.6915        14         7   7.5     1     33   9.859054   
6   ...   21152  0.6915        14         7   5.0     0     11   9.152048   
7   ...   18900  0.6915         3         3   5.0     0     35  10.900038   
8   ...   20405  0.7515         7         7   3.0     0     24  17.305000   
9   ...   20425  0.6915         7         7   5.0     0     21  12.925000   
10  ...   32300  0.5815        12         3   5.0     0     15  24.299953   
11  ...   28700  0.6215        14         7   5.0     0     14  19.700071   

       lwage  expersq  
0   1.210154      196  
1   0.328512       25  
2   1.514138      225  
3   0.092123       36  
4   1.524272       49  
5   1.556480     1089  
6   2.120260      121  
7   2.059634     1225  
8   0.754336      576  
9   1.544899      441  
10  1.401922      225  
11  1.524272      196  

[12 rows x 22 columns]
inlf
1    428
0    325
Name: count, dtype: int64
In [5]:
#Logistic Regression
Y = mroz['inlf'] #this is the binary response variable
X = mroz[['kidslt6', 'age', 'educ', 
        'huswage', 'exper', 'expersq']] #our covariates kidslt6, age, educ, huswage, exper, expersq
X = sm.add_constant(X)
logimodel = sm.GLM(Y, X, family=sm.families.Binomial()).fit()
print(logimodel.summary())
                 Generalized Linear Model Regression Results                  
==============================================================================
Dep. Variable:                   inlf   No. Observations:                  753
Model:                            GLM   Df Residuals:                      746
Model Family:                Binomial   Df Model:                            6
Link Function:                  Logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -403.13
Date:                Wed, 11 Oct 2023   Deviance:                       806.25
Time:                        13:51:42   Pearson chi2:                     728.
No. Iterations:                     5   Pseudo R-squ. (CS):             0.2568
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.8430      0.757      1.114      0.265      -0.640       2.326
kidslt6       -1.4516      0.201     -7.232      0.000      -1.845      -1.058
age           -0.0945      0.014     -6.948      0.000      -0.121      -0.068
educ           0.2071      0.042      4.886      0.000       0.124       0.290
huswage       -0.0440      0.021     -2.066      0.039      -0.086      -0.002
exper          0.2079      0.032      6.540      0.000       0.146       0.270
expersq       -0.0032      0.001     -3.137      0.002      -0.005      -0.001
==============================================================================

What is the interpretation of the coefficient -1.4516 of the 'kidslt6' variable? It means that each additional small kid reduces the log-odds of working by 1.4516 (for fixed values of all the other variables in the model). Equivalently, the odds of working are multiplied by a factor of $\exp(-1.4516) \approx 0.234$ with each additional small kid, i.e., the odds drop by roughly 77% (for fixed values of all other variables in the model).
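
A quick sketch of this calculation, assuming the fitted statsmodels object logimodel from the cell above: exponentiating the fitted coefficients gives the multiplicative effect of each covariate on the odds.

odds_ratios = np.exp(logimodel.params)   # multiplicative change in the odds per unit change in each covariate
print(odds_ratios)
# For 'kidslt6', exp(-1.4516) is about 0.234: each additional small kid multiplies
# the odds of working by roughly 0.234 (a ~77% reduction), holding the other covariates fixed.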

We shall now fit the same logistic regression model using the scikit-learn Python library. This is because the other methods that we shall study today (k-NN, decision trees and random forests) cannot be fit via statsmodels.api. On the other hand, unlike statsmodels.api, scikit-learn does not provide functionality for every GLM (e.g., negative binomial regression).

In [6]:
#Loading the scikit-learn library for logistic regression:
from sklearn.linear_model import LogisticRegression
In [7]:
#Let us re-fit this logistic regression model using scikit-learn
Y = mroz['inlf']  # This is a binary variable
X = mroz[['kidslt6', 'age', 'educ', 'huswage', 'exper', 'expersq']]
logimodel_sk = LogisticRegression(max_iter = 1000)
logimodel_sk.fit(X, Y)
print("Intercept:", logimodel_sk.intercept_)
print("Coefficients:", logimodel_sk.coef_)
Intercept: [0.78500383]
Coefficients: [[-1.39555397 -0.09272429  0.20477548 -0.04376797  0.2071669  -0.0031617 ]]

The 'Intercept' above refers to the estimate for $\beta_0$ and 'Coefficients' refer to the estimates for $\beta_1, \dots, \beta_m$. Observe that these estimates are slightly different from those given by statsmodels.api. This is because scikit-learn does not compute the MLE by default. Instead, the default setting adds an $L_2$ penalty for regularization to $(-2) \times \text{log-likelihood}$ before minimizing the resulting criterion to estimate $\beta_0, \dots, \beta_m$. Such regularization is helpful (both for numerical computation and statistical purposes) especially in the case where the number of covariates $m$ is large. We shall see the usefulness of regularization in a future lecture.
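
The strength of this default $L_2$ penalty is controlled by the parameter C (the inverse of the penalty weight; smaller C means stronger shrinkage of the coefficients towards zero). A rough sketch of its effect, using the X and Y defined above (exact estimates will depend on the data and solver):

for C in [0.01, 1.0, 100.0]:
    m = LogisticRegression(C=C, max_iter=1000)   # penalty='l2' is the default
    m.fit(X, Y)
    print(C, m.coef_.round(3))                   # large C approaches the unpenalized MLE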

If we do not want regularization and want to simply compute the MLE, we can specify this via the "penalty = None" option as shown below.

In [8]:
#With no regularization
logimodel_sk = LogisticRegression(penalty=None, max_iter = 2000) #"penalty = None" makes it compute the MLE just like statsmodels.api
logimodel_sk.fit(X, Y)
print("Intercept:", logimodel_sk.intercept_)
print("Coefficients:", logimodel_sk.coef_)
#These estimates are essentially the same as the ones given by statsmodels.api
Intercept: [0.84388436]
Coefficients: [[-1.45179926 -0.09456605  0.20715796 -0.04402401  0.20796276 -0.00317439]]

Accuracy Assessment for Logistic Regression¶

It is important to assess the prediction accuracy of regression models. One simple way of doing this is to split the existing dataset into two parts: a training dataset and a test dataset. One then fits the model on the training data and uses the fitted model to obtain predictions on the test dataset. These predictions can be compared to the actual response values in the test data, and the discrepancy gives some idea of the prediction accuracy of the model. Below we evaluate the prediction accuracy of the fitted logistic regression model in this way by (randomly) splitting the data into a training dataset of size 503 and a test dataset of size 250. In practice, one would vary the sizes of the training and test datasets (and also average over multiple random splits) to get a more robust assessment of accuracy.
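
As a rough sketch of the 'multiple random splits' idea (using scikit-learn's train_test_split, which is demonstrated in the next few cells; the exact numbers will vary with the seeds), one could average test accuracies over several random splits:

from sklearn.model_selection import train_test_split

accs = []
for seed in range(20):
    X_tr, X_te, Y_tr, Y_te = train_test_split(X, Y, train_size=503, test_size=250, random_state=seed)
    m = LogisticRegression(penalty=None, max_iter=1000).fit(X_tr, Y_tr)
    accs.append(np.mean(Y_te == m.predict(X_te)))
print(np.mean(accs), np.std(accs))   # average test accuracy and its variability over splits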

The scikit-learn library has in-built functions for randomly splitting the data into training and test datasets.

In [9]:
#Split the original data into training and test datasets
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size=503, test_size=250, random_state=1)
In [10]:
#Let us fit logistic regression on the training dataset:
logimodel_train = LogisticRegression(penalty = None, max_iter = 1000)
logimodel_train.fit(X_train, Y_train)
print("Intercept:", logimodel_train.intercept_)
print("Coefficients:", logimodel_train.coef_)
Intercept: [0.8104027]
Coefficients: [[-1.42578681 -0.08662768  0.19075837 -0.05340385  0.22425066 -0.00391879]]

We now use this fitted logistic regression model to obtain predictions on the test dataset. Note that for a set of new covariate values $x_1, \dots, x_m$, the fitted logistic regression model gives the prediction: \begin{align*} \frac{\exp(\beta_0 + \beta_1 x_1 + \dots + \beta_m x_m)}{1 + \exp(\beta_0 + \beta_1 x_1 + \dots + \beta_m x_m)} \end{align*} for the probability that $Y = 1$. This is a number that can be anywhere between 0 and 1. If we want a binary prediction for the response, we need to place a threshold on the predicted probability. The most common threshold is 0.5 (i.e., if the fitted probability is greater than or equal to 0.5, the predicted class label is 1; otherwise, it is 0). One can obtain these binary predictions using the '.predict' method. Predicted probabilities can be obtained using the '.predict_proba' method.
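
A brief sketch of the connection between the two (using logimodel_train and X_test from above): thresholding the output of '.predict_proba' at 0.5 reproduces the labels returned by '.predict', and other thresholds can be substituted if false positives and false negatives have different costs.

probs = logimodel_train.predict_proba(X_test)[:, 1]   # predicted P(Y = 1) for each test observation
labels_05 = (probs >= 0.5).astype(int)                # same labels as .predict
labels_07 = (probs >= 0.7).astype(int)                # a stricter threshold predicts fewer ones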

In [11]:
#Prediction for the test data
Y_pred = logimodel_train.predict(X_test)
print(Y_pred)
accuracy_logimodel = np.mean(Y_test == Y_pred)
print(f"Accuracy of Logistic Regression on test set: {accuracy_logimodel}")
[1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 0 1 1 1 0 0 0 0 1 1 1 1 0 0 1 1 0 0 0 1 1
 0 1 1 1 0 1 1 0 0 1 1 1 0 0 1 0 0 1 1 1 1 1 1 0 0 1 1 1 1 1 0 1 1 1 0 0 0
 1 1 0 0 1 0 1 1 1 1 1 0 1 1 1 1 1 1 0 0 1 1 1 1 0 1 1 0 1 0 0 1 1 0 1 1 1
 1 0 1 1 1 1 1 1 1 0 1 1 0 1 1 1 1 1 1 1 1 1 0 1 0 1 1 1 1 1 0 0 1 1 0 1 1
 1 1 1 0 1 1 0 1 0 0 1 1 0 0 1 1 0 1 1 0 0 0 0 1 1 1 0 1 0 1 0 0 0 0 1 1 1
 1 0 1 1 1 1 0 1 0 0 0 1 1 1 1 1 1 1 1 0 1 1 1 0 1 1 0 1 1 1 1 0 1 1 1 0 1
 1 1 1 0 1 0 1 1 1 1 1 1 1 0 1 0 0 0 1 0 0 0 1 0 0 1 1 0]
Accuracy of Logistic Regression on test set: 0.728

Accuracy here is the proportion of correctly classified observations in the test dataset. One can also create the confusion matrix (or contingency table) comparing the predictions with the actual response values on the test dataset.

In [12]:
contingency_table = pd.crosstab(Y_test, Y_pred, rownames=['Actual'], colnames=['Predicted'], margins=True)
print(contingency_table)
Predicted   0    1  All
Actual                 
0          64   49  113
1          19  118  137
All        83  167  250

This table gives a further breakdown of the discrepancies between predictions and actual test response values. For example, 19 observations for which $Y = 1$ in the test dataset were predicted as 0, and 49 observations for which $Y = 0$ in the test dataset were predicted as 1. Also, logistic regression predicts more ones (167) than there are actual ones (137) in the test responses.
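
Instead of pd.crosstab, the same confusion matrix (plus per-class precision and recall) can be obtained from sklearn.metrics; a short sketch using the same Y_test and Y_pred:

from sklearn.metrics import confusion_matrix, classification_report

print(confusion_matrix(Y_test, Y_pred))        # rows: actual class, columns: predicted class
print(classification_report(Y_test, Y_pred))   # precision, recall and F1-score for each class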

Nearest Neighbour Classification¶

Given a positive integer $k$, the $k$-nearest neighbour classifier classifies each point based on the labels of the $k$ nearest points in the training set. For example, suppose we want to predict the response for a test observation with covariate values $x_1, \dots, x_m$. The $k$-nearest neighbour method identifies the $k$ observations in the training set whose covariate values are closest to $x_1, \dots, x_m$. It then looks at the observed response values for these $k$ observations in the training dataset. Suppose $k = 8$ and the response values of the $k$ closest training points are $1, 0, 1, 1, 1, 0, 0, 1$. Then the predicted probability for the response of the test observation equals $5/8 = 0.625$. If we want a binary prediction, it will be 1 as the predicted probability is more than 0.5.
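
Here is a from-scratch sketch of this rule for a single test observation (using the X_train, Y_train, X_test created above and Euclidean distance); it is meant only to illustrate the mechanics that KNeighborsClassifier automates below.

k = 8
x_new = np.asarray(X_test)[0]                          # one test observation
train = np.asarray(X_train)
dists = np.sqrt(((train - x_new) ** 2).sum(axis=1))    # Euclidean distances to all training points
nearest = np.argsort(dists)[:k]                        # indices of the k closest training points
prob_hat = np.asarray(Y_train)[nearest].mean()         # predicted probability that Y = 1
label_hat = int(prob_hat >= 0.5)                       # binary prediction
print(prob_hat, label_hat)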

scikit-learn has an in-built function for $k$-NN classification which works as follows.

In [13]:
from sklearn.neighbors import KNeighborsClassifier

#Using k-NN classification:
k_value = 8
knn = KNeighborsClassifier(n_neighbors = k_value)
knn.fit(X_train, Y_train)
Y_pred_knn = knn.predict(X_test)
accuracy_knn = np.mean(Y_test == Y_pred_knn)
print(f"Accuracy of k-NN Classification on test set: {accuracy_knn}")
#Contingency table (or confusion matrix)
print(pd.crosstab(Y_test, Y_pred_knn, rownames=['Actual'], colnames=['Predicted'], margins=True))
Accuracy of k-NN Classification on test set: 0.744
Predicted    0    1  All
Actual                  
0           78   35  113
1           29  108  137
All        107  143  250

Generally, $k$-nearest neighbour classification is a simple method that often gives decent accuracy (this accuracy can be used as a benchmark for more elaborate classification methods). The value of $k$ can be tuned by standard techniques such as cross-validation. However, one needs to be aware of the following issues while using $k$-NN:

  1. One needs to select a suitable notion of distance between $(x_1, \dots, x_m)$ and the covariate observations in the training set. The default choice is the Euclidean distance (the square root of the sum of the squared discrepancies in each covariate). However, different covariates have different units, which makes this distance not very principled. For example, if we measure education in years and experience in months, then the Euclidean distance will probably be dominated by experience (as it will take larger values because of the specific choice of units). It is therefore generally recommended to scale the features before using k-NN (see the sketch after this list). One can scale each covariate by either (a) subtracting the minimum value and dividing by the range, or (b) subtracting the mean and dividing by the standard deviation. This scaling makes all variables unitless (so that squaring and adding across variables makes more sense).
  2. When the number of covariates is large, the volume of the covariate space becomes very large and none of the 'nearest' neighbors might be actually near to the test observation. In this case, the method cannot provide accurate predictions.
  3. All the covariates are treated equally in the calculation of the distance used to determine nearest neighbors. This means that irrelevant features are given as much weight as important features.
  4. For large datasets, the computational cost can be high because the method requires computing distances to every training example for each prediction. Also, since k-NN requires storing the entire training dataset, memory consumption can be an issue for large datasets. In contrast, for predictions via logistic regression, one only needs to store the fitted regression coefficients $\hat{\beta}_0, \hat{\beta}_1, \dots, \hat{\beta}_m$.
  5. k-NN does not allow answering important questions like: what features are important for explaining the response? If we intervene and change one covariate by 5 units, what changes would we expect in the response?
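
As mentioned in point 1 above, here is a minimal sketch of feature scaling before k-NN, using a scikit-learn pipeline with StandardScaler (option (b): subtract the mean and divide by the standard deviation). Whether scaling improves accuracy depends on the dataset.

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

knn_scaled = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=8))
knn_scaled.fit(X_train, Y_train)                 # the scaler is fit on the training data only
acc_scaled = np.mean(Y_test == knn_scaled.predict(X_test))
print(acc_scaled)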

Unlike logistic regression, where there is a clearly defined likelihood involving parameters $\beta_0, \beta_1, \dots, \beta_m$, there are no parameters or likelihood in k-NN. This is one reason why k-NN is treated as a nonparametric method.

Decision Trees¶

A decision tree is a method for classification (and regression) that uses a tree-like structure to decide what value to predict for a point. The decision tree is constructed using the training data, and the constructed tree is used to come up with predictions for test observations. The tree starts with the root node at the top which contains all the observations. Each node is then recursively split into smaller nodes until a stopping criterion is reached. The goal is to obtain nodes which are as pure as possible. Here pure means that the response values of observations in the node are homogeneous in the sense that all responses are equal to 0 or all responses are equal to 1. Here is an overview of the tree construction process:

  1. Selection of the splitting covariate and split point: For each covariate, potential split points are considered. The quality of a split is evaluated using a criterion. For classification, the common criterion is the Gini impurity. For a node containing observations having $n_0$ zero responses and $n_1$ one responses, the Gini impurity is defined as:
\begin{align*} \text{Gini} = 1 - \left(\frac{n_0}{n_0 + n_1} \right)^2 - \left(\frac{n_1}{n_0 + n_1} \right)^2 \end{align*}

If the node is maximally pure, i.e., either $n_0 = 0$ or $n_1 = 0$, then the Gini impurity takes its lowest possible value of 0. On the other hand, if the node is maximally impure, i.e., $n_0 = n_1$, then the Gini impurity takes its highest possible value of 0.5. The covariate and split point yielding the lowest Gini impurity (averaged over the two resulting child nodes, weighted by their sizes) are selected for splitting the node.

  2. Creating a node: If the node is pure or if other stopping conditions are met (e.g., maximum depth), mark the node as a leaf and assign it the most common response. Otherwise, use the best covariate and split point (as in the previous step) to further split the node into two subsets.

  3. Stopping conditions: The default in scikit-learn is to grow the tree until all leaf nodes are pure. However, deep decision trees are hard to interpret and prone to overfitting. Several conditions can be used to stop the tree from growing further; one simple way is to specify the maximum depth of the tree.
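
A direct translation of the Gini impurity formula above into code (a small sketch, not the internal scikit-learn implementation):

def gini_impurity(n0, n1):
    """Gini impurity of a node with n0 zero-responses and n1 one-responses."""
    p0 = n0 / (n0 + n1)
    p1 = n1 / (n0 + n1)
    return 1 - p0 ** 2 - p1 ** 2

print(gini_impurity(50, 0))    # pure node: 0.0
print(gini_impurity(25, 25))   # maximally impure node: 0.5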

After the tree is constructed on the training data, to predict the response value for a new observation, we start at the root node, and continue down until we reach a leaf node. The prediction is then given by the most common response in the leaf node.

Here's how it works in scikit-learn:

In [14]:
from sklearn.tree import DecisionTreeClassifier

# Initialize and train the Decision Tree classifier
dt = DecisionTreeClassifier(random_state=1)
dt.fit(X_train, Y_train)

# Predict on the test set
Y_pred_dt = dt.predict(X_test)

# Calculate the accuracy
accuracy_dt = np.mean(Y_test == Y_pred_dt)
print(f"Accuracy of Decision Tree Classification on test set: {accuracy_dt}")

# Create the confusion matrix (contingency table)
print(pd.crosstab(Y_test, Y_pred_dt, rownames=['Actual'], colnames=['Predicted'], margins=True))
Accuracy of Decision Tree Classification on test set: 0.668
Predicted    0    1  All
Actual                  
0           67   46  113
1           37  100  137
All        104  146  250

The accuracy of this decision tree is comparable to but smaller than that of logistic regression and k-NN. We can visualize the fitted tree as follows.

In [15]:
# Plot the tree
from sklearn.tree import plot_tree

plt.figure(figsize=(20,10))
plot_tree(dt, feature_names=X.columns,class_names = ('0','1'),  filled=True, rounded=True)
plt.show()

This tree is obviously way too big to take a meaningful look at. It is also likely overfitting, leading to poor accuracy. We can control the complexity of the tree by controlling its maximum depth.

In [18]:
#This tree is way too big to allow any meaningful interpretation
#Here are a few ways to create smaller trees:
dt_small = DecisionTreeClassifier(max_depth = 4, random_state = 1)
dt_small.fit(X_train, Y_train)

Y_pred_dt_small = dt_small.predict(X_test)
# Calculate the accuracy
accuracy_dt_small = np.mean(Y_test == Y_pred_dt_small)
print(f"Accuracy of Decision Tree Classification on test set: {accuracy_dt_small}")

# Plot the tree
from sklearn.tree import plot_tree

plt.figure(figsize=(20,10))
plot_tree(dt_small, feature_names=X.columns, class_names = ('0','1'),  filled=True, rounded=True)
plt.show()
Accuracy of Decision Tree Classification on test set: 0.72

The accuracy now improves to 0.72. One can also look at the tree (fitted on the training data) to see which features appear in the splits. The root node is first split according to "exper <= 5.5". The left subtree corresponds to "exper <= 5.5" and the right subtree corresponds to "exper > 5.5". The color coding is orange for nodes with more zero responses and blue for nodes with more one responses. Note that the left subtree is mostly orange while the right subtree is mostly blue. Also, purer nodes (i.e., with lower Gini values) get darker colors and less pure nodes get lighter colors.

The Decision Tree method for classification does not specify a model in terms of any parameters (similar to the $k$-NN method), so it is also treated as a nonparametric method.

Random Forest¶

The random forest algorithm constructs a whole collection (usually 100 or more) of similar but distinct decision trees, and obtains predictions by aggregating the predictions of the individual trees (e.g., averaging predicted probabilities or taking a majority vote). The individual decision trees are constructed as follows:

  1. Instead of using the training data directly, draw a separate bootstrap sample (with replacement) and fit a decision tree on it. As the different bootstrap samples will be similar but distinct, this will create similar but distinct decision trees.
  2. At every split, instead of cycling over all available covariates, one randomly selects a subset of covariates and only considers splits in these covariates. This randomness also plays a role in creating similar but distinct decision trees.

Because of bootstrapping in the first step above and combining (aggregating) the results of individual trees at the end, the random forest algorithm is an example of a method involving Bootstrap AGGregation, or bagging.
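
A bare-bones sketch of the bagging idea (step 1 only, with no random feature subsetting), just to illustrate what RandomForestClassifier automates; it uses the X_train, Y_train, X_test from the Mroz split above.

from sklearn.tree import DecisionTreeClassifier

rng = np.random.default_rng(0)
n_trees = 100
X_tr, Y_tr, X_te = np.asarray(X_train), np.asarray(Y_train), np.asarray(X_test)
votes = np.zeros(len(X_te))
for _ in range(n_trees):
    idx = rng.integers(0, len(X_tr), size=len(X_tr))          # bootstrap sample (with replacement)
    tree = DecisionTreeClassifier().fit(X_tr[idx], Y_tr[idx])  # fit a tree on the bootstrap sample
    votes += tree.predict(X_te)                                # accumulate each tree's 0/1 predictions
Y_pred_bag = (votes / n_trees >= 0.5).astype(int)              # majority vote across the trees
print(np.mean(np.asarray(Y_test) == Y_pred_bag))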

Typically random forests have good accuracy on a wide variety of datasets.

Similar to $k$-NN and Decision Trees, Random Forests are also considered a nonparametric technique unlike logistic regression.

In [19]:
#Random Forest:
from sklearn.ensemble import RandomForestClassifier

# Initialize and train the Random Forest classifier
rf = RandomForestClassifier(random_state=1, max_features = None)
rf.fit(X_train, Y_train)

# Predict on the test set
Y_pred_rf = rf.predict(X_test)

# Calculate the accuracy
accuracy_rf = np.mean(Y_test == Y_pred_rf)
print(f"Accuracy of Random Forest Classification on test set: {accuracy_rf}")

# Create the confusion matrix (contingency table)
print(pd.crosstab(Y_test, Y_pred_rf, rownames=['Actual'], colnames=['Predicted'], margins=True))
Accuracy of Random Forest Classification on test set: 0.728
Predicted   0    1  All
Actual                 
0          71   42  113
1          26  111  137
All        97  153  250

In the above code, the 'max_features = None' option in RandomForestClassifier tells it not to subselect covariates at node splits (in other words, the second step above is not being implemented; only the bootstrap samples are used to create distinct but similar decision trees). If this is changed to 'max_features = 2', then 2 randomly selected covariates are considered at each split.
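
A quick sketch of the latter option (whether it improves accuracy on this particular dataset is not guaranteed):

rf2 = RandomForestClassifier(random_state=1, max_features=2)   # 2 of the 6 covariates considered per split
rf2.fit(X_train, Y_train)
print(np.mean(Y_test == rf2.predict(X_test)))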

Evaluation of these Classification Methods on Some Simulated Toy Datasets¶

More insight into the workings of these four classification methods can be obtained by looking at simple simulated toy datasets. In these datasets, we know the true relationship between the response and the covariates, allowing us to better evaluate the performance of each method.

Our toy datasets will have $m = 2$ covariates $x_1$ and $x_2$. We shall consider 500 training datapoints and 250 test datapoints. In the first example, we will have a deterministic relationship between $Y$ and $(X_1, X_2)$ given by the following. $Y$ equals 1 if $X_1$ and $X_2$ have the same sign, and 0 if $X_1$ and $X_2$ have opposite signs. Mathematically, this can be represented as \begin{align*} Y = I\{X_1 X_2 > 0\}. \end{align*} Such a dataset can be simulated in the following way.

In [24]:
#EXAMPLE ONE:
n = 750
n_train = 500
n_test = 250

x1 = np.random.uniform(-1, 1, n)
x2 = np.random.uniform(-1, 1, n)
X = np.vstack([x1, x2]).transpose()

Y = (x1 * x2 > 0).astype(np.int64)

#Split the dataset into training and test datasets of sizes 500 and 250 respectively
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size=500, test_size=250, random_state=1)

x1_train = X_train[:,0]
x2_train = X_train[:,1]
x1_test = X_test[:,0]
x2_test = X_test[:,1]

Because there are only two covariates, the covariate data can be looked at in a scatter plot. The binary response can then be indicated on this plot with a color scheme. This is done in the following code.

In [25]:
#In this case where there are only two features, the data can be plotted 
#color of the points denotes Y
def draw_results(x1, x2, color, plot_title=''):
    plt.figure()
    plt.scatter(x1, x2, c=color, cmap='viridis', alpha=0.7);
    plt.colorbar()
    plt.title(plot_title)
    plt.axis('equal')
    plt.xlabel('$x_1$')
    plt.ylabel('$x_2$')
    plt.tight_layout()

draw_results(x1_train, x2_train, color=Y_train, plot_title='Training data')
draw_results(x1_test, x2_test, color=Y_test, plot_title='Test data (ground truth)')

We shall apply each of our four methods on this dataset and compare their prediction accuracy. Let us start with logistic regression.

In [26]:
#Method One: Logistic regression:
logimodel_train = LogisticRegression(penalty = None, max_iter = 1000)
logimodel_train.fit(X_train, Y_train)
print("Intercept:", logimodel_train.intercept_)
print("Coefficients:", logimodel_train.coef_)
#Prediction for the test data
Y_pred = logimodel_train.predict(X_test)
print(Y_pred)
accuracy_logimodel = np.mean(Y_test == Y_pred)
print(f"Accuracy of Logistic Regression on test set: {accuracy_logimodel}")
Intercept: [0.09310359]
Coefficients: [[-0.21360691  0.03125306]]
[1 1 1 0 1 1 1 1 0 1 1 0 1 0 1 1 0 0 1 0 1 0 1 1 1 1 1 1 1 1 1 1 1 0 1 1 0
 1 1 1 1 1 1 1 1 1 0 1 0 1 1 0 1 0 1 0 0 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 0 1 0 0 0 1 0 0 1 1 1 1 0 0 1 0 1 1 1 1 1 1 1 1 0 1 0 1 0 0 0 1 1 0 0 0
 0 1 1 1 0 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 0 1 1 1 1 1 0 0 0 1 1 1
 1 1 1 0 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 0 0 0 1 1 0 1 1 1 1 1 1 1 1
 1 0 1 1 0 1 1 0 1 0 1 0 1 1 1 0 1 1 0 1 1 1 1 0 0 0 1 1 0 1 0 1 1 1 1 1 1
 1 1 1 1 1 0 1 1 1 1 0 0 1 0 1 1 0 1 1 1 0 0 1 1 1 1 1 1]
Accuracy of Logistic Regression on test set: 0.54

Logistic regression led to an accuracy of only 0.54 (this number might change for you because of randomness in the observations). Observe that if we randomly guess the response for each observation as 0 or 1, we would expect an accuracy close to 0.5. In this sense, an accuracy of 0.54 is quite poor (it basically means that logistic regression in this case is no better than random guessing). We can visualize the predicted probabilities as well as the predicted labels in the following way.

In [27]:
# Visualize the results
pred_probs_logistic = logimodel_train.predict_proba(X_test)[:,1]
draw_results(
    x1_test, x2_test, color=pred_probs_logistic, 
    plot_title="Predicted probability of y=1 (logistic)"
)

draw_results(
    x1_test, x2_test, color=Y_pred, 
    plot_title="Logistic model prediction"
)

Logistic regression works poorly for this dataset because the function: \begin{align*} (x_1, x_2) \mapsto \frac{\exp \left(\beta_0 + \beta_1 x_1 + \beta_2 x_2 \right)}{1 + \exp \left(\beta_0 + \beta_1 x_1 + \beta_2 x_2 \right)} \end{align*} is a poor approximation of the function \begin{align*} (x_1, x_2) \mapsto I\{x_1 x_2 > 0\} \end{align*} for every choice of $\beta_0, \beta_1, \beta_2$. A simple way of fixing this problem is to add a third feature $x_1 x_2$. This is an example of feature engineering. Logistic regression with this new feature will have substantially larger accuracy, as can be checked using the code below.

In [28]:
#Logistic Regression will work here if we do feature engineering (include x1 * x2 as a third feature):
# Create a new feature: x1 * x2
#the following function returns an array like X but with a new feature that is x1 * x2
def add_mult_feature(X):
    """Returns an array like X, but with a new feature that's X1 * X2"""
    new_feature = X[:, 0] * X[:, 1]
    return np.hstack([X, new_feature[:, None]])

X_train_feat = add_mult_feature(X_train)
X_test_feat = add_mult_feature(X_test)

logimodel_train_feat = LogisticRegression(penalty = None, max_iter = 2000)
logimodel_train_feat.fit(X_train_feat, Y_train)
print("Intercept:", logimodel_train_feat.intercept_)
print("Coefficients:", logimodel_train_feat.coef_)
#Prediction for the test data
Y_pred_feat = logimodel_train_feat.predict(X_test_feat)
print(Y_pred_feat)
accuracy_logimodel_feat = np.mean(Y_test == Y_pred_feat)
print(f"Accuracy of Logistic Regression (with new feature x1*x2) on test set: {accuracy_logimodel_feat}")

pred_probs_logistic_feat = logimodel_train_feat.predict_proba(X_test_feat)[:,1]
draw_results(
    x1_test, x2_test, color=pred_probs_logistic_feat, 
    plot_title="Predicted probability of y=1 (logistic)"
)

draw_results(
    x1_test, x2_test, color=Y_pred_feat, 
    plot_title="Logistic model prediction"
)
Intercept: [3.47948303]
Coefficients: [[  54.09568077   14.39074806 5570.18753813]]
[1 1 0 0 1 0 1 1 0 1 1 0 1 0 0 0 0 1 0 0 1 0 1 1 1 0 1 1 0 1 1 0 1 1 1 1 0
 0 1 0 1 0 0 1 0 0 1 0 1 0 0 1 1 0 0 0 1 0 0 0 1 1 1 1 1 0 0 0 0 1 1 1 1 0
 0 0 0 0 0 1 0 1 0 0 0 0 1 1 1 0 0 0 1 0 0 1 1 0 1 1 0 1 0 1 0 0 0 0 0 0 0
 1 1 1 1 1 0 1 0 1 1 1 1 0 0 0 0 1 0 1 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 1 1 1
 1 1 1 0 1 1 0 0 0 0 1 1 1 1 0 1 1 0 0 1 0 0 0 1 0 0 1 1 1 1 1 1 0 1 0 1 1
 0 1 0 1 1 1 0 0 1 0 0 0 1 1 1 0 1 1 0 1 1 1 1 1 0 1 0 1 1 1 0 0 1 0 1 0 0
 0 0 0 1 1 1 0 0 1 0 0 0 1 0 1 0 0 1 0 0 1 1 1 0 0 0 1 0]
Accuracy of Logistic Regression (with new feature x1*x2) on test set: 0.996

Observe that the fitted coefficient for the new feature $x_1 x_2$ is very high which leads to essentially binary predicted probabilities: 1 when $x_1 x_2 > 0$ and 0 when $x_1 x_2 < 0$.

One insight from this is that proper feature engineering is crucial for logistic regression, and more generally for any linear model. Unfortunately, such feature engineering is not easy to do in most real-world regression or classification tasks.

Next we apply $k$-NN to this dataset.

In [29]:
#Method Two: k-NN
#Using k-NN classification:
k_value = 8
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors = k_value)
knn.fit(X_train, Y_train)
Y_pred_knn = knn.predict(X_test)
accuracy_knn = np.mean(Y_test == Y_pred_knn)
print(f"Accuracy of k-NN Classification on test set: {accuracy_knn}")
#Contingency table (or confusion matrix)
print(pd.crosstab(Y_test, Y_pred_knn, rownames=['Actual'], colnames=['Predicted'], margins=True))
Accuracy of k-NN Classification on test set: 0.972
Predicted    0    1  All
Actual                  
0          123    5  128
1            2  120  122
All        125  125  250

$k$-NN has very good accuracy, and we did not have to do any feature engineering either. We can look at the predicted probabilities and labels to get more insight.

In [30]:
#Visualizing the results:
from sklearn.neighbors import KNeighborsClassifier

probs_knn = knn.predict_proba(X_test)[:, 1]
y_hat_knn = (probs_knn > 0.5).astype(np.int64)


draw_results(
    x1_test, x2_test, color=probs_knn, 
    plot_title="Predicted probability of y=1 (k-NN)"
)

draw_results(
    x1_test, x2_test, color=y_hat_knn, 
    plot_title="k-NN Model prediction"
)

From the predicted probability plot, it is clear that for points near the boundary, the predicted probabilities are in the middle range, i.e., closer to 0.5 compared to points away from the boundary. So the method is less certain about these observations, but the overall accuracy is quite good (though not as high as that of logistic regression with the feature $x_1 x_2$).

Next we apply the Decision Tree algorithm:

In [31]:
#Model Three: Decision Tree
dt = DecisionTreeClassifier(random_state=1)
dt.fit(X_train, Y_train)

# Predict on the test set
Y_pred_dt = dt.predict(X_test)

# Calculate the accuracy
accuracy_dt = np.mean(Y_test == Y_pred_dt)
print(f"Accuracy of Decision Tree Classification on test set: {accuracy_dt}")

# Create the confusion matrix (contingency table)
print(pd.crosstab(Y_test, Y_pred_dt, rownames=['Actual'], colnames=['Predicted'], margins=True))
Accuracy of Decision Tree Classification on test set: 0.992
Predicted    0    1  All
Actual                  
0          126    2  128
1            0  122  122
All        126  124  250

The accuracy is very high: only 2 test observations out of 250 are misclassified. We can get more insight by looking at the actual fitted tree.

In [32]:
# Plot the tree
from sklearn.tree import plot_tree
plt.figure(figsize=(20,10))
plot_tree(dt, feature_names=['x1', 'x2'],class_names = ('0','1'),  filled=True, rounded=True)
plt.show()

The fitted tree is not too deep (even though we did not impose any maximum depth limit). Note however that there are four pure regions in the actual data (the four quadrants), which we could obtain, for example, by splitting according to $x_1 \leq 0$ and then according to $x_2 \leq 0$ within each node. However, the decision tree method gave us 8 pure leaf nodes. This increase in complexity is a consequence of the greedy method of growing the tree.

Finally let us apply Random Forest.

In [33]:
#Model Four: Random Forest
rf = RandomForestClassifier(random_state=1, max_features = None)
rf.fit(X_train, Y_train)

# Predict on the test set
Y_pred_rf = rf.predict(X_test)

# Calculate the accuracy
accuracy_rf = np.mean(Y_test == Y_pred_rf)
print(f"Accuracy of Random Forest Classification on test set: {accuracy_rf}")

# Create the confusion matrix (contingency table)
print(pd.crosstab(Y_test, Y_pred_rf, rownames=['Actual'], colnames=['Predicted'], margins=True))
Accuracy of Random Forest Classification on test set: 1.0
Predicted    0    1  All
Actual                  
0          128    0  128
1            0  122  122
All        128  122  250

The random forest has even better accuracy than the single decision tree. This is typical.

In the next toy dataset example, we take the previous dataset and intentionally corrupt roughly 10% of the training observations by flipping their labels. The test dataset is left unaltered.

In [35]:
#EXAMPLE TWO: Let us add some training noise to the previous dataset:
Y_train_noisy = Y_train.copy()

pts_to_flip = np.random.random(n_train) < 0.1 #select roughly 10% of the training points at random
Y_train_noisy[pts_to_flip] = 1 - Y_train_noisy[pts_to_flip] #flip the responses of the selected points

draw_results(x1_train, x2_train, color=Y_train_noisy, plot_title='Training data with noise')
In [36]:
#Performance of Logistic Regression on this corrupted dataset:
logimodel_train = LogisticRegression(penalty = None, max_iter = 1000)
logimodel_train.fit(X_train, Y_train_noisy)
print("Intercept:", logimodel_train.intercept_)
print("Coefficients:", logimodel_train.coef_)
#Prediction for the test data
Y_pred = logimodel_train.predict(X_test)
print(Y_pred)
accuracy_logimodel = np.mean(Y_test == Y_pred)
print(f"Accuracy of Logistic Regression on test set: {accuracy_logimodel}")
Intercept: [0.19758252]
Coefficients: [[-0.03969418 -0.0795844 ]]
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
Accuracy of Logistic Regression on test set: 0.488

Logistic regression again has poor accuracy (which is unsurprising given that this problem with corrupted data is even harder than the previous one). Next we try logistic regression with the added feature $x_1 x_2$.

In [37]:
#Logistic Regression with the x1*x2 feature
logimodel_train_feat = LogisticRegression(penalty = None, max_iter = 1000)
logimodel_train_feat.fit(X_train_feat, Y_train_noisy)
print("Intercept:", logimodel_train_feat.intercept_)
print("Coefficients:", logimodel_train_feat.coef_)
#Prediction for the test data
Y_pred_feat = logimodel_train_feat.predict(X_test_feat)
print(Y_pred_feat)
accuracy_logimodel_feat = np.mean(Y_test == Y_pred_feat)
print(f"Accuracy of Logistic Regression (with new feature x1*x2) on test set: {accuracy_logimodel_feat}")
Intercept: [0.25340968]
Coefficients: [[ 0.19991346 -0.02759687  7.38911646]]
[1 1 1 0 1 0 1 1 0 1 1 0 1 0 0 0 0 1 0 0 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 0 1 0 1 0 1 1 0 0 1 1 1 1 0 1 1 0 0 0 1 0 0 0 1 1 1 1 1 0 0 0 1 1 1 1 1 0
 0 0 0 0 0 1 0 1 0 1 0 1 1 1 1 0 0 0 1 1 1 1 1 0 1 1 0 1 0 1 0 0 0 0 0 0 1
 1 1 1 1 1 0 1 0 1 1 1 1 0 0 0 0 1 0 1 0 0 0 0 1 1 0 0 1 0 0 0 1 0 0 1 1 1
 1 1 1 0 1 1 0 0 0 1 1 1 1 1 0 1 1 0 0 1 0 0 0 1 1 0 1 1 1 1 1 1 0 1 0 1 1
 0 1 0 1 1 1 1 0 1 0 0 0 1 1 1 0 1 1 0 1 1 1 1 1 0 1 0 1 1 1 0 0 1 0 1 0 0
 0 0 0 1 1 1 0 0 1 0 1 0 1 0 1 0 0 1 0 0 1 1 1 0 1 0 1 0]
Accuracy of Logistic Regression (with new feature x1*x2) on test set: 0.92

The fitted coefficient for the new feature is now not as high as before. The accuracy is also somewhat lower than before (previously it was very close to 100% and now it has come down to 92%).

In [38]:
#Performance of k-NN on this corrupted data:
k_value = 8
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors = k_value)
knn.fit(X_train, Y_train_noisy)
Y_pred_knn = knn.predict(X_test)
accuracy_knn = np.mean(Y_test == Y_pred_knn)
print(f"Accuracy of k-NN Classification on test set: {accuracy_knn}")
#Contingency table (or confusion matrix)
print(pd.crosstab(Y_test, Y_pred_knn, rownames=['Actual'], colnames=['Predicted'], margins=True))
Accuracy of k-NN Classification on test set: 0.96
Predicted    0    1  All
Actual                  
0          119    9  128
1            1  121  122
All        120  130  250

For $k$-NN, the accuracy is still pretty good (although it deteriorated a little bit).

In [39]:
#Performance of Decision Tree on corrupted data:
dt = DecisionTreeClassifier(random_state=1)
dt.fit(X_train, Y_train_noisy)

# Predict on the test set
Y_pred_dt = dt.predict(X_test)

# Calculate the accuracy
accuracy_dt = np.mean(Y_test == Y_pred_dt)
print(f"Accuracy of Decision Tree Classification on test set: {accuracy_dt}")

# Create the confusion matrix (contingency table)
print(pd.crosstab(Y_test, Y_pred_dt, rownames=['Actual'], colnames=['Predicted'], margins=True))

# Plot the tree
from sklearn.tree import plot_tree
plt.figure(figsize=(20,10))
plot_tree(dt, feature_names=['x1', 'x2'],class_names = ('0','1'),  filled=True, rounded=True)
plt.show()
Accuracy of Decision Tree Classification on test set: 0.908
Predicted    0    1  All
Actual                  
0          113   15  128
1            8  114  122
All        121  129  250

The accuracy of the decision tree is decent but much lower than before. More seriously, the fitted tree itself is now far more complicated. In this sense, a few corrupted points completely changed the nature of the fitted tree. This points to the instability of individual decision trees. This instability is often cited as the motivation for using random forests.

In [41]:
#Performance of Random Forest:
rf = RandomForestClassifier(random_state=1, max_features = None)
rf.fit(X_train, Y_train_noisy)

# Predict on the test set
Y_pred_rf = rf.predict(X_test)

# Calculate the accuracy
accuracy_rf = np.mean(Y_test == Y_pred_rf)
print(f"Accuracy of Random Forest Classification on test set: {accuracy_rf}")

# Create the confusion matrix (contingency table)
print(pd.crosstab(Y_test, Y_pred_rf, rownames=['Actual'], colnames=['Predicted'], margins=True))
Accuracy of Random Forest Classification on test set: 0.952
Predicted    0    1  All
Actual                  
0          118   10  128
1            2  120  122
All        120  130  250

The random forest still maintains high accuracy, despite a slight drop compared to the no-corruption case.

In the final example, we change the dependence of $Y$ on $X_1, X_2$ to: \begin{align*} Y = I\{(X_1 + X_2) (X_1 - X_2) > 0\} \end{align*} Running the code below, one can check the following:

  1. Logistic regression with no engineered features performs very poorly
  2. k-NN has good accuracy
  3. Decision tree has decent accuracy but the fitted tree is quite deep and not interpretable
  4. Random forest has better accuracy compared to decision tree.

You are encouraged to try out these methods on more toy (and real) datasets.

In [42]:
#EXAMPLE THREE:
n_train = 500
n_test = 250
n = n_train + n_test

x1 = np.random.uniform(-1, 1, n)
x2 = np.random.uniform(-1, 1, n)
X = np.vstack([x1, x2]).transpose()

Y = ((x1+x2) * (x1 - x2) > 0).astype(np.int64)

#Split the dataset into training and test datasets of sizes 500 and 250 respectively
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size=500, test_size=250, random_state=1)

x1_train = X_train[:,0]
x2_train = X_train[:,1]
x1_test = X_test[:,0]
x2_test = X_test[:,1]

draw_results(x1_train, x2_train, color=Y_train, plot_title='Training data')
draw_results(x1_test, x2_test, color=Y_test, plot_title='Test data (ground truth)')
In [43]:
#Logistic Regression
logimodel_train = LogisticRegression(penalty = None, max_iter = 1000)
logimodel_train.fit(X_train, Y_train)
print("Intercept:", logimodel_train.intercept_)
print("Coefficients:", logimodel_train.coef_)
#Prediction for the test data
Y_pred = logimodel_train.predict(X_test)
print(Y_pred)
accuracy_logimodel = np.mean(Y_test == Y_pred)
print(f"Accuracy of Logistic Regression on test set: {accuracy_logimodel}")
Intercept: [0.01178031]
Coefficients: [[-0.12368168 -0.07390931]]
[0 1 1 1 1 1 1 0 1 0 0 0 1 0 0 0 1 1 0 1 0 0 1 1 0 1 1 0 1 0 1 1 0 1 1 0 1
 1 1 0 1 0 1 1 1 1 1 1 1 1 0 1 0 0 1 1 0 0 1 1 0 1 1 0 0 0 1 1 0 0 1 0 0 0
 0 0 1 0 1 1 0 0 1 1 0 0 1 1 1 0 0 0 0 0 1 1 1 0 1 1 1 1 0 1 0 1 0 1 1 0 1
 0 0 1 1 0 0 1 0 0 0 1 0 0 1 1 0 1 0 1 1 0 0 1 1 1 1 1 0 0 1 1 1 0 0 1 1 0
 0 1 1 1 1 1 0 0 1 0 1 0 0 0 1 0 0 1 0 0 1 1 1 1 0 0 0 1 1 0 1 1 1 1 1 0 1
 1 1 0 0 0 1 1 1 0 0 1 1 0 0 0 0 1 1 1 0 1 1 1 1 0 1 0 0 0 1 0 1 0 0 1 1 0
 0 1 0 1 1 0 1 1 0 1 0 1 0 0 1 0 1 0 1 0 0 1 1 1 1 1 1 1]
Accuracy of Logistic Regression on test set: 0.504
In [44]:
#Performance of k-NN:
k_value = 8
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors = k_value)
knn.fit(X_train, Y_train)
Y_pred_knn = knn.predict(X_test)
accuracy_knn = np.mean(Y_test == Y_pred_knn)
print(f"Accuracy of k-NN Classification on test set: {accuracy_knn}")
#Contingency table (or confusion matrix)
print(pd.crosstab(Y_test, Y_pred_knn, rownames=['Actual'], colnames=['Predicted'], margins=True))
Accuracy of k-NN Classification on test set: 0.932
Predicted    0    1  All
Actual                  
0          104    4  108
1           13  129  142
All        117  133  250
In [45]:
#Performance of Decision Tree:
dt = DecisionTreeClassifier(random_state=1)
dt.fit(X_train, Y_train)

# Predict on the test set
Y_pred_dt = dt.predict(X_test)

# Calculate the accuracy
accuracy_dt = np.mean(Y_test == Y_pred_dt)
print(f"Accuracy of Decision Tree Classification on test set: {accuracy_dt}")

# Create the confusion matrix (contingency table)
print(pd.crosstab(Y_test, Y_pred_dt, rownames=['Actual'], colnames=['Predicted'], margins=True))

# Plot the tree
from sklearn.tree import plot_tree
plt.figure(figsize=(20,10))
plot_tree(dt, feature_names=['x1', 'x2'],class_names = ('0','1'),  filled=True, rounded=True)
plt.show()
Accuracy of Decision Tree Classification on test set: 0.912
Predicted    0    1  All
Actual                  
0           97   11  108
1           11  131  142
All        108  142  250
In [46]:
#Performance of Random Forest:
rf = RandomForestClassifier(random_state=1, max_features = None)
rf.fit(X_train, Y_train)

# Predict on the test set
Y_pred_rf = rf.predict(X_test)

# Calculate the accuracy
accuracy_rf = np.mean(Y_test == Y_pred_rf)
print(f"Accuracy of Random Forest Classification on test set: {accuracy_rf}")

# Create the confusion matrix (contingency table)
print(pd.crosstab(Y_test, Y_pred_rf, rownames=['Actual'], colnames=['Predicted'], margins=True))
Accuracy of Random Forest Classification on test set: 0.968
Predicted    0    1  All
Actual                  
0          105    3  108
1            5  137  142
All        110  140  250