Today we shall look at the idea of regularization in the context of regression with many covariates (also known as 'high-dimensional' regression). We work with a very simple dataset to illustrate the key ideas. This dataset, which comes from the introductory statistics textbook "The Statistical Sleuth" by Ramsey and Schafer, contains weekly wages in 1987 for a sample of 25437 males between the ages of 18 and 70 who worked full-time, along with their years of education, years of experience, an indicator for whether they worked in a metropolitan area, and a code for the region of the US where they worked.
#Load the dataset wagedata.csv
import pandas as pd
dt = pd.read_csv("wagedata.csv")
print(dt.head(10))
      Region   MetropolitanStatus  Exper  Educ  WeeklyEarnings
0      South  NotMetropolitanArea      8    12          859.71
1    Midwest     MetropolitanArea     30    12          786.73
2       West     MetropolitanArea     31    14         1424.50
3       West     MetropolitanArea     17    16          959.16
4       West     MetropolitanArea      6    12          154.32
5    Midwest     MetropolitanArea     24    17          282.19
6  Northeast     MetropolitanArea     14    12          444.44
7  Northeast     MetropolitanArea     28    12          451.09
8      South     MetropolitanArea     40    12          490.27
9      South     MetropolitanArea      7    16          493.83
Many regressions can be fit to this dataset. We shall use the logarithm of weekly earnings as the response variable: \begin{align*} Y = \log \left( \text{weekly earnings} \right). \end{align*} Taking the logarithm leads to regressions whose coefficients are more interpretable. For the covariate, we shall only use years of experience: \begin{align*} X = \text{Exper} = \text{Years of Experience}. \end{align*} Here is a scatter plot of $Y$ against $X$.
import numpy as np
y = np.log(dt['WeeklyEarnings'])
x = dt['Exper']
import matplotlib.pyplot as plt
plt.figure(figsize = (10, 6))
plt.scatter(x, y, s = 1)
plt.xlabel('Years of Experience')
plt.ylabel('Log of Weekly Earnings')
plt.title('Scatter Plot of Years of Experience vs Log(Weekly Earnings)')
plt.show()
The simplest model is linear regression, $Y = \beta_0 + \beta_1 X$ (written here without the error term). Let us fit this model using statsmodels, and plot the fitted regression line on the observed scatter plot.
#The simplest model to fit is linear regression: y = b0 + b1 x
import statsmodels.api as sm
X = sm.add_constant(x) #adding intercept
model_1 = sm.OLS(y, X).fit()
print(model_1.summary())
#Plotting the fitted line on the scatter plot
plt.figure(figsize = (10, 6))
plt.scatter(x, y, s = 1)
plt.plot(x, model_1.predict(X), color = 'red', label = 'Fitted line')
plt.xlabel('Years of Experience')
plt.ylabel('Log of Weekly Earnings')
plt.title('Linear Regression of Log(Weekly Earnings) on Years of Experience')
plt.legend()
plt.show()
OLS Regression Results
Dep. Variable: WeeklyEarnings    Model: OLS    Method: Least Squares
Date: Sat, 18 Nov 2023    Time: 18:49:29    Covariance Type: nonrobust
No. Observations: 25437    Df Residuals: 25435    Df Model: 1
R-squared: 0.049    Adj. R-squared: 0.049    F-statistic: 1311.    Prob (F-statistic): 4.67e-280
Log-Likelihood: -23459.    AIC: 4.692e+04    BIC: 4.694e+04

             coef    std err        t    P>|t|    [0.025    0.975]
const      6.0704      0.007  875.641    0.000     6.057     6.084
Exper      0.0112      0.000   36.214    0.000     0.011     0.012

Omnibus: 587.040    Prob(Omnibus): 0.000    Skew: -0.317    Kurtosis: 3.515
Durbin-Watson: 2.001    Jarque-Bera (JB): 706.522    Prob(JB): 3.81e-154    Cond. No. 40.8

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
The fitted regression equation is $Y = 6.0704 + 0.0112 X$. The interpretation of the coefficient 0.0112 is as follows. It represents the change in $Y$ for a unit increase in $X$. Since $Y = \log E$, where $E$ denotes weekly earnings, we have \begin{align*} \log E_{\text{new}} - \log E_{\text{old}} = 0.0112 \end{align*} where $E_{\text{new}}$ denotes weekly earnings for $X = x + 1$ and $E_{\text{old}}$ denotes weekly earnings for $X = x$ (the exact value of $x$ is immaterial for this argument). Because $E_{\text{new}}/E_{\text{old}}$ is expected to be close to 1, we can use $\log u \approx u - 1$ for $u \approx 1$ to write \begin{align*} 0.0112 = \log E_{\text{new}} - \log E_{\text{old}} = \log \frac{E_{\text{new}}}{E_{\text{old}}} \approx \frac{E_{\text{new}}}{E_{\text{old}}} - 1 = \frac{E_{\text{new}} - E_{\text{old}}}{E_{\text{old}}}. \end{align*} Therefore \begin{align*} \frac{E_{\text{new}} - E_{\text{old}}}{E_{\text{old}}} \times 100 \approx 1.12, \end{align*} which says that weekly earnings increase by about 1.12% for each additional year of experience. A look at the scatter plot makes it clear that earnings seem to decrease when experience is large, which suggests that fitting a single line to this dataset is not the best idea. The common fix is to include a quadratic term $\text{Exper}^2$ in the model. This leads to the following model: \begin{align*} Y = \beta_0 + \beta_1 X + \beta_2 X^2 \end{align*} Even though the right hand side is a quadratic function of $X$, this is still considered a linear model (it is linear in the coefficients) and can be fit by least squares (using the OLS function in statsmodels) as follows:
X = sm.add_constant(x)
X['Exper_Square'] = np.square(x)
model_2 = sm.OLS(y, X).fit()
print(model_2.summary())
#Plotting the fitted quadratic on the scatter plot
b0, b1, b2 = model_2.params
x_range = np.linspace(x.min(), x.max(), 100)
fitted_model_2 = b0 + b1*x_range + b2*np.square(x_range)
plt.figure(figsize = (10, 6))
plt.scatter(x, y, s = 1)
plt.plot(x_range,fitted_model_2, color = 'red', label = 'Fitted Quadratic')
plt.xlabel('Years of Experience')
plt.ylabel('Log of Weekly Earnings')
plt.title('Quadratic Regression of Log(Weekly Earnings) on Experience')
plt.legend()
plt.show()
OLS Regression Results
Dep. Variable: WeeklyEarnings    Model: OLS    Method: Least Squares
Date: Sat, 18 Nov 2023    Time: 18:49:30    Covariance Type: nonrobust
No. Observations: 25437    Df Residuals: 25434    Df Model: 2
R-squared: 0.132    Adj. R-squared: 0.132    F-statistic: 1939.    Prob (F-statistic): 0.00
Log-Likelihood: -22294.    AIC: 4.459e+04    BIC: 4.462e+04

                    coef    std err        t    P>|t|    [0.025    0.975]
const             5.6996      0.010  569.429    0.000     5.680     5.719
Exper             0.0601      0.001   58.167    0.000     0.058     0.062
Exper_Square     -0.0011    2.2e-05  -49.400    0.000    -0.001    -0.001

Omnibus: 706.284    Prob(Omnibus): 0.000    Skew: -0.302    Kurtosis: 3.766
Durbin-Watson: 2.013    Jarque-Bera (JB): 1008.724    Prob(JB): 9.09e-220    Cond. No. 2.12e+03

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 2.12e+03. This might indicate that there are strong multicollinearity or other numerical problems.
Visually, the quadratic model gives a better fit than before. Also, the AIC of the quadratic model is much smaller than that of the simple linear model. However, interpretation of the coefficients is now trickier because of the presence of both $X$ and $X^2$ in the model. The increase in $Y$ when experience changes from $x$ to $x+1$ is now given by: \begin{align*} \left(\beta_0 + \beta_1 (x+1) + \beta_2 (x+1)^2 \right) - \left(\beta_0 + \beta_1 x + \beta_2 x^2 \right) = \beta_1 + \beta_2 (2x + 1) \end{align*} Thus the percentage increase for an additional year of experience, when the current experience is $x$, is given by $100 \left( \beta_1 + \beta_2 (2x + 1)\right)$.
#Coefficient interpretation for the quadratic model:
#Interpretation of the coefficients is now trickier because of the square term
#Now the percentage increase for an additional year of experience is given by:
#100*(b1 + 2*Exper*b2 + b2). For example if Exper = 0, then this is 100*b1 = 6%
#If Exper = 25, then this is 100*(b1 + 50*b2+b2) =
exper_value = 25
perc_increase = 100*(b1 + 2*exper_value*b2 + b2)
print(perc_increase)
0.48166128316092205
From the equation of the fitted quadratic function, it is easy to calculate the value of the point where the quadratic goes from increasing to decreasing. This is the peak of the quadratic and represents the point beyond which weekly wages go down with further increases in experience.
#The point where the quadratic goes from increasing to decreasing is given by:
peak_quadratic = -b1/(2*b2)
print(peak_quadratic)
27.72060595169368
The quadratic term is not the only way of fitting a nonlinear curve to this dataset. Another option is broken stick regression. Here one adds an additional term of the form $(x - s)_+ := \max(x-s, 0)$ to the simple equation $Y = \beta_0 + \beta_1 X$ to get the new equation: \begin{align*} Y = \beta_0 + \beta_1 X + \beta_2 (X - s)_+ \end{align*} Here $s$ is known as a 'break-point' or 'knot'. The coefficients of this equation are easier to interpret than those of the quadratic equation $Y = \beta_0 + \beta_1 X + \beta_2 X^2$. Specifically, $\beta_1$ is the slope for $x \leq s$ and $\beta_1 + \beta_2$ is the slope for $x \geq s$. In this example, $100\beta_1$ is the percent increase in weekly earnings for every additional year of experience when experience is less than $s$, and $100(\beta_1 + \beta_2)$ is the percent increase in weekly earnings for every additional year of experience when experience is more than $s$.
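To see where these two slopes come from, note that the hinge term vanishes for $x \leq s$ and equals $x - s$ for $x \geq s$, so the model equation can be rewritten as \begin{align*} Y = \begin{cases} \beta_0 + \beta_1 x, & x \leq s, \\ (\beta_0 - \beta_2 s) + (\beta_1 + \beta_2) x, & x \geq s, \end{cases} \end{align*} i.e., two lines that meet at $x = s$, with slope $\beta_1$ to the left of the knot and slope $\beta_1 + \beta_2$ to the right.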
The following code illustrates how this model can be fit using the OLS function in statsmodels. We are using $s = 27$ for the code below. The choice of $s = 27$ is motivated by the fact that the quadratic fit has its peak close to 27.
#Suppose we go back to the linear model, and instead of
#adding the quadratic term, we add the term (x - 27)_+ leading to the model:
#y = b0 + b1 x + b2 (x - 27)_+
#This model keeps the slope b1 for x smaller than 27, but after 27 it modifies the slope to b1 + b2
#We would expect b2 to be negative.
#This model can also be fit by usual linear regression
X = sm.add_constant(x) #adding intercept
X['pos_part_x_minus_27'] = np.maximum(x - 27, 0)
model_3 = sm.OLS(y, X).fit()
print(model_3.summary())
#Plotting the fitted equation on the scatter plot:
b0, b1, b2 = model_3.params
x_range = np.linspace(x.min(), x.max(), 100)
fitted_model_3 = b0 + b1*x_range + b2*np.maximum(x_range-27,0)
plt.figure(figsize = (10, 6))
plt.scatter(x, y, s = 1)
plt.plot(x_range,fitted_model_3, color = 'green', label = 'Fitted Broken Stick')
plt.plot(x_range, fitted_model_2, color = 'red', label = 'Fitted Quadratic')
plt.xlabel('Years of Experience')
plt.ylabel('Log of Weekly Earnings')
plt.title('Broken Stick Regression of Log(Weekly Earnings) on Experience')
plt.legend()
plt.show()
OLS Regression Results
Dep. Variable: WeeklyEarnings    Model: OLS    Method: Least Squares
Date: Sat, 18 Nov 2023    Time: 18:49:30    Covariance Type: nonrobust
No. Observations: 25437    Df Residuals: 25434    Df Model: 2
R-squared: 0.115    Adj. R-squared: 0.115    F-statistic: 1653.    Prob (F-statistic): 0.00
Log-Likelihood: -22545.    AIC: 4.510e+04    BIC: 4.512e+04

                           coef    std err        t    P>|t|    [0.025    0.975]
const                    5.8595      0.008  709.655    0.000     5.843     5.876
Exper                    0.0290      0.001   57.298    0.000     0.028     0.030
pos_part_x_minus_27     -0.0524      0.001  -43.544    0.000    -0.055    -0.050

Omnibus: 650.115    Prob(Omnibus): 0.000    Skew: -0.300    Kurtosis: 3.684
Durbin-Watson: 2.010    Jarque-Bera (JB): 879.115    Prob(JB): 1.27e-191    Cond. No. 51.6

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
#This model has an AIC that is much smaller than the first model (simple linear regression)
#But its AIC is still larger than that of the quadratic model (indicating the quadratic
#model is a better fit). However, the coefficients in this model are easier to interpret
#compared to those of the quadratic model.
#For x <= 27, wages increase by 100*b1 = 2.9% for every additional year of experience
#For x >= 27, wages change by 100*(b1 + b2) = -2.34% (i.e., they decrease by about 2.34%) for every additional year of experience
print(100*(b1 + b2))
-2.3422621148239346
Because the fitted function looks like a broken stick, this is known as Broken Stick Regression. The choice of knot $s$ (which we took as $s = 27$) is crucial to the quality of the fit to data. A natural data-driven strategy to select $s$ is to go over all possible values for $s$, and then choose the knot which gives the smallest AIC (or, equivalently, the highest log-likelihood). This can be done using the code shown below.
#Below we search over all possible knots and then
#choose the knot with the highest log-likelihood:
best_knot = None
highest_log_likelihood = -np.inf
for knot in range(1, 63):
X = sm.add_constant(x)
X['knot_variable'] = np.maximum(x - knot, 0)
model_knot = sm.OLS(y, X).fit()
if model_knot.llf > highest_log_likelihood:
highest_log_likelihood = model_knot.llf
best_knot = knot
print(f"The knot with highest log-likelihood is: {best_knot}")
print(f"Highest Log-Likelihood: {highest_log_likelihood}")
The knot with highest log-likelihood is: 17
Highest Log-Likelihood: -22398.051362075297
The best knot (i.e., the knot with the highest log-likelihood or smallest AIC) turned out to be 17 (which is much smaller than our initial choice $s = 27$). Below we fit the broken stick regression model with knot chosen to be 17.
#We can now fit the model with the best_knot
X = sm.add_constant(x) #adding intercept
X['knot_variable'] = np.maximum(x - best_knot, 0)
model_4 = sm.OLS(y, X).fit()
print(model_4.summary())
#Plotting the fitted equation on the scatter plot
b0, b1, b2 = model_4.params
x_range = np.linspace(x.min(), x.max(), 100)
fitted_model_4 = b0 + b1*x_range + b2*np.maximum(x_range-best_knot,0)
plt.figure(figsize = (10, 6))
plt.scatter(x, y, s = 1)
plt.plot(x_range,fitted_model_4, color = 'green', label = 'Fitted Broken Stick')
plt.plot(x_range,fitted_model_2, color = 'red', label = 'Fitted Quadratic')
plt.xlabel('Years of Experience')
plt.ylabel('Log of Weekly Earnings')
plt.title('Broken Stick Regression of y on x with best knot')
plt.legend()
plt.show()
OLS Regression Results
Dep. Variable: WeeklyEarnings    Model: OLS    Method: Least Squares
Date: Sat, 18 Nov 2023    Time: 18:49:32    Covariance Type: nonrobust
No. Observations: 25437    Df Residuals: 25434    Df Model: 2
R-squared: 0.125    Adj. R-squared: 0.125    F-statistic: 1819.    Prob (F-statistic): 0.00
Log-Likelihood: -22398.    AIC: 4.480e+04    BIC: 4.483e+04

                     coef    std err        t    P>|t|    [0.025    0.975]
const              5.7143      0.010  567.213    0.000     5.695     5.734
Exper              0.0474      0.001   57.468    0.000     0.046     0.049
knot_variable     -0.0548      0.001  -47.043    0.000    -0.057    -0.053

Omnibus: 688.082    Prob(Omnibus): 0.000    Skew: -0.300    Kurtosis: 3.747
Durbin-Watson: 2.011    Jarque-Bera (JB): 971.466    Prob(JB): 1.12e-211    Cond. No. 67.6

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
Even with the best knot, the broken stick regression model still has a larger AIC than the quadratic fit. We can further improve the broken stick regression model by adding more knot variables. For example, adding one more knot variable leads to the equation: \begin{align*} Y = \beta_0 + \beta_1 X + \beta_2 (X - s_1)_+ + \beta_3 (X - s_2)_+ \end{align*} The code below demonstrates fitting this model with $s_1 = 17$ and $s_2 = 35$ (the choice $s_2 = 35$ is arbitrary).
#We can further improve the broken stick regression model by adding more knot variables.
#Suppose we add another knot variable at 35:
new_knot = 35
X = sm.add_constant(x) #adding intercept
X['knot_variable'] = np.maximum(x - best_knot, 0)
X['another_knot'] = np.maximum(x - new_knot, 0)
model_5 = sm.OLS(y, X).fit()
print(model_5.summary())
#Plotting the fitted quadratic on the scatter plot
b0, b1, b2, b3 = model_5.params
x_range = np.linspace(x.min(), x.max(), 100)
fitted_model_5 = b0 + b1*x_range + b2*np.maximum(x_range-best_knot,0) + b3 * np.maximum(x_range - new_knot, 0)
plt.figure(figsize = (10, 6))
plt.scatter(x, y, s = 1)
plt.plot(x_range,fitted_model_5, color = 'green', label = 'Fitted Broken Stick')
plt.plot(x_range,fitted_model_2, color = 'red', label = 'Fitted Quadratic')
plt.xlabel('Years of Experience')
plt.ylabel('Log of Weekly Earnings')
plt.title('Broken Stick Regression of y on x with two knots')
plt.legend()
plt.show()
OLS Regression Results
Dep. Variable: WeeklyEarnings    Model: OLS    Method: Least Squares
Date: Sat, 18 Nov 2023    Time: 18:49:32    Covariance Type: nonrobust
No. Observations: 25437    Df Residuals: 25433    Df Model: 3
R-squared: 0.130    Adj. R-squared: 0.130    F-statistic: 1267.    Prob (F-statistic): 0.00
Log-Likelihood: -22327.    AIC: 4.466e+04    BIC: 4.470e+04

                     coef    std err        t    P>|t|    [0.025    0.975]
const              5.7340      0.010  563.180    0.000     5.714     5.754
Exper              0.0442      0.001   51.187    0.000     0.043     0.046
knot_variable     -0.0443      0.001  -30.437    0.000    -0.047    -0.041
another_knot      -0.0261      0.002  -11.916    0.000    -0.030    -0.022

Omnibus: 701.022    Prob(Omnibus): 0.000    Skew: -0.301    Kurtosis: 3.762
Durbin-Watson: 2.012    Jarque-Bera (JB): 999.068    Prob(JB): 1.14e-217    Cond. No. 68.9

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
With the two additional terms $(x - s_1)_+$ and $(x - s_2)_+$, the percentage increase in wages for an additional year of experience depends on whether the current experience level is less than $s_1$, between $s_1$ and $s_2$, or more than $s_2$:
#Interpretation of the coefficients:
#for x < 17, the slope is b1. So Wages increase by 100*b1 = 4.42% for every one year increase in experience
# for x between 17 and 35, slope is b1 + b2. So wages increase by 100*(b1 + b2) for every one year increase in experience:
print(100*(b1 + b2))
#for x larger than 35, slope is b1 + b2 + b3. So wages increase by 100*(b1 + b2 + b3):
print(100*(b1 + b2 + b3))
-0.01141398259054266
-2.6233912674073494
One can add more knot variables to the equation to obtain models of the form: \begin{align*} Y = \beta_0 + \beta_1 X + \beta_2 (X - s_1)_+ + \beta_3 (X - s_2)_+ + \dots + \beta_{k+1} (X - s_k)_+ \end{align*} While this model is conceptually interesting, one has to deal with the annoying issues of deciding how many knots to use and where to place them; searching over all combinations of knot locations quickly becomes burdensome.
One way to circumvent these problems is to introduce a change of slope term $(x - s)_+$ at every possible value of $s$.
In this dataset, the variable $X$ takes the values $0, 1, \dots, 63$. Considering all these possible knots (with the exception of the last value 63, because $(X - 63)_+$ is identically zero) leads us to the model equation: \begin{align} Y = \beta_0 + \beta_1 X + \beta_2 (X - 1)_+ + \beta_3 (X - 2)_+ + \dots + \beta_{63} (X - 62)_+ \tag{1} \end{align} This is a linear regression model with $64$ coefficients. The interpretation of the regression coefficients is as follows. $\beta_0$ is simply the value of $Y = \log(\text{Earnings})$ when $X = 0$ (i.e., for someone just joining the workforce). To obtain the interpretation for $\beta_1$, plug $X = 1$ into the model equation, then $X = 0$, and subtract the second equation from the first. This gives \begin{equation*} \beta_1 = Y_1 - Y_0 = \log(E_1) - \log(E_0) = \log\left(\frac{E_1}{E_0} \right) \approx \frac{E_1 - E_0}{E_0}. \end{equation*} Here $E_0$ and $E_1$ are earnings for $X = 0$ and $X = 1$ respectively, and in the last step we again used $\log(u) \approx u - 1$ for $u \approx 1$. Thus $100\beta_1$ represents the increment (in percent) in salary from year 0 to year 1. For example, $\beta_1 = 0.05$ means that the salary increases by $5\%$ from year 0 to year 1. What is the interpretation for $\beta_2$? It is easy to see that \begin{equation*} \log E_2 = \beta_0 + 2 \beta_1 + \beta_2, \qquad \log E_1 = \beta_0 + \beta_1, \qquad \log E_0 = \beta_0. \end{equation*} Thus \begin{equation*} \beta_2 = \log E_2 - 2 \log E_1 + \log E_0 = \log \frac{E_2}{E_1} - \log \frac{E_1}{E_0} \approx \frac{E_2 - E_1}{E_1} - \frac{E_1 - E_0}{E_0}. \end{equation*} Thus $100 \beta_2$ represents the change in the percent increment between years 1 and 2 compared to the percent increment between years 0 and 1. For example, $\beta_2 = -0.0003$ means that the percent increment decreases by $0.03$ after year 2. If $\beta_1 = 0.05$, we would have a $5\%$ increment after year 1 and a $5 - 0.03 = 4.97\%$ increment after year 2. More generally, the interpretation for $\beta_j$, $j \geq 2$, is as follows: $100\beta_j$ is the change in the percent increment between years $j-1$ and $j$ compared to the percent increment between years $j-2$ and $j-1$. For a concrete example, suppose \begin{equation*} \beta_0 = 5.74 ~~~ \beta_1 = 0.05 ~~~ \beta_2 = -0.0003 ~~~ \beta_3 = -0.0008 ~~~ \beta_4 = -0.001 ~~~ \dots \end{equation*} then
1. the starting weekly salary (i.e., the salary at $X = 0$) is $\exp(5.74) = \$311.06$,
2. the increment after year 1 is $5 \%$,
3. the increment after year 2 is $(5 - 0.03) = 4.97 \%$,
4. the increment after year 3 is $(4.97 - 0.08) = 4.89\%$,
5. the increment after year 4 is $(4.89 - 0.1) = 4.79 \%$, and so on.
If all $\beta_j, j \geq 2$ are negative, then, after a while, the increments may become negative which means that the salary actually starts decreasing after a certain number of years of experience.
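As a quick numeric check of this interpretation, the increments in the illustrative example above are just cumulative sums of the (hypothetical) coefficients $\beta_1, \beta_2, \dots$; the sketch below reproduces the numbers 5, 4.97, 4.89 and 4.79 quoted above.
#A quick numeric check of the interpretation above, using the illustrative coefficients
import numpy as np
betas = np.array([5.74, 0.05, -0.0003, -0.0008, -0.001]) #hypothetical b0, b1, b2, b3, b4 from the text
print(np.exp(betas[0])) #starting weekly salary, roughly 311
increments = 100 * np.cumsum(betas[1:]) #percent increments after years 1, 2, 3, 4
print(increments) #[5.0, 4.97, 4.89, 4.79]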
The all knots regression model (in equation (1)) can also be fit by the OLS function in statsmodels. However it is usually not a good idea to fit a linear regression model with many covariates using plain old OLS. The reason is that there will be overfitting leading to poor interpretability and poor predictive power. We shall demonstrate this below.
#Here is how we can use OLS to fit the regression model with all possible knots 1, 2,.., 62.
X = sm.add_constant(x) #adding intercept
for knot in range(1, 63):
X[f'knot_{knot}'] = np.maximum(x - knot, 0)
model_all_knots = sm.OLS(y, X).fit()
print(model_all_knots.summary())
OLS Regression Results
Dep. Variable: WeeklyEarnings    Model: OLS    Method: Least Squares
Date: Sat, 18 Nov 2023    Time: 18:49:33    Covariance Type: nonrobust
No. Observations: 25437    Df Residuals: 25375    Df Model: 61
R-squared: 0.139    Adj. R-squared: 0.137    F-statistic: 67.42    Prob (F-statistic): 0.00
Log-Likelihood: -22188.    AIC: 4.450e+04    BIC: 4.501e+04

             coef    std err        t    P>|t|    [0.025    0.975]
const      5.4711      0.030  185.217    0.000     5.413     5.529
Exper      0.1347      0.040    3.390    0.001     0.057     0.213
knot_1     0.0323      0.066    0.491    0.623    -0.097     0.161
knot_2    -0.0847      0.061   -1.386    0.166    -0.205     0.035
knot_3     0.0185      0.056    0.331    0.741    -0.091     0.128
knot_4    -0.0753      0.053   -1.425    0.154    -0.179     0.028
knot_5     0.0762      0.052    1.465    0.143    -0.026     0.178
knot_6    -0.0510      0.051   -1.005    0.315    -0.151     0.049
knot_7    -0.0773      0.049   -1.565    0.118    -0.174     0.020
knot_8     0.1382      0.049    2.804    0.005     0.042     0.235
knot_9    -0.0891      0.049   -1.832    0.067    -0.184     0.006
knot_10    0.0031      0.048    0.064    0.949    -0.091     0.097
knot_11   -0.0265      0.048   -0.556    0.578    -0.120     0.067
knot_12    0.0711      0.048    1.490    0.136    -0.022     0.165
knot_13   -0.0524      0.048   -1.098    0.272    -0.146     0.041
knot_14   -0.0099      0.049   -0.201    0.841    -0.107     0.087
knot_15    0.0158      0.049    0.319    0.750    -0.081     0.113
knot_16    0.0372      0.050    0.739    0.460    -0.061     0.136
knot_17   -0.0554      0.050   -1.097    0.272    -0.154     0.044
knot_18   -0.0034      0.051   -0.066    0.947    -0.103     0.096
knot_19   -0.0130      0.054   -0.243    0.808    -0.118     0.092
knot_20    0.0394      0.056    0.698    0.485    -0.071     0.150
knot_21   -0.0460      0.059   -0.783    0.434    -0.161     0.069
knot_22    0.0370      0.059    0.623    0.533    -0.079     0.153
knot_23   -0.0154      0.061   -0.250    0.802    -0.136     0.105
knot_24   -0.0035      0.064   -0.055    0.956    -0.128     0.121
knot_25    0.0113      0.065    0.174    0.862    -0.116     0.139
knot_26   -0.0121      0.066   -0.182    0.856    -0.142     0.118
knot_27   -0.0291      0.068   -0.426    0.670    -0.163     0.105
knot_28    0.0647      0.069    0.933    0.351    -0.071     0.201
knot_29   -0.0132      0.070   -0.189    0.850    -0.150     0.124
knot_30   -0.0305      0.071   -0.429    0.668    -0.170     0.109
knot_31   -0.0261      0.073   -0.360    0.719    -0.168     0.116
knot_32   -0.0034      0.074   -0.045    0.964    -0.149     0.142
knot_33    0.0766      0.076    1.002    0.317    -0.073     0.226
knot_34   -0.0943      0.076   -1.235    0.217    -0.244     0.055
knot_35    0.1039      0.078    1.337    0.181    -0.048     0.256
knot_36   -0.1690      0.081   -2.079    0.038    -0.328    -0.010
knot_37    0.1986      0.084    2.362    0.018     0.034     0.363
knot_38   -0.0801      0.085   -0.938    0.348    -0.247     0.087
knot_39   -0.0127      0.087   -0.146    0.884    -0.183     0.158
knot_40    0.0045      0.083    0.054    0.957    -0.157     0.166
knot_41   -0.0718      0.087   -0.824    0.410    -0.243     0.099
knot_42   -0.0043      0.091   -0.047    0.962    -0.183     0.175
knot_43    0.1796      0.096    1.879    0.060    -0.008     0.367
knot_44   -0.1397      0.103   -1.350    0.177    -0.342     0.063
knot_45   -0.0091      0.111   -0.082    0.934    -0.226     0.208
knot_46    0.0326      0.120    0.271    0.786    -0.203     0.268
knot_47   -0.0828      0.131   -0.630    0.529    -0.340     0.175
knot_48    0.0582      0.146    0.399    0.690    -0.228     0.344
knot_49   -0.0267      0.162   -0.165    0.869    -0.344     0.290
knot_50    0.1124      0.194    0.581    0.562    -0.267     0.492
knot_51   -0.1186      0.235   -0.504    0.614    -0.580     0.342
knot_52   -0.0133      0.268   -0.050    0.960    -0.539     0.512
knot_53    0.1658      0.294    0.564    0.573    -0.410     0.742
knot_54   -0.2045      0.335   -0.611    0.541    -0.861     0.452
knot_55    0.1716      0.429    0.400    0.689    -0.669     1.012
knot_56   -0.4076      0.465   -0.877    0.380    -1.318     0.503
knot_57    0.5338      0.627    0.851    0.395    -0.696     1.763
knot_58    0.1181      0.700    0.169    0.866    -1.255     1.491
knot_59   -0.3092      0.314   -0.984    0.325    -0.925     0.306
knot_60   -0.7365      0.952   -0.774    0.439    -2.603     1.130
knot_61    0.9303      0.867    1.072    0.284    -0.770     2.631
knot_62    0.4651      0.434    1.072    0.284    -0.385     1.315

Omnibus: 724.158    Prob(Omnibus): 0.000    Skew: -0.302    Kurtosis: 3.794
Durbin-Watson: 2.015    Jarque-Bera (JB): 1054.343    Prob(JB): 1.13e-229    Cond. No. 3.85e+16

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 9.3e-26. This might indicate that there are strong multicollinearity problems or that the design matrix is singular.
The main problem with the above OLS output is that the estimated regression coefficients are not interpretable. To see this, note that the fitted coefficients are \begin{equation*} b_0 = 5.4711 ~~~ b_1 = 0.1347 ~~~ b_2 = 0.0323 ~~~ b_3 = -0.0847 ~~~ b_4 = 0.0185 ~~~ \dots \end{equation*} so that the interpretations become
1. the starting weekly salary is $\exp(5.4711) = \$237.72$,
2. the increment after year 1 is $13.47 \%$,
3. the increment after year 2 is $(13.47 + 3.23) = 16.7 \%$,
4. the increment after year 3 is $(16.7 - 8.47) = 8.23\%$,
5. the increment after year 4 is $(8.23 + 1.85) = 10.08 \%$, and so on.
Clearly the increments are implausibly large, and they oscillate wildly from year to year. The fitted model is therefore nonsensical.
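The oscillation is easy to see by converting the fitted coefficients into year-over-year percent increments; the short sketch below does this with a cumulative sum (it assumes model_all_knots from the cell above is still in scope).
#Year-over-year percent increments implied by the all-knots OLS fit
#The increment after year j equals b1 plus the coefficients of knot_1, ..., knot_{j-1},
#so a cumulative sum of the non-intercept coefficients gives all the increments at once
coefs_all_knots = model_all_knots.params.to_numpy() #order: const, Exper, knot_1, ..., knot_62
increments = 100 * np.cumsum(coefs_all_knots[1:]) #percent increments after years 1, 2, 3, ...
print(np.round(increments[:10], 2))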
It is also interesting to compare this fitted model with the earlier quadratic model in terms of AIC and BIC. The AIC of this model is actually smaller than that of the quadratic model, but its BIC is larger. Even though the AIC is smaller, no one would want to use this fitted model because of its poor interpretability. One can also plot the fitted function on the scatter plot of the data as shown below. The fitted function is quite wiggly.
#Plotting the fitted values of this model:
# Generate x_range for plotting
x_range = np.linspace(x.min(), x.max(), 100)
# Prepare a DataFrame for x_range
X_range = pd.DataFrame({'const': 1, 'x': x_range})
for knot in range(1, 63):
X_range[f'knot_{knot}'] = np.maximum(X_range['x'] - knot, 0)
fitted_model_all_knots = model_all_knots.predict(X_range)
plt.figure(figsize = (10, 6))
plt.scatter(x, y, s = 1)
plt.plot(x_range,fitted_model_all_knots, color = 'green', label = 'Fitted All Knots Regression')
plt.plot(x_range,fitted_model_2, color = 'red', label = 'Fitted Quadratic')
plt.xlabel('Years of Experience')
plt.ylabel('Log of Weekly Earnings')
plt.title('All knots regression of y on x')
plt.legend()
plt.show()
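As a quick numerical check of the AIC/BIC comparison made above, the fitted results expose these criteria directly (this assumes model_2 and model_all_knots from the earlier cells are still in scope):
#Comparing AIC and BIC of the quadratic and all-knots OLS models
print("Quadratic model: AIC =", round(model_2.aic, 1), " BIC =", round(model_2.bic, 1))
print("All-knots model: AIC =", round(model_all_knots.aic, 1), " BIC =", round(model_all_knots.bic, 1))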
The reason why this 'all-knots' regression model gives a poor fit is not that the model itself is inadequate, but rather that OLS is used to estimate its coefficients. The sample size here is quite large ($n = 25437$). The problem would be much worse if the sample size were smaller. To illustrate this, consider fitting the model to a smaller dataset obtained by randomly sampling $n = 500$ observations from the full dataset. This is demonstrated below.
#Repeating the analysis on a smaller dataset:
n = 500
random_seed = 42
dt_small = dt.sample(n = 500, random_state = random_seed)
print(dt_small.shape)
print(dt_small.head())
(500, 5)
          Region   MetropolitanStatus  Exper  Educ  WeeklyEarnings
8735        West     MetropolitanArea     19    14          735.99
13369  Northeast  NotMetropolitanArea     15    16          807.22
5512     Midwest  NotMetropolitanArea     18    16          341.88
24277      South  NotMetropolitanArea     32    12          240.38
24959      South     MetropolitanArea     14    16         1210.83
#Let us now repeat the exercise fitting the quadratic model as well as the all-knots regression model on this smaller dataset:
y = np.log(dt_small['WeeklyEarnings'])
x = dt_small['Exper']
#Quadratic model:
X = sm.add_constant(x)
X['Exper_Square'] = np.square(x)
model_quad = sm.OLS(y, X).fit()
print(model_quad.summary())
#Plotting the fitted quadratic on the scatter plot
b0, b1, b2 = model_quad.params
x_range = np.linspace(x.min(), x.max(), 100)
fitted_model_quad = b0 + b1*x_range + b2*np.square(x_range)
X_all_knots = sm.add_constant(x) #adding intercept
for knot in range(1, 63):
X_all_knots[f'knot_{knot}'] = np.maximum(x - knot, 0)
model_all_knots = sm.OLS(y, X_all_knots).fit()
print(model_all_knots.summary())
# Prepare a DataFrame for x_range
X_range = pd.DataFrame({'const': 1, 'x': x_range})
for knot in range(1, 63):
X_range[f'knot_{knot}'] = np.maximum(X_range['x'] - knot, 0)
fitted_model_all_knots = model_all_knots.predict(X_range)
plt.figure(figsize = (10, 6))
plt.scatter(x, y, s = 5)
plt.plot(x_range,fitted_model_all_knots, color = 'green', label = 'Fitted All Knots Regression by OLS')
plt.plot(x_range,fitted_model_quad, color = 'red', label = 'Fitted Quadratic')
plt.xlabel('Years of Experience')
plt.ylabel('Log of Weekly Earnings')
plt.title('All knots regression of y on x')
plt.legend()
plt.show()
OLS Regression Results
Dep. Variable: WeeklyEarnings    Model: OLS    Method: Least Squares
Date: Sat, 18 Nov 2023    Time: 18:49:33    Covariance Type: nonrobust
No. Observations: 500    Df Residuals: 497    Df Model: 2
R-squared: 0.183    Adj. R-squared: 0.180    F-statistic: 55.83    Prob (F-statistic): 1.34e-22
Log-Likelihood: -405.85    AIC: 817.7    BIC: 830.3

                    coef    std err        t    P>|t|    [0.025    0.975]
const             5.6176      0.069   81.159    0.000     5.482     5.754
Exper             0.0666      0.007    9.404    0.000     0.053     0.080
Exper_Square     -0.0011      0.000   -7.593    0.000    -0.001    -0.001

Omnibus: 9.502    Prob(Omnibus): 0.009    Skew: -0.226    Kurtosis: 3.577
Durbin-Watson: 2.014    Jarque-Bera (JB): 11.214    Prob(JB): 0.00367    Cond. No. 2.09e+03

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 2.09e+03. This might indicate that there are strong multicollinearity or other numerical problems.

OLS Regression Results
Dep. Variable: WeeklyEarnings    Model: OLS    Method: Least Squares
Date: Sat, 18 Nov 2023    Time: 18:49:33    Covariance Type: nonrobust
No. Observations: 500    Df Residuals: 448    Df Model: 51
R-squared: 0.273    Adj. R-squared: 0.190    F-statistic: 3.298    Prob (F-statistic): 9.95e-12
Log-Likelihood: -376.83    AIC: 857.7    BIC: 1077.

             coef    std err        t    P>|t|    [0.025    0.975]
const      5.1547      0.272   18.980    0.000     4.621     5.688
Exper      0.1953      0.351    0.557    0.578    -0.494     0.884
knot_1     0.1042      0.551    0.189    0.850    -0.978     1.186
knot_2    -0.0938      0.453   -0.207    0.836    -0.983     0.796
knot_3    -0.1610      0.384   -0.419    0.675    -0.916     0.594
knot_4    -0.2014      0.326   -0.617    0.538    -0.843     0.440
knot_5     0.6183      0.309    2.002    0.046     0.011     1.225
knot_6    -0.6134      0.296   -2.075    0.039    -1.194    -0.032
knot_7     0.3069      0.315    0.974    0.330    -0.312     0.926
knot_8    -0.1287      0.346   -0.372    0.710    -0.809     0.551
knot_9     0.0483      0.428    0.113    0.910    -0.794     0.890
knot_10   -0.0715      0.366   -0.195    0.845    -0.791     0.648
knot_11   -0.1556      0.334   -0.465    0.642    -0.813     0.502
knot_12    0.1899      0.331    0.574    0.566    -0.461     0.840
knot_13    0.2660      0.328    0.811    0.418    -0.378     0.910
knot_14   -0.5626      0.329   -1.709    0.088    -1.209     0.084
knot_15    0.3070      0.314    0.979    0.328    -0.309     0.923
knot_16    0.2728      0.389    0.701    0.484    -0.492     1.037
knot_17   -0.4239      0.364   -1.166    0.244    -1.139     0.291
knot_18    0.1405      0.329    0.427    0.669    -0.506     0.787
knot_19   -0.4726      0.305   -1.549    0.122    -1.072     0.127
knot_20    0.6368      0.350    1.821    0.069    -0.050     1.324
knot_21   -0.1141      0.373   -0.306    0.760    -0.847     0.619
knot_22    0.1367      0.361    0.379    0.705    -0.572     0.846
knot_23   -0.1656      0.387   -0.428    0.669    -0.926     0.595
knot_24   -0.1908      0.388   -0.492    0.623    -0.953     0.571
knot_25    0.1208      0.432    0.280    0.780    -0.728     0.970
knot_26   -0.2886      0.454   -0.636    0.525    -1.181     0.603
knot_27    0.6466      0.493    1.310    0.191    -0.323     1.616
knot_28   -0.1814      0.498   -0.365    0.716    -1.159     0.797
knot_29   -0.4298      0.476   -0.903    0.367    -1.365     0.506
knot_30    0.4724      0.462    1.021    0.308    -0.437     1.381
knot_31   -0.5066      0.438   -1.157    0.248    -1.367     0.354
knot_32    0.4254      0.462    0.920    0.358    -0.483     1.334
knot_33   -0.1973      0.476   -0.415    0.679    -1.133     0.738
knot_34    0.2300      0.505    0.456    0.649    -0.762     1.222
knot_35   -0.3746      0.530   -0.707    0.480    -1.416     0.667
knot_36    0.5390      0.562    0.959    0.338    -0.566     1.644
knot_37   -0.2597      0.703   -0.369    0.712    -1.642     1.122
knot_38   -0.5455      0.639   -0.854    0.394    -1.801     0.710
knot_39    0.7818      0.643    1.217    0.224    -0.481     2.045
knot_40   -0.0695      0.639   -0.109    0.913    -1.325     1.186
knot_41   -0.3461      0.725   -0.477    0.633    -1.772     1.079
knot_42   -0.4418      0.735   -0.601    0.548    -1.887     1.004
knot_43    0.1942      0.979    0.198    0.843    -1.730     2.119
knot_44    1.1360      1.194    0.951    0.342    -1.211     3.483
knot_45   -0.9202      0.914   -1.006    0.315    -2.717     0.877
knot_46   -0.5969      0.873   -0.684    0.494    -2.313     1.119
knot_47    0.4265      0.859    0.497    0.620    -1.261     2.114
knot_48    1.5575      1.152    1.352    0.177    -0.707     3.822
knot_49   -2.1921      0.941   -2.330    0.020    -4.041    -0.343
knot_50    0.6258      0.504    1.242    0.215    -0.365     1.616
knot_51    0.4172      0.336    1.242    0.215    -0.243     1.078
knot_52    0.2086      0.168    1.242    0.215    -0.122     0.539
knot_53         0          0      nan      nan         0         0
knot_54         0          0      nan      nan         0         0
knot_55         0          0      nan      nan         0         0
knot_56         0          0      nan      nan         0         0
knot_57         0          0      nan      nan         0         0
knot_58         0          0      nan      nan         0         0
knot_59         0          0      nan      nan         0         0
knot_60         0          0      nan      nan         0         0
knot_61         0          0      nan      nan         0         0
knot_62         0          0      nan      nan         0         0

Omnibus: 12.537    Prob(Omnibus): 0.002    Skew: -0.262    Kurtosis: 3.696
Durbin-Watson: 2.023    Jarque-Bera (JB): 15.793    Prob(JB): 0.000372    Cond. No. 1.86e+18

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 7.3e-31. This might indicate that there are strong multicollinearity problems or that the design matrix is singular.
Now the coefficients oscillate even more wildly, making the interpretation fully nonsensical. Indeed, now \begin{equation*} b_0 = 5.1547 ~~~ b_1 = 0.1953 ~~~ b_2 = 0.1042 ~~~ b_3 = -0.0938 ~~~ b_4 = -0.1610 ~~~ \dots \end{equation*} so that the interpretations become
1. the starting weekly salary is $\exp(5.1547) = \$173.24$,
2. the increment after year 1 is $19.53 \%$,
3. the increment after year 2 is $(19.53 + 10.42) = 29.95 \%$,
4. the increment after year 3 is $(29.95 - 9.38) = 20.57\%$,
5. the increment after year 4 is $(20.57 - 16.10) = 4.47 \%$, and so on.
Clearly these numbers are meaningless. Moreover, the AIC and BIC are now both larger than those of the quadratic model. In situations such as this, where OLS gives meaningless results, one uses regularization.
Regularization is used when the OLS estimates are not working. It is easy to understand how regularization works from the Bayesian viewpoint. We have seen previously that the OLS estimates can be seen as Bayesian estimates under the flat prior, i.e., under the prior \begin{align*} \beta_0, \beta_1, \dots, \beta_m \overset{\text{i.i.d}}{\sim} \text{Unif}(-C, C) \end{align*} for a large $C$. The $\text{Unif}(-C, C)$ prior with very large $C$ is flat in the region of plausibility of the $\beta$s. There are also other ways of making flat priors. The normal density $N(0, C)$ with mean $0$ and variance $C$ is also an example of a flat prior when $C$ is very large. This is because the normal density is given by $\frac{1}{\sqrt{2\pi C}} \exp \left(-\beta_j^2/(2C) \right)$; the exponential term is essentially 1 when $C$ is very large, so the prior density does not depend on the value of $\beta_j$ and is therefore flat. Thus the OLS estimates in linear regression can also be seen as Bayesian estimates under the prior \begin{align*} \beta_0, \beta_1, \dots, \beta_m \overset{\text{i.i.d}}{\sim} N(0, C) \tag{Flat} \end{align*} where $C$ is very large.

When the OLS estimates give strange results, it means that the above prior is not a useful one, and one should look for alternative priors. In the specific example of the all-knots regression model, here is a reasonable alternative prior: \begin{align*} \beta_0 \sim N(0, C) ~~ \beta_1 \sim N(0, C) ~~ \text{ and } ~~ \beta_2, \dots, \beta_{63} \overset{\text{i.i.d}}{\sim} N(0, \tau^2) \tag{Regularizing Prior} \end{align*} for a $\tau^2$ that is not necessarily set to some large value. This prior is more flexible than the flat prior because we can recover the flat prior by taking $\tau = \sqrt{C}$, but we also have the flexibility of taking $\tau$ small (if it leads to a better fit to the data), which will lead to smooth function fits. The assumption $\beta_0, \beta_1 \overset{\text{i.i.d}}{\sim} N(0, C)$ just says that we do not enforce anything on $\beta_0$ and $\beta_1$ a priori.

We can write the regularizing prior as \begin{equation*} \beta \sim N_{64}(m_0, Q_0) \end{equation*} where $\beta$ is the $64 \times 1$ vector with components $\beta_0, \beta_1, \dots, \beta_{63}$, $m_0$ is the $64 \times 1$ vector of zeros, and $Q_0$ is the $64 \times 64$ diagonal matrix with diagonal entries $C, C, \tau^2, \dots, \tau^2$. To calculate the posterior distribution of $\beta$, we shall use the following fact (below $m_0, Q_0, X, \sigma$ should be treated as non-random): \begin{equation*} \beta \sim N_p(m_0, Q_0) \text{ and } Y \mid \beta \sim N_n(X \beta, \sigma^2 I_n) \implies \beta \mid Y \sim N_p(m_1, Q_1) \end{equation*} where \begin{equation*} m_1 = \left(Q_0^{-1} + \frac{1}{\sigma^2}X^T X \right)^{-1} \left(Q_0^{-1} m_0 + \frac{1}{\sigma^2}X^T Y \right) ~~~ \text{ and } ~~~ Q_1 = \left(Q_0^{-1} + \frac{1}{\sigma^2}X^T X \right)^{-1}. \end{equation*} Here $N_p$ and $N_n$ denote the multivariate normal distributions in $p$ dimensions and $n$ dimensions respectively. Using this result, the posterior of $\beta$ under the regularizing prior is \begin{equation*} \beta \mid \text{data}, \sigma \sim N_{64} \left( \left(Q_0^{-1} + \frac{1}{\sigma^2} X^T X \right)^{-1} \frac{1}{\sigma^2} X^T Y, \ \left(Q_0^{-1} + \frac{1}{\sigma^2} X^T X \right)^{-1} \right). \end{equation*} This posterior can be used for inference on $\beta$. Note that it depends on $\tau$ and $\sigma$. One can take a small value of $\tau$ if a smooth function fit is desired.
The posterior mean is \begin{equation*} \tilde{\beta}^{\tau} = \left(Q_0^{-1} + \frac{1}{\sigma^2} X^T X \right)^{-1} \frac{1}{\sigma^2} X^T Y \end{equation*} which can be used to get the function fit: \begin{equation*} \tilde{f}^{\tau}(x) := \tilde{\beta}^{\tau}_0 + \tilde{\beta}^{\tau}_1 x + \tilde{\beta}^{\tau}_2 (x - 1)_+ + \tilde{\beta}^{\tau}_3 (x - 2)_+ + \dots + \tilde{\beta}^{\tau}_{63} (x - 62)_+. \end{equation*} When $\tau$ is small, it can be checked that $\tilde{f}^{\tau}(x)$ will be a very smooth function of $x$. If $\tau$ is extremely small, then $\tilde{f}^{\tau}(x)$ will be essentially a line. If $\tau$ is large, then we revert back to the OLS estimate.
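This behaviour can be read off from the formula for $\tilde{\beta}^{\tau}$. The prior precision $Q_0^{-1}$ is diagonal with entries $1/C, 1/C, 1/\tau^2, \dots, 1/\tau^2$. When $\tau$ is large, all of these entries are negligible (the first two, $1/C$, are already tiny), so \begin{align*} \tilde{\beta}^{\tau} \approx \left(\frac{1}{\sigma^2} X^T X\right)^{-1} \frac{1}{\sigma^2} X^T Y = (X^T X)^{-1} X^T Y, \end{align*} which is the OLS estimate. When $\tau$ is very small, the entries $1/\tau^2$ become huge, which forces the posterior means of $\beta_2, \dots, \beta_{63}$ towards zero; only the intercept and the linear term survive, so the fitted function is essentially a straight line.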
In the code below, set $\tau$ to multiple values ranging from small to large and see how the estimate varies.
#In order to work with the regularizing prior, we need to set the values of C, tau, sigma:
C = 10 ** 6 #some large value
tau = 0.01 #set tau = 0.001, tau = 0.01, tau = 0.1, tau = 1, tau = 2
sig = 0.6
kmax = 62
Q0 = np.diag([C, C] + [tau ** 2] * kmax) #this is the prior covariance matrix of b0, b1, b2, ...b(kmax + 1)
#Q0 is a diagonal matrix with C in the first two entries and the rest of entries equal tau^2
TempMat = np.linalg.inv((sig**2) * np.linalg.inv(Q0) + X_all_knots.T @ X_all_knots) #this is the matrix which
#multiplies X^T Y to give the posterior mean
post_mean = TempMat @ (X_all_knots.T @ y.to_numpy().reshape(-1, 1))
#calculating the fitted values for a grid of x-values:
x_range = np.linspace(x.min(), x.max(), 100)
X_range = pd.DataFrame({'const': 1, 'x': x_range})
for knot in range(1, (1 + kmax)):
X_range[f'knot_{knot}'] = np.maximum(x_range - knot, 0)
fitted_model_regularized = X_range.values @ post_mean
plt.figure(figsize = (10, 6))
plt.scatter(x, y, s = 5)
plt.plot(x_range,fitted_model_all_knots, color = 'green', label = 'Fitted All Knots Regression by OLS')
plt.plot(x_range,fitted_model_quad, color = 'red', label = 'Fitted Quadratic')
plt.plot(x_range,fitted_model_regularized, color = 'black', label = 'Fitted Regularization')
plt.xlabel('Years of Experience')
plt.ylabel('Log of Weekly Earnings')
plt.title('All knots regression of y on x')
plt.legend()
plt.show()
In the above code, with $\tau$ large (say $\tau = 2$), there is very little difference between the regularized fit and the OLS fit. With a smaller $\tau$ (say $\tau = 0.01$), the regularized fit is very similar to the quadratic fit. With an even smaller $\tau$ (say $\tau = 0.001$), the regularized fit is essentially the same as the simple linear regression fit. Thus, by varying $\tau$ from small to large, one gets a nice range of fits, from simple linear regression to something like the quadratic fit to the full all-knots OLS fit.
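The manual re-running suggested in the comments above can also be automated with a small loop over several values of $\tau$. The sketch below reuses X_all_knots, X_range, x_range, x, y, sig, C and kmax from the earlier cells; the particular grid of $\tau$ values is arbitrary.
#Trying several values of tau and overlaying the resulting regularized fits
plt.figure(figsize = (10, 6))
plt.scatter(x, y, s = 5)
for tau_try in [0.001, 0.01, 0.1, 1, 2]:
    Q0 = np.diag([C, C] + [tau_try ** 2] * kmax) #prior covariance matrix for this tau
    TempMat = np.linalg.inv((sig**2) * np.linalg.inv(Q0) + X_all_knots.T @ X_all_knots)
    post_mean = TempMat @ (X_all_knots.T @ y.to_numpy().reshape(-1, 1)) #posterior mean of the coefficients
    plt.plot(x_range, X_range.values @ post_mean, label = f'tau = {tau_try}')
plt.xlabel('Years of Experience')
plt.ylabel('Log of Weekly Earnings')
plt.title('Regularized fits for a range of tau values')
plt.legend()
plt.show()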
In practice, one would need to choose the value of $\tau$. One way would be to try various values of $\tau$ and pick the value which visually seems to give a good fit. The Bayesian formulation gives a more principled way of choosing $\tau$ using marginal likelihood.
One way of selecting $\tau$ and $\sigma$ is by maximizing the likelihood of the data given only $\tau$ and $\sigma$. In usual linear regression, the likelihood is a function of $\beta_0, \beta_1, \dots, \beta_m$ and $\sigma$, and is given by the density of $N_n(X\beta, \sigma^2 I_n)$. Now, since we have a prior on $\beta_0, \dots, \beta_m$, we can integrate out $\beta_0, \dots, \beta_m$ (with respect to the prior) to obtain a marginal likelihood which is a function of $\tau$ and $\sigma$ alone. Fortunately, this integral can be obtained in closed form because of the following result: \begin{equation*} \beta \sim N_p(m_0, Q_0) \text{ and } Y \mid \beta \sim N_n(X \beta, \sigma^2 I_n) \implies Y \sim N_n \left(X m_0, X Q_0 X^T + \sigma^2 I_n\right). \end{equation*} Therefore (note that for us $m_0$ is the zero vector) the marginal likelihood as a function of $\tau$ and $\sigma$ is given by \begin{equation*} f_{\text{data} \mid \tau, \sigma}(\text{data}) = \left(\frac{1}{\sqrt{2\pi}}\right)^n \left(\det \Sigma \right)^{-1/2} \exp \left(-\frac{1}{2} Y^T \Sigma^{-1} Y \right) \qquad \text{with } \Sigma := X Q_0 X^T + \sigma^2 I_n.
\end{equation*}
Recall that $Q_0$ is the diagonal matrix with diagonal entries $C, C,
\tau^2, \dots, \tau^2$. Throughout $C$ is a large constant (e.g., $C = 10^6$). We can maximize the logarithm of the above marginal likelihood:
\begin{align*}
\log f_{\text{data} \mid \tau, \sigma}(\text{data}) = - n \log \sqrt{2 \pi} - \frac{1}{2} \log \det \Sigma - \frac{1}{2} Y^T \Sigma^{-1} Y
\end{align*}
over $\tau$ and $\sigma$ to get their estimates (note that $\Sigma$ above equals $X Q_0 X^T + \sigma^2 I_n$ and hence depends on both $\tau$ and $\sigma$). One way of doing this maximization is to take a grid of $\tau$ and $\sigma$ values, calculating the marginal likelihood at each value of the grid and then taking the maximizer. This is illustrated in the code below.
# Marginal Likelihood Maximization for tau and sigma:
# We first set grids for tau and sigma:
taugrid = np.logspace(np.log10(0.001), np.log10(0.1), 50)
siggrid = np.logspace(np.log10(0.1), np.log10(1), 50)
g = pd.DataFrame([(tau, sig) for tau in taugrid for sig in siggrid], columns=['tau', 'sig'])
C = 10 ** 6
for i in range(len(g)):
tau_val = g.loc[i, 'tau']
sig_val = g.loc[i, 'sig']
Q0 = np.diag([C, C] + [tau_val ** 2] * kmax)
Sigmat = X_all_knots.values @ Q0 @ X_all_knots.values.T + np.diag([sig_val ** 2] * n)
cov_inv = np.linalg.inv(Sigmat)
sign, log_cov_det = np.linalg.slogdet(Sigmat)
norm_factor = n*np.log(2*np.pi) + log_cov_det
g.loc[i, 'loglik'] = -0.5 * (norm_factor + y.T @ cov_inv @ y) #this is the log marginal density
ind_max = g['loglik'].idxmax()
print(g.loc[ind_max])
tau         0.013895
sig         0.542868
loglik   -432.733096
Name: 1436, dtype: float64
#The best values of tau and sig selected are
tau_best = g.loc[ind_max, 'tau']
sig_best = g.loc[ind_max, 'sig']
print(tau_best, sig_best)
0.013894954943731374 0.5428675439323859
#Now we calculate the posterior mean with the best values for tau and sig:
C = 10 ** 6 #some large value
tau = tau_best
sig = sig_best
kmax = 62
Q0 = np.diag([C, C] + [tau ** 2] * kmax) #this is the prior covariance matrix of b0, b1, b2, ...b(kmax + 1)
#Q0 is a diagonal matrix with C in the first two entries and the rest of entries equal tau^2
TempMat = np.linalg.inv((sig**2) * np.linalg.inv(Q0) + X_all_knots.T @ X_all_knots) #this is the matrix which
#multiplies X^T Y to give the posterior mean
post_mean = TempMat @ (X_all_knots.T @ y.to_numpy().reshape(-1, 1))
#calculating the fitted values for a grid of x-values:
x_range = np.linspace(x.min(), x.max(), 100)
X_range = pd.DataFrame({'const': 1, 'x': x_range})
for knot in range(1, 63):
X_range[f'knot_{knot}'] = np.maximum(x_range - knot, 0)
fitted_model_regularized = X_range.values @ post_mean
plt.figure(figsize = (10, 6))
plt.scatter(x, y, s = 5)
plt.plot(x_range,fitted_model_all_knots, color = 'green', label = 'Fitted All Knots Regression by OLS')
plt.plot(x_range,fitted_model_quad, color = 'red', label = 'Fitted Quadratic')
plt.plot(x_range,fitted_model_regularized, color = 'black', label = 'Fitted Regularization')
plt.xlabel('Years of Experience')
plt.ylabel('Log of Weekly Earnings')
plt.title('All knots regression of y on x')
plt.legend()
plt.show()
Note that the best regularized fit (shown in black in the plot above) is similar to, but not exactly the same as, the quadratic fit. For example, for small $x$ the regularized fit is steeper, and for $x$ near the middle it is flatter than the quadratic fit.
Here is another example of this way of obtaining regularized fits with linear regressions involving the terms $(x - s)_+$. Consider the following dataset from https://trends.google.com/trends/ for the query 'yahoo'. This is a monthly time series where each point represents a measure of the popularity of the query in a particular month. The dataset can be loaded and plotted as follows.
#Another Example:
dt = pd.read_csv("yahoo_trends_Nov2022.csv")
print(dt.head())
n = len(dt)
y = dt['y']
x = np.arange(1, n+1)
    y
0  39
1  38
2  38
3  38
4  41
#Plotting the time series as a scatter plot
plt.figure(figsize = (10, 6))
plt.scatter(x, y, s = 5)
plt.xlabel('Time')
plt.ylabel('Trends')
plt.title('Google trends dataset for the query Yahoo')
plt.show()
Consider the problem of fitting a smooth curve to this dataset. We can do this with the model: \begin{align*} Y = \beta_0 + \beta_1 X + \beta_2 (X - 2)_+ + \dots + \beta_{n-1} (X - (n-1))_+ + \epsilon \end{align*} where $X$ represents time (here $X = 1, 2, \dots, n$), and $\epsilon \sim N(0, \sigma^2)$. We then use the prior \begin{align*} \beta_0 \sim N(0, C) ~~ \beta_1 \sim N(0, C) ~~ \text{ and } ~~ \beta_2, \dots, \beta_{n-1} \overset{\text{i.i.d}}{\sim} N(0, \tau^2). \end{align*} Finally, $\sigma$ and $\tau$ are chosen, as before, by marginal likelihood maximization. This method is implemented in the code below.
# Create X_all_knots as a DataFrame with an intercept
X_all_knots = sm.add_constant(pd.DataFrame(x, columns=['x'])) # 'x' is now a column in the DataFrame
# Create a dictionary to hold all new knot columns
knot_columns = {}
# Generate knot columns
for knot in range(2, n):
knot_columns[f'knot_{knot}'] = np.maximum(x - knot, 0)
# Convert the dictionary to a DataFrame and concatenate it with X_all_knots
knot_df = pd.DataFrame(knot_columns)
X_all_knots = pd.concat([X_all_knots, knot_df], axis=1)
##### Likelihood for tau and sigma:
taugrid = np.logspace(np.log10(0.1), np.log10(1), 50)
siggrid = np.logspace(np.log10(0.1), np.log10(5), 50)
g = pd.DataFrame([(tau, sig) for tau in taugrid for sig in siggrid], columns=['tau', 'sig'])
C = 10 ** 6
for i in range(len(g)):
tau_val = g.loc[i, 'tau']
sig_val = g.loc[i, 'sig']
Q0 = np.diag([C, C] + [tau_val ** 2] * (n-2))
Sigmat = X_all_knots.values @ Q0 @ X_all_knots.values.T + np.diag([sig_val ** 2] * n)
cov_inv = np.linalg.inv(Sigmat)
sign, log_cov_det = np.linalg.slogdet(Sigmat)
norm_factor = n*np.log(2*np.pi) + log_cov_det
g.loc[i, 'loglik'] = -0.5 * (norm_factor + y.T @ cov_inv @ y)
ind_max = g['loglik'].idxmax()
print(g.loc[ind_max])
tau        0.754312
sig        1.918208
loglik  -583.942353
Name: 2187, dtype: float64
#The best values of tau and sig selected are
tau_best = g.loc[ind_max, 'tau']
sig_best = g.loc[ind_max, 'sig']
print(tau_best, sig_best)
0.7543120063354617 1.9182080867904119
#With best values of tau and sig:
C = 10 ** 6 #some large value
tau = tau_best
sig = sig_best
Q0 = np.diag([C, C] + [tau ** 2] * (n-2)) #this is the prior covariance matrix of b0, b1, ..., b(n-1)
#Q0 is a diagonal matrix with C in the first two entries and the rest of entries equal tau^2
TempMat = np.linalg.inv((sig**2) * np.linalg.inv(Q0) + X_all_knots.T @ X_all_knots) #this is the matrix which
#multiplies X^T Y to give the posterior mean
post_mean = TempMat @ (X_all_knots.T @ y.to_numpy().reshape(-1, 1))
fitted_model_regularized = X_all_knots.values @ post_mean
plt.figure(figsize = (10, 6))
plt.scatter(x, y, s = 5)
plt.plot(x,fitted_model_regularized, color = 'red', label = 'Fitted Regularization')
plt.xlabel('Time')
plt.ylabel('Trends')
plt.title('All knots regression of y on x')
plt.legend()
plt.show()
Note how marginal likelihood maximization selects a value of $\tau$ that is neither too large (which would undersmooth, pushing the fit towards the wiggly all-knots OLS fit) nor too small (which would oversmooth, pushing the fit towards a straight line).
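To see what over- and undersmoothing would look like here, one can refit with $\tau$ fixed well below and well above the selected value. The sketch below does this; the particular values 0.05 and 20 are arbitrary illustrative choices, and it reuses X_all_knots, y, x, n, C, tau_best and sig_best from the cells above.
#Refitting the yahoo model with tau fixed well below and well above the selected value
plt.figure(figsize = (10, 6))
plt.scatter(x, y, s = 5)
for tau_try, lab in [(0.05, 'small tau (oversmoothed)'), (tau_best, 'tau from marginal likelihood'), (20, 'large tau (undersmoothed)')]:
    Q0 = np.diag([C, C] + [tau_try ** 2] * (n-2)) #prior covariance matrix for this tau
    TempMat = np.linalg.inv((sig_best**2) * np.linalg.inv(Q0) + X_all_knots.T @ X_all_knots)
    post_mean = TempMat @ (X_all_knots.T @ y.to_numpy().reshape(-1, 1)) #posterior mean of the coefficients
    plt.plot(x, X_all_knots.values @ post_mean, label = lab)
plt.xlabel('Time')
plt.ylabel('Trends')
plt.title('Regularized fits with small, selected, and large tau')
plt.legend()
plt.show()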