We shall study the basics of model selection and model checking for Poisson regression in the context of the Mroz dataset from the last lecture (this dataset contains a number of economic variables for 753 married women in the year 1975). We shall then take a unified look at linear regression, logistic regression and Poisson regression as special cases of a general class of models called Generalized Linear Models.
import pandas as pd
import numpy as np
import statsmodels.api as sm
#Import the MROZ.csv dataset
mroz = pd.read_csv("MROZ.csv")
print(mroz.head(12))
    inlf  hours  kidslt6  kidsge6  age  educ    wage  repwage  hushrs  husage  \
0      1   1610        1        0   32    12  3.3540     2.65    2708      34
1      1   1656        0        2   30    12  1.3889     2.65    2310      30
2      1   1980        1        3   35    12  4.5455     4.04    3072      40
3      1    456        0        3   34    12  1.0965     3.25    1920      53
4      1   1568        1        2   31    14  4.5918     3.60    2000      32
5      1   2032        0        0   54    12  4.7421     4.70    1040      57
6      1   1440        0        2   37    16  8.3333     5.95    2670      37
7      1   1020        0        0   54    12  7.8431     9.98    4120      53
8      1   1458        0        2   48    12  2.1262     0.00    1995      52
9      1   1600        0        2   39    12  4.6875     4.15    2100      43
10     1   1969        0        1   33    12  4.0630     4.30    2450      34
11     1   1960        0        1   42    11  4.5918     4.58    2375      47

    ...  faminc     mtr  motheduc  fatheduc  unem  city  exper   nwifeinc  \
0   ...   16310  0.7215        12         7   5.0     0     14  10.910060
1   ...   21800  0.6615         7         7  11.0     1      5  19.499981
2   ...   21040  0.6915        12         7   5.0     0     15  12.039910
3   ...    7300  0.7815         7         7   5.0     0      6   6.799996
4   ...   27300  0.6215        12        14   9.5     1      7  20.100058
5   ...   19495  0.6915        14         7   7.5     1     33   9.859054
6   ...   21152  0.6915        14         7   5.0     0     11   9.152048
7   ...   18900  0.6915         3         3   5.0     0     35  10.900038
8   ...   20405  0.7515         7         7   3.0     0     24  17.305000
9   ...   20425  0.6915         7         7   5.0     0     21  12.925000
10  ...   32300  0.5815        12         3   5.0     0     15  24.299953
11  ...   28700  0.6215        14         7   5.0     0     14  19.700071

       lwage  expersq
0   1.210154      196
1   0.328512       25
2   1.514138      225
3   0.092123       36
4   1.524272       49
5   1.556480     1089
6   2.120260      121
7   2.059634     1225
8   0.754336      576
9   1.544899      441
10  1.401922      225
11  1.524272      196

[12 rows x 22 columns]
In the last lecture, we fit the following Poisson regression model for the response "hours" in terms of the covariates "kidslt6", "age", "educ", "huswage", "exper" and "expersq".
#Poisson Regression through StatsModels
# Define the response variable and covariates
Y = mroz['hours']
X = mroz[['kidslt6', 'age', 'educ',
'huswage', 'exper', 'expersq']].copy()
X = sm.add_constant(X) # Add a constant (intercept) to the model
# Fit the Poisson regression model
poiregmodel = sm.GLM(Y, X, family=sm.families.Poisson()).fit()
print(poiregmodel.summary())
                 Generalized Linear Model Regression Results
==============================================================================
Dep. Variable:                  hours   No. Observations:                  753
Model:                            GLM   Df Residuals:                      746
Model Family:                 Poisson   Df Model:                            6
Link Function:                    Log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:            -3.1563e+05
Date:                Thu, 05 Oct 2023   Deviance:                   6.2754e+05
Time:                        05:51:51   Pearson chi2:                 6.60e+05
No. Iterations:                     5   Pseudo R-squ. (CS):              1.000
Covariance Type:            nonrobust
==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const          6.9365      0.012    562.281      0.000       6.912       6.961
kidslt6       -0.8075      0.004   -193.217      0.000      -0.816      -0.799
age           -0.0427      0.000   -201.166      0.000      -0.043      -0.042
educ           0.0528      0.001     83.439      0.000       0.052       0.054
huswage       -0.0207      0.000    -54.548      0.000      -0.021      -0.020
exper          0.1204      0.001    219.231      0.000       0.119       0.121
expersq       -0.0018   1.63e-05   -112.090      0.000      -0.002      -0.002
==============================================================================
We have seen in the last class that "kidslt6" is an important variable. Its coefficient of -0.8075 gives the following interpretation: having an additional small kid reduces mean hours worked by about 55% (holding all other variables fixed). This 55% comes from the following calculation:
#55% comes from:
print((np.exp(poiregmodel.params['kidslt6']) - 1)*100)
-55.40391074218227
A 55% reduction is substantial. For a concrete example, consider a woman with kidslt6 = 0, age = 30, educ = 16, huswage = 10, exper = 10, expersq = 100. The prediction of 'hours' under the above Poisson regression model is about 1503, as computed by the code below.
coef = poiregmodel.params.values
kidslt6 = 0
age = 30
educ = 16
huswage = 10
exper = 10
expersq = 100
pred = np.exp(coef[0] + coef[1]*kidslt6 + coef[2]*age + coef[3]*educ + coef[4]*huswage + coef[5]*exper + coef[6]*expersq)
print(pred)
1502.8004946558758
Now suppose we change kidslt6 to 1. The predicted hours then decrease by 55%, i.e., the prediction is more than halved, as can be verified as follows.
coef = poiregmodel.params.values
kidslt6 = 1
age = 30
educ = 16
huswage = 10
exper = 10
expersq = 100
pred = np.exp(coef[0] + coef[1]*kidslt6 + coef[2]*age + coef[3]*educ + coef[4]*huswage + coef[5]*exper + coef[6]*expersq)
print(pred)
670.1902499636603
coef = poiregmodel.params.values
kidslt6 = 2
age = 30
educ = 16
huswage = 10
exper = 10
expersq = 100
pred = np.exp(coef[0] + coef[1]*kidslt6 + coef[2]*age + coef[3]*educ + coef[4]*huswage + coef[5]*exper + coef[6]*expersq)
print(pred)
298.8786420709859
Thus, compared to kidslt6 = 0, for which the prediction was about 1500 hours, the prediction for kidslt6 = 2 (with all other variables unchanged) is only about 300 hours, a huge reduction. This speaks to the importance of the variable 'kidslt6' in this model. Other important variables in this model are 'age' and 'exper'. One way to gauge the importance of a variable in a regression model is to look at the absolute value of its regression coefficient. However, different variables (and hence different coefficients) have different units, so raw coefficient values are hard to compare. A standard unit-free measure of the importance of a variable in regression is the ratio of the estimated coefficient to its standard error: if this ratio is large in absolute value, the variable is considered important. In the regression summary table, this ratio is given in the column titled "z" (for linear regression, this column would be titled "t"). By this yardstick, the three most important variables in this regression are 'exper', 'age' and 'kidslt6'.
The regression coefficient for 'age' can be interpreted as follows. If 'age' increases by one (with all other variables held fixed), then expected hours is multiplied by $\exp(-0.0427) \approx 0.96$, i.e., it decreases by about 4 percent. If 'age' increases by ten (with all other variables held fixed), then expected hours is multiplied by $\exp(-0.427) \approx 0.65$, i.e., it decreases by about 35 percent.
The regression coefficient for 'exper' is a little more complicated to interpret because of the presence of 'expersq' $= \text{exper}^2$. The presence of 'expersq' makes it impossible to change 'exper' while keeping all the other variables fixed. The two coefficients of 'exper' and 'expersq' can, however, be combined to interpret the change in 'hours' with respect to 'exper' while keeping 'age', 'kidslt6', 'educ' and 'huswage' fixed: if 'exper' increases from $e$ to $e + 1$, then 'expersq' increases from $e^2$ to $(e+1)^2$, so the log of the expected hours changes by $\hat{\beta}_{\text{exper}} + \hat{\beta}_{\text{expersq}} (2e + 1)$.
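The following short calculation converts this into a percentage change; the starting value $e = 10$ is just an illustrative choice.
#Percentage change in expected hours when exper increases from e = 10 to 11
#(expersq correspondingly changes from 100 to 121):
e = 10
delta_log_mean = (poiregmodel.params['exper']
                  + poiregmodel.params['expersq']*(2*e + 1))
print((np.exp(delta_log_mean) - 1)*100) #about an 8.5 percent increase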
Suppose we replace "kidslt6" by "kidsge6" and refit the model. We would expect this to result in a model that is much worse than the one above. How would we confirm that the resulting model is indeed worse? This is a problem of model selection, where we have to compare two models.
The following code fits the new model where we replace "kidslt6" by "kidsge6".
Y = mroz['hours']
X1 = mroz[['kidsge6', 'age', 'educ',
'huswage', 'exper', 'expersq']].copy() #'kidslt6' is now replaced by 'kidsge6'
X1 = sm.add_constant(X1)
poiregmodel1 = sm.GLM(Y, X1, family=sm.families.Poisson()).fit()
print(poiregmodel1.summary())
                 Generalized Linear Model Regression Results
==============================================================================
Dep. Variable:                  hours   No. Observations:                  753
Model:                            GLM   Df Residuals:                      746
Model Family:                 Poisson   Df Model:                            6
Link Function:                    Log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:            -3.4027e+05
Date:                Thu, 05 Oct 2023   Deviance:                   6.7681e+05
Time:                        05:52:25   Pearson chi2:                 6.78e+05
No. Iterations:                     5   Pseudo R-squ. (CS):              1.000
Covariance Type:            nonrobust
==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const          6.2418      0.013    485.960      0.000       6.217       6.267
kidsge6       -0.0084      0.001     -7.347      0.000      -0.011      -0.006
age           -0.0278      0.000   -135.231      0.000      -0.028      -0.027
educ           0.0410      0.001     65.183      0.000       0.040       0.042
huswage       -0.0214      0.000    -56.022      0.000      -0.022      -0.021
exper          0.1348      0.001    245.305      0.000       0.134       0.136
expersq       -0.0022   1.62e-05   -136.487      0.000      -0.002      -0.002
==============================================================================
Comparing the regression outputs of these two models, how do we tell which model is better? This question is answered using model selection criteria. The simplest model selection criteria are the log-likelihood and the deviance. These quantities appear in the printed summary of each model and can also be obtained by the following code.
#Log-likelihoods for the two models:
logLik_model = poiregmodel.llf
logLik_model1 = poiregmodel1.llf
#Deviances for the two models:
deviance_model = poiregmodel.deviance
deviance_model1 = poiregmodel1.deviance
# Print results
print(f"Model with 'kidslt6': Log-Likelihood = {logLik_model}, Deviance = {deviance_model}")
print(f"Model with 'kidsge6': Log-Likelihood = {logLik_model1}, Deviance = {deviance_model1}")
Model with 'kidslt6': Log-Likelihood = -315632.12091262755, Deviance = 627538.4070785991
Model with 'kidsge6': Log-Likelihood = -340265.97607505764, Deviance = 676806.1174034592
The log-likelihood for a model is simply the maximum possible value of the log-likelihood for that model (equivalently, it is the log-likelihood evaluated at the MLE). The deviance for a model is defined as: \begin{align*} \text{deviance} = (-2) \times \text{maximized log-likelihood} + \text{constant} \end{align*} where the "constant" depends on the dataset but not the specific model under consideration (for example, this constant term will be identical for the two Poisson regression models with 'kidslt6' and 'kidsge6').
When comparing models using log-likelihood, we prefer models with larger log-likelihood. When comparing models using deviance, we prefer models with smaller deviance. For example, the log-likelihood for the model with 'kidslt6' is higher than that of the model with 'kidsge6', so we would prefer the model with 'kidslt6'. Equivalently, the deviance for the model with 'kidslt6' is lower than that of the model with 'kidsge6', so we again prefer the model with 'kidslt6'. Using log-likelihood or deviance always leads to identical conclusions, because the deviance is a decreasing function of the log-likelihood.
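Because the dataset-dependent constant in the deviance cancels when two models are compared on the same data, the difference in deviances must equal $-2$ times the difference in log-likelihoods. This is easy to verify numerically:
#The deviance difference equals (-2) times the log-likelihood difference:
print(deviance_model1 - deviance_model)
print((-2)*(logLik_model1 - logLik_model))
#both print the same number (about 49267.7)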
Using log-likelihood (or deviance) to compare models $M_1$ and $M_2$ works well when both have a similar number of parameters. However, if $M_1$ has many more parameters than $M_2$, then the maximized log-likelihood for $M_1$ could be larger than that of $M_2$ not because $M_1$ is learning some important structure in the data, but just because $M_1$ is overfitting to the training dataset. To rectify this, one usually modifies the log-likelihood by adding a penalty term which penalizes models with more parameters. AIC (Akaike Information Criterion) and BIC (Bayesian Information Criterion) are two such penalized likelihood criteria. Their definitions are as follows: \begin{align*} & \text{AIC}(\text{model}) = (-2) \times \text{log-likelihood}(\text{model}) + 2 \times \text{number of parameters in model} \\ & \text{BIC}(\text{model}) = (-2) \times \text{log-likelihood}(\text{model}) + (\log n) \times \text{number of parameters in model}, \end{align*} where, in the formula for BIC, $n$ stands for the number of observations in the dataset. For example, the MROZ dataset gives data for 753 women, so $n = 753$. When using the AIC or BIC for model selection, we prefer models with smaller values.
#AIC and BIC for the two models (with 'kidslt6' and 'kidsge6')
aic_model = poiregmodel.aic
aic_model1 = poiregmodel1.aic
bic_model = poiregmodel.bic_llf
bic_model1 = poiregmodel1.bic_llf
print(f"Model with 'kidslt6': AIC = {aic_model}, BIC = {bic_model}")
print(f"Model with 'kidsge6': AIC = {aic_model1}, BIC = {bic_model1}")
#Instead of using .aic and .bic, we can calculate these using llf and the formula as follows:
num_params = 7
#the number of parameters equals 7 in both these models. The 7 parameters are beta_0, beta_1, beta_2,
#beta_3, beta_4, beta_5, beta_6
aic_model_formula = (-2)*poiregmodel.llf + 2*num_params
aic_model1_formula = (-2)*poiregmodel1.llf + 2*num_params
n = mroz.shape[0]
bic_model_formula = (-2)*poiregmodel.llf + np.log(n) * num_params
bic_model1_formula = (-2)*poiregmodel1.llf + np.log(n) * num_params
print(f"Model with 'kidslt6': AIC = {aic_model_formula}, BIC = {bic_model_formula}")
print(f"Model with 'kidsge6': AIC = {aic_model1_formula}, BIC = {bic_model1_formula}")
Model with 'kidslt6': AIC = 631278.2418252551, BIC = 631310.6102818497
Model with 'kidsge6': AIC = 680545.9521501153, BIC = 680578.3206067099
Model with 'kidslt6': AIC = 631278.2418252551, BIC = 631310.6102818497
Model with 'kidsge6': AIC = 680545.9521501153, BIC = 680578.3206067099
Both models here have the same number of parameters, so model selection by AIC or BIC leads to the same conclusion as model selection via log-likelihood or deviance.
In general, it is common practice to start with a large number of explanatory variables and then to search through all possible subsets of explanatory variables by looking at their AIC or BIC values. The model with the smallest AIC or BIC is then selected as the 'best model'. Note that AIC and BIC can lead to different best models. Also note that this subset search for the best model will not work if we use log-likelihood or deviance as the model selection criterion, simply because the model with the smallest deviance (equivalently, the largest log-likelihood) is always the full model, i.e., the model with all the variables.
Here is an illustration of model selection in this Poisson regression example. First let us consider the following regression model which includes 13 explanatory variables: 'kidslt6', 'kidsge6', 'age', 'educ', 'hushrs', 'huseduc', 'huswage', 'motheduc', 'fatheduc', 'exper', 'expersq', 'unem', 'city'. The "full" model uses all these variables to predict 'hours' (note that searching over all nonempty subsets of 13 variables means fitting $2^{13} - 1 = 8191$ models). Here is the output for the full model:
Y = mroz['hours']
Xfull = mroz[['kidslt6', 'kidsge6', 'age', 'educ', 'hushrs', 'huseduc', 'huswage',
'motheduc', 'fatheduc', 'exper', 'expersq', 'unem', 'city']]
Xfull = sm.add_constant(Xfull)
poiregmodel_full = sm.GLM(Y, Xfull, family=sm.families.Poisson()).fit()
print(poiregmodel_full.summary())
                 Generalized Linear Model Regression Results
==============================================================================
Dep. Variable:                  hours   No. Observations:                  753
Model:                            GLM   Df Residuals:                      739
Model Family:                 Poisson   Df Model:                           13
Link Function:                    Log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:            -3.1222e+05
Date:                Thu, 05 Oct 2023   Deviance:                   6.2072e+05
Time:                        05:52:37   Pearson chi2:                 6.53e+05
No. Iterations:                     5   Pseudo R-squ. (CS):              1.000
Covariance Type:            nonrobust
==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const          7.4625      0.015    503.407      0.000       7.433       7.492
kidslt6       -0.8180      0.004   -194.435      0.000      -0.826      -0.810
kidsge6       -0.0369      0.001    -31.452      0.000      -0.039      -0.035
age           -0.0433      0.000   -194.433      0.000      -0.044      -0.043
educ           0.0604      0.001     74.681      0.000       0.059       0.062
hushrs        -0.0001    2.5e-06    -49.100      0.000      -0.000      -0.000
huseduc       -0.0135      0.001    -22.282      0.000      -0.015      -0.012
huswage       -0.0241      0.000    -55.083      0.000      -0.025      -0.023
motheduc       0.0137      0.000     27.570      0.000       0.013       0.015
fatheduc      -0.0037      0.000     -7.535      0.000      -0.005      -0.003
exper          0.1177      0.001    212.420      0.000       0.117       0.119
expersq       -0.0018   1.64e-05   -112.054      0.000      -0.002      -0.002
unem          -0.0195      0.000    -42.213      0.000      -0.020      -0.019
city           0.0452      0.003     14.890      0.000       0.039       0.051
==============================================================================
Next we implement "best subsets" Poisson regression, where we search over all possible submodels of the full model to find the model with the smallest AIC. In order to search over all subsets, we need some way of enumerating all subsets of the 13 variables. For this, the following Python function will be useful. It returns a list of all possible subsets of size $k$ from a given list 'arr'. For example, if arr equals ['a', 'b', 'c', 'd', 'e'] and $k = 3$, it should give the output: ['e', 'd', 'c'], ['e', 'd', 'b'], ['e', 'c', 'b'], ['d', 'c', 'b'], ['e', 'd', 'a'], ['e', 'c', 'a'], ['d', 'c', 'a'], ['e', 'b', 'a'], ['d', 'b', 'a'], ['c', 'b', 'a'].
#All subsets of size k
def get_combinations(arr, k):
    if k == 0:
        return [[]]
    if not arr:
        return []
    return get_combinations(arr[1:], k) + [x + [arr[0]] for x in get_combinations(arr[1:], k-1)]
print(get_combinations(['a', 'b', 'c', 'd', 'e'], 3))
[['e', 'd', 'c'], ['e', 'd', 'b'], ['e', 'c', 'b'], ['d', 'c', 'b'], ['e', 'd', 'a'], ['e', 'c', 'a'], ['d', 'c', 'a'], ['e', 'b', 'a'], ['d', 'b', 'a'], ['c', 'b', 'a']]
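As an aside, the standard library offers the same enumeration via itertools.combinations; the recursive function above is written out only to show the logic. The two approaches produce the same subsets, possibly listed in a different order.
#Equivalent enumeration using the standard library:
from itertools import combinations
print([list(c) for c in combinations(['a', 'b', 'c', 'd', 'e'], 3)])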
Using the above function, for each $k = 1, 2, 3, \dots, 13$, we obtain all possible subsets of size $k$ of the full set consisting of all variables ('kidslt6', 'kidsge6', 'age', 'educ', 'hushrs', 'huseduc', 'huswage', 'motheduc', 'fatheduc', 'exper', 'expersq', 'unem', 'city'), then for each subset of size $k$, we calculate the AIC of the Poisson regression model with that subset. We then select the best subset model for each $k$ and report these best models. Finally, we look at the AICs of the best models for each $k$ and select the best model among these 13 models.
#Best Subsets Poisson Regression using AIC
Y = mroz['hours']
Xfull = mroz[['kidslt6', 'kidsge6', 'age', 'educ', 'hushrs', 'huseduc', 'huswage',
'motheduc', 'fatheduc', 'exper', 'expersq', 'unem', 'city']]
best_aic_per_k = {}
# Looping through all possible sizes of predictor subsets
for k in range(1, len(Xfull.columns) + 1):
    # Getting all combinations of predictors of size k without using itertools
    best_aic_k = float("inf")
    best_model_k = None
    best_variables_k = None
    for variables in get_combinations(list(Xfull.columns), k):
        predictors = Xfull[list(variables)]
        predictors = sm.add_constant(predictors) # Add a constant (intercept) to the model
        # Fitting the model
        model = sm.GLM(Y, predictors, family=sm.families.Poisson()).fit()
        # Updating best AIC model for each k
        if model.aic < best_aic_k:
            best_aic_k = model.aic
            best_model_k = model
            best_variables_k = variables # Storing variable names
    # Storing the best AIC and variable names for each k in the dictionary
    best_aic_per_k[k] = (best_aic_k, best_model_k, best_variables_k)
# Displaying the results
for k, (aic, model, variables) in best_aic_per_k.items():
    print(f"\nBest AIC value for k={k} variables: {aic}")
    print(f"Variables in the model: {variables}")
    print(f"Model parameters:\n{model.params}")
Best AIC value for k=1 variables: 744498.6174901996
Variables in the model: ['exper']
Model parameters: const 5.997464, exper 0.049036

Best AIC value for k=2 variables: 708125.6250492265
Variables in the model: ['exper', 'age']
Model parameters: const 7.340090, exper 0.066197, age -0.036907

Best AIC value for k=3 variables: 653555.3560647196
Variables in the model: ['exper', 'age', 'kidslt6']
Model parameters: const 8.134844, exper 0.064158, age -0.051846, kidslt6 -0.846538

Best AIC value for k=4 variables: 639269.8868831303
Variables in the model: ['expersq', 'exper', 'age', 'kidslt6']
Model parameters: const 7.513553, expersq -0.001812, exper 0.122583, age -0.045117, kidslt6 -0.793601

Best AIC value for k=5 variables: 634410.2632771938
Variables in the model: ['expersq', 'exper', 'educ', 'age', 'kidslt6']
Model parameters: const 6.970329, expersq -0.001774, exper 0.120684, educ 0.042226, age -0.044275, kidslt6 -0.811135

Best AIC value for k=6 variables: 631278.241825255
Variables in the model: ['expersq', 'exper', 'huswage', 'educ', 'age', 'kidslt6']
Model parameters: const 6.936480, expersq -0.001829, exper 0.120372, huswage -0.020714, educ 0.052831, age -0.042680, kidslt6 -0.807524

Best AIC value for k=7 variables: 628661.4275956514
Variables in the model: ['expersq', 'exper', 'huswage', 'hushrs', 'educ', 'age', 'kidslt6']
Model parameters: const 7.230814, expersq -0.001794, exper 0.118114, huswage -0.026952, hushrs -0.000124, educ 0.057793, age -0.042924, kidslt6 -0.815312

Best AIC value for k=8 variables: 626758.9386570181
Variables in the model: ['unem', 'expersq', 'exper', 'huswage', 'hushrs', 'educ', 'age', 'kidslt6']
Model parameters: const 7.351924, unem -0.019775, expersq -0.001836, exper 0.118913, huswage -0.025638, hushrs -0.000136, educ 0.059774, age -0.041995, kidslt6 -0.811565

Best AIC value for k=9 variables: 625832.1260942485
Variables in the model: ['unem', 'expersq', 'exper', 'huswage', 'hushrs', 'educ', 'age', 'kidsge6', 'kidslt6']
Model parameters: const 7.492027, unem -0.019316, expersq -0.001827, exper 0.117388, huswage -0.025561, hushrs -0.000131, educ 0.058120, age -0.043663, kidsge6 -0.035307, kidslt6 -0.824243

Best AIC value for k=10 variables: 625176.0636516893
Variables in the model: ['unem', 'expersq', 'exper', 'motheduc', 'huswage', 'hushrs', 'educ', 'age', 'kidsge6', 'kidslt6']
Model parameters: const 7.431017, unem -0.019136, expersq -0.001836, exper 0.117897, motheduc 0.011373, huswage -0.025718, hushrs -0.000133, educ 0.051211, age -0.042745, kidsge6 -0.035498, kidslt6 -0.822677

Best AIC value for k=11 variables: 624731.7720226073
Variables in the model: ['unem', 'expersq', 'exper', 'motheduc', 'huswage', 'huseduc', 'hushrs', 'educ', 'age', 'kidsge6', 'kidslt6']
Model parameters: const 7.454727, unem -0.018725, expersq -0.001840, exper 0.117839, motheduc 0.012096, huswage -0.022766, huseduc -0.012720, hushrs -0.000125, educ 0.059141, age -0.042974, kidsge6 -0.036542, kidslt6 -0.819659

Best AIC value for k=12 variables: 624532.3497764468
Variables in the model: ['city', 'unem', 'expersq', 'exper', 'motheduc', 'huswage', 'huseduc', 'hushrs', 'educ', 'age', 'kidsge6', 'kidslt6']
Model parameters: const 7.462437, city 0.042751, unem -0.019525, expersq -0.001837, exper 0.117815, motheduc 0.012042, huswage -0.024076, huseduc -0.013749, hushrs -0.000123, educ 0.059219, age -0.043243, kidsge6 -0.036282, kidslt6 -0.818489

Best AIC value for k=13 variables: 624477.5595229984
Variables in the model: ['city', 'unem', 'expersq', 'exper', 'fatheduc', 'motheduc', 'huswage', 'huseduc', 'hushrs', 'educ', 'age', 'kidsge6', 'kidslt6']
Model parameters: const 7.462458, city 0.045172, unem -0.019463, expersq -0.001838, exper 0.117729, fatheduc -0.003659, motheduc 0.013728, huswage -0.024136, huseduc -0.013512, hushrs -0.000123, educ 0.060386, age -0.043270, kidsge6 -0.036885, kidslt6 -0.818034
The above output shows that the best model with one explanatory variable ($k = 1$) is the model for 'hours' in terms of 'exper' (in other words, the most important single variable for explaining hours is 'exper'). The best model with two explanatory variables ($k = 2$) is the model for 'hours' in terms of 'exper' and 'age'. The best model with three explanatory variables ($k = 3$) is the model for 'hours' in terms of 'exper', 'age' and 'kidslt6', and so on. If we want the overall best model regardless of $k$, we have to compare the AIC values of the best models for each $k$. In this example, the best overall model in terms of AIC is the full model with all 13 variables. In other datasets, the best overall model might be smaller than the full model.
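To extract the overall winner programmatically, one can simply minimize over the dictionary built in the loop above (a small sketch using the best_aic_per_k dictionary):
#Overall best model (smallest AIC across all k):
best_k = min(best_aic_per_k, key=lambda k: best_aic_per_k[k][0])
best_aic, best_model, best_variables = best_aic_per_k[best_k]
print(f"Best k = {best_k}, AIC = {best_aic}")
print(f"Variables: {best_variables}")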
The same analysis can be redone with the BIC (instead of the AIC). The code for this is as follows.
#Best subsets regression with BIC.
import statsmodels.genmod.generalized_linear_model as glm
glm.SET_USE_BIC_LLF(True) #this silences the warnings for BIC
#Best BIC Model:
best_bic_per_k = {}
# Looping through all possible sizes of predictor subsets
for k in range(1, len(Xfull.columns) + 1):
    best_bic_k = float("inf")
    best_model_k = None
    best_variables_k = None
    for variables in get_combinations(list(Xfull.columns), k):
        predictors = Xfull[list(variables)]
        predictors = sm.add_constant(predictors) # Add a constant (intercept) to the model
        # Fitting the model
        model = sm.GLM(Y, predictors, family=sm.families.Poisson()).fit()
        # Updating best BIC model for each k
        if model.bic < best_bic_k: # Using BIC instead of AIC here
            best_bic_k = model.bic # Updating BIC value
            best_model_k = model
            best_variables_k = variables # Storing variable names
    # Storing the best BIC and variable names for each k in the dictionary
    best_bic_per_k[k] = (best_bic_k, best_model_k, best_variables_k)
# Displaying the results
for k, (bic, model, variables) in best_bic_per_k.items():
    print(f"\nBest BIC value for k={k} variables: {bic}")
    print(f"Variables in the model: {variables}")
    print(f"Model parameters:\n{model.params}")
Best BIC value for k=1 variables: 744507.8656206552
Variables in the model: ['exper']

Best BIC value for k=2 variables: 708139.4972449099
Variables in the model: ['exper', 'age']

Best BIC value for k=3 variables: 653573.8523256308
Variables in the model: ['exper', 'age', 'kidslt6']

Best BIC value for k=4 variables: 639293.0072092693
Variables in the model: ['expersq', 'exper', 'age', 'kidslt6']

Best BIC value for k=5 variables: 634438.0076685606
Variables in the model: ['expersq', 'exper', 'educ', 'age', 'kidslt6']

Best BIC value for k=6 variables: 631310.6102818496
Variables in the model: ['expersq', 'exper', 'huswage', 'educ', 'age', 'kidslt6']

Best BIC value for k=7 variables: 628698.4201174738
Variables in the model: ['expersq', 'exper', 'huswage', 'hushrs', 'educ', 'age', 'kidslt6']

Best BIC value for k=8 variables: 626800.5552440683
Variables in the model: ['unem', 'expersq', 'exper', 'huswage', 'hushrs', 'educ', 'age', 'kidslt6']

Best BIC value for k=9 variables: 625878.3667465264
Variables in the model: ['unem', 'expersq', 'exper', 'huswage', 'hushrs', 'educ', 'age', 'kidsge6', 'kidslt6']

Best BIC value for k=10 variables: 625226.9283691951
Variables in the model: ['unem', 'expersq', 'exper', 'motheduc', 'huswage', 'hushrs', 'educ', 'age', 'kidsge6', 'kidslt6']

Best BIC value for k=11 variables: 624787.2608053408
Variables in the model: ['unem', 'expersq', 'exper', 'motheduc', 'huswage', 'huseduc', 'hushrs', 'educ', 'age', 'kidsge6', 'kidslt6']

Best BIC value for k=12 variables: 624592.4626244082
Variables in the model: ['city', 'unem', 'expersq', 'exper', 'motheduc', 'huswage', 'huseduc', 'hushrs', 'educ', 'age', 'kidsge6', 'kidslt6']

Best BIC value for k=13 variables: 624542.2964361876
Variables in the model: ['city', 'unem', 'expersq', 'exper', 'fatheduc', 'motheduc', 'huswage', 'huseduc', 'hushrs', 'educ', 'age', 'kidsge6', 'kidslt6']

(For every k, the selected variables and the fitted parameters coincide with the best AIC models shown above, so the parameter printouts are not repeated here.)
After fitting any kind of regression model, one should perform checks to see whether the model is deficient in any obvious ways. Two simple methods for model checking are: (i) comparing the fitted values to the observed responses, and (ii) examining posterior predictive distributions. We start with the first method.
The fitted values in Poisson regression are given by $\hat{\mu}_1, \dots, \hat{\mu}_n$ where \begin{align*} \hat{\mu}_i := \exp \left(\hat{\beta}_0 + \hat{\beta}_1 x_{i1} + \dots + \hat{\beta}_m x_{im} \right) \end{align*} and $x_{i1}, \dots, x_{im}$ denote the values of the explanatory variables for the $i^{th}$ observation. In other words, these are simply the predictions made by the model for each individual in the training dataset. Below is an illustration of fitted values for our earlier Poisson regression model for 'hours' on 'kidslt6', 'age', 'educ', 'huswage', 'exper', 'expersq'.
#Here is the model:
Y = mroz['hours']
X = mroz[['kidslt6', 'age', 'educ',
'huswage', 'exper', 'expersq']]
X = sm.add_constant(X) # Add a constant (intercept) to the model
# Fit the Poisson regression model
poiregmodel = sm.GLM(Y, X, family=sm.families.Poisson()).fit()
Fitted values are directly output by Python as part of the 'poiregmodel' object. If you search over all methods and attributes of 'poiregmodel', you should be able to find 'fittedvalues'.
print(dir(poiregmodel)) #this prints all methods and attributes associated with poiregmodel (find fittedvalues in this list)
['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_cache', '_data_attr', '_data_attr_model', '_data_in_cache', '_endog', '_freq_weights', '_get_robustcov_results', '_get_wald_nonlinear', '_iweights', '_n_trials', '_transform_predict_exog', '_use_t', '_var_weights', 'aic', 'bic', 'bic_deviance', 'bic_llf', 'bse', 'conf_int', 'converged', 'cov_kwds', 'cov_params', 'cov_type', 'deviance', 'df_model', 'df_resid', 'f_test', 'family', 'fit_history', 'fittedvalues', 'get_distribution', 'get_hat_matrix_diag', 'get_influence', 'get_prediction', 'info_criteria', 'initialize', 'k_constant', 'llf', 'llf_scaled', 'llnull', 'load', 'method', 'mle_settings', 'model', 'mu', 'nobs', 'normalized_cov_params', 'null', 'null_deviance', 'params', 'pearson_chi2', 'plot_added_variable', 'plot_ceres_residuals', 'plot_partial_residuals', 'predict', 'pseudo_rsquared', 'pvalues', 'remove_data', 'resid_anscombe', 'resid_anscombe_scaled', 'resid_anscombe_unscaled', 'resid_deviance', 'resid_pearson', 'resid_response', 'resid_working', 'save', 'scale', 'score_test', 'summary', 'summary2', 't_test', 't_test_pairwise', 'tvalues', 'use_t', 'wald_test', 'wald_test_terms']
#Fitted values:
poireg_fitted_values = poiregmodel.fittedvalues
print(poireg_fitted_values)
0      765.488164
1      789.438938
2      727.125304
3      814.320429
4      442.021498
          ...
748    533.872877
749    350.154433
750    475.726717
751    466.893617
752    900.034415
Length: 753, dtype: float64
#You can also manually calculate fitted values as follows:
beta_hat = poiregmodel.params.values
fvals = np.exp(X.dot(beta_hat))
print(fvals)
#Check that fvals coincides with poiregmodel.fittedvalues:
print(np.array_equal(fvals, poireg_fitted_values))
0      765.488164
1      789.438938
2      727.125304
3      814.320429
4      442.021498
          ...
748    533.872877
749    350.154433
750    475.726717
751    466.893617
752    900.034415
Length: 753, dtype: float64
True
Each fitted value $\hat{\mu}_i$ should ideally be close to the actual response $Y_i$. A plot of the fitted values against the observed response values should give an indication of the accuracy of the model.
import matplotlib.pyplot as plt
poireg_fitted_values = poiregmodel.fittedvalues
print(poireg_fitted_values)
plt.scatter(poireg_fitted_values, Y, alpha = 1, s = 20)
plt.plot([min(poireg_fitted_values), max(poireg_fitted_values)],[min(poireg_fitted_values), max(poireg_fitted_values)], color = 'red', linestyle = '--')
plt.xlabel('Fitted Values')
plt.ylabel('Observed Response')
plt.title('Fitted vs Observed Values')
plt.show()
0      765.488164
1      789.438938
2      727.125304
3      814.320429
4      442.021498
          ...
748    533.872877
749    350.154433
750    475.726717
751    466.893617
752    900.034415
Length: 753, dtype: float64
Here are some observations from this plot. Many points are far from the dashed 45-degree line. In particular, a number of women with observed hours equal to zero receive large fitted values, and several women with very large observed hours receive comparatively small fitted values. The observed responses also spread over a much wider range than the fitted values. The two ranges can be computed as follows:
print(f"Range of observed response: {[np.min(Y), np.max(Y)]}")
print(f"Range of fitted values: {[np.min(poireg_fitted_values), np.max(poireg_fitted_values)]}")
Range of observed response: [0, 4950]
Range of fitted values: [45.58196325662061, 2132.55829618613]
We can look at the data for the specific observations where there is a significant discrepancy between observed response and fitted values.
#Observations where Y = 0 and fitted values are large
condition = (Y == 0) & (poireg_fitted_values > 1500)
df1 = mroz.loc[condition, ['hours', 'kidslt6', 'age', 'educ', 'huswage', 'exper', 'expersq']]
df2 = pd.DataFrame(poireg_fitted_values[condition], columns=["Fitted Values"])
combined = pd.concat([df1.reset_index(drop=True), df2.reset_index(drop=True)], axis=1)
print(combined)
print(poiregmodel.params)
#It is easy to see why the model gives large predictions (fitted_values) to these observations:
#these women have no kids and have high experience.
   hours  kidslt6  age  educ  huswage  exper  expersq  Fitted Values
0      0        0   38    12   5.6922     19      361    1733.075193
1      0        0   43    12   2.9510     25      625    1882.829625
2      0        0   49    12   1.9286     32     1024    1666.775722
3      0        0   55    16   2.6701     33     1089    1571.905555
4      0        0   41    12   3.7875     18      324    1504.698908
const      6.936480
kidslt6   -0.807524
age       -0.042680
educ       0.052831
huswage   -0.020714
exper      0.120372
expersq   -0.001829
dtype: float64
#observations where the actual response is somewhat large but the fitted value is small
condition = (Y > 2000) & (poireg_fitted_values < 500)
df1 = mroz.loc[condition, ['hours', 'kidslt6', 'age', 'educ', 'huswage', 'exper', 'expersq']]
df2 = pd.DataFrame(poireg_fitted_values[condition], columns=["Fitted Values"])
combined = pd.concat([df1.reset_index(drop=True), df2.reset_index(drop=True)], axis=1)
print(combined)
print(poiregmodel.params)
#These women either have kids or have small exper and high age leading to small predictions (fitted values)
   hours  kidslt6  age  educ  huswage  exper  expersq  Fitted Values
0   2030        0   47    12   5.0870      6       36     452.821746
1   2480        1   39    10   8.6806     12      144     401.044507
2   2340        1   36    10   3.7129      1        1     174.582795
3   2149        0   45     8   4.5122      3        9     295.804050
4   3533        2   38    15   7.0000     17      289     352.452873
5   3000        0   51    14   8.8632      0        0     203.529784
const      6.936480
kidslt6   -0.807524
age       -0.042680
educ       0.052831
huswage   -0.020714
exper      0.120372
expersq   -0.001829
dtype: float64
Posterior predictive distributions offer a more principled way of model checking (compared to the basic method of plotting fitted values against observed responses). The fitted value is given by $\hat{\mu}_i := \exp \left(\hat{\beta}_0 + \hat{\beta}_1 x_{i1} + \dots + \hat{\beta}_m x_{im} \right)$. We always have some uncertainty associated with the coefficients $\hat{\beta}_0, \hat{\beta}_1, \dots, \hat{\beta}_m$, but this variability is not taken into account while calculating $\hat{\mu}_i$. In the last lecture, we saw that the variability can be captured by the posterior normal approximation: \begin{align*} \text{posterior} \approx N \left(\hat{\beta}, \left( - H\ell(\hat{\beta}) \right)^{-1}\right) = N \left(\hat{\beta}, \left(X^T M(\hat{\beta}) X \right)^{-1}\right) \end{align*} where $M(\beta)$ is the diagonal matrix with diagonal entries $\mu_1 = e^{x_1^T \beta}, \dots, \mu_n = e^{x_n^T \beta}$ and $\hat{\beta}$ is the MLE. Instead of the single fitted value $\hat{\mu}_i$, one can create a whole set of samples $\hat{\mu}_i^{(1)}, \dots, \hat{\mu}_i^{(N)}$ in the following way: for each $j = 1, \dots, N$, draw $\beta^{(j)} = (\beta_0^{(j)}, \dots, \beta_m^{(j)})$ from the normal approximation above and set $\hat{\mu}_i^{(j)} := \exp \left(\beta_0^{(j)} + \beta_1^{(j)} x_{i1} + \dots + \beta_m^{(j)} x_{im} \right)$.
Instead of comparing $Y_i$ to the single fitted value $\hat{\mu}_i$, it makes sense to compare $Y_i$ to the whole set of values $\hat{\mu}_i^{(1)}, \dots, \hat{\mu}_i^{(N)}$. Note also that the model specifies that $Y_i$ is Poisson with mean $\mu_i$. Based on this, an even better comparison is to draw samples $Y_i^{(1)}, \dots, Y_i^{(N)}$ with $Y_i^{(j)} \sim \text{Poisson}(\hat{\mu}_i^{(j)})$. The distribution approximated by these samples $Y_i^{(1)}, \dots, Y_i^{(N)}$ is called the posterior predictive distribution for the $i^{th}$ observation (or corresponding to the explanatory variable values $x_{i1}, x_{i2}, \dots, x_{im})$. If the observed $Y_i$ is extreme compared to the histogram of $Y_i^{(1)}, \dots, Y_i^{(N)}$, then we can conclude that the model is not accurate for the $i^{th}$ observation.
The following code illustrates how $Y_i^{(1)}, \dots, Y_i^{(N)}$ can be obtained. The first step is to draw samples of $\beta$ from the posterior normal approximation.
#Posterior Predictive Distributions:
#First obtain posterior samples:
cov_matrix = poiregmodel.cov_params() #this is the covariance matrix of the posterior normal approximation
print(cov_matrix)
#Manual computation of covariance matrix:
beta_hat = poiregmodel.params.values
log_muvec = np.dot(X, beta_hat)
muvec = np.exp(log_muvec)
M = np.diag(muvec)
Hessian = -X.T @ M @ X
Hessian_inv = np.linalg.inv(Hessian)
CovMat = -Hessian_inv
print(CovMat) #this coincides with poiregmodel.cov_params()
                const       kidslt6           age          educ       huswage
const    1.521851e-04 -1.094065e-05 -1.890439e-06 -4.802901e-06  1.794310e-07
kidslt6 -1.094065e-05  1.746699e-05  2.218647e-07 -1.421124e-07 -2.502671e-08
age     -1.890439e-06  2.218647e-07  4.501405e-08  1.130163e-08 -1.009965e-08
educ    -4.802901e-06 -1.421124e-07  1.130163e-08  4.008993e-07 -7.051506e-08
huswage  1.794310e-07 -2.502671e-08 -1.009965e-08 -7.051506e-08  1.441951e-07
exper   -2.275008e-06  2.447391e-07  9.708493e-09 -1.707776e-08  2.029966e-09
expersq  8.169060e-08 -6.410297e-09 -8.849655e-10  1.469846e-10  3.559604e-10

                exper       expersq
const   -2.275008e-06  8.169060e-08
kidslt6  2.447391e-07 -6.410297e-09
age      9.708493e-09 -8.849655e-10
educ    -1.707776e-08  1.469846e-10
huswage  2.029966e-09  3.559604e-10
exper    3.014745e-07 -8.400485e-09
expersq -8.400485e-09  2.661157e-10

[[ 1.52185070e-04 -1.09406532e-05 -1.89043951e-06 -4.80290153e-06  1.79431046e-07 -2.27500844e-06  8.16905999e-08]
 [-1.09406532e-05  1.74669961e-05  2.21864692e-07 -1.42112412e-07 -2.50267180e-08  2.44739159e-07 -6.41029810e-09]
 [-1.89043951e-06  2.21864692e-07  4.50140545e-08  1.13016300e-08 -1.00996487e-08  9.70849310e-09 -8.84965506e-10]
 [-4.80290153e-06 -1.42112412e-07  1.13016300e-08  4.00899280e-07 -7.05150573e-08 -1.70777559e-08  1.46984555e-10]
 [ 1.79431046e-07 -2.50267180e-08 -1.00996487e-08 -7.05150573e-08  1.44195100e-07  2.02996543e-09  3.55960395e-10]
 [-2.27500844e-06  2.44739159e-07  9.70849310e-09 -1.70777559e-08  2.02996543e-09  3.01474552e-07 -8.40048540e-09]
 [ 8.16905999e-08 -6.41029810e-09 -8.84965506e-10  1.46984555e-10  3.55960395e-10 -8.40048540e-09  2.66115699e-10]]
#Generating posterior beta samples:
n_samples = 1000
beta_samples = np.random.multivariate_normal(beta_hat, cov_matrix, n_samples)
print(beta_samples.shape)
print(beta_samples[:10])
(1000, 7)
[[ 6.92194536e+00 -8.03270772e-01 -4.24286435e-02  5.31064504e-02 -2.08870726e-02  1.20842234e-01 -1.85190786e-03]
 [ 6.93385598e+00 -8.08114326e-01 -4.27041476e-02  5.33726205e-02 -2.09114608e-02  1.20040017e-01 -1.81940021e-03]
 [ 6.92838759e+00 -8.04950338e-01 -4.24892209e-02  5.26605631e-02 -2.02854229e-02  1.20542260e-01 -1.84123327e-03]
 [ 6.92248199e+00 -8.06713238e-01 -4.25091360e-02  5.32710713e-02 -2.16870422e-02  1.21372727e-01 -1.85441246e-03]
 [ 6.95677092e+00 -8.10321674e-01 -4.31309003e-02  5.25048822e-02 -2.07995923e-02  1.20545966e-01 -1.82656077e-03]
 [ 6.94115575e+00 -8.03133123e-01 -4.27210462e-02  5.25318769e-02 -2.03102277e-02  1.20224158e-01 -1.82583563e-03]
 [ 6.92293173e+00 -8.08470247e-01 -4.26668913e-02  5.33084676e-02 -2.05542698e-02  1.20943282e-01 -1.84161588e-03]
 [ 6.92499787e+00 -8.10092566e-01 -4.26821485e-02  5.41082290e-02 -2.06054555e-02  1.19823778e-01 -1.81369128e-03]
 [ 6.94827919e+00 -8.07833747e-01 -4.28014843e-02  5.24328534e-02 -2.06433929e-02  1.19881541e-01 -1.81066192e-03]
 [ 6.92673869e+00 -8.07883767e-01 -4.23639834e-02  5.29246453e-02 -2.10721353e-02  1.20889993e-01 -1.85660273e-03]]
#Posterior Predictive Distributions for fixed values of i
row = 483 #row plays the role of i in the mathematical notation
#this row corresponds to the observation with the smallest fitted value
#row = 34
print(mroz.loc[row, ['hours', 'kidslt6', 'age', 'educ', 'huswage', 'exper', 'expersq']])
print(poireg_fitted_values[row])
poisson_pred_Y_row = np.zeros(n_samples)
for j in range(n_samples):
    poisson_mean = np.exp(np.dot(beta_samples[j], X.iloc[row]))
    poisson_pred_Y_row[j] = np.random.poisson(poisson_mean)
#poisson_pred_Y_row contains the values Y_i^{(j)}, j = 1,...,N
#We can look at the histogram of poisson_pred_Y_row as follows:
plt.figure(figsize = (8, 6))
plt.hist(poisson_pred_Y_row, bins = 100)
plt.xlabel('Value')
plt.ylabel('Count')
plt.title('Histogram of Posterior Predictive Samples')
plt.grid(axis='y')
plt.show()
#The actual value of Y for row 483 equals 0 which is quite far from the histogram (the lowest end of the histogram is still larger than 25)
#This plot reveals that the model is not good for predicting observations where Y = 0
hours       0.000
kidslt6     3.000
age        31.000
educ       13.000
huswage    19.444
exper       3.000
expersq     9.000
Name: 483, dtype: float64
45.58196325662061
Note that 45.58 is the smallest of all the fitted values. Thus, for the other observations with hours $= 0$, the posterior predictive distributions will also fail to cover 0. This means that the Poisson regression model does quite a poor job of predicting the zeros. It does a similarly poor job for observations with very high values of hours.
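A quick count shows how widespread this problem is; every observation with zero hours suffers from it.
#Number of observations with hours equal to zero:
print((Y == 0).sum())
#Since the smallest fitted value is 45.58, the posterior predictive
#distribution fails to cover 0 for every one of these observations.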
#Posterior Predictive Distributions for fixed values of i
row = 291 #here the actual response value is quite high (= 3686)
#row = 34
print(mroz.loc[row, ['hours', 'kidslt6', 'age', 'educ', 'huswage', 'exper', 'expersq']])
print(poireg_fitted_values[row])
poisson_pred_Y_row = np.zeros(n_samples)
for j in range(n_samples):
    poisson_mean = np.exp(np.dot(beta_samples[j], X.iloc[row]))
    poisson_pred_Y_row[j] = np.random.poisson(poisson_mean)
#poisson_pred_Y_row contains the values Y_i^{(j)}, j = 1,...,N
#We can look at the histogram of poisson_pred_Y_row as follows:
plt.figure(figsize = (8, 6))
plt.hist(poisson_pred_Y_row, bins = 100)
plt.xlabel('Value')
plt.ylabel('Count')
plt.title('Histogram of Posterior Predictive Samples')
plt.grid(axis='y')
plt.show()
#Here the histogram is very far from the actual observation of 3686.
#The model is quite inaccurate for very high values of hours.
hours      3686.0000
kidslt6       0.0000
age          32.0000
educ         14.0000
huswage       6.6788
exper        11.0000
expersq     121.0000
Name: 291, dtype: float64
1443.5070767068787
There are, however, many observations where the prediction is good. One of them is illustrated below.
#Posterior Predictive Distributions for fixed values of i
row = 34
print(mroz.loc[row, ['hours', 'kidslt6', 'age', 'educ', 'huswage', 'exper', 'expersq']])
print(poireg_fitted_values[row])
poisson_pred_Y_row = np.zeros(n_samples)
for j in range(n_samples):
    poisson_mean = np.exp(np.dot(beta_samples[j], X.iloc[row]))
    poisson_pred_Y_row[j] = np.random.poisson(poisson_mean)
#poisson_pred_Y_row contains the values Y_i^{(j)}, j = 1,...,N
#We can look at the histogram of poisson_pred_Y_row as follows:
plt.figure(figsize = (8, 6))
plt.hist(poisson_pred_Y_row, bins = 100)
plt.xlabel('Value')
plt.ylabel('Count')
plt.title('Histogram of Posterior Predictive Samples')
plt.grid(axis='y')
plt.show()
#Here the posterior predictive histogram nicely covers the actual observed response 2081.
hours      2081.0000
kidslt6       0.0000
age          45.0000
educ         16.0000
huswage       7.5449
exper        27.0000
expersq     729.0000
Name: 34, dtype: float64
2042.4646924796739
Overdispersion refers to the situation where the data exhibit more variability than what is expected under the assumptions of the Poisson distribution. In this example, the actual spread of the response values is much larger than the spread of the fitted values, which is an indication of overdispersion. This was also seen in the posterior predictive histograms, which failed to cover the actual response when it was very low or very high. When overdispersion is detected in Poisson regression, a common strategy is to use Negative Binomial regression. Unlike the Poisson distribution, which is parameterized by only one number (the mean), the Negative Binomial distribution is parametrized by two parameters, so the additional parameter can account for the overdispersion.
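A standard numerical check for overdispersion is the ratio of the Pearson chi-squared statistic to the residual degrees of freedom; under a correctly specified Poisson model this ratio should be close to 1.
#Overdispersion check: Pearson chi2 / residual degrees of freedom
#(close to 1 under a well-specified Poisson model)
print(poiregmodel.pearson_chi2 / poiregmodel.df_resid) #far above 1 here (around 885)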
The Negative Binomial is also a discrete distribution used to model counts. It can be parametrized in many ways, but one convenient way is in terms of the mean $\mu$ and the dispersion parameter $\alpha$. In this parametrization, $Y \sim NB(\mu, \alpha)$ means that \begin{align*} \mathbb{P}(Y = y) = \frac{\Gamma(y + 1/\alpha)}{\Gamma(1/\alpha) \, y!} \left(1 + \alpha \mu \right)^{-1/\alpha} \left(\frac{\alpha \mu}{1 + \alpha \mu} \right)^y \end{align*} for $y = 0, 1, 2, \dots$. This is the distribution of the number of failures before the $k^{th}$ success where $k = 1/\alpha$ and each trial has success probability $p = (1 + \alpha \mu)^{-1}$. In general, $\alpha$ can be any positive real number and $1/\alpha$ need not be an integer.
The mean of $NB(\mu, \alpha)$ equals $\mu$ and the variance equals $\mu + \alpha \mu^2$. When $\alpha \downarrow 0$, it can be shown that the above expression for $\mathbb{P}(Y = y)$ converges to $\frac{e^{-\mu}}{y!} \mu^y$. Thus when $\alpha \approx 0$, the Negative Binomial distribution $NB(\mu, \alpha)$ is essentially the same as the Poisson distribution with mean $\mu$. Since the variance $\mu + \alpha \mu^2$ exceeds the mean $\mu$, the Negative Binomial distribution exhibits overdispersion; for the Poisson distribution, in contrast, there is no overdispersion, as the mean and variance coincide (both equal $\mu$). The Negative Binomial regression model is \begin{align*} \log \mu = \beta_0 + \beta_1 x_1 + \dots + \beta_m x_m. \end{align*} This model specification coincides with that of Poisson regression. Usually the parameter $\alpha$ is treated as unknown and is estimated together with $\beta_0, \dots, \beta_m$ via maximum likelihood.
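As a sanity check on this parametrization, we can simulate from $NB(\mu, \alpha)$ with NumPy. Note that np.random.negative_binomial(n, p) draws the number of failures before the $n^{th}$ success, so we set $n = 1/\alpha$ and $p = (1 + \alpha \mu)^{-1}$; the values $\mu = 1000$ and $\alpha = 0.5$ below are arbitrary illustrative choices.
#Simulation check of the mean and variance of NB(mu, alpha):
mu, alpha = 1000, 0.5
draws = np.random.negative_binomial(1/alpha, 1/(1 + alpha*mu), size=100000)
print(draws.mean())                  #should be close to mu = 1000
print(draws.var(), mu + alpha*mu**2) #both should be close to 501000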
There are two ways of fitting Negative Binomial regression using statsmodels: sm.NegativeBinomial(Y, X).fit() and sm.GLM(Y, X, family = sm.families.NegativeBinomial()).fit(). The first treats $\alpha$ as an unknown parameter and estimates it, and the model output then includes a row for the parameter $\alpha$. The second expects to be given a value of $\alpha$, and defaults to $\alpha = 1$ if no value is supplied.
#Negative Binomial Regression
Y = mroz['hours']
X = mroz[['kidslt6', 'age', 'educ', 'huswage', 'exper', 'expersq']]
X = sm.add_constant(X) # Add a constant (intercept) to the model
# Fit the Negative Binomial regression model
#First Way:
negbinom_model = sm.NegativeBinomial(Y, X).fit() #this treats alpha as an unknown parameter
#and estimates it (the estimate here is about 7.42)
# Display the summary
print(negbinom_model.summary())
#Second Way:
negbinom_model = sm.GLM(Y, X, family=sm.families.NegativeBinomial(alpha=7)).fit()
#alpha = 7 is used because it is essentially the estimate of alpha produced by sm.NegativeBinomial above.
Optimization terminated successfully.
         Current function value: 5.737933
         Iterations: 27
         Function evaluations: 42
         Gradient evaluations: 31
                 Generalized Linear Model Regression Results
==============================================================================
Dep. Variable:                  hours   No. Observations:                  753
Model:                            GLM   Df Residuals:                      746
Model Family:        NegativeBinomial   Df Model:                            6
Link Function:                    Log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -5519.7
Date:                Thu, 05 Oct 2023   Deviance:                       4304.2
Time:                        07:13:29   Pearson chi2:                 2.13e+03
No. Iterations:                    12   Pseudo R-squ. (CS):             0.4263
Covariance Type:            nonrobust
==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const          7.1446      0.326     21.946      0.000       6.507       7.783
kidslt6       -1.0319      0.078    -13.188      0.000      -1.185      -0.879
age           -0.0550      0.005    -10.118      0.000      -0.066      -0.044
educ           0.0741      0.017      4.321      0.000       0.040       0.108
huswage       -0.0372      0.009     -4.056      0.000      -0.055      -0.019
exper          0.1460      0.013     11.045      0.000       0.120       0.172
expersq       -0.0023      0.000     -5.383      0.000      -0.003      -0.001
==============================================================================
/srv/conda/envs/notebook/lib/python3.11/site-packages/statsmodels/discrete/discrete_model.py:3661: RuntimeWarning: overflow encountered in exp
  return np.exp(linpred)
/srv/conda/envs/notebook/lib/python3.11/site-packages/statsmodels/discrete/discrete_model.py:3377: RuntimeWarning: divide by zero encountered in log
  llf = coeff + size*np.log(prob) + endog*np.log(1-prob)
/srv/conda/envs/notebook/lib/python3.11/site-packages/statsmodels/discrete/discrete_model.py:3468: RuntimeWarning: invalid value encountered in multiply
  dparams = exog*a1 * (y-mu)/(mu+a1)
/srv/conda/envs/notebook/lib/python3.11/site-packages/statsmodels/discrete/discrete_model.py:3468: RuntimeWarning: invalid value encountered in divide
  dparams = exog*a1 * (y-mu)/(mu+a1)
/srv/conda/envs/notebook/lib/python3.11/site-packages/statsmodels/discrete/discrete_model.py:3471: RuntimeWarning: invalid value encountered in divide
  - np.log(a1+mu) - (y-mu)/(a1+mu)).sum() * da1
/srv/conda/envs/notebook/lib/python3.11/site-packages/statsmodels/discrete/discrete_model.py:3661: RuntimeWarning: overflow encountered in exp
  return np.exp(linpred)
/srv/conda/envs/notebook/lib/python3.11/site-packages/statsmodels/discrete/discrete_model.py:3377: RuntimeWarning: divide by zero encountered in log
  llf = coeff + size*np.log(prob) + endog*np.log(1-prob)
/srv/conda/envs/notebook/lib/python3.11/site-packages/statsmodels/genmod/families/family.py:1367: ValueWarning: Negative binomial dispersion parameter alpha not set. Using default value alpha=1.0.
  warnings.warn("Negative binomial dispersion parameter alpha not "
#The coefficient estimates differ from the Poisson regression estimates but are broadly similar.
#The coefficient of kidslt6 now has the following interpretation (note that the model is still log mu = b0 + b1 x1 + ... + bm xm):
print((np.exp(negbinom_model.params['kidslt6']) - 1)*100)
-64.46921135832466
#Fitted values and their range
negbinom_fitted_values = negbinom_model.fittedvalues
print(f"Range of observed response: {[np.min(Y), np.max(Y)]}")
print(f"Range of fitted values for Poisson regression: {[np.min(poireg_fitted_values), np.max(poireg_fitted_values)]}")
print(f"Range of fitted values for Negative Binomial regression: {[np.min(negbinom_fitted_values), np.max(negbinom_fitted_values)]}")
#Note the wider spread of fitted values for the Negative Binomial regression. This mitigates the overdispersion issue.
Range of observed response: [0, 4950]
Range of fitted values for Poisson regression: [45.58196325662061, 2132.55829618613]
Range of fitted values for Negative Binomial regression: [20.113601308712315, 2846.983218999344]
#Plotting Fitted Values and Observed Response Values
plt.scatter(negbinom_fitted_values, Y, alpha = 1, s = 3)
plt.scatter(poireg_fitted_values, Y, alpha = 1, s = 3, color = 'red')
plt.xlabel('Fitted Values')
plt.ylabel('Observed Response')
plt.title('Fitted vs Observed Values')
plt.show()
#The negative binomial fits show more dispersion
plt.scatter(negbinom_fitted_values, poireg_fitted_values, alpha = 1, s = 3)
plt.xlabel('Negative Binomial Fitted Values')
plt.ylabel('Poisson Fitted Values')
x = np.linspace(min(negbinom_fitted_values), max(negbinom_fitted_values), 100)
plt.plot(x, x, label='y=x', linestyle='--', color='black')
plt.show()
One can plot the posterior predictive distributions (as in the case of Poisson regression) and compare the histograms with the actual response values, especially for the problematic cases with very low or very high responses. These histograms should be much closer to the actual responses than the corresponding plots for Poisson regression.
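Here is a sketch of such a posterior predictive check for the Negative Binomial model, reusing row 483 (the zero-hours observation that the Poisson model handled poorly). We hold alpha fixed at 7, the value used when fitting negbinom_model above, and draw from $NB(\mu, \alpha)$ via the failures-before-success parametrization.
#Posterior predictive check for the Negative Binomial model (alpha fixed at 7)
alpha = 7
row = 483
nb_beta_hat = negbinom_model.params.values
nb_cov = negbinom_model.cov_params()
nb_beta_samples = np.random.multivariate_normal(nb_beta_hat, nb_cov, n_samples)
nb_pred_Y_row = np.zeros(n_samples)
for j in range(n_samples):
    nb_mean = np.exp(np.dot(nb_beta_samples[j], X.iloc[row]))
    #np.random.negative_binomial(n, p) draws failures before the n-th success:
    nb_pred_Y_row[j] = np.random.negative_binomial(1/alpha, 1/(1 + alpha*nb_mean))
plt.figure(figsize = (8, 6))
plt.hist(nb_pred_Y_row, bins = 100)
plt.xlabel('Value')
plt.ylabel('Count')
plt.title('Histogram of Posterior Predictive Samples (Negative Binomial)')
plt.grid(axis='y')
plt.show()
#Unlike the Poisson version, this histogram should place mass at or near 0.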
We now look at the regression models studied so far (linear regression, Poisson regression and Negative Binomial regression) together with logistic regression (which you may have studied in a different course) through a unified lens. These models can be written as follows:
- Linear regression: $Y \sim N(\mu, \sigma^2)$ with $\mu = \beta_0 + \beta_1 x_1 + \dots + \beta_m x_m$, i.e., $g(\mu) = \mu$ (identity link).
- Logistic regression: $Y \sim \text{Bernoulli}(\mu)$ with $\log \frac{\mu}{1 - \mu} = \beta_0 + \beta_1 x_1 + \dots + \beta_m x_m$ (logit link).
- Poisson regression: $Y \sim \text{Poisson}(\mu)$ with $\log \mu = \beta_0 + \beta_1 x_1 + \dots + \beta_m x_m$ (log link).
- Negative Binomial regression: $Y \sim NB(\mu, \alpha)$ with $\log \mu = \beta_0 + \beta_1 x_1 + \dots + \beta_m x_m$ (log link).
Thus, in each of these models, we first have a modeling assumption for the response $Y$. Then the mean $\mu$ of $Y$ is connected to the covariates (or explanatory variables) as $g(\mu) = \beta_0 + \beta_1 x_1 + \dots + \beta_m x_m$. This function $g$ is called the link function and its inverse $g^{-1}$ is called the inverse link. For example, for logistic regression $g(\mu) = \log \frac{\mu}{1 - \mu}$ is the link function and $g^{-1}(z) = \frac{e^z}{1 + e^z}$ is the inverse link function.
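These links are exposed directly in statsmodels; the following small illustration (not part of the model fitting above) checks the log link used by Poisson regression and the logit inverse link used by logistic regression:
#Link functions as implemented in statsmodels:
poisson_link = sm.families.Poisson().link #log link
logit_link = sm.families.Binomial().link  #logit link
print(poisson_link(2.0), np.log(2.0))                         #g(mu) = log(mu)
print(logit_link.inverse(1.0), np.exp(1.0)/(1 + np.exp(1.0))) #g^{-1}(z) = e^z/(1+e^z)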
Parameter estimation of $\beta$ is done by maximum likelihood, typically using an iterative algorithm such as Newton's method. The posterior distribution of $\beta$ is approximated by the multivariate normal distribution centered at the MLE with covariance equal to the inverse of the negative Hessian of the log-likelihood evaluated at the MLE.