Probabilistic Programming using PyMC, and Graphical Models¶
Today, we shall discuss the following two topics:
- Probabilistic Programming using PyMC
- Graphical Models
Practical Bayesian inference is often carried out through specialized probabilistic programming libraries such as PyMC. In these libraries, one inputs the probability model (prior and likelihood), and the library outputs Monte Carlo samples that approximate the posterior distributions.
To illustrate probabilistic programming with PyMC, consider the following simple probability model. We have four binary random variables: overweight, smoking, heart disease and cough. Suppose we specify a probability model for these random variables as: \begin{align*} & \text{overweight} \sim \text{Bernoulli}(0.1) \\ & \text{smoking} \sim \text{Bernoulli}(0.1) \\ & \text{heart disease} \mid \text{overweight} = 1, \text{smoking} = 1 \sim \text{Bernoulli}(0.75) \\ & \text{heart disease} \mid \text{overweight} = 1, \text{smoking} = 0 \sim \text{Bernoulli}(0.5) \\ & \text{heart disease} \mid \text{overweight} = 0, \text{smoking} = 1 \sim \text{Bernoulli}(0.4) \\ & \text{heart disease} \mid \text{overweight} = 0, \text{smoking} = 0 \sim \text{Bernoulli}(0.1) \\ & \text{cough} \mid \text{smoking} = 1 \sim \text{Bernoulli}(0.6) \\ & \text{cough} \mid \text{smoking} = 0 \sim \text{Bernoulli}(0.05) \end{align*} The above model starts by specifying the marginal distribution of overweight and smoking. Then it specifies the conditional distribution of heart disease conditional on overweight and smoking. Finally, it specifies the conditional distribution of cough conditional on smoking. Based on this model, we might be interested in several probability questions such as:
- What is the marginal distribution of heart disease i.e., $\mathbb{P}(\text{heart disease} = 1)$?
- What is the conditional distribution of overweight conditional on heart disease i.e., $\mathbb{P}(\text{overweight} = 1 \mid \text{heart disease} = 1)$?
- What is the conditional distribution of smoking conditional on cough i.e., $\mathbb{P}(\text{smoking} = 1 \mid \text{cough} = 1)$?
All these probabilities can be calculated exactly using the model specification. One way of doing this is to note that for every binary $b_o, b_s, b_h, b_c \in \{0, 1\}$, we can write \begin{align*} &\mathbb{P}\left(\text{overweight} = b_o, \text{smoking} = b_s, \text{heart} = b_h, \text{cough} = b_c \right) \\ &= \mathbb{P}\left(\text{overweight} = b_o \right) \mathbb{P}\left(\text{smoking} = b_s \right) \mathbb{P} \left(\text{heart} = b_h \mid \text{overweight} = b_o, \text{smoking} = b_s \right) \mathbb{P} \left(\text{cough} = b_c \mid \text{smoking} = b_s \right) \end{align*} and these probabilities can then be read off from the model specification. For example, with $b_o = b_s = b_h = b_c = 1$, we get \begin{align*} &\mathbb{P}\left(\text{overweight} = 1, \text{smoking} = 1, \text{heart} = 1, \text{cough} = 1 \right) \\ &= \mathbb{P}\left(\text{overweight} = 1 \right) \mathbb{P}\left(\text{smoking} = 1 \right) \mathbb{P} \left(\text{heart} = 1 \mid \text{overweight} = 1, \text{smoking} = 1 \right) \mathbb{P} \left(\text{cough} = 1 \mid \text{smoking} = 1 \right) \\ &= 0.1 \times 0.1 \times 0.75 \times 0.6 \end{align*} From the full joint distribution of the four variables, all the probabilities asked in the questions can be calculated.
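Before turning to PyMC, it may help to see this enumeration done explicitly. The following short sketch builds the full joint table over the $2^4 = 16$ outcomes and answers all three questions exactly; the helper names (p_heart_given, p_cough_given, bern) are ours, purely for illustration, and are not part of the PyMC code used later.
#Brute-force enumeration of the full joint distribution (for illustration only)
import itertools
def p_heart_given(o, s):
    return {(1, 1): 0.75, (1, 0): 0.5, (0, 1): 0.4, (0, 0): 0.1}[(o, s)]
def p_cough_given(s):
    return 0.6 if s == 1 else 0.05
def bern(p, b):
    #P(X = b) when X ~ Bernoulli(p)
    return p if b == 1 else 1 - p
joint = {}
for o, s, h, c in itertools.product([0, 1], repeat=4):
    joint[(o, s, h, c)] = (bern(0.1, o) * bern(0.1, s)
                           * bern(p_heart_given(o, s), h) * bern(p_cough_given(s), c))
p_heart1 = sum(p for (o, s, h, c), p in joint.items() if h == 1)
p_cough1 = sum(p for (o, s, h, c), p in joint.items() if c == 1)
print(p_heart1)                                                                     #0.1695
print(sum(p for (o, s, h, c), p in joint.items() if o == 1 and h == 1) / p_heart1)  #about 0.3097
print(sum(p for (o, s, h, c), p in joint.items() if s == 1 and c == 1) / p_cough1)  #about 0.5714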
It is clear that this approach will be quite tedious especially if we are dealing with more than four random variables. Probabilistic Programming Libraries (such as PyMC) automate the calculation of these probabilities. However, instead of calculating probabilities exactly, they output "samples" from which probabilities can be approximated. Here is how this works for this simple health probability model.
#Import the necessary libraries:
import arviz as az
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import pymc3 as pm #pymc3 is the previous version of pymc. We shall switch to pymc as soon as it gets installed in datahub
#Specify the Model in PyMC:
health_model = pm.Model()
with health_model:
    overweight = pm.Bernoulli('overweight', 0.1)
    smoking = pm.Bernoulli('smoking', 0.1)
    # Deterministic probabilities for 'heart' based on conditions
    p_heart = pm.Deterministic('p_heart', pm.math.switch(overweight,
                               pm.math.switch(smoking, 0.75, 0.5),
                               pm.math.switch(smoking, 0.4, 0.1)))
    # 'heart' random variable
    heart = pm.Bernoulli('heart', p_heart)
    # Deterministic probability for 'cough' based on 'smoking'
    p_cough = pm.Deterministic('p_cough', pm.math.switch(smoking, 0.6, 0.05))
    # 'cough' random variable
    cough = pm.Bernoulli('cough', p_cough)
    #This ends the specification of the model.
    #To obtain samples from PyMC, run the following:
    idata = pm.sample(5000, chains = 4, return_inferencedata = True)
    #This will generate 5000*4 = 20000 posterior samples from (overweight, smoking, heart, cough).
Multiprocess sampling (4 chains in 4 jobs) BinaryGibbsMetropolis: [overweight, smoking, heart, cough]
Sampling 4 chains for 1_000 tune and 5_000 draw iterations (4_000 + 20_000 draws total) took 5 seconds.
print(idata.posterior)
overweight_samples = idata.posterior['overweight'].values.flatten()
smoking_samples = idata.posterior['smoking'].values.flatten()
heart_samples = idata.posterior['heart'].values.flatten()
cough_samples = idata.posterior['cough'].values.flatten()
all_samples = np.column_stack((overweight_samples, smoking_samples, heart_samples, cough_samples))
print(all_samples[:80])
<xarray.Dataset>
Dimensions:     (chain: 4, draw: 5000)
Coordinates:
  * chain       (chain) int64 0 1 2 3
  * draw        (draw) int64 0 1 2 3 4 5 6 ... 4994 4995 4996 4997 4998 4999
Data variables:
    overweight  (chain, draw) int64 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
    smoking     (chain, draw) int64 0 0 0 0 0 0 0 0 0 0 ... 0 1 1 0 0 0 0 0 0 0
    heart       (chain, draw) int64 0 0 0 0 0 0 0 0 0 0 ... 0 1 0 1 0 0 0 0 0 0
    cough       (chain, draw) int64 0 0 0 0 0 0 0 0 0 0 ... 0 0 1 0 0 0 0 0 0 0
    p_heart     (chain, draw) float64 0.1 0.1 0.1 0.1 0.1 ... 0.1 0.1 0.1 0.1
    p_cough     (chain, draw) float64 0.05 0.05 0.05 0.05 ... 0.05 0.05 0.05
Attributes:
    created_at:                  2023-09-19T23:03:00.075509
    arviz_version:               0.12.1
    inference_library:           pymc3
    inference_library_version:   3.11.2
    sampling_time:               5.032527685165405
    tuning_steps:                1000
[[0 0 0 0]
 [0 0 0 0]
 [0 0 0 0]
 [0 0 0 0]
 [0 0 0 0]
 ...
 [1 0 0 0]
 [0 0 0 0]]
(first 80 rows of (overweight, smoking, heart, cough) samples)
With these samples, the questions we posed previously can be answered. For example, the marginal distribution of heart disease can be calculated as:
#Marginal probability that heart = 1:
print(np.sum(heart_samples)/len(heart_samples))
#This should be compared to the actual value of the probability which can be calculated (more tediously) as:
p_overweight = 0.1
p_smoking = 0.1
# Conditional probabilities for heart
p_heart_given_overweight_and_smoking = 0.75
p_heart_given_overweight_and_not_smoking = 0.5
p_heart_given_not_overweight_and_smoking = 0.4
p_heart_given_not_overweight_and_not_smoking = 0.1
# Calculate the marginal probability of heart being 1
p_heart = (
p_overweight * p_smoking * p_heart_given_overweight_and_smoking +
p_overweight * (1 - p_smoking) * p_heart_given_overweight_and_not_smoking +
(1 - p_overweight) * p_smoking * p_heart_given_not_overweight_and_smoking +
(1 - p_overweight) * (1 - p_smoking) * p_heart_given_not_overweight_and_not_smoking
)
print(p_heart)
0.1631
0.16950000000000004
The second question asked for the conditional distribution of overweight conditional on heart disease i.e., $\mathbb{P}(\text{overweight} = 1 \mid \text{heart disease} = 1)$. We can again calculate this using the previous samples: we keep only those samples for which heart = 1, and then compute the proportion of these samples in which overweight also equals 1. This can be done as follows:
#Calculating the conditional probability of overweight = 1 given heart = 1 from the posterior samples.
mask = (heart_samples == 1)
print(sum(mask)) #this means that many of the original samples are discarded for this calculation.
count_overweight_given_heart = sum(overweight_samples[mask] == 1)
required_probability = count_overweight_given_heart/sum(mask)
print(required_probability)
#Exact calculation
print((p_overweight * p_smoking * p_heart_given_overweight_and_smoking + p_overweight * (1 - p_smoking) * p_heart_given_overweight_and_not_smoking) / p_heart)
3262
0.30257510729613735
0.30973451327433627
Thus even though we started with 20000 posterior samples, only about 3200 of them have heart disease equal to 1. This means that this probability is approximated from only about 3200 samples (as opposed to 20000). In PyMC, if you want to generate samples conditional on certain values (such as $\text{heart disease} = 1$ in the above example), you supply this information to the model at the specification stage through the observed argument. This avoids wasting samples as above. This is illustrated below.
#PyMC model when heart is pre-specified as 1.
health_model_1 = pm.Model()
with health_model_1:
    overweight = pm.Bernoulli('overweight', 0.1)
    smoking = pm.Bernoulli('smoking', 0.1)
    # Deterministic probabilities for 'heart' based on conditions
    p_heart = pm.Deterministic('p_heart', pm.math.switch(overweight,
                               pm.math.switch(smoking, 0.75, 0.5),
                               pm.math.switch(smoking, 0.4, 0.1)))
    # 'heart' random variable
    heart = pm.Bernoulli('heart', p_heart, observed = 1)
    #observed = 1 means we want heart to be fixed at the observed value of 1.
    #If we want heart to be fixed at 0, we would say observed = 0.
    # Deterministic probability for 'cough' based on 'smoking'
    p_cough = pm.Deterministic('p_cough', pm.math.switch(smoking, 0.6, 0.05))
    # 'cough' random variable
    cough = pm.Bernoulli('cough', p_cough)
    #This ends the specification of the model.
    #To obtain samples from PyMC, run the following:
    idata = pm.sample(5000, chains = 4, return_inferencedata = True)
    #This will generate 5000*4 = 20000 posterior samples of (overweight, smoking, cough) conditional on heart = 1.
Multiprocess sampling (4 chains in 4 jobs) BinaryGibbsMetropolis: [overweight, smoking, cough]
Sampling 4 chains for 1_000 tune and 5_000 draw iterations (4_000 + 20_000 draws total) took 4 seconds.
print(idata.posterior)
overweight_samples = idata.posterior['overweight'].values.flatten()
smoking_samples = idata.posterior['smoking'].values.flatten()
cough_samples = idata.posterior['cough'].values.flatten()
all_samples = np.column_stack((overweight_samples, smoking_samples, cough_samples))
print(all_samples[:80])
<xarray.Dataset>
Dimensions:     (chain: 4, draw: 5000)
Coordinates:
  * chain       (chain) int64 0 1 2 3
  * draw        (draw) int64 0 1 2 3 4 5 6 ... 4994 4995 4996 4997 4998 4999
Data variables:
    overweight  (chain, draw) int64 1 0 1 0 1 0 0 0 0 1 ... 0 0 0 0 0 0 0 1 1 0
    smoking     (chain, draw) int64 0 0 0 0 0 0 0 0 0 0 ... 1 1 0 0 0 0 0 0 0 0
    cough       (chain, draw) int64 0 0 0 0 0 0 0 0 0 0 ... 1 0 0 0 0 0 0 0 0 0
    p_heart     (chain, draw) float64 0.5 0.1 0.5 0.1 0.5 ... 0.1 0.5 0.5 0.1
    p_cough     (chain, draw) float64 0.05 0.05 0.05 0.05 ... 0.05 0.05 0.05
Attributes:
    created_at:                  2023-09-19T23:11:52.450208
    arviz_version:               0.12.1
    inference_library:           pymc3
    inference_library_version:   3.11.2
    sampling_time:               4.087341785430908
    tuning_steps:                1000
[[1 0 0]
 [0 0 0]
 [1 0 0]
 [0 0 0]
 [1 0 0]
 ...
 [1 0 0]
 [0 0 0]]
(first 80 rows of (overweight, smoking, cough) samples given heart = 1)
With these samples (conditioned on $\text{heart} = 1$), we can calculate the conditional probability of overweight given heart disease as:
#Conditional probability that overweight = 1 given heart = 1:
print(np.sum(overweight_samples)/len(overweight_samples))
#This should be close to the exact value of about 0.3097 computed earlier,
#and it is now based on all 20000 samples rather than only those with heart = 1.
0.31235
For the third question, we need to calculate the conditional distribution of smoking conditional on cough i.e., $\mathbb{P}(\text{smoking}=1∣\text{cough}=1)$. This can again be done by insisting on $\text{cough} = 1$ in the PyMC model specification as follows.
#PyMC model when cough is pre-specified as 1.
health_model_2 = pm.Model()
with health_model_2:
    overweight = pm.Bernoulli('overweight', 0.1)
    smoking = pm.Bernoulli('smoking', 0.1)
    # Deterministic probabilities for 'heart' based on conditions
    p_heart = pm.Deterministic('p_heart', pm.math.switch(overweight,
                               pm.math.switch(smoking, 0.75, 0.5),
                               pm.math.switch(smoking, 0.4, 0.1)))
    # 'heart' random variable (not observed in this model)
    heart = pm.Bernoulli('heart', p_heart)
    # Deterministic probability for 'cough' based on 'smoking'
    p_cough = pm.Deterministic('p_cough', pm.math.switch(smoking, 0.6, 0.05))
    # 'cough' random variable, fixed at the observed value of 1
    cough = pm.Bernoulli('cough', p_cough, observed = 1)
    #This ends the specification of the model.
    #To obtain samples from PyMC, run the following:
    idata = pm.sample(5000, chains = 4, return_inferencedata = True)
    #This will generate 5000*4 = 20000 posterior samples of (overweight, smoking, heart) conditional on cough = 1.
Multiprocess sampling (4 chains in 4 jobs) BinaryGibbsMetropolis: [overweight, smoking, heart]
Sampling 4 chains for 1_000 tune and 5_000 draw iterations (4_000 + 20_000 draws total) took 4 seconds.
print(idata.posterior)
overweight_samples = idata.posterior['overweight'].values.flatten()
smoking_samples = idata.posterior['smoking'].values.flatten()
heart_samples = idata.posterior['heart'].values.flatten()
all_samples = np.column_stack((overweight_samples, smoking_samples, heart_samples))
#Required conditional probability:
print(np.sum(smoking_samples)/len(smoking_samples))
<xarray.Dataset>
Dimensions:     (chain: 4, draw: 5000)
Coordinates:
  * chain       (chain) int64 0 1 2 3
  * draw        (draw) int64 0 1 2 3 4 5 6 ... 4994 4995 4996 4997 4998 4999
Data variables:
    overweight  (chain, draw) int64 0 0 0 0 0 0 0 0 0 0 ... 0 1 1 1 0 0 0 0 0 0
    smoking     (chain, draw) int64 1 0 0 1 0 1 1 1 1 0 ... 1 1 1 0 1 0 1 0 1 0
    heart       (chain, draw) int64 0 0 1 0 0 0 1 0 1 0 ... 0 1 0 1 1 0 0 0 0 0
    p_heart     (chain, draw) float64 0.4 0.1 0.1 0.4 0.1 ... 0.4 0.1 0.4 0.1
    p_cough     (chain, draw) float64 0.6 0.05 0.05 0.6 ... 0.6 0.05 0.6 0.05
Attributes:
    created_at:                  2023-09-19T23:13:30.935572
    arviz_version:               0.12.1
    inference_library:           pymc3
    inference_library_version:   3.11.2
    sampling_time:               4.06363320350647
    tuning_steps:                1000
0.56815
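The sampled value can be checked against the exact answer, which follows from Bayes' rule since cough depends only on smoking. A short calculation in the style of the earlier exact checks:
#Exact calculation of P(smoking = 1 | cough = 1) via Bayes' rule:
p_smoking = 0.1
p_cough_given_smoking = 0.6
p_cough_given_not_smoking = 0.05
p_cough_marginal = p_smoking * p_cough_given_smoking + (1 - p_smoking) * p_cough_given_not_smoking
print(p_smoking * p_cough_given_smoking / p_cough_marginal)
This evaluates to $0.06/0.105 \approx 0.571$, which the sampled estimate 0.56815 approximates.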
Graphical Models¶
It is often convenient to represent Bayesian probability models in the form of Graphical Models. Graphical Models are graphical representations of probability models. Each random variable in the model is represented by a circle (sometimes random variables whose values are observed are shaded). A directed edge is drawn from node $w$ to node $v$ if the distribution of $v$ in the model is described in terms of $w$.
Graphical models should be constructed sequentially in the same order as the variables appear in the PyMC specification. For example, in this health model, the following steps should be followed in sequence to form the graphical model:
- The first random variable in the PyMC model specification is "overweight". So we first draw a node (circle) for this random variable. At this stage, the graphical model only consists of a single node (overweight).
- The second variable is "smoking" so we draw a node (circle) for smoking. The probability specification for smoking does not involve the first variable overweight so we do not place any edge from overweight to smoking. At this stage, the graphical model only consists of two nodes (overweight and smoking) without any edge between them.
- The third variable is "heart disease" so we draw a node (circle) for heart disease. The probability specification for heart disease clearly uses both overweight and smoking so we place two directed edges: one from overweight to heart, and the other from smoking to heart.
- The fourth variable is "cough" so we draw a node for cough. The probability specification for cough clearly uses smoking (and not overweight or heart disease) so we place one directed edge from smoking to cough.
This graphical model can be drawn in Python as follows:
from graphviz import Digraph
def create_graphical_model():
    # Create a new directed graph
    dot = Digraph()
    # Add nodes
    dot.node('overweight', 'Overweight')
    dot.node('smoking', 'Smoking')
    dot.node('heart_disease', 'Heart Disease')
    dot.node('cough', 'Cough')
    # Add edges
    dot.edge('overweight', 'heart_disease')
    dot.edge('smoking', 'heart_disease')
    dot.edge('smoking', 'cough')
    return dot
# Display the graph
display(create_graphical_model())
Graphical models are useful because interesting (conditional) independence facts can be simply read off from them. The language of familial relationships is extremely useful when discussing graphical models. For example, in the above graphical model, we can use the following terminology:
- Overweight and Smoking variables can be seen as "founders" of this family of variables.
- Heart disease is a child born to the parents overweight and smoking.
- Cough is a child born to Smoking.
The conditional independence relationships induced by graphical models exactly match those found in simple Mendelian genetics. For example, in the above graphical model:
- Overweight and Smoking are marginally independent
- Conditional on the parents (overweight and smoking), the children (heart disease and cough) are independent.
- Without conditioning on the parents, heart disease and cough are not independent. Knowing something about heart disease will change what we think about cough (a numerical check follows this list).
- Conditional on smoking, the two children heart disease and cough are independent.
- Cough and Overweight are marginally independent.
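For example, the third point can be verified numerically from the model specification: \begin{align*} \mathbb{P}(\text{cough} = 1) &= 0.1 \times 0.6 + 0.9 \times 0.05 = 0.105, \\ \mathbb{P}(\text{cough} = 1 \mid \text{heart} = 1) &= \frac{0.0075 \times 0.6 + 0.045 \times 0.05 + 0.036 \times 0.6 + 0.081 \times 0.05}{0.1695} \approx 0.191, \end{align*} where the four terms in the numerator correspond to the four (overweight, smoking) configurations. Learning that heart disease is present raises the probability of cough from 0.105 to about 0.191 (because heart disease makes smoking more likely), whereas conditional on smoking, $\mathbb{P}(\text{cough} = 1 \mid \text{smoking}, \text{heart}) = \mathbb{P}(\text{cough} = 1 \mid \text{smoking})$.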
#Another Graphical Model:
from graphviz import Digraph
# Create a directed graph
dot = Digraph()
# Add nodes to the graph
for node in ['A', 'B', 'C', 'D', 'E', 'F']:
    dot.node(node)
# Add directed edges
edges = [('A', 'C'), ('B', 'C'), ('C', 'E'), ('D', 'E'), ('C', 'F'), ('D', 'F')]
for edge in edges:
    dot.edge(edge[0], edge[1])
# Display the graph in the Jupyter notebook
display(dot)
Here are some conditional independence relationships induced by this graphical model:
- A, B, D are marginally independent.
- E and F are conditionally independent given C and D.
- C and D are marginally independent.
- C and D are not conditionally independent given E (E is a common child of C and D, and conditioning on a common child generally makes its parents dependent: the "explaining away" effect).
- A and B are not conditionally independent given E (E is a descendant of C, which is a common child of A and B, so conditioning on E has the same effect).
Bayesian Statistics using PyMC¶
Next we revisit the simple examples that we studied in the previous two lectures. We shall use PyMC to answer questions based on these models. We shall also draw the graphical models associated with these models.
Microwave Example using PyMC¶
In the Microwave Example, there are two parameters: $\theta_A$ and $\theta_B$ representing the qualities of the two microwaves. The data for Microwave A is given by 3 out of 3 positive reviews: $\text{pos}_A = 3, n_A = 3$ ($n_A$ is the total number of reviews for A). The data for Microwave B is given by 19 out of 20 positive reviews: $\text{pos}_B = 19$, $n_B = 20$.
#Data
pos_A_obs = 3
neg_A_obs = 0
n_A_obs = 3
pos_B_obs = 19
neg_B_obs = 1
n_B_obs = 20
The prior is given by $\theta_A, \theta_B \overset{\text{i.i.d}}{\sim} \text{uniform}[0, 1]$ (equivalently $\text{Beta}(1, 1)$, which is how it is written in the code below) and the likelihood is $\text{pos}_A \mid \theta_A \sim \text{Bin}(n_A, \theta_A)$ and $\text{pos}_B \mid \theta_B \sim \text{Bin}(n_B, \theta_B)$. We can input this Bayesian model in PyMC as follows:
#PyMC Model:
microwave_model = pm.Model()
with microwave_model:
    #Priors
    theta_A = pm.Beta("theta_A", alpha=1, beta=1)
    theta_B = pm.Beta("theta_B", alpha=1, beta=1)
    #Likelihoods
    pos_A = pm.Binomial("pos_A", n=n_A_obs, p=theta_A, observed = pos_A_obs)
    pos_B = pm.Binomial("pos_B", n=n_B_obs, p=theta_B, observed = pos_B_obs)
    #Sample from posterior:
    idata = pm.sample(5000, chains = 4, return_inferencedata = True)
    #The above step is asking for 5000*4 = 20000 posterior samples of theta_A and theta_B
Auto-assigning NUTS sampler... Initializing NUTS using jitter+adapt_diag... Multiprocess sampling (4 chains in 4 jobs) NUTS: [theta_B, theta_A]
Sampling 4 chains for 1_000 tune and 5_000 draw iterations (4_000 + 20_000 draws total) took 8 seconds.
#We can access the posterior samples in the following way:
theta_A_samples = idata.posterior['theta_A'].values.flatten()
display(len(theta_A_samples))
display(theta_A_samples)
theta_B_samples = idata.posterior['theta_B'].values.flatten()
display(len(theta_B_samples))
display(theta_B_samples)
20000
array([0.75401301, 0.9005176 , 0.79450101, ..., 0.89130518, 0.90965358, 0.83777861])
20000
array([0.98174034, 0.94598862, 0.9778023 , ..., 0.951163 , 0.95447991, 0.78704502])
We saw in Lecture Six that the correct posterior for $\theta_A$ is $\text{Beta}(4, 1)$, and that the correct posterior for $\theta_B$ is $\text{Beta}(20, 2)$. We can check whether the samples generated above correctly approximate these posteriors as follows.
from scipy.stats import beta
plt.hist(theta_A_samples, bins = 100, density = True, alpha = 0.6, label = 'Posterior Samples for theta_A') #alpha controls the opacity of the plotted histogram bars
#alpha = 1 is completely opaque and alpha = 0 is completely transparent
x = np.linspace(0, 1, 1000)
pdf_values = beta.pdf(x, pos_A_obs + 1, neg_A_obs + 1)
plt.plot(x, pdf_values, 'r-', label = 'Beta(4, 1) Density')
plt.xlabel('Theta_A_value')
plt.ylabel('Density')
plt.legend()
plt.title('Histogram of theta_A_samples and Beta(4, 1) density')
plt.show()
plt.hist(theta_B_samples, bins = 100, density = True, alpha = 0.6, label = 'Posterior Samples for theta_B') #alpha controls the opacity of the plotted histogram bars
#alpha = 1 is completely opaque and alpha = 0 is completely transparent
x = np.linspace(0, 1, 1000)
pdf_values = beta.pdf(x, pos_B_obs + 1, neg_B_obs + 1)
plt.plot(x, pdf_values, 'r-', label = 'Beta(20, 2) Density')
plt.xlabel('Theta_B_value')
plt.ylabel('Density')
plt.legend()
plt.title('Histogram of theta_B_samples and Beta(20, 2) density')
plt.show()
These samples can be used to approximate the probability that $\theta_A$ is strictly smaller than $\theta_B$ as follows (recall from Lecture Six that the exact value of this probability equals 0.7).
N = len(theta_A_samples)
proportion_AleqB = np.sum(theta_A_samples < theta_B_samples)/N
print(proportion_AleqB)
0.70435
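In fact, the exact value 0.7 can be verified in closed form here, since the posterior $\text{Beta}(4, 1)$ has CDF $F(x) = x^4$ on $[0, 1]$ and is independent of the $\text{Beta}(20, 2)$ posterior: \begin{align*} \mathbb{P}(\theta_A < \theta_B) = \mathbb{E}\left[F(\theta_B)\right] = \mathbb{E}\left[\theta_B^4\right] = \frac{20}{22} \cdot \frac{21}{23} \cdot \frac{22}{24} \cdot \frac{23}{25} = \frac{20 \cdot 21}{24 \cdot 25} = 0.7, \end{align*} using the moment formula $\mathbb{E}[\theta^k] = \prod_{i=0}^{k-1} \frac{\alpha + i}{\alpha + \beta + i}$ for $\theta \sim \text{Beta}(\alpha, \beta)$. The Monte Carlo approximation above is close to this value.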
Here is the Graphical Model for the Microwave Model: there are four random variables: $\theta_A, \text{pos}_A$ and $\theta_B, \text{pos}_B$. The distribution of $\text{pos}_A$ is described through $\theta_A$ as $\text{Bin}(n_A, \theta_A)$ so we put a directed edge going from $\theta_A$ to $\text{pos}_A$. Similarly there will be a directed edge going from $\theta_B$ to $\text{pos}_B$. This is a very simple graphical model and it should be clear that $\theta_A$ and $\theta_B$ are conditionally independent given $\text{pos}_A$ and $\text{pos}_B$.
import graphviz
def create_graph():
    dot = graphviz.Digraph(format='png')
    # Define nodes
    dot.node('theta_A', 'θ_A')
    dot.node('theta_B', 'θ_B')
    dot.node('pos_A', 'pos_A')
    dot.node('pos_B', 'pos_B')
    # Define edges
    dot.edge('theta_A', 'pos_A')
    dot.edge('theta_B', 'pos_B')
    return dot
# Display the graph
create_graph()
Kidney Cancer Data Analysis using PyMC¶
We analyzed the kidney cancer dataset in Lecture Seven where we did a Bayesian analysis with a Beta prior that was learned from the naive proportions of large counties.
#Read the kidney cancer dataset
import pandas as pd
d_full = pd.read_csv("KidneyCancerClean.csv", skiprows=4)
d = d_full[['state', 'Location', 'fips', 'dc', 'pop', 'dc.2', 'pop.2']].copy()
#Combine the death and population counts for the two periods 80-84 and 85-89
d['dct'] = d['dc'] + d['dc.2'] #dct stands for death count total
d['popm'] = (d['pop'] + d['pop.2']) / 2
print(d.head(20))
      state                    Location  fips  dc     pop  dc.2   pop.2  dct      popm
0   ALABAMA     Autauga County, Alabama  1001   2   61921     1   64915    3   63418.0
1   ALABAMA     Baldwin County, Alabama  1003   7  170945    15  195253   22  183099.0
2   ALABAMA     Barbour County, Alabama  1005   0   33316     1   33987    1   33651.5
3   ALABAMA        Bibb County, Alabama  1007   0   30152     1   31175    1   30663.5
4   ALABAMA      Blount County, Alabama  1009   3   88342     5   91547    8   89944.5
5   ALABAMA     Bullock County, Alabama  1011   0    8313     0    8197    0    8255.0
6   ALABAMA      Butler County, Alabama  1013   0   31963     1   31722    1   31842.5
7   ALABAMA     Calhoun County, Alabama  1015   9  243105    12  233021   21  238063.0
8   ALABAMA    Chambers County, Alabama  1017   7   59985     0   57813    7   58899.0
9   ALABAMA    Cherokee County, Alabama  1019   0   43401     0   43828    0   43614.5
10  ALABAMA     Chilton County, Alabama  1021   2   65792     3   68837    5   67314.5
11  ALABAMA     Choctaw County, Alabama  1023   1   22741     3   22441    4   22591.0
12  ALABAMA      Clarke County, Alabama  1025   1   38611     3   37772    4   38191.5
13  ALABAMA        Clay County, Alabama  1027   3   27846     3   26954    6   27400.0
14  ALABAMA    Cleburne County, Alabama  1029   2   29718     1   29735    3   29726.5
15  ALABAMA      Coffee County, Alabama  1031   3   80334     4   81882    7   81108.0
16  ALABAMA     Colbert County, Alabama  1033   3  108797     6  104971    9  106884.0
17  ALABAMA     Conecuh County, Alabama  1035   2   22009     2   20687    4   21348.0
18  ALABAMA       Coosa County, Alabama  1037   2   18003     1   17828    3   17915.5
19  ALABAMA   Covington County, Alabama  1039   4   76229     3   75990    7   76109.5
#Our specific procedure for obtaining the prior was:
d['naiveproportion'] = d['dct'] / d['popm']
proportions_largecounties = d['naiveproportion'][d['popm'] >= 300000] #keep only the large (high population) counties
m = np.mean(proportions_largecounties)
V = np.var(proportions_largecounties)
a = ((m*m*(1-m))/V) - m
b = (((1-m)*(1-m)*m)/V) - (1-m)
aes = a
bes = b
display(a, b)
d['bayesestimate'] = (d['dct'] + a)/(d['popm'] + a + b)
11.781889938777496
120049.39668603124
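The expressions used above for $a$ and $b$ come from matching the Beta mean and variance to the sample mean $m$ and variance $V$ of the large-county proportions: \begin{align*} m = \frac{a}{a+b}, \qquad V = \frac{ab}{(a+b)^2 (a+b+1)} = \frac{m(1-m)}{a+b+1}, \end{align*} so that $a + b = \frac{m(1-m)}{V} - 1$, and hence $a = m\left(\frac{m(1-m)}{V} - 1\right) = \frac{m^2(1-m)}{V} - m$ and $b = (1-m)\left(\frac{m(1-m)}{V} - 1\right) = \frac{m(1-m)^2}{V} - (1-m)$, exactly as in the code above.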
After obtaining the prior, we did a standard Bayesian analysis with Binomial likelihood and Beta prior. This part can also be done via PyMC.
N = d.shape[0] #this is the number of counties
n = np.round(d['popm'].values)
y_obs = d['dct'].values
kidney_model = pm.Model()
with kidney_model:
    theta = pm.Beta("theta", alpha=a, beta=b, shape = N)
    Y = pm.Binomial("Y", n=n, p=theta, observed=y_obs)
    #Sample from posterior:
    idata = pm.sample(1000, chains = 2, return_inferencedata = True)
Auto-assigning NUTS sampler... Initializing NUTS using jitter+adapt_diag... Multiprocess sampling (2 chains in 4 jobs) NUTS: [theta]
Sampling 2 chains for 1_000 tune and 1_000 draw iterations (2_000 + 2_000 draws total) took 26 seconds.
The acceptance probability does not match the target. It is 0.7170667963818471, but should be close to 0.8. Try to increase the number of tuning steps.
print(idata.posterior)
#Let us now focus on one of the theta variables (say theta corresponding to the county
#in row 346) and then check if the histogram of posterior samples closely approximate
#the correct posterior Beta density.
#Posterior samples for theta_346 can be accessed as follows.
theta_346_samples = idata.posterior['theta'][:, :, 346].values.flatten()
print(np.mean(theta_346_samples))
print(len(theta_346_samples))
<xarray.Dataset>
Dimensions:      (chain: 2, draw: 1000, theta_dim_0: 3110)
Coordinates:
  * chain        (chain) int64 0 1
  * draw         (draw) int64 0 1 2 3 4 5 6 7 ... 993 994 995 996 997 998 999
  * theta_dim_0  (theta_dim_0) int64 0 1 2 3 4 5 ... 3105 3106 3107 3108 3109
Data variables:
    theta        (chain, draw, theta_dim_0) float64 4.044e-05 ... 7.283e-05
Attributes:
    created_at:                  2023-09-20T01:41:37.825315
    arviz_version:               0.12.1
    inference_library:           pymc3
    inference_library_version:   3.11.2
    sampling_time:               25.946101188659668
    tuning_steps:                1000
0.0002029461144040448
2000
#Let us plot this histogram along with the exact posterior Beta(a + Pos, b + Neg) density:
print(d.iloc[346])
Tot = round(d.loc[346, 'popm'])
Pos = d.loc[346, 'dct']
Neg = Tot - Pos
#This is a large county (population around half a million) and a large death count (120)
#Actual posterior mean
print((a+Pos)/(a+b+Tot))
from scipy.stats import beta
# Plotting histogram
plt.hist(theta_346_samples, bins=50, density=True, alpha=0.6, label="Posterior Samples")
# Superimposing the Beta density
x = np.arange(0, 8e-4, 1e-6)
y = beta.pdf(x, a+Pos, b+Neg)
plt.plot(x, y, 'r-', lw=2, label=f"Beta({a+Pos}, {b+Neg}) Density")
# Labels, legend, and title
plt.xlabel("Value")
plt.ylabel("Density")
plt.legend(loc="upper right")
plt.title("Histogram of Posterior Samples of 346th theta with the actual posterior Beta Density")
plt.show()
state                               FLORIDA
Location           Sarasota County, Florida
fips                                  12115
dc                                       55
pop                                  485595
dc.2                                     65
pop.2                                573711
dct                                     120
popm                               529653.0
naiveproportion                    0.000227
bayesestimate                      0.000203
Name: 346, dtype: object
0.00020283055885838032
#We can compute the posterior means for all theta parameters by simply averaging
#the observed posterior samples
theta_samples = idata.posterior['theta'].values
combined_samples = theta_samples.reshape(-1, N)
theta_means = np.mean(combined_samples, axis=0)
#We can check if these Bayes estimates obtained from samples generated by
#some kind of a Markov Chain Monte Carlo (MCMC) algorithm match the Bayes estimates
#calculated from the correct Beta posterior densities
#Plot theta_means against the exact posterior mean Bayes estimates
plt.figure(figsize=(10, 6))
plt.scatter(d['bayesestimate'].values, theta_means, alpha=0.5, s = 15)
# Add the y=x line
min_val = min(d['bayesestimate'].values.min(), theta_means.min())
max_val = max(d['bayesestimate'].values.max(), theta_means.max())
plt.plot([min_val, max_val], [min_val, max_val], 'r-', lw=2, label="y=x line")
plt.xlabel("Exact Bayesestimates")
plt.ylabel("MCMC Bayesestimates")
plt.title("MCMC Bayes Estimates vs. Exact Bayes Estimates")
plt.grid(True)
plt.show()
#Clearly the two methods give very nearly the same answers
#The graphical model here is very simple:
from graphviz import Digraph
def create_graphical_model(N):
    # Create a new directed graph
    dot = Digraph()
    # Add theta nodes and X nodes
    theta_nodes = [f"theta_{i}" for i in range(1, N+1)]
    X_nodes = [f"X_{i}" for i in range(1, N+1)]
    for i in range(N):
        dot.node(theta_nodes[i], label=f"θ{i+1}")
        dot.node(X_nodes[i], label=f"X{i+1}")
        dot.edge(theta_nodes[i], X_nodes[i])
    return dot
# Display the graph
display(create_graphical_model(8))
Analysis via a Hierarchical Model¶
In the above analysis, $a$ and $b$ are estimated from the given data on the proportions for large counties. Large was taken to mean population more than 300000. Of course if we use another definition of large, we will get different (but hopefully similar) results. One way of avoiding such ad-hoc estimation is to incorporate $a$ and $b$ also as unknown parameters in the PyMC model. This can be done in the following way:
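In symbols, the model we are about to specify is (the uniform ranges below are the ones used in the code): \begin{align*} & a \sim \text{uniform}[0, 50], \qquad b \sim \text{uniform}[0, 200000] \\ & \theta_1, \dots, \theta_N \mid a, b \overset{\text{i.i.d}}{\sim} \text{Beta}(a, b) \\ & Y_i \mid \theta_i \sim \text{Bin}(n_i, \theta_i) \quad \text{independently for } i = 1, \dots, N. \end{align*}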
kidney_model_ab_unknown = pm.Model()
with kidney_model_ab_unknown:
    a = pm.Uniform('a', lower = 0, upper = 50)
    b = pm.Uniform('b', lower = 0, upper = 200000)
    theta = pm.Beta("theta", alpha=a, beta=b, shape = N)
    Y = pm.Binomial("Y", n=n, p=theta, observed=y_obs)
    #Sample from posterior:
    idata = pm.sample(1000, chains = 2, return_inferencedata = True)
Auto-assigning NUTS sampler... Initializing NUTS using jitter+adapt_diag... Multiprocess sampling (2 chains in 4 jobs) NUTS: [theta, b, a]
Sampling 2 chains for 1_000 tune and 1_000 draw iterations (2_000 + 2_000 draws total) took 105 seconds.
There were 347 divergences after tuning. Increase `target_accept` or reparameterize.
The acceptance probability does not match the target. It is 0.6992742015685093, but should be close to 0.8. Try to increase the number of tuning steps.
There were 660 divergences after tuning. Increase `target_accept` or reparameterize.
The acceptance probability does not match the target. It is 0.6116679458637905, but should be close to 0.8. Try to increase the number of tuning steps.
The rhat statistic is larger than 1.4 for some parameters. The sampler did not converge.
The estimated number of effective samples is smaller than 200 for some parameters.
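The warnings printed above indicate that the sampler struggled here (divergences, low acceptance probability, large r-hat), so these samples should be interpreted with some caution. One common remedy, which the message itself suggests, is to tune for longer with a higher target acceptance rate. A possible re-run is sketched below; the name idata_retry is ours (so that the original idata used in the rest of this analysis is not overwritten), and this slower run may still not remove all divergences.
#A possible (slower) re-run with more tuning and a higher target acceptance rate:
with kidney_model_ab_unknown:
    idata_retry = pm.sample(1000, tune = 2000, chains = 2,
                            target_accept = 0.95, return_inferencedata = True)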
This approach produces posterior samples for $a$ and $b$ (in addition to the $\theta$ parameters). One can use these samples to obtain point estimates for $a$ and $b$, and also to characterize uncertainty in $a$ and $b$.
a_samples = idata.posterior['a'].values.flatten()
b_samples = idata.posterior['b'].values.flatten()
amean = np.mean(a_samples)
astd = np.std(a_samples)
bmean = np.mean(b_samples)
bstd = np.std(b_samples)
display([amean, astd])
display([bmean, bstd])
[17.068281999244608, 1.1274766547362032]
[164737.25252111058, 11262.339137284955]
#plotting the histograms of a_samples and b_samples
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(10, 4))
axes[0].hist(a_samples, bins=30, color='blue', alpha=0.7, label='a_samples')
axes[0].set_title('Histogram of a_samples')
axes[0].set_xlabel('Value')
axes[0].set_ylabel('Frequency')
axes[1].hist(b_samples, bins=30, color='red', alpha=0.7, label='b_samples')
axes[1].set_title('Histogram of b_samples')
axes[1].set_xlabel('Value')
axes[1].set_ylabel('Frequency')
plt.tight_layout()
plt.show()
#Let us investigate the theta samples:
#As before, we can look at the histogram of the posterior samples for theta_346 and
#compare it with the Beta posterior from the previous analysis
theta_346_samples = idata.posterior['theta'][:, :, 346].values.flatten()
print(np.mean(theta_346_samples))
print(len(theta_346_samples))
#Let us plot this histogram along with Beta(a + Pos, b + Neg) posterior for our previous values of a and b
Tot = round(d.loc[346, 'popm'])
Pos = d.loc[346, 'dct']
Neg = Tot - Pos
#This is a large county (population around half a million) and a large death count (120)
#Actual posterior mean
print((aes+Pos)/(aes+bes+Tot))
from scipy.stats import beta
# Plotting histogram
plt.hist(theta_346_samples, bins=50, density=True, alpha=0.6, label="Posterior Samples")
# Superimposing the Beta density
x = np.arange(0, 8e-4, 1e-6)
y = beta.pdf(x, aes+Pos, bes+Neg)
plt.plot(x, y, 'r-', lw=2, label=f"Beta({aes+Pos}, {bes+Neg}) Density")
# Labels, legend, and title
plt.xlabel("Value")
plt.ylabel("Density")
plt.legend(loc="upper right")
plt.title("Histogram of Posterior Samples of 346th theta with the actual posterior Beta Density")
plt.show()
#This is similar to our previous result but the histogram has moved slightly to the left.
0.00019285921452520883
2000
0.00020283055885838032
#We now compare the Bayes estimates in this random a, b model with
#the Bayes estimates based on estimating a, b in our previous ad hoc way.
theta_samples = idata.posterior['theta'].values
combined_samples = theta_samples.reshape(-1, N)
theta_means = np.mean(combined_samples, axis = 0)
#Plot theta_means against the exact posterior mean Bayes estimates
plt.figure(figsize=(10, 6))
plt.scatter(d['bayesestimate'].values, theta_means, alpha=0.5, s = 15)
# Add the y=x line
min_val = min(d['bayesestimate'].values.min(), theta_means.min())
max_val = max(d['bayesestimate'].values.max(), theta_means.max())
plt.plot([min_val, max_val], [min_val, max_val], 'r-', lw=2, label="y=x line")
plt.xlabel("Bayes Estimates with Fixed a, b")
plt.ylabel("Bayes Estimates from random a, b")
plt.title("Bayes Estimates from random a, b vs. Fixed a, b Bayes Estimates")
plt.grid(True)
plt.show()
#The results are similar but there are some differences.
#Plotting the posterior standard deviations
theta_stddevs = np.std(combined_samples, axis = 0)
d['bayesstddev'] = np.sqrt(((d['dct'] + aes)/(d['popm'] + aes + bes)) * (1 - ((d['dct'] + aes)/(d['popm'] + aes + bes))) * (1/(d['popm'] + aes + bes + 1)))
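#(The formula above is the standard deviation of the exact Beta posterior:
#a Beta(alpha, beta) distribution has variance mean*(1-mean)/(alpha+beta+1),
#and here alpha = dct + aes and beta = popm - dct + bes, so alpha + beta = popm + aes + bes.)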
#Plot theta_means against the exact posterior mean Bayes estimates
plt.figure(figsize=(10, 6))
plt.scatter(d['bayesstddev'].values, theta_stddevs, alpha=0.5, s = 15)
# Add the y=x line
min_val = min(d['bayesstddev'].values.min(), theta_stddevs.min())
max_val = max(d['bayesstddev'].values.max(), theta_stddevs.max())
plt.plot([min_val, max_val], [min_val, max_val], 'r-', lw=2, label="y=x line")
plt.xlabel("Standard Deviations with Fixed a, b")
plt.ylabel("Standard Deviations from random a, b")
plt.title("Standard Deviations from random a, b vs. Standard Deviations for fixed a, b")
plt.grid(True)
plt.show()
#From this plot, it is clear that the standard deviations in this random a, b model are
#generally smaller than the standard deviations in the fixed a, b model.
#It appears that this Bayesian Hierarchical Model is able to deduce more information
#about the theta parameters (leading to slightly smaller uncertainty) compared to the
#method where we estimated a and b using only about 300 large counties.
#The graphical model for this model is the following:
from graphviz import Digraph
def create_graphical_model(N):
    # Create a new directed graph
    dot = Digraph()
    # Add theta nodes and X nodes
    theta_nodes = [f"theta_{i}" for i in range(1, N+1)]
    X_nodes = [f"X_{i}" for i in range(1, N+1)]
    for i in range(N):
        dot.node(theta_nodes[i], label=f"θ{i+1}")
        dot.node(X_nodes[i], label=f"X{i+1}")
        dot.edge(theta_nodes[i], X_nodes[i])
    # Add nodes a and b
    dot.node("a", "a")
    dot.node("b", "b")
    # Add directed edges from a and b to each theta_i
    for theta in theta_nodes:
        dot.edge("a", theta)
        dot.edge("b", theta)
    return dot
# Display the graph
display(create_graphical_model(8))
The above is an example of a Bayesian Hierarchical Model. $a$ and $b$ are referred to as hyperparameters (to distinguish them from the parameters $\theta_1, \dots, \theta_N$). Reading the graph as before, the $\theta_i$ are conditionally independent given $a$ and $b$, but they are not marginally independent: each $\theta_i$ carries information about the shared hyperparameters and hence about the other $\theta_j$, which is what allows the hierarchical model to pool information across counties. In the next class, we shall study more Bayesian models and use them for data analysis through PyMC.