#Import the necessary libraries:
import arviz as az
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import pymc as pm #PyMC3 is now replaced by PyMC


#Data
n = 15
y_obs = np.array([17.62, 17.61, 17.61, 17.62, 17.62, 17.615, 17.615, 17.625, 17.61, 17.62, 17.62, 17.605, 17.61, 17.62, 17.61])


#Usual Solution:
thetaest = np.mean(y_obs)
thetaerror = np.std(y_obs, ddof=1)/np.sqrt(n)
print(thetaest, thetaerror)

17.615333333333336 0.00150132216861293


#Bayes Solution
measurement_model = pm.Model()
with measurement_model:
    theta = pm.Uniform("theta", lower = -80, upper = 80)
    log_sigma = pm.Uniform("log_sigma", lower = -10, upper = 10)
    sigma = pm.Deterministic("sigma", pm.math.exp(log_sigma))
    Y = pm.Normal("Y", mu = theta, sigma = sigma, observed=y_obs)
    #Sample from posterior:
    idata = pm.sample(2000, chains = 2, return_inferencedata = True)

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (2 chains in 4 jobs)
NUTS: [theta, log_sigma]

Sampling 2 chains for 1_000 tune and 2_000 draw iterations (2_000 + 4_000 draws total) took 3 seconds.
We recommend running at least 4 chains for robust computation of convergence diagnostics


theta_samples = idata.posterior['theta'].values.flatten()
log_sigma_samples = idata.posterior['log_sigma'].values.flatten()
#Histogram of posterior theta samples for theta
plt.hist(theta_samples, bins = 500)
plt.xlabel(r'Values of $\theta$')
plt.ylabel('Frequency')
plt.title(r'Histogram of the Posterior $\theta$ samples')
plt.show();

#Histogram of posterior log_sigma samples
plt.hist(log_sigma_samples, bins = 500)
plt.xlabel(r'values of $\log(\sigma)$')
plt.ylabel('Frequency')
plt.title(r'Histogram of the Posterior $\log(\sigma)$ samples')
plt.show();

#Posterior samples for sigma are obtained by exponentiating the log_sigma samples
sigma_samples = np.exp(log_sigma_samples)
plt.hist(sigma_samples, bins = 500)
plt.xlabel(r'values of $\sigma$')
plt.ylabel('Frequency')
plt.title(r'Histogram of the Posterior $\sigma$ samples')
plt.show();


#Our best estimate of the unknown length can be taken to be the mean of the posterior samples for theta:
thetamean = np.mean(theta_samples)
display(thetamean)
#Our uncertainty in theta can be captured by the standard deviation of the posterior samples
thetastd = np.std(theta_samples)
display(thetastd)
#Our best estimate for sigma (which can be interpreted as the uncertainty in each individual measurement) can be obtained 
#by taking the mean of the posterior samples for sigma:
sigmamean = np.mean(sigma_samples)
display(sigmamean)

17.615357797323185

0.0015931447991204468

0.006142381305189109


display([thetamean, thetaest])
display([thetastd, thetaerror])

[17.615357797323185, 17.615333333333336]

[0.0015931447991204468, 0.00150132216861293]


#A 95% interval for theta based on the posterior samples is computed as follows:
lower_limit_theta = np.percentile(theta_samples, 2.5)
upper_limit_theta = np.percentile(theta_samples, 97.5)
display([lower_limit_theta, upper_limit_theta])

#Similarly, A 95% interval for sigma based on the posterior samples is computed as follows:
lower_limit_sigma = np.percentile(sigma_samples, 2.5)
upper_limit_sigma = np.percentile(sigma_samples, 97.5)
display([lower_limit_sigma, upper_limit_sigma])

[17.612289074482923, 17.61846644701896]

[0.0043205952568492155, 0.008952565464264876]


#In the following, we take C = 10000
#Uncomment the code to run

# measurement_model = pm.Model()
# with measurement_model:
#     theta = pm.Uniform("theta", lower = -10000, upper = 10000)
#     log_sigma = pm.Uniform("log_sigma", lower = -10000, upper = 10000)
#     sigma = pm.Deterministic("sigma", pm.math.exp(log_sigma))
#     Y = pm.Normal("Y", mu = theta, sigma = sigma, observed=y_obs)
#     #Sample from posterior:
#     idata = pm.sample(2000, chains = 2, return_inferencedata = True) 
    
#This model with C extremely large is throwing an error.


y_obs = np.array([17.62, 17.61, 17.61, 17.62, 17.62, 17.615, 17.615, 17.625, 17.61, 17.62, 17.62, 17.605, 17.61, 17.62, 17.61, 25, 31])
n = len(y_obs)
print(n)

17


#Usual Solution:
thetaest = np.mean(y_obs)
thetaerror = np.std(y_obs, ddof=1)/np.sqrt(n)
print(thetaest, thetaerror)

18.837058823529414 0.8751232117897164


measurement_model = pm.Model()
with measurement_model:
    theta = pm.Uniform("theta", -80, 80)
    log_sigma = pm.Uniform("log_sigma", -10, 10)
    sigma = pm.Deterministic("sigma", pm.math.exp(log_sigma))
    Y = pm.Normal("Y", mu = theta, sigma = sigma, observed=y_obs)
    #Sample from posterior:
    idata = pm.sample(2000, chains = 2, return_inferencedata = True) 

theta_samples = idata.posterior['theta'].values.flatten()
log_sigma_samples = idata.posterior['log_sigma'].values.flatten()
sigma_samples = np.exp(log_sigma_samples)
#Our best estimate of the unknown length can be taken to be the mean of the posterior samples for theta:
thetamean = np.mean(theta_samples)
display([thetamean, np.mean(y_obs)])
#Our uncertainty in theta can be captured by the standard deviation of the posterior samples
thetastd = np.std(theta_samples)
display(thetastd)

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (2 chains in 4 jobs)
NUTS: [theta, log_sigma]

Sampling 2 chains for 1_000 tune and 2_000 draw iterations (2_000 + 4_000 draws total) took 2 seconds.
We recommend running at least 4 chains for robust computation of convergence diagnostics

[18.817034942559157, 18.837058823529414]

0.9416332348601021


measurement_somefake_model = pm.Model()
with measurement_somefake_model:
    w = pm.Beta("w", alpha = 1, beta = 1)
    theta = pm.Uniform("theta", lower = -80, upper = 80)
    log_sigma = pm.Uniform("log_sigma", lower = -10, upper = 10)
    sigma = pm.Deterministic("sigma", pm.math.exp(log_sigma))
    
    theta2 = 0
    sigma2 = 100
    
    thetas = pm.math.stack([theta, theta2])
    sigmas = pm.math.stack([sigma, sigma2])

    category = pm.Bernoulli("category", p = w, shape = n)
    mean_idx = thetas[category]
    sd_idx = sigmas[category]
    obs = pm.Normal("obs", mu = mean_idx, sigma = sd_idx, observed = y_obs)
    idata = pm.sample(1000, chains = 2, return_inferencedata = True)

Multiprocess sampling (2 chains in 4 jobs)
CompoundStep
>NUTS: [w, theta, log_sigma]
>BinaryGibbsMetropolis: [category]

Sampling 2 chains for 1_000 tune and 1_000 draw iterations (2_000 + 2_000 draws total) took 7 seconds.
We recommend running at least 4 chains for robust computation of convergence diagnostics


w_samples = idata.posterior['w'].values.flatten()
print(w_samples)
wmean = np.mean(w_samples)
wstd = np.std(w_samples)
print(f"mean of w: {wmean}")
print(f"SD of w: {wstd}")

[0.08974985 0.10023658 0.16318447 ... 0.08025677 0.21912921 0.12668582]
mean of w: 0.1563788527985102
SD of w: 0.07999833790480428


theta_samples = idata.posterior['theta'].values.flatten()
theta_est = np.mean(theta_samples) #this will be our estimate of theta
theta_std = np.std(theta_samples) #this will be the uncertainty in the estimate of theta
print(f"Estimate for theta: {theta_est}") 
print(f"Uncertainty in theta (SD): {theta_std}") 

sigma_samples = idata.posterior['sigma'].values.flatten()
sigma_est = np.mean(sigma_samples)
print(f"Estimate for sigma: {sigma_est}") #this will be the estimate of sigma

Estimate for theta: 17.61535873280061
Uncertainty in theta (SD): 0.001599080046578374
Estimate for sigma: 0.006145534327400754


print(idata.posterior['category'].shape)

(2, 1000, 17)


category_samples_onepoint = idata.posterior['category'][:, :, 16].values.flatten()
print(category_samples_onepoint)

[1 1 1 ... 1 1 1]


category_samples = idata.posterior['category'].values
combined_samples = category_samples.reshape(-1, n)

category_means = np.mean(combined_samples, axis = 0)
category_sums = np.sum(combined_samples, axis = 0)
print(np.round(category_means))

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1.]


dplan = pd.read_csv('exoplanets.csv')
print(dplan.shape)
dplan.head(10)

(517, 6)


#Let us plot the histogram of the radii:
plt.figure(figsize=(10, 8))
plt.hist(dplan['radius'], bins = 100)
plt.xlabel('Radius')
plt.ylabel('Frequency')
plt.title('Histogram of Planet Radii')
plt.show()
#Clearly there are two groups of planet radii: Big planets and Small planets.


#Fitting a Gaussian mixture model using pymc
#N = dplan_no_missing.shape[0]
#observed = dplan_no_missing['pl_radj']
N = dplan.shape[0]
exoplanet_model = pm.Model()
with exoplanet_model:
    w = pm.Beta("w", alpha = 1, beta = 1)
    thetas = pm.Normal("thetas", mu = np.array([2, 14]), sigma = 10, shape = 2)
    sigmas = pm.HalfNormal("sigmas", sigma = 10, shape = 2) #The HalfNormal distribution is the absolute value of the Normal
    category = pm.Bernoulli("category", p = w, shape = N)
    mean_individual = thetas[category]
    sd_individual = sigmas[category]
    obs = pm.Normal("obs", mu = mean_individual, sigma = sd_individual, observed = dplan['radius'])
    idata = pm.sample(1000, chains = 2, return_inferencedata = True)

Multiprocess sampling (2 chains in 4 jobs)
CompoundStep
>NUTS: [w, thetas, sigmas]
>BinaryGibbsMetropolis: [category]

Sampling 2 chains for 1_000 tune and 1_000 draw iterations (2_000 + 2_000 draws total) took 71 seconds.
We recommend running at least 4 chains for robust computation of convergence diagnostics


#Calculate posterior means of w, thetas and sigmas:
w_samples = idata.posterior['w'].values.flatten()
wmean = np.mean(w_samples)

thetas_samples = idata.posterior['thetas'].values
combined_thetas_samples = thetas_samples.reshape(-1, 2)

thetas_means = np.mean(combined_thetas_samples, axis = 0)

sigmas_samples = idata.posterior['sigmas'].values
combined_sigmas_samples = sigmas_samples.reshape(-1, 2)

sigmas_means = np.mean(combined_sigmas_samples, axis = 0)

print(f"mean of w: {wmean}")
print(f"mean of theta_0: {thetas_means[0]}")
print(f"mean of theta_1: {thetas_means[1]}")
print(f"mean of sigma_0: {sigmas_means[0]}")
print(f"mean of theta_0: {sigmas_means[1]}")

mean of w: 0.6644072056827766
mean of theta_0: 2.6638927220217274
mean of theta_1: 13.466795467201019
mean of sigma_0: 1.2249132379040955
mean of theta_0: 3.2315425843829293


#Let us plot the fitted two component normal mixture density on the histogram (to visualize the fit):
from scipy.stats import norm

plt.hist(dplan['radius'], bins = 100, density = True, alpha = 0.6, label = 'Planet Radius Data')
x = np.linspace(min(dplan['radius']), max(dplan['radius']), 1000)
density = wmean * norm.pdf(x, thetas_means[0], sigmas_means[0]) + (1-wmean) * norm.pdf(x, thetas_means[1], sigmas_means[1])
plt.plot(x, density, 'r', label = 'Gaussian Mixture Density')
plt.title('Histogram of Planet Radii with the Fitted Gaussian Mixture Density')
plt.xlabel('Radius')
plt.ylabel('Density')
plt.legend()
plt.show()


#We now repeat the analysis on the logarithmic scale
#First let us plot the histogram of log(radius)
data_log = np.log(dplan['radius'])
#Let us plot the histogram of the radii:
plt.figure(figsize=(10, 8))
plt.hist(data_log, bins = 100)
plt.xlabel('log(Radius)')
plt.ylabel('Frequency')
plt.title('Histogram of Logarithm of Planet Radii')
plt.show()
#Clearly there are two groups of planet radii: Big planets and Small planets.


#Fitting a Gaussian mixture model using pymc
N = len(data_log)
exoplanet_model = pm.Model()
with exoplanet_model:
    w = pm.Beta("w", alpha = 1, beta = 1)
    thetas = pm.Normal("thetas", mu = np.array([1, 2.5]), sigma = 3, shape = 2)
    sigmas = pm.HalfNormal("sigmas", sigma = 3, shape = 2)
    category = pm.Bernoulli("category", p = w, shape = N)
    mean_individual = thetas[category]
    sd_individual = sigmas[category]
    obs = pm.Normal("obs", mu = mean_individual, sigma = sd_individual, observed = data_log)
    idata = pm.sample(1000, chains = 2, return_inferencedata = True)

Multiprocess sampling (2 chains in 4 jobs)
CompoundStep
>NUTS: [w, thetas, sigmas]
>BinaryGibbsMetropolis: [category]

Sampling 2 chains for 1_000 tune and 1_000 draw iterations (2_000 + 2_000 draws total) took 72 seconds.
We recommend running at least 4 chains for robust computation of convergence diagnostics


#Calculate posterior means of w, thetas and sigmas:
w_samples = idata.posterior['w'].values.flatten()
wmean = np.mean(w_samples)

thetas_samples = idata.posterior['thetas'].values
combined_thetas_samples = thetas_samples.reshape(-1, 2)

thetas_means = np.mean(combined_thetas_samples, axis = 0)

sigmas_samples = idata.posterior['sigmas'].values
combined_sigmas_samples = sigmas_samples.reshape(-1, 2)

sigmas_means = np.mean(combined_sigmas_samples, axis = 0)

print("Results on the log scale:")
print(f"mean of w: {wmean}")
print(f"mean of theta_0: {thetas_means[0]}")
print(f"mean of theta_1: {thetas_means[1]}")
print(f"mean of sigma_0: {sigmas_means[0]}")
print(f"mean of theta_0: {sigmas_means[1]}")

Results on the log scale:
mean of w: 0.6134098346896935
mean of theta_0: 1.0123333806004144
mean of theta_1: 2.6148677968707754
mean of sigma_0: 0.6262416891159611
mean of theta_0: 0.19873751616543645


#Let us plot the fitted two component normal mixture density on the histogram (to visualize the fit):
from scipy.stats import norm

plt.hist(data_log, bins = 100, density = True, alpha = 0.6, label = 'Logarithm of Planet Radius Data')
x = np.linspace(min(data_log), max(data_log), 1000)
density = wmean * norm.pdf(x, thetas_means[0], sigmas_means[0]) + (1-wmean) * norm.pdf(x, thetas_means[1], sigmas_means[1])
plt.plot(x, density, 'r', label = 'Gaussian Mixture Density')
plt.title('Histogram of Planet Radii with the Fitted Gaussian Mixture Density')
plt.xlabel('log(Radius)')
plt.ylabel('Density')
plt.legend()
plt.show()


category_samples = idata.posterior['category'].values
combined_samples = category_samples.reshape(-1, N)
print(combined_samples.shape)
category_means = np.mean(combined_samples, axis = 0)

plt.hist(category_means, density = True)
plt.title('Histogram of Probabilities of each planet being large')
plt.xlabel('Probability')
plt.ylabel('Density')
plt.show()
#For large planets, the corresponding entry in category_means will be large. For small planets, the entry in category_means will be small. 
#If the model is unsure between large and small, it will give a value for category_means which is close to 0.5.

(2000, 517)


#Creating a Classification of Data Points based on the posterior mean of the category variable:
binary_class = (category_means > 0.5).astype(int)
dplan['classi'] = binary_class
print(np.mean(dplan['classi']))

0.6286266924564797


#Plotting two separate histograms in one plot
radius_zero = dplan[dplan['classi'] == 0]['radius']
radius_one = dplan[dplan['classi'] == 1]['radius']
plt.hist(radius_zero, bins = 100, color = 'red', alpha = 0.5, label = 'Small Radii')
plt.hist(radius_one, bins = 100, color = 'blue', alpha = 0.5, label = 'Large Radii')
plt.title('Histograms')
plt.xlabel('Radius')
plt.ylabel('Frequency')
plt.legend()
plt.show()


print(np.max(radius_zero), np.min(radius_one))

8.1600792 8.3730483


dplan[(dplan['radius'] == 8.1600792) | (dplan['radius'] == 8.3730483)]


print(category_means[262])
print(category_means[317])

0.426
0.55


#Rejection sampling for generating samples from Beta(4, 1):
#Our proposal distribution will be Uniform[0, 1]
#The value of M can be taken to be the largest value of the density. 
M = 4
N = 20000 #this is the number of proposal samples that we will generate
prior_samples = np.random.rand(N)
p_prior_samples = prior_samples ** 3
Y_samples = np.random.binomial(n = 1, p = p_prior_samples)
posterior_samples = prior_samples[Y_samples == 1]
print(len(posterior_samples))
plt.hist(posterior_samples, bins = 500, density = True, alpha = 0.6, label = 'Rejection Sampling Samples from Beta(4, 1)') 
x = np.linspace(0, 1, 1000)
from scipy.stats import beta
pdf_values = beta.pdf(x, 4, 1)
plt.plot(x, pdf_values, 'r-', label = 'Beta(4, 1) Density')
plt.xlabel('Value')
plt.ylabel('Density')
plt.legend()
plt.title('Histogram of Samples from Rejection Sampling and Beta(4, 1) density')
plt.show()
#The match between the histogram and the true density is not bad but we are only 
#getting about 1/4 of the total samples (others are rejected because of Y = 0).

4895


#Rejection sampling for generating samples from Beta(20, 2):
#Our proposal distribution will be Uniform[0, 1]
#The value of M can be taken to be anything larger than the largest value of the density. 
M = 8
N = 20000 #this is the number of proposal samples that we will generate
prior_samples = np.random.rand(N)
p_prior_samples = 420 * (prior_samples ** 19) * (1 - prior_samples) * (1/8)
Y_samples = np.random.binomial(n = 1, p = p_prior_samples)
posterior_samples = prior_samples[Y_samples == 1]
print(len(posterior_samples))
plt.hist(posterior_samples, bins = 500, density = True, alpha = 0.6, label = 'Rejection Sampling Samples from Beta(20, 2)') 
x = np.linspace(0, 1, 1000)
from scipy.stats import beta
pdf_values = beta.pdf(x, 20, 2)
plt.plot(x, pdf_values, 'r-', label = 'Beta(20, 2) Density')
plt.xlabel('Value')
plt.ylabel('Density')
plt.legend()
plt.title('Histogram of Samples from Rejection Sampling and Beta(20, 2) density')
plt.show()
#The match between the histogram and the true density is not bad but we are only 
#getting about 1/8 of the total samples (others are rejected because of Y = 0).

2506

	name	orbital_period	mass	radius	star_temperature	density
0	2MASS J21402931+1625183 A b	7336.500000	6657.910000	10.312188	2300.0	NaN
1	55 Cnc e	0.736539	8.078476	1.905513	5196.0	6.400
2	BD+20 594 b	41.685500	16.299962	2.230571	5766.0	7.890
3	CoRoT-1 b	1.508956	327.334000	16.701261	5950.0	0.380
4	CoRoT-10 b	13.240600	873.950000	10.872633	5075.0	3.700
5	CoRoT-11 b	2.994330	740.474000	16.028727	6440.0	0.990
6	CoRoT-12 b	2.828042	291.422600	16.140816	5675.0	0.411
7	CoRoT-13 b	4.035190	415.682400	9.919877	5945.0	2.340
8	CoRoT-14 b	1.512140	2415.280000	12.217701	6035.0	7.300
9	CoRoT-16 b	5.352270	170.023000	13.114413	5650.0	0.440

	name	orbital_period	mass	radius	star_temperature	density	classi
262	Kepler-35 b	131.45800	40.3606	8.160079	5606.0	0.41	0
317	Kepler-539 b	125.63243	308.2660	8.373048	5820.0	2.90	1

More Bayesian Examples with PyMC¶

Example: Normal Mean Estimation¶

Example: Normal Mean Estimation with Some Erroneous Observations¶

Example: Exoplanet Data Analysis¶

A study of some techniques for Monte Carlo Sampling¶

Rejection Sampling¶