$\newcommand{\Prob}[1]{\mathsf{P}\{#1\}}$
$\newcommand{\rv}[1]{\mathbf{#1}}$
With classical distributions you can use standard library functions, as in the following example with the exponential distribution. Populate your cell with the following code:
import numpy as np
from scipy.stats import expon #import exponential distribution
import matplotlib.pyplot as plt
fig, ax = plt.subplots(1, 1)
#<name>.ppf(alpha) returns alpha-quantile
x = np.linspace(expon.ppf(0.01), expon.ppf(0.99), 100)
#<name>.pdf(x) returns the value of the probability distribution function
ax.plot(x, expon.pdf(x), lw=5, alpha=0.6, label='exponential pdf') #graph of pdf
ax.plot(x, expon.cdf(x), lw=5, alpha=0.6, label='exponential cdf') #graph of cdf
ax.set_xlabel('values of random variable')
ax.set_ylabel('pdf / cdf')
ax.legend()
plt.show()
#one can display the inverse cdf with the <name>.ppf() method
fig, ax = plt.subplots(1, 1)
x = np.linspace(0, 1, 100)
ax.plot(x, expon.ppf(x), lw=5, alpha=0.6, label='exponential inverse cdf') #graph of cdf^{-1}
ax.set_xlabel('quantiles')
ax.set_ylabel(r'inverse cdf $F^{-1}$')
ax.legend()
plt.show()
For a non-classical distribution the inverse cdf can be built numerically: take a uniform grid of N points on [m, M], for example np.linspace(m, M, N), evaluate the cdf at these points, and interpolate. Populate your cell with the following code:
import numpy as np
m = 0
M = 1
N = 1000
vals_avail = np.linspace(m, M, num=N)
vals_avail[:10], vals_avail[-5:-1]
def cdf_str(x):
    return 0.5 * (x + np.sqrt(x)) #custom cdf F(x) = (x + sqrt(x)) / 2 on [0, 1]
z_avail = [cdf_str(v) for v in vals_avail]
np.transpose(z_avail[:5]), np.transpose(z_avail[-5:-1])
#Inverse cdf (ppf) via linear interpolation
from scipy.interpolate import interp1d
import random
import matplotlib.pyplot as plt
ppf_str = interp1d(z_avail, vals_avail, kind='linear')
v_new = np.linspace(m, M, num=17)
fig, ax = plt.subplots(1, 1)
ax.plot(z_avail, vals_avail, 'o', cdf_str(v_new), v_new, '--')
random.seed(1)
v_tmp = random.random()
ax.plot(v_tmp, ppf_str(v_tmp), 's', markersize=12)
ax.legend(['data', 'linear', 'test point'], loc='best')
ax.set_xlabel('random values $z$ from $[0, 1]$')
ax.set_ylabel('inverse cdf $F^{-1}(z)$')
plt.show()
If the data comes as a plain list, convert it first by inserting x = np.array(x) into the code. The construction of the approximate inverse cdf can be wrapped into a function; populate your cell with the following code:
import numpy as np
from scipy.interpolate import interp1d
def cdf_complex(x):
    return 0.5 * (x + np.sqrt(x)) #the same custom cdf as above
def ppf_appr(m=0, M=1, N=1000):
    vals_avail = np.linspace(m, M, num=N)
    z_avail = [cdf_complex(v) for v in vals_avail]
    #note the argument order: cdf values first, so that ppf maps probabilities to values
    ppf = interp1d(z_avail, vals_avail, kind='linear')
    return ppf
The new sample is then generated by evaluating this inverse at uniform random numbers, e.g. with a list comprehension of the form [... for i in range(N)], where the index $i$ varies from $0$ to $N-1$; see the sketch below.
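For instance, one can draw from the distribution with cdf cdf_complex by feeding uniform random numbers into the interpolated inverse; a minimal sketch (the sample size 10 is an arbitrary illustrative choice):
#Sketch: inverse-transform sampling with the interpolated inverse cdf
import random
random.seed(1)
ppf = ppf_appr() #approximate inverse cdf on [0, 1]
sample = [float(ppf(random.random())) for _ in range(10)]
sample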
#Example of sampling with replacement
import random
random.seed(1)
x_example = [0, 2.5, 2022, 1, 0]
sample_new = random.choices(x_example, k=15)
sample_new
#Estimate of the variance via bootstrap
import numpy as np
import random
def bootstrap_var(x, N=100, seed=1):
    random.seed(seed)
    variances = []
    for i in range(N):
        sample_new = random.choices(x, k=len(x)) #resample with replacement
        variances.append(np.var(sample_new))     #variance of the bootstrap sample
    return variances
variances = bootstrap_var([0, 2.5, 2022, 1, 0])
np.mean(variances), np.var(x_example)
- All obtained estimates are conditional on the observed initial vector $\mathbf{x} = (x_1, \ldots, x_n)$
- These conditional estimates converge to the true values as $n \to \infty$
- Typically, the bootstrap estimate lies below the true variance
- The true distribution can be continuous, but the bootstrap distribution is always discrete
- This can be improved by approximating the observed discrete distribution with a continuous function; see the sketch below
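A minimal sketch of that improvement (a smoothed bootstrap: each resampled value is perturbed by small Gaussian noise, which makes the bootstrap distribution continuous); the bandwidth s below is an illustrative assumption, not prescribed by the text:
#Sketch of a smoothed bootstrap: resample, then add small Gaussian noise
import random
import numpy as np
def bootstrap_var_smoothed(x, N=100, seed=1, s=None):
    random.seed(seed)
    if s is None:
        s = np.std(x) / len(x)**0.5 #illustrative bandwidth (an assumption)
    variances = []
    for _ in range(N):
        sample_new = [v + random.gauss(0, s) for v in random.choices(x, k=len(x))]
        variances.append(np.var(sample_new))
    return variances
np.mean(bootstrap_var_smoothed([0, 2.5, 2022, 1, 0]))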
The main tool here is numpy.percentile().
# Syntax of numpy.percentile()
numpy.percentile(arr, percentile, axis=None, out=None, overwrite_input=False, keepdims=False)
- arr – array_like; the input array, or an object that can be converted to an array.
- percentile – float or array_like of floats; the percentile or sequence of percentiles to compute, each between 0 and 100 inclusive.
numpy.percentile chooses the lowest observation at the left gap, the highest observation at the right gap, and moves from bottom to top uniformly (linearly) in between; the quick check below illustrates this. The second example follows after it.
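A four-point array, percentiles 0, 50, and 100 (NumPy's default linear interpolation):
#numpy.percentile: endpoints and uniform (linear) interpolation in-between
import numpy as np
arr = np.array([1.0, 2.0, 3.0, 4.0])
np.percentile(arr, 0), np.percentile(arr, 50), np.percentile(arr, 100)
#returns (1.0, 2.5, 4.0)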
#technical: plots empirical distribution (step-like function)
def plt_empirical_dstr(x, clr=None, label='Empirical distribution function'):
    frac_before = [i / len(x) for i in range(len(x))]
    frac_after = [(i+1) / len(x) for i in range(len(x))]
    for i in range(len(x)):
        #vertical dashed segment: the jump of size 1/len(x) at the observation x[i]
        plt.plot([x[i], x[i]], [frac_before[i], frac_after[i]], linestyle='--',
                 color=clr)
        if i == 0 and clr is not None:
            plt.plot([x[i], x[i+1]], [frac_after[i], frac_after[i]], color=clr, label=label)
        elif i < len(x)-1 and clr is not None:
            plt.plot([x[i], x[i+1]], [frac_after[i], frac_after[i]], color=clr)
        elif i == 0:
            plt.plot([x[i], x[i+1]], [frac_after[i], frac_after[i]], label=label)
        elif i < len(x)-1:
            plt.plot([x[i], x[i+1]], [frac_after[i], frac_after[i]])
#empirical distribution and linear approximation to it with N = 8 points
import numpy as np
import random
import matplotlib.pyplot as plt
random.seed(1)
N = 8
x = []
for _ in range(N):
    x.append(random.uniform(0, 2)) #N = 8 observations from U(0, 2)
x = np.array(sorted(x))
num_percentiles = 100
t = []
val = []
for uni in range(num_percentiles):
    t.append(uni / num_percentiles)   #quantile level in [0, 1)
    val.append(np.percentile(x, uni)) #the corresponding percentile of the sample
fig, ax = plt.subplots(1, 1)
ax.plot(val, t, label='Linear approximation')
plt_empirical_dstr(x, clr='orange')
ax.legend()
ax.set_xlabel('observed values of the random variable')
ax.set_ylabel('cdf $F$')
ax.set_title('${:d}$-value sample'.format(N))
plt.show()
u = [100*random.random() for _ in range(len(x))] #random percentile levels in [0, 100)
z = np.percentile(x, u) #a new sample generated via the linear approximation of the inverse cdf
import numpy as np
a = 5.0
type(a) == int #False: a is a float; checks of this kind are used for error processing below
#function returns the cdf built with the normal kernel defined above, supported on [min(x)-delta, max(x)+delta]
#default bandwidth: s = 0.3*np.std(x) / ((len(x))**0.2)
#error processing included
#code generates pairs: a value of the random variable and the cdf at this point
#values:
# between min(x) and max(x) with a uniform step, where the kernel formula is used
# two additional values: one at the left, where cdf = 0, and one at the right, where cdf = 1
from scipy.stats import norm
def cdf_kernel_norm(x, s=None, num=100):
    func_name = 'cdf_kernel_norm' #for error messages
    #Error processing
    if s is None:
        s = 0.3*np.std(x) / ((len(x))**0.2) #default bandwidth
    if type(num) != int:
        raise TypeError(f'{func_name}: only integer <num> is allowed; transferred: {num}')
    elif num <= 0:
        raise Exception(f'{func_name}: incorrect number of variables <num> = {num}')
    if x is None or len(x) == 0:
        raise Exception(f'{func_name}: incorrect input array {x}')
    #----End of error processing
    # The simplest way to define the values is z = np.linspace(min(x), max(x), num)
    # We additionally define min(x) - delta, where cdf = 0, and max(x) + delta, where cdf = 1
    # normally delta = x[i+1] - x[i], but a special definition is needed if len(x) == 1
    elif len(x) == 1 and x[0] != 0:
        delta = abs(x[0]) / 2
        z = np.array([x[0]-delta, x[0], x[0]+delta]) #values at which cdf is defined
    elif len(x) == 1:
        delta = 1
        z = np.array([x[0]-delta, x[0], x[0]+delta]) #values at which cdf is defined
    else:
        delta = (np.max(x) - np.min(x)) / (len(x) - 1)
        z = np.linspace(min(x)-delta, max(x)+delta, num+2) #values at which cdf is defined
    cdf = []
    cdf.append(0) #cdf = 0 at the left end of the grid
    for zz in z[1:-1]:
        arr = [norm.cdf((zz-xx) / s) for xx in x] #kernel contribution of each observation
        cdf.append(np.mean(arr)) #the value of cdf at the point zz
    cdf.append(1) #cdf = 1 at the right end of the grid
    return z, cdf
x = []
N = 8
random.seed(1)
for _ in range(N):
    x.append(random.uniform(0, 2))
x = np.array(sorted(x))
fig, ax = plt.subplots(1, 1)
s_param = [0.01, 0.1, 1]
for s in s_param:
    z, cdf = cdf_kernel_norm(x, s)
    plt.plot(z, cdf, label=f'$s = {s}$')
ax.set_ylabel('cdf $F$')
ax.set_title('Role of the standard deviation')
ax.legend()
plt.show()
fig, ax = plt.subplots(1, 1)
plt_empirical_dstr(x)
z, cdf = cdf_kernel_norm(x) #kernel approximation with the default bandwidth
plt.plot(z, cdf, label='Kernel approximation')
ax.set_xlabel('observed values of the random variable')
ax.set_ylabel('cdf $F$')
ax.set_title('${:d}$-value sample'.format(N))
ax.legend()
plt.show()
from scipy.interpolate import interp1d
import random
def plt_kernel_appr(x, s=None, numpoints=100, label='kernel'):
    if s is None:
        s = 0.3*np.std(x) / ((len(x))**0.2) #the same default bandwidth as above
    z, cdf = cdf_kernel_norm(x, s)
    ppf_str = interp1d(cdf, z, kind='linear') #inverse of the kernel cdf
    tt = np.linspace(min(cdf), max(cdf), numpoints)
    ax.plot(ppf_str(tt), tt, label=label)
    return ppf_str #return the interpolator so the caller can evaluate it
fig, ax = plt.subplots(1, 1)
plt_empirical_dstr(x)
random.seed(1)
v_tmp = random.uniform(min(cdf), max(cdf)) #random quantile inside the cdf range
ppf_kernel = plt_kernel_appr(x)
ax.plot(ppf_kernel(v_tmp), v_tmp, 's', markersize=12)
ax.legend()
def cdf_percentile_appr(x, num_percentiles=100):
    t = []
    val = []
    for uni in range(num_percentiles):
        t.append(uni / num_percentiles)
        val.append(np.percentile(x, uni))
    return val, t
random.seed(1)
fig, ax = plt.subplots(1, 1)
plt_kernel_appr(x, label='Kernel approximation')
plt_empirical_dstr(x)
val, t = cdf_percentile_appr(x)
ax.plot(val, t, label='Linear approximation', color='red')
ax.legend()
plt.show()
$\newcommand{\Exp}{\mathbf{E}\,}$
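The estimate printed below is bias-corrected in the standard bootstrap way: with $\hat\theta = \max_i x_i$ and $\hat\theta^{*}$ denoting the maximum of a bootstrap resample, the bias of $\hat\theta$ is estimated by $\Exp \hat\theta^{*} - \hat\theta$ (in the code, the expectation is replaced by the average over the $N$ bootstrap replications), so the corrected estimate is $\hat\theta - (\Exp \hat\theta^{*} - \hat\theta) = 2\hat\theta - \Exp \hat\theta^{*}$.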
#Example with $n = 15$, $N = 1000$, $\theta = 2$
import numpy as np
import random
random.seed(1)
n = 15
N = 1000
theta = 2
x = []
for _ in range(n):
    x.append(random.uniform(0, theta)) #sample of size n from U(0, theta)
#data is sampled; theta is a secret knowledge
theta_mle = max(x)
thetas_bootstrap = []
for _ in range(N):
    x_tmp = random.choices(x, k=len(x))
    thetas_bootstrap.append(np.max(x_tmp))
theta_corrected = 2*theta_mle - np.mean(thetas_bootstrap)
print('True value = {:.6f}; initial estimate = {:.6f}; corrected estimate = {:.6f}'.format(theta,
theta_mle, theta_corrected))
Let $X_1, \ldots, X_n$ be distinct observations (no ties). Show that there are $\binom{2n-1}{n}$ distinct bootstrap samples. Note that this number is asymptotically $(n\pi)^{-1/2}\, 2^{2n-1}$.
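A brute-force check of this count for small $n$ (a sketch: distinct bootstrap samples are enumerated as multisets of size $n$ drawn from the $n$ observations):
#Distinct bootstrap samples are multisets of size n from n items
from itertools import combinations_with_replacement
from math import comb
for n in range(1, 7):
    count = sum(1 for _ in combinations_with_replacement(range(n), n))
    print(n, count, comb(2*n - 1, n)) #the two numbers coincide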