ELEC-E7890 - User Research
Lecture 3 - Inference
Aurélien Nioche
Aalto University
# Import the libraries
import pandas as pd
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import scipy
import scipy.stats as stats
import string # For adding letters in the figures
import scipy.special # For gamma function
from statsmodels.formula.api import ols # For regression analysis
from statsmodels.stats.anova import anova_lm # For ANOVA
%config InlineBackend.figure_format='retina' # For not burning your eyes
sns.set_context("notebook")
np.set_printoptions(threshold=8) # Don't print to much elements when printing arrays
Let's re-use the data from the last lecture...
# Seed the random number generator
np.random.seed(4)
# Set the parameters
mu_A = 150.0
mu_B = 200.0
small_sd = 10.0
large_sd = 50.0
n = 100
# Create the samples
xA_small_sd = np.random.normal(mu_A, scale=small_sd, size=n)
xB_small_sd = np.random.normal(mu_B, scale=small_sd, size=n)
dataset_small_sd = pd.DataFrame({"xA": xA_small_sd, "xB": xB_small_sd})
dataset_small_sd
xA_large_sd = np.random.normal(mu_A, scale=large_sd, size=n)
xB_large_sd = np.random.normal(mu_B, scale=large_sd, size=n)
dataset_large_sd = pd.DataFrame({"xA": xA_large_sd, "xB": xB_large_sd})
dataset_large_sd
# Create figure and axes
fig, axes = plt.subplots(ncols=2, nrows=2, figsize=(16, 9))
# For each dataset (containing each two samples)
datasets = dataset_large_sd, dataset_small_sd
for i in range(len(datasets)):
# Get data
df = datasets[i]
# Create histograms
ax = axes[i, 0]
sns.histplot(data=df, ax=ax, kde=False, element="step")
# Plot the theoretical mean
ax.axvline(mu_A, ls='--', color='black', alpha=0.1, lw=2)
ax.axvline(mu_B, ls='--', color='black', alpha=0.1, lw=2)
# Set the axis lables
ax.set_ylabel("Proportion")
ax.set_xlabel("value")
# Create a barplot
ax = axes[i, 1]
df = df.melt()
sns.barplot(x="variable", y="value", ax=ax, data=df, ci="sd")
# Add horizontal lines representing the means
ax.axhline(mu_A, ls='--', color='black', alpha=0.1, lw=2)
ax.axhline(mu_B, ls='--', color='black', alpha=0.1, lw=2)
# Set the y limits
ax.set_ylim(0, max(mu_A, mu_B) + large_sd * 1.25)
plt.tight_layout()
plt.show()
Note that we assume here that both $x_A$ and $x_B$ are normally distributed, that is that the data come from a normal distribution. More on this later...
Let's see what happens for the dataset with small standard deviation.
# Run a Student's t-test
t, p = stats.ttest_ind(dataset_small_sd.xA, dataset_small_sd.xB)
# Print the results
print(f"t={t}, p={p}")
Here, the p-value is very small ($p=0.000 \dots 1$), what means that it is quite likely that there is a difference in the parent population (and we know that it is indeed the case as we generate the data ourselves).
Let's look of what happens for dataset with large standard deviation:
# Run a Student's t-test
t, p = stats.ttest_ind(dataset_large_sd.xA, dataset_large_sd.xB)
# Print the results
print(f"t={t}, p={p}")
Both p-values are very small, indicating that the probability to make a mistake by rejecting the null-hypothesis is small.
It turns out that the $n$ is so large, that the difference between the two means is also statistically significant in the dataset with large variance. Inferential statistics are a good tool to know if we could generalize what we observed (SD/variance)!
So let's simulate and see how often I could be wrong...
Let's construct two samples $x_1$ and $x_2$, such that $\bar x_1 > \bar x_2$ (with $\bar x$: the observed mean for $x$), even if $\mu_{x_1} = \mu_{x_2}$ (with $\mu_x$: the theoretical/parent population mean).
# Seed the random number generator
np.random.seed(1234)
# Set the parameters
n = 100
mu = 100
sigma = 30
# Generate two samples
xA = np.random.normal(mu, scale=sigma, size=n)
xB = np.random.normal(mu, scale=sigma, size=n)
if np.mean(xA) > np.mean(xB):
x1 = xA
x2 = xB
else:
x1 = xB
x2 = xA
# Look at the data
df = pd.DataFrame(data={"x1": x1, "x2": x2})
df
print(f"mean x1: {np.mean(df.x1)}")
print(f"mean x2: {np.mean(df.x2)}")
# Create figure and axes
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(16, 4))
# Create the histograms
sns.histplot(data=df, ax=ax1,
element="step",
stat="density") # y-axis is density instead of counts
# Put labels
ax1.set_ylabel("Density")
ax1.set_xlabel("value")
# Plot the barplot
df_melted = df.melt(var_name="group")
sns.barplot(x="group", y="value", ax=ax2, data=df_melted, ci="sd")
plt.tight_layout()
plt.show()
# Run a Student's t-test
t, p = stats.ttest_ind(x1, x2, equal_var=True)
# Print the results
print(f"t = {t}, p = {p}")
Note that as expected, the p-value is really high (suggesting that the probability of making a mistake by rejecting the null-hypothesis is really high). Indeed, even if our samples, we observe that $\bar x_1 > \bar x_2$, the theoretical mean of both samples is the same (by construction).
...by looking at which frequency I observe the inverse difference, that is $\bar x_2 > \bar x_1$ instead of $\bar x_1 > \bar x_2$, creating new datasets using as theoretical means the means observed in the samples, and as theoretical SD the SD observed in the samples.
# Compute the observed means for the two samples
mu1 = np.mean(x1)
mu2 = np.mean(x2)
# Compute the observed standard deviations for the two samples
sigma1 = np.std(x1)
sigma2 = np.std(x2)
# Set the parameters
n_dataset = 10000
# Container for the results
mu2_sup_mu1 = np.zeros(n_dataset)
# Create data for each dataset
for i in range(n_dataset):
# Create two samples using parameters observed in the sample, and of the same size
new_x1 = np.random.normal(mu1, scale=sigma1, size=n)
new_x2 = np.random.normal(mu2, scale=sigma2, size=n)
# Look if observed mean for sample 2
# is superior for sample 1
r = np.mean(new_x2) > np.mean(new_x1)
# Store the result
mu2_sup_mu1[i] = r
# Compute the frequence with which
# 'inverse' difference is observed
err_freq = np.mean(mu2_sup_mu1)
# Create figure and axis
fig, ax = plt.subplots()
# Define labels
labels = [r'$\bar{x_2} > \bar{x_1}$ (freq)', "p-value / 2"]
# Defines values
values = [err_freq, p/2]
# Create barplot
ax.bar(labels, values)
# Set the limit of the y-axis
ax.set_ylim(0, 1.0)
plt.show()
In human/animal related studies (but not only), $p=0.05$ (5% error), what means that a test result can be said significant if the $p$-value is below $0.05$.
Note that it is before all a convention.
Null hypothesis is true | null hypothesis is false | |
---|---|---|
Reject null hypothesis | False positive [Type I error] | true negative |
Do not reject null hypothesis | True positive | False negative [Type II error] |
Sensitivity $= \frac{TP}{TP + FN}$ ("don't miss someone positive")
Specificity = $\frac{TN}{TN + FP}$ ("don't select someone negative")
Accuracy = $\frac{TP + TN}{TP + TN + FP + FN}$ ("do good predictions overall")
Precision = $\frac{TP}{TP + FP}$ ("don't be too inclusive")
Let's take an example...
# Seed the random number generator
np.random.seed(46)
# Set the parameters
mu = 100
sig = 10
n = 20
# Generate two samples
x1 = np.random.normal(mu, scale=sig, size=n)
x2 = np.random.normal(mu, scale=sig, size=n)
# Look at the data
df = pd.DataFrame({"x1": x1, "x2": x2})
df
mean_x1 = np.mean(df.x1)
mean_x2 = np.mean(df.x2)
print(f"sample mean x1: {mean_x1}")
print(f"sample mean x2: {mean_x2}")
Note that the sample means are different even if the population mean is the same (mu=100
).
# Create the fig and axes
fig, axes = plt.subplots(ncols=2, figsize=(16, 4))
# Do the histogram
ax = axes[0]
sns.histplot(df, ax=ax, stat="density", element="step")
# Create the lines indicating the means
ax.axvline(mean_x1, label="sample mean", color='C0', lw=2, ls=':')
ax.axvline(mean_x2, label="sample mean", color='C1', lw=2, ls=':')
# Create the bell curve corresponding to the theoretic distribution
x_min, x_max= ax.get_xlim()
x_th = np.linspace(x_min, x_max, 1000)
y_th = stats.distributions.norm.pdf(x_th, loc=mu, scale=sig)
ax.plot(x_th, y_th, color="black")
# Create the line for the theoretical mean
ymax = stats.distributions.norm.pdf(mu, loc=mu, scale=sig)
ax.vlines(mu, ymin=0, ymax=ymax,
label="theoretical mean", color='black', lw=2, ls='--')
# Set the axis labels
ax.set_ylabel("Density")
ax.set_xlabel("value")
# Create the legend
ax.legend()
# Do the barplot
ax = axes[1]
df_melted = df.melt(var_name="group")
sns.barplot(x="group", y="value", ax=ax, data=df_melted, ci="sd")
plt.tight_layout()
plt.show()
# Set the threshold
thr = 0.05
# Run the Student's t-test
t, p = stats.ttest_ind(x1, x2, equal_var=True)
# Print the results
print(f"t={t}, p={p}, can reject null hypothesis={p < thr}")
Result indicates that we can reject the null-hypothesis (and saying something like 'there is a difference'), while we should not do so (we know by construction that $x_1$ and $x_2$ come from the same distribution, and therefore that the population mean of the two samples is the same): it is a false positive.
Let's take an example...
# Seed the random number generator
np.random.seed(0)
# Set the parameters
m1, m2 = 0, 1
sig = 0.9
n = 20
# Generate two samples
x1 = np.random.normal(m1, scale=sig, size=n)
x2 = np.random.normal(m2, scale=sig, size=n)
# Look at the data
df = pd.DataFrame({"x1": x1, "x2": x2})
df
# Create the figure and axes
fig, axes = plt.subplots(ncols=2, figsize=(16, 4))
ax = axes[0]
# Plot the histogram
sns.histplot(df, ax=ax, stat="density", element="step")
# Plot the sample means
ax.axvline(np.mean(df.x1), label="sample mean", color='C0', lw=2, ls=':')
ax.axvline(np.mean(df.x2), label="sample mean", color='C1', lw=2, ls=':')
# Plot the bell curves
x_min, x_max= ax.get_xlim()
# Plot the theoretic distributions for the two samples
for m, color in (m1, "C0"), (m2, "C1"):
# Plot the line for the mean
ymax = stats.distributions.norm.pdf(m, loc=m, scale=sig)
ax.vlines(m, ymin=0, ymax=ymax,transform=ax.transData,
label="theoretical mean", color=color, lw=2, ls='--')
# Plot the bell curve
x_th = np.linspace(x_min, x_max, 1000)
y_th = stats.distributions.norm.pdf(x_th, loc=m, scale=sig)
ax.plot(x_th, y_th, color=color, label="theoretical distribution", ls='-')
# Create the axis labels
ax.set_ylabel("Density")
ax.set_xlabel("value")
# Create the legend
ax.legend()
# Plot the barplot
ax = axes[1]
df = pd.DataFrame({"A": x1, "B": x2}).melt(var_name="group")
sns.barplot(x="group", y="value", ax=ax, data=df, ci="sd")
# Set the y-axis limits
ax.set_ylim(0, ax.get_ylim()[-1])
plt.tight_layout()
plt.show()
# Set the threshold
thr = 0.05
# Run a Student's t-test
t, p = stats.ttest_ind(x1, x2, equal_var=True)
# Print the result
print(f"t={t}, p={p}, can reject null hypothesis={p < thr}")
Result indicates that we can not reject the null-hypothesis (and saying something like 'there is no significant difference'), while we should do so (we know by construction that $x_1$ and $x_2$ do not come from the same distribution): it is a false negative.
Let's reveal the magic trick behind the p-value...
For the magic to operate, some conditions need to me be met.
Let's come back on this in the next section.
The t statistic can be calculated as follows:
$$ t={\frac {{\bar {X}}_{1}-{\bar {X}}_{2}}{s_{p}{\sqrt {\frac {2}{n}}}}}$$where $n = n_1 = n_2$ and $s_p$ is the pooled standard deviation of the two samples.
Note that:
The pooled standard deviation of the two samples $X_1$ and $X_2$, noted $s_p$, is defined as:
$$ s_{p}={\sqrt {\frac {s_{X_{1}}^{2}+s_{X_{2}}^{2}}{2}}} $$with $s_{X_1}^2$ and $s_{X_2}^2$ are the unbiased estimators of the variances of $X_1$ and $X_2$.
The unbiased estimator of variance for a sample $X$ is defined as:
$${s_{X=[x_1, ..., x_n]}^{2}= {\frac {1}{n-1}}\sum _{i=1}^{n}\left(x_{i}-{\overline {X}}\right)^{2}}$$The bias of an estimator is the difference between this estimator's expected value and the true value of the parameter being estimated. (you can look on the Wikipedia page on this).
We divide by $n-1$ (instead of $n$) when computing the unbiased estimator of variance, because $n-1$ is the number of degrees of freedom (short intuition below; you can also watch this short video on Khan Academy).
What are degrees of freedom?
We know that the the sum of deviations around any mean must equal zero.
Therefore for any variable, if the mean is known and the first $n-1$ observations are known, then the $n^{th}$ observation must be the value that causes the sum of all the deviations around the mean to equal zero.
$n-1$ is called the degrees of freedom because the first $n-1$ observations have the freedom to be any value they want to be, but the $n^{th}$ value has no freedom. It must be whatever value that forces the sum of deviations around the mean to equal zero.
From a discussion on Khan Academy.
Let's take an example...
Let's generate two samples such that $\bar x_1$ < $\bar x_2$
# Set the random number generator
np.random.seed(1234)
# Set the parameters
n=20
mu1, mu2 = 100, 115
sigma = 30
# Generate two samples
x1 = np.random.normal(mu1, scale=sigma, size=n)
x2 = np.random.normal(mu2, scale=sigma, size=n)
# Make a few prints
df = pd.DataFrame({"x1": x1, "x2": x2})
df
print(f"mean x1: {df.x1.mean()}")
print(f"mean x2: {df.x2.mean()}")
# Create the figure and the axes
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(16, 4))
# Plot the histogram
sns.histplot(df, ax=ax1,
stat="density", # y-axis is density instead of counts
element="step") # better visualization
# Set the axis labels
ax1.set_ylabel("Density")
ax1.set_xlabel("value")
# Plot the barplot
df_melted = df.melt(var_name="group")
sns.barplot(x="group", y="value", ax=ax2, data=df_melted, ci="sd")
plt.tight_layout()
plt.show()
# Set the threshold
thr = 0.05
# Run a Student's t-test
t, p = stats.ttest_ind(x1, x2, equal_var=True)
# Print the result
print(f"t={t}, p={p}, can reject={p < thr}")
Let's compute it by hand:
[...]
with $s_{X_1}^2$ and $s_{X_2}^2$ are the unbiased estimators of the variances of $X_1$ and $X_2$.
The unbiased estimator of variance for a sample $X$ is defined as:
$${s_{X=[x_1, ..., x_n]}^{2}= {\frac {1}{n-1}}\sum _{i=1}^{n}\left(x_{i}-{\overline {X}}\right)^{2}}$$[...]
# Compute the unbiased estimators of the variance
var1 = np.var(x1, ddof=1)
var2 = np.var(x2, ddof=1)
# Make a few prints
print("var1", var1)
print("var2", var2)
[...]
The pooled standard deviation of the two samples $X_1$ and $X_2$, noted $s_p$, is defined as:
$${\displaystyle s_{p}={\sqrt {\frac {s_{X_{1}}^{2}+s_{X_{2}}^{2}}{2}}}.}$$[...]
# Compute the pooled standard deviation of the two samples
sp = np.sqrt((var1 + var2)/2)
# Print the result
print("sp", sp)
Now, we have everything we need to compute the t-test using the formula:
$$ t={\frac {{\bar {X}}_{1}-{\bar {X}}_{2}}{s_{p}{\sqrt {\frac {2}{n}}}}}$$[...]
# Compute the denominator
denom = sp * np.sqrt(2 / n)
# Compute the numerator
mean1 = np.mean(x1)
mean2 = np.mean(x2)
num = mean1 - mean2
# Compute the value of the t
t = num/denom
# Print the result
print("t", t)
# Compute the value of the t
t = num/denom
# Print the result
print("t", t)
# Compare with the values used using Scipy
t, p = stats.ttest_ind(x1, x2, equal_var=True)
print("t", t)
First thing that we need to compute the $p$-value is to look at the probability density function of $t$
The probability density function (PDF) of the Student's t distribution is given by: $$f(x) = \frac{\Gamma \left(\frac{\nu+1}{2} \right)} {\sqrt{\nu\pi}\,\Gamma \left(\frac{\nu}{2} \right)} \left(1+\frac{x^2}{\nu} \right)^{-\frac{\nu+1}{2}}$$ where $\nu$ is the number of degrees of freedom for the $t$-statistic calculation (see below) and $\Gamma (\cdot)$ is the gamma function.
The degrees of freedom is equal $2n − 2$ where n is the number of participants in each group (to preserve the mean, each sample has $n-1$ degrees of freedom, so considering the two samples at once, we have $2(n-1) = 2 n - 2$).
def t_pdf(x, degree_freedom):
"""
This is equivalent to scipy.stats.t.pdf(x, df=degree_freedom)
"""
# Shortcuts
g = scipy.special.gamma
nu = degree_freedom
# Compute both left and right term
left = g((nu+1)/2) / (np.sqrt(nu * np.pi)*g(nu/2))
right = (1+x**2/nu) ** (-(nu+1)/2)
# And multiply it
result = left*right
return result
Computing the number of degrees of freedom is quite straightforward...
# Compute the number of degrees of freedom
degree_freedom = 2*n - 2
# Print the result
print("degrees of freedom:", degree_freedom)
Now, let's represent the probability density function of the t-distribution, given our specific number of degrees of freedom...
# Create the figure and the axis
fig, ax = plt.subplots(figsize=(12, 6))
# Plot the bell curve
x = np.linspace(-12, 12, 1000)
# Note that this can be shortened into 'y = stats.t.pdf(x, df=df)'
y = t_pdf(x, degree_freedom)
ax.plot(x, y, label="$t$-distribution PDF")
# Fill the area under the curve on the right of the t-value with blue
x_blue = np.linspace(t, 12, 1000)
y_blue = t_pdf(x_blue, degree_freedom)
ax.fill_between(x_blue, 0, y_blue, color="C0", alpha=0.1)
# Draw the line for representing the t-value
ax.axvline(t, ls='--', color="red", label="t-statistic when comparing $x_1$ and $x_2$")
# Fill the area under the curve on the left of the t-value with red
x_red = np.linspace(-12, t , 1000)
y_red = t_pdf(x_red, degree_freedom)
ax.fill_between(x_red, 0, y_red, color="red", label="Area under the curve = p-value / 2")
# Set the axis labels
ax.set_xlabel(r"$t$")
ax.set_ylabel("Density")
ax.legend()
plt.show()
The p-value one sided correspond to the area under the curve after the t-value.
For computing the area under the curve, we need the primitive (antiderivative) of the PDF, that is to say the cumulative distribution function (CDF)...
The cumulative distribution function is given by: $$ F(x) = \frac{1}{2} + x \Gamma \left( \frac{\nu+1}{2} \right) \times \frac{\,_2F_1 \left ( \frac{1}{2},\frac{\nu+1}{2};\frac{3}{2}; -\frac{x^2}{\nu} \right)} {\sqrt{\pi\nu}\,\Gamma \left(\frac{\nu}{2}\right)}$$ where $_2F_1$ is the hypergeometric function.
Let's skip the implementation for this one! Let's use instead SciPy (scipy.stats.t.cdf
)!
Let's represent the cumulative distribution function...
# Create the figure and the axes
fig, ax = plt.subplots(figsize=(12, 6))
# Plot the cumulative distribution functiion
x = np.linspace(-12, 12, 1000)
y1 = stats.t.cdf(x, degree_freedom)
ax.plot(x, y1, label="$t$-distribution CDF", color="C4")
# # Plot the survival function (1 - CDF)
# ax.plot(x, 1 - y1, label="1 - CDF", color="C1")
# Plot the line representing the t-value
ax.axvline(t, ls='--', color="red", label="t-statistic when comparing $x_1$ and $x_2$")
ax.axhline(stats.distributions.t.cdf(t, degree_freedom),
ls='-', color="red", label="p-value / 2")
# Set the axis labels
ax.set_xlabel("t")
ax.set_ylabel("value")
# Create the legend
ax.legend()
plt.show()
Now, let's compute the p-value.
It is finally quite easy now, we just need to look at CDF for the specific $t$-statistic that we obtained when comparing our two samples, and multiply by 2 (why to multiply by 2? we reply to this question just after)
# Compute the p-value
half_p = stats.distributions.t.cdf(t, degree_freedom)
p = half_p * 2
# Print the result
print("p", p)
# Compute the p-value using Scipy and compare...
t, p = stats.ttest_ind(x1, x2)
print("p", p)
For further explanation on the computation of the p-value, you can refer to: Krzywinski, M., Altman, N. Significance, P values and t-tests. Nat Methods 10, 1041–1042 (2013).
We talk about 'one-sided' when we consider the area under the PDF curve on one side, and about 'two-sided' because you take the value under the curve on both sides.
Let's represent that graphically...
# Create figure and axes
fig, ax = plt.subplots(figsize=(12, 6))
# Define x-limits
min_x, max_x = -12, 12
# Plot the bell curve
x = np.linspace(min_x, max_x, 1000)
y = stats.t.pdf(x, df=degree_freedom) # Now, using the function provided by Scipy
ax.plot(x, y)
# Set the axis labels
ax.set_xlabel(r"$t$")
ax.set_ylabel("Density")
# Fill the area under the curve
for t_ in (t, -t):
if t_ > 0:
x = np.linspace(t_, max_x, 1000)
else:
x = np.linspace(min_x, t_, 1000)
y_f = stats.t.pdf(x, df=degree_freedom)
ax.fill_between(x, 0, y_f, color="red")
# Draw a line corresponding to the t-value
ax.axvline(t_, ls='--', color="red")
plt.show()
Let's the see how the p-value for a given t-value change depending on the number of subjects...
# Set the sample size to use
ns = [5, 10, 15, 20, 50, 100]
# Set the t-values to use
x = np.linspace(1.5, 3, 1000)
# Create the figure
fig, ax = plt.subplots(figsize=(12, 6))
# For each n...
for n in ns:
# Compute the number of degrees of freedom
degree_freedom = 2*n - 2
# Compute the CDF
# Note that contrary to above, we consider here 1 - CDF
# because we consider positive values of the t-statistic.
# You'll get positive values for t if you compare two samples,
# such that the first sample has the highest mean than the second has the lowest mean.
# You'll get negative values for t if you compare two samples,
# such that the first sample has the lowest mean than the second has the highest mean.
y = (1 - stats.distributions.t.cdf(x, degree_freedom)) * 2
# Plot the result
ax.plot(x, y, label=f"n={n}")
# Set the axis labels
ax.set_xlabel("t")
ax.set_ylabel("p-value")
# Create a line for the magic threshold
ax.axhline(0.05, ls='--', color="black", alpha=0.1)
# Create the legend
ax.legend()
plt.show()
The more you will increase your $n$, the more you will increase the probability to have significant results
What we can see in the doc for the t-test's implementation from Scipy:
Let's see with two samples how to test if they meet the requirements for comparing them with a Student's t-test:
# Seed the random number generator
np.random.seed(1285)
# Set the parameters
n = 100
mu1 = 10
mu2 = 20
sig = 10
# Generate the sample
x1 = np.random.normal(mu1, scale=sig, size=n)
x2 = np.random.normal(mu2, scale=sig, size=n)
Testing the size of sample is pretty straightforward...
# Test the equality of sample sizes
is_equal = len(x1) == len(x2)
# Print the result
print("is equal", is_equal)
Let's test the normality of the distribution using a D’Agostino and Pearson’s test. The null hypothesis is that the data are normally distributed.
# Run the D’Agostino and Pearson’s test for sample x1
k, p = stats.normaltest(x1)
# Print the result
print(f"k={k}, p={p}")
# Run the D’Agostino and Pearson’s test for sample x2
k, p = stats.normaltest(x2)
# Print the result
print(f"k={k}, p={p}")
Let's test the normality of the distribution using a Levene test. The null hypothesis is that the all samples are from populations with equal variances.
# Run the Levene test
stat, p = stats.levene(x1, x2)
# Print the result
print(f"k={k}, p={p}")
The Welch's t-test can be computed this way:
$${\displaystyle t={\frac {{\bar {X}}_{1}-{\bar {X}}_{2}}{s_{\bar {\Delta }}}}}$$where
$${\displaystyle s_{\bar {\Delta }}={\sqrt {{\frac {s_{1}^{2}}{n_{1}}}+{\frac {s_{2}^{2}}{n_{2}}}}}.}$$# Seed the random number generator
np.random.seed(4)
# Set the parameters
m1, m2 = 100, 150
sd1, sd2 = 50, 10
n = 100
# Generate two samples
x1 = np.random.normal(m1, scale=sd1, size=n)
x2 = np.random.normal(m2, scale=sd2, size=n)
# Make a few prints
df = pd.DataFrame({"x1": x1, "x2": x2})
df
# Create figure and axes
fig, axes = plt.subplots(ncols=2, figsize=(16, 4))
# Plot the histogram
ax = axes[0]
sns.histplot(data=df, ax=ax, stat="density", element="step")
# Set the axis labels
ax.set_ylabel("density")
ax.set_xlabel("value")
# Plot the barplot
ax = axes[1]
df_melted = df.melt(var_name="sample")
sns.barplot(x="sample", y="value", ax=ax, data=df_melted, ci="sd")
plt.tight_layout()
plt.show()
# Compute a Welch's t-test
# Note that the 'equal-var=False' that will force the use of a Welch's t-test
# (instead of a Student t-test)
t, p = stats.ttest_ind(x1, x2, equal_var=False)
# Print the result
print(f"t={t}, p={p}")
Why that? Because they rely on a technique based on ranking the values: you can learn more about that by looking at how to compute by hand the Mann-Whitney U in the Additional material.
We want data that are not normally distributed. Let's take the example of a gamma distribution (we could have choose any other one, for instance the beta distribution, this is just an example, it doesn't matter which one we choose, it just does have to be NOT normal).
So let's consider data following a Gamma distribution, that is such that: $$ X \sim \mathrm{Gamma} (\alpha ,\beta )$$ with $\alpha, \beta \in \mathbb{R}$, two parameters.
Note: the gamma distribution is something different than the gamma function (even if there is a connection: the PDF of the gamma distribution uses the gamma function).
# Seed the random number generator
np.random.seed(124)
# Set the parameters
# From the doc of 'np.random.gamma': Samples are drawn from a Gamma distribution with specified parameters,
# shape (sometimes designated “k”) and scale (sometimes designated “theta”),
# where both parameters are > 0.
# knowing that shape=alpha and theta = 1/beta
k1, t1 = 1, 10
k2, t2 = 1.3, 14
n = 25
# Generate two samples
x1 = np.random.gamma(k1, scale=t1, size=n)
x2 = np.random.gamma(k2, scale=t2, size=n)
# Put it in a dataframe and display
df = pd.DataFrame({"x1": x1, "x2": x2})
df
# Create the figure and axes
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(16, 6))
# Plot the histogram
sns.histplot(data=df, ax=ax1,
stat="density",
element="step")
# Plot the boxplot
df_melted = df.melt(var_name="sample")
sns.boxplot(x="sample", y="value", ax=ax2, data=df_melted)
plt.show()
# Compute the Mann-Whitney U
# Note: the default option is deprecated
# Remember to use the argument "alternative='two-sided'"
u, p = stats.mannwhitneyu(x1, x2, alternative='two-sided')
# Print the results
print(f"u = {u}, p={p}")
Don't forget to read the doc for the conditions of application!
They are one sections in the Additional material relating to the computation of the Mann-Whitney by hand. You'll see that it is quite different from the way to compute the $t$-test.
...is probably what you're looking for.
The Pearson correlation between $X$ and $Y$ can be expressed as: $$r_{XY}= \frac {\sum \limits _{i=1}^{n}(x_{i}-{\bar {X}})(y_{i}-{\bar {Y}})}{\sqrt {\sum \limits _{i=1}^{n}(x_{i}-{\bar {X}})^{2}\sum \limits _{i=1}^{n}(y_{i}-{\bar {Y}})^{2}}}$$
where $\overline {X}$ and $\overline {Y}$ are the means of $X$ and $Y$, and $s_{x}$ and $s_{y}$ are the corrected standard deviations of $X$ and $Y$.
Original article: Pearson, Karl (1895). "Notes on regression and inheritance in the case of two parents". Proceedings of the Royal Society of London. 58: 240–242.
# Import the data
df = pd.read_csv(os.path.join("data", "rr.csv"))
# Take a look
df
# Create figure and axes
fig, ax = plt.subplots(figsize=(5, 4))
# Plot a scatter plot
sns.scatterplot(x="Debt", y="Growth", ax=ax, data=df)
plt.show()
# Shortcut for our two columns of interest
debt, growth = df["Debt"], df["Growth"]
# Compare it using Scipy
r_, p = stats.pearsonr(debt, growth)
print(f"r={r_}, p={p}")
Based on this results, I could conclude that there is a negative correlation between growth and debt, and that this correlation is statistically significant. However, this conclusion will be invalid. Let's see why.
Don't forget to read the doc for knowing of the conditions of application!
We read that:
The calculation of the p-value relies on the assumption that each dataset is normally distributed.
So, are my conclusions for the growth-debt example valid?
...we need to check the normality of the data.
First thing that we can do is to look at the distribution of each sample.
Note: In a real context of application, we should have done that at first!
# Create figure and axes
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(16, 6))
# Plot the left histogram
sns.histplot(ax=ax1, x=debt, label="debt",
stat="density", linewidth=0, alpha=0.5)
# Plot the right histogram
sns.histplot(ax=ax2, x=growth, label="growth",
stat="density", linewidth=0, alpha=0.5)
plt.show()
...so are my conclusions valid?
No, they don't seem to be, as the distribution of debt is clearly not normal!
To confirm our observations, let's test the normality of the distribution of both samples (growth and debt) using a D’Agostino and Pearson’s test. The null hypothesis is that the data are normally distributed.
# Compute the D’Agostino and Pearson’s test for x
k2, p = stats.normaltest(debt)
# Print the result
print(f"debt: k2={k2}, p={p}")
# Compute the D’Agostino and Pearson’s test for y
k2, p = stats.normaltest(growth)
# Print the result
print(f"growth: k2={k2}, p={p}")
Both indicates that the probability of rejecting the null-hypothesis by error is low: it is unlikely that both samples are normally distributed.
Therefore, my previous conclusions are invalid.
One option is to redo the analysis using a rank-dependent alternative.
r, p = stats.spearmanr(growth, debt)
print(f"Spearman correlation: r={r}, p={p}")
I can know conclude, based on the Spearman correlation test, that there is a negative correlation between growth and debt.
We are looking for a $c$, such that we can 'bound' the mean, and obtain a confidence interval (CI) such that: $$\text{CI} = \big(\bar{X} - c,\bar{X} + c\big)$$
They are two main cases: the standard deviation from the parent population is known, the standard deviation from the parent population is unknown.
...is probably what you're looking for.
The population standard deviation is known (or we have a very large sample, which means we have a reliable estimate of the SD), then:
$$c = z^{*}{\sigma \over {\sqrt {n}}}$$with
$$z^{*}=\Phi ^{-1}\left(1 - \frac{\alpha }{2}\right)$$where $\Phi^{-1}$ is the probit function (inverse CDF of the normal distribution), $\alpha$ is the error level (e.g., 5% for a confidence level at 95%).
Note: This is the most 'classic' formula. However, in practice, it is NOT the formula to use, as, most of the time, we don't know the population standard deviation.
If the population standard deviation is unknown then the Student's $t$ distribution is used as the critical value ($t^*$, see below).
$$c = t^{*}{s \over {\sqrt {n}}}$$with $s$ the (sample) standard deviation, $n$ the number of observation.
This value is dependent on the error level ($\alpha$) for the test and degrees of freedom ($\nu = n − 1$).
$$t^{*}= \text{CDF}_t^{-1}\left(1 - \frac {\alpha }{2}; \nu \right)$$where $\text{CDF}_t^{-1}$ is the inverse of the CDF of $t$.
# Seed the random number generator
np.random.seed(12345)
# Generate a sample
x = np.random.normal(100, scale=30, size=30)
# Print a few values
print("x", x)
# Create the figure
fig, ax = plt.subplots(figsize=(16, 4))
# Plot the histogram
sns.histplot(x, ax=ax, stat="density", alpha=0.3, element="step")
# Set the axis labels
ax.set_ylabel("density")
ax.set_xlabel("value")
plt.tight_layout()
plt.show()
# Set the threshold
alpha = 0.05
# Get the sample size
n = len(x)
# Compute the mean
m = np.mean(x)
# Compute the sd
sd = np.std(x)
# Compute the CDF value at the upper bound
cdf_value_upper_bound = 1 - alpha / 2.
# Compute the number of degrees of freedom
dof = n-1
# Compute t*
# ppf: Percent point function: inverse of the CDF
t_star = stats.t.ppf(cdf_value_upper_bound, dof)
# Compute the size of half of the confidence interval
c = sd/np.sqrt(n) * t_star
# Compute the confidence interval
ci = m-c, m+c
# Print the result
print("CI = ", ci)
# Create figure and axes
fig, ax = plt.subplots(figsize=(14, 4))
# Plot the histogram
sns.histplot(x, ax=ax, color="C0", alpha=0.3, stat="density", element="step")
# Set the axis labels
ax.set_ylabel("density")
ax.set_xlabel("value")
# Plot the vertical lines
ax.axvline(ci[0], ls="--", color="red", lw=3)
ax.axvline(ci[1], ls="--", color="red", lw=3)
ax.axvline(m, color="C0", lw=2, ls=':')
plt.tight_layout()
plt.show()
They are two sections in the Additional material relating to confidence interval. One is to give an intuition about to find $t^*$. Another is about a bad example of application.
One ultra "classic" test that we will not discuss here is the compare the effect of two factors, you can use an ANOVA (ANalysis Of VAriance). It is useful to discuss the interaction effects).
Note that for applying an ANOVA, the following conditions about the parent distribution need to be met:
Normality: The populations from which samples are drawn are normally distributed.
Homogeneity of variances: The populations from which samples are drawn are approximately equal</b>.
They is one section in the Additional material relating to ANOVA.
(See the growth-debt example as a bad example)
If you need to do multiple comparisons, you need to use a correction on your $p$-values such as a Bonferroni correction.
fig, ax = plt.subplots(figsize=(16, 9))
mu = 0
sig = 1
x = np.linspace(-12, 12, 1000)
y1 = stats.norm.pdf(x, loc=mu, scale=sig)
ax.plot(x, y1, label="normal dist", ls="--")
for df in (1, 5, 20):
y2 = stats.t.pdf(x, loc=mu, scale=sig, df=df)
ax.plot(x, y2, label=f"t-dist DoF={df}")
ax.set_ylabel('Density')
plt.legend()
plt.show()
I want to compare the effect of two factors, you can use an ANOVA (ANalysis Of VAriance). It is useful to discuss the interaction effects).
The following conditions about the parent/sample distribution need to be met:
Normality: The population from which samples are drawn should be normally distributed.
Homogeneity of variances: The variance among the groups should be approximately equal</b>.
(Bad) Example (see below why it is bad):
# Import the data
df = pd.read_csv("data/stcp-Rdataset-Diet.csv")
# Look at the top of the file
df
# Print the counts
print(df.count())
# Create a new column with the loss of weight
df["Loss"] = df["pre.weight"] - df["weight6weeks"]
df
# See the different values for the 'gender' column
print(df["gender"].unique())
# Remove the lines when the data is missing
df = df[df["gender"] != ' ']
# Print the new count
print(df.count())
# Represent the raw data
g = sns.catplot(x="Diet", y="Loss", hue="gender",
data=df, height=8)
plt.show()
# Use boxplot to synthesize the information
g = sns.catplot(x="Diet", y="Loss", hue="gender",
data=df, kind="bar", height=8)
plt.show()
# Do an Anova using the statsmodels library
# with the 'loss' as a VD and 'gender' and 'diet' as VI
formula = 'Loss ~ C(Diet) * C(gender)'
model = ols(formula, df).fit()
aov_table = anova_lm(model, typ=2)
# Print the result
print(aov_table)
So, are my conclusions valid?
...we need to check the normality of the data.
Let's test the normality of the distribution using a D’Agostino and Pearson’s test. The null hypothesis is that the data are normally distributed.
# Boolean for selecting the data such as...
# ...gender = 1
is_g1 = df.gender == "1"
# ...diet = 1
is_d1 = df.Diet == 1
# Compute the D'Agostino and Pearson's test
k2, p = stats.normaltest(df.Diet[is_g1 & is_d1])
# Print the result
print(f"k2={k2}, p={p}")
...unfortunately, our sample is too small. Are my conclusions still valid?
Nope! This was a bad example as we can not be sure that the conditions of applications are met. The solution? Using a non-parametric (rank-dependent) test!
We want data that are not normally distributed. Let's take the example of a gamma distribution (we could have choose any other one, for instance the beta distribution, this is just an example, it doesn't matter which one we choose, it just does have to be NOT normal).
So let's consider data following a Gamma distribution, that is such that: $$ X \sim \mathrm{Gamma} (\alpha ,\beta )$$ with $\alpha, \beta \in \mathbb{R}$, two parameters.
Note: the gamma distribution is something different than the gamma function (even if there is a connection: the PDF of the gamma distribution uses the gamma function).
# Seed the random number generator
np.random.seed(124)
# Set the parameters
# From the doc of 'np.random.gamma': Samples are drawn from a Gamma distribution with specified parameters,
# shape (sometimes designated “k”) and scale (sometimes designated “theta”),
# where both parameters are > 0.
# knowing that shape=alpha and theta = 1/beta
k1, t1 = 1, 10
k2, t2 = 1.3, 14
n = 25
# Generate two samples
x1 = np.random.gamma(k1, scale=t1, size=n)
x2 = np.random.gamma(k2, scale=t2, size=n)
# Put it in a dataframe and display
df = pd.DataFrame({"x1": x1, "x2": x2})
df
# Create the figure and axes
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(16, 6))
# Plot the histogram
sns.histplot(data=df, ax=ax1,
stat="density",
element="step")
# Plot the boxplot
df_melted = df.melt(var_name="sample")
sns.boxplot(x="sample", y="value", ax=ax2, data=df_melted)
plt.show()
# Compute the Mann-Whitney U
# Note: the default option is deprecated
# Remember to use the argument "alternative='two-sided'"
u, p = scipy.stats.mannwhitneyu(x1, x2, alternative='two-sided')
# Print the results
print(f"u = {u}, p={p}")
Ex:
# Compute the n
n1 = len(x1)
n2 = len(x2)
# Rank the values
ranked = stats.rankdata(np.concatenate((x1, x2)))
# Print the result
print("ranked values", ranked)
$U$ is then given by:
$$U_{1}=R_{1}-{n_{1}(n_{1}+1) \over 2}\,\!$$where n1 is the sample size for sample 1, and R1 is the sum of the ranks in sample 1.
$$U_{2}=R_{2}-{n_{2}(n_{2}+1) \over 2}\,\!$$# Get the ranks for the sample x1
rank_x1 = ranked[0:n1]
# Compute U for x1
u1 = n1 * n2 + (n1 * (n1 + 1)) / 2.0 - np.sum(rank_x1, axis=0)
# The remainder is U for x2
u2 = n1 * n2 - u1
# Print the result
print(f"u1 = {u1}; u2 = {u2}")
# Compare with what is obtained using Scipy...
u, p = stats.mannwhitneyu(x1, x2, alternative='two-sided')
print(f"u = {u}, p={p}")
For large samples (n>20), U is approximately normally distributed, with
$$\mu = \frac {n_{1}n_{2}}{2}$$$$\sigma^2 = \sqrt {n_1 n_2 (n_1+n_2+1) \over 12}$$if there is a tie, a correction is necessary:
$$\sigma^2_{\text{corr}}={\sqrt {{n_{1}n_{2} \over 12}\left((n+1)-\sum _{i=1}^{k}{{t_{i}}^{3}-t_{i} \over n(n-1)}\right)}}\,$$where $n = n_1 + n_2$, $t_i$ is the number of subjects sharing rank $i$, and $k$ is the number of (distinct) ranks.
# Compute the mean of the U distribution
mean_rank = n1*n2/2.0
# Print the result
print("mean_rank", mean_rank)
# Compute the sd of the U distribution
# Note: we assume that there is not tie here
sd = np.sqrt(n1 * n2 * (n1 + n2 + 1) / 12.0)
# Print the result
print("sd", sd)
# Take the max of U1 and U2
u = max(u1, u2)
# Compute the p-value
p = (1 - stats.norm.cdf(u, loc=mean_rank, scale=sd)) * 2
# Print the result
print("p", p)
# Alternative: compute Z-score first
z = (u - mean_rank) / sd
# Compute the p-value using the Z-score
p = (1 - stats.norm.cdf(z)) * 2
print("p", p)
# Compare with what is obtained using Scipy...
u, p = stats.mannwhitneyu(x1, x2, alternative='two-sided')
print(f"u = {u}, p={p}")
# The small difference disappears with 'use_continuity=False' argument
# Indeed, by default, a correction is made assuming data
# are on a continuous scale
u, p = stats.mannwhitneyu(x1, x2, use_continuity=False,
alternative='two-sided')
# Print the result
print(f"u = {u}, p={p}")
# Import the data
df = pd.read_csv(os.path.join("data", "rr.csv"))
# Take a look
df
# Create figure and axes
fig, ax = plt.subplots(figsize=(5, 4))
# Plot a scatter plot
sns.scatterplot(x="Debt", y="Growth", ax=ax, data=df)
plt.show()
# Shortcut for our two columns of interest
debt, growth = df["Debt"], df["Growth"]
# Compare it using Scipy
r_, p = stats.pearsonr(debt, growth)
print(f"r={r_}, p={p}")
# shortcuts
x = debt
y = growth
# Compute the numerator
r_num = np.sum((x - np.mean(x)) * (y - np.mean(y)))
# Compute the denominator
r_denom = np.sqrt(np.sum((x - np.mean(x))**2))*np.sqrt(np.sum((y - np.mean(y))**2))
# Compute the r-value
r = r_num/r_denom
# Print the result
print("r", r)
For computing a p-value, we assume that the coefficient correlation follows a Student's t-distribution.
We can compute the t-value by using the formula: $$t=r{\sqrt {\frac {\nu}{1-r^{2}}}}$$ with $\nu = n-2$ the number of degrees of freedom.
Then, we can use the the cumulative distribution function of the t-distribution to compute the p-value.
# Get the n
n = len(x)
# Compute the degrees of freedom
dof = n - 2
# Compute the t-value
t = r * np.sqrt(dof/(1 - r**2))
# Compute the p-value
p = 2 * (1 - stats.distributions.t.cdf(np.abs(t), dof))
# Print the result
print("p", p)
# Compare with what is obtained using Scipy
r_, p = stats.pearsonr(x, y)
print(f"p={p}")
Let's come back to the example.
# Seed the random number generator
np.random.seed(12345)
# Generate a sample
x = np.random.normal(100, scale=30, size=30)
# Print a few values
print("x", x)
# Create the figure
fig, ax = plt.subplots(figsize=(16, 4))
# Plot the histogram
sns.histplot(x, ax=ax, stat="density", alpha=0.3, element="step")
# Set the axis labels
ax.set_ylabel("density")
ax.set_xlabel("value")
plt.tight_layout()
plt.show()
# Set the threshold
alpha = 0.05
# Get the sample size
n = len(x)
# Compute the mean
m = np.mean(x)
# Compute the sd
sd = np.std(x)
# Compute the CDF value at the upper bound
cdf_value_upper_bound = 1 - alpha / 2.
# Compute the number of degrees of freedom
dof = n-1
# Compute t*
# ppf: Percent point function: inverse of the CDF
t_star = stats.t.ppf(cdf_value_upper_bound, dof)
# Compute the size of half of the confidence interval
c = sd/np.sqrt(n) * t_star
# Compute the confidence interval
ci = m-c, m+c
# Print the result
print("CI = ", ci)
# Create figure and axes
fig, ax = plt.subplots(figsize=(14, 4))
# Plot the histogram
sns.histplot(x, ax=ax, color="C0", alpha=0.3, stat="density", element="step")
# Set the axis labels
ax.set_ylabel("density")
ax.set_xlabel("value")
# Plot the vertical lines
ax.axvline(ci[0], ls="--", color="red", lw=3)
ax.axvline(ci[1], ls="--", color="red", lw=3)
ax.axvline(m, color="C0", lw=2, ls=':')
plt.tight_layout()
plt.show()
Let see try now to have an intuition about how we find the $t^*$.
Finding $t^*$ consists in finding for which $t$-value the CDF of the $t$ distribution is equal to $1-\alpha/2$. For computing this, we are using the PPF (percent point function) of the $t$ distribution, which is the inverse of the CDF, but we can 'solve this graphically' just looking at the CDF. So here we go:
# Create the figure and axes
fig, axes = plt.subplots(ncols=2, figsize=(14, 6))
# Set the range of values
t_min, t_max = -3, 3
ax = axes[0]
# Plot the CDF on the right panel
y2 = stats.t.cdf(t, df=dof)
ax.plot(t, y2, color="red")
# Plot the position of the upper bound
ax.axvline(t_star, ls='--', color="red", label=r'$t^*$')
#. Plot the position of the y-value
ax.plot((t_min,t_star), (1-alpha/2, 1-alpha/2), color="black", alpha=0.5, lw=2, ls=":",
label=r"$1 - \alpha /2$")
ax.set_yticks([0, 0.5, 1-alpha/2, 1])
# Set the axis labels
ax.set_xlabel("t")
ax.set_ylabel("CDF")
ax.legend()
ax = axes[1]
# Plot the bell curve
t = np.linspace(t_min, t_max, 1000)
y = stats.t.pdf(t, df=dof)
ax.plot(t, y)
# Plot the position fo the upper bounds
ax.axvline(t_star, ls='--', color="red")
# Fill the area under the curve
x_f = np.linspace(t_min, t_star, 1000)
y_f = stats.t.pdf(x_f, df=dof)
ax.fill_between(x_f, 0, y_f, color="red")
# Set the axis labels
ax.set_xlabel("t")
ax.set_ylabel("PDF")
plt.show()
Let's assume that we are looking for an estimation of the average flight delay, with confidence interval at 95%.
Let's consider flight arrival delays. Data come from https://data.world/bob-wakefield/flights
# Import the data
df = pd.read_csv("data/flights.csv")
# Print the top of the file
df
# Clean the data set by removing the nan values
x = df.dropna()["arr_delay"].values
# Create figure and axe
fig, ax = plt.subplots(figsize=(16, 4))
# Plot the histogram
sns.histplot(x="arr_delay", ax=ax, color="C0", linewidth=0,
alpha=0.5, stat="density", data=df)
# Set the axis labels
ax.set_ylabel("density")
ax.set_xlabel("value")
plt.tight_layout()
plt.show()
Zooming a little bit...
# Create figure and axe
fig, ax = plt.subplots(figsize=(16, 4))
# Plot the histogram
sns.histplot(x="arr_delay", ax=ax, color="C0", linewidth=0, alpha=0.5, stat="density", data=df)
# Set the limits of the x-axis
ax.set_xlim(-200, 200)
# Set the axis labels
ax.set_ylabel("density")
ax.set_xlabel("value")
plt.tight_layout()
plt.show()
It doesn't look really like a normal distribution...
Let's check the normality of the data.
For this, we can use a D’Agostino and Pearson’s test. The null hypothesis is that the data are normally distributed.
# Compute the D’Agostino and Pearson’s test for our sample
k2, p = stats.normaltest(x)
# Print the result
print(f"k2={k2}, p={p}")
The probability to reject the null-hypothesis is close to zero: the data are probability not normally distributed.