C Templates

Notes

  • The Google spreadsheets are, of course, read-only. To use a template, log in to your Google account and select File > Make a copy from the menu.

  • In the sheets, I tried to maintain the following color convention:

    • Green indicates areas that need to be changed (e.g., data entry).

    • Yellow highlights the most important results; the formulas in these cells (and in other non-green cells) should not be changed or overwritten.

  • Some cells may contain calculations even though they appear empty. Be cautious and avoid overwriting these cells.

  • Excel templates may not work in Excel 2007 due to the limited availability of statistical functions in that version. Excel 2007 users are encouraged to report which sheets do not work; perhaps we will be able to find a workaround.

The templates are continuously being developed, so any comments, suggestions, or error reports are welcome.

C.1 Bayes' formula

Bayes' formula — Google spreadsheet

Bayes' formula — Excel template
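
All three implementations (spreadsheet, R, Python) evaluate Bayes' formula: for hypotheses $H_1, \dots, H_k$ with priors $P(H_i)$ and likelihoods $P(D \mid H_i)$, the posterior is

$$P(H_i \mid D) = \frac{P(D \mid H_i)\,P(H_i)}{\sum_{j=1}^{k} P(D \mid H_j)\,P(H_j)},$$

i.e., the elementwise product of prior and likelihood, normalized by its sum.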

# Hypotheses: descriptions (optional)
hypotheses <- c("Disease", "No disease")

# Prior
prior <- c(.001, .999)

# Likelihood
likelihood <- c(.95, .01)

# Posterior
posterior <- prior*likelihood
posterior <- posterior/sum(posterior)

# Check
if(length(prior)!=length(likelihood))
{print("Both vectors (prior and likelihood) should be the same length. ")}
if(sum(prior)!=1){
  print("The sum of probabilities in the prior distribution should equal 1.")}
if (!exists("hypotheses") || length(hypotheses)!=length(prior)) {
  hypotheses <- paste0('H', 1:length(prior))
}

# Result as a data frame:
print(data.frame(
  Hypothesis = hypotheses, `Prior` = prior, `Likelihood` = likelihood, `Posterior` = posterior
))
##   Hypothesis Prior Likelihood  Posterior
## 1    Disease 0.001       0.95 0.08683729
## 2 No disease 0.999       0.01 0.91316271
# Chart
library(ggplot2)

hypotheses<-factor(hypotheses, levels=hypotheses)

df <- data.frame(Hypotheses = c(hypotheses, hypotheses),
                 `Distribution` = factor(c(rep("prior", length(prior)), rep("posterior", length(posterior))), 
                                    levels=c("prior", "posterior")
                 ),
                 `Probability` = c(prior, posterior)
)

ggplot(data=df, aes(x=Hypotheses, y=`Probability`, fill=`Distribution`)) +
  geom_bar(stat="identity", position=position_dodge())+
  geom_text(aes(label = format(round(`Probability`,4), nsmall=4), group=`Distribution`), 
            position = position_dodge(width = .9), vjust = -0.2)

# Hypotheses: descriptions (optional)
hypotheses = ["Disease", "No disease"]

# Prior
prior = [.001, .999]

# Likelihood
likelihood = [.95, .01]

# Posterior
posterior = [a*b for a, b in zip(prior, likelihood)]
posterior = [p/sum(posterior) for p in posterior]

# Check
if len(prior) != len(likelihood):
    print("Both vectors (prior and likelihood) should be the same length. ")
if sum(prior) != 1:
    print("The sum of probabilities in the prior distribtuion should be 1.")
if not "hypotheses" in locals() or len(hypotheses) != len(prior):
    hypotheses = ["H" + str(i) for i in range(1, len(prior)+1)]

# Result as a data frame:
import pandas as pd
df = pd.DataFrame({
    "Hypothesis": hypotheses,
    "Prior": prior,
    "Likelihood": likelihood,
    "Posterior": posterior
})
print(df)
##    Hypothesis  Prior  Likelihood  Posterior
## 0     Disease  0.001        0.95   0.086837
## 1  No disease  0.999        0.01   0.913163
import numpy as np
import matplotlib.pyplot as plt

x = np.arange(len(hypotheses)) 
width = 0.375

fig, ax = plt.subplots(layout='constrained')

rects = ax.bar(x-width/2, np.round(prior, 4), width, label = 'prior')
ax.bar_label(rects, padding=3)

rects = ax.bar(x+width/2, np.round(posterior, 4), width, label = 'posterior')
ax.bar_label(rects, padding=3)
ax.set_ylabel('probability')
ax.set_xlabel('hypotheses')
ax.set_xticks(x, hypotheses)
ax.legend(loc='upper left', ncols=2)
#ax.set_ylim(0, np.max([prior, posterior])*1.2)
plt.show()

C.2 Distributions

C.2.1 Discrete random variable

Discrete random variable – calculator — Google spreadsheet

Discrete random variable – calculator — Excel template
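
For reference, the quantities computed below are

$$E(X) = \sum_i x_i P(x_i), \qquad \mathrm{Var}(X) = \sum_i \big(x_i - E(X)\big)^2 P(x_i), \qquad SD(X) = \sqrt{\mathrm{Var}(X)},$$

$$\mathrm{Skew}(X) = \frac{\sum_i \big(x_i - E(X)\big)^3 P(x_i)}{SD(X)^3}, \qquad \mathrm{ExKurt}(X) = \frac{\sum_i \big(x_i - E(X)\big)^4 P(x_i)}{SD(X)^4} - 3.$$

The two-variable version additionally computes $\mathrm{Cov}(X,Y) = \sum_i \big(x_i - E(X)\big)\big(y_i - E(Y)\big) P(x_i, y_i)$ and the correlation $\mathrm{Cov}(X,Y) / \big(SD(X)\,SD(Y)\big)$.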

# Discrete probability distribution of a single variable
x <- c(0, 1, 4)
Px <- c(1/3, 1/3, 1/3)

# Check
if(length(x)!=length(Px))
{print("Both vectors should be of the same length.")}
if(!sum(Px)==1)
{print("The probabilities should add up to 1. ")}
if(any(Px<0))
{print("The probabilities cannot be negative. ")}

# Calculations
EX <- sum(x*Px)
VarX <- sum((x-EX)^2*Px)
SDX <- sqrt(VarX)
SkX <- sum((x-EX)^3*Px)/SDX^3
KurtX <- sum((x-EX)^4*Px)/SDX^4 - 3

print(c('Expected value' = EX, 
        'Variance' = VarX,
        'Standard deviation' = SDX,
        'Skewness' = SkX,
        'Excess kurtosis' = KurtX))
##     Expected value           Variance Standard deviation           Skewness    Excess kurtosis 
##           1.666667           2.888889           1.699673           0.528005          -1.500000
# Joint probability distribution of two discrete random variables
x <- c(2, -1, -1)
y <- c(-1, 1, -1)
Pxy <- c(1/2, 1/3, 1-1/2-1/3) # sum(c(1/2, 1/3, 1/6))==1 may return FALSE for numerical reasons
# Check
if(length(x)!=length(y) || length(x)!=length(Pxy))
{print("The vectors should be of the same length. ")}
if(!sum(Pxy)==1)
{print("The probabilities should add up to 1. ")}
if(any(Pxy<0))
{print("The probabilities cannot be negative. ")}

EX <- sum(x*Pxy)
VarX <- sum((x-EX)^2*Pxy)
SDX <- sqrt(VarX)
SkX <- sum((x-EX)^3*Pxy)/SDX^3
KurtX <- sum((x-EX)^4*Pxy)/SDX^4 - 3

EY <- sum(y*Pxy)
VarY <- sum((y-EY)^2*Pxy)
SDY <- sqrt(VarY)
SkY <- sum((y-EY)^3*Pxy)/SDY^3
KurtY <- sum((y-EY)^4*Pxy)/SDY^4 - 3

CovXY <- sum((x-EX)*(y-EY)*Pxy)
CorXY <- CovXY/(SDX*SDY)

print(c('Expected value of X' = EX, 
        'Variance of X' = VarX,
        'Standard deviation of X' = SDX,
        'Skewness of X' = SkX,
        'Excess kurtosis of X' = KurtX,
        'Expected value of Y' = EY, 
        'Variance of Y' = VarY,
        'Standard deviation of Y' = SDY,
        'Skewness of Y' = SkY,
        'Excess kurtosis of Y' = KurtY,
        'Covariance of X and Y' = CovXY,
        'Correlation of X and Y' = CorXY
))
##     Expected value of X           Variance of X Standard deviation of X           Skewness of X    Excess kurtosis of X 
##            5.000000e-01            2.250000e+00            1.500000e+00           -3.289550e-17           -2.000000e+00 
##     Expected value of Y           Variance of Y Standard deviation of Y           Skewness of Y    Excess kurtosis of Y 
##           -3.333333e-01            8.888889e-01            9.428090e-01            7.071068e-01           -1.500000e+00 
##   Covariance of X and Y  Correlation of X and Y 
##           -1.000000e+00           -7.071068e-01
# Discrete probability distribution of a single variable
x = [0, 1, 4]
Px = [1/3, 1/3, 1/3]

# Check
if len(x) != len(Px):
    print("Both vectors should be of the same length.")
if sum(Px) != 1:
    print("The probabilities should add up to 1. ")
if any(p < 0 for p in Px):
    print("The probabilities cannot be negative.")

# Calculations
EX = sum([a*b for a, b in zip(x, Px)])
VarX = sum([(a-EX)**2*b for a, b in zip(x, Px)])
SDX = VarX**0.5
SkX = sum([(a-EX)**3*b for a, b in zip(x, Px)]) / SDX**3
KurtX = sum([(a-EX)**4*b for a, b in zip(x, Px)]) / SDX**4 - 3

# Results
print({'Expected value': EX,
       'Variance': VarX,
       'Standard deviation': SDX,
       'Skewness': SkX,
       'Excess kurtosis': KurtX})
## {'Expected value': 1.6666666666666665, 'Variance': 2.888888888888889, 'Standard deviation': 1.699673171197595, 'Skewness': 0.5280049792181879, 'Excess kurtosis': -1.5000000000000002}
# Joint probability distribution of two discrete random variables
x = [2, -1, -1]
y = [-1, 1, -1]
Pxy = [1/2, 1/3, 1-1/2-1/3]

# Check
if len(x) != len(y) or len(x) != len(Pxy):
    print("The vectors should be of the same length. ")
if sum(Pxy) != 1:
    print("The probabilities should add up to 1. ")
if any(p < 0 for p in Pxy):
    print("The probabilities cannot be negative. ")

# Calculations
EX = sum([a*b for a, b in zip(x, Pxy)])
VarX = sum([(a-EX)**2*b for a, b in zip(x, Pxy)])
SDX = VarX**0.5
SkX = sum([(a-EX)**3*b for a, b in zip(x, Pxy)]) / SDX**3
KurtX = sum([(a-EX)**4*b for a, b in zip(x, Pxy)]) / SDX**4 - 3

EY = sum([a*b for a, b in zip(y, Pxy)])
VarY = sum([(a-EY)**2*b for a, b in zip(y, Pxy)])
SDY = VarY**0.5
SkY = sum([(a-EY)**3*b for a, b in zip(y, Pxy)]) / SDY**3
KurtY = sum([(a-EY)**4*b for a, b in zip(y, Pxy)]) / SDY**4 - 3

CovXY = sum([(a-EX)*(b-EY)*c for a, b, c in zip(x, y, Pxy)])
CorXY = CovXY / (SDX*SDY)

# Results
print({'Expected value of X': EX,
       'Variance of X': VarX,
       'Standard deviation of X': SDX,
       'Skewness of X': SkX,
       'Excess kurtosis of X': KurtX,
       'Expected value of Y': EY,
       'Variance of Y': VarY,
       'Standard deviation of Y': SDY,
       'Skewness of Y': SkY,
       'Excess kurtosis of Y': KurtY,
       'Covariance of X and Y': CovXY,
       'Correlation of X and Y': CorXY
})
## {'Expected value of X': 0.5, 'Variance of X': 2.25, 'Standard deviation of X': 1.5, 'Skewness of X': -3.289549702593056e-17, 'Excess kurtosis of X': -2.0, 'Expected value of Y': -0.33333333333333337, 'Variance of Y': 0.888888888888889, 'Standard deviation of Y': 0.9428090415820634, 'Skewness of Y': 0.7071067811865478, 'Excess kurtosis of Y': -1.4999999999999991, 'Covariance of X and Y': -0.9999999999999998, 'Correlation of X and Y': -0.7071067811865475}

C.2.2 Parametric discrete distributions

Discrete distributions calculator — Google spreadsheet

Discrete distributions calculator — Excel template
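
All three calculators use the same identity for an integer-valued random variable: the probability of a closed range is a difference of CDF values,

$$P(\mathit{from} \le X \le \mathit{to}) = F(\mathit{to}) - F(\mathit{from} - 1),$$

which is why the code evaluates the CDF at $\mathit{from} - 1$ rather than at $\mathit{from}$.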

# Binomial distribution
n <- 18
p <- 0.6
from <- 12
to <- 14

result <- pbinom(to, n, p)-pbinom(from-1, n, p)
if (from > to) {
  # error from > to
  print("!!! 'From' cannot be greater than 'to' !!!")
} else {
  p=paste0("P(", from, " <= X <= ", to, ")")
  print(p)
  print(result)
}
## [1] "P(12 <= X <= 14)"
## [1] 0.3414956
# Poisson
lambda <- 5/3
from <- 2
to <- Inf

result <- ppois(to, lambda)-ppois(from-1, lambda)
if (from > to) {
  # error from > to
  print("!!! 'From' cannot be greater than 'to' !!!")
} else {
  p=paste0("P(", from, " <= X <= ", to, ")")
  print(p)
  print(result)
}
## [1] "P(2 <= X <= Inf)"
## [1] 0.4963317
# Hypergeometric distribution
N <- 49
r <- 6
n <- 6
from <- 3
to <- 6

result <- phyper(to, r, N-r, n)-phyper(from-1, r, N-r, n)
if (from > to) {
  # error from > to
  print("!!! 'From' cannot be greater than 'to' !!!")
} else {
  p=paste0("P(", from, " <= X <= ", to, ")")
  print(p)
  print(result)
}
## [1] "P(3 <= X <= 6)"
## [1] 0.01863755
from scipy.stats import binom, poisson, hypergeom

# Binomial distribution
n = 18
p = 0.6
_from = 12
_to = 14
result = binom.cdf(_to, n, p) - binom.cdf(_from-1, n, p)
if _from > _to:
    print("!!! 'From' cannot be greater than 'to' !!!")
else:
    p = "P(" + str(_from) + " <= X <= " + str(_to) + ")"
    print(p)
    print(result)
## P(12 <= X <= 14)
## 0.34149556326865305
    
# Poisson distribution
lambda_val = 5/3
from_val = 2
to_val = float('inf')

result = poisson.cdf(to_val, lambda_val) - poisson.cdf(from_val-1, lambda_val)

if from_val > to_val:
    print("!!! 'From' cannot be greater than 'to' !!!")
else:
    p = "P(" + str(from_val) + " <= X <= " + str(to_val) + ")"
    print(p)
    print(result)
## P(2 <= X <= inf)
## 0.49633172576650164
    
# Hypergeometric distribution
N = 49
r = 6
n = 6
_from = 3
_to = 6

result = hypergeom.cdf(_to, N, r, n) - hypergeom.cdf(_from-1, N, r, n)

if _from > _to:
    print("!!! 'From' cannot be greater than 'to' !!!")
else:
    p = "P(" + str(_from) + " <= X <= " + str(_to) + ")"
    print(p)
    print(result) 
## P(3 <= X <= 6)
## 0.018637545002022304

C.2.3 Continuous distributions

Normal distribution calculator — Google spreadsheet

Normal distribution calculator — Excel template
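
In part 1, each calculator computes an interval probability as a difference of CDF values, $P(a < X < b) = F(b) - F(a)$. Part 2 inverts this with the quantile function; for example, in the left-tail case the cutoff for a normal variable is $x = m + \sigma\,\Phi^{-1}(P)$, where $\Phi^{-1}$ is the standard normal quantile function (qnorm in R, norm.ppf in SciPy).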

##### 1. Area under PDF #####
# Gaussian distribution parameters:
# mean:
m <- 0
# standard deviation:
sd <- 2

# Calculating the area under PDF
# from
# *for minus infinity use from <- -Inf
from <- -Inf
# to
# *for plus infinity use to <- Inf
to <- 2

# Data validation, calculation
if (from > to) {
  # from > to error
  print("!!! _From_ cannot be greater than _to_ !!!")
} else {
  
  # Probability notation
  
  if (to==Inf) {
    p=paste0("P(X>", from, ")")
  } else if (from==-Inf) {
    p=paste0("P(X<", to, ")")
  } else {
    p=paste0("P(", from, "<X<", to, ")")
  }
  print(p)
  
  # Calculating the area for the given segment:
  result<-pnorm(to, m, sd)-pnorm(from, m, sd)
  print(result)
}
## [1] "P(X<2)"
## [1] 0.8413447
# Plot
library(ggplot2)

x1=if(from==-Inf){min(-4*sd+m, to-2*sd)} else {min(from-2*sd, -4*sd+m)}
x2=if(to==Inf){max(4*sd+m, from+2*sd)} else {max(to+2*sd, 4*sd+m)}

df<-data.frame(y=c(0, 0), 
               x=c(if(from==-Inf){NA}else{from}, if(to==Inf){NA}else{to}),
               label=c(if(from==-Inf){NA}else{from}, if(to==Inf){NA}else{to}))

plt<-ggplot(NULL, aes(c(x1, x2))) +
  theme_minimal() +
  xlab('') +
  ylab('') +
  geom_area(stat = "function", 
            fun = function(x){dnorm(x, m, sd)}, 
            fill = "orange", 
            xlim = c(if(from==-Inf){x1}else{from}, if(to==Inf){x2}else{to})) +
  geom_line(stat = "function", fun = function(x){dnorm(x, m, sd)}, col = "blue", lty=2, lwd=1) +
  scale_x_continuous(breaks=c(m, m-sd, m-2*sd, m+sd, m+2*sd, m-3*sd, m+3*sd, m-4*sd, m+4*sd)) +
  geom_point(data = df, aes(x=x, y=y), shape=4) +
  geom_text(data = df, aes(x=x, y=y, label=signif(label, 6)), vjust=1.4) +
  annotate("text", label = paste0("M = ", m, "\nSD = ", sd, "\n", p, " = ", signif(result,6)), 
           x = x1, y = dnorm(m, m, sd)*1.2, size = 6, hjust="inward", vjust = "inward")

suppressWarnings(print(plt))

##### 2. Find x #####
# Gaussian distribution parameters:
# mean:
m <- 1
# standard deviation:
sd <- 3

# The area under PDF:
P <- 0.95

# 'L' - left, 'R' - right, 'S' - symmetrical (middle)
typ <- 'L'

# Calculations
from <- -qnorm(if(typ=='L'){1} else if(typ=='R'){P} else {1-(1-P)/2})*sd+m
to <- qnorm(if(typ=='L'){P} else if(typ=='R'){1} else {1-(1-P)/2})*sd+m

# P notation
if (to==Inf) {
  p=paste0("P(X>", signif(from, 6), ")")
} else if (from==-Inf) {
  p=paste0("P(X<", signif(to, 6), ")")
} else {
  p=paste0("P(", signif(from, 6), " < X < ", signif(to, 6), ")")
}
print(paste0(p, " = ", P))
## [1] "P(X<5.93456) = 0.95"
# Plot
library(ggplot2)

x1=if(from==-Inf){min(-4*sd+m, to-2*sd)} else {min(from-2*sd, -4*sd+m)}
x2=if(to==Inf){max(4*sd+m, from+2*sd)} else {max(to+2*sd, 4*sd+m)}

df<-data.frame(y=c(0, 0), 
               x=c(if(from==-Inf){NA}else{from}, if(to==Inf){NA}else{to}),
               label=c(if(from==-Inf){NA}else{from}, if(to==Inf){NA}else{to}))

plt<-ggplot(NULL, aes(c(x1, x2))) +
  theme_minimal() +
  xlab('') +
  ylab('') +
  geom_area(stat = "function", 
            fun = function(x){dnorm(x, m, sd)}, 
            fill = "orange", 
            xlim = c(if(from==-Inf){x1}else{from}, if(to==Inf){x2}else{to})) +
  geom_line(stat = "function", fun = function(x){dnorm(x, m, sd)}, col = "blue", lty=2, lwd=1) +
  scale_x_continuous(breaks=c(m, m-sd, m-2*sd, m+sd, m+2*sd, m-3*sd, m+3*sd, m-4*sd, m+4*sd)) +
  geom_point(data = df, aes(x=x, y=y), shape=4) +
  geom_text(data = df, aes(x=x, y=y, label=signif(label, 6)), vjust=1.4) +
  annotate("text", label = paste0("M = ", m, "\nSD = ", sd, "\n", p, " = ", P), 
           x = x1, y = dnorm(m, m, sd)*1.2, size = 6, hjust="inward", vjust = "inward")

suppressWarnings(print(plt))

from scipy.stats import norm
##### 1. Area under PDF #####
# Gaussian distribution parameters:
# mean:
m = 0
# standard deviation:
sd = 2
# Calculating the area under PDF
# from
# *for minus infinity use _from = float('-inf')
_from = float('-inf')
# to
_to = 2

# Data validation, calculation
if _from > _to:
    print("!!! _From_ cannot be greater than _to_ !!!")
else:
    if _to == float('inf'):
        p = "P(X>" + str(_from) + ")"
    elif _from == float('-inf'):
        p = "P(X<" + str(_to) + ")"
    else:
        p = "P(" + str(_from) + "<X<" + str(_to) + ")"
    print(p)

    result = norm.cdf(_to, m, sd) - norm.cdf(_from, m, sd)
    print(result)
## P(X<2)
## 0.8413447460685429
##### 2. Find x #####

import numpy as np
from scipy.stats import norm

# Gaussian distribution parameters:
# mean:
m = 1
# standard deviation:
sd = 3

# The area under PDF:
P = 0.95

# 'L' - left, 'R' - right, 'S' - symmetrical (middle)
typ = 'L'

# Calculations
if typ == 'L':
    _from = -norm.ppf(1) * sd + m
    _to = norm.ppf(P) * sd + m
elif typ == 'R':
    _from = -norm.ppf(P) * sd + m
    _to = norm.ppf(1) * sd + m
else:
    _from = -norm.ppf(1-(1-P)/2) * sd + m
    _to = norm.ppf(1-(1-P)/2) * sd + m

# Probability notation
if np.isinf(_to):
    p = f"P(X>{np.round(_from, 6)})"
elif np.isinf(_from):
    p = f"P(X<{np.round(_to, 6)})"
else:
    p = f"P({np.round(_from, 6)} < X < {np.round(_to, 6)})"
print(f"{p} = {P}")
## P(X<5.934561) = 0.95

Student-t distribution calculator — Google spreadsheet

Student-t distribution calculator — Excel template

##### 1. Area under PDF #####
# Parameter:
# Degrees of freedom:
nu <- 23

# Calculating the area under PDF
# from
# *for minus infinity use from <- -Inf
from <- 1
# to
# *for plus infinity use to <- Inf
to <- Inf

# Data validation, calculation
if (from > to) {
  # from > to error
  print("!!! _From_ cannot be greater than _to_ !!!")
} else {
  
  # Probability notation
  
  if (to==Inf) {
    p=paste0("P(X>", from, ")")
  } else if (from==-Inf) {
    p=paste0("P(X<", to, ")")
  } else {
    p=paste0("P(", from, "<X<", to, ")")
  }
  print(p)
  
  # Calculating the area for the given segment:
  result<-pt(to, nu)-pt(from, nu)
  print(result)
}
## [1] "P(X>1)"
## [1] 0.1638579
# Plot
library(ggplot2)

x1=-5
x2=5

df<-data.frame(y=c(0, 0), 
               x=c(if(from==-Inf){NA}else{from}, if(to==Inf){NA}else{to}),
               label=c(if(from==-Inf){NA}else{from}, if(to==Inf){NA}else{to}))

plt<-ggplot(NULL, aes(c(x1, x2))) +
  theme_minimal() +
  xlab('') +
  ylab('') +
  geom_area(stat = "function", 
            fun = function(x){dt(x, nu)}, 
            fill = "orange", 
            xlim = c(if(from==-Inf){x1}else{from}, if(to==Inf){x2}else{to})) +
  geom_line(stat = "function", fun = function(x){dt(x, nu)}, col = "blue", lty=2, lwd=1) +
  geom_point(data = df, aes(x=x, y=y), shape=4) +
  geom_text(data = df, aes(x=x, y=y, label=signif(label, 6)), vjust=1.4) +
  annotate("text", label = paste0("ν = ", nu, "\n", p, " = ", signif(result,6)), 
           x = x1, y = dt(0, nu)*1.2, size = 6, hjust="inward", vjust = "inward")

suppressWarnings(print(plt))

##### 2. Find x #####
# Parameter:
# Degrees of freedom:
nu <- 23

# The area under PDF:
P <- 0.95

# 'L' - left, 'R' - right, 'S' - symmetrical (middle)
typ <- 'S'

# Calculations
from <- -qt(if(typ=='L'){1} else if(typ=='R'){P} else {1-(1-P)/2}, nu)
to <- qt(if(typ=='L'){P} else if(typ=='R'){1} else {1-(1-P)/2}, nu)

# P notation
if (to==Inf) {
  p=paste0("P(X>", signif(from, 6), ")")
} else if (from==-Inf) {
  p=paste0("P(X<", signif(to, 6), ")")
} else {
  p=paste0("P(", signif(from, 6), " < X < ", signif(to, 6), ")")
}
print(paste0(p, " = ", P))
## [1] "P(-2.06866 < X < 2.06866) = 0.95"
# Plot
library(ggplot2)

x1=-5
x2=5

df<-data.frame(y=c(0, 0), 
               x=c(if(from==-Inf){NA}else{from}, if(to==Inf){NA}else{to}),
               label=c(if(from==-Inf){NA}else{from}, if(to==Inf){NA}else{to}))

plt<-ggplot(NULL, aes(c(x1, x2))) +
  theme_minimal() +
  xlab('') +
  ylab('') +
  geom_area(stat = "function", 
            fun = function(x){dt(x, nu)}, 
            fill = "orange", 
            xlim = c(if(from==-Inf){x1}else{from}, if(to==Inf){x2}else{to})) +
  geom_line(stat = "function", fun = function(x){dt(x, nu)}, col = "blue", lty=2, lwd=1) +
  geom_point(data = df, aes(x=x, y=y), shape=4) +
  geom_text(data = df, aes(x=x, y=y, label=signif(label, 6)), vjust=1.4) +
  annotate("text", label = paste0("ν = ", nu, "\n", p, " = ", P), 
           x = x1, y = dt(0, nu)*1.2, size = 6, hjust="inward", vjust = "inward")

suppressWarnings(print(plt))

from scipy.stats import t
##### 1. Area under PDF #####
# Parameter:
# Degrees of freedom:
nu = 23

# Calculating the area under PDF
# from
# *for minus infinity use _from = float('-inf')
_from = 1
# to
# *for plus infinity use _to = float('inf')
_to = float('inf')

if _from > _to:
    print("!!! _From_ cannot be greater than _to_ !!!")
else:
    if _to == float('inf'):
        p = "P(X>" + str(_from) + ")"
    elif _from == float('-inf'):
        p = "P(X<" + str(_to) + ")"
    else:
        p = "P(" + str(_from) + "<X<" + str(_to) + ")"
    print(p)
    result = t.cdf(_to, nu) - t.cdf(_from, nu)
    print(result)
## P(X>1)
## 0.16385790307142933
    
##### 2. Find x #####

import numpy as np
from scipy.stats import t

# Parameter:
# Degrees of freedom:
nu = 23

# The area under PDF:
P = 0.95

# 'L' - left, 'R' - right, 'S' - symmetrical (middle)
typ = 'S'

# Obliczenia
if typ == 'L':
    _from = -t.ppf(1, nu)
    _to = t.ppf(P, nu)
elif typ == 'R':
    _from = -t.ppf(P, nu)
    _to = t.ppf(1, nu)
else:
    _from = -t.ppf(1-(1-P)/2, nu)
    _to = t.ppf(1-(1-P)/2, nu)

# P notation
if np.isinf(_to):
    p = f"P(X>{np.round(_from, 6)})"
elif np.isinf(_from):
    p = f"P(X<{np.round(_to, 6)})"
else:
    p = f"P({np.round(_from, 6)} < X < {np.round(_to, 6)})"
print(f"{p} = {P}")
## P(-2.068658 < X < 2.068658) = 0.95

Chi-square distribution calculator — Google spreadsheet

##### 1. Area under PDF #####
# Parameter:
# Degrees of freedom:
nu <- 4

# Calculating the area under PDF
# from
# *for minus infinity use from <- -Inf
from <- 10
# to
# *for plus infinity use to <- Inf
to <- Inf

# Data validation, calculation
if (from > to) {
  # from > to error
  print("!!! _From_ cannot be greater than _to_ !!!")
} else {
  
  # Probability notation
  
  if (to==Inf) {
    p=paste0("P(X>", from, ")")
  } else if (from==-Inf) {
    p=paste0("P(X<", to, ")")
  } else {
    p=paste0("P(", from, "<X<", to, ")")
  }
  print(p)
  
  # Calculating the area for the specified segment:
  result<-pchisq(to, nu)-pchisq(from, nu)
  print(result)
}
## [1] "P(X>10)"
## [1] 0.04042768
# Plot
library(ggplot2)

x1=0
x2=nu*4

df<-data.frame(y=c(0, 0), 
               x=c(if(from==-Inf){NA}else{from}, if(to==Inf){NA}else{to}),
               label=c(if(from==-Inf){NA}else{from}, if(to==Inf){NA}else{to}))

plt<-ggplot(NULL, aes(c(x1, x2))) +
  theme_minimal() +
  xlab('') +
  ylab('') +
  geom_area(stat = "function", 
            fun = function(x){dchisq(x, nu)}, 
            fill = "orange", 
            xlim = c(if(from==-Inf){x1}else{from}, if(to==Inf){x2}else{to})) +
  geom_line(stat = "function", fun = function(x){dchisq(x, nu)}, col = "blue", lty=2, lwd=1) +
  geom_point(data = df, aes(x=x, y=y), shape=4) +
  geom_text(data = df, aes(x=x, y=y, label=signif(label, 6)), vjust=1.4) +
  annotate("text", label = paste0("ν = ", nu, "\n", p, " = ", signif(result,4)), 
           x = 4*nu, y = dchisq(max(nu-2,0), nu), size = 6, hjust="inward", vjust = "inward")

suppressWarnings(print(plt))

##### 2. Find x #####
# Parameter:
# Degrees of freedom:
nu <- 4

# The area under PDF:
P <- 0.05

# 'L' - left, 'R' - right, 'S' - symmetrical (middle)
typ <- 'R'

# Calculations
from <- qchisq(if(typ=='L'){0} else if(typ=='R'){1-P} else {(1-P)/2}, nu)
to <- qchisq(if(typ=='L'){P} else if(typ=='R'){1} else {1-(1-P)/2}, nu)

# P notation
if (to==Inf) {
  p=paste0("P(X>", signif(from, 6), ")")
} else if (from==-Inf) {
  p=paste0("P(X<", signif(to, 6), ")")
} else {
  p=paste0("P(", signif(from, 6), " < X < ", signif(to, 6), ")")
}
print(paste0(p, " = ", P))
## [1] "P(X>9.48773) = 0.05"
# Plot
library(ggplot2)

x1=0
x2=nu*4

df<-data.frame(y=c(0, 0), 
               x=c(if(from==-Inf){NA}else{from}, if(to==Inf){NA}else{to}),
               label=c(if(from==-Inf){NA}else{from}, if(to==Inf){NA}else{to}))

plt<-ggplot(NULL, aes(c(x1, x2))) +
  theme_minimal() +
  xlab('') +
  ylab('') +
  geom_area(stat = "function", 
            fun = function(x){dchisq(x, nu)}, 
            fill = "orange", 
            xlim = c(if(from==-Inf){x1}else{from}, if(to==Inf){x2}else{to})) +
  geom_line(stat = "function", fun = function(x){dchisq(x, nu)}, col = "blue", lty=2, lwd=1) +
  geom_point(data = df, aes(x=x, y=y), shape=4) +
  geom_text(data = df, aes(x=x, y=y, label=signif(label, 6)), vjust=1.4) +
  annotate("text", label = paste0("ν = ", nu, "\n", p, " = ", P), 
           x = 4*nu, y = dchisq(max(nu-2,0), nu), size = 6, hjust="inward", vjust = "inward")


suppressWarnings(print(plt))

from scipy.stats import chi2
##### 1. Area under PDF #####
# Parameter:
# Degrees of freedom:
nu = 4

# Calculating the area under PDF
# from
# *for minus infinity use _from = float('-inf')
_from = 10
# to
# *for plus infinity use _to = float('inf')
_to = float('inf')

if _from > _to:
    print("!!! _From_ cannot be greater than _to_ !!!")
else:
    if _to == float('inf'):
        p = "P(X>" + str(_from) + ")"
    elif _from == float('-inf'):
        p = "P(X<" + str(_to) + ")"
    else:
        p = "P(" + str(_from) + "<X<" + str(_to) + ")"
    print(p)
    result = chi2.cdf(_to, nu) - chi2.cdf(_from, nu)
    print(result)
## P(X>10)
## 0.04042768199451274
    
##### 2. Find x #####

import numpy as np
from scipy.stats import chi2

# Parameter:
# Degrees of freedom:
nu = 4

# The area under PDF:
P = 0.05

# 'L' - left, 'R' - right, 'S' - symmetrical (middle)
typ = 'R'

# Computation
if typ == 'L':
    _from = chi2.ppf(0, nu)
    _to = chi2.ppf(P, nu)
elif typ == 'R':
    _from = chi2.ppf(1-P, nu)
    _to = chi2.ppf(1, nu)
else:
    _from = chi2.ppf((1-P)/2, nu)
    _to = chi2.ppf(1-(1-P)/2, nu)

# P notation
if np.isinf(_to):
    p = f"P(X>{np.round(_from, 6)})"
elif np.isinf(_from):
    p = f"P(X<{np.round(_to, 6)})"
else:
    p = f"P({np.round(_from, 6)} < X < {np.round(_to, 6)})"
print(f"{p} = {P}")
## P(X>9.487729) = 0.05

F distribution calculator — Google spreadsheet

##### 1. Area under PDF #####
# Parameters:
# Degrees of freedom:
nu1 <- 39
nu2 <- 34

# Calculating the area under PDF
# from
# *for minus infinity use from <- -Inf
from <- 1
# to
# *for plus infinity use to <- Inf
to <- Inf

# Data validation, calculation
if (from > to) {
  # from > to error
  print("!!! _From_ cannot be greater than _to_ !!!")
} else {
  
  # Probability notation
  
  if (to==Inf) {
    p=paste0("P(X>", from, ")")
  } else if (from==-Inf) {
    p=paste0("P(X<", to, ")")
  } else {
    p=paste0("P(", from, "<X<", to, ")")
  }
  print(p)
  
  # Calculating the area for the specified segment:
  result<-pf(to, nu1, nu2)-pf(from, nu1, nu2)
  print(result)
}
## [1] "P(X>1)"
## [1] 0.5030343
# Plot
library(ggplot2)

x1=0
x2=qf(.999, nu1, nu2)

df<-data.frame(y=c(0, 0), 
               x=c(if(from==-Inf){NA}else{from}, if(to==Inf){NA}else{to}),
               label=c(if(from==-Inf){NA}else{from}, if(to==Inf){NA}else{to}))

plt<-ggplot(NULL, aes(c(x1, x2))) +
  theme_minimal() +
  xlab('') +
  ylab('') +
  geom_line(stat = "function", fun = function(x){df(x, nu1, nu2)}, col = "blue", lty=2, lwd=1) +
  geom_area(stat = "function", 
            fun = function(x){df(x, nu1, nu2)}, 
            fill = "orange", 
            xlim = c(if(from==-Inf){x1}else{from}, if(to==Inf){x2}else{to})) +
  geom_point(data = df, aes(x=x, y=y), shape=4) +
  geom_text(data = df, aes(x=x, y=y, label=signif(label, 6)), vjust=1.4) +
  annotate("text", label = paste0("ν1 = ", nu1, "\n", "ν2 = ", nu2, "\n", p, " = ", signif(result,4)), 
           x = x2, y = df(max((nu1-2)/nu1*nu2/(nu2+2),0), nu1, nu2), size = 6, hjust="inward", vjust = "inward")

suppressWarnings(print(plt))

##### 2. Find x #####
# Parameters:
# Degrees of freedom:
nu1 <- 34
nu2 <- 39
  
# The area under PDF:
P <- 0.05

# 'L' - left, 'R' - right, 'S' - symmetrical (middle)
typ <- 'R'

# Calculations
from <- qf(if(typ=='L'){0} else if(typ=='R'){1-P} else {(1-P)/2}, nu1, nu2)
to <- qf(if(typ=='L'){P} else if(typ=='R'){1} else {1-(1-P)/2}, nu1, nu2)

# P notation
if (to==Inf) {
  p=paste0("P(X>", signif(from, 6), ")")
} else if (from==-Inf) {
  p=paste0("P(X<", signif(to, 6), ")")
} else {
  p=paste0("P(", signif(from, 6), " < X < ", signif(to, 6), ")")
}
print(paste0(p, " = ", P))
## [1] "P(X>1.72803) = 0.05"
# Plot
library(ggplot2)

x1=0
x2=qf(.9999, nu1, nu2)


df<-data.frame(y=c(0, 0), 
               x=c(if(from==-Inf){NA}else{from}, if(to==Inf){NA}else{to}),
               label=c(if(from==-Inf){NA}else{from}, if(to==Inf){NA}else{to}))

plt<-ggplot(NULL, aes(c(x1, x2))) +
  theme_minimal() +
  xlab('') +
  ylab('') +
  geom_area(stat = "function", 
            fun = function(x){stats::df(x, nu1, nu2)}, 
            fill = "orange", 
            xlim = c(if(from==-Inf){x1}else{from}, if(to==Inf){x2}else{to})) +
  geom_line(stat = "function", fun = function(x){stats::df(x, nu1, nu2)}, col = "blue", lty=2, lwd=1) +
  geom_point(data = df, aes(x=x, y=y), shape=4) +
  geom_text(data = df, aes(x=x, y=y, label=signif(label, 6)), vjust=1.4) +
  annotate("text", label = paste0("ν1 = ", nu1, "\nν2 = ", nu2, "\n", p, " = ", P), 
           x = x2, y = df(max((nu1-2)/nu1*nu2/(nu2+2),0), nu1, nu2), size = 6, hjust="inward", vjust = "inward")


suppressWarnings(print(plt))

from scipy.stats import f
##### 1. Area under PDF #####
# Parameters:
# Degrees of freedom 1:
nu1 = 39
# Degrees of freedom 2:
nu2 = 34

# Calculating the area under PDF
# from
# *for minus infinity use _from = float('-inf')
_from = 1
# to
# *for plus infinity use _to = float('inf')
_to = float('inf')

if _from > _to:
    print("!!! _From_ cannot be greater than _to_ !!!")
else:
    if _to == float('inf'):
        p = "P(X>" + str(_from) + ")"
    elif _from == float('-inf'):
        p = "P(X<" + str(_to) + ")"
    else:
        p = "P(" + str(_from) + "<X<" + str(_to) + ")"
    print(p)
    result = f.cdf(_to, nu1, nu2) - f.cdf(_from, nu1, nu2)
    print(result)
## P(X>1)
## 0.503034322091588
    
##### 2. Find x #####

import numpy as np
from scipy.stats import f

# Parameters:
# Degrees of freedom 1:
nu1 = 39
# Degrees of freedom 2:
nu2 = 34

# The area under PDF:
P = 0.05

# 'L' - left, 'R' - right, 'S' - symmetrical (middle)
typ = 'R'

# Calculations
if typ == 'L':
    _from = f.ppf(0, nu1, nu2)
    _to = f.ppf(P, nu1, nu2)
elif typ == 'R':
    _from = f.ppf(1-P, nu1, nu2)
    _to = f.ppf(1, nu1, nu2)
else:
    _from = f.ppf((1-P)/2, nu1, nu2)
    _to = f.ppf(1-(1-P)/2, nu1, nu2)

# P notation
if np.isinf(_to):
    p = f"P(X>{np.round(_from, 6)})"
elif np.isinf(_from):
    p = f"P(X<{np.round(_to, 6)})"
else:
    p = f"P({np.round(_from, 6)} < X < {np.round(_to, 6)})"
print(f"{p} = {P}")
## P(X>1.749073) = 0.05

C.3 Confidence intervals

Confidence interval for a proportion — Google spreadsheet

Confidence interval for a proportion — Excel template
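
The "simple formula" below is the Wald interval

$$\hat{p} \pm z_{1-\alpha/2} \sqrt{\frac{\hat{p}(1-\hat{p})}{n}}, \qquad \hat{p} = \frac{x}{n},$$

while the Wilson score interval (prop.test in R, method='wilson' in Python) behaves better for small samples and for proportions close to 0 or 1.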

# Confidence interval for a proportion
# Data:
# Sample size:
n <- 160
# Number of favourable observations:
x <- 15
# Sample proportion:
phat <- x/n
# Confidence level:
conf <- 0.95

# Simple formula:
alpha <- 1 - conf
resa <- phat + c(-qnorm(1-alpha/2), qnorm(1-alpha/2)) * sqrt((1/n)*phat*(1-phat))
# Wilson score:
resw <- prop.test(x, n, conf.level = 1-alpha, correct = FALSE)$conf.int

print(paste(
  list(
    "Confidence interval - simple formula:",
    resa, 
    "Confidence interval - Wilson score:", 
    resw)))
## [1] "Confidence interval - simple formula:"    "c(0.0485854437380776, 0.138914556261922)"
## [3] "Confidence interval - Wilson score:"      "c(0.0576380069455474, 0.148912026631301)"
# Using the binom package
# Sample size:
n <- 160
# Number of favourable observations:
x <- 15
# Confidence level:
conf <- 0.95
# methods="all" returns all methods, to obtain the CI with the "simple method" is method="asymptotic"
binom::binom.confint(x, n, conf.level = conf, methods = "all")
##           method  x   n       mean      lower     upper
## 1  agresti-coull 15 160 0.09375000 0.05667743 0.1498726
## 2     asymptotic 15 160 0.09375000 0.04858544 0.1389146
## 3          bayes 15 160 0.09627329 0.05301161 0.1424125
## 4        cloglog 15 160 0.09375000 0.05494601 0.1449700
## 5          exact 15 160 0.09375000 0.05342512 0.1499102
## 6          logit 15 160 0.09375000 0.05730929 0.1496827
## 7         probit 15 160 0.09375000 0.05616008 0.1472798
## 8        profile 15 160 0.09375000 0.05506974 0.1453215
## 9            lrt 15 160 0.09375000 0.05506409 0.1453210
## 10     prop.test 15 160 0.09375000 0.05523020 0.1525939
## 11        wilson 15 160 0.09375000 0.05763801 0.1489120
# Confidence interval for a proportion
# Data:
# Sample size:
n = 160
# Number of favourable observations:
x = 15
# Sample proportion:
phat = x/n
# Confidence level:
conf = 0.95

from statsmodels.stats.proportion import proportion_confint
# Simple formula:
resa = proportion_confint(x, n, alpha=1-conf, method='normal')
# Wilson score:
resw = proportion_confint(x, n, alpha=1-conf, method='wilson')

print("Confidence interval - simple formula:", resa, 
"\nConfidence interval - Wilson score:", resw)
## Confidence interval - simple formula: (0.048585443738077556, 0.13891455626192245) 
## Confidence interval - Wilson score: (0.05763800694554742, 0.14891202663130057)

Confidence interval for a mean — Google spreadsheet

Confidence interval for a mean — Excel template
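
Both intervals below have the form

$$\bar{x} \pm c \cdot \frac{s}{\sqrt{n}},$$

where the critical value $c$ equals $z_{1-\alpha/2}$ when the standard deviation is treated as known (z interval) and $t_{1-\alpha/2,\,n-1}$ otherwise (t interval).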

# Confidence interval for the mean
# Data:
# Sample size:
n <- 24
# Sample mean:
xbar <- 183
# Population/sample standard deviation:
s <- 5.19
# Confidence level:
conf <- 0.95

alpha <- 1 - conf

# z:
ci_z <- xbar + c(-qnorm(1-alpha/2), qnorm(1-alpha/2)) * s/sqrt(n)

# t:
df<- n-1
ci_t <- xbar + c(-qt(1-alpha/2, df), qt(1-alpha/2, df)) * s/sqrt(n)

print(paste(
  list(
    "Confidence interval - z:",
    ci_z, 
    "Confidence interval - t:", 
    ci_t)))
## [1] "Confidence interval - z:"              "c(180.923605699976, 185.076394300024)" "Confidence interval - t:"             
## [4] "c(180.808455203843, 185.191544796157)"
# Using raw data:
dane <- c(34.1, 35.6, 34.2, 33.9, 25.1)
test_result<-t.test(dane, conf.level = 0.99)
print(test_result$conf.int)
## [1] 23.85964 41.30036
## attr(,"conf.level")
## [1] 0.99
import math
import scipy.stats as stats

n = 24
xbar = 183
s = 5.19
conf = 0.95
alpha = 1 - conf

ci_z = [xbar + (-stats.norm.ppf(1-alpha/2)) * s/math.sqrt(n), xbar + (stats.norm.ppf(1-alpha/2)) * s/math.sqrt(n)]

df = n-1
ci_t = [xbar + (-stats.t.ppf(1-alpha/2, df)) * s/math.sqrt(n), xbar + (stats.t.ppf(1-alpha/2, df)) * s/math.sqrt(n)]

print("Confidence interval - z:", ci_z,
"\nConfidence interval - t:", ci_t)
## Confidence interval - z: [180.92360569997632, 185.07639430002368] 
## Confidence interval - t: [180.80845520384258, 185.19154479615742]
# Version 2

print(stats.norm.interval(confidence=conf, loc=xbar, scale=s/math.sqrt(n)), "\n",
stats.t.interval(confidence=conf, df=df, loc=xbar, scale=s/math.sqrt(n)))
## (180.92360569997632, 185.07639430002368) 
##  (180.80845520384258, 185.19154479615742)
# Using raw data:
import numpy as np
from scipy import stats

dane = np.array([34.1, 35.6, 34.2, 33.9, 25.1])
test_result = stats.ttest_1samp(dane, popmean=np.mean(dane))
conf_int = test_result.confidence_interval(0.99)
print(conf_int)
## ConfidenceInterval(low=23.85964498330358, high=41.300355016696415)

C.3.1 Sample size

Sample size — Google spreadsheet

Sample size — Excel template
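
The code below inverts the margin-of-error formulas and rounds up: for a proportion, $n = \left\lceil z_{1-\alpha/2}^2\, p(1-p) / e^2 \right\rceil$; for a mean, $n = \left\lceil z_{1-\alpha/2}^2\, \sigma^2 / e^2 \right\rceil$.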

# Proportion estimation
# Data:
# Confidence level:
conf <- 0.99
# Margin of error (e):
e <- 0.01
# Assumed proportion (p):
p <- 0.5

# Computations
alpha <- 1 - conf
z <- -qnorm(alpha/2)

ceiling(z^2 * p * (1-p) / e^2)
## [1] 16588
# Mean estimation
# Data:
# Confidence level:
conf <- 0.9
# Margin of error (e):
e <- 1
# Assumed standard deviation (sigma):
sigma <- 10

# Computation
alpha <- 1 - conf
z <- -qnorm(alpha/2)

ceiling(z^2 * sigma^2 / e^2)
## [1] 271
# Proportion estimation
import numpy as np
from scipy.stats import norm
# Data:
# Confidence level:
conf = 0.99
# Margin of error (e):
e = 0.01
# Assumed proportion (p):
p = 0.5

# Computations
alpha = 1 - conf
z = -norm.ppf(alpha/2)

print(np.ceil(z**2 * p * (1 - p) / e**2))
## 16588.0
# Mean estimation
import numpy as np
from scipy.stats import norm
# Data:
# Confidence level:
conf = 0.9
# Margin of error (e):
e = 1
# Assumed standard deviation (sigma):
sigma = 10

# Computations
alpha = 1 - conf
z = -norm.ppf(alpha / 2)

print(np.ceil(z**2 * sigma**2 / e**2))
## 271.0

C.4 Tests for means and proportions

C.4.1 Tests for 1 mean and 1 proportion

Tests for 1 mean and 1 proportion — Google spreadsheet

Tests for 1 mean and 1 proportion — Excel template
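
The test statistic for a single mean is

$$t = \frac{\bar{x} - \mu_0}{s/\sqrt{n}},$$

compared against a $t_{n-1}$ critical value (or a standard normal one in the z variant). For a single proportion the z statistic is $z = (\hat{p} - p_0)\big/\sqrt{p_0(1-p_0)/n}$; prop.test without continuity correction reports its square as a chi-square statistic with 1 degree of freedom, which the R code below converts back to $z$.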

# Test for 1 mean
# Sample size:
n <- 38
# Sample mean:
xbar <- 184.21
# Sample/population standard deviation:
s <- 6.1034
# Significance level:
alpha <- 0.05

# Null value for the mean:
mu0 <- 179
# Alternative (sign): "<"; ">"; "<>"; "≠"
alt <- ">"

# Computations
# degrees of freedom:
df <- n-1

# critical value (t test):
crit_t <- if (alt == "<") {qt(alpha, df)} else if (alt == ">") {qt(1-alpha, df)} else {qt(1-alpha/2, df)}

# test statistic (t/z)
test_tz <- (xbar-mu0)/(s/sqrt(n))

# p-value (t test):
p.value = if(alt == ">"){1-pt(test_tz, df)} else if (alt == "<") {pt(test_tz, df)} else {2*(1-pt(abs(test_tz),df))}

# critical value (z test):
crit_z <- if (alt == "<") {qnorm(alpha)} else if (alt == ">") {qnorm(1-alpha)} else {qnorm(1-alpha/2)}

# p-value (z test):
p.value.z = if(alt == ">"){1-pnorm(test_tz)} else if (alt == "<") {pnorm(test_tz)} else {2*(1-pnorm(abs(test_tz)))}

print(c('Mean' = xbar, 
        'SD' = s,
        'Sample size' = n,
        'Null hypothesis' = paste0('mu = ', mu0),
        'Alt. hypothesis' = paste0('mu ', alt, ' ', mu0),
        'Test statistic t/z' = test_tz,
        'Critical value (t test)' = crit_t,
        'P value (t test)' = p.value,
        'Critical value (z test)' = crit_z,
        'P value (z test)' = p.value.z
))
##                    Mean                      SD             Sample size         Null hypothesis         Alt. hypothesis 
##                "184.21"                "6.1034"                    "38"              "mu = 179"              "mu > 179" 
##      Test statistic t/z Critical value (t test)        P value (t test) Critical value (z test)        P value (z test) 
##      "5.26208293008297"      "1.68709361959626"   "3.1304551380007e-06"      "1.64485362695147"  "7.12162456784071e-08"
# Based on raw data (test t):

# Data vector
data <- c(176.5267, 195.5237, 184.9741, 179.5349, 188.2120, 190.7425, 178.7593, 196.2744, 186.6965, 187.8559, 183.1323, 176.2569, 191.4752, 186.5975, 180.2120, 184.3434, 178.1691, 184.8852, 187.7973, 178.5013, 172.7343, 176.8545, 184.2068, 181.2395, 186.1983, 173.6317, 181.9529, 185.9135, 188.6081, 183.0285, 183.3375, 188.5512, 184.6348, 186.9657, 183.9622, 200.9014, 183.5353, 177.2538)

# Storing as an object
# Choose the alternative: "two.sided" (default), "less", or "greater", and the null value mu (default is zero)
test_result <- t.test(data, alternative = "greater", mu = 179)

# Printing test results. Single components can be printed using for example test_result$statistic.
print(test_result)
## 
##  One Sample t-test
## 
## data:  data
## t = 5.2621, df = 37, p-value = 3.13e-06
## alternative hypothesis: true mean is greater than 179
## 95 percent confidence interval:
##  182.5396      Inf
## sample estimates:
## mean of x 
##    184.21
# Test for 1 mean
from scipy.stats import t, norm
from math import sqrt

# Sample size:
n = 38

# Sample mean:
xbar = 184.21

# Sample/population standard deviation:
s = 6.1034

# Significance level:
alpha = 0.05

# Null value for the population mean:
mu0 = 179

# Alternative (sign): "<"; ">"; "<>"; "≠"
alt = ">"

# Calculations:
# degrees of freedom:
df = n - 1

# critical value (t test):
if alt == "<":
    crit_t = t.ppf(alpha, df)
elif alt == ">":
    crit_t = t.ppf(1 - alpha, df)
else:
    crit_t = t.ppf(1 - alpha / 2, df)

# test statistic (t/z)
test_tz = (xbar - mu0) / (s / sqrt(n))

# p-value (t test):
if alt == ">":
    p_value_t = 1 - t.cdf(test_tz, df)
elif alt == "<":
    p_value_t = t.cdf(test_tz, df)
else:
    p_value_t = 2 * (1 - t.cdf(abs(test_tz), df))

# critical value (z test):
if alt == "<":
    crit_z = norm.ppf(alpha)
elif alt == ">":
    crit_z = norm.ppf(1 - alpha)
else:
    crit_z = norm.ppf(1 - alpha / 2)

# p-value (z test):
if alt == ">":
    p_value_z = 1 - norm.cdf(test_tz)
elif alt == "<":
    p_value_z = norm.cdf(test_tz)
else:
    p_value_z = 2 * (1 - norm.cdf(abs(test_tz)))

results = {
    'Mean': xbar,
    'SD': s,
    'Sample size': n,
    'Null hypothesis': f'mu = {mu0}',
    'Alt. hypothesis': f'mu {alt} {mu0}',
    'Test stat. t/z': test_tz,
    'Critical value t': crit_t,
    'P-value (t test)': p_value_t,
    'Critical value z': crit_z,
    'P-value (z test)': p_value_z
}

for key, value in results.items():
    print(f"{key}: {value}")
## Mean: 184.21
## SD: 6.1034
## Sample size: 38
## Null hypothesis: mu = 179
## Alt. hypothesis: mu > 179
## Test stat. t/z: 5.262082930082973
## Critical value t: 1.6870936167109876
## P-value (t test): 3.1304551380006984e-06
## Critical value z: 1.6448536269514722
## P-value (z test): 7.121624567840712e-08
# Raw data (t test):

import scipy.stats as stats

data = [176.5267, 195.5237, 184.9741, 179.5349, 188.2120, 190.7425, 178.7593, 196.2744, 186.6965, 187.8559, 183.1323, 176.2569, 191.4752, 186.5975, 180.2120, 184.3434, 178.1691, 184.8852, 187.7973, 178.5013, 172.7343, 176.8545, 184.2068, 181.2395, 186.1983, 173.6317, 181.9529, 185.9135, 188.6081, 183.0285, 183.3375, 188.5512, 184.6348, 186.9657, 183.9622, 200.9014, 183.5353, 177.2538]

test_result = stats.ttest_1samp(data, popmean=179, alternative='greater')

print(test_result)
## TtestResult(statistic=5.262096550537936, pvalue=3.1303226590428976e-06, df=37)
# Test for 1 proportion
# Sample size:
n <- 200
# Number of favourable observations:
x <- 90
# Sample proportion:
p <- x/n
# Significance level:
alpha <- 0.1
# Null proportion value:
p0 <- 0.5
# Alternative (sign): "<"; ">"; "<>"; "≠"
alt <- "≠"

alttext <- if(alt==">") {"greater"} else if(alt=="<") {"less"} else {"two.sided"}

test <- prop.test(x, n, p0, alternative=alttext, correct=FALSE)
test_z <- unname(sign(test$estimate-test$null.value)*sqrt(test$statistic))
crit_z <- if(test$alternative=="less") {qnorm(alpha)} else if(test$alternative=="greater") {qnorm(1-alpha)} else {qnorm(1-alpha/2)}

print(c('Sample proportion' = test$estimate, 
        'Sample size' = n,
        'Null hypothesis' = paste0('p = ', test$null.value),
        'Alternative hypothesis' = paste0('p ', alt, ' ', test$null.value),
        'Test stat. z' = test_z,
        'Test. stat. chi^2' = unname(test$statistic),
        'Critical value (z)' = crit_z,
        'P-value' = test$p.value
))
##    Sample proportion.p            Sample size        Null hypothesis Alternative hypothesis           Test stat. z 
##                 "0.45"                  "200"              "p = 0.5"              "p ≠ 0.5"     "-1.4142135623731" 
##      Test. stat. chi^2     Critical value (z)                P-value 
##                    "2"     "1.64485362695147"    "0.157299207050284"
# Test for 1 proportion
from statsmodels.stats.proportion import proportions_ztest

# Sample size:
n = 200
# Number of favourable observations:
x = 90
# Sample proportion:
p = x/n
# Significance level:
alpha = 0.1
# Null proportion value:
p0 = 0.5
# Alternative hypothesis (sign): "<"; ">"; "<>"; "≠"
alt = "≠"

if alt == ">":
    alttext = "larger"
elif alt == "<":
    alttext = "smaller"
else:
    alttext = "two-sided"

test_result = proportions_ztest(count = x, nobs = n, value = p0, alternative = alttext, prop_var=p0)

print("Test statistic (z):", test_result[0], "\np-value:", test_result[1])
## Test statistic (z): -1.4142135623730947 
## p-value: 0.15729920705028533

C.4.2 Tests and confidence intervals for 2 means

Test and confidence intervals for 2 means — Google spreadsheet

Test and confidence intervals for 2 means — Excel template
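
For two independent samples the z statistic is

$$z = \frac{\bar{x}_1 - \bar{x}_2 - \mu_0}{\sqrt{s_1^2/n_1 + s_2^2/n_2}},$$

and the t statistic has the same form, with a pooled standard deviation replacing $s_1$ and $s_2$ when equal variances are assumed. Without that assumption the code uses the Welch-Satterthwaite degrees of freedom

$$\nu = \frac{\left(s_1^2/n_1 + s_2^2/n_2\right)^2}{\dfrac{(s_1^2/n_1)^2}{n_1-1} + \dfrac{(s_2^2/n_2)^2}{n_2-1}}.$$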

# Test z for 2 means
# Sample 1 size:
n1 <- 100
# Sample 1 mean:
xbar1 <- 76.5
# Sample 1 standard deviation:
s1 <- 38.0

# Sample 2 size:
n2 <- 100
# Sample 2 mean:
xbar2 <- 88.1
# Sample 2 standard deviation:
s2 <- 40.0

# Significance level:
alpha <- 0.05

# Null value (usually 0):
mu0 <- 0

# Alternative (sign): "<"; ">"; "<>"; "≠"
alt <- "<"

# Calculations:
# Z test statistic:
test_z <- (xbar1-xbar2-mu0)/sqrt(s1^2/n1+s2^2/n2)

# Z test critical value:
crit_z <- if (alt == "<") {qnorm(alpha)} else if (alt == ">") {qnorm(1-alpha)} else {qnorm(1-alpha/2)}

# Z test p-value:
p.value.z = if(alt == ">"){1-pnorm(test_z)} else if (alt == "<") {pnorm(test_z)} else {2*(1-pnorm(abs(test_z)))}

print(c('Mean 1' = xbar1, 
        'SD 1' = s1,
        'Sample 1 size' = n1,
        'Mean 2' = xbar2, 
        'SD 2' = s2,
        'Sample 2 size' = n2,
        'Null hypothesis' = paste0('mu1-mu2 = ', mu0),
        'Alternative hypothesis' = paste0('mu1-mu2 ', alt, ' ', mu0),
        'Z test statistic' = test_z,
        'Z test critical value' = crit_z,
        'Z test p-value' = p.value.z
))
##                 Mean 1                   SD 1          Sample 1 size                 Mean 2                   SD 2 
##                 "76.5"                   "38"                  "100"                 "88.1"                   "40" 
##          Sample 2 size        Null hypothesis Alternative hypothesis       Z test statistic  Z test critical value 
##                  "100"          "mu1-mu2 = 0"          "mu1-mu2 < 0"     "-2.1024983574238"    "-1.64485362695147" 
##         Z test p-value 
##   "0.0177548216928505"
# Test t for 2 means
# Sample 1 size:
n1 <- 14
# Sample 1 mean:
xbar1 <- 185.2142
# Sample 1 standard deviation:
s1 <- 7.5261

# Sample 2 size:
n2 <- 19
# Sample 2 mean:
xbar2 <- 184.8421
# Sample 2 standard deviation:
s2 <- 5.0471


# Significance level:
alpha <- 0.05

# Null value (usually 0):
mu0 <- 0

# Alternative (sign): "<"; ">"; "<>"; "≠"
alt <- "≠"

# Assume equal variances? (TRUE/FALSE):
eqvar <- FALSE

# Calculations
# Pooled standard deviation:
sp <- sqrt(((n1-1)*s1^2+(n2-1)*s2^2)/(n1+n2-2))

# T test statistic:
test_t <- if(eqvar) {(xbar1-xbar2-mu0)/sqrt(sp^2*(1/n1+1/n2))} else {(xbar1-xbar2-mu0)/sqrt(s1^2/n1+s2^2/n2)}

# Degrees of freedom:
df<-if(eqvar) {n1+n2-2} else {(s1^2/n1+s2^2/n2)^2/((s1^2/n1)^2/(n1-1)+(s2^2/n2)^2/(n2-1))}

# T critical value:
crit_t <- if (alt == "<") {qt(alpha, df)} else if (alt == ">") {qt(1-alpha, df)} else {qt(1-alpha/2, df)}

# T test p-value:
p.value.t = if(alt == ">"){1-pt(test_t, df)} else if (alt == "<") {pt(test_t, df)} else {2*(1-pt(abs(test_t),df))}

print(c('Mean 1' = xbar1, 
        'SD 1' = s1,
        'Sample 1 size' = n1,
        'Mean 2' = xbar2, 
        'SD 2' = s2,
        'Sample size 2' = n2,
        'Null hypothesis' = paste0('mu1-mu2 = ', mu0),
        'Alt. hypothesis' = paste0('mu1-mu2 ', alt, ' ', mu0),
        'T test statistic' = test_t,
        'T test critical value' = crit_t,
        'T test p-value' = p.value.t
))
##                Mean 1                  SD 1         Sample 1 size                Mean 2                  SD 2 
##            "185.2142"              "7.5261"                  "14"            "184.8421"              "5.0471" 
##         Sample size 2       Null hypothesis       Alt. hypothesis      T test statistic T test critical value 
##                  "19"         "mu1-mu2 = 0"         "mu1-mu2 ≠ 0"   "0.160325899672522"    "2.07753969816904" 
##        T test p-value 
##      "0.874131604618"
# Using raw data (t test)

# Two data vectors:
data1 <- c(1.2, 3.1, 1.7, 2.8, 3.0)
data2 <- c(4.2, 2.7, 3.6, 3.9)

# Storing test results as an object.
# Choose the alternative: "two.sided" (default), "less", or "greater", and whether to assume equal variances (var.equal = TRUE/FALSE, default FALSE)
test_result <- t.test(data1, data2, alternative="two.sided", var.equal = TRUE)

# Printing test results. Single components can be printed using for example test_result$statistic.
print(test_result)
## 
##  Two Sample t-test
## 
## data:  data1 and data2
## t = -2.3887, df = 7, p-value = 0.04826
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -2.46752406 -0.01247594
## sample estimates:
## mean of x mean of y 
##      2.36      3.60
import math
from scipy.stats import norm

# Test z for 2 means
# Sample 1 size:
n1 = 100
# Sample 1 mean:
xbar1 = 76.5
# Sample 1 standard deviation:
s1 = 38.0
# Sample 2 size:
n2 = 100
# Sample 2 mean:
xbar2 = 88.1
# Sample 2 standard deviation:
s2 = 40.0
# Significance level:
alpha = 0.05
# Null value (usually 0):
mu0 = 0
# Alternative (sign): "<"; ">"; "<>"/"≠"
alt = "<"

# Calculations:
# Z test statistic:
test_z = (xbar1 - xbar2 - mu0) / math.sqrt(s1**2 / n1 + s2**2 / n2)

# Z test critical value:
if alt == "<":
    crit_z = norm.ppf(alpha)
elif alt == ">":
    crit_z = norm.ppf(1 - alpha)
else:
    crit_z = norm.ppf(1 - alpha / 2)

# Z test p-value:
if alt == ">":
    p_value_z = 1 - norm.cdf(test_z)
elif alt == "<":
    p_value_z = norm.cdf(test_z)
else:
    p_value_z = 2 * (1 - norm.cdf(abs(test_z)))

results = {
    'Mean 1': xbar1,
    'SD 1': s1,
    'Sample 1 size': n1,
    'Mean 2': xbar2,
    'SD 2': s2,
    'Sample 2 size': n2,
    'Null hypothesis': f'mu1-mu2 = {mu0}',
    'Alt. hypothesis': f'mu1-mu2 {alt} {mu0}',
    'Z test statistic': test_z,
    'Z test critical value': crit_z,
    'Z test p-value': p_value_z
}

for key, value in results.items():
    print(f"{key}: {value}")
## Mean 1: 76.5
## SD 1: 38.0
## Sample 1 size: 100
## Mean 2: 88.1
## SD 2: 40.0
## Sample 2 size: 100
## Null hypothesis: mu1-mu2 = 0
## Alt. hypothesis: mu1-mu2 < 0
## Z test statistic: -2.102498357423799
## Z test critical value: -1.6448536269514729
## Z test p-value: 0.017754821692850486
# T test for 2 means
from scipy.stats import t
# Sample 1 size:
n1 = 14
# Sample 1 mean:
xbar1 = 185.2142
# Sample 1 standard deviation:
s1 = 7.5261
# Sample 2 size:
n2 = 19
# Sample 2 mean:
xbar2 = 184.8421
# Sample 2 standard deviation:
s2 = 5.0471
# Significance level:
alpha = 0.05
# Null value (usually 0):
mu0 = 0
# Alternative (sign): "<"; ">"; "<>"/"≠"
alt = "≠"
# Assume equal variances? (True/False):
eqvar = False

# Calculations
# Pooled standard deviation:
sp = math.sqrt(((n1 - 1) * s1**2 + (n2 - 1) * s2**2) / (n1 + n2 - 2))

# T test statistic and degrees of freedom:
if eqvar:
    test_t = (xbar1 - xbar2 - mu0) / math.sqrt(sp**2 * (1/n1 + 1/n2))
    df = n1 + n2 - 2
else:
    test_t = (xbar1 - xbar2 - mu0) / math.sqrt(s1**2 / n1 + s2**2 / n2)
    df = (s1**2 / n1 + s2**2 / n2)**2 / ((s1**2 / n1)**2 / (n1 - 1) + (s2**2 / n2)**2 / (n2 - 1))

# T test critical value:
if alt == "<":
    crit_t = t.ppf(alpha, df)
elif alt == ">":
    crit_t = t.ppf(1 - alpha, df)
else:
    crit_t = t.ppf(1 - alpha / 2, df)

# T test p-value:
if alt == ">":
    p_value_t = 1 - t.cdf(test_t, df)
elif alt == "<":
    p_value_t = t.cdf(test_t, df)
else:
    p_value_t = 2 * (1 - t.cdf(abs(test_t), df))

results = {
    'Mean 1': xbar1,
    'SD 1': s1,
    'Sample 1 size': n1,
    'Mean 2': xbar2,
    'SD 2': s2,
    'Sample 2 size': n2,
    'Null hypothesis': f'mu1-mu2 = {mu0}',
    'Alt. hypothesis': f'mu1-mu2 {alt} {mu0}',
    'T test statistic': test_t,
    'T test critical value': crit_t,
    'T test p-value': p_value_t
}

for key, value in results.items():
    print(f"{key}: {value}")
## Mean 1: 185.2142
## SD 1: 7.5261
## Sample 1 size: 14
## Mean 2: 184.8421
## SD 2: 5.0471
## Sample 2 size: 19
## Null hypothesis: mu1-mu2 = 0
## Alt. hypothesis: mu1-mu2 ≠ 0
## T test statistic: 0.16032589967252212
## T test critical value: 2.0775396981690264
## T test p-value: 0.8741316046180003
    
# Using raw data (t test)
from scipy.stats import ttest_ind, t

# Two data vectors
data1 = [1.2, 3.1, 1.7, 2.8, 3.0]
data2 = [4.2, 2.7, 3.6, 3.9]

# Storing test results as an object.
# Choose the alternative: "two-sided" (default), "less", or "greater", and the equal-variances assumption equal_var=True/False (default is True)
test_result = ttest_ind(data1, data2, alternative='two-sided', equal_var=True)

print(test_result)
## TtestResult(statistic=-2.3886571085065054, pvalue=0.04826397365151946, df=7.0)

C.4.3 Tests and confidence intervals for 2 proportions

Tests and confidence intervals for 2 proportions — Google spreadsheet

Tests and confidence intervals for 2 proportions — Excel template
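
With the pooled proportion $\hat{p} = (x_1 + x_2)/(n_1 + n_2)$, the test statistic is

$$z = \frac{\hat{p}_1 - \hat{p}_2}{\sqrt{\hat{p}(1-\hat{p})\left(\frac{1}{n_1} + \frac{1}{n_2}\right)}};$$

prop.test reports $z^2$ as a chi-square statistic with 1 degree of freedom, which the R code converts back to $z$ via a signed square root.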

# Test for 2 proportions
# Sample 1 size:
n1 <- 24
# Number of favourable observations in sample 1:
x1 <- 21
# Sample 1 proportion:
phat1 <- x1/n1
# Sample 2 size:
n2 <- 24
# Number of favourable observations in sample 2:
x2 <- 14
# Sample 2 proportion:
phat2 <- x2/n2


# Significance level:
alpha <- 0.05
# Alternative (sign): "<"; ">"; "<>"/"≠"
alt <- ">"

alttext <- if(alt==">") {"greater"} else if(alt=="<") {"less"} else {"two.sided"}

test <- prop.test(c(x1, x2), c(n1, n2), alternative=alttext, correct=FALSE)
test_z <- unname(-sign(diff(test$estimate))*sqrt(test$statistic))
crit_z <- if(test$alternative=="less") {qnorm(alpha)} else if(test$alternative=="greater") {qnorm(1-alpha)} else {qnorm(1-alpha/2)}

print(c('Sample proportions ' = test$estimate, 
        'Sample sizes ' = c(n1, n2),
        'Null hypothesis' = paste0('p1-p2 = ', 0),
        'Alt. hypothesis' = paste0('p1-p2 ', alt, ' ', 0),
        'Z test statistic' = test_z,
        'Chi^2 test statistic' = unname(test$statistic),
        'Z test critical value' = crit_z,
        'Chi^2 test critical value' = crit_z^2,
        'P-value' = test$p.value
))
## Sample proportions .prop 1 Sample proportions .prop 2             Sample sizes 1             Sample sizes 2 
##                    "0.875"        "0.583333333333333"                       "24"                       "24" 
##            Null hypothesis            Alt. hypothesis           Z test statistic       Chi^2 test statistic 
##                "p1-p2 = 0"                "p1-p2 > 0"         "2.27359424023522"         "5.16923076923077" 
##      Z test critical value  Chi^2 test critical value                    P-value 
##         "1.64485362695147"         "2.70554345409541"       "0.0114951970462325"
from statsmodels.stats.proportion import proportions_ztest
from scipy.stats import norm, chi2_contingency
import numpy as np

n1 = 24
x1 = 21
phat1 = x1 / n1

n2 = 24
x2 = 14
phat2 = x2 / n2

alpha = 0.05
alt = ">"

if alt == ">":
    alttext = "larger"
elif alt == "<":
    alttext = "smaller"
else:
    alttext = "two-sided"

test_result = proportions_ztest(count = np.array([x1, x2]), nobs = np.array([n1, n2]), alternative = alttext)

print("Z test statistic:", test_result[0], 
"\np-value:", test_result[1])
## Z test statistic: 2.2735942402352203 
## p-value: 0.011495197046232447

C.5 Chi-square test

Chi-square test — Google spreadsheet

Chi-square test — Excel template
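
Both tests compare observed counts $O_i$ with expected counts $E_i$ (derived from the independence model or from the hypothesized distribution):

$$\chi^2 = \sum_i \frac{(O_i - E_i)^2}{E_i}, \qquad G = 2 \sum_i O_i \ln\frac{O_i}{E_i},$$

each referred to a chi-square distribution with $(r-1)(c-1)$ degrees of freedom for an $r \times c$ table, or $k-1$ for a goodness-of-fit test with $k$ categories.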

# Independence/homogeneity test

# Data matrix

# Input vector
m <- c(
  21, 14,
  3, 10
)

# Number of rows in the data matrix 
nrow <- 2

# Significance level
alpha <- 0.05

# Transformation of the vector to a matrix
m <- matrix(data=m, nrow=nrow, byrow=TRUE)

# Chi-square test without Yates' correction
test_chi <- chisq.test(m, correct=FALSE)

# Chi-square test with Yates' correction for 2x2 tables
test_chi_corrected <- chisq.test(m)

# G test
test_g <- AMR::g.test(m)

# Exact Fisher test
exact_fisher<-fisher.test(m)

print(c('Degrees of freedom' = test_chi$parameter, 
        'Critical value' = qchisq(1-alpha, test_chi$parameter), 
        'Chi^2 statistic' = unname(test_chi$statistic),
        'Chi^2 test p-value' = test_chi$p.value,
        "Cramér's V" = unname(sqrt(test_chi$statistic/sum(m)/min(dim(m)-1))),
        'Phi coefficient (2x2 tables)' = if(all(dim(m)==2)) {psych::phi(m, digits=10)},
        "Chi^2 with Yates' correction test statistic" = unname(test_chi_corrected$statistic),
        "Chi^2 with Yates' correction p-value" = test_chi_corrected$p.value,
        'G statistic' = unname(test_g$statistic),
        'G test p-value' = test_g$p.value,
        'Exact Fisher test p-value' = exact_fisher$p.value
))
##                       Degrees of freedom.df                              Critical value 
##                                  1.00000000                                  3.84145882 
##                             Chi^2 statistic                          Chi^2 test p-value 
##                                  5.16923077                                  0.02299039 
##                                  Cramér's V                Phi coefficient (2x2 tables) 
##                                  0.32816506                                  0.32816506 
## Chi^2 with Yates' correction test statistic        Chi^2 with Yates' correction p-value 
##                                  3.79780220                                  0.05131990 
##                                 G statistic                              G test p-value 
##                                  5.38600494                                  0.02029889 
##                   Exact Fisher test p-value 
##                                  0.04899141
# Goodness-of-fit chi-square test

# Observed frequencies:
observed <- c(70, 10, 20)

# Expected frequencies:
expected <- c(80, 10, 10)

# Just in case: rescale the expected frequencies so that their sum equals the sum of the observed frequencies:
expected <- expected / sum(expected) * sum(observed)

# Significance level
alpha <- 0.05

test_chi <- chisq.test(x = observed, p = expected, rescale.p = TRUE)
test_g <- AMR::g.test(x = observed, p = expected, rescale.p = TRUE)

print(c('Degrees of freedom' = test_chi$parameter, 
        'Critical value' = qchisq(1-alpha, test_chi$parameter), 
        'Chi^2 statistic' = unname(test_chi$statistic),
        'p-value (chi-square test)' = test_chi$p.value,
        'G statistic' = unname(test_g$statistic),
        'p-value (G test)' = test_g$p.value
))
##     Degrees of freedom.df            Critical value           Chi^2 statistic p-value (chi-square test) 
##               2.000000000               5.991464547              11.250000000               0.003606563 
##               G statistic          p-value (G test) 
##               9.031492255               0.010935443
# Independence/homogeneity test
import numpy as np
import scipy.stats as stats
from scipy.stats import chi2

# Data matrix
m = np.array([
  [21, 14],
  [3, 10]
])

alpha = 0.05

# Chi-square test without Yates' correction
test_chi = stats.chi2_contingency(m, correction=False)

# Chi-square test with Yates' correction for 2x2 tables
test_chi_corrected = stats.chi2_contingency(m)

# G test
# Note: chi2_contingency applies Yates' correction to 2x2 tables by default,
# which also affects the log-likelihood-ratio (G) statistic; see the note
# after the output below
g, p, dof, expected = stats.chi2_contingency(m, lambda_="log-likelihood")

# Exact Fisher test
exact_fisher = stats.fisher_exact(m)

# Cramér's V
cramers_v = np.sqrt(test_chi[0] / m.sum() / min(m.shape[0]-1, m.shape[1]-1))

# Phi coefficient (2x2 tables)
phi_coefficient = None
if m.shape == (2, 2):
    phi_coefficient = cramers_v*np.sign(np.diagonal(m).prod()-np.diagonal(np.fliplr(m)).prod())

# Results
results = {
    'Number of degrees of freedom': test_chi[2],
    'Critical value': chi2.ppf(1-alpha, test_chi[2]),
    'Chi-square statistic': test_chi[0],
    'p-value (chi-square test)': test_chi[1],
    "Cramer\'s V": cramers_v,
    'Phi coefficient (for 2x2 table)': phi_coefficient,
    "Chi-square statistic with Yates' correction": test_chi_corrected[0],
    "p-value (chi-square test with Yates' correction)": test_chi_corrected[1],
    'G statistic': g,
    'p-value (G test)': p,
    'p-value (Fisher’s exact test)': exact_fisher[1]
}

for key, value in results.items():
    print(f"{key}: {value}")
## Number of degrees of freedom: 1
## Critical value: 3.841458820694124
## Chi-square statistic: 5.169230769230769
## p-value (chi-square test): 0.022990394092464842
## Cramer's V: 0.3281650616569468
## Phi coefficient (for 2x2 table): 0.3281650616569468
## Chi-square statistic with Yates' correction: 3.7978021978021976
## p-value (chi-square test with Yates' correction): 0.05131990358807137
## G statistic: 3.9106978537750194
## p-value (G test): 0.04797967015430134
## p-value (Fisher’s exact test): 0.048991413058947844
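
The G statistic above differs from the R result (5.386) because scipy's chi2_contingency applies Yates' correction to 2x2 tables by default, also when computing the log-likelihood-ratio statistic. Disabling the correction reproduces AMR::g.test:

# G test without Yates' correction, matching the R output
g_nc, p_nc, dof_nc, expected_nc = stats.chi2_contingency(m, lambda_="log-likelihood", correction=False)
print("G statistic:", g_nc, "\np-value:", p_nc)
# G statistic is about 5.386 and the p-value about 0.0203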
# Goodness-of-fit chi-square test

from scipy.stats import chisquare, chi2
import numpy as np

# Observed frequencies:
observed = np.array([70, 10, 20])
# Expected frequencies:
expected = np.array([80, 10, 10])

# Just in case: rescale the expected frequencies so that their sum equals the sum of the observed frequencies:
expected = expected / expected.sum() * observed.sum()

# Chi-square test:
chi_stat, chi_p = chisquare(f_obs=observed, f_exp=expected)

# Number of degrees of freedom:
df = len(observed) - 1

# Significance level:
alpha = 0.05

# Critical value:
critical_value = chi2.ppf(1 - alpha, df)

# G test:
from scipy.stats import power_divergence
g_stat, g_p = power_divergence(f_obs=observed, f_exp=expected, lambda_="log-likelihood")

# Results

results = {
    'Number of degrees of freedom': df,
    'Critical value': critical_value,
    'Chi^2 statistic': chi_stat,
    'p-value (chi-square test)': chi_p,
    'G statistic': g_stat,
    'p-value (G test)': g_p
}

for key, value in results.items():
    print(f"{key}: {value}")
## Number of degrees of freedom: 2
## Critical value: 5.991464547107979
## Chi^2 statistic: 11.25
## p-value (chi-square test): 0.0036065631360157305
## G statistic: 9.031492254964643
## p-value (G test): 0.010935442847719828
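
As a quick sanity check, the G statistic can also be computed straight from its definition, G = 2*sum(O*ln(O/E)), using the observed and expected arrays above:

G_manual = 2 * np.sum(observed * np.log(observed / expected))
print(G_manual)  # about 9.0315, matching power_divergence above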

C.6 ANOVA and Levene's test

ANOVA — Google spreadsheet

ANOVA — Excel template

# Example data
group <- as.factor(c(rep("A",7), rep("B",7), rep("C",7), rep("D",7)))
result <- c(51, 87, 50, 48, 79, 61, 53, 82, 91, 92, 80, 52, 79, 73, 
            79, 84, 74, 98, 63, 83, 85, 85, 80, 65, 71, 67, 51, 80)
data<-data.frame(group, result)

# To view the data, you can run: View(data)
# To write the data to a text file, you can run: write.csv2(data, "data.csv")
# To load data from a text file, you can run: read.csv2("data.csv")

# Summary table
library(dplyr)
data %>% 
  group_by(group) %>% 
  summarize(n=n(), suma = sum(result), srednia = mean(result), odch_st=sd(result), mediana = median(result)) %>%
  data.frame() -> summary_table

# ANOVA table
model<-aov(result~group, data=data)
anova_summary<-summary(model)

# Levene's test (Brown-Forsythe test)
levene_result<-car::leveneTest(result~group, data=data)

# Tukey's procedure
tukey_result<-TukeyHSD(x=model, conf.level=0.95)

print(list(`Summary` = summary_table, `ANOVA table` = anova_summary, `Levene test` = levene_result,
           `Tukey HSD` = tukey_result))
## $Summary
##   group n suma  srednia  odch_st mediana
## 1     A 7  429 61.28571 15.56400      53
## 2     B 7  549 78.42857 13.45185      80
## 3     C 7  566 80.85714 10.76148      83
## 4     D 7  499 71.28571 11.61485      71
## 
## $`ANOVA table`
##             Df Sum Sq Mean Sq F value Pr(>F)  
## group        3   1620   539.8   3.204 0.0412 *
## Residuals   24   4043   168.5                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## $`Levene test`
## Levene's Test for Homogeneity of Variance (center = median)
##       Df F value Pr(>F)
## group  3  0.1898 0.9023
##       24               
## 
## $`Tukey HSD`
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = result ~ group, data = data)
## 
## $group
##          diff        lwr       upr     p adj
## B-A 17.142857  -1.996412 36.282127 0.0905494
## C-A 19.571429   0.432159 38.710698 0.0437429
## D-A 10.000000  -9.139270 29.139270 0.4870470
## C-B  2.428571 -16.710698 21.567841 0.9849136
## D-B -7.142857 -26.282127 11.996412 0.7340659
## D-C -9.571429 -28.710698  9.567841 0.5237024
import pandas as pd
import numpy as np
from scipy import stats
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from statsmodels.stats.anova import anova_lm

# Data frame
group = ['A']*7 + ['B']*7 + ['C']*7 + ['D']*7
result = [51, 87, 50, 48, 79, 61, 53,
          82, 91, 92, 80, 52, 79, 73, 
          79, 84, 74, 98, 63, 83, 85, 
          85, 80, 65, 71, 67, 51, 80]
data = pd.DataFrame({'group': group, 'result': result})

# Summary table
summary_table = data.groupby('group')['result'].agg(['count', 'sum', 'mean', 'std', 'median']).reset_index()
summary_table.columns = ['group', 'n', 'sum', 'mean', 'sd', 'median']

# ANOVA
model = ols('result ~ group', data=data).fit()
anova_summary = anova_lm(model, typ=2)

# Levene's test (Brown-Forsythe test)
levene_result = stats.levene(data['result'][data['group'] == 'A'],
                             data['result'][data['group'] == 'B'],
                             data['result'][data['group'] == 'C'],
                             data['result'][data['group'] == 'D'])

# Tukey's HSD
tukey_result = pairwise_tukeyhsd(endog=data['result'], groups=data['group'], alpha=0.05)

print('Summary:\n\n', summary_table, '\n\nANOVA:\n\n', anova_summary, '\n\nLevene\'s test:\n\n', levene_result,
       '\n\nTukey HSD:\n\n', tukey_result)
## Summary:
## 
##    group  n  sum       mean         sd  median
## 0     A  7  429  61.285714  15.564000    53.0
## 1     B  7  549  78.428571  13.451854    80.0
## 2     C  7  566  80.857143  10.761483    83.0
## 3     D  7  499  71.285714  11.614851    71.0 
## 
## ANOVA:
## 
##                 sum_sq    df         F    PR(>F)
## group     1619.535714   3.0  3.204282  0.041204
## Residual  4043.428571  24.0       NaN       NaN 
## 
## Levene's test:
## 
##  LeveneResult(statistic=0.18975139523084725, pvalue=0.9023335775328473) 
## 
## Tukey HSD:
## 
##   Multiple Comparison of Means - Tukey HSD, FWER=0.05 
## =====================================================
## group1 group2 meandiff p-adj   lower    upper  reject
## -----------------------------------------------------
##      A      B  17.1429 0.0905  -1.9964 36.2821  False
##      A      C  19.5714 0.0437   0.4322 38.7107   True
##      A      D     10.0  0.487  -9.1393 29.1393  False
##      B      C   2.4286 0.9849 -16.7107 21.5678  False
##      B      D  -7.1429 0.7341 -26.2821 11.9964  False
##      C      D  -9.5714 0.5237 -28.7107  9.5678  False
## -----------------------------------------------------

C.7 Nonparametric tests

Nonparametric tests — Google spreadsheet

Nonparametric tests — Excel template

# Runs test
# Example data
vHT <- "HHTHTTTHHTHTTHHTHTHHTHTHHHTTHTHTHTTHHTTTTHHTHHTTHTHTTHTHHHT"

# Transforming into a vector of 0s and 1s
v01 <- 1*(unlist(strsplit(vHT, ""))=="H")

# Test
DescTools::RunsTest(v01)
## 
##  Runs Test for Randomness
## 
## data:  v01
## z = 1.8413, runs = 38, m = 29, n = 30, p-value = 0.06557
## alternative hypothesis: true number of runs is not equal the expected number
# Mann-Whitney test
# Data (example):
p1<-c(24, 25, 21, 22, 23, 18, 17, 28, 24, 27, 21, 23)
p2<-c(20, 23, 21, 25, 18, 17, 18, 24, 20, 24, 23, 19)

# Test
# To set the options explicitly: wilcox.test(p1, p2, correct=TRUE, exact=FALSE)
wilcox.test(p1, p2)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  p1 and p2
## W = 94, p-value = 0.2114
## alternative hypothesis: true location shift is not equal to 0
# Wilcoxon paired test
# Data (example):
p1<-c(24, 25, 21, 22, 23, 18, 17, 28, 24, 27, 21, 23)
p2<-c(20, 23, 21, 25, 18, 17, 18, 24, 20, 24, 23, 19)

# Test:
# To set the options explicitly: wilcox.test(p2, p1, paired=TRUE, correct=FALSE, exact=FALSE, alternative="two.sided")
wilcox.test(p1, p2, paired=TRUE)
## 
##  Wilcoxon signed rank test with continuity correction
## 
## data:  p1 and p2
## V = 55.5, p-value = 0.04898
## alternative hypothesis: true location shift is not equal to 0
# Kruskal-Wallis test
# Data (example):
df<-data.frame(A = c(7,8,9,9,10,11), B = c(12,13,14,14,15,16))
df2 <- tidyr::gather(df)

# Test:
kruskal.test(value ~ key, data = df2)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  value by key
## Kruskal-Wallis chi-squared = 8.3662, df = 1, p-value = 0.003823
import numpy as np
import pandas as pd
from scipy.stats import wilcoxon, kruskal, mannwhitneyu, norm

# Runs test
# Example data
vHT = "HHTHTTTHHTHTTHHTHTHHTHTHHHTTHTHTHTTHHTTTTHHTHHTTHTHTTHTHHHT"

# Transforming into a vector of 0s and 1s
v01 = np.array([1 if char == 'H' else 0 for char in vHT])

# Calculations
def getRuns(l):
    # Number of runs = number of maximal blocks of identical symbols
    import itertools
    return sum(1 for _ in itertools.groupby(l))
r = getRuns(v01)
n = len(v01)
n1 = sum(v01)
n0 = n - n1
# Expected number of runs and its standard deviation under randomness
mu_r = (2 * n0 * n1 / n) + 1
sigma_r = np.sqrt((2 * n0 * n1 * (2 * n0 * n1 - n)) / (n ** 2 * (n - 1)))
# Z statistic with a continuity correction
z = (r-mu_r-np.sign(r-mu_r)/2)/sigma_r
p_value = 2 * (1 - norm.cdf(abs(z)))
print("z =", z, "p-value =", p_value)
## z = 1.8413274595803757 p-value = 0.0655735860394191
# Mann-Whitney test
# Data (example):
p1 = np.array([24, 25, 21, 22, 23, 18, 17, 28, 24, 27, 21, 23])
p2 = np.array([20, 23, 21, 25, 18, 17, 18, 24, 20, 24, 23, 19])

# Test
mannwhitneyu(p1, p2)
## MannwhitneyuResult(statistic=94.0, pvalue=0.21138945901258455)
# Wilcoxon paired test
# Data (example):
p1 = np.array([24, 25, 21, 22, 23, 18, 17, 28, 24, 27, 21, 23])
p2 = np.array([20, 23, 21, 25, 18, 17, 18, 24, 20, 24, 23, 19])

# Test
# Note: scipy reports the smaller of the two signed-rank sums (R reports V,
# the sum of positive ranks) and by default omits the continuity correction
# that R applies, so the statistic and p-value differ slightly from wilcox.test
wilcoxon(p1, p2)
## WilcoxonResult(statistic=10.5, pvalue=0.044065400736826854)
# Kruskal-Wallis test
# Data (example):
df = pd.DataFrame({'A': [7, 8, 9, 9, 10, 11], 'B': [12, 13, 14, 14, 15, 16]})
df2 = df.melt()

# Test
kruskal(*[group["value"].values for name, group in df2.groupby("variable")])
## KruskalResult(statistic=8.366197183098597, pvalue=0.0038226470545864484)

C.8 Other tests

C.8.1 Regression

Simple regression — Google spreadsheet

Simple regression — Excel template

Multiple regression — Google spreadsheet

Multiple regression — Excel template
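
Unlike the other sections, these templates have no accompanying code listing. A minimal Python sketch of a simple regression with statsmodels (made-up example data, not the templates' own):

import numpy as np
import statsmodels.api as sm

# Made-up example data (hypothetical, for illustration only)
x = np.array([1, 2, 3, 4, 5, 6, 7, 8])
y = np.array([2.1, 3.9, 6.2, 7.8, 10.1, 12.2, 13.8, 16.1])

X = sm.add_constant(x)      # design matrix with an intercept column
model = sm.OLS(y, X).fit()  # ordinary least squares fit

print(model.summary())             # coefficients, standard errors, t tests, R^2
print(model.conf_int(alpha=0.05))  # 95% confidence intervals for the coefficients

# For multiple regression, stack the predictors column-wise before adding the
# constant, e.g. X = sm.add_constant(np.column_stack([x1, x2]))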

C.8.2 Normality check

Normality check — Google spreadsheet

Normality check — Excel template

# Normality check
# Data
data<-c(76, 62, 55, 62, 56, 64, 44, 56, 94, 37, 72, 85, 72, 71, 60, 61, 64, 71, 69, 70, 64, 59, 71, 65, 84)
# Sample mean
m<-mean(data)
# Sample standard deviation
s<-sd(data)
# Skewness
skew<-e1071::skewness(data, type=2)
# Kurtosis
kurt<-e1071::kurtosis(data, type=2)
# IQR/s ratio
IQR_to_s<-IQR(data)/s
# proportion of observations within 1 sd from the mean
within1s<-mean(abs(data-m)<s)
# proportion of observations within 2 sds from the mean
within2s<-mean(abs(data-m)<2*s)
# proportion of observations within 3 sd from the mean
within3s<-mean(abs(data-m)<3*s)
# Shapiro-Wilk test
SW_res<-shapiro.test(data)
# Jarque-Bera test
JB_res<-tseries::jarque.bera.test(data)
# Anderson-Darling test
AD_res<-nortest::ad.test(data)
# Kolmogorov-Smirnov test
KS_res <- ks.test(data, function(x){pnorm(x, mean(data), sd(data))})

# Results
print(c('Mean' = m, 
        'St. deviation' = s,
        'Sample size' = length(data),
        'Skewness (~=0?)' = skew,
        'Kurtosis (~=0?)' = kurt,
        'IQR/s (~=1.3?)' = IQR_to_s,
        '68% rule' = within1s*100,
        '95% rule' = within2s*100,
        '100% rule' = within3s*100,
        'Test stat. - SW test' = SW_res$statistic,
        'P-value - SW test' = SW_res$p.value,
        'Test stat. - JB test' = JB_res$statistic,
        'P-value - JB test' = JB_res$p.value,
        'Test stat. - AD test' = AD_res$statistic,
        'P-value - AD test' = AD_res$p.value,
        'Test stat - KS test' = KS_res$statistic,
        'P-value - KS test' = KS_res$p.value
))
##                           Mean                  St. deviation                    Sample size                Skewness (~=0?) 
##                   65.760000000                   12.145918382                   25.000000000                   -0.002892879 
##                Kurtosis (~=0?)                 IQR/s (~=1.3?)                       68% rule                       95% rule 
##                    1.121315636                    0.905654036                   80.000000000                   92.000000000 
##                      100% rule         Test stat. - SW test.W              P-value - SW test Test stat. - JB test.X-squared 
##                  100.000000000                    0.964892812                    0.520206266                    0.479578632 
##              P-value - JB test         Test stat. - AD test.A              P-value - AD test          Test stat - KS test.D 
##                    0.786793609                    0.445513641                    0.260454333                    0.143712404 
##              P-value - KS test 
##                    0.680154210
# Plots
hist(data, prob=TRUE)
curve(dnorm(x, m, s), col="darkblue", lwd=2, add=TRUE)

qqnorm(data, pch=16)
qqline(data, col='darkblue')

import numpy as np
import math
from scipy import stats
from scipy.stats import skew, kurtosis, anderson, kstest, norm

data = [76, 62, 55, 62, 56, 64, 44, 56, 94, 37, 72, 85, 72, 71, 60, 61, 64, 71, 69, 70, 64, 59, 71, 65, 84]

m = np.mean(data)

s = np.std(data, ddof=1)

# Renamed to avoid shadowing the imported skew() function
skewness = skew(data, bias=False)

kurt = kurtosis(data, bias=False)

IQR_to_s = stats.iqr(data) / s

within1s = np.mean(np.abs(data - m) < s)

within2s = np.mean(np.abs(data - m) < 2 * s)

within3s = np.mean(np.abs(data - m) < 3 * s)

SW_res = stats.shapiro(data)

JB_res = stats.jarque_bera(data)

AD_res = stats.anderson(data)

# P-value approximation for the Anderson-Darling statistic following
# nortest::ad.test (the small-sample adjustment uses the sample size n,
# not a constant)
n = len(data)
AD2 = AD_res[0]*(1 + .75/n + 2.25/n**2)

if AD2 >= .6:
    AD_p = math.exp(1.2937 - 5.709*AD2 + .0186*(AD2**2))
elif AD2 >= .34:
    AD_p = math.exp(.9177 - 4.279*AD2 - 1.38*(AD2**2))
elif AD2 > .2:
    AD_p = 1 - math.exp(-8.318 + 42.796*AD2 - 59.938*(AD2**2))
else:
    AD_p = 1 - math.exp(-13.436 + 101.14*AD2 - 223.73*(AD2**2))

# Kolmogorov-Smirnov test against N(m, s); as in the R code, the sample
# standard deviation (ddof=1) is used
KS_res = kstest(data, norm(loc=m, scale=s).cdf)

results = {
    'Mean': m, 
    'St. deviation': s,
    'Sample size': len(data),
    'Skewness (~=0?)': skewness,
    'Excess kurtosis (~=0?)': kurt,
    'IQR/s (~=1.3?)': IQR_to_s,
    '68% rule': within1s * 100,
    '95% rule': within2s * 100,
    '100% rule': within3s * 100,
    'Test stat. - SW test': SW_res[0],
    'P-value - SW test': SW_res[1],
    'Test stat. - JB test': JB_res[0],
    'P-value - JB test': JB_res[1],
    'Test stat. - AD test': AD_res[0],
    'P-value - AD test': AD_p,
    'Test stat. - KS test': KS_res[0],
    'P-value - KS test': KS_res[1]
}


for key, value in results.items():
    print(f"{key}: {value}")
## Mean: 65.76
## St. deviation: 12.145918381634768
## Sample size: 25
## Skewness (~=0?): -0.002892878582795321
## Excess kurtosis (~=0?): 1.121315636006976
## IQR/s (~=1.3?): 0.9056540357320815
## 68% rule: 80.0
## 95% rule: 92.0
## 100% rule: 100.0
## Test stat. - SW test: 0.9648927450180054
## P-value - SW test: 0.5202044248580933
## Test stat. - JB test: 0.47957863171421605
## P-value - JB test: 0.7867936085428064
## Test stat. - AD test: 0.44551364110243696
## P-value - AD test: 0.260454333
## Test stat. - KS test: 0.143712404
## P-value - KS test: 0.68015421
import matplotlib.pyplot as plt

# Histogram
plt.hist(data, density=True)
xmin, xmax = plt.xlim()
x = np.linspace(xmin, xmax, 100)
p = stats.norm.pdf(x, m, s)
plt.plot(x, p, linewidth=2, color="darkblue")
plt.show()

# Q-Q plot
qq_data = stats.probplot(data, dist="norm", plot=plt)
plt.show()

C.8.3 Correlation

Correlation — Google spreadsheet

Correlation — Excel template

#data:
x <- c(4, 8, 6, 5, 3, 8, 6, 5, 5, 6, 3, 3, 3, 7, 3, 6, 4, 4, 8, 7)
y <- c(4, 6, 8, 3, 4, 6, 5, 8, 1, 4, 4, 3, 1, 6, 7, 5, 4, 4, 9, 5)
#test and confidence interval:
cor.test(x, y, alternative = "two.sided", conf.level = 0.95)
## 
##  Pearson's product-moment correlation
## 
## data:  x and y
## t = 2.56, df = 18, p-value = 0.01968
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.09607185 0.78067292
## sample estimates:
##       cor 
## 0.5166288
import scipy.stats as stats
import numpy as np
import math

x = [4, 8, 6, 5, 3, 8, 6, 5, 5, 6, 3, 3, 3, 7, 3, 6, 4, 4, 8, 7]
y = [4, 6, 8, 3, 4, 6, 5, 8, 1, 4, 4, 3, 1, 6, 7, 5, 4, 4, 9, 5]

correlation_test = stats.pearsonr(x, y)

# Confidence interval via Fisher's z-transformation:
# z = atanh(r) is approximately normal with standard error 1/sqrt(n-3);
# the bounds are computed on the z scale and transformed back below
ci_tmp = stats.norm.interval(0.95, 
  loc=0.5*math.log((1+correlation_test[0])/(1-correlation_test[0])), 
  scale=1/((len(x)-3)**0.5))
confidence_interval = (np.exp(2*np.array(ci_tmp))-1)/(np.exp(2*np.array(ci_tmp))+1)

print('Test:\n\n', correlation_test, '\n\n Confidence interval: \n\n', confidence_interval)
## Test:
## 
##  PearsonRResult(statistic=0.5166287822129882, pvalue=0.01968490591016886) 
## 
##  Confidence interval: 
## 
##  [0.09607185 0.78067292]
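
For reference, the t statistic reported by cor.test in the R output follows from r via t = r*sqrt(n-2)/sqrt(1-r^2); a quick check with the values above:

r = correlation_test[0]
n = len(x)
t_stat = r * np.sqrt(n - 2) / np.sqrt(1 - r**2)
print(t_stat)  # about 2.56, matching the R output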