// Binomial probability mass function
binomPMF = (k, n, p) => {
if (k < 0 || k > n) return 0;
const binomCoeff = factorial(n) / (factorial(k) * factorial(n - k));
return binomCoeff * Math.pow(p, k) * Math.pow(1 - p, n - k);
}
// Discrete Laplace probability mass function on {0, ..., n}
expPMF = (k, n, theta) => {
if (k < 0 || k > n) return 0;
// Calculate unnormalized probabilities
const unnormalized = [];
let sum = 0;
for (let i = 0; i <= n; i++) {
const val = Math.exp(-Math.abs(i - n * theta));
unnormalized.push(val);
sum += val;
}
// Return normalized probability
return unnormalized[k] / sum;
}
// Factorial function (simple iterative product; n is small here)
factorial = (n) => {
if (n <= 1) return 1;
let result = 1;
for (let i = 2; i <= n; i++) {
result *= i;
}
return result;
}
// Interactive controls
viewof n = Inputs.range([1, 8], {value: 4, step: 1, label: "n"})
viewof theta = Inputs.range([0.01, 0.99], {value: 0.30, step: 0.01, label: "θ"})
viewof t_select = Inputs.range([0, 2*n], {value: Math.min(3, 2*n), step: 1, label: "T = t"})
viewof distributionType = Inputs.radio(["Binomial", "Discrete Laplace"], {value: "Binomial", label: "Distribution"})
// Select which PMF to use based on the radio selection
pmf = distributionType === "Binomial" ? binomPMF : expPMF
// Generate joint distribution data
jointData = {
const data = [];
for (let x1 = 0; x1 <= n; x1++) {
for (let x2 = 0; x2 <= n; x2++) {
const prob = pmf(x1, n, theta) * pmf(x2, n, theta);
const t = x1 + x2;
data.push({
x1: x1,
x2: x2,
prob: prob,
t: t,
selected: t === t_select
});
}
}
return data;
}
// Generate conditional distribution data (X given T = t)
conditionalData = {
const data = [];
// Only process if t_select is valid
if (t_select <= 2 * n) {
// Find all (x1, x2) pairs where x1 + x2 = t
const validPairs = jointData.filter(d => d.t === t_select);
const totalCondProb = validPairs.reduce((sum, d) => sum + d.prob, 0);
validPairs.forEach(d => {
if (totalCondProb > 0) {
data.push({
x1: d.x1,
x2: d.x2,
condProb: d.prob / totalCondProb,
t: d.t
});
}
});
}
return data;
}
// Calculate maximum probability for scaling
maxProb = Math.max(...jointData.map(d => d.prob))
maxCondProb = conditionalData.length > 0 ? Math.max(...conditionalData.map(d => d.condProb)) : 1
// Generate marginal distributions
marginalX1 = {
const data = [];
for (let x1 = 0; x1 <= n; x1++) {
data.push({
x1: x1,
prob: pmf(x1, n, theta)
});
}
return data;
}
marginalX2 = {
const data = [];
for (let x2 = 0; x2 <= n; x2++) {
data.push({
x2: x2,
prob: pmf(x2, n, theta)
});
}
return data;
}
// Generate conditional marginal distributions
conditionalMarginalX1 = {
const data = [];
const totalT = Math.min(t_select, 2 * n);
// Only process if t_select is valid
if (t_select <= 2 * n) {
// Calculate total probability for normalization
let totalProb = 0;
for (let x1 = Math.max(0, totalT - n); x1 <= Math.min(n, totalT); x1++) {
const x2 = totalT - x1;
if (x2 >= 0 && x2 <= n) {
totalProb += pmf(x1, n, theta) * pmf(x2, n, theta);
}
}
for (let x1 = 0; x1 <= n; x1++) {
const x2 = totalT - x1;
if (x2 >= 0 && x2 <= n && totalProb > 0) {
const jointProb = pmf(x1, n, theta) * pmf(x2, n, theta);
data.push({
x1: x1,
prob: jointProb / totalProb
});
}
}
}
return data;
}
conditionalMarginalX2 = {
const data = [];
const totalT = Math.min(t_select, 2 * n);
// Only process if t_select is valid
if (t_select <= 2 * n) {
// Calculate total probability for normalization
let totalProb = 0;
for (let x2 = Math.max(0, totalT - n); x2 <= Math.min(n, totalT); x2++) {
const x1 = totalT - x2;
if (x1 >= 0 && x1 <= n) {
totalProb += pmf(x1, n, theta) * pmf(x2, n, theta);
}
}
for (let x2 = 0; x2 <= n; x2++) {
const x1 = totalT - x2;
if (x1 >= 0 && x1 <= n && totalProb > 0) {
const jointProb = pmf(x1, n, theta) * pmf(x2, n, theta);
data.push({
x2: x2,
prob: jointProb / totalProb
});
}
}
}
return data;
}
maxRad = 15 * 5 / (n + 1)
maxMarginalProb = Math.max(...marginalX1.map(d => d.prob), ...marginalX2.map(d => d.prob))
maxCondMarginalProb = conditionalMarginalX1.length > 0 && conditionalMarginalX2.length > 0 ?
Math.max(...conditionalMarginalX1.map(d => d.prob), ...conditionalMarginalX2.map(d => d.prob)) : 1
// Create joint distribution plot with marginal histograms
jointPlot = Plot.plot({
width: 480,
height: 480,
marginTop: 140,
marginLeft: 80,
marginBottom: 80,
marginRight: 120,
style: { fontSize: "20px" },
r: {
type: "linear",
domain: [0, maxRad], // Fixed domain from 0 to maxRad
range: [0, maxRad], // Maps directly to pixel sizes
clamp: false // Do not clamp values outside the domain
},
x: {
domain: [-0.5, n + 0.5],
ticks: Array.from({length: n + 1}, (_, i) => i),
tickFormat: d => d.toString(),
label: "X₁",
labelAnchor: "center",
labelOffset: 50,
labelArrow: "none"
},
y: {
domain: [-0.5, n + 0.5],
ticks: Array.from({length: n + 1}, (_, i) => i),
tickFormat: d => d.toString(),
label: "X₂",
labelAnchor: "center",
labelOffset: 50,
labelArrow: "none"
},
marks: [
// Grid lines
Plot.ruleX(Array.from({length: n + 1}, (_, i) => i), {stroke: "#f0f0f0"}),
Plot.ruleY(Array.from({length: n + 1}, (_, i) => i), {stroke: "#f0f0f0"}),
// Probability circles
Plot.dot(jointData, {
x: "x1",
y: "x2",
r: d => Math.sqrt(d.prob / maxProb) * maxRad,
fill: d => d.selected ? "red" : "steelblue",
fillOpacity: 0.7,
stroke: "black",
strokeWidth: 0.5
}),
// Top marginal histogram (X₁)
Plot.rect(marginalX1, {
x1: d => d.x1 - 0.3,
x2: d => d.x1 + 0.3,
y1: n + 0.56 + n * .06,
y2: d => n + 0.56 + n * .06 + (d.prob / maxMarginalProb) * (n + 1) * 0.3,
fill: "steelblue",
fillOpacity: 0.7,
stroke: "black",
strokeWidth: 0.5
}),
// Right marginal histogram (X₂)
Plot.rect(marginalX2, {
x1: n + 0.56 + n * .06,
x2: d => n + 0.56 + n * .06 + (d.prob / maxMarginalProb) * (n + 1) * 0.3,
y1: d => d.x2 - 0.3,
y2: d => d.x2 + 0.3,
fill: "steelblue",
fillOpacity: 0.7,
stroke: "black",
strokeWidth: 0.5
}),
// Title
Plot.text(["Joint Distribution of X = (X₁, X₂)"], {
x: n/2,
y: n + 0.5 + (n + 1) * .45,
fontSize: 24,
fontWeight: "bold",
textAnchor: "middle"
})
]
})
// Create conditional distribution plot with conditional marginal histograms
conditionalPlot = Plot.plot({
width: 480,
height: 480,
marginTop: 140,
marginLeft: 80,
marginBottom: 80,
marginRight: 120,
style: { fontSize: "20px" },
r: {
type: "linear",
domain: [0, maxRad], // Fixed domain from 0 to maxRad
range: [0, maxRad], // Maps directly to pixel sizes
clamp: false // Do not clamp values outside the domain
},
x: {
domain: [-0.5, n + 0.5],
ticks: Array.from({length: n + 1}, (_, i) => i),
tickFormat: d => d.toString(),
label: "X₁",
labelAnchor: "center",
labelOffset: 50,
labelArrow: "none"
},
y: {
domain: [-0.5, n + 0.5],
ticks: Array.from({length: n + 1}, (_, i) => i),
tickFormat: d => d.toString(),
label: "X₂",
labelAnchor: "center",
labelOffset: 50,
labelArrow: "none"
},
marks: [
// Grid lines
Plot.ruleX(Array.from({length: n + 1}, (_, i) => i), {stroke: "#f0f0f0"}),
Plot.ruleY(Array.from({length: n + 1}, (_, i) => i), {stroke: "#f0f0f0"}),
// All possible points (faded)
Plot.dot(jointData, {
x: "x1",
y: "x2",
r: 3,
fill: "lightgray",
fillOpacity: 0.3
}),
// Conditional probability circles
Plot.dot(conditionalData, {
x: "x1",
y: "x2",
r: d => conditionalData.length > 0 ? Math.sqrt(d.condProb / maxCondProb) * maxRad : 5,
fill: "red",
fillOpacity: 0.8,
stroke: "black",
strokeWidth: 1
}),
// Top conditional marginal histogram (X₁ | T = t)
Plot.rect(conditionalMarginalX1, {
x1: d => d.x1 - 0.3,
x2: d => d.x1 + 0.3,
y1: n + 0.56 + n * .06,
y2: d => n + 0.56 + n * .06 + (d.prob / maxCondMarginalProb) * (n + 1) * 0.3,
fill: "red",
fillOpacity: 0.7,
stroke: "black",
strokeWidth: 0.5
}),
// Right conditional marginal histogram (X₂ | T = t)
Plot.rect(conditionalMarginalX2, {
x1: n + 0.56 + n * .06,
x2: d => n + 0.56 + n * .06 + (d.prob / maxCondMarginalProb) * (n + 1) * 0.3,
y1: d => d.x2 - 0.3,
y2: d => d.x2 + 0.3,
fill: "red",
fillOpacity: 0.7,
stroke: "black",
strokeWidth: 0.5
}),
// Title
Plot.text([`Conditional Distribution P(X | T = ${t_select})`], {
x: n/2,
y: n + 0.5 + (n + 1) * .45,
fontSize: 24,
fontWeight: "bold",
textAnchor: "middle"
})
]
})
// Display plots side by side
html`<div style="display: flex; gap: 20px; align-items: center; justify-content: center;">
${jointPlot}
${conditionalPlot}
</div>`
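// Optional numerical check (a sketch, not part of the applet above): compare the conditional
// distribution of (X₁, X₂) given T = X₁ + X₂ = t at two values of θ, using the binomPMF and
// expPMF cells defined earlier. The constants nCheck, tCheck, thetaA, and thetaB are
// illustrative choices, not defined elsewhere in the notebook. The maximum difference is
// essentially zero for the binomial PMF and clearly positive for the discrete Laplace PMF,
// matching the sufficiency claims discussed in the text.
sufficiencyCheck = {
const nCheck = 4, tCheck = 3, thetaA = 0.3, thetaB = 0.7;
// Conditional pmf of X₁ over the support {x1 : 0 <= x1 <= nCheck, 0 <= tCheck - x1 <= nCheck}
const conditional = (pmfFn, theta) => {
const probs = [];
for (let x1 = 0; x1 <= nCheck; x1++) {
const x2 = tCheck - x1;
if (x2 >= 0 && x2 <= nCheck) probs.push(pmfFn(x1, nCheck, theta) * pmfFn(x2, nCheck, theta));
}
const total = probs.reduce((s, p) => s + p, 0);
return probs.map(p => p / total);
};
// Largest pointwise difference between the conditional pmfs at thetaA and thetaB
const maxDiff = (pmfFn) => {
const a = conditional(pmfFn, thetaA);
const b = conditional(pmfFn, thetaB);
return Math.max(...a.map((p, i) => Math.abs(p - b[i])));
};
return {binomial: maxDiff(binomPMF), discreteLaplace: maxDiff(expPMF)};
}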
Sufficiency
\[ \newcommand{\cB}{\mathcal{B}} \newcommand{\cF}{\mathcal{F}} \newcommand{\cN}{\mathcal{N}} \newcommand{\cP}{\mathcal{P}} \newcommand{\cX}{\mathcal{X}} \newcommand{\EE}{\mathbb{E}} \newcommand{\PP}{\mathbb{P}} \newcommand{\RR}{\mathbb{R}} \newcommand{\ZZ}{\mathbb{Z}} \newcommand{\td}{\,\textrm{d}} \newcommand{\simiid}{\stackrel{\textrm{i.i.d.}}{\sim}} \newcommand{\simind}{\stackrel{\textrm{ind.}}{\sim}} \newcommand{\eqas}{\stackrel{\textrm{a.s.}}{=}} \newcommand{\eqPas}{\stackrel{\cP\textrm{-a.s.}}{=}} \newcommand{\eqmuas}{\stackrel{\mu\textrm{-a.s.}}{=}} \newcommand{\eqD}{\stackrel{D}{=}} \newcommand{\indep}{\perp\!\!\!\!\perp} \DeclareMathOperator*{\minz}{minimize\;} \DeclareMathOperator*{\maxz}{maximize\;} \DeclareMathOperator*{\argmin}{argmin\;} \DeclareMathOperator*{\argmax}{argmax\;} \newcommand{\Var}{\textnormal{Var}} \newcommand{\Cov}{\textnormal{Cov}} \newcommand{\Corr}{\textnormal{Corr}} \]
Sufficiency
Sufficiency is a central concept in statistics that allows us to focus on the essential aspects of the data set while ignoring details that are irrelevant to the inference problem. If \(X\sim P_\theta\) represents the entire data set, drawn from a model \(\cP = \{P_\theta:\; \theta \in \Theta\}\), then this lecture will concern the idea of a sufficient statistic \(T(X)\) that carries all of the information in the data that can help us learn about \(\theta\).
A statistic \(T(X)\) is any random variable which is a function of the data \(X\), and which does not depend on the unknown parameter \(\theta\). We say the statistic \(T(X)\) is sufficient for the model \(\cP\) if \(P_\theta(X \mid T)\) does not depend on \(\theta\). This lecture will be devoted to interpreting this definition and giving examples.
Example (Independent Bernoulli sequence): We introduced the binomial example from Lecture 2 by telling a story about an investigator who flips a biased coin \(n\) times and records the total number of heads, which has a binomial distribution. All of the estimators we considered were functions only of the (binomially-distributed) count of heads.
But if the investigator had actually performed this experiment, they would have observed more than just the total number of heads: they would have observed the entire sequence of \(n\) heads and tails. If we let \(X_i\) denote a binary indicator of whether the \(i\)th throw is heads, for \(i=1,\ldots,n\), then we have assumed that these indicators are i.i.d. Bernoulli random variables:
\[ X_1,\ldots,X_n \simiid \text{Bern}(\theta). \]
Let \(T(X) = \sum_i X_i \sim \text{Binom}(n,\theta)\) denote the summary statistic that we previously used to represent the entire data set. It is undeniable that we have lost some information by only recording \(T(X)\) instead of the entire sequence \(X = (X_1,\ldots,X_n)\). As a result, we might wonder whether we could have improved the estimator by considering all functions of \(X\), not just functions of \(T(X)\).
The answer is that, no, we did not really lose anything by summarizing the data by \(T(X)\) because \(T(X)\) is sufficient. The joint pmf of the data set \(X \in \{0,1\}^n\) (i.e., the density wrt the counting measure on \(\{0,1\}^n\)) is
\[ p_\theta(x) = \prod_{i=1}^n \theta^{x_i}(1-\theta)^{1-x_i} = \theta^{\sum_i x_i}(1-\theta)^{n-\sum_i x_i}. \]
Note that this pmf depends only on \(T(x)\): it assigns probability \(\theta^t (1-\theta)^{n-t}\) to every sequence with \(T(X)=t\) total heads. As a result, the conditional distribution given \(T(X)=t\) should be uniform on all of the \(\binom{n}{t}\) sequences with \(t\) heads. We can confirm this by calculating the conditional pmf directly:
\[ \begin{aligned} \PP_\theta(X = x \mid T(X) = t) &= \frac{\PP_\theta(X=x, \sum_i X_i = t)}{\PP_\theta(T(X) = t)} \\[7pt] &= \frac{\theta^t (1-\theta)^{n-t}1\{\sum_i x_i = t\}}{\theta^t(1-\theta)^{n-t}\binom{n}{t}}\\[5pt] &= \binom{n}{t}^{-1}1\{T(x) = t\}. \end{aligned} \]
Since the conditional distribution does not depend on \(\theta\), \(T(X)\) is sufficient for the model \(\cP\).
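As a quick concrete check of this formula, take \(n = 3\) and \(t = 2\): the sequences \((1,1,0)\), \((1,0,1)\), and \((0,1,1)\) each have probability \(\theta^2(1-\theta)\), so \[ \PP_\theta(X = (1,1,0) \mid T(X) = 2) = \frac{\theta^2(1-\theta)}{3\,\theta^2(1-\theta)} = \frac{1}{3}, \] and likewise for the other two sequences, whatever the value of \(\theta\).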
Visualization of sufficiency
The applet below illustrates the concept of sufficiency for a pair of random variables \(X_1,X_2 \simiid p_\theta(x)\), where \(p_\theta(x)\) is a univariate probability mass function on \(\{0,\ldots,n\}\). We illustrate two cases, first the case where \(p_\theta(x)\) is the binomial \[ p_\theta(x) = \binom{n}{x}\theta^x(1-\theta)^{n-x}, \] and second the case where \(p_\theta(x)\) is a simple pmf we can call the “discrete Laplace” on \(\{0,\ldots,n\}\) that decays exponentially with \(|x-n \theta|\): \[ p_\theta(x) \propto e^{-|x-n\theta|}. \] As we will see shortly, \(T(X) = X_1+X_2\) is a sufficient statistic in the binomial case, but it is not sufficient in the discrete Laplace case. To see this directly, select a value of \(n\) and \(t\), and observe how the conditional distribution of \(X\) given \(T(X)=t\) does or does not change as you adjust the value of \(\theta\).
In either case, changing \(\theta\) has a major effect on the joint probability distribution of \(X = (X_1,X_2)\); that is precisely why observing \(X\) is informative about \(\theta\). But in the binomial case, having already observed \(T(X)\), \(\theta\) has no effect on the conditional distribution of \(X\); that is why observing \(X\) gives us no additional information about \(\theta\), once we’ve already seen \(T(X)\). In the discrete Laplace case, \(T(X)\) is not sufficient: if \(\theta = 0.3\), \(X = (1,2)\) is much more likely than \(X = (0,3)\) given \(T(X) = 3\), but the relative likelihood of these two outcomes changes as \(\theta\) varies, so the conditional distribution of \(X\) given \(T(X)\) still depends on \(\theta\).
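To see this concretely, take the applet’s default settings \(n = 4\) and \(t = 3\) (the normalizing constant of \(p_\theta\) is the same for both points and cancels in the ratio): \[ \frac{\PP_\theta\big(X = (1,2)\big)}{\PP_\theta\big(X = (0,3)\big)} = \frac{e^{-|1 - 4\theta|}\, e^{-|2 - 4\theta|}}{e^{-|0 - 4\theta|}\, e^{-|3 - 4\theta|}} = \begin{cases} e^{2} \approx 7.4 & \text{if } \theta = 0.3,\\[5pt] e^{0.4} \approx 1.5 & \text{if } \theta = 0.7. \end{cases} \] Since both points have \(T(x) = 3\), this is also the ratio of their conditional probabilities given \(T(X) = 3\), and it clearly depends on \(\theta\).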
To understand why \(T(X)\) is sufficient in the binomial case, we can do a simple calculation with the joint pmf of the random vector \(X = (X_1,X_2)\): \[ \begin{aligned} \PP_\theta(X = x) &= \binom{n}{x_1} \theta^{x_1}(1-\theta)^{n-x_1} \cdot \binom{n}{x_2} \theta^{x_2}(1-\theta)^{n-x_2}\\[7pt] &= \theta^{T(x)}(1-\theta)^{2n-T(x)} \binom{n}{x_1}\binom{n}{x_2}. \end{aligned} \]
When we calculate the conditional distribution of \(X\) given \(T(X) = t\) by Bayes’ rule, the factor that depends on \(\theta\) drops out as before: \[ \begin{aligned} \PP_\theta(X = x \mid T(X) = t) &= \frac{\theta^{t}(1-\theta)^{2n-t} \binom{n}{x_1}\binom{n}{x_2}1\{x_1+x_2=t\} }{\sum_{k=0}^t \theta^t(1-\theta)^{2n-t} \binom{n}{k}\binom{n}{t-k}}\\[7pt] &= \frac{\binom{n}{x_1}\binom{n}{x_2}1\{x_1+x_2=t\}}{\sum_{k=0}^t \binom{n}{k}\binom{n}{t-k}}, \end{aligned} \] giving a conditional distribution with no dependence on \(\theta\). Applying the combinatorial identity \(\sum_{k=0}^t \binom{n}{k}\binom{n}{t-k} = \binom{2n}{t}\), we obtain a hypergeometric distribution for \(X_1\) (or for \(X_2\)): \[ \PP_\theta(X_1 = x_1 \mid T(X) = t) = \frac{\binom{n}{x_1}\binom{n}{t-x_1}}{\binom{2n}{t}} = \text{Hypergeom}(2n,n,t). \]
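For instance, with \(n = 2\) and \(t = 2\) (a small case chosen only for illustration), the three possible values \(x_1 \in \{0,1,2\}\) receive conditional probabilities \[ \PP_\theta(X_1 = x_1 \mid T(X) = 2) = \frac{\binom{2}{x_1}\binom{2}{2-x_1}}{\binom{4}{2}} = \frac{1}{6},\ \frac{4}{6},\ \frac{1}{6} \quad \text{for } x_1 = 0, 1, 2, \] regardless of \(\theta\): the common factor \(\theta^2(1-\theta)^2\) cancels between numerator and denominator.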
The next section generalizes this calculation to any density that can be factorized in a similar way, giving us an easy way to recognize sufficient statistics just by inspecting the joint density.
Factorization theorem
We didn’t really need to go to the trouble of calculating the conditional distribution in the previous examples. The easiest way to verify that a statistic is sufficient is to show that the density \(p_\theta\) factorizes into a part that involves only \(\theta\) and \(T(x)\), and a part that involves only \(x\).
Factorization Theorem: Let \(\cP\) be a model having densities \(p_\theta(x)\) with respect to a common dominating measure \(\mu\). Then \(T(X)\) is sufficient for \(\cP\) if and only if there exist non-negative functions \(g_\theta\) and \(h\) for which
\[ p_\theta(x) = g_\theta(T(x)) h(x), \]
for almost every \(x\) under \(\mu\).
The “almost every \(x\)” qualification means that
\[ \mu\left(\{x:\; p_\theta(x) \neq g_\theta(T(x))h(x)\}\right) = 0. \]
It is needed to rule out counterexamples where we tamper with the densities on a set of points to which the base measure assigns no mass, which would let us destroy the factorization structure without changing any of the distributions.
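For instance, in the Bernoulli example above we could have skipped the conditional calculation entirely: the joint pmf \(p_\theta(x) = \theta^{\sum_i x_i}(1-\theta)^{n-\sum_i x_i}\) already has the form required by the theorem, with \[ g_\theta(t) = \theta^{t}(1-\theta)^{n-t}, \qquad h(x) = 1, \] so \(T(X) = \sum_i X_i\) is sufficient by the factorization theorem alone.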
Proof (discrete \(\cX\)): The proof is easiest in the discrete case, so that we don’t have to deal with conditioning on measure-zero events and worry about things like Jacobians for change of variables.
We’ll assume without loss of generality that \(\mu\) is the counting measure: if \(\mu\) were some other measure, it would have to have a density \(m\) with respect to the counting measure and we would just have to carry around \(m(x)\) in all of our expressions.
First, assume that there exists a factorization \(p_\theta(x) = g_\theta(T(x)) h(x)\). Then we have
\[ \begin{aligned} \PP_\theta(X = x \mid T(X) = t) &= \frac{\PP_\theta(X = x, T(X) = t)}{\PP_\theta(T(X) = t)}\\[7pt] &= \frac{g_\theta(t) h(x) 1\{T(x) = t\}}{g_\theta(t)\displaystyle\sum_{z:\;T(z) = t} h(z)}\\[7pt] &= \frac{h(x) 1\{T(x) = t\}}{\displaystyle\sum_{z:\;T(z) = t} h(z)}, \end{aligned} \]
which we see does not depend on \(\theta\).
Next consider the opposite direction. If \(T(X)\) is sufficient, then we can construct a factorization by writing
\[ \begin{aligned} g_\theta(t) &= \PP_\theta(T(X)=t)\\ h(x) &= \PP(X = x \mid T(X) = T(x)), \end{aligned} \] noting that the conditional probability in the definition of \(h(x)\) does not depend on \(\theta\) by sufficiency. Then we have
\[ \PP_\theta(X = x) = \PP_\theta(T(X) = T(x)) \;\cdot\;\PP_\theta(X = x \mid T(X) = T(x)) = g_\theta(T(x)) h(x), \] so \(g_\theta(T(x))h(x)\) is indeed the pmf \(p_\theta(x)\).
Statement for general \(\mathcal{X}\)
Examples
Some initial examples:
Example (Normal location family): Assume we observe an i.i.d. sample from a normal distribution with unit variance and unknown mean:
\[ X_1,\ldots,X_n \simiid N(\theta,1) = \frac{1}{\sqrt{2\pi}} e^{-(x-\theta)^2/2} = \frac{1}{\sqrt{2\pi}} e^{-x^2/2 + \theta x - \theta^2/2} \]
The joint density function for the full data set \(X = (X_1,\ldots,X_n)\) over \(\RR^n\) is
\[ \begin{aligned} p_\theta(x) &= (2\pi)^{-n/2} \cdot \prod_{i=1}^n e^{-x_i^2/2 + \theta x_i - \theta^2/2}\\[7pt] &= \underbrace{e^{\theta \left(\sum_i x_i\right) -n\theta^2/2}}_{g_\theta\left(\sum_i x_i\right)}\cdot \underbrace{(2\pi)^{-n/2}\prod_{i=1}^n e^{-x_i^2/2}}_{h(x)}, \end{aligned} \] which by the factorization theorem shows that \(\sum_i X_i\) is sufficient.
Example (Poisson family): Next assume we observe an i.i.d. sample from a Poisson distribution with unknown mean \(\theta\):
\[ X_1,\ldots,X_n \simiid \text{Pois}(\theta) = \frac{\theta^x e^{-\theta}}{x!}, \quad \text{ for } x = 0,1,\ldots \] The joint pmf for the full data set \(X = (X_1,\ldots,X_n)\) over \(\{0,1,\ldots\}^n\) is
\[ \begin{aligned} p_\theta(x) &= \prod_{i=1}^n \frac{\theta^{x_i}e^{-\theta}}{x_i!}\\[7pt] &= \underbrace{\theta^{\sum_i x_i}\, e^{-n\theta}}_{g_\theta\left(\sum_i x_i\right)}\cdot \underbrace{\left(\prod_{i=1}^n x_i!\right)^{-1}}_{h(x)}, \end{aligned} \] showing once again that \(T(X) = \sum_i X_i\) is sufficient. As we’ll find out in the next lecture, there is a good reason why these calculations turned out so similarly: both of these models have what is called an exponential family structure.
Example (Uniform location family): As a third example, suppose we observe an i.i.d. sample from a uniform distribution on the interval \([\theta, \theta + 1]\):
\[ X_1,\ldots, X_n \simiid U[\theta, \theta+1] = 1\{\theta \leq x \leq \theta + 1\}. \] The joint density function for the full data set is
\[ p_\theta(x) = \prod_{i=1}^n 1\{\theta \leq x_i \leq \theta + 1\} = 1\{\theta \leq \min_i x_i\}\, 1\{\max_i x_i \leq \theta + 1\}, \] so \(T(X) = (\min_i X_i, \max_i X_i)\) is sufficient.
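Here the statistic is two-dimensional, but the factorization theorem applies just as before, with \[ g_\theta(t_1, t_2) = 1\{\theta \leq t_1\}\, 1\{t_2 \leq \theta + 1\}, \qquad h(x) = 1, \] evaluated at \((t_1, t_2) = (\min_i x_i, \max_i x_i)\).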
Interpretations of sufficiency
Sufficient statistics under i.i.d. sampling
Minimal sufficiency
Consider again the example with \(X_1,\ldots,X_n \simiid N(\theta,1)\). We have shown that \(\sum_i X_i\) is sufficient. It follows that \(\overline{X} = \frac{1}{n}\sum_i X_i\) is also sufficient. But we have also shown that the vector of order statistics \(S(X) = (X_{(1)}, \ldots, X_{(n)})\) is another sufficient statistic. Finally, the full data set \(X\) is always a sufficient statistic, by definition.
While these are indeed all sufficient statistics, some of them represent more significant compressions of the data than others. We can see this from the fact that \(\sum_i X_i\) and \(\overline{X}\) are recoverable from each other, and can also be recovered from either \(S(X)\) or \(X\) itself, but \(S(X)\) cannot be recovered from \(\sum_i X_i\) or \(\overline{X}\). The full data \(X\) cannot be recovered from any of the other three, but any of the other three can be recovered from it. Among these statistics, the first two represent the greatest reduction of the data, and \(X\) represents no reduction at all, while \(S(X)\) sits in the middle.
Any statistic from which a sufficient statistic can be recovered is immediately a sufficient statistic:
Proposition: If \(T(X)\) is sufficient and \(T(X) = f(S(X))\) then \(S(X)\) is also sufficient.
Proof: By the factorization theorem we can find densities with \[ p_\theta(x) = g_\theta(T(x))h(x) = (g_\theta \circ f)(S(x)) h(x), \] showing that \(S(X)\) is sufficient as well.
We say that a sufficient statistic is minimal if it can be recovered from any other sufficient statistic. That is, \(T(X)\) is minimal sufficient if
1. \(T(X)\) is sufficient, and
2. for any other sufficient statistic \(S(X)\), we have \(T(X) = f(S(X))\) for some \(f\) (almost surely in \(\cP\)).
We would like to be able to recognize minimal sufficient statistics when we can. For a model \(\cP\) with densities \(p_\theta\), we can say that two data sets \(x,y\in \cX\) are equivalent (with respect to statistical inference in \(\cP\)) if \(p_\theta(x)/p_\theta(y)\) does not depend on \(\theta\). We can write \(x \equiv_{\cP} y\) if this is the case.
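For example, in the i.i.d. Bernoulli model from the beginning of the lecture, \[ \frac{p_\theta(x)}{p_\theta(y)} = \theta^{\sum_i x_i - \sum_i y_i}(1-\theta)^{\sum_i y_i - \sum_i x_i}, \] which is free of \(\theta\) exactly when \(\sum_i x_i = \sum_i y_i\): two binary sequences are equivalent if and only if they contain the same number of heads.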
Note that, for any sufficient statistic \(T(X)\), values that map to the same output value \(t\) must be equivalent: if \(T(x)=T(y)=t\), we have \[ \frac{p_\theta(x)}{p_\theta(y)} = \frac{\PP_\theta(X = x \text{ and } T(X) = t)}{\PP_\theta(X = y \text{ and } T(X) = t)} = \frac{\PP(X = x \mid T(X) = t)}{\PP(X = y \mid T(X) = t)}, \] which does not depend on \(\theta\) by sufficiency of \(T(X)\). Thus, for any sufficient statistic we have \(T(x) = T(y) \Rightarrow x \equiv_{\cP} y\): the function \(T\) is only allowed to collapse values that are equivalent to each other. For a minimal sufficient statistic, this implication goes both ways since \(T\) collapses the sample space as much as possible. That is,
Proposition: Assume \(\cP\) has densities \(p_\theta(x)\), and \(T(X)\) is any statistic. If we have \[x \equiv_{\cP} y \iff T(x) = T(y),\] then \(T(X)\) is minimal sufficient.
Proof (discrete \(\cX\)): First, we show \(T(X)\) is sufficient. For any \(x\) with \(T(x) = t\), we have
\[ \PP_\theta(X = x \mid T(X) = t) = \frac{p_\theta(x)}{\sum_{z: T(z) = t} p_\theta(z)} = \frac{1}{\sum_{z:\; T(z) = t} p_\theta(z)/p_\theta(x)}, \] which does not depend on \(\theta\) because all of the values \(z\) that we sum over in the denominator map to the same \(t\), and are therefore equivalent to \(x\) by assumption.
Next, assume \(S(X)\) is any other sufficient statistic. If \(S(x) = S(y) = s\), then \(x \equiv_{\cP} y\), by the argument above the theorem, and consequently \(T(x) = T(y)\) by assumption. Then we can set \(f(s) = T(x)\). For any other value \(z\) with \(S(z) = s\), we must also have \(x \equiv_{\cP} z\) so \(T(z) = T(x) = f(S(z))\). Since \(s\) was arbitrary, we have the result.
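As an illustration of the proposition, return to the normal location family, where \(p_\theta(x) = e^{\theta \sum_i x_i - n\theta^2/2}\, (2\pi)^{-n/2}\prod_i e^{-x_i^2/2}\). For any two data sets \(x, y \in \RR^n\), \[ \frac{p_\theta(x)}{p_\theta(y)} = e^{\theta\left(\sum_i x_i - \sum_i y_i\right)} \cdot \prod_i e^{-(x_i^2 - y_i^2)/2}, \] which is free of \(\theta\) if and only if \(\sum_i x_i = \sum_i y_i\). Hence \(x \equiv_{\cP} y \iff T(x) = T(y)\) for \(T(x) = \sum_i x_i\), and the proposition (whose conclusion holds for general \(\cX\), though we only proved the discrete case) tells us that \(\sum_i X_i\), and equivalently \(\overline{X}\), is minimal sufficient.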