
Statistics

Various methods needed to evaluate an experiment.

Source code in src/epstats/toolkit/statistics.py
class Statistics:
    """
    Various methods needed to evaluate an experiment.
    """

    @classmethod
    def ttest_evaluation(cls, stats: np.array, control_variant: str) -> pd.DataFrame:
        """
        Testing statistical significance of relative difference in means of treatment and control variant.

        This is inspired by the [scipy.stats.ttest_ind_from_stats](https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.ttest_ind_from_stats.html)
        method, but returns many more statistics than just the p-value and the test statistic.

        Statistics used:

        1. [Welch's t-test](https://en.wikipedia.org/wiki/Welch%27s_t-test)
        1. [Welch–Satterthwaite equation](https://en.wikipedia.org/wiki/Welch%E2%80%93Satterthwaite_equation)
        approximation of degrees of freedom.

        Arguments:
            stats: array with dimensions (metrics, variants, stats)
            control_variant: string with the name of control variant

        `stats` array values:

        0. `metric_id`
        1. `metric_name`
        1. `exp_variant_id`
        1. `count`
        1. `mean`
        1. `std`
        1. `sum_value`
        1. `confidence_level`

        Returns:
            dataframe containing statistics from the t-test

        Schema of returned dataframe:

        1. `metric_id` - metric id as in [`Experiment`][epstats.toolkit.experiment.Experiment] definition
        1. `metric_name` - metric name as in [`Experiment`][epstats.toolkit.experiment.Experiment] definition
        1. `exp_variant_id` - variant id
        1. `count` - number of exposures, value of metric denominator
        1. `mean` - `sum_value` / `count`
        1. `std` - sample standard deviation
        1. `sum_value` - value of goals, value of metric nominator
        1. `confidence_level` - current confidence level used to calculate `p_value` and `confidence_interval`
        1. `diff` - relative diff between sample means of this and control variant
        1. `test_stat` - value of test statistic of the relative difference in means
        1. `p_value` - p-value of the test statistic under current `confidence_level`
        1. `confidence_interval` - confidence interval of the `diff` under current `confidence_level`
        1. `standard_error` - standard error of the `diff`
        1. `degrees_of_freedom` - degrees of freedom of this variant mean
        """
        stats = stats.transpose(1, 2, 0)

        stat_res = []  # semiresults
        variants_count = stats.shape[0]  # number of variants

        # get only stats (not metric_id, metric_name, exp_variant_id) from the stats array as floats
        stats_values = stats[:, 3:8, :].astype(float)

        # select stats data for control variant
        for s in stats:
            if s[2][0] == control_variant:
                stats_values_control_variant = s[3:8, :].astype(float)
                break

        # control variant values
        count_cont = stats_values_control_variant[0]  # number of observations
        mean_cont = stats_values_control_variant[1]  # mean
        std_cont = stats_values_control_variant[2]  # standard deviation
        conf_level = stats_values_control_variant[4]  # confidence level

        # this for loop goes over variants and compares one variant values against control variant values for
        # all metrics at once. Similar to scipy.stats.ttest_ind_from_stats
        for i in range(variants_count):
            # treatment variant data
            s = stats_values[i]
            count_treat = s[0]  # number of observations
            mean_treat = s[1]  # mean
            std_treat = s[2]  # standard deviation
            sum_value = s[3]  # sum of observations

            # degrees of freedom
            num = (std_cont**2 / count_cont + std_treat**2 / count_treat) ** 2
            den = (std_cont**4 / (count_cont**2 * (count_cont - 1))) + (
                std_treat**4 / (count_treat**2 * (count_treat - 1))
            )

            with np.errstate(divide="ignore", invalid="ignore"):
                # We fill in zeros, when goal data are missing for some variant.
                # There could be division by zero here which is expected as we return
                # nan or inf values to the caller.
                # np.round() in case of roundoff errors, e.g. f = 9.999999998 => trunc(round(f, 5)) = 10
                f = np.trunc(np.round(num / den, 5))  # (rounded & truncated) degrees of freedom

            # t-quantile
            with warnings.catch_warnings():
                warnings.filterwarnings("ignore", category=RuntimeWarning)
                t_quantile = st.t.ppf(conf_level + (1 - conf_level) / 2, f)  # right quantile

            # relative difference and test statistics
            with np.errstate(divide="ignore", invalid="ignore"):
                # We fill in zeros, when goal data are missing for some variant.
                # There could be division by zero here which is expected as we return
                # nan or inf values to the caller.
                rel_diff = (mean_treat - mean_cont) / np.abs(mean_cont)
                # standard error for relative difference
                rel_se = (
                    np.sqrt((mean_treat * std_cont) ** 2 / (mean_cont**2 * count_cont) + (std_treat**2 / count_treat))
                    / mean_cont
                )
                test_stat = rel_diff / rel_se

                # If test_stat is inf and sum of non-zero observations is low,
                # set test_stat to 0 to prevent p-value from being 0.
                if test_stat.shape == sum_value.shape:
                    mask = np.isinf(test_stat) & (sum_value <= 10)
                    test_stat[mask] = 0.0

            # p-value
            with warnings.catch_warnings():
                warnings.filterwarnings("ignore", category=RuntimeWarning)
                pval = 2 * (1 - st.t.cdf(np.abs(test_stat), f))

            # confidence interval
            conf_int = rel_se * t_quantile

            # save results
            stat_res.append((rel_diff, test_stat, pval, conf_int, rel_se, f))

        # Tune up results
        s = np.hstack([stats, stat_res])
        s = s.transpose(2, 0, 1)  # move back to metrics, variants, stats order
        x, y, z = s.shape
        arr = s.reshape(x * y, z)

        # Output dataframe
        col = [
            "metric_id",
            "metric_name",
            "exp_variant_id",
            "count",
            "mean",
            "std",
            "sum_value",
            "confidence_level",
            "diff",
            "test_stat",
            "p_value",
            "confidence_interval",
            "standard_error",
            "degrees_of_freedom",
        ]
        r = pd.DataFrame(arr, columns=col)
        return r

    @classmethod
    def multiple_comparisons_correction(
        cls, df: pd.DataFrame, n_variants: int, metrics: int, confidence_level: float
    ) -> pd.DataFrame:
        """
        [Holm-Bonferroni correction](https://en.wikipedia.org/wiki/Holm%E2%80%93Bonferroni_method)
        for multiple comparisons problem. It is applied when we have more than two variants,
        i.e. we have one control variant and at least two treatment variants.

        It adjusts p-value and length of confidence interval - both to be more conservative.
        [Complete manual](../stats/multiple.md)

        Algorithm:

        For each metric, select (unadjusted) p-values and replace them with adjusted ones.
        Based on adjustment ratio, compute new (adjusted) confidence intervals and replace
        old (unadjusted) ones.

        Arguments:
            df: dataframe as output of [`ttest_evaluation`][epstats.toolkit.statistics.Statistics.ttest_evaluation]
            n_variants: number of variants in the experiment
            metrics: number of metrics of experiment
            confidence_level: desired confidence level at the end of the experiment, e.g. 0.95

        Returns:
            dataframe of the same format as input with adjusted p-values and confidence intervals.
        """
        alpha = 1 - confidence_level  # level of significance

        for m in range(metrics):
            # indices of rows with metric m data
            index_from = m * n_variants + 1
            index_to = (m + 1) * n_variants - 1

            # p-value adjustment
            pvals = df.loc[index_from:index_to, "p_value"].to_list()  # select old p-values
            adj_pvals = multipletests(pvals=pvals, alpha=alpha, method="holm")[1]  # compute adjusted p-values

            # confidence interval adjustment
            # we set ratio to 1 when test_stat is so big that pvals are zero, no reason to update ci
            adj_ratio = np.nan_to_num(pvals / adj_pvals, nan=1)  # adjustment ratio
            adj_alpha = adj_ratio * alpha  # adjusted level alpha

            f = df.loc[index_from:index_to, "degrees_of_freedom"].to_list()  # degrees of freedom
            se = df.loc[index_from:index_to, "standard_error"].to_list()  # standard error

            t_quantile = st.t.ppf(np.ones(n_variants - 1) - adj_alpha + adj_alpha / 2, f)  # right t-quantile
            adj_conf_int = se * t_quantile  # adjusted confidence interval

            # replace (unadjusted) p-values and confidence intervals with new adjusted ones
            df.loc[index_from:index_to, "p_value"] = adj_pvals
            df.loc[index_from:index_to, "confidence_interval"] = adj_conf_int
        return df

    @classmethod
    def obf_alpha_spending_function(cls, confidence_level: float, total_length: int, actual_day: int) -> float:
        """
        [O'Brien-Fleming alpha spending function](https://online.stat.psu.edu/stat509/lesson/9/9.6/).
        We adjust the confidence level over the course of the experiment; in this setting, the confidence
        level is a decreasing function of experiment time. See [Sequential Analysis](../stats/sequential.md) for details.

        Arguments:
            confidence_level: required confidence level at the end of the test, e.g. 0.95
            total_length: length of the test in days, e.g. 7, 14, 21
            actual_day: actual day in the experiment period, must be between 1 and `total_length`

        Returns:
            adjusted confidence level with respect to actual day of the experiment and total
            length of the experiment.
        """
        alpha = 1 - confidence_level
        t = actual_day / total_length  # t in (0, 1]
        q = st.norm.ppf(1 - alpha / 2)  # quantile of normal distribution
        alpha_adj = 2 - 2 * st.norm.cdf(q / np.sqrt(t))
        return np.round(1 - alpha_adj, decimals=4)

    @staticmethod
    def required_sample_size_per_variant(
        n_variants: int,
        minimum_effect: float,
        mean: float,
        std: float,
        std_2: Optional[float] = None,
        confidence_level: float = DEFAULT_CONFIDENCE_LEVEL,
        power: float = DEFAULT_POWER,
    ) -> Union[int, float]:
        """
        Computes the sample size required to reach the defined `confidence_level` and `power`.

        Uses the following formula:

        $$
        N = \\frac{(Z_{1-\\alpha/2} + Z_{1-\\beta})^2(s_1^2 + s_2^2)}{\\Delta^2}
        $$

        where $\\Delta = \\mathrm{MEI}\\mu_1$. When `std_2` is unknown,
        we assume equal variance $s_1^2 = s_2^2$:

        $$
        N = \\frac{(Z_{1-\\alpha/2} + Z_{1-\\beta})^2 2s_1^2}{\\Delta^2}
        $$

        For `confidence_level = 0.95` and `power = 0.8`:
        $$
        N = \\frac{7.84 * 2s_1^2}{\\Delta^2} = \\frac{15.7s_1^2}{\\Delta^2}
        $$

        The calculation uses Bonferroni correction when `n_variants > 2`. The initial
        $\\alpha$ defined by the `confidence_level` parameter is adjusted to

        $$
        \\alpha^{*} = \\alpha / m
        $$

        where $m$ is the number of treatment variants. This correction produces
        greater total sample size than Holm-Bonferroni correction because it assigns
        the most conservative $\\alpha^{*}$ to all variants.

        Arguments:
            n_variants: number of variants in the experiment
            minimum_effect: minimum (relative) effect that we find meaningful to detect
            mean: estimate of the current population mean, also known as rate in case of Bernoulli distribution
            std: estimate of the current population standard deviation
            std_2: estimate of the treatment population standard deviation
            confidence_level: confidence level of the test
            power: power of the test

        Returns:
            required sample size
        """

        if minimum_effect < 0:
            raise ValueError("minimum_effect must be greater than zero.")

        if n_variants < 2:
            raise ValueError("There must be at least two variants.")

        two_vars = 2 * (std**2) if std_2 is None else (std**2 + std_2**2)
        delta = np.float64(mean * minimum_effect)

        alpha = 1 - confidence_level
        m = n_variants - 1
        alpha = alpha / m  # Bonferroni correction
        # 7.84 for 80% power and 95% confidence, alpha / 2 for two-sided hypothesis
        confidence_and_power = (st.norm.ppf(1 - alpha / 2) + st.norm.ppf(power)) ** 2
        with np.errstate(divide="ignore", invalid="ignore"):
            samples_size_per_variant = confidence_and_power * (two_vars / delta**2)
        return np.round(samples_size_per_variant)

    @classmethod
    def required_sample_size_per_variant_bernoulli(
        cls,
        n_variants: int,
        minimum_effect: float,
        mean: float,
        confidence_level: float = DEFAULT_CONFIDENCE_LEVEL,
        power: float = DEFAULT_POWER,
        **unused_kwargs,
    ) -> Union[int, float]:
        """
        Computes the sample size required to reach the defined `confidence_level`
        and `power` when the data follow Bernoulli distribution.

        Uses `Statistics.required_sample_size_per_variant` with `std_2` defined as

        $$
        p_2 = p_1(1 + \\mathrm{MEI}) \\\\
        s_2^2 = p_2(1 - p_2) \\\\
        $$

        Arguments:
            n_variants: number of variants in the experiment
            minimum_effect: minimum (relative) effect that we find meaningful to detect
            mean: estimate of the current population mean,
                  also known as rate in case of Bernoulli distribution
            confidence_level: confidence level of the test
            power: power of the test

        Returns:
            required sample size
        """

        if mean > 1 or mean < 0:
            raise ValueError(f"mean must be between zero and one, received {mean}.")

        # if we know minimum effect, we know treatment mean and treatment variance
        # see https://github.com/bookingcom/powercalculator/blob/master/src/js/math.js#L113

        def get_std(mean):
            return np.sqrt(mean * (1 - mean))

        mean_2 = mean * (1 + minimum_effect)

        return cls.required_sample_size_per_variant(
            n_variants=n_variants,
            minimum_effect=minimum_effect,
            mean=mean,
            std=get_std(mean),
            std_2=get_std(mean_2),
            confidence_level=confidence_level,
            power=power,
        )

    @staticmethod
    def power_from_required_sample_size_per_variant(
        n_variants: int,
        sample_size_per_variant: Union[int, float],
        required_sample_size_per_variant: Union[int, float],
        required_power: float = DEFAULT_POWER,
        required_confidence_level: float = DEFAULT_CONFIDENCE_LEVEL,
    ) -> float:
        """
        Computes power based on the ratio of `sample_size_per_variant`
        and `required_sample_size_per_variant`.

        How does it work? Consider the formula for computing the sample size $N$
        for a given $\\alpha$ and $1-\\beta$:

        $$
        N = \\frac{(Z_{1-\\alpha/2} + Z_{1-\\beta})^2(s_1^2 + s_2^2)}{\\Delta^2}
        $$

        We can define the required sample size $N_r$ to reach 80% power as

        $$
        N_r = \\frac{(Z_{1-\\alpha/2} + Z_{0.8})^2(s_1^2 + s_2^2)}{\\Delta^2}
        $$

        The ratio $\\frac{N}{N_r}$ simplifies to

        $$
        \\frac{N}{N_r} = \\frac{(Z_{1-\\alpha/2} + Z_{1-\\beta})^2}{(Z_{1-\\alpha/2} + Z_{0.8})^2}
        $$

        This means that the power can be computed as

        $$
        Z_{1-\\beta} = \\sqrt\\frac{N}{N_r}(Z_{1-\\alpha/2}+Z_{0.8})-Z_{1-\\alpha/2} \\\\
        1-\\beta = \\Phi(Z_{1-\\beta})
        $$

        Arguments:
            n_variants: number of variants in the experiment
            sample_size_per_variant: number of samples in one variant
            required_sample_size_per_variant: number of samples required to reach the
                                              `required_power` using the `required_confidence_level`
            required_confidence_level: confidence level used to compute the
                                       `required_sample_size_per_variant`
            required_power: power used to compute the `required_sample_size_per_variant`

        Returns:
            power
        """

        if n_variants < 2 or required_sample_size_per_variant == 0:
            return np.nan

        required_sample_size_ratio = sample_size_per_variant / required_sample_size_per_variant
        alpha = (1 - required_confidence_level) / (n_variants - 1)

        return st.norm.cdf(
            np.sqrt(required_sample_size_ratio) * (st.norm.ppf(1 - alpha / 2) + st.norm.ppf(required_power))
            - st.norm.ppf(1 - alpha / 2)
        )

    @staticmethod
    def false_positive_risk(
        null_hypothesis_rate: float,
        power: float,
        p_value: float,
    ) -> float:
        """
        Computes false positive risk defined as:

        $$
        P(H_0|S) = \\frac{P(S|H_0)P(H_0)}{P(S)} = \\frac{\\alpha\\pi}{\\alpha\\pi + (1 - \\beta)(1 - \\pi)}
        $$

        where $S$ is a statistically significant outcome, $H_0$ is the null hypothesis, $1 - \\beta$
        is the power of a test, and $\\pi$ is the global null hypothesis rate defined as the proportion
        of all tests in an experimentation program that have not improved or degraded the primary metric.

        False positive risk $P(H_0|S)$ is not the same as the false positive rate $P(S|H_0) = \\alpha$.

        More information can be found in the paper: https://bit.ly/ABTestingIntuitionBusters.

        Arguments:
            null_hypothesis_rate: global null hypothesis rate of the experimentation program
            power: power achieved in the test
            p_value: p-value of the test

        Returns:
            false positive risk
        """

        pi = null_hypothesis_rate
        return (p_value * pi) / (p_value * pi + power * (1 - pi))

false_positive_risk(null_hypothesis_rate, power, p_value) staticmethod

Computes false positive risk defined as:

\[ P(H_0|S) = \frac{P(S|H_0)P(H_0)}{P(S)} = \frac{\alpha\pi}{\alpha\pi + (1 - \beta)(1 - \pi)} \]

where \(S\) is a statistically significant outcome, \(H_0\) is the null hypothesis, \(1 - \beta\) is the power of a test, and \(\pi\) is the global null hypothesis rate defined as the proportion of all tests in an experimentation program that have not improved or degraded the primary metric.

False positive risk \(P(H_0|S)\) is not the same as the false positive rate \(P(S|H_0) = \alpha\).

More information can be found in the paper: https://bit.ly/ABTestingIntuitionBusters.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `null_hypothesis_rate` | `float` | global null hypothesis rate of the experimentation program | required |
| `power` | `float` | power achieved in the test | required |
| `p_value` | `float` | p-value of the test | required |

Returns:

| Type | Description |
| --- | --- |
| `float` | false positive risk |

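For illustration, a minimal sketch of calling this method (the import path follows the source location above; the input values are purely illustrative):

from epstats.toolkit.statistics import Statistics

# With a 30% global null hypothesis rate, 80% power and p-value 0.05,
# the false positive risk is 0.05 * 0.3 / (0.05 * 0.3 + 0.8 * 0.7) ≈ 0.026.
fpr = Statistics.false_positive_risk(null_hypothesis_rate=0.3, power=0.8, p_value=0.05)
print(round(fpr, 3))  # 0.026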

multiple_comparisons_correction(df, n_variants, metrics, confidence_level) classmethod

Holm-Bonferroni correction for the multiple comparisons problem. It is applied when we have more than two variants, i.e. we have one control variant and at least two treatment variants.

It adjusts p-values and the length of confidence intervals, making both more conservative. See the complete manual on multiple comparisons for details.

Algorithm:

For each metric, select (unadjusted) p-values and replace them with adjusted ones. Based on adjustment ratio, compute new (adjusted) confidence intervals and replace old (unadjusted) ones.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `df` | `DataFrame` | dataframe as output of `ttest_evaluation` | required |
| `n_variants` | `int` | number of variants in the experiment | required |
| `metrics` | `int` | number of metrics of experiment | required |
| `confidence_level` | `float` | desired confidence level at the end of the experiment, e.g. 0.95 | required |

Returns:

| Type | Description |
| --- | --- |
| `DataFrame` | dataframe of the same format as input with adjusted p-values and confidence intervals. |

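The per-metric adjustment step described above can be sketched directly with statsmodels and scipy; the p-values, degrees of freedom, and standard errors below are illustrative placeholders for values taken from the ttest_evaluation output:

import numpy as np
import scipy.stats as st
from statsmodels.stats.multitest import multipletests

alpha = 0.05                            # 1 - confidence_level
pvals = np.array([0.01, 0.04, 0.20])    # unadjusted p-values, one per treatment variant
adj_pvals = multipletests(pvals=pvals, alpha=alpha, method="holm")[1]

# Widen each confidence interval by the same ratio its p-value was inflated by.
adj_ratio = np.nan_to_num(pvals / adj_pvals, nan=1)
adj_alpha = adj_ratio * alpha
f = np.array([1000.0, 1000.0, 1000.0])  # degrees of freedom from ttest_evaluation
se = np.array([0.010, 0.012, 0.015])    # standard errors from ttest_evaluation
adj_conf_int = se * st.t.ppf(1 - adj_alpha / 2, f)  # adjusted confidence intervals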

obf_alpha_spending_function(confidence_level, total_length, actual_day) classmethod

O'Brien-Fleming alpha spending function. We adjust the confidence level over the course of the experiment; in this setting, the confidence level is a decreasing function of experiment time. See Sequential Analysis for details.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `confidence_level` | `float` | required confidence level at the end of the test, e.g. 0.95 | required |
| `total_length` | `int` | length of the test in days, e.g. 7, 14, 21 | required |
| `actual_day` | `int` | actual day in the experiment period, must be between 1 and `total_length` | required |

Returns:

| Type | Description |
| --- | --- |
| `float` | adjusted confidence level with respect to the actual day of the experiment and the total length of the experiment. |

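For example, the adjusted confidence level for a 14-day test can be computed day by day (a sketch; the import path follows the source location above):

from epstats.toolkit.statistics import Statistics

# The adjusted level starts close to 1.0 on day 1 and decreases to the
# required 0.95 on the last day of the test.
for day in (1, 7, 14):
    adjusted = Statistics.obf_alpha_spending_function(confidence_level=0.95, total_length=14, actual_day=day)
    print(day, adjusted)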

power_from_required_sample_size_per_variant(n_variants, sample_size_per_variant, required_sample_size_per_variant, required_power=DEFAULT_POWER, required_confidence_level=DEFAULT_CONFIDENCE_LEVEL) staticmethod

Computes power based on the ratio of sample_size_per_variant and required_sample_size_per_variant.

How does it work? Consider the formula for computing the sample size \(N\) for a given \(\alpha\) and \(1-\beta\):

\[ N = \frac{(Z_{1-\alpha/2} + Z_{1-\beta})^2(s_1^2 + s_2^2)}{\Delta^2} \]

We can define the required sample size \(N_r\) to reach 80% power as

\[ N_r = \frac{(Z_{1-\alpha/2} + Z_{0.8})^2(s_1^2 + s_2^2)}{\Delta^2} \]

The ratio \(\frac{N}{N_r}\) simplifies to

\[ \frac{N}{N_r} = \frac{(Z_{1-\alpha/2} + Z_{1-\beta})^2}{(Z_{1-\alpha/2} + Z_{0.8})^2} \]

This means that the power can be computed as

\[ Z_{1-\beta} = \sqrt\frac{N}{N_r}(Z_{1-\alpha/2}+Z_{0.8})-Z_{1-\alpha/2} \\ 1-\beta = \Phi(Z_{1-\beta}) \]

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `n_variants` | `int` | number of variants in the experiment | required |
| `sample_size_per_variant` | `Union[int, float]` | number of samples in one variant | required |
| `required_sample_size_per_variant` | `Union[int, float]` | number of samples required to reach the `required_power` using the `required_confidence_level` | required |
| `required_confidence_level` | `float` | confidence level used to compute the `required_sample_size_per_variant` | `DEFAULT_CONFIDENCE_LEVEL` |
| `required_power` | `float` | power used to compute the `required_sample_size_per_variant` | `DEFAULT_POWER` |

Returns:

| Type | Description |
| --- | --- |
| `float` | power |

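As a worked example (a sketch, assuming the defaults correspond to a 0.95 confidence level and 0.8 power), collecting only half of the required sample size yields a power of roughly 0.51 instead of the required 0.8:

from epstats.toolkit.statistics import Statistics

power = Statistics.power_from_required_sample_size_per_variant(
    n_variants=2,
    sample_size_per_variant=50_000,
    required_sample_size_per_variant=100_000,
)
# sqrt(0.5) * (1.96 + 0.84) - 1.96 ≈ 0.02 and Phi(0.02) ≈ 0.51
print(round(power, 2))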

required_sample_size_per_variant(n_variants, minimum_effect, mean, std, std_2=None, confidence_level=DEFAULT_CONFIDENCE_LEVEL, power=DEFAULT_POWER) staticmethod

Computes the sample size required to reach the defined confidence_level and power.

Uses the following formula:

\[ N = \frac{(Z_{1-\alpha/2} + Z_{1-\beta})^2(s_1^2 + s_2^2)}{\Delta^2} \]

where \(\Delta = \mathrm{MEI}\mu_1\). When std_2 is unknown, we assume equal variance \(s_1^2 = s_2^2\):

\[ N = \frac{(Z_{1-\alpha/2} + Z_{1-\beta})^2 2s_1^2}{\Delta^2} \]

For confidence_level = 0.95 and power = 0.8:

\[ N = \frac{7.84 \cdot 2s_1^2}{\Delta^2} = \frac{15.7s_1^2}{\Delta^2} \]

The calculation uses Bonferroni correction when n_variants > 2. The initial \(\alpha\) defined by the confidence_level parameter is adjusted to

\[ \alpha^{*} = \alpha / m \]

where \(m\) is the number of treatment variants. This correction produces greater total sample size than Holm-Bonferroni correction because it assigns the most conservative \(\alpha^{*}\) to all variants.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `n_variants` | `int` | number of variants in the experiment | required |
| `minimum_effect` | `float` | minimum (relative) effect that we find meaningful to detect | required |
| `mean` | `float` | estimate of the current population mean, also known as rate in case of Bernoulli distribution | required |
| `std` | `float` | estimate of the current population standard deviation | required |
| `std_2` | `Optional[float]` | estimate of the treatment population standard deviation | `None` |
| `confidence_level` | `float` | confidence level of the test | `DEFAULT_CONFIDENCE_LEVEL` |
| `power` | `float` | power of the test | `DEFAULT_POWER` |

Returns:

| Type | Description |
| --- | --- |
| `Union[int, float]` | required sample size |

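For a concrete feel of the formula, the following sketch (assuming the defaults of a 0.95 confidence level and 0.8 power) asks for the sample size needed to detect a 5% relative change on a metric with mean 5.0 and standard deviation 10.0:

from epstats.toolkit.statistics import Statistics

n = Statistics.required_sample_size_per_variant(
    n_variants=2,
    minimum_effect=0.05,
    mean=5.0,
    std=10.0,
)
# Delta = 0.05 * 5.0 = 0.25, so N ≈ 7.84 * 2 * 10**2 / 0.25**2, i.e. roughly 25,000 per variant.
print(n)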

required_sample_size_per_variant_bernoulli(n_variants, minimum_effect, mean, confidence_level=DEFAULT_CONFIDENCE_LEVEL, power=DEFAULT_POWER, **unused_kwargs) classmethod

Computes the sample size required to reach the defined confidence_level and power when the data follow Bernoulli distribution.

Uses Statistics.required_sample_size_per_variant with std_2 defined as

\[ p_2 = p_1(1 + \mathrm{MEI}) \\ s_2^2 = p_2(1 - p_2) \\ \]

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `n_variants` | `int` | number of variants in the experiment | required |
| `minimum_effect` | `float` | minimum (relative) effect that we find meaningful to detect | required |
| `mean` | `float` | estimate of the current population mean, also known as rate in case of Bernoulli distribution | required |
| `confidence_level` | `float` | confidence level of the test | `DEFAULT_CONFIDENCE_LEVEL` |
| `power` | `float` | power of the test | `DEFAULT_POWER` |

Returns:

| Type | Description |
| --- | --- |
| `Union[int, float]` | required sample size |

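A minimal sketch for a conversion-style metric with a 10% baseline rate and a 5% relative minimum effect (again assuming the defaults of a 0.95 confidence level and 0.8 power):

from epstats.toolkit.statistics import Statistics

# p_1 = 0.10 and p_2 = 0.10 * 1.05 = 0.105; both variances are derived
# internally as p * (1 - p).
n = Statistics.required_sample_size_per_variant_bernoulli(
    n_variants=2,
    minimum_effect=0.05,
    mean=0.10,
)
print(n)  # required exposures per variant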

ttest_evaluation(stats, control_variant) classmethod

Testing statistical significance of relative difference in means of treatment and control variant.

This is inspired by the scipy.stats.ttest_ind_from_stats method, but returns many more statistics than just the p-value and the test statistic.

Statistics used:

  1. Welch's t-test
  2. Welch–Satterthwaite equation approximation of degrees of freedom.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `stats` | `array` | array with dimensions (metrics, variants, stats) | required |
| `control_variant` | `str` | string with the name of control variant | required |

stats array values:

  1. metric_id
  2. metric_name
  3. exp_variant_id
  4. count
  5. mean
  6. std
  7. sum_value
  8. confidence_level

Returns:

| Type | Description |
| --- | --- |
| `DataFrame` | dataframe containing statistics from the t-test |

Schema of returned dataframe:

  1. metric_id - metric id as in Experiment definition
  2. metric_name - metric name as in Experiment definition
  3. exp_variant_id - variant id
  4. count - number of exposures, value of metric denominator
  5. mean - sum_value / count
  6. std - sample standard deviation
  7. sum_value - value of goals, value of metric nominator
  8. confidence_level - current confidence level used to calculate p_value and confidence_interval
  9. diff - relative diff between sample means of this and control variant
  10. test_stat - value of test statistic of the relative difference in means
  11. p_value - p-value of the test statistic under current confidence_level
  12. confidence_interval - confidence interval of the diff under current confidence_level
  13. standard_error - standard error of the diff
  14. degrees_of_freedom - degrees of freedom of this variant mean
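
To illustrate the expected input layout, here is a minimal sketch with one metric and two variants (all metric values are illustrative):

import numpy as np
from epstats.toolkit.statistics import Statistics

# Per metric and variant, the stats axis holds:
# [metric_id, metric_name, exp_variant_id, count, mean, std, sum_value, confidence_level]
stats = np.array(
    [
        [
            [1, "Click-through rate", "a", 10_000, 0.100, 0.300, 1_000, 0.95],
            [1, "Click-through rate", "b", 10_000, 0.108, 0.310, 1_080, 0.95],
        ]
    ],
    dtype=object,
)

result = Statistics.ttest_evaluation(stats, control_variant="a")
print(result[["exp_variant_id", "diff", "p_value", "confidence_interval"]])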