# Subjective Bayesian T-Test Code

###### ########################################################

# NOTE(review): rm(list = ls()) deletes every object in the workspace the
# script is run from. It is kept so the script still starts from a clean
# workspace; run the script in a fresh R session if that is not wanted.
rm(list = ls())

##############################################################
# Bayes-Factor Calculations for T-tests
##############################################################

# Start of Settings

### Give a title for results output
Results.Title <- "Normal(x,0,.5) N = 100 BS-Design, Obs.ES = 0"

### Criterion for Inference in Favor of H0, BF (H1/H0)
BF.crit.H0 <- 1 / 3

### Criterion for Inference in Favor of H1
### Leave z.crit.H1 at Inf to derive the z criterion from the Bayes-Factor
### criterion BF.crit.H1, BF (H1/H0); a finite z.crit.H1 is used as given.
BF.crit.H1 <- 3
z.crit.H1 <- Inf

### Set Number of Groups
gr <- 2

### Set Total Sample size
N <- 100

### Set observed effect size
### for between-subject designs and one sample designs this is Cohen's d
### for within-subject designs this is dz
obs.es <- 0

### Set the mode of the alternative hypothesis
alt.mode <- 0

### Set the variability of the alternative hypothesis
### (used as the scale of the Cauchy or the standard deviation of the Normal)
alt.var <- .5

### Set the shape of the distribution of population effect sizes
alt.dist <- 2  # 1 = Cauchy; 2 = Normal

### Set the lower bound of population effect sizes
### Set to zero if there is zero probability to observe effects with the
### opposite sign
low <- -3

### Set the upper bound of population effect sizes
### For example, set to 1, if you think effect sizes greater than 1 SD are
### unlikely
high <- 3

### set the precision of density estimation (bigger takes longer)
### the grid of population effect sizes uses a step of 1/precision
precision <- 100

### set the graphic resolution (higher resolution takes longer)
### the grid of observed effect sizes uses a step of 1/graphic.resolution
graphic.resolution <- 20

### set limit for non-central t-values
### population effect sizes with |effect size / sampling error| above this
### limit are dropped
nct.limit <- 100

################################
# End of Settings
################################

# Degrees of freedom: total sample size minus the number of groups
df <- N - gr

# Grid of population effect sizes from low to high in steps of 1/precision
pop.es <- seq(low, high, by = 1 / precision)

# Sampling error of the effect size estimate
se <- gr / sqrt(N)

# Drop population effect sizes whose non-centrality parameter (pop.es / se)
# lies outside [-nct.limit, nct.limit]
pop.es <- pop.es[pop.es / se >= -nct.limit & pop.es / se <= nct.limit]

# Prior weights for a grid of population effect sizes.
#
# Arguments:
#   pop.es       numeric vector of population effect sizes
#   alt.dist     shape of the alternative hypothesis: 1 = Cauchy, 2 = Normal
#   p            precision setting; accepted for backward compatibility but
#                not used
#   prior.mode   location of the prior; defaults to the global alt.mode
#   prior.scale  scale of the Cauchy / standard deviation of the Normal;
#                defaults to the global alt.var
#
# Returns a numeric vector of densities, one per element of pop.es. The
# densities are returned as-is: no rescaling by precision or by their sum is
# applied here.
#
# Stops with an error if alt.dist is not 1 or 2.
get.weights <- function(pop.es, alt.dist, p,
                        prior.mode = alt.mode, prior.scale = alt.var) {
  if (length(alt.dist) != 1 || is.na(alt.dist) || !(alt.dist %in% c(1, 2))) {
    stop("alt.dist must be 1 (Cauchy) or 2 (Normal)", call. = FALSE)
  }
  if (alt.dist == 1) {
    dcauchy(pop.es, prior.mode, prior.scale)
  } else {
    dnorm(pop.es, prior.mode, prior.scale)
  }
}

# Prior weight for every population effect size on the grid
weights <- get.weights(pop.es, alt.dist, precision)

# Plot the alternative hypothesis: prior density over population effect sizes
Title <- "Alternative Hypothesis"
ymax <- max(max(weights) * 1.2, 1)  # y-axis reaches at least 1
plot(pop.es, weights, type = "l", ylim = c(0, ymax),
     xlab = "Population Effect Size", ylab = "Density",
     main = Title, col = "blue", lwd = 3)
abline(v = 0, col = "red")  # vertical reference line at an effect size of 0

# Grid of observed effect sizes (step 1/graphic.resolution) on which the
# prediction distributions are evaluated
obs <- seq(low, high, by = 1 / graphic.resolution)

# Prediction distribution under H1: for each observed effect size, the
# non-central t densities over all population effect sizes are weighted by
# the prior, summed, and divided by precision
H1.dist <- vapply(
  obs,
  function(d) sum(weights * dt(d / se, df, pop.es / se)) / precision,
  numeric(1)
)

# Prediction distribution under H0: t density with non-centrality 0
H0.dist <- dt(obs / se, df, 0)

# Bayes-Factors BF(H1/H0) along the grid of observed effect sizes
BFs <- H1.dist / H0.dist

# z-scores (strength of evidence against H0) that correspond to the t-values;
# the probabilities stay on the log scale between pt() and qnorm()
z <- qnorm(pt(obs / se, df, log.p = TRUE), log.p = TRUE)

# Compute H1 error rate
# Only grid points with z >= 0 may define a criterion: Bayes-Factors at
# negative z are set to Inf so they are never the closest match below.
BFpos <- BFs
BFpos[z < 0] <- Inf

# With z.crit.H1 left at Inf, the criterion becomes the |z| whose Bayes-Factor
# is closest to BF.crit.H1. which.min() returns exactly one index (the first
# on ties, skipping NA/NaN), so the criterion is always a scalar.
if (z.crit.H1 == Inf) {
  z.crit.H1 <- abs(z[which.min(abs(BFpos - BF.crit.H1))])
}

# z-scores that correspond to the non-centrality parameters of the prior grid
ncz <- qnorm(pt(pop.es / se, df, log.p = TRUE), log.p = TRUE)

# pnorm(|ncz|, mean = z.crit.H1) is evaluated for each population effect size
# and averaged with the prior weights
weighted.power <- sum(pnorm(abs(ncz), z.crit.H1) * weights) / sum(weights)
H1.error <- 1 - weighted.power

# Compute H0 error rate
# Criterion: |z| whose Bayes-Factor is closest to BF.crit.H0
z.crit.H0 <- abs(z[which.min(abs(BFpos - BF.crit.H0))])
# Two-tailed standard-normal probability beyond the criterion
H0.error <- (1 - pnorm(z.crit.H0)) * 2

# Density of the observed result assuming H0.
# The t density is evaluated at the t-value obs.es / se, the same scale used
# for the H1 density below and for H0.dist; evaluating it at the raw effect
# size would make the two densities incomparable whenever obs.es is not 0.
Density.Obs.H0 <- dt(obs.es / se, df, 0)

# Density of the observed result assuming H1: non-central t densities over
# all population effect sizes, weighted by the prior and divided by precision
Density.Obs.H1 <- sum(dt(obs.es / se, df, pop.es / se) * weights) / precision

# Bayes-Factor BF(H1/H0) for the observed effect size
BF.obs.es <- Density.Obs.H1 / Density.Obs.H0

# z-score for the observed effect size
obs.z <- qnorm(pt(obs.es / se, df, log.p = TRUE), log.p = TRUE)

# Show results: prediction distributions of H0 (black) and H1 (blue) plotted
# against z, with the observed result, the criteria, and a text summary
ymax <- max(H0.dist, H1.dist) * 1.3
plot(z, H0.dist, type = "l", ylim = c(0, ymax),
     xlab = "Strength of Evidence (z-value)", ylab = "Density",
     main = Results.Title, col = "black", lwd = 2)
# Add the H1 curve to the same axes
lines(z, H1.dist, col = "blue", lwd = 2)

# Observed z-score (dashed green) and the criteria for H1 (blue) and H0 (red)
abline(v = obs.z, lty = 2, lwd = 2, col = "darkgreen")
abline(v = -z.crit.H1, col = "blue", lty = 3)
abline(v = z.crit.H1, col = "blue", lty = 3)
abline(v = -z.crit.H0, col = "red", lty = 3)
abline(v = z.crit.H0, col = "red", lty = 3)

# Densities of the observed result under H0 and H1
points(c(obs.z, obs.z), c(Density.Obs.H0, Density.Obs.H1), pch = 19)

# Text summary, left-aligned at the smallest z on the x-axis
res <- paste0("BF(H1/H0): ", format(round(BF.obs.es, 3), nsmall = 3))
text(min(z), ymax * .95, pos = 4, res)
res <- paste0("BF(H0/H1): ", format(round(1 / BF.obs.es, 3), nsmall = 3))
text(min(z), ymax * .90, pos = 4, res)
res <- paste0("H1 Error Rate: ", format(round(H1.error, 3), nsmall = 3))
text(min(z), ymax * .80, pos = 4, res)
res <- paste0("H0 Error Rate: ", format(round(H0.error, 3), nsmall = 3))
text(min(z), ymax * .75, pos = 4, res)

######################################################
### END OF Subjective Bayesian T-Test CODE
######################################################
### Thank you to Jeff Rouder for posting his code that got me started.
### http://jeffrouder.blogspot.ca/2016/01/what-priors-should-i-use-part-i.html

# Wagenmakers’ Default Prior is Inconsistent with the Observed Results in Psychological Research

Bayesian statistics is like all other statistics. A bunch of numbers are entered into a formula and the end result is another number.  The meaning of the number depends on the meaning of the numbers that enter the formula and the formulas that are used to transform them.

The input for a Bayesian inference is no different than the input for other statistical tests.  The input is information about an observed effect size and sampling error. The observed effect size is a function of the unknown population effect size and the unknown bias introduced by sampling error in a particular study.

Based on this information, frequentists compute p-values and some Bayesians compute a Bayes-Factor. The Bayes-Factor expresses how compatible an observed test statistic (e.g., a t-value) is with each of two hypotheses. Typically, the observed t-value is compared to a distribution of t-values under the assumption that H0 is true (the population effect size is 0 and t-values are expected to follow a t-distribution centered over 0) and to a distribution of t-values under an alternative hypothesis. The alternative hypothesis assumes that the effect size is in a range from -infinity to infinity, which of course is true. To make this a workable alternative hypothesis, H1 assigns weights to these effect sizes. Effect sizes with bigger weights are assumed to be more likely than effect sizes with smaller weights. A weight of 0 would mean a priori that these effects cannot occur.

As Bayes-Factors depend on the weights attached to effect sizes, it is also important to realize that the support for H0 depends on the probability that the prior distribution was a reasonable distribution of probable effect sizes. It is always possible to get a Bayes-Factor that supports H0 with an unreasonable prior.  For example, an alternative hypothesis that assumes that an effect size is at least two standard deviations away from 0 will not be favored by data with an effect size of d = .5, and the BF will correctly favor H0 over this improbable alternative hypothesis.  This finding would not imply that the null-hypothesis is true. It only shows that the null-hypothesis is more compatible with the observed result than the alternative hypothesis. Thus, it is always necessary to specify and consider the nature of the alternative hypothesis to interpret Bayes-Factors.

Although the a priori probabilities of  H0 and H1 are both unknown, it is possible to test the plausibility of priors against actual data.  The reason is that observed effect sizes provide information about the plausible range of effect sizes. If most observed effect sizes are less than 1 standard deviation, it is not possible that most population effect sizes are greater than 1 standard deviation.  The reason is that sampling error is random and will lead to overestimation and underestimation of population effect sizes. Thus, if there were many population effect sizes greater than 1, one would also see many observed effect sizes greater than 1.

To my knowledge, proponents of Bayes-Factors have not attempted to validate their priors against actual data. This is especially problematic when priors are presented as defaults that require no further justification for a specification of H1.

In this post, I focus on Wagenmakers’ prior because Wagenmakers has been a prominent advocate of Bayes-Factors as an alternative approach to conventional null-hypothesis-significance testing.  Wagenmakers’ prior is a Cauchy distribution with a scaling factor of 1.  This scaling factor implies a 50% probability that absolute effect sizes are larger than 1 standard deviation.  This prior was used to argue that Bem’s (2011) evidence for PSI was weak. It has also been used in many other articles to suggest that the data favor the null-hypothesis.  These articles fail to point out that the interpretation of Bayes-Factors in favor of H0 is only valid for Wagenmakers’ prior. A different prior could have produced different conclusions.  Thus, it is necessary to examine whether Wagenmakers’ prior is a plausible prior for psychological science.

Wagenmakers’ Prior and Replicability

A prior distribution of effect sizes makes assumptions about population effect sizes. In combination with information about sample size, it is possible to compute non-centrality parameters, which are equivalent to the population effect size divided by sampling error.  For each non-centrality parameter it is possible to estimate power as the area under the curve of the non-central t-distribution on the right side of the criterion value that corresponds to alpha, typically .05 (two-tailed).   The assumed typical power is simply the weighted average of the power values for each non-centrality parameter.

Replicability is not identical to power for a set of studies with heterogeneous non-centrality parameters because studies with higher power are more likely to become significant. Thus, the set of studies that achieved significance has higher average power than the original set of studies.

Aside from power, the distribution of observed test statistics is also informative. Unlike power, which is bounded at 1, the distribution of test statistics is unbounded. Thus, unreasonable assumptions about the distribution of effect sizes are visible in a distribution of test statistics that does not match distributions of test statistics in actual studies.  One problem is that test statistics are not directly comparable for different sample sizes or statistical tests because non-central distributions vary as a function of degrees of freedom and the test being used (e.g., chi-square vs. t-test).  To solve this problem, it is possible to convert all test statistics into z-scores so that they are on a common metric.  In a heterogeneous set of studies, the sign of the effect provides no useful information because signs only have to be consistent in tests of the same population effect size. As a result, it is necessary to use absolute z-scores. These absolute z-scores can be interpreted as the strength of evidence against the null-hypothesis.

I used a sample size of N = 80 and assumed a between subject design. In this case, sampling error is defined as 2/sqrt(80) = .224.  A sample size of N = 80 is the median sample size in Psychological Science. It is also the total sample size that would be obtained in a 2 x 2 ANOVA with n = 20 per cell.  Power and replicability estimates would increase for within-subject designs and for studies with larger N. Between subject designs with smaller N would yield lower estimates.

I simulated effect sizes in the range from 0 to 4 standard deviations.  Effect sizes of 4 or larger are extremely rare. Excluding these extreme values means that power estimates underestimate power slightly, but the effect is negligible because Wagenmakers’ prior assigns low probabilities (weights) to these effect sizes.

For each possible effect size in the range from 0 to 4 (using a resolution of d = .001)  I computed the non-centrality parameter as d/se.  With N = 80, these non-centrality parameters define a non-central t-distribution with 78 degrees of freedom.

I computed the implied power to achieve a significant result with alpha = .05 (two-tailed) with the formula

power = pt(ncp,N-2,qt(1-.025,N-2))

The formula returns the area under the curve on the right side of the criterion value that corresponds to a two-tailed test with p = .05.

The mean of these power values is the average power of studies if all effect sizes were equally likely.  The value is 89%. This implies that in the long run, a random sample of studies drawn from this population of effect sizes is expected to produce 89% significant results.

However, Wagenmakers’ prior assumes that smaller effect sizes are more likely than larger effect sizes. Thus, it is necessary to compute the weighted average of power using Wagenmakers’ prior distribution as weights.  The weights were obtained using the density of a Cauchy distribution with a scaling factor of 1 for each effect size.

wagenmakers.weights = dcauchy(es,0,1)

The weighted average power was computed as the sum of the weighted power estimates divided by the sum of weights.  The weighted average power is 69%.  This estimate implies that Wagenmakers’ prior assumes that 69% of statistical tests produce a significant result, when the null-hypothesis is false.

Replicability is always higher than power because the subset of studies that produce a significant result has higher average power than the full set of studies. Replicability for a set of studies with heterogeneous power is the sum of the squared power of individual studies divided by the sum of power.

Replicability = sum(power^2) / sum(power)

The unweighted estimate of replicability is 96%.   To obtain the replicability for Wagenmakers’ prior, the same weighting scheme as for power can be used for replicability.

Wagenmakers.Replicability = sum(weights * power^2) / sum(weights*power)

The formula shows that Wagenmakers’ prior implies a replicability of 89%.  We see that the weighting scheme has relatively little effect on the estimate of replicability because many of the studies with small effect sizes are expected to produce a non-significant result, whereas the large effect sizes often have power close to 1, which implies that they will be significant in the original study and the replication study.

The success rate of replication studies is difficult to estimate. Cohen estimated that typical studies in psychology have 50% power to detect a medium effect size, d = .5.  This would imply that the actual success rate would be lower because in an unknown percentage of studies the null-hypothesis is true.  However, replicability would be higher because studies with higher power are more likely to be significant.  Given this uncertainty, I used a scenario with 50% replicability.  That is an unbiased sample of studies taken from psychological journals would produce 50% successful replications in an exact replication study of the original studies.  The following computations show the implications of a 50% success rate in replication studies for the proportion of hypothesis tests where the null hypothesis is true, p(H0).

The percentage of true null-hypothesis is a function of the success rate in replication study, weighted average power, and weighted replicability.

p(H0) = (weighted.average.power * (weighted.replicability – success.rate)) / (success.rate*.05 – success.rate*weighted.average.power – .05^2 + weighted.average.power*weighted.replicability)

To produce a success rate of 50% in replication studies with Wagenmakers’ prior when H1 is true (89% replicability), the percentage of true null-hypothesis has to be 92%.

The high percentage of true null-hypotheses (92%) also has implications for the implied false-positive rate (i.e., the percentage of significant results for which the null-hypothesis is true).

False Positive Rate =  (p(H0) * .05)  / (p(H0) * .05 +
(1 - p(H0)) * Weighted.Average.Power)
For every 100 studies, there are 92 true null-hypotheses that produce 92*.05 = 4.6 false positive results. For the remaining 8 studies with a true effect, there are 8 * .67 = 5.4 true discoveries.  The false positive rate is 4.6 / (4.6 + 5.4) = 46%.  This means Wagenmakers’ prior assumes that a success rate of 50% in replication studies implies that nearly 50% of studies that replicate successfully are false-positive results that would not replicate in future replication studies.

Aside from these analytically derived predictions about power and replicability, Wagenmakers’ prior also makes predictions about the distribution of observed evidence in individual studies. As observed scores are influenced by sampling error, I used simulations to illustrate the effect of Wagenmakers’ prior on observed test statistics.

For the simulation I converted the non-central t-values into non-central z-scores and simulated sampling error with a standard normal distribution.  The simulation included 92% true null-hypotheses and 8% true H1 based on Wagenmaker’s prior.  As published results suffer from publication bias, I simulated publication bias by selecting only observed absolute z-scores greater than 1.96, which corresponds to the p < .05 (two-tailed) significance criterion.  The simulated data were submitted to a powergraph analysis that estimates power and replicability based on the distribution of absolute z-scores.

Figure 1 shows the results.   First, the estimation method slightly underestimated the actual replicability of 50% by 2 percentage points.  Despite this slight estimation error, the Figure accurately illustrates the implications of Wagenmakers’ prior for observed distributions of absolute z-scores.  The density function shows a steep decrease in the range of z-scores between 2 and 3, and a gentle slope for z-scores greater than 4 to 10 (values greater than 10 are not shown).

Powergraphs provide some information about the composition of the total density by dividing the total density into densities for power less than 20%, 20-50%, 50% to 85% and more than 85%. The red line (power < 20%) mostly determines the shape of the total density function for z-scores from 2 to 2.5, and most the remaining density is due to studies with more than 85% power starting with z-scores around 4.   Studies with power in the range between 20% and 85% contribute very little to the total density. Thus, the plot correctly reveals that Wagenmakers’ prior assumes that the roughly 50% average replicability is mostly due to studies with very low power (< 20%) and studies with very high power (> 85%).

Validation Study 1: Michael Nuijten’s Statcheck Data

There are a number of datasets that can be used to evaluate Wagenmakers’ prior. The first dataset is based on an automatic extraction of test statistics from psychological journals. I used Michael Nuijten’s dataset to ensure that I did not cherry-pick data and to allow other researchers to reproduce the results.

The main problem with automatically extracted test statistics is that the dataset does not distinguish between  theoretically important test statistics and other statistics, such as significance tests of manipulation checks.  It is also not possible to distinguish between between-subject and within-subject designs.  As a result, replicability estimates for this dataset will be higher than the simulation based on a between-subject design.

Figure 2 shows all of the data, but only significant z-scores (z > 1.96) are used to estimate replicability and power. The most striking difference between Figure 1 and Figure 2 is the shape of the total density on the right side of the significance criterion.  In Figure 2 the slope is shallower. The difference is visible in the decomposition of the total density into densities for different power bands.  In Figure 1 most of the total density was accounted for by studies with less than 20% power and studies with more than 85% power.  In Figure 2, studies with power in the range between 20% and 85% account for the majority of studies with z-scores greater than 2.5 up to z-scores of 4.5.

The difference between Figure 1 and Figure 2 has direct implications for the interpretation of Bayes-Factors with t-values that correspond to z-scores in the range of just significant results. Given Wagenmakers’ prior, z-scores in this range mostly represent false-positive results. However, the real dataset suggests that some of these z-scores are the result of underpowered studies and publication bias. That is, in these studies the null-hypothesis is false, but the significant result will not replicate because these studies have low power.

Validation Study 2:  Open Science Collective Articles (Original Results)

The second dataset is based on the Open Science Collective (OSC) replication project.  The project aimed to replicate studies published in three major psychology journals in the year 2008.  The final number of articles that were selected for replication was 99. The project replicated one study per article, but articles often contained multiple studies.  I computed absolute z-scores for theoretically important tests from all studies of these 99 articles.  This analysis produced 294 test statistics that could be converted into absolute z-scores.

Figure 3 shows clear evidence of publication bias.  No sampling distribution can produce the steep increase in tests around the critical value for significance. This selection is not an artifact of my extraction, but an actual feature of published results in psychological journals (Sterling, 1959).

Given the small number of studies, the figure also contains bootstrapped 95% confidence intervals.  The 95% CI for the power estimate shows that the sample is too small to estimate power for all studies, including studies in the proverbial file drawer, based on the subset of studies that were published. However, the replicability estimate of 49% has a reasonably tight confidence interval ranging from 45% to 66%.

The shape of the density distribution in Figure 3 differs from the distribution in Figure 2 in two ways. Initially the slope is steeper in Figure 3, and there is less density in the tail with high z-scores.  Both aspects contribute to the lower estimate of replicability in Figure 3, suggesting that replicability of focal hypothesis tests is lower than replicability for all statistical tests.

Comparing Figure 3 and Figure 1 shows again that the powergraph based on Wagenmakers’ prior differs from the powergraph for real data. In this case, the discrepancy is even more notable because focal hypothesis tests rarely produce large z-scores (z > 6).

Validation Study 3:  Open Science Collective Articles (Replication Results)

At present, the only data that are somewhat representative of psychological research (at least of social and cognitive psychology) and that do not suffer from publication bias are the results from the replication studies of the OSC replication project.  Out of 97 studies that produced a significant result in the original studies, 36 (37%) produced a significant result in the replication study.  After eliminating some replication studies (e.g., sample of replication study was considerably smaller), 88 studies remained.

Figure 4 shows the powergraph for the 88 studies. As there is no publication bias, estimates of power and replicability are based on non-significant and significant results.  Although the sample size is smaller, the estimate of power has a reasonably narrow confidence interval because the estimate includes non-significant results. Estimated power is only 31%. The 95% confidence interval includes the actual success rate of 40%, which shows that there is no evidence of publication bias.

A visual comparison of Figure 1 and Figure 4 shows again that real data diverge from the predicted pattern by Wagenmakers’ prior.  Real data show a greater contribution of power in the range between 20% and 85% to the total density, and large z-scores (z > 6) are relatively rare in real data.

Conclusion

Statisticians have noted that it is good practice to examine the assumptions underlying statistical tests. This blog post critically examines the assumptions underlying the use of Bayes-Factors with Wagenmakers’ prior.  The main finding is that Wagenmakers’ prior makes unreasonable assumptions about power, replicability, and the distribution of observed test-statistics with or without publication bias. The main problem with Wagenmakers’ prior is that it predicts too many statistical results with strong evidence against the null-hypothesis (z > 5, or the 5 sigma rule in physics).  To achieve reasonable predictions for success rates without publication bias (~50%), Wagenmakers’ prior has to assume that over 90% of statistical tests conducted in psychology test false hypotheses (i.e., predict an effect when H0 is true), and that the false-positive rate is close to 50%.

Implications

Bayesian statisticians have pointed out for a long time that the choice of a prior influences Bayes-Factors (Kass, 1993, p. 554).  It is therefore useful to carefully examine priors to assess the effect of priors on Bayesian inferences. Unreasonable priors will lead to unreasonable inferences.  This is also true for Wagenmakers’ prior.

The problem of using Bayes-Factors with Wagenmakers’ prior to test the null-hypothesis is apparent in a realistic scenario that assumes a moderate population effect size of d = .5 and a sample size of N = 80 in a between subject design. This study has a non-central t of 2.24 and 60% power to produce a significant result with p < .05, two-tailed.   I used R to simulate 10,000 test-statistics using the non-central t-distribution and then computed Bayes-Factors with Wagenmakers’ prior.

Figure 5 shows a histogram of log(BF). The log is being used because BF are ratios and have very skewed distributions.  The histogram shows that BF never favor the null-hypothesis with a BF of 10 in favor of H0 (1/10 in the histogram).  The reason is that even with Wagenmakers’ prior a sample size of N = 80 is too small to provide strong support for the null-hypothesis.  However, 21% of observed test statistics produce a Bayes-Factor less than 1/3, which is sometimes used as sufficient evidence to claim that the data support the null-hypothesis.  This means that the test has a 21% error rate to provide evidence for the null-hypothesis when the null-hypothesis is false.  A 21% error rate is 4 times larger than the 5% error rate in null-hypothesis significance testing. It is not clear why researchers should replace a statistical method with a 5% error rate for a false discovery of an effect with a 20% error rate of false discoveries of null effects.

Another 48% of the results produce Bayes-Factors that are considered inconclusive. This leaves 31% of results that favor H1 with a Bayes-Factor greater than 3, and only 17% of results produce a Bayes-Factor greater than 10.   This implies that even with the low standard of a BF > 3, the test has only 31% power to provide evidence for an effect that is present.

These results are not wrong because they correctly express the support that the observed data provide for H0 and H1.  The problem only occurs when the specification of H1 is ignored. Given Wagenmakers prior, it is much more likely that a t-value of 1 stems from the sampling distribution of H0 than from the sampling distribution of H1.  However, studies with 50% power when an effect is present are also much more likely to produce t-values of 1 than t-values of 6 or larger.   Thus, a different prior that is more consistent with the actual power of studies in psychology would produce different Bayes-Factors and reduce the percentage of false discoveries of null effects.  Thus, researchers who think Wagenmakers’ prior is not a realistic prior for their research domain should use a more suitable prior for their research domain.

Counterarguments

Wagenmakers has ignored previous criticisms of his prior.  It is therefore not clear what counterarguments he would make.  Below, I raise some potential counterarguments that might be used to defend the use of Wagenmakers’ prior.

One counterargument could be that the prior is not very important because the influence of priors on Bayes-Factors decreases as sample sizes increase.  However, this argument ignores the fact that Bayes-Factors are often used to draw inferences from small samples. In addition, Kass (1993) pointed out that “a simple asymptotic analysis shows that even in large samples Bayes factors remain sensitive to the choice of prior” (p. 555).

Another counterargument could be that a bias in favor of H0 is desirable because it keeps the rate of false-positives low. The problem with this argument is that Bayesian statistics does not provide information about false-positive rates.  Moreover, the cost for reducing false-positives is an increase in the rate of false negatives; that is, either inconclusive results or false evidence for H0 when an effect is actually present.  Finally, the choice of the correct prior will minimize the overall amount of errors.  Thus, it should be desirable for researchers interested in Bayesian statistics to find the most appropriate priors in order to minimize the rate of false inferences.

A third counterargument could be that Wagenmakers’ prior expresses a state of maximum uncertainty, which can be considered a reasonable default when no data are available.  If one considers each study as a unique study, a default prior of maximum uncertainty would be a reasonable starting point.  In contrast, it may be questionable to treat a new study as a randomly drawn study from a sample of studies with different population effect sizes.  However, Wagenmakers’ prior does not express a state of maximum uncertainty and makes assumptions about the probability of observing very large effect sizes.  It does so without any justification for this expectation.  It therefore seems more reasonable to construct priors that are consistent with past studies and to evaluate priors against actual results of studies.

A fourth counterargument is that Bayes-Factors are superior because they can provide evidence for the null-hypothesis and the alternative hypothesis.  However, this is not correct. Bayes-Factors only provide relative support for the null-hypothesis relative to a specific alternative hypothesis.  Researchers who are interested in testing the null-hypothesis can do so using parameter estimation with confidence or credibility intervals. If the interval falls within a specified region around zero, it is possible to affirm the null-hypothesis with a specified level of certainty that is determined by the precision of the study to estimate the population effect size.  Thus, it is not necessary to use Bayes-Factors to test the null-hypothesis.

In conclusion, Bayesian statistics and other statistics are not right or wrong. They combine assumptions and data to draw inferences.  Untrustworthy data and wrong assumptions can lead to false conclusions.  It is therefore important to test the integrity of data (e.g., presence of publication bias) and to examine assumptions.  The uncritical use of Bayes-Factors with default assumptions is not good scientific practice and can lead to false conclusions just like the uncritical use of p-values can lead to false conclusions.

# Dr. Ulrich Schimmack’s Blog about Replicability

“For generalization, psychologists must finally rely, as has been done in all the older sciences, on replication” (Cohen, 1994).

DEFINITION OF REPLICABILITY: In empirical studies with random error variance, replicability refers to the probability of a study with a significant result to produce a significant result again in an exact replication study of the first study using the same sample size and significance criterion.

BLOGS BY YEAR:  2019, 2018, 2017, 2016, 2015, 2014

Featured Blog of the Month (January, 2019):
Why Ioannidis’s Claim “Most published research findings are false” is false

TOP TEN BLOGS

1. 2018 Replicability Rankings of 117 Psychology Journals (2010-2018)

Rankings of 117 Psychology Journals according to the average replicability of a published significant result. Also includes detailed analysis of time trends in replicability from 2010 to 2018).

This post presented the first replicability ranking and explains the methodology that is used to estimate the typical power of a significant result published in a journal.  The post provides an explanation of the new method to estimate observed power based on the distribution of test statistics converted into absolute z-scores.  The method has been developed further to estimate power for a wider range of z-scores by developing a model that allows for heterogeneity in power across tests.  A description of the new method will be published when extensive simulation studies are completed.

The R-Index can be used to predict whether a set of published results will replicate in a set of exact replication studies. It combines information about the observed power of the original studies with information about the amount of inflation in observed power due to publication bias (R-Index = Observed Median Power – Inflation). The R-Index has predicted the outcome of actual replication studies.

The Test of Insufficient Variance is the most powerful test of publication bias and/or dishonest reporting practices. It can be used even if only two independent statistical results are available, although power to detect bias increases with the number of studies. After converting test results into z-scores, z-scores are expected to have a variance of one.   Unless power is very high, some of these z-scores will not be statistically significant (z < 1.96, p > .05 two-tailed).  If these non-significant results are missing, the variance shrinks, and TIVA detects that the variance is insufficient.  The observed variance is compared against the expected variance of 1 with a left-tailed chi-square test. The usefulness of TIVA is illustrated with Bem’s (2011) “Feeling the Future” data.

5.  MOST VIEWED POST (with comment by Nobel Laureate Daniel Kahneman)

This blog post examines the replicability of priming studies cited in Daniel Kahneman’s popular book “Thinking fast and slow.”   The results suggest that many of the cited findings are difficult to replicate.

Stereotype-threat has been used by social psychologists to explain gender differences in math performance. Accordingly, the stereotype that men are better at math than women is threatening to women and threat leads to lower performance.  This theory has produced a large number of studies, but a recent meta-analysis showed that the literature suffers from publication bias and dishonest reporting.  After correcting for these effects, the stereotype-threat effect was negligible.  This blog post shows a low R-Index for the first article that appeared to provide strong support for stereotype-threat.  These results show that the R-Index can warn readers and researchers that reported results are too good to be true.

7.  An attempt at explaining null-hypothesis testing and statistical power with 1 figure and 1500 words.   Null-hypothesis significance testing is old, widely used, and confusing. Many false claims have been used to suggest that NHST is a flawed statistical method. Others argue that the method is fine, but often misunderstood. Here I try to explain NHST and why it is important to consider power (type-II errors) using a picture from the free software GPower.

Some Bayesian statisticians have proposed Bayes-Factors to provide evidence for a Null-Hypothesis (i.e., there is no effect).  They used Bem’s (2011) “Feeling the Future” data to argue that Bayes-Factors would have demonstrated that extra-sensory perception does not exist.  This blog post shows that Bayes-Factors depend on the specification of the alternative hypothesis and that support for the null-hypothesis is often obtained by choosing an unrealistic alternative hypothesis (e.g., there is a 25% probability that effect size is greater than one standard deviation, d > 1).  As a result, Bayes-Factors can favor the null-hypothesis when there is an effect, but the effect size is small (d = .2).  A Bayes-Factor in favor of the null is more appropriately interpreted as evidence that the alternative hypothesis needs to decrease the probabilities assigned to large effect sizes. The post also shows that Bayes-Factors based on a meta-analysis of Bem’s data provide misleading evidence that an effect is present because Bayesian statistics do not take publication bias and dishonest reporting practices into account.

9. Hidden figures: Replication failures in the stereotype threat literature.  A widespread problem is that failed replication studies are often not published. This blog post shows that another problem is that failed replication studies are ignored even when they are published.  Selective publishing of confirmatory results undermines the credibility of science and claims about the importance of stereotype threat to explain gender differences in mathematics.

10. My journey towards estimation of replicability.  In this blog post I explain how I got interested in statistical power and replicability and how I developed statistical methods to reveal selection bias and to estimate replicability.

# Power Analysis for Bayes-Factor: What is the Probability that a Study Produces an Informative Bayes-Factor?

Jacob Cohen has warned fellow psychologists about the problem of conducting studies with insufficient statistical power to demonstrate predicted effects in 1962. The problem is simple enough. An underpowered study has only a small chance to produce the correct result; that is, a statistically significant result when an effect is present.

Many researchers have ignored Cohen’s advice to conduct studies with at least 80% power, that is, an 80% probability to produce the correct result when an effect is present because they were willing to pay low odds. Rather than conducting a single powerful study with 80% power, it seemed less risky to conduct three underpowered studies with 30% power. The chances of getting a significant result are similar (the power to get a significant result in at least 1 out of 3 studies with 30% power is 66%). Moreover, the use of smaller samples is even less problematic if a study tests multiple hypotheses. With 80% power to detect a single effect, a study with two hypotheses has a 96% probability that at least one of the two effects will produce a significant result. Three studies allow for six hypotheses tests. With 30% power for each of these six tests, the probability of obtaining at least one significant result is 88%. Smaller samples also provide additional opportunities to increase power by increasing sample sizes until a significant result is obtained (optional stopping) or by eliminating outliers. The reason is that these questionable practices have larger effects on the results in smaller samples. Thus, for a long time researchers did not feel a need to conduct adequately powered studies because there was no shortage of significant results to report (Schimmack, 2012).

Psychologists have ignored the negative consequences of relying on underpowered studies to support their conclusions. The problem is that the reported p-values are no longer valid. A significant result that was obtained by conducting three studies no longer has a 5% chance to be a random event. By playing the sampling-error lottery three times, the probability of obtaining a significant result by chance alone is now 15%. By conducting three studies with two hypothesis tests, the probability of obtaining a significant result by chance alone is 30%. When researchers use questionable research practices, the probability of obtaining a significant result by chance can further increase. As a result, a significant result no longer provides strong statistical evidence that the result was not just a random event.

It would be easy to distinguish real effects from type-I errors (significant results when the null-hypothesis is true) by conducting replication studies. Even underpowered studies with 30% power will replicate in every third study. In contrast, when the null-hypothesis is true, type-I errors will replicate only in 1 out of 20 studies, when the criterion is set to 5%. This is what a 5% criterion means. There is only a 5% chance (1 out of 20) to get a significant result when the null-hypothesis is true. However, this self-correcting mechanism failed because psychologists considered failed replication studies as uninformative. The perverse logic was that failed replications are to be expected because studies have low power. After all, if a study has only 30% power, a non-significant result is more likely than a significant result. So, non-significant results in underpowered studies cannot be used to challenge a significant result in an underpowered study. By this perverse logic, even false hypotheses will only receive empirical support because only significant results will be reported, no matter whether an effect is present or not.

The perverse consequences of abusing statistical significance tests became apparent when Bem (2011) published 10 studies that appeared to demonstrate that people can anticipate random future events and that practicing for an exam after writing an exam can increase grades. These claims were so implausible that few researchers were willing to accept Bem’s claims despite his presentation of 9 significant results in 10 studies. Although the probability that this event occurred by chance alone is less than 1 in a billion, few researchers felt compelled to abandon the null-hypothesis that studying for an exam today can increase performance on yesterday’s exam.   In fact, most researchers knew all too well that these results could not be trusted because they were aware that published results are not an honest report of what happens in a lab. Thus, a much more plausible explanation for Bem’s incredible results was that he used questionable research practices to obtain significant results. Consistent with this hypothesis, closer inspection of Bem’s results shows statistical evidence that Bem used questionable research practices (Schimmack, 2012).

As the negative consequences of underpowered studies have become more apparent, interest in statistical power has increased. Computer programs make it easy to conduct power analysis for simple designs. However, so far power analysis has been limited to conventional statistical methods that use p-values and a criterion value to draw conclusions about the presence of an effect (Neyman-Pearson Significance Testing, NPST).

Some researchers have proposed Bayesian statistics as an alternative approach to hypothesis testing. As far as I know, these researchers have not provided tools for the planning of sample sizes. One reason is that Bayesian statistics can be used with optional stopping. That is, a study can be terminated early when a criterion value is reached. However, an optional stopping rule also needs a rule when data collection will be terminated in case the criterion value is not reached. It may sound appealing to be able to finish a study at any moment, but if this event is unlikely to occur in a reasonably sized sample, the study would produce an inconclusive result. Thus, even Bayesian statisticians may be interested in the effect of sample sizes on the ability to obtain a desired Bayes-Factor. Thus, I wrote some r-code to conduct power analysis for Bayes-Factors.

The code uses the Bayes-Factor package in r for the default Bayesian t-test (see also blog post on Replication-Index blog). The code is posted at the end of this blog. Here I present results for typical sample sizes in the between-subject design for effect sizes ranging from 0 (the null-hypothesis is true) to Cohen’s d = .5 (a moderate effect). Larger effect sizes are not reported because large effects are relatively easy to detect.

The first table shows the percentage of studies that meet a specified criterion value based on 10,000 simulations of a between-subject design. For Bayes-Factors the criterion values are 3 and 10. For p-values the criterion values are .05, .01, and .001. For Bayes-Factors, a higher number provides stronger support for a hypothesis. For p-values, lower values provide stronger support for a hypothesis. For p-values, percentages correspond to the power of a study. Bayesian statistics has no equivalent concept, but percentages can be used in the same way. If a researcher aims to provide empirical support for a hypothesis with a Bayes-Factor greater than 3 or 10, the table gives the probability of obtaining the desired outcome (success) as a function of the effect size and sample size.

d   n     N     3   10     .05 .01     .001
.5   20   40   17   06     31     11     02
.4   20   40   12   03     22     07     01
.3   20   40   07   02     14     04     00
.2   20   40   04   01     09     02     00
.1   20   40   02   00     06     01     00
.0   20   40   33   00     95     99   100

For an effect size of zero, the interpretation of results switches. Bayes-Factors of 1/3 or 1/10 are interpreted as evidence for the null-hypothesis. The table shows how often Bayes-Factors provide support for the null-hypothesis as a function of the effect size, which is zero, and sample size. For p-values, the percentage is 1 – p. That is, when the effect is zero, the p-value will correctly show a non-significant result with a probability of 1 – p and it will falsely reject the null-hypothesis with the specified type-I error.

Typically, researchers do not interpret non-significant results as evidence for the null-hypothesis. However, it is possible to interpret non-significant results in this way, but it is important to take the type-II error rate into account. Practically, it makes little difference whether a non-significant result is not interpreted or whether it is taken as evidence for the null-hypothesis with a high type-II error probability. To illustrate this consider a study with N = 40 (n = 20 per group) and an effect size of d = .2 (a small effect). As there is a small effect, the null-hypothesis is false. However, the power to detect this effect in a small sample is very low. With p = .05 as the criterion, power is only 9%. As a result, there is a 91% probability to end up with a non-significant result even though the null-hypothesis is false. This probability is only slightly lower than the probability to get a non-significant result when the null-hypothesis is true (95%). Even if the effect size were d = .5, a moderate effect, power is only 31% and the type-II error rate is 69%. With type-II error rates of this magnitude, it makes practically no difference whether a null-hypothesis is accepted with a warning that the type-II error rate is high or whether the non-significant result is simply not interpreted because it provides insufficient information about the presence or absence of small to moderate effects.

The main observation in Table 1 is that small samples provide insufficient information to distinguish between the null-hypothesis and small to moderate effects. Small studies with N = 40 are only meaningful to demonstrate the presence of moderate to large effects, but they have insufficient power to show effects and insufficient power to show the absence of effects. Even when the null-hypothesis is true, a Bayes-Factor of 3 is reached only 33% of the time. A Bayes-Factor of 10 is never reached because the sample size is too small to provide such strong evidence for the null-hypothesis when the null-hypothesis is true. Even more problematic is that a Bayes-Factor of 3 is reached only 17% of the time when a moderate effect is present. Thus, the most likely outcome in small samples is an inconclusive result unless a strong effect is present. This means that Bayes-Factors in these studies have the same problem as p-values. They can only provide evidence that an effect is present when a strong effect is present, but they cannot provide sufficient evidence for the null-hypothesis when the null-hypothesis is true.

d   n     N     3   10     .05 .01     .001
.5   50 100   49   29     68     43     16
.4   50 100   30   15     49     24     07
.3   50 100   34   18     56     32     12
.2   50 100   07   02     16     05     01
.1   50 100   03   01     08     02     00
.0   50 100   68   00     95     99   100

In Table 2 the sample size has been increased to N = 100 participants (n = 50 per cell). This is already a large sample size by past standards in social psychology. Moreover, in several articles Wagenmakers has implemented a stopping rule that terminates data collection at this point. The table shows that a sample size of N = 100 in a between-subject design has modest power to demonstrate even moderate effect sizes of d = .5 with a Bayes-Factor of 3 as a criterion (49%). In comparison, a traditional p-value of .05 would provide 68% power.

The main argument for using Bayesian statistics is that it can also provide evidence for the null-hypothesis. With a criterion value of BF = 3, the default test correctly favors the null-hypothesis 68% of the time (see last row of the table). However, the sample size is too small to produce Bayes-Factors greater than 10. In sum, the default-Bayesian t-test with N = 100 can be used to demonstrate the presence of moderate to large effects and with a criterion value of 3 it can be used to provide evidence for the null-hypothesis when the null-hypothesis is true. However, it cannot be used to provide evidence for small to moderate effects.

The Neyman-Pearson approach to significance testing would reveal this fact in terms of the type-II error rates associated with non-significant results. Using the .05 criterion, a non-significant result would be interpreted as evidence for the null-hypothesis. This conclusion is correct in 95% of all tests when the null-hypothesis is actually true. This is higher than the 68% criterion for a Bayes-Factor of 3. However, the type-II error rates associated with this inference when the null-hypothesis is false are 32% for d = .5, 51% for d = .4, 44% for d = .3, 84% for d = .2, and 92% for d = .1. If we consider effect size of d = .2 as important enough to be detected (small effect size according to Cohen), the type-II error rate could be as high as 84%.

In sum, a sample size of N = 100 in a between-subject design is still insufficient to test for the presence of a moderate effect size (d = .5) with a reasonable chance to find it (80% power). Moreover, a non-significant result is unlikely to occur for moderate to large effect sizes, but the sample size is insufficient to discriminate accurately between the null-hypothesis and small to moderate effects. A Bayes-Factor greater than 3 in favor of the null-hypothesis is most likely to occur when the null-hypothesis is true, but it can also occur when a small effect is present (Simonsohn, 2015).

The next table increases the total sample size to 200 for a between-subject design. The pattern doesn’t change qualitatively. So the discussion will be brief and focus on the power of a study with 200 participants to provide evidence for small to moderate effects and to distinguish small to moderate effects from the null-hypothesis.

d   n     N     3   10     .05 .01     .001
.5 100 200   83   67     94     82     58
.4 100 200   60   41     80     59     31
.3 100 200   16   06     31     13     03
.2 100 200   13   06     29     12     03
.1 100 200   04   01     11     03     00
.0 100 200   80   00     95     95     95

Using Cohen’s guideline of 80% success rate (power), a study with N = 200 participants has sufficient power to show a moderate effect of d = .5 with p = .05, p = .01, and Bayes-Factor = 3 as criterion values. For d = .4, only the criterion value of p = .05 has sufficient power. For all smaller effects, the sample size is still too small to have 80% power. A sample of N = 200 also provides 80% power to provide evidence for the null-hypothesis with a Bayes-Factor of 3. Power for a Bayes-Factor of 10 is still 0 because this value cannot be reached with N = 200. Finally, with N = 200, the type-II error rate for d = .5 is just above .05 (1 – .94 = .06). Thus, it is justified to conclude from a non-significant result with a 6% error rate that the true effect size cannot be moderate to large (d >= .5). However, type-II error rates for smaller effect sizes are too high to test the null-hypothesis against these effect sizes.

d   n     N     3   10     .05 .01     .001
.5 200 400   99   97   100     99     95
.4 200 400   92   82     98     92     75
.3 200 400   64   46     85     65     36
.2 200 400   27   14     52     28     10
.1 200 400   05   02     17     06     01
.0 200 400   87   00     95     99     95

The next sample size doubles the number of participants. The reason is that sampling error decreases only with the square root of the sample size, so large increases in sample sizes are needed to further decrease sampling error. A sample size of N = 200 yields a standard error of 2 / sqrt(200) = .14 (14/100 of a standard deviation). A sample size of N = 400 is needed to reduce this to .10 (2 / sqrt(400) = 2 / 20 = .10; 1/10 of a standard deviation).   This is the reason why it is so difficult to find small effects.

Even with N = 400, power is only sufficient to show effect sizes of .3 or greater with p = .05, or effect sizes of d = .4 with p = .01 or Bayes-Factor 3. Only d = .5 can be expected to meet the criterion p = .001 more than 80% of the time. Power for Bayes-Factors to show evidence for the null-hypothesis also hardly changed. It increased from 80% to 87% with Bayes-Factor = 3 as criterion. The chance to get a Bayes-Factor of 10 is still 0 because the sample size is too small to produce such extreme values. Using Neyman-Pearson’s approach with a 5% type-II error rate as criterion, it is possible to interpret non-significant results as evidence that the true effect size cannot be .4 or larger. With a 1% criterion it is possible to say that a moderate to large effect would produce a significant result 99% of the time and the null-hypothesis would produce a non-significant result 99% of the time.

Doubling the sample size to N = 800 reduces sampling error from SE = .1 to SE = .07.

d   n     N     3     10     .05   .01     .001
.5 400 800 100 100   100  100     100
.4 400 800 100   99   100  100       99
.3 400 800   94   86     99     95      82
.2 400 800   54   38     81     60      32
.1 400 800   09   04     17     06      01
.0 400 800   91   52     95     95      95

A sample size of N = 800 is sufficient to have 80% power to detect a small effect according to Cohen’s classification of effect sizes (d = .2) with p = .05 as criterion. Power to demonstrate a small effect with Bayes-Factor = 3 as criterion is only 54%. Power to demonstrate evidence for the null-hypothesis with Bayes-Factor = 3 as criterion increased only slightly from 87% to 91%, but a sample size of N = 800 is sufficient to produce Bayes-Factors greater than 10 in favor of the null-hypothesis 52% of the time. Thus, researchers who aim for this criterion value need to plan their studies with N = 800. Smaller samples cannot produce these values with the default Bayesian t-test. Following Neyman-Pearson, a non-significant result can be interpreted as evidence that the true effect cannot be larger than d = .3, with a type-II error rate of 1%.

Conclusion

A common argument in favor of Bayes-Factors has been that Bayes-Factors can be used to test the null-hypothesis, whereas p-values can only reject the null-hypothesis. There are two problems with this claim. First, it confuses Null-Hypothesis-Significance-Testing (NHST) and Neyman-Pearson-Significance-Testing (NPST). NPST also allows researchers to accept the null-hypothesis. In fact, it makes it easier to accept the null-hypothesis because every non-significant result favors the null-hypothesis. Of course, this does not mean that all non-significant results show that the null-hypothesis is true. In NPST the error of falsely accepting the null-hypothesis depends on the amount of sampling error. The tables here make it possible to compare Bayes-Factors and NPST. Second, no matter which statistical approach is being used, it is clear that meaningful evidence for the null-hypothesis requires rather large samples. The r-code below can be used to compute power for different criterion values, effect sizes, and sample sizes. Hopefully, this will help researchers to better plan sample sizes and to better understand Bayes-Factors that favor the null-hypothesis.

########################################################################
###                       R-Code for Power Analysis for Bayes-Factor and P-Values                ###
########################################################################

## setup
# Simulates `nsim` studies (one-sample or two-group design), computes the
# t-statistic, the two-tailed p-value and the default Bayes-Factor (BF10) for
# each, and reports the percentage of studies that meet each criterion value.
# Requires the BayesFactor package (only ttest.tstat() is used).
#
# The original script started with rm(list = ls()). That call is dropped:
# every object the script needs is assigned below, so there is no reason to
# wipe the caller's workspace.

## set parameters
# NOTE(review): the assignments of es, n, groups, rsc and BF01_crit were lost
# when the post was extracted (only the fragment "es 1 favor effect)" was
# left). The values below are reconstructions -- confirm before relying on
# them. rsc is assumed to be the "medium" prior scale, sqrt(2)/2.
nsim <- 10000        # number of simulated studies
es <- .5             # population effect size (0 = null-hypothesis is true)
n <- 20              # sample size per group
groups <- 2          # 1 = one-sample design, 2 = between-subject design
rsc <- sqrt(2) / 2   # prior scale handed to ttest.tstat() as rscale
BF10_crit <- 3       # BF10 above this value counts as support for an effect
BF01_crit <- 3       # BF01 (= 1 / BF10) above this counts as support for null
p_crit <- .05        # criterion for the two-tailed p-value (e.g., .05)

## computations
# One simulated study per row; columns 1..n are group 1, n+1..2n are group 2.
Z <- matrix(rnorm(groups * n * nsim, mean = 0, sd = 1), nsim, groups * n)
Z[, seq_len(n)] <- Z[, seq_len(n)] + es          # add effect size to group 1

# Absolute t-value for one simulated study.
#   x: numeric vector of length n * groups (one row of Z).
# Reads the globals `n` and `groups`. The observed effect size is the mean
# (difference) divided by the within-group SD; for two equal-sized groups the
# pooled SD is sqrt of the average of the two group variances. The original
# `sd(x[1:n*groups])` parsed as `x[(1:n) * groups]` and therefore used every
# second observation instead of a within-group SD.
tt <- function(x) {
  grp1 <- x[seq_len(n)]
  oes <- mean(grp1)
  sd_within <- sd(grp1)
  if (groups == 2) {
    grp2 <- x[(n + 1):(2 * n)]
    oes <- oes - mean(grp2)
    sd_within <- sqrt((var(grp1) + var(grp2)) / 2)
  }
  oes <- oes / sd_within                         # observed effect size
  abs(oes) / (groups / sqrt(n * groups))         # effect size / sampling error
}

t <- apply(Z, 1, tt)                             # t-values for all simulations
df <- rep(n * groups - groups, length(t))        # degrees of freedom
p2t <- 2 * pt(abs(t), df, lower.tail = FALSE)    # two-tailed p-values

# Bayes-Factor (BF10) for one simulated study.
#   x: numeric vector c(t, df).
# ttest.tstat() reports the Bayes-Factor on the log scale, hence exp().
# Sample sizes are recovered from df: n per group = (df + 2) / 2 for two
# groups; n = df + 1 (with n2 = 0) for the one-sample design.
getBF <- function(x) {
  t_value <- x[1]
  df_value <- x[2]
  if (groups == 2) {
    n1 <- (df_value + 2) / 2
    n2 <- n1
  } else {
    n1 <- df_value + 1
    n2 <- 0
  }
  exp(BayesFactor::ttest.tstat(t_value, n1, n2, rscale = rsc)$bf)
}

input <- cbind(t, df)                            # one row per study: t, df
BF10 <- apply(input, 1, getBF)                   # BF10 for all simulations

# Percentages of simulated studies that meet each criterion.
powerBF10 <- sum(BF10 > BF10_crit, na.rm = TRUE) / nsim * 100      # effect
powerBF01 <- sum(BF10 < 1 / BF01_crit, na.rm = TRUE) / nsim * 100  # null
powerP <- sum(p2t < p_crit, na.rm = TRUE) / nsim * 100             # p-value

## output of results
cat(
  " Power to support effect with BF10 >", BF10_crit, ": ", powerBF10,
  "\n",
  "Power to support null with BF01 >", BF01_crit, " : ", powerBF01,
  "\n",
  "Power to show effect with p < ", p_crit, " : ", powerP,
  "\n"
)