#R Code behind Figure 3 in
#Simonsohn, Simmons and Nelson, "P-Curve and Effect Size: Correcting for Publication Bias Using Only Significant Results"
#
#Prepared by Uri Simonsohn, uws@wharton.upenn.edu
#
#Last updated: 2014 05 13
#####################################################################################################
# Overview
# Figures 3A, 3B, and 3C each have a custom program; they are presented below in that order.
# The programs can be run independently: load the metafor library and the loss function below,
# and then each program can be run on its own.
#####################################################################################################
library(metafor)   #Needed for trim-and-fill

#####################################################################################################
#SIMPLIFIED LOSS FUNCTION, USED THROUGHOUT BELOW
#See the appendix in the paper for explanations and for a more robust version
#(robust to user input with t<0 and/or p>.05)
loss=function(t_obs,df_obs,d_est) {
  ncp_est=sqrt((df_obs+2)/4)*d_est            #noncentrality parameter implied by d_est
  tc=qt(.975,df_obs)                          #critical t-value for p<.05 (two-sided)
  power_est=1-pt(tc,df_obs,ncp_est)           #power implied by d_est
  p_larger=pt(t_obs,df=df_obs,ncp=ncp_est)    #probability of a t-value smaller than the observed one
  ppr=(p_larger-(1-power_est))/power_est      #pp-value: probability of a smaller significant t, given d_est
  KSD=ks.test(ppr,punif)$statistic            #Kolmogorov-Smirnov distance of the pp-values from uniform
  return(KSD)
}
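#####################################################################################################
# NOTE: the short block below is an editorial usage sketch, not part of the original figure code.
# It simulates k_demo significant two-sample studies with a known true effect size and recovers d-hat
# by minimizing the loss function above; all names ending in "_demo" are illustrative only.
#####################################################################################################
demo_loss=function(k_demo=200,n_demo=20,d_demo=.4) {
  df_demo=2*n_demo-2                        #d.f. of each two-sample t-test
  ncp_demo=sqrt(n_demo/2)*d_demo            #noncentrality parameter implied by the true d
  tc_demo=qt(.975,df_demo)                  #critical t-value for p<.05 (two-sided)
  t_demo=c()                                #collect only significant t-values
  while (length(t_demo)<k_demo) {
    ti=rt(1,df_demo,ncp_demo)               #draw one t from the noncentral t distribution
    if (ti>tc_demo) t_demo=c(t_demo,ti)     #keep it only if it is significant
  }
  optimize(loss,c(-.3,2),df_obs=df_demo,t_obs=t_demo)$minimum   #should recover a value close to d_demo
}
demo_loss()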
#####################################################################################################
# FIGURE 3A - P-HACKING BY DATA PEEKING
#####################################################################################################
#Simulation of data peeking: tests are conducted on samples of increasingly larger size.
#Note: this function is much more flexible than needed for what we used in Figure 3A
peekd=function(simtot,n0,n1,every,trued) {
  #Syntax:
  #simtot: number of simulations
  #n0:     smallest sample size (per cell)
  #n1:     largest sample size (per cell)
  #every:  how many observations per cell are added each time the data are peeked at
  #trued:  true effect size
  time0=Sys.time()     #keep track of when the simulation started
  set.seed(232)        #seed to always get the same results
  p_publish=c()        #initialize p-value vector for significant (published) studies
  p_drawered=c()       #same, for n.s. (file-drawered) studies, used for the omniscient estimate
  df_publish=c()       #d.f. of significant simulations
  df_drawered=c()      #d.f. of n.s. simulations
  for (simk in 1:simtot){
    s1=rnorm(n1)         #draw control-condition raw data for the total sample size
    s2=rnorm(n1)+trued   #draw treatment-condition raw data for the total sample size
    i=0                  #counter of observations added after peeking
    #Run the test; if n.s., increase the sample size by "every" until p<.05 or until n1 is reached
    repeat{
      pi=t.test(s1[1:(n0+i)],s2[1:(n0+i)],alternative="less",var.equal=TRUE)$p.value  #compute p-value from the t-test
      #if significant, publish it
      if(pi<.025) {
        df_publish=c(df_publish,2*(n0+i)-2)
        p_publish=c(p_publish,pi)
        break
      }
      i=i+every
      #if you get to n=n1 and still do not have p<.05, end the study and file-drawer it
      if((n0+i)>n1) {
        p_drawered=c(p_drawered,pi)          #file-drawer the last p-value
        df_drawered=c(df_drawered,2*n1-2)
        break
      }
    }
  }
  #At this point we have vectors of published and file-drawered studies
  t_publish=qt(1-p_publish,df_publish)
  t_drawered=qt(1-p_drawered,2*n1-2)
  t_all=c(t_publish,t_drawered)
  df_all=c(df_publish,df_drawered)
  #Find d-hat for p-curve
  dp=optimize(loss,c(-.3,2),df_obs=df_publish,t_obs=t_publish)$minimum   #For details and better optimization see the Appendix
  #Naive estimate
  dt=(2*t_publish)/sqrt(df_publish+2)
  n=(df_publish+2)/2
  vardt=(2/n+(dt**2)/(4*n-4))*(2*n/(2*n-2))   #variance formula for Cohen's d
  naive=rma(yi=dt,vi=vardt,method="FE")       #run fixed-effect meta-analysis
  #Trim and fill
  tf=trimfill(naive)                          #trim-and-fill correction
  #Omniscient estimate
  dtall=(2*t_all)/sqrt(df_all+2)
  nall=(df_all+2)/2
  vardtall=(2/nall+(dtall**2)/(4*nall-4))*(2*nall/(2*nall-2))
  gods_bs=rma(yi=dtall,vi=vardtall,method="FE")$b
  #Report results
  cat("\n # of p<.05:", length(t_publish))    #the size of the t_publish vector is the # of p<.05
  cat("\n # of p>.05:", length(t_drawered))
  cat("\n Naive estimate:",naive$b)           #meta-analysis on p<.05 results only
  cat("\n Trim and fill:",tf$b)               #trim-and-fill applied to the p<.05 results
  cat("\n God's estimate:",gods_bs)           #meta-analysis on all results
  cat("\n p-curve estimate:",dp,"\n")         #p-curve
  cat("\n ===========================================")
  time1=Sys.time()                            #see how long it took
  cat("\n")
  print(time1-time0)                          #report how long it took
  print(table(df_publish))                    #useful to know what's going on: how big were the samples that reached p<.05?
  return(c(tf$b,dp,gods_bs))                  #output is a vector with the trim-and-fill, p-curve, and omniscient estimates
}

#Verifying it works without any peeking (n0=n1, so a single test per study)
r1=peekd(simtot=1000,n0=20,n1=20,every=10,trued=.0)
r2=peekd(simtot=1000,n0=20,n1=20,every=10,trued=.2)
r3=peekd(simtot=1000,n0=20,n1=20,every=10,trued=.4)
r4=peekd(simtot=1000,n0=20,n1=20,every=10,trued=.6)
r5=peekd(simtot=1000,n0=20,n1=20,every=10,trued=.8)

#Actual simulations of peeking
#Trim-and-fill is the bottleneck:
r6=peekd(simtot=10000,n0=20,n1=30,every=10,trued=0)    #Takes 23 seconds on Uri's laptop
r7=peekd(simtot=10000,n0=20,n1=30,every=10,trued=.2)   #Takes 26 seconds
r8=peekd(simtot=10000,n0=20,n1=30,every=10,trued=.4)   #Takes 51 seconds
r9=peekd(simtot=10000,n0=20,n1=30,every=10,trued=.6)   #Takes 2 minutes
r10=peekd(simtot=10000,n0=20,n1=30,every=10,trued=.8)  #Takes ~4 minutes
mpeek=matrix(c(r6,r7,r8,r9,r10),5,3,byrow=TRUE)
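#####################################################################################################
# NOTE: editorial sanity check, not part of the original code. For two equal-sized groups analyzed
# with a pooled-variance t-test, Cohen's d=(M2-M1)/SDpooled equals 2*t/sqrt(df+2)=t*sqrt(2/n), which
# is the t-->d conversion used for the meta-analyses above and below. Names ending in "_chk" are
# illustrative only.
#####################################################################################################
n_chk=20
a_chk=rnorm(n_chk)                                       #control group
b_chk=rnorm(n_chk)+.5                                    #treatment group
t_chk=as.numeric(t.test(b_chk,a_chk,var.equal=TRUE)$statistic)
sd_pooled=sqrt(((n_chk-1)*var(a_chk)+(n_chk-1)*var(b_chk))/(2*n_chk-2))
d_direct=(mean(b_chk)-mean(a_chk))/sd_pooled             #Cohen's d from the means and pooled SD
d_from_t=2*t_chk/sqrt((2*n_chk-2)+2)                     #Cohen's d recovered from the t-statistic
print(round(c(d_direct=d_direct,d_from_t=d_from_t),6))   #the two values are identical (up to rounding)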
#####################################################################################################
# FIGURE 3B - P-HACKING BY CHOOSING FROM 3 DVS
#####################################################################################################
fig3b=function(simtot,n,rho,trued) {
  #EXPLANATION:
  # This models a situation where a researcher collects 3 DVs and performs sequential tests.
  # If the first test is n.s. she tries another, and then another. The DVs are correlated with each other.
  # This simulates an extremely common form of p-hacking, as researchers often collect multiple measures
  # of the same construct and do not need to report all of them.
  #SYNTAX:
  # simtot: number of simulations
  # n:      sample size of each cell in each simulation
  # rho:    used to vary how correlated the multiple DVs are; to get r(x1,x2)=.5 set rho~.7
  # trued:  the actual effect size in the latent variables
  #STEPS:
  #1. Latent variables: y1 and y2 are the latent variables, which the researcher does not observe.
  #2. Observed variables: she measures x11,x12,x13 as proxies of y1, and x21,x22,x23 as proxies of y2.
  #   Each x is correlated rho with its y, so the xs are correlated about rho^2 with each other;
  #   to get them to be r()=.5 we set rho=.7. We then add trued to each of x21, x22, x23.
  #3. Run three t-tests, comparing (x11,x21), (x12,x22), (x13,x23).
  #4. If the first comparison is significant it is kept; if not, the 2nd is tried, and if that is not
  #   significant the 3rd is tried. The omniscient meta-analysis also receives the file-drawered (n.s.) studies.
  #5. Assign the simulated study to the file-drawer if no test is significant, or publish it if at least one is.

  #seed
  set.seed(222)
  #initialize vectors where results will be stored
  p_publish=c()   #p-values of published studies
  p_drawer=c()    #p-values of file-drawered studies
  #loop performing three tests on each simulated study
  for (simk in 1:simtot){
    #1) TWO LATENT VARIABLES
    y1=rnorm(n)
    y2=rnorm(n)
    #2) CREATE OBSERVABLE VARIABLES
    #noise to be added to each latent variable
    r11=rnorm(n)
    r12=rnorm(n)
    r13=rnorm(n)
    r21=rnorm(n)
    r22=rnorm(n)
    r23=rnorm(n)
    #observed variable = latent + noise
    #to achieve r(x,y)=rho one uses the formula below (a well-known formula for creating correlated random variables)
    #see e.g., http://www.sitmo.com/article/generating-correlated-random-numbers/
    #noisy measures of y1
    x11=rho*y1+sqrt((1-rho**2))*r11
    x12=rho*y1+sqrt((1-rho**2))*r12
    x13=rho*y1+sqrt((1-rho**2))*r13
    #noisy measures of y2, adding the true effect size
    x21=rho*y2+sqrt((1-rho**2))*r21+trued
    x22=rho*y2+sqrt((1-rho**2))*r22+trued
    x23=rho*y2+sqrt((1-rho**2))*r23+trued
    #STEP 3: do the 3 t-tests
    #With if-statements one could run a test only if the previous one was not significant; that seemed
    #harder to code and possibly slower to run, so all three are run and the selection happens below.
    #As is the case throughout, we are interested in directional predictions tested with two-sided tests
    #(this helps false-positive rates be more comparable with power), so we do a one-sided t-test and
    #consider it significant if p<.025.
    p1=t.test(x11,x21,alternative="less")$p.value
    p2=t.test(x12,x22,alternative="less")$p.value
    p3=t.test(x13,x23,alternative="less")$p.value
    #STEP 4: report the first test that is significant, or file-drawer the study
    pi=p1                                  #pi is the reported p-value of this simulated study; start with the 1st test
    if (pi>.025 & p2<.025) pi=p2           #if the 1st is n.s. but the 2nd is significant, report the 2nd
    if (pi>.025 & p3<.025) pi=p3           #if the 1st and 2nd are n.s. but the 3rd is significant, report the 3rd
    #STEP 5: publish if significant, file-drawer otherwise
    if (pi<=.025) {p_publish=c(p_publish,pi)}   #publish
    if (pi>.025)  {p_drawer=c(p_drawer,pi)}     #drawer otherwise
  }
  #this closes the loop over simtot simulations; next we aggregate all the results into a meta-analysis

  #published and drawered tests
  df=2*n-2                       #d.f. needed for various calculations, easier to compute once here
                                 #note that in this function n is a scalar, in others it is a vector
  #t-values
  t_publish=qt(1-p_publish,df)   #creates a vector with significant t-values
  t_drawer =qt(1-p_drawer,df)    #creates a vector with n.s. t-values
  t_all=c(t_publish,t_drawer)    #combines them both
  #d-values
  d_publish=(2*t_publish)/sqrt(df+2)   #to obtain the naive and omniscient estimates we convert t-->d
  d_drawer=(2*t_drawer)/sqrt(df+2)
  d_all=c(d_publish,d_drawer)
  #estimate effect size from p-curve
  dp=optimize(loss,c(-.3,2),df_obs=df,t_obs=t_publish)$minimum   #for details see the comments in peekd() above
  #average effect sizes
  naive=mean(d_publish)
  god  =mean(d_all)
  #print results
  cat("\n # of p<.05:", length(p_publish))
  cat("\n # of p>.05:", length(p_drawer))
  cat("\n p-curve estimate:",dp)
  cat("\n Naive estimate:",naive)
  cat("\n God's estimate:",god)
  cat("\n =========================================== \n")
  #"SAVE" them to the vector with all results
  return(c(naive,dp,god))
}

r1=fig3b(simtot=20000,n=20,rho=.7,trued=0)
r2=fig3b(simtot=15000,n=20,rho=.7,trued=.2)
r3=fig3b(simtot=15000,n=20,rho=.7,trued=.4)
r4=fig3b(simtot=10000,n=20,rho=.7,trued=.6)
r5=fig3b(simtot=10000,n=20,rho=.7,trued=.8)
figure3b=matrix(c(r1,r2,r3,r4,r5),5,3,byrow=TRUE)
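#####################################################################################################
# NOTE: editorial sanity check, not part of the original code. It verifies numerically the correlation
# structure used in fig3b: each observed x is rho*y + sqrt(1-rho^2)*noise, so cor(x,y) is about rho
# and cor(x1,x2) is about rho^2 (about .49 when rho=.7). Names ending in "_chk" are illustrative only.
#####################################################################################################
rho_chk=.7
y_chk=rnorm(100000)
x1_chk=rho_chk*y_chk+sqrt(1-rho_chk**2)*rnorm(100000)
x2_chk=rho_chk*y_chk+sqrt(1-rho_chk**2)*rnorm(100000)
print(round(c(r_xy=cor(x1_chk,y_chk),r_x1x2=cor(x1_chk,x2_chk)),2))   #approximately .70 and .49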
#####################################################################################################
# FIGURE 3C - P-HACKING BY DROPPING OUTLIERS
#####################################################################################################
#This function takes a vector and excludes observations more than xsd SDs away from the local mean
noout=function(y,xsd) return(subset(y,abs((y-mean(y))/sd(y))<xsd))

#EXPLANATION:
# Each simulated study has two cells of n observations. The researcher first runs the t-test on the
# full data; if it is n.s., she re-runs it dropping outliers (observations more than xsd SDs from
# their cell mean) from one cell, from the other, and from both. The first significant test is
# published; if none is significant, the study is file-drawered.
outlierd=function(simtot,n,xsd,trued) {
  #SYNTAX:
  # simtot: number of simulations
  # n:      sample size of each cell
  # xsd:    observations more than xsd SDs from their cell mean are dropped
  # trued:  true effect size
  t1=Sys.time()     #keep track of when the simulation started
  #initialize vectors where results will be stored
  p_publish=c()     #p-values of published studies
  p_drawer=c()      #p-values of file-drawered studies
  df_publish=c()    #d.f. of published studies (they vary with how many outliers are dropped)
  df_drawer=c()     #d.f. of file-drawered studies
  for (simk in 1:simtot){
    #raw data
    y1=rnorm(n)          #control condition
    y2=rnorm(n)+trued    #treatment condition
    #data without outliers
    x1=noout(y1,xsd)
    x2=noout(y2,xsd)
    #four t-tests: full data, dropping outliers from one cell, from the other, or from both
    test1=t.test(y1,y2,alternative="less",var.equal=TRUE)
    test2=t.test(x1,y2,alternative="less",var.equal=TRUE)
    test3=t.test(y1,x2,alternative="less",var.equal=TRUE)
    test4=t.test(x1,x2,alternative="less",var.equal=TRUE)
    p1=test1$p.value
    p2=test2$p.value
    p3=test3$p.value
    p4=test4$p.value
    df1=as.numeric(test1$parameter)
    df2=as.numeric(test2$parameter)
    df3=as.numeric(test3$parameter)
    df4=as.numeric(test4$parameter)
    #report the first significant test; otherwise file-drawer the study
    pi=p1                                     #start with the test on the full data
    dfi=df1
    if (pi>.025 & p2<.025) {pi=p2; dfi=df2}
    if (pi>.025 & p3<.025) {pi=p3; dfi=df3}
    if (pi>.025 & p4<.025) {pi=p4; dfi=df4}
    if (pi<=.025) {
      p_publish=c(p_publish,pi)
      df_publish=c(df_publish,dfi)}           #publish if significant
    if (pi>.025) {
      p_drawer=c(p_drawer,pi)
      df_drawer=c(df_drawer,dfi)}             #drawer otherwise
  }
  #published and drawered tests
  #t-values
  t_publish=qt(1-p_publish,df_publish)
  t_drawer =qt(1-p_drawer,df_drawer)
  #d-values
  d_publish=(2*t_publish)/sqrt(df_publish+2)
  d_drawer=(2*t_drawer)/sqrt(df_drawer+2)
  d_all=c(d_publish,d_drawer)
  #d.f. and n
  df_all=c(df_publish,df_drawer)
  n_all=(df_all+2)/2
  n_publish=(df_publish+2)/2
  #variance formula for d
  vard_all=(2/n_all+(d_all**2)/(4*n_all-4))*(2*n_all/(2*n_all-2))
  vard_publish=(2/n_publish+(d_publish**2)/(4*n_publish-4))*(2*n_publish/(2*n_publish-2))
  #estimate effect size using all studies
  god=rma(yi=d_all,vi=vard_all,method="FE")$b
  #significant subset only
  naive=rma(yi=d_publish,vi=vard_publish,method="FE")$b
  #estimate effect size from p-curve
  dp=optimize(loss,c(-.3,2),df_obs=df_publish,t_obs=t_publish)$minimum
  #print results
  t2=Sys.time()
  cat("\n # of p<.05:", length(p_publish))
  cat("\n # of p>.05:", length(p_drawer))
  cat("\n p-curve estimate:",dp)
  cat("\n Naive estimate:",naive)
  cat("\n God's estimate:",god)
  cat("\n =========================================== \n")
  print(round(t2-t1,digits=1))
  cat("\n =========================================== \n")
  cat("\n Distribution of d.f. of the tests of p<.05 studies \n")
  print(table(df_publish))
  #"SAVE" them to the vector with all results
  return(c(naive,dp,god))
}

#Here outliers are defined as 10 SDs away, i.e., nothing is excluded, to make sure the code runs properly
r1=outlierd(simtot=1000,n=20,xsd=10,trued=0)
r2=outlierd(simtot=50000,n=20,xsd=10,trued=0.2)
r3=outlierd(simtot=50000,n=20,xsd=10,trued=0.4)
r4=outlierd(simtot=50000,n=20,xsd=10,trued=0.6)
r5=outlierd(simtot=50000,n=20,xsd=10,trued=0.8)

#Here exclusion is based on 2 SDs, the actual simulation of interest
r6=outlierd(simtot=10000,n=20,xsd=2,trued=0)
r7=outlierd(simtot=10000,n=20,xsd=2,trued=0.2)
r8=outlierd(simtot=10000,n=20,xsd=2,trued=0.4)
r9=outlierd(simtot=10000,n=20,xsd=2,trued=0.6)
r10=outlierd(simtot=10000,n=20,xsd=2,trued=0.8)
figure3c=matrix(c(r6,r7,r8,r9,r10),5,3,byrow=TRUE)
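#####################################################################################################
# NOTE: editorial plotting sketch, not the original figure code. It assumes the true effect sizes
# c(0,.2,.4,.6,.8) used in the calls above and the column order returned by each function:
# for figure3b and figure3c the columns are (naive, p-curve, omniscient); for mpeek the first
# column is trim-and-fill rather than naive.
#####################################################################################################
d_true=c(0,.2,.4,.6,.8)
matplot(d_true,figure3c,type="b",pch=1:3,lty=1,
        xlab="True effect size (d)",ylab="Estimated effect size",
        main="Sketch of Figure 3C: selective outlier exclusion")
abline(0,1,lty=2)   #45-degree line: an unbiased estimator falls on it
legend("topleft",legend=c("Naive","p-curve","Omniscient"),pch=1:3,lty=1,col=1:3,bty="n")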