#R Code behind Figure 3 in
#Simonsohn, Simmons and Nelson, "P-Curve and Effect Size: Correcting for Publication Bias Using Only Significant Results"
#
#Prepared by Uri Simonsohn, uws@wharton.upenn.edu
#
#Last updated: 2014 05 13
#####################################################################################################
# Overview
# Figures 3A, 3B, and 3C each have a custom program; they are presented below in that order.
# The programs can be run independently: load the metafor library and the loss function below,
# and then each program can be run on its own.
#####################################################################################################
library(metafor)   #Needed for trim-and-fill

#####################################################################################################
#SIMPLIFIED LOSS FUNCTION, USED THROUGHOUT BELOW
#See the appendix in the paper for explanations and for a more robust version
#(robust to user input with t<0 and/or p>.05)
loss=function(t_obs,df_obs,d_est) {
  ncp_est=sqrt((df_obs+2)/4)*d_est            #noncentrality parameter implied by d_est
  tc=qt(.975,df_obs)                          #critical t-value for p<.05 (two-sided)
  power_est=1-pt(tc,df_obs,ncp_est)           #power implied by d_est
  p_larger=pt(t_obs,df=df_obs,ncp=ncp_est)    #probability of a t-value smaller than the observed one
  ppr=(p_larger-(1-power_est))/power_est      #pp-value: probability of a smaller significant t, given d_est
  KSD=ks.test(ppr,punif)$statistic            #Kolmogorov-Smirnov distance of the pp-values from uniform
  return(KSD)
}
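#####################################################################################################
# NOTE: the short block below is an editorial usage sketch, not part of the original figure code.
# It simulates k_demo significant two-sample studies with a known true effect size and recovers d-hat
# by minimizing the loss function above; all names ending in "_demo" are illustrative only.
#####################################################################################################
demo_loss=function(k_demo=200,n_demo=20,d_demo=.4) {
  df_demo=2*n_demo-2                        #d.f. of each two-sample t-test
  ncp_demo=sqrt(n_demo/2)*d_demo            #noncentrality parameter implied by the true d
  tc_demo=qt(.975,df_demo)                  #critical t-value for p<.05 (two-sided)
  t_demo=c()                                #collect only significant t-values
  while (length(t_demo)<k_demo) {
    ti=rt(1,df_demo,ncp_demo)               #draw one t from the noncentral t distribution
    if (ti>tc_demo) t_demo=c(t_demo,ti)     #keep it only if it is significant
  }
  optimize(loss,c(-.3,2),df_obs=df_demo,t_obs=t_demo)$minimum   #should recover a value close to d_demo
}
demo_loss()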
#####################################################################################################
# FIGURE 3A - P-HACKING BY DATA PEEKING
#####################################################################################################
#Simulation of data peeking: tests are conducted on samples of increasingly larger size.
#Note: this function is much more flexible than needed for what we used in Figure 3A
peekd=function(simtot,n0,n1,every,trued) {
  #Syntax:
  #simtot: number of simulations
  #n0:     smallest sample size (per cell)
  #n1:     largest sample size (per cell)
  #every:  how many observations per cell are added each time the data are peeked at
  #trued:  true effect size
  time0=Sys.time()     #keep track of when the simulation started
  set.seed(232)        #seed to always get the same results
  p_publish=c()        #initialize p-value vector for significant (published) studies
  p_drawered=c()       #same, for n.s. (file-drawered) studies, used for the omniscient estimate
  df_publish=c()       #d.f. of significant simulations
  df_drawered=c()      #d.f. of n.s. simulations
  for (simk in 1:simtot){
    s1=rnorm(n1)         #draw control-condition raw data for the total sample size
    s2=rnorm(n1)+trued   #draw treatment-condition raw data for the total sample size
    i=0                  #counter of observations added after peeking
    #Run the test; if n.s., increase the sample size by "every" until p<.05 or until n1 is reached
    repeat{
      pi=t.test(s1[1:(n0+i)],s2[1:(n0+i)],alternative="less",var.equal=TRUE)$p.value  #compute p-value from the t-test
      #if significant, publish it
      if(pi<.025) {
        df_publish=c(df_publish,2*(n0+i)-2)
        p_publish=c(p_publish,pi)
        break
      }
      i=i+every
      #if you get to n=n1 and still do not have p<.05, end the study and file-drawer it
      if((n0+i)>n1) {
        p_drawered=c(p_drawered,pi)          #file-drawer the last p-value
        df_drawered=c(df_drawered,2*n1-2)
        break
      }
    }
  }
  #At this point we have vectors of published and file-drawered studies
  t_publish=qt(1-p_publish,df_publish)
  t_drawered=qt(1-p_drawered,2*n1-2)
  t_all=c(t_publish,t_drawered)
  df_all=c(df_publish,df_drawered)
  #Find d-hat for p-curve
  dp=optimize(loss,c(-.3,2),df_obs=df_publish,t_obs=t_publish)$minimum   #For details and better optimization see the Appendix
  #Naive estimate
  dt=(2*t_publish)/sqrt(df_publish+2)
  n=(df_publish+2)/2
  vardt=(2/n+(dt**2)/(4*n-4))*(2*n/(2*n-2))   #variance formula for Cohen's d
  naive=rma(yi=dt,vi=vardt,method="FE")       #run fixed-effect meta-analysis
  #Trim and fill
  tf=trimfill(naive)                          #trim-and-fill correction
  #Omniscient estimate
  dtall=(2*t_all)/sqrt(df_all+2)
  nall=(df_all+2)/2
  vardtall=(2/nall+(dtall**2)/(4*nall-4))*(2*nall/(2*nall-2))
  gods_bs=rma(yi=dtall,vi=vardtall,method="FE")$b
  #Report results
  cat("\n # of p<.05:", length(t_publish))    #the size of the t_publish vector is the # of p<.05
  cat("\n # of p>.05:", length(t_drawered))
  cat("\n Naive estimate:",naive$b)           #meta-analysis on p<.05 results only
  cat("\n Trim and fill:",tf$b)               #trim-and-fill applied to the p<.05 results
  cat("\n God's estimate:",gods_bs)           #meta-analysis on all results
  cat("\n p-curve estimate:",dp,"\n")         #p-curve
  cat("\n ===========================================")
  time1=Sys.time()                            #see how long it took
  cat("\n")
  print(time1-time0)                          #report how long it took
  print(table(df_publish))                    #useful to know what's going on: how big were the samples that reached p<.05?
  return(c(tf$b,dp,gods_bs))                  #output is a vector with the trim-and-fill, p-curve, and omniscient estimates
}

#Verifying it works without any peeking (n0=n1, so a single test per study)
r1=peekd(simtot=1000,n0=20,n1=20,every=10,trued=.0)
r2=peekd(simtot=1000,n0=20,n1=20,every=10,trued=.2)
r3=peekd(simtot=1000,n0=20,n1=20,every=10,trued=.4)
r4=peekd(simtot=1000,n0=20,n1=20,every=10,trued=.6)
r5=peekd(simtot=1000,n0=20,n1=20,every=10,trued=.8)

#Actual simulations of peeking
#Trim-and-fill is the bottleneck:
r6=peekd(simtot=10000,n0=20,n1=30,every=10,trued=0)    #Takes 23 seconds on Uri's laptop
r7=peekd(simtot=10000,n0=20,n1=30,every=10,trued=.2)   #Takes 26 seconds
r8=peekd(simtot=10000,n0=20,n1=30,every=10,trued=.4)   #Takes 51 seconds
r9=peekd(simtot=10000,n0=20,n1=30,every=10,trued=.6)   #Takes 2 minutes
r10=peekd(simtot=10000,n0=20,n1=30,every=10,trued=.8)  #Takes ~4 minutes
mpeek=matrix(c(r6,r7,r8,r9,r10),5,3,byrow=TRUE)
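#####################################################################################################
# NOTE: editorial sanity check, not part of the original code. For two equal-sized groups analyzed
# with a pooled-variance t-test, Cohen's d=(M2-M1)/SDpooled equals 2*t/sqrt(df+2)=t*sqrt(2/n), which
# is the t-->d conversion used for the meta-analyses above and below. Names ending in "_chk" are
# illustrative only.
#####################################################################################################
n_chk=20
a_chk=rnorm(n_chk)                                       #control group
b_chk=rnorm(n_chk)+.5                                    #treatment group
t_chk=as.numeric(t.test(b_chk,a_chk,var.equal=TRUE)$statistic)
sd_pooled=sqrt(((n_chk-1)*var(a_chk)+(n_chk-1)*var(b_chk))/(2*n_chk-2))
d_direct=(mean(b_chk)-mean(a_chk))/sd_pooled             #Cohen's d from the means and pooled SD
d_from_t=2*t_chk/sqrt((2*n_chk-2)+2)                     #Cohen's d recovered from the t-statistic
print(round(c(d_direct=d_direct,d_from_t=d_from_t),6))   #the two values are identical (up to rounding)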
#####################################################################################################
# FIGURE 3B - P-HACKING BY CHOOSING FROM 3 DVS
#####################################################################################################
fig3b=function(simtot,n,rho,trued) {
  #EXPLANATION:
  # This models a situation where a researcher collects 3 DVs and performs sequential tests.
  # If the first test is n.s. she tries another, and then another. The DVs are correlated with each other.
  # This simulates an extremely common form of p-hacking, as researchers often collect multiple measures
  # of the same construct and do not need to report all of them.
  #SYNTAX:
  # simtot: number of simulations
  # n:      sample size of each cell in each simulation
  # rho:    used to vary how correlated the multiple DVs are; to get r(x1,x2)=.5 set rho~.7
  # trued:  the actual effect size in the latent variables
  #STEPS:
  #1. Latent variables: y1 and y2 are the latent variables, which the researcher does not observe.
  #2. Observed variables: she measures x11,x12,x13 as proxies of y1, and x21,x22,x23 as proxies of y2.
  #   Each x is correlated rho with its y, so the xs are correlated about rho^2 with each other;
  #   to get them to be r()=.5 we set rho=.7. We then add trued to each of x21, x22, x23.
  #3. Run three t-tests, comparing (x11,x21), (x12,x22), (x13,x23).
  #4. If the first comparison is significant it is kept; if not, the 2nd is tried, and if that is not
  #   significant the 3rd is tried. The omniscient meta-analysis also receives the file-drawered (n.s.) studies.
  #5. Assign the simulated study to the file-drawer if no test is significant, or publish it if at least one is.

  #seed
  set.seed(222)
  #initialize vectors where results will be stored
  p_publish=c()   #p-values of published studies
  p_drawer=c()    #p-values of file-drawered studies
  #loop performing three tests on each simulated study
  for (simk in 1:simtot){
    #1) TWO LATENT VARIABLES
    y1=rnorm(n)
    y2=rnorm(n)
    #2) CREATE OBSERVABLE VARIABLES
    #noise to be added to each latent variable
    r11=rnorm(n)
    r12=rnorm(n)
    r13=rnorm(n)
    r21=rnorm(n)
    r22=rnorm(n)
    r23=rnorm(n)
    #observed variable = latent + noise
    #to achieve r(x,y)=rho one uses the formula below (a well-known formula for creating correlated random variables)
    #see e.g., http://www.sitmo.com/article/generating-correlated-random-numbers/
    #noisy measures of y1
    x11=rho*y1+sqrt((1-rho**2))*r11
    x12=rho*y1+sqrt((1-rho**2))*r12
    x13=rho*y1+sqrt((1-rho**2))*r13
    #noisy measures of y2, adding the true effect size
    x21=rho*y2+sqrt((1-rho**2))*r21+trued
    x22=rho*y2+sqrt((1-rho**2))*r22+trued
    x23=rho*y2+sqrt((1-rho**2))*r23+trued
    #STEP 3: do the 3 t-tests
    #With if-statements one could run a test only if the previous one was not significant; that seemed
    #harder to code and possibly slower to run, so all three are run and the selection happens below.
    #As is the case throughout, we are interested in directional predictions tested with two-sided tests
    #(this helps false-positive rates be more comparable with power), so we do a one-sided t-test and
    #consider it significant if p<.025.
    p1=t.test(x11,x21,alternative="less")$p.value
    p2=t.test(x12,x22,alternative="less")$p.value
    p3=t.test(x13,x23,alternative="less")$p.value
    #STEP 4: report the first test that is significant, or file-drawer the study
    pi=p1                                  #pi is the reported p-value of this simulated study; start with the 1st test
    if (pi>.025 & p2<.025) pi=p2           #if the 1st is n.s. but the 2nd is significant, report the 2nd
    if (pi>.025 & p3<.025) pi=p3           #if the 1st and 2nd are n.s. but the 3rd is significant, report the 3rd
    #STEP 5: publish if significant, file-drawer otherwise
    if (pi<=.025) {p_publish=c(p_publish,pi)}   #publish
    if (pi>.025)  {p_drawer=c(p_drawer,pi)}     #drawer otherwise
  }
  #this closes the loop over simtot simulations; next we aggregate all the results into a meta-analysis

  #published and drawered tests
  df=2*n-2                       #d.f. needed for various calculations, easier to compute once here
                                 #note that in this function n is a scalar, in others it is a vector
  #t-values
  t_publish=qt(1-p_publish,df)   #creates a vector with significant t-values
  t_drawer =qt(1-p_drawer,df)    #creates a vector with n.s. t-values
  t_all=c(t_publish,t_drawer)    #combines them both
  #d-values
  d_publish=(2*t_publish)/sqrt(df+2)   #to obtain the naive and omniscient estimates we convert t-->d
  d_drawer=(2*t_drawer)/sqrt(df+2)
  d_all=c(d_publish,d_drawer)
  #estimate effect size from p-curve
  dp=optimize(loss,c(-.3,2),df_obs=df,t_obs=t_publish)$minimum   #for details see the comments in peekd() above
  #average effect sizes
  naive=mean(d_publish)
  god  =mean(d_all)
  #print results
  cat("\n # of p<.05:", length(p_publish))
  cat("\n # of p>.05:", length(p_drawer))
  cat("\n p-curve estimate:",dp)
  cat("\n Naive estimate:",naive)
  cat("\n God's estimate:",god)
  cat("\n =========================================== \n")
  #"SAVE" them to the vector with all results
  return(c(naive,dp,god))
}

r1=fig3b(simtot=20000,n=20,rho=.7,trued=0)
r2=fig3b(simtot=15000,n=20,rho=.7,trued=.2)
r3=fig3b(simtot=15000,n=20,rho=.7,trued=.4)
r4=fig3b(simtot=10000,n=20,rho=.7,trued=.6)
r5=fig3b(simtot=10000,n=20,rho=.7,trued=.8)
figure3b=matrix(c(r1,r2,r3,r4,r5),5,3,byrow=TRUE)
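#####################################################################################################
# NOTE: editorial sanity check, not part of the original code. It verifies numerically the correlation
# structure used in fig3b: each observed x is rho*y + sqrt(1-rho^2)*noise, so cor(x,y) is about rho
# and cor(x1,x2) is about rho^2 (about .49 when rho=.7). Names ending in "_chk" are illustrative only.
#####################################################################################################
rho_chk=.7
y_chk=rnorm(100000)
x1_chk=rho_chk*y_chk+sqrt(1-rho_chk**2)*rnorm(100000)
x2_chk=rho_chk*y_chk+sqrt(1-rho_chk**2)*rnorm(100000)
print(round(c(r_xy=cor(x1_chk,y_chk),r_x1x2=cor(x1_chk,x2_chk)),2))   #approximately .70 and .49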
#####################################################################################################
# FIGURE 3C - P-HACKING BY DROPPING OUTLIERS
#####################################################################################################
#This function takes a vector and excludes observations more than xsd SDs away from the local mean
noout=function(y,xsd) return(subset(y,abs((y-mean(y))/sd(y))<xsd))

#EXPLANATION:
# Each simulated study has two cells of n observations. The researcher first runs the t-test on the
# full data; if it is n.s., she re-runs it dropping outliers (observations more than xsd SDs from
# their cell mean) from one cell, from the other, and from both. The first significant test is
# published; if none is significant, the study is file-drawered.
outlierd=function(simtot,n,xsd,trued) {
  #SYNTAX:
  # simtot: number of simulations
  # n:      sample size of each cell
  # xsd:    observations more than xsd SDs from their cell mean are dropped
  # trued:  true effect size
  t1=Sys.time()     #keep track of when the simulation started
  #initialize vectors where results will be stored
  p_publish=c()     #p-values of published studies
  p_drawer=c()      #p-values of file-drawered studies
  df_publish=c()    #d.f. of published studies (they vary with how many outliers are dropped)
  df_drawer=c()     #d.f. of file-drawered studies
  for (simk in 1:simtot){
    #raw data
    y1=rnorm(n)          #control condition
    y2=rnorm(n)+trued    #treatment condition
    #data without outliers
    x1=noout(y1,xsd)
    x2=noout(y2,xsd)
    #four t-tests: full data, dropping outliers from one cell, from the other, or from both
    test1=t.test(y1,y2,alternative="less",var.equal=TRUE)
    test2=t.test(x1,y2,alternative="less",var.equal=TRUE)
    test3=t.test(y1,x2,alternative="less",var.equal=TRUE)
    test4=t.test(x1,x2,alternative="less",var.equal=TRUE)
    p1=test1$p.value
    p2=test2$p.value
    p3=test3$p.value
    p4=test4$p.value
    df1=as.numeric(test1$parameter)
    df2=as.numeric(test2$parameter)
    df3=as.numeric(test3$parameter)
    df4=as.numeric(test4$parameter)
    #report the first significant test; otherwise file-drawer the study
    pi=p1                                     #start with the test on the full data
    dfi=df1
    if (pi>.025 & p2<.025) {pi=p2; dfi=df2}
    if (pi>.025 & p3<.025) {pi=p3; dfi=df3}
    if (pi>.025 & p4<.025) {pi=p4; dfi=df4}
    if (pi<=.025) {
      p_publish=c(p_publish,pi)
      df_publish=c(df_publish,dfi)}           #publish if significant
    if (pi>.025) {
      p_drawer=c(p_drawer,pi)
      df_drawer=c(df_drawer,dfi)}             #drawer otherwise
  }
  #published and drawered tests
  #t-values
  t_publish=qt(1-p_publish,df_publish)
  t_drawer =qt(1-p_drawer,df_drawer)
  #d-values
  d_publish=(2*t_publish)/sqrt(df_publish+2)
  d_drawer=(2*t_drawer)/sqrt(df_drawer+2)
  d_all=c(d_publish,d_drawer)
  #d.f. and n
  df_all=c(df_publish,df_drawer)
  n_all=(df_all+2)/2
  n_publish=(df_publish+2)/2
  #variance formula for d
  vard_all=(2/n_all+(d_all**2)/(4*n_all-4))*(2*n_all/(2*n_all-2))
  vard_publish=(2/n_publish+(d_publish**2)/(4*n_publish-4))*(2*n_publish/(2*n_publish-2))
  #estimate effect size using all studies
  god=rma(yi=d_all,vi=vard_all,method="FE")$b
  #significant subset only
  naive=rma(yi=d_publish,vi=vard_publish,method="FE")$b
  #estimate effect size from p-curve
  dp=optimize(loss,c(-.3,2),df_obs=df_publish,t_obs=t_publish)$minimum
  #print results
  t2=Sys.time()
  cat("\n # of p<.05:", length(p_publish))
  cat("\n # of p>.05:", length(p_drawer))
  cat("\n p-curve estimate:",dp)
  cat("\n Naive estimate:",naive)
  cat("\n God's estimate:",god)
  cat("\n =========================================== \n")
  print(round(t2-t1,digits=1))
  cat("\n =========================================== \n")
  cat("\n Distribution of d.f. of the tests of p<.05 studies \n")
  print(table(df_publish))
  #"SAVE" them to the vector with all results
  return(c(naive,dp,god))
}

#Here outliers are defined as 10 SDs away, i.e., nothing is excluded, to make sure the code runs properly
r1=outlierd(simtot=1000,n=20,xsd=10,trued=0)
r2=outlierd(simtot=50000,n=20,xsd=10,trued=0.2)
r3=outlierd(simtot=50000,n=20,xsd=10,trued=0.4)
r4=outlierd(simtot=50000,n=20,xsd=10,trued=0.6)
r5=outlierd(simtot=50000,n=20,xsd=10,trued=0.8)

#Here exclusion is based on 2 SDs, the actual simulation of interest
r6=outlierd(simtot=10000,n=20,xsd=2,trued=0)
r7=outlierd(simtot=10000,n=20,xsd=2,trued=0.2)
r8=outlierd(simtot=10000,n=20,xsd=2,trued=0.4)
r9=outlierd(simtot=10000,n=20,xsd=2,trued=0.6)
r10=outlierd(simtot=10000,n=20,xsd=2,trued=0.8)
figure3c=matrix(c(r6,r7,r8,r9,r10),5,3,byrow=TRUE)
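#####################################################################################################
# NOTE: editorial plotting sketch, not the original figure code. It assumes the true effect sizes
# c(0,.2,.4,.6,.8) used in the calls above and the column order returned by each function:
# for figure3b and figure3c the columns are (naive, p-curve, omniscient); for mpeek the first
# column is trim-and-fill rather than naive.
#####################################################################################################
d_true=c(0,.2,.4,.6,.8)
matplot(d_true,figure3c,type="b",pch=1:3,lty=1,
        xlab="True effect size (d)",ylab="Estimated effect size",
        main="Sketch of Figure 3C: selective outlier exclusion")
abline(0,1,lty=2)   #45-degree line: an unbiased estimator falls on it
legend("topleft",legend=c("Naive","p-curve","Omniscient"),pch=1:3,lty=1,col=1:3,bty="n")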