#EXAMPLE

#source("http://www.picb.ac.cn/Comparative/data_methods/neoteny/somel_R_functions_1.r")
#library(affy)

#a = ReadAffy()
#a = prepare.CDF.fromTabTable.1.f(a, cdfTableLocation="HGU133Plus2_Hs_ENSG_11.0.0_h18c2r2umask_cdfTable.tsv", cdfName="HGU133Plus2_Hs_ENSG_11.0.0_h18c2r2umask", exclude.single=T, grouping="probeset", min.number=8)
#ExpMat1 = exprs(rma(a))
#DetMat1 = mas5calls.AffyBatch.2(a)

#Humans = c("Hs_pfc_1.CEL", "Hs_pfc_2.CEL", "Hs_pfc_3.CEL", "Hs_pfc_4.CEL", "Hs_pfc_5.CEL", "Hs_pfc_8.CEL", "Hs_pfc_13.CEL", "Hs_pfc_19.CEL", "Hs_pfc_20.CEL", "Hs_pfc_22.CEL", "Hs_pfc_23.CEL", "Hs_pfc_24.CEL", "Hs_pfc_35.CEL", "Hs_pfc_37.CEL")
#Chimps = c("PanTro_pfc_1.CEL", "PanTro_pfc_2.CEL", "PanTro_pfc_3.CEL", "PanTro_pfc_4.CEL", "PanTro_pfc_5.CEL", "PanTro_pfc_6.CEL", "PanTro_pfc_7.CEL", "PanTro_pfc_8.CEL", "PanTro_pfc_9.CEL", "PanTro_pfc_10.CEL", "PanTro_pfc_11.CEL", "PanTro_pfc_12.CEL", "PanTro_pfc_13.CEL", "PanTro_pfc_14.CEL")
#Macaques = c("RheMac_pfc_1.CEL", "RheMac_pfc_2.CEL", "RheMac_pfc_3.CEL", "RheMac_pfc_4.CEL", "RheMac_pfc_5.CEL", "RheMac_pfc_6.CEL", "RheMac_pfc_7.CEL", "RheMac_pfc_8.CEL", "RheMac_pfc_9.CEL")

#Ages1 = c(39, 54, 56, 62, 89, 139, 332, 1969, 2920, 4213, 4534, 4733, 13138, 15672, 0, 1, 8, 40, 186, 45, 525, 2447, 2313, 4361, 4415, 4480, 12784, 16131, 467, 1760, 1269, 2816, 2814, 4894, 6230, 6230, 6524)
#Ages1 = Ages1 + c(rep(280, 14), rep(280, 14), rep(165, 9))		#add inferred conception times
#Ages1 = log2(Ages1)
#names(Ages1) = c(Humans, Chimps, Macaques)

#P = 0.05; PROP = 3		#proportion of individuals passing detection in each species
#Detected1 = apply(DetMat1 < 0.05, 1, function(y) (sum(y[Humans]) > l(Humans)/PROP | sum(y[Chimps]) > l(Chimps)/PROP))

#comparative.expression.tests.1.f(ExpMat = ExpMat1[1:20,], Ages = Ages1, RefSpecies = Humans, Species2 = Chimps, Species3 = Macaques, Detected = Detected1[1:20], RefSpeciesname = "H", Species2name = "C", Species3name = "R", Name="Experiment1", FileName="Experiment1.RData")


#############################################################
#FUNCTIONS TO PREPARE CDF ENVIRONMENTS FROM PROBE TABLES

prepare.CDF.fromTabTable.1.f = function (
#courtesy of Michael Lachmann
  A, 
  parent=globalenv(), 
  cdfTableLocation="MyPath/MyCDFTable", #path and name
  cdfName="mycdfname",
  exclude.single=TRUE, #should we exclude groupings (e.g. probesets) with single probes?
  grouping="probeset", #type of grouping the probes: probe, probeset, exon, etc. 
  min.number = 8			 #minimum number of probe sets 
  )
{
  require(affy)
  CDFenv = cdf.env.fromTable( A@nrow, cdfTableLocation, EXCLUDE.SINGLE = exclude.single, GROUPING = grouping, MIN.NUMBER = min.number)
  A@cdfName = cdfName
  assign( cdfName(A), CDFenv, pos=globalenv())
  A
}


cdf.env.fromTable = function (
  chip.size,
  cdfTableLocation,
  env = new.env(hash=T,parent=baseenv()), 
  make.env = TRUE,
  EXCLUDE.SINGLE,    #should we exclude groupings with single probes?
  GROUPING,	 #type of grouping the probes: probe, probeset, exon, etc.
  MIN.NUMBER = 8			#minimum number of probes per probeset to be accepted
)  
{
  Ftab = read.table(cdfTableLocation, as.is=T, header=T, comment.char="") 
  cnFtab = colnames(Ftab)
  
  if (ncol(Ftab) > 3) {	#if there are more than 3 columns, choose the appropriate ones given GROUPING
	Ftab = Ftab[, cnFtab %in% c(GROUPING, "x", "y", "px", "py", "mx", "my")]
	cnFtab = colnames(Ftab)
  }
  
  
  if (EXCLUDE.SINGLE) {
    gn = Ftab[,1]	#grouping names
    tgn = table(gn)
    include = gn %in% names(tgn)[tgn > MIN.NUMBER-1]	#groupings with >= MIN.NUMBER probes
    if (sum(include) == 0) {  	#if the CDF table is based on probes 
      stop("Wrong choice: Change exclude.single to FALSE!")
    } else  cat(paste("Excluded ", length(gn) - sum(include), " groupings with less than ", MIN.NUMBER, " probes\n", sep=""))
  Ftab = Ftab[include,] 
  }
  
  if (ncol(Ftab)==3) {
    cat("Inferring mismatch positions from perfect match\n")
    if (sum(cnFtab == c(GROUPING, "px", "py")) != ncol(Ftab))
    cat(paste("Renaming columns: ", paste(cnFtab, collapse=" "), "as", GROUPING, "px py \n", collapse=" "))
	colnames(Ftab) = c(GROUPING, "px", "py")
    nn.pm=Ftab$"px"+ chip.size*Ftab$"py"+1
    nn.mm=Ftab$"px" + chip.size*(Ftab$"py"+1)+1
  } else if (ncol(Ftab)==5) {
    if (sum(colnames(Ftab) == c(GROUPING, "px", "py", "mx", "my")) != ncol(Ftab))
    cat(paste("Renaming columns: ", paste(colnames(Ftab), collapse=" "), "as", GROUPING, "px py mx my\n", collapse=" ")) 
    colnames(Ftab) = c(GROUPING, "px", "py", "mx", "my")
    nn.pm=Ftab$"px"+ chip.size*Ftab$"py"+1
    nn.mm=Ftab$"mx" + chip.size*Ftab$"my"+1
  } else {cat("Wrong table format! Stopping! \n"); stop}
  
  pmm = cbind(nn.pm,nn.mm)
  colnames(pmm) = c("pm","mm")

  if (length(unique(Ftab[,1])) != length(Ftab[,1])) {
    cat("Regular run: more than one probe per grouping\n")
    l=tapply(1:(dim(pmm)[1]),Ftab[,1],
      function(v) pmm[v,,drop=FALSE])
    cat("Made list.\n")
    if( make.env ) {
      cat("Making env\n")
      nn=names(l)
      for(i in(1:length(nn) ) ) {
        assign(nn[i],l[[i]],pos=env,inherit=F)
      }
      cat("finished making CDF!\n")
      return(env)
    } else {
      cat("finished making CDF!\n")
      return(l)
    }
  } else {
    cat("One probe per grouping\n")
    if( make.env ) {
      cat("Making env\n")
      nn=as.character(Ftab[,1])
      for(i in 1:length(nn)) {
        assign(nn[i],pmm[i,,drop=FALSE],pos=env,inherit=F)
      }
      cat("finished making CDF!\n")
      return(env)
    } else {
      nn=as.character(Ftab[,1])
      l=lapply(1:length(nn), function(i) pmm[i,,drop=FALSE])
      nams(l) = nn
      return(l)
    }
                                      
  }
}


########################################################################
#TESTS FOR AGE EFFECTS, DIFFERENTIAL EXPRESSION, HETEROCHRONY

comparative.expression.tests.1.f = function(
#Test to calculate age, differential expression, lineage and heterochrony tests, and sort genes into classes based on the test results. It stores all resulting objects in an R object (FileName): The test results, and the gene sets. The object with extension "_gene_sets" contains the gene sets.
  ExpMat, 						#Expression data matrix, with samples (arrays) in columns
  Ages, 							#numeric vector of ages. the names of the vector should be the same as column names in ExpMat
  RefSpecies, 				#character vector, names of samples belonging to the reference species (e.g. human)
  Species2, 					#character vector, names of samples belonging to the second species (e.g. chimp)
  Species3 = c(), 		#character vector, names of samples belonging to the second species (e.g. rhesus) 
  RefSpeciesname, 		#character vector (e.g. "Human")
  Species2name, 			#character vector (e.g. "Chimp")
  Species3name, 			#character vector (e.g. "Rhesus")
  P_age=0.05,         #age-rel test p-value cutoff
  P_sp=0.05,          #species diff test p-value cutoff
  P_lin=0.05,         #lineage test p-value cutoff
  P_het=0.05,         #heterochrony test p-value cutoff
  Est.Genes=TRUE,     #Estimate gene sets and divergence based on these cutoffs?
  Detected=c(),				#logical vector, same length as the rows of ExpMat, indicating whether genes in ExpMat are expressed or not.
  Name = "Experiment1", 							#character vector to name objects 
  FileName = "~/Experiment1/RData"		#location to save the R data file
) {

	rnExpMat = rn(ExpMat); 
	if (l(Detected) == 0) {
		Detected = rep(T, l(rnExpMat))
	}

	#RUNNING TESTS FOR AGE, SPECIES, HETEROCHRONY AND LINEAGE	

	#Test for age effect - polynomial regression
	Exp_a1 = list()
	Exp_a1[[1]] = age.test.polynomials.1.f(Mat = ExpMat[,RefSpecies], Ages = Ages[RefSpecies], pvals=T, PCUTOFF=1);		#age test for the reference species
	Exp_a1[[2]] = age.test.polynomials.1.f(Mat = ExpMat[,Species2], Ages = Ages[Species2], pvals=T, PCUTOFF=1);		#age test for the second species

	#Test for age effect - splines
	Exp_s1 = list()
	Exp_s1[[1]] = tryCatch( age.test.splines.1.f(Mat = ExpMat[,RefSpecies], Ages = Ages[RefSpecies], DEGREE=8), error = function(e) NA) 		#testing the reference species
	Exp_s1[[2]] = tryCatch( age.test.splines.1.f(Mat = ExpMat[,Species2], Ages = Ages[Species2], DEGREE=8)	, error = function(e) NA)		#testing the second species
	cat ("\tage tests finished\n")

	#Test for differential expression between reference species and 2nd species
	Exp_diff1 = tryCatch( species.test.multiregression.1.f(E = ExpMat, x = Ages, RefSpecies = RefSpecies, Species2 = Species2, EACH = TRUE, Ready.Bm=Exp_a1[[1]][,1])	, error = function(e) NA)	#multiple regression test for differential expression in age-related genes
	Exp_diff2 = tryCatch( apply(ExpMat, 1, function(y) t.test(y[RefSpecies], y[Species2], alt="t")$p.val) , error = function(e) NA)		#t-test p-values for non-age-related genes
	cat ("\tspecies difference tests finished\n")

	#Test for heterochrony between reference species and 2nd species
	Exp_het1 = tryCatch( heterochrony.test.1.f(E = ExpMat, x = Ages, RefSpecies = RefSpecies, Species2 = Species2, method = "spline", Ready.Bm=Exp_s1[[1]])	, error = function(e) NA); 
	cat ("\theterochrony test finished\n")		#estimating the age-shift and expression-shift between the reference and the second species
	Exp_het1_ws = suppressWarnings( tryCatch( heterochrony.summary.2.f(heterochrony.summary.1.f(Exp_het1)), error = function(e) NA));		#summarizing the heterochrony.test.1.f result, using "condition set 1" described in the Supp Info.

	#Estimating expression divergence between the reference species and 2nd species across ages
	Agesp = seq(max(c(min(Ages[RefSpecies]), min(Ages[Species2]))), min(c(max(Ages[RefSpecies]), max(Ages[Species2]))), l=20)
	Exp_seq1 = tryCatch( curve.estimate.f (ExpMat, Ages, Agesp, RefSpecies, Species2, Exp_a1), error = function(e) NA)
  cat ("\tdivergence estimation finished\n")

	#Test for lineage assignment of expression differences, using the 3rd species as outgroup
  if (l(Species3) != 0) {
    Exp_lin1 = tryCatch( lineage.test.1.f(E = ExpMat, x = Ages, Species1 = RefSpecies, Species2 = Species2, Species3 = Species3, Species1name = RefSpeciesname, Species2name = Species2name, Species3name = Species3name, bmmat1 = Exp_a1[[1]], bmmat2 = Exp_a1[[2]]), error = function(e) NA);
    cat ("\tlineage test finished\n")
  }

	###
	#ESTIMATING GENE SETS
  if (Est.Genes & l(Species3) != 0) {
    All = rnExpMat[Detected]
    GAp = rnExpMat[Exp_a1[[1]][,2] < P_age & Detected]
    GAm = rnExpMat[Exp_a1[[1]][,2] >= P_age & Detected]
    GApSp = rnExpMat[Exp_diff1[,1] < P_sp & Exp_a1[[1]][,2] < P_age & Detected]
    GApSm = rnExpMat[Exp_diff1[,1] >= P_sp & Exp_a1[[1]][,2] < P_age & Detected]
    GAmSp = rnExpMat[Exp_diff2 < P_sp & Exp_a1[[1]][,2] >= P_age & Detected]
    GAmSm = rnExpMat[Exp_diff2 >= P_sp & Exp_a1[[1]][,2] >= P_age & Detected]
    GSp = c(GApSp, GAmSp)
    GHl = intersect(rn(Exp_het1_ws)[ Exp_het1_ws[,"pval"] < P_het & Exp_het1_ws[,"age_shift_est"] > 1 ], All) 
    GHe = intersect(rn(Exp_het1_ws)[ Exp_het1_ws[,"pval"] < P_het & Exp_het1_ws[,"age_shift_est"] < 1 ], All)
    GHp = rn(Exp_lin1)[ Exp_lin1[,"pval"] < P_lin/2 & Detected ]
    GHm = rn(Exp_lin1)[ Exp_lin1[,"pval"] > (1 - P_lin/2) & Detected ]
    HlHp = llin(list(GHl, GSp, GHp))
    HeHp = llin(list(GHe, GSp, GHp))
    HlHm = llin(list(GHl, GSp, GHm))
    HeHm = llin(list(GHe, GSp, GHm))
    
    PhyOntList_1 = list(HlHp, HeHp, HlHm, HeHm, GAp, GAm, GSp, GApSp, GHp, GHm, GHl, GHe)
    PhyOntList_1 = lapply(PhyOntList_1, as.character)
    names(PhyOntList_1) = c("Human-neotenic", "Human-accelerated", "Chimp-accelerated", "Chimp-neotenic", "Age_related", "Non_age_related", "Differentially_expressed", "Age_rel_and_diff_expr", "On_human_lineage", "On_chimp_lineage", "Delayed_in_human", "Delayed_in_chimp")

		#ESTIMATING EXPRESSION DIVERGENCE BETWEEN REFERENCE AND 2ND SPECIES ACROSS AGES    
    Exp_div1 = lapply(PhyOntList_1[c("Human-neotenic", "Human-accelerated", "Chimp-accelerated", "Age_rel_and_diff_expr")], function(x) {
      t(sapply(Exp_seq1[x], function(xx) (abs(xx[[1]] - xx[[2]])))) })
    Exp_div1_boots = tryCatch( bootstrap.summary.1.f(Exp_div1, BOOTS=100) , error = function(e) NA)
  
    vallist = list(Exp_a1, Exp_s1, Exp_diff1, Exp_diff2, Exp_lin1, Exp_het1, Exp_seq1, Exp_het1_ws, PhyOntList_1, Exp_div1_boots, Ages, Agesp)
    namelist = c("age_test_polynomial", "age_test_splines", "species_test_multireg", "species_test_ttest", "lineage_test", "heterochrony_test", "expression_age_curves", "heterochrony_test_summary", "gene_sets", "divergence_across_age", "ages", "ages_predicted")

  } else {

    vallist = list(Exp_a1, Exp_s1, Exp_diff1, Exp_diff2, Exp_het1, Exp_het1_ws, Exp_seq1, Ages, Agesp)
    namelist = c("age_test_polynomial", "age_test_splines", "species_test_multireg", "species_test_ttest", "heterochrony_test", "heterochrony_test_summary", "expression_age_curves", "ages", "ages_predicted")
  
  }

  for (i in 1:l(vallist)) {
    val = vallist[[i]]
    assign(paste(Name, "_", namelist, sep="")[i], val)
  }
	save(list=grep(Name, ls(), value=TRUE), file=FileName)
}


###################################################################
#FUNCTION FOR TESTING POLYNOMIAL MODELS WITH AGE

age.test.polynomials.1.f = function(Mat, Ages, verbose=F, pvals=FALSE, PCUTOFF=0.05) {
#tries different models using one variable up to 3rd degree. Returns the index of the best model in Models[[3]]
   parI = list(Ages, Ages^2, Ages^3) 
   Genes = nrow(Mat); PI = length(parI); LI = 2^ (PI); Models = Models[[3]]
   LmListInt1 = function (y, par, m) 
   { lm(y ~ I(par[[1]] * m[1]) + I(par[[2]] * m[2]) + I(par[[3]] * m[3])) }
   if (pvals==FALSE) {
      Result = apply(Mat, 1, function(y) {
         lr = lapply(1:8, function(j) {LmListInt1(y, parI, Models[j,])})
         mod = sapply(2:8, function(k) {anova(lr[[1]], lr[[k]])$Pr[2]})
         mod[is.na(mod)] = 1
         ifelse(min(mod) <= PCUTOFF, which.min(mod)+1, 1)
      })
      names(Result) = rownames(Mat)
   } else {
      Result = t(apply(Mat, 1, function(y) {
         lr = lapply(1:8, function(j) {LmListInt1(y, parI, Models[j,])})
         mod = sapply(2:8, function(k) {anova(lr[[1]], lr[[k]])$Pr[2]})
         mod[is.na(mod)] = 1
         c(ifelse(min(mod) <= PCUTOFF, which.min(mod)+1, 1), min(mod))
      }))
      rownames(Result) = rownames(Mat)
      colnames(Result) = c("Model", "P-value")
   }
   return(Result)
}

polynomial.model.predict.f = function(Best_model, x, y, low, high, Sequence=NULL, Model=3, Fit=F) {
#function to make predictions (between values "low" and "high") or calculate models based on best polynomial regression models estimated in age.test.polynomials.1.f
   if (Model == 3) Best_model = 1 + (Best_model - 1)*4 
   if (Fit == F) {
      if (!length(Sequence)) {
         Seq = seq(low,high,by=0.1)
      } else {
         Seq = Sequence
      }
      if (Best_model == 1) yp = rep(mean(y), length(Seq)) 
      if (Best_model == 5) yp = predict(lm(y ~ I(x^3)), data.frame(x=Seq)) #predicted values for a range of ages
      if (Best_model == 9) yp = predict(lm(y ~ I(x^2)), data.frame(x=Seq))
      if (Best_model == 13) yp = predict(lm(y ~ I(x^2) + I(x^3)), data.frame(x=Seq))
      if (Best_model == 17) yp = predict(lm(y ~ x), data.frame(x=Seq))
      if (Best_model == 21) yp = predict(lm(y ~ x + I(x^3)), data.frame(x=Seq))
      if (Best_model == 25) yp = predict(lm(y ~ x + I(x^2)), data.frame(x=Seq))
      if (Best_model == 29) yp = predict(lm(y ~ x + I(x^2) + I(x^3)), data.frame(x=Seq))

   } else {
      if (Best_model == 1) yp = rep(mean(y), length(x)) 
      if (Best_model == 5) yp = lm(y ~ I(x^3))$fit #predicted values for a range of ages
      if (Best_model == 9) yp = lm(y ~ I(x^2))$fit
      if (Best_model == 13) yp = lm(y ~ I(x^2) + I(x^3))$fit
      if (Best_model == 17) yp = lm(y ~ x)$fit
      if (Best_model == 21) yp = lm(y ~ x + I(x^3))$fit
      if (Best_model == 25) yp = lm(y ~ x + I(x^2))$fit
      if (Best_model == 29) yp = lm(y ~ x + I(x^2) + I(x^3))$fit
   }
   return(yp)
}


###################################################################
#FUNCTION FOR TESTING SPLINE MODELS WITH AGE
age.test.splines.1.f = function(Mat, Ages, DEGREE=8, VERBOSE=F) {
#test smooth spline models with different degrees of freedom (2:8) and get the one with the lowest F-test p-value
   x = Ages
   Res = t(sapply(1:nrow(Mat), function(i) {
      if (VERBOSE == TRUE) print(i)
      y = Mat[i, ]
      null = lm(y ~ 1)
      res1 = sum(null$resi^2)
      df1 = null$df
      res2l = sapply(2:DEGREE, function(i) 
         sum((predict(smooth.spline(x, y, df=i), x)$y - y)^2))
      dfl = df1 - 1:(DEGREE-1)
      pvals = sapply(1:(DEGREE-1), function(i) {
         res2 = res2l[[i]]
         df2 = dfl[[i]]
         fstat = ((res1 - res2)/(df1 - df2))/(res2/df2)
         pf(q = fstat, df1 = df1-df2, df2 = df2, lower.tail = F)
      })
      c(which.min(pvals)+1, pvals[which.min(pvals)])
   }))
   rownames(Res) = rownames(Mat)
   colnames(Res) = c("Model", "P-value")
   Res
}


###################################################################
#FUNCTIONS FOR TESTING DIFFERENTIAL EXPRESSION 

species.test.multiregression.1.f = function(
#F-test for species difference, where the regression curve is chosen for refspecies. 
   E,           #expression matrix
   x,           #ages
   RefSpecies,  #indicator of species 1
   Species2,    #indicator of species 2
   DEGREE = 3,            #degree of the polynomial regression model to use. obsolete. 
   EACH = TRUE,           #if EACH is TRUE a list of submodels are tested for species difference, where the second species can have its own parameters. If EACH is FALSE, only one model is tested, where the second species has its own parameters for each element of the model.
   Ready.Bm = c(),         #vector of previously calculated best expression-age regression models. if left empty, calculates the models de novo. 
   SIG.MODELS = TRUE       #should we only use significant expression-age models, or just the best model irrespective of significance
) {
   mff = age.test.polynomials.1.f; mo1 = multireg.models.1.f; 
   	if (EACH) mo2 = ancova.models.02.f else mo2 = ancova.models.01.f

	 if (l(Ready.Bm) != 0) {					#find best model or get it
		 BmMat = Ready.Bm
	 } else { BmMat = mff(E[,RefSpecies], x[RefSpecies], verbose=F, PCUTOFF=ifelse(SIG.MODELS, 0.05, 1)) }									

   Sp = factor(c(rep(1, l(RefSpecies)), rep(0, l(Species2))))
   XYZ = c(RefSpecies, Species2)
   Res = t(sapply(1:nrow(E), function(i) {
      y = E[i,]
      Bm = BmMat[i]                                             
      null = mo1(Bm, x[XYZ], y[XYZ])
      altlist = mo2(Bm, x[XYZ], y[XYZ], Sp)
      if (EACH & Bm != 1) pval = sapply(altlist, function(alt) anova(alt, null)$Pr[2]) else pval = anova(altlist, null)$Pr[2]
      c(min(pval), Bm, which.min(pval))
   }))
   rownames(Res) = rownames(E)
   colnames(Res) = c("P-value", "Multireg_Model", "Ancova_Model")
   return(Res)
}

multireg.models.1.f = function(BestModels, x, y)  {
   if (BestModels == 1) m2 = lm(y ~ rep(1, l(x)) * 1) 
   if (BestModels == 2) m2 = lm(y ~ I(x^3) * 1)
   if (BestModels == 3) m2 = lm(y ~ I(x^2) * 1)
   if (BestModels == 4) m2 = lm(y ~ I(x^2) * 1 + I(x^3) * 1)
   if (BestModels == 5) m2 = lm(y ~ I(x) * 1)
   if (BestModels == 6) m2 = lm(y ~ I(x) * 1 + I(x^3) * 1)
   if (BestModels == 7) m2 = lm(y ~ I(x) * 1 + I(x^2) * 1)
   if (BestModels == 8) m2 = lm(y ~ I(x) * 1 + I(x^2) * 1 + I(x^3) * 1)
   m2
}

ancova.models.01.f = function(BestModels, x, y, Sp)  {
#each species has specific slope and intercept
   if (BestModels == 1) m2 = lm(y ~ rep(1, l(x)) * Sp) 
   if (BestModels == 2) m2 = lm(y ~ I(x^3) * Sp)
   if (BestModels == 3) m2 = lm(y ~ I(x^2) * Sp)
   if (BestModels == 4) m2 = lm(y ~ I(x^2) * Sp + I(x^3) * Sp)
   if (BestModels == 5) m2 = lm(y ~ I(x) * Sp)
   if (BestModels == 6) m2 = lm(y ~ I(x) * Sp + I(x^3) * Sp)
   if (BestModels == 7) m2 = lm(y ~ I(x) * Sp + I(x^2) * Sp)
   if (BestModels == 8) m2 = lm(y ~ I(x) * Sp + I(x^2) * Sp + I(x^3) * Sp)
   m2
}

ancova.models.02.f = function(BestModels, x, y, Sp)  {
#a list of models where each species *may have* a specific slope and intercept
   Sp2 = rep(1, l(Sp))
   if (BestModels == 1) 
      m2 = lm(y ~ rep(1, l(x)) * Sp) 
   if (BestModels == 2) 
      m2 = lapply(1:2, function(h) {modlist = list(Sp2,Sp)[(Models[[1]]+1)[h,]]; lm(y ~ I(x^3) * modlist[[1]] + Sp)})
   if (BestModels == 3) 
      m2 = lapply(1:2, function(h) {modlist = list(Sp2,Sp)[(Models[[1]]+1)[h,]]; lm(y ~ I(x^2) * modlist[[1]] + Sp)})
   if (BestModels == 4) 
      m2 = lapply(1:4, function(h) {modlist = list(Sp2,Sp)[(Models[[2]]+1)[h,]]; lm(y ~ I(x^2) * modlist[[1]] + I(x^3) * modlist[[2]] + Sp)})
   if (BestModels == 5) 
      m2 = lapply(1:2, function(h) {modlist = list(Sp2,Sp)[(Models[[1]]+1)[h,]]; lm(y ~ I(x) * modlist[[1]] + Sp)})
   if (BestModels == 6) 
      m2 = lapply(1:4, function(h) {modlist = list(Sp2,Sp)[(Models[[2]]+1)[h,]]; lm(y ~ I(x) * modlist[[1]] + I(x^3) * modlist[[2]] + Sp)})
   if (BestModels == 7) 
      m2 = lapply(1:4, function(h) {modlist = list(Sp2,Sp)[(Models[[2]]+1)[h,]]; lm(y ~ I(x) * modlist[[1]] + I(x^2) * modlist[[2]] + Sp)})
   if (BestModels == 8) 
      m2 = lapply(1:8, function(h) {modlist = list(Sp2,Sp)[(Models[[3]]+1)[h,]]; lm(y ~ I(x) * modlist[[1]] + I(x^2) * modlist[[2]] + I(x^3) * modlist[[3]] + Sp)})
   m2
}


###################################################################
#LINEAGE TEST - FUNCTION TO ASSIGN EXPRESSION DIFFERENCES TO A LINEAGE
lineage.test.1.f = function(E, x = Ages, Species1, Species2, Species3, Species1name, Species2name, Species3name, bmmat1, bmmat2) {
#compares two species' curves with an outgroup's expression values, and decides on which of the two is closer to the outgroup
	TRes = t(sapply(1:nrow(E), function(i) { 
		 x3 = x[Species3]; y3 = E[i,Species3]; 
		 x1 = x[Species1]; y1 = E[i,Species1]; bm1 = ifelse(bmmat1[i,2] < 0.05, bmmat1[i,1], 1)
		 yp1 = polynomial.model.predict.f(bm1, x1, y1, Seq=x3, Model=3)
		 x2 = x[Species2]; y2 = E[i,Species2]; bm2 = ifelse(bmmat2[i,2] < 0.05, bmmat2[i,1], 1) 
		 yp2 = polynomial.model.predict.f(bm2, x2, y2, Seq=x3, Model=3)
		 sum1 = sum(abs(yp1 - y3) > abs(yp2 - y3))
		 pval1 = suppressWarnings( wilcox.test(abs(yp1 - y3), abs(yp2 - y3), paired=T, alt="g")$p.val )		 
		 Res = c(yp1, yp2, y3, sum1, pval1)
		 names(Res) = c(rep(Species1name, l(Species3)), rep(Species2name, l(Species3)), rep(Species3name, l(Species3)), "sum1", "pval")
		 Res }))
	rownames(TRes) = rownames(E)
	TRes
}


###################################################################
#HETEROCHRONY TEST
heterochrony.test.1.f = function(
   E,           #expression matrix
   x,           #ages
   RefSpecies,  #indicator of species 1
   Species2,    #indicator of species 2
   method = "spline", #which method to use to calculate best fitting curves, either "spline" or "lm" (polynomial regression).
   Ready.Bm = c(),         #matrix of previously calculated best expression-age regression models and p-values. if left empty, calculates the models de novo. Be careful to maintain that previously calculated models are of the same degree as specified in DEGREE!!!
   VERBOSE = FALSE
) {
#C is the age-shift estimate, M is the expression-shift estimate, 
   Res9 = lapply(1:nrow(E), function(i) {
      if (VERBOSE == TRUE) print(i)

      y = E[i,]         
      y1 = y[RefSpecies]   
      x1 = x[RefSpecies]
      y2 = y[Species2]
      x2 = x[Species2]

      SIGNIFICANT = FALSE
      if (method == "spline") {
         if (l(Ready.Bm) != 0) { Bm1 = Ready.Bm[i,] }                                                #get best model, or...
            else { Bm1 = age.test.splines.1.f(t(y1), x1, VERBOSE=F)[1,] }                     #find best model

         if (Bm1[2] < 0.05) {  
            SIGNIFICANT = TRUE                                                                       #if there is significant change with age
            dd = smooth.spline(x1, y1, df=Bm1[1])
            yp2 = predict(dd, x2)$y
      
            exp_comp = function(x2, y2, pC, pM) {                                                    #the function to optimize
               xp2 = x2 + pC
               yp2 = predict(dd, xp2)$y
               y2 - yp2 + pM
            }
         }
      }

      if (method == "lm") {
         mff = age.test.polynomials.1.f; mpf = polynomial.model.predict.f
         if (l(Ready.Bm) != 0) { Bm1 = Ready.Bm[i] }                                                  #get best model, or...
            else { Bm1 = mff(t(y1), x1, verbose=F)[1] }                                               #find best model

         if (Bm1 != 1) {   
            SIGNIFICANT = TRUE                                                                        #if there is significant change with age
            yp2 = mpf(Bm1, x1, y1, Sequence=x2)

            exp_comp = function(x2, y2, pC, pM) {                                                     #the function to optimize
               xp2 = x2 + pC
               yp2 = mpf(Bm1, x1, y1, Sequence=xp2)
               y2 - yp2 + pM
            }
         }
      }

      if (SIGNIFICANT == TRUE) {                                                                    #if there is significant change with age

         ResList1 = list(NA, NA, NA, NA)
         names(ResList1) = c("C", "CM", "M", "null")
   #### null
         res2 = sum((yp2 - y2)^2)                                                                        #the sum of squared residuals
         df2 = l(y2)                                                                                     #the degrees of freedom of the model
         ResList1[["null"]] = list(res2, df2)
         names(ResList1[["null"]]) = c("sumsq", "df")

   #### the A model (estimate only age-shift)
         Nls1 <- tryCatch(nls( ~ exp_comp(x2, y2, pC, pM=0), algorithm = "port", data = data.frame(cbind(x2, y2)), start = list(pC = 0), trace = F), error = function(e) NA)                              #estimate only pC
         cc = NA; names(cc)="pC"; res = res2 = df2 = pval = NA
         if (!is.na(Nls1[1])) {
			NlsConv = Nls1$convergence
            if (NlsConv==0) cc[1] = tryCatch(summary(Nls1)$coef[,1], error = function(e) NA);   #the optimal parameter value(s) from nls
            res = tryCatch(sm(Nls1)$resi, error = function(e) NA)                                        #the residuals from the model
            res2 = tryCatch(sum(res^2), error = function(e) NA)                                          #the sum of squared residuals
            df2 = tryCatch(sm(Nls1)$df[2], error = function(e) NA)                                       #the degrees of freedom of the model
            pval = f.pval(res1 = ResList1[["null"]][["sumsq"]], res2 = res2, df1 = ResList1[["null"]][["df"]], df2 = df2)   #the F-test p-values
         } else { NlsConv = NA }
         ResList1[["C"]] = list(res2, df2, cc, as.numeric(chartr("01","10", as.character(NlsConv))), pval, res)

   #### the B model (estimate only expression-shift)
         Nls1 <- tryCatch(nls( ~ exp_comp(x2, y2, pC=0, pM), algorithm = "port", data = data.frame(cbind(x2, y2)), start = list(pM = 0), trace = F), error = function(e) NA)                              #estimate only pM
         cc = NA; names(cc)="pM"; res = res2 = df2 = pval = NA
         if (!is.na(Nls1[1])) {
			NlsConv = Nls1$convergence
            if (NlsConv==0) cc[1] = tryCatch(summary(Nls1)$coef[,1], error = function(e) NA);
            res = tryCatch(sm(Nls1)$resi, error = function(e) NA)                                        #the residuals from the model
            res2 = tryCatch(sum(res^2), error = function(e) NA)                                          #the sum of squared residuals
            df2 = tryCatch(sm(Nls1)$df[2], error = function(e) NA)                                       #the degrees of freedom of the model
            pval = f.pval(res1 = ResList1[["null"]][[1]], res2 = res2, df1 = ResList1[["null"]][[2]], df2 = df2)   #the F-test p-values
         }  else { NlsConv = NA }
         ResList1[["M"]] = list(res2, df2, cc, as.numeric(chartr("01","10", as.character(NlsConv))), pval, res)

   #### the AB model (estimate both age-shift and expression-shift)
         Nls1 <- tryCatch(nls( ~ exp_comp(x2, y2, pC, pM), algorithm = "port", data = data.frame(cbind(x2, y2)), start = list(pC = 0, pM = 0), trace = F), error = function(e) NA)                              #estimate pC and pM simultaneously
         cc = c(NA, NA); names(cc) = c("pC","pM"); res = res2 = df2 = pval = NA
         if (!is.na(Nls1[1])) {
			NlsConv = Nls1$convergence
            if (NlsConv==0) cc[1:2] = tryCatch(summary(Nls1)$coef[,1], error = function(e) NA); 
            res = tryCatch(sm(Nls1)$resi, error = function(e) NA)                                        #the residuals from the model
            res2 = tryCatch(sum(res^2), error = function(e) NA)                                          #the sum of squared residuals
            df2 = tryCatch(sm(Nls1)$df[2], error = function(e) NA)                                       #the degrees of freedom of the model
            pval = f.pval(res1 = ResList1[["null"]][[1]], res2 = res2, df1 = ResList1[["null"]][[2]], df2 = df2)   #the F-test p-values
         }  else { NlsConv = NA }
         ResList1[["CM"]] = list(res2, df2, cc, as.numeric(chartr("01","10", as.character(NlsConv))), pval, res)

         names(ResList1[["C"]]) = names(ResList1[["CM"]]) = names(ResList1[["M"]]) = c("sumsq", "df", "coef", "conv", "pval", "resi")
         ResList1
      }
   })
   names(Res9) = rownames(E)
   return(Res9)
}


###################################################################
#FUNCTIONS TO SUMMARIZE HETEROCHRONY TEST RESULTS

heterochrony.summary.1.f = function(Mat) {
#function to summarize heterochrony.test.1.f output
   Res = t(sapply(Mat, function(x) {
      if (l(x) != 0 & sum(!is.na(c(x$C$pval, x$CM$pval, x$M$pval))) > 0) { 
         a = c(x$C$pval, x$CM$pval, x$M$pval,
         which.min(c(x$C$pval, x$CM$pval, x$M$pval)),
         f.pval(res1 = x$C$sumsq, res2 = x$CM$sumsq, df1 = x$C$df, df2 = x$CM$df),
         f.pval(res1 = x$M$sumsq, res2 = x$CM$sumsq, df1 = x$M$df, df2 = x$CM$df),
         x$C$coef["pC"], x$CM$coef["pC"],
         ifelse(l(x$C$conv)==1, x$C$conv, NA), ifelse(l(x$CM$conv)==1, x$CM$conv, NA), x$CM$coef["pM"])
      } else a = rep(NA, 11)
      a }))
   colnames(Res) = c("pvalC", "pvalCM", "pvalM", "minp", "pvalCxCM", "pvalMxCM", "estC", "estCM", "convC", "convCM", "mestM")
   Res }

heterochrony.summary.2.f = function(Mat) {
# determining wide-sense neotenic/accelerated genes from opt.match.5.f results
# Wide-sense neotenic/accelerated
# Condition 1: AB model best. Get "C" estimate and p-value from the AB model.
# Condition 2: A model best. Get "C" estimate and p-value from the A model.
# Condition 3: B model is best but AB model not better than A model (at p<0.05). Then the A model is good enough (J-test logic). Get "C" estimate and p-value from A model.
# Condition 4: B model is best and the AB model doesn't converge. Non-convergence implies models A and B are similar, so model A is good enough (except cases where model B is significant at p<0.05 but A is not significant). Get "C" estimate and p-value from model A.
   COND1 = !is.na(Mat[,"minp"]) & Mat[,"minp"] == 2 & Mat[,"convCM"] == 1
   COND2 = !is.na(Mat[,"minp"]) & Mat[,"minp"] == 1 & Mat[,"convC"] == 1
   COND3 = !is.na(Mat[,"pvalCxCM"]) & Mat[,"minp"] == 3 & Mat[,"convC"] == 1 & Mat[,"pvalCxCM"] > 0.05
   COND4 = is.na(Mat[,"pvalCM"]) & !is.na(Mat[,"pvalC"]) & Mat[,"minp"] == 3 & Mat[,"convC"] == 1 & !(Mat[,"pvalC"] > 0.05 &  Mat[,"pvalM"] < 0.05)
   #cat(sapply(list(COND1, COND2, COND3, COND4), sum), "\n")
   Res = rbind(Mat[COND1, c("pvalCM", "estCM", "mestM")], cbind(Mat[c(COND2 | COND3 | COND4), c("pvalC", "estC")], rep(NA, sum(c(COND2 | COND3 | COND4)))))
   colnames(Res) = c("pval", "age_shift_est", "exprs_shift_est")
   Res[,"age_shift_est"] = 2^Res[,"age_shift_est"]
   Res }

#####################################################################
#FUNCTION TO CALCULATE CURVES
curve.estimate.f = function(ExpMat, Ages, Agesp, RefSpecies, Species2, Exp_a1) {
	Res = lapply(1:nrow(ExpMat), function(i) { 
		y = ExpMat[i,]
		bm1 = Exp_a1[[1]][i,1]  
		yp1 = polynomial.model.predict.f(bm1, Ages[RefSpecies], y[RefSpecies], Sequence=Agesp)		#expression-age curve for the reference species
		bm2 = Exp_a1[[2]][i,1]   
		yp2 = polynomial.model.predict.f(bm2, Ages[Species2], y[Species2], Sequence=Agesp)   	#expression-age curve for the second species
		list(yp1, yp2) })
	names(Res) = rownames(ExpMat)
	Res
}


#OBJECTS FOR POLYNOMIAL REGRESSION TESTS
LmList = alist()
LmList[[5]] = function (y, par, m) 
{ lm(y ~ I(par[[1]] * m[1]) + I(par[[2]] * m[2]) + I(par[[3]] * m[3]) + I(par[[4]] * m[4]) + I(par[[5]] * m[5])) }
LmList[[4]] = function (y, par, m) 
{ lm(y ~ I(par[[1]] * m[1]) + I(par[[2]] * m[2]) + I(par[[3]] * m[3]) + I(par[[4]] * m[4])) }
LmList[[3]] = function (y, par, m) 
{ lm(y ~ I(par[[1]] * m[1]) + I(par[[2]] * m[2]) + I(par[[3]] * m[3])) }
LmList[[2]] = function (y, par, m) 
{ lm(y ~ I(par[[1]] * m[1]) + I(par[[2]] * m[2])) }
LmList[[1]] = function (y, par, m) 
{ lm(y ~ I(par[[1]] * m[1])) }

Models = list()
Models[[6]] = matrix(0, 64, 6)
{ j = 1     
{
for (i1 in 0:1) {
   for (i2 in 0:1) {
      for (i3 in 0:1) {
         for (i4 in 0:1) {
            for (i5 in 0:1) {
               for (i6 in 0:1) {
               Models[[6]][j,] = c(i1,i2,i3,i4,i5,i6)
               j = j + 1 } } } } } } } }

Models[[5]] = matrix(0, 32, 5)
{ j = 1     
{
for (i1 in 0:1) {
   for (i2 in 0:1) {
      for (i3 in 0:1) {
         for (i4 in 0:1) {
            for (i5 in 0:1) {
               Models[[5]][j,] = c(i1,i2,i3,i4, i5)
               j = j + 1 } } } } } } }
               
Models[[4]] = matrix(0, 16, 4)
{ j = 1     
{
for (i1 in 0:1) {
   for (i2 in 0:1) {
      for (i3 in 0:1) {
         for (i4 in 0:1) {
            Models[[4]][j,] = c(i1,i2,i3,i4)
               j = j + 1 } } } } } }

Models[[3]] = matrix(0, 8, 3)
{ j = 1     
{
for (i1 in 0:1) {
   for (i2 in 0:1) {
      for (i3 in 0:1) {
         Models[[3]][j,] = c(i1,i2,i3)
               j = j + 1 } } } } }
                              
Models[[2]] = matrix(0, 4, 2)
{ j = 1     
{
for (i1 in 0:1) {
   for (i2 in 0:1) {
      Models[[2]][j,] = c(i1,i2)
               j = j + 1 } } } }

Models[[1]] = matrix(0, 2, 1)
{ j = 1     
{
for (i1 in 0:1) {
   Models[[1]][j,] = c(i1)
               j = j + 1 } } }


#ADDITIONAL FUNCTIONS + SHORTCUTS
normalize.rows.f = function (Mat) { if (nrow(Mat) < 1) { Mat = matrix(Mat, 1, ) }; t(apply(Mat, 1, function(y) (y-mean(y))/sd(y))) }
l = length
sm = summary
luq = function(x) l(unique(x))
asnc = function(x) as.numeric(as.character(x))
lin = function(x, y) l(intersect(x, y))
rn = rownames
cn = colnames

bootstrap.summary.1.f = function(TabDiffList, BOOTS = 1000, Q1 = 0.025, Q2 = 0.975, NORMALIZE=T) {
#function for normalizing and bootstrapping over the table
   Res = lapply(TabDiffList, function(x) {
      if (l(unlist(x)) > 1) {	#if there are > 1 genes in the gene set
				if (NORMALIZE) xx = normalize.rows.f(x) else xx = x;
				Boots1 = sapply(1:BOOTS, function(i) {
					rx = sort(sample(nrow(xx), rep=T)); apply(xx[rx,,drop=F], 2, mean) })
				Boots1_qnt = apply(Boots1, 1, function(xx) c(quantile(xx, 0.025), quantile(xx, 0.5), quantile(xx, 0.975)))
				return(Boots1_qnt) 
			} else { 
				return(t(t(c(NA, NA, NA)))) 
			} })
		return(Res)	
}

f.pval = function(res1, res2, df1, df2) {
#function to calculate F-test p-values for the difference between two models. res1 and res2: sums of squares, df1 and d2: degrees of freedoms.
   fstat = ((res1 - res2)/(df1 - df2))/(res2/df2)
   pf(q = fstat, df1 = df1-df2, df2 = df2, lower.tail = F)
}

llin = function(
#function to calculate the number of common element in vectors. A proportion of commonality can be determined
   x = list(), #list of vectors
   Q = c()  #proportion of reoccurrance
) {
   x = lapply(x, unique); 
   if (l(Q) == 0) Q = 1
   YYY = table(unlist(x)) >= l(x)*Q
   #print(sum(YYY))
   names(table(unlist(x)) [YYY])
}

slin = function(
#function to calculate the number of overlapping elements among pairs  
   x = list() #list of vectors
) {
	L = l(x)
	Mat = sapply(1:L, function(i) { sapply(1:L, function(j) lin(x[[i]], x[[j]])) })
	if (l(names(x))) { rownames(Mat) = colnames(Mat) = names(x) } else { rownames(Mat) = colnames(Mat) = 1:L }
	Mat
}


mas5calls.AffyBatch.2 = function (object, ids = NULL, verbose = TRUE, tau = 0.015, alpha1 = 0.04, alpha2 = 0.06, ignore.saturated = TRUE)
#this is just part of the original mas5calls function. Only difference: doesn't calculate the presence/absence calls
{
    if (alpha1 < 0) {
        stop("alpha1 must be  > 0 ")
    }
    if (alpha1 > alpha2) {
        stop("alpha2 must be  > alpha1 ")
    }
    if (alpha2 > 1) {
        stop("alpha2 must be  <1 ")
    }
    if (verbose)
        cat("Getting probe level data...\n")
    pms <- as.matrix(pm(object))
    mms <- as.matrix(mm(object))
    if (ignore.saturated) {
        sat <- 46000
    }
    else {
        sat <- -1
    }
    pns <- probeNames(object)
    o <- order(pns)
    pns <- pns[o]
    pms <- pms[o, , drop = FALSE]
    mms <- mms[o, , drop = FALSE]
    unique.pns <- sort(unique(pns))
    if (verbose)
        cat("Computing p-values\n")
    p <- sapply(1:length(pms[1, ]), function(x) {
        .C("DetectionPValue", as.double(pms[, x]), as.double(mms[,
            x]), as.character(pns), as.integer(length(mms[, x])),
            as.double(tau), as.double(sat), dpval = double(length(unique.pns)),
            length(unique.pns), PACKAGE = "affy")$dpval
    })
    rownames(p) <- unique.pns
    colnames(p) <- sampleNames(object)
    if (!is.null(ids)) {
        p <- p[ids, , drop = FALSE]
    }
    return(p)
}