#EXAMPLE #source("http://www.picb.ac.cn/Comparative/data_methods/neoteny/somel_R_functions_1.r") #library(affy) #a = ReadAffy() #a = prepare.CDF.fromTabTable.1.f(a, cdfTableLocation="HGU133Plus2_Hs_ENSG_11.0.0_h18c2r2umask_cdfTable.tsv", cdfName="HGU133Plus2_Hs_ENSG_11.0.0_h18c2r2umask", exclude.single=T, grouping="probeset", min.number=8) #ExpMat1 = exprs(rma(a)) #DetMat1 = mas5calls.AffyBatch.2(a) #Humans = c("Hs_pfc_1.CEL", "Hs_pfc_2.CEL", "Hs_pfc_3.CEL", "Hs_pfc_4.CEL", "Hs_pfc_5.CEL", "Hs_pfc_8.CEL", "Hs_pfc_13.CEL", "Hs_pfc_19.CEL", "Hs_pfc_20.CEL", "Hs_pfc_22.CEL", "Hs_pfc_23.CEL", "Hs_pfc_24.CEL", "Hs_pfc_35.CEL", "Hs_pfc_37.CEL") #Chimps = c("PanTro_pfc_1.CEL", "PanTro_pfc_2.CEL", "PanTro_pfc_3.CEL", "PanTro_pfc_4.CEL", "PanTro_pfc_5.CEL", "PanTro_pfc_6.CEL", "PanTro_pfc_7.CEL", "PanTro_pfc_8.CEL", "PanTro_pfc_9.CEL", "PanTro_pfc_10.CEL", "PanTro_pfc_11.CEL", "PanTro_pfc_12.CEL", "PanTro_pfc_13.CEL", "PanTro_pfc_14.CEL") #Macaques = c("RheMac_pfc_1.CEL", "RheMac_pfc_2.CEL", "RheMac_pfc_3.CEL", "RheMac_pfc_4.CEL", "RheMac_pfc_5.CEL", "RheMac_pfc_6.CEL", "RheMac_pfc_7.CEL", "RheMac_pfc_8.CEL", "RheMac_pfc_9.CEL") #Ages1 = c(39, 54, 56, 62, 89, 139, 332, 1969, 2920, 4213, 4534, 4733, 13138, 15672, 0, 1, 8, 40, 186, 45, 525, 2447, 2313, 4361, 4415, 4480, 12784, 16131, 467, 1760, 1269, 2816, 2814, 4894, 6230, 6230, 6524) #Ages1 = Ages1 + c(rep(280, 14), rep(280, 14), rep(165, 9)) #add inferred conception times #Ages1 = log2(Ages1) #names(Ages1) = c(Humans, Chimps, Macaques) #P = 0.05; PROP = 3 #proportion of individuals passing detection in each species #Detected1 = apply(DetMat1 < 0.05, 1, function(y) (sum(y[Humans]) > l(Humans)/PROP | sum(y[Chimps]) > l(Chimps)/PROP)) #comparative.expression.tests.1.f(ExpMat = ExpMat1[1:20,], Ages = Ages1, RefSpecies = Humans, Species2 = Chimps, Species3 = Macaques, Detected = Detected1[1:20], RefSpeciesname = "H", Species2name = "C", Species3name = "R", Name="Experiment1", FileName="Experiment1.RData") ############################################################# #FUNCTIONS TO PREPARE CDF ENVIRONMENTS FROM PROBE TABLES prepare.CDF.fromTabTable.1.f = function ( #courtesy of Michael Lachmann A, parent=globalenv(), cdfTableLocation="MyPath/MyCDFTable", #path and name cdfName="mycdfname", exclude.single=TRUE, #should we exclude groupings (e.g. probesets) with single probes? grouping="probeset", #type of grouping the probes: probe, probeset, exon, etc. min.number = 8 #minimum number of probe sets ) { require(affy) CDFenv = cdf.env.fromTable( A@nrow, cdfTableLocation, EXCLUDE.SINGLE = exclude.single, GROUPING = grouping, MIN.NUMBER = min.number) A@cdfName = cdfName assign( cdfName(A), CDFenv, pos=globalenv()) A } cdf.env.fromTable = function ( chip.size, cdfTableLocation, env = new.env(hash=T,parent=baseenv()), make.env = TRUE, EXCLUDE.SINGLE, #should we exclude groupings with single probes? GROUPING, #type of grouping the probes: probe, probeset, exon, etc. MIN.NUMBER = 8 #minimum number of probes per probeset to be accepted ) { Ftab = read.table(cdfTableLocation, as.is=T, header=T, comment.char="") cnFtab = colnames(Ftab) if (ncol(Ftab) > 3) { #if there are more than 3 columns, choose the appropriate ones given GROUPING Ftab = Ftab[, cnFtab %in% c(GROUPING, "x", "y", "px", "py", "mx", "my")] cnFtab = colnames(Ftab) } if (EXCLUDE.SINGLE) { gn = Ftab[,1] #grouping names tgn = table(gn) include = gn %in% names(tgn)[tgn > MIN.NUMBER-1] #groupings with >= MIN.NUMBER probes if (sum(include) == 0) { #if the CDF table is based on probes stop("Wrong choice: Change exclude.single to FALSE!") } else cat(paste("Excluded ", length(gn) - sum(include), " groupings with less than ", MIN.NUMBER, " probes\n", sep="")) Ftab = Ftab[include,] } if (ncol(Ftab)==3) { cat("Inferring mismatch positions from perfect match\n") if (sum(cnFtab == c(GROUPING, "px", "py")) != ncol(Ftab)) cat(paste("Renaming columns: ", paste(cnFtab, collapse=" "), "as", GROUPING, "px py \n", collapse=" ")) colnames(Ftab) = c(GROUPING, "px", "py") nn.pm=Ftab$"px"+ chip.size*Ftab$"py"+1 nn.mm=Ftab$"px" + chip.size*(Ftab$"py"+1)+1 } else if (ncol(Ftab)==5) { if (sum(colnames(Ftab) == c(GROUPING, "px", "py", "mx", "my")) != ncol(Ftab)) cat(paste("Renaming columns: ", paste(colnames(Ftab), collapse=" "), "as", GROUPING, "px py mx my\n", collapse=" ")) colnames(Ftab) = c(GROUPING, "px", "py", "mx", "my") nn.pm=Ftab$"px"+ chip.size*Ftab$"py"+1 nn.mm=Ftab$"mx" + chip.size*Ftab$"my"+1 } else {cat("Wrong table format! Stopping! \n"); stop} pmm = cbind(nn.pm,nn.mm) colnames(pmm) = c("pm","mm") if (length(unique(Ftab[,1])) != length(Ftab[,1])) { cat("Regular run: more than one probe per grouping\n") l=tapply(1:(dim(pmm)[1]),Ftab[,1], function(v) pmm[v,,drop=FALSE]) cat("Made list.\n") if( make.env ) { cat("Making env\n") nn=names(l) for(i in(1:length(nn) ) ) { assign(nn[i],l[[i]],pos=env,inherit=F) } cat("finished making CDF!\n") return(env) } else { cat("finished making CDF!\n") return(l) } } else { cat("One probe per grouping\n") if( make.env ) { cat("Making env\n") nn=as.character(Ftab[,1]) for(i in 1:length(nn)) { assign(nn[i],pmm[i,,drop=FALSE],pos=env,inherit=F) } cat("finished making CDF!\n") return(env) } else { nn=as.character(Ftab[,1]) l=lapply(1:length(nn), function(i) pmm[i,,drop=FALSE]) nams(l) = nn return(l) } } } ######################################################################## #TESTS FOR AGE EFFECTS, DIFFERENTIAL EXPRESSION, HETEROCHRONY comparative.expression.tests.1.f = function( #Test to calculate age, differential expression, lineage and heterochrony tests, and sort genes into classes based on the test results. It stores all resulting objects in an R object (FileName): The test results, and the gene sets. The object with extension "_gene_sets" contains the gene sets. ExpMat, #Expression data matrix, with samples (arrays) in columns Ages, #numeric vector of ages. the names of the vector should be the same as column names in ExpMat RefSpecies, #character vector, names of samples belonging to the reference species (e.g. human) Species2, #character vector, names of samples belonging to the second species (e.g. chimp) Species3 = c(), #character vector, names of samples belonging to the second species (e.g. rhesus) RefSpeciesname, #character vector (e.g. "Human") Species2name, #character vector (e.g. "Chimp") Species3name, #character vector (e.g. "Rhesus") P_age=0.05, #age-rel test p-value cutoff P_sp=0.05, #species diff test p-value cutoff P_lin=0.05, #lineage test p-value cutoff P_het=0.05, #heterochrony test p-value cutoff Est.Genes=TRUE, #Estimate gene sets and divergence based on these cutoffs? Detected=c(), #logical vector, same length as the rows of ExpMat, indicating whether genes in ExpMat are expressed or not. Name = "Experiment1", #character vector to name objects FileName = "~/Experiment1/RData" #location to save the R data file ) { rnExpMat = rn(ExpMat); if (l(Detected) == 0) { Detected = rep(T, l(rnExpMat)) } #RUNNING TESTS FOR AGE, SPECIES, HETEROCHRONY AND LINEAGE #Test for age effect - polynomial regression Exp_a1 = list() Exp_a1[[1]] = age.test.polynomials.1.f(Mat = ExpMat[,RefSpecies], Ages = Ages[RefSpecies], pvals=T, PCUTOFF=1); #age test for the reference species Exp_a1[[2]] = age.test.polynomials.1.f(Mat = ExpMat[,Species2], Ages = Ages[Species2], pvals=T, PCUTOFF=1); #age test for the second species #Test for age effect - splines Exp_s1 = list() Exp_s1[[1]] = tryCatch( age.test.splines.1.f(Mat = ExpMat[,RefSpecies], Ages = Ages[RefSpecies], DEGREE=8), error = function(e) NA) #testing the reference species Exp_s1[[2]] = tryCatch( age.test.splines.1.f(Mat = ExpMat[,Species2], Ages = Ages[Species2], DEGREE=8) , error = function(e) NA) #testing the second species cat ("\tage tests finished\n") #Test for differential expression between reference species and 2nd species Exp_diff1 = tryCatch( species.test.multiregression.1.f(E = ExpMat, x = Ages, RefSpecies = RefSpecies, Species2 = Species2, EACH = TRUE, Ready.Bm=Exp_a1[[1]][,1]) , error = function(e) NA) #multiple regression test for differential expression in age-related genes Exp_diff2 = tryCatch( apply(ExpMat, 1, function(y) t.test(y[RefSpecies], y[Species2], alt="t")$p.val) , error = function(e) NA) #t-test p-values for non-age-related genes cat ("\tspecies difference tests finished\n") #Test for heterochrony between reference species and 2nd species Exp_het1 = tryCatch( heterochrony.test.1.f(E = ExpMat, x = Ages, RefSpecies = RefSpecies, Species2 = Species2, method = "spline", Ready.Bm=Exp_s1[[1]]) , error = function(e) NA); cat ("\theterochrony test finished\n") #estimating the age-shift and expression-shift between the reference and the second species Exp_het1_ws = suppressWarnings( tryCatch( heterochrony.summary.2.f(heterochrony.summary.1.f(Exp_het1)), error = function(e) NA)); #summarizing the heterochrony.test.1.f result, using "condition set 1" described in the Supp Info. #Estimating expression divergence between the reference species and 2nd species across ages Agesp = seq(max(c(min(Ages[RefSpecies]), min(Ages[Species2]))), min(c(max(Ages[RefSpecies]), max(Ages[Species2]))), l=20) Exp_seq1 = tryCatch( curve.estimate.f (ExpMat, Ages, Agesp, RefSpecies, Species2, Exp_a1), error = function(e) NA) cat ("\tdivergence estimation finished\n") #Test for lineage assignment of expression differences, using the 3rd species as outgroup if (l(Species3) != 0) { Exp_lin1 = tryCatch( lineage.test.1.f(E = ExpMat, x = Ages, Species1 = RefSpecies, Species2 = Species2, Species3 = Species3, Species1name = RefSpeciesname, Species2name = Species2name, Species3name = Species3name, bmmat1 = Exp_a1[[1]], bmmat2 = Exp_a1[[2]]), error = function(e) NA); cat ("\tlineage test finished\n") } ### #ESTIMATING GENE SETS if (Est.Genes & l(Species3) != 0) { All = rnExpMat[Detected] GAp = rnExpMat[Exp_a1[[1]][,2] < P_age & Detected] GAm = rnExpMat[Exp_a1[[1]][,2] >= P_age & Detected] GApSp = rnExpMat[Exp_diff1[,1] < P_sp & Exp_a1[[1]][,2] < P_age & Detected] GApSm = rnExpMat[Exp_diff1[,1] >= P_sp & Exp_a1[[1]][,2] < P_age & Detected] GAmSp = rnExpMat[Exp_diff2 < P_sp & Exp_a1[[1]][,2] >= P_age & Detected] GAmSm = rnExpMat[Exp_diff2 >= P_sp & Exp_a1[[1]][,2] >= P_age & Detected] GSp = c(GApSp, GAmSp) GHl = intersect(rn(Exp_het1_ws)[ Exp_het1_ws[,"pval"] < P_het & Exp_het1_ws[,"age_shift_est"] > 1 ], All) GHe = intersect(rn(Exp_het1_ws)[ Exp_het1_ws[,"pval"] < P_het & Exp_het1_ws[,"age_shift_est"] < 1 ], All) GHp = rn(Exp_lin1)[ Exp_lin1[,"pval"] < P_lin/2 & Detected ] GHm = rn(Exp_lin1)[ Exp_lin1[,"pval"] > (1 - P_lin/2) & Detected ] HlHp = llin(list(GHl, GSp, GHp)) HeHp = llin(list(GHe, GSp, GHp)) HlHm = llin(list(GHl, GSp, GHm)) HeHm = llin(list(GHe, GSp, GHm)) PhyOntList_1 = list(HlHp, HeHp, HlHm, HeHm, GAp, GAm, GSp, GApSp, GHp, GHm, GHl, GHe) PhyOntList_1 = lapply(PhyOntList_1, as.character) names(PhyOntList_1) = c("Human-neotenic", "Human-accelerated", "Chimp-accelerated", "Chimp-neotenic", "Age_related", "Non_age_related", "Differentially_expressed", "Age_rel_and_diff_expr", "On_human_lineage", "On_chimp_lineage", "Delayed_in_human", "Delayed_in_chimp") #ESTIMATING EXPRESSION DIVERGENCE BETWEEN REFERENCE AND 2ND SPECIES ACROSS AGES Exp_div1 = lapply(PhyOntList_1[c("Human-neotenic", "Human-accelerated", "Chimp-accelerated", "Age_rel_and_diff_expr")], function(x) { t(sapply(Exp_seq1[x], function(xx) (abs(xx[[1]] - xx[[2]])))) }) Exp_div1_boots = tryCatch( bootstrap.summary.1.f(Exp_div1, BOOTS=100) , error = function(e) NA) vallist = list(Exp_a1, Exp_s1, Exp_diff1, Exp_diff2, Exp_lin1, Exp_het1, Exp_seq1, Exp_het1_ws, PhyOntList_1, Exp_div1_boots, Ages, Agesp) namelist = c("age_test_polynomial", "age_test_splines", "species_test_multireg", "species_test_ttest", "lineage_test", "heterochrony_test", "expression_age_curves", "heterochrony_test_summary", "gene_sets", "divergence_across_age", "ages", "ages_predicted") } else { vallist = list(Exp_a1, Exp_s1, Exp_diff1, Exp_diff2, Exp_het1, Exp_het1_ws, Exp_seq1, Ages, Agesp) namelist = c("age_test_polynomial", "age_test_splines", "species_test_multireg", "species_test_ttest", "heterochrony_test", "heterochrony_test_summary", "expression_age_curves", "ages", "ages_predicted") } for (i in 1:l(vallist)) { val = vallist[[i]] assign(paste(Name, "_", namelist, sep="")[i], val) } save(list=grep(Name, ls(), value=TRUE), file=FileName) } ################################################################### #FUNCTION FOR TESTING POLYNOMIAL MODELS WITH AGE age.test.polynomials.1.f = function(Mat, Ages, verbose=F, pvals=FALSE, PCUTOFF=0.05) { #tries different models using one variable up to 3rd degree. Returns the index of the best model in Models[[3]] parI = list(Ages, Ages^2, Ages^3) Genes = nrow(Mat); PI = length(parI); LI = 2^ (PI); Models = Models[[3]] LmListInt1 = function (y, par, m) { lm(y ~ I(par[[1]] * m[1]) + I(par[[2]] * m[2]) + I(par[[3]] * m[3])) } if (pvals==FALSE) { Result = apply(Mat, 1, function(y) { lr = lapply(1:8, function(j) {LmListInt1(y, parI, Models[j,])}) mod = sapply(2:8, function(k) {anova(lr[[1]], lr[[k]])$Pr[2]}) mod[is.na(mod)] = 1 ifelse(min(mod) <= PCUTOFF, which.min(mod)+1, 1) }) names(Result) = rownames(Mat) } else { Result = t(apply(Mat, 1, function(y) { lr = lapply(1:8, function(j) {LmListInt1(y, parI, Models[j,])}) mod = sapply(2:8, function(k) {anova(lr[[1]], lr[[k]])$Pr[2]}) mod[is.na(mod)] = 1 c(ifelse(min(mod) <= PCUTOFF, which.min(mod)+1, 1), min(mod)) })) rownames(Result) = rownames(Mat) colnames(Result) = c("Model", "P-value") } return(Result) } polynomial.model.predict.f = function(Best_model, x, y, low, high, Sequence=NULL, Model=3, Fit=F) { #function to make predictions (between values "low" and "high") or calculate models based on best polynomial regression models estimated in age.test.polynomials.1.f if (Model == 3) Best_model = 1 + (Best_model - 1)*4 if (Fit == F) { if (!length(Sequence)) { Seq = seq(low,high,by=0.1) } else { Seq = Sequence } if (Best_model == 1) yp = rep(mean(y), length(Seq)) if (Best_model == 5) yp = predict(lm(y ~ I(x^3)), data.frame(x=Seq)) #predicted values for a range of ages if (Best_model == 9) yp = predict(lm(y ~ I(x^2)), data.frame(x=Seq)) if (Best_model == 13) yp = predict(lm(y ~ I(x^2) + I(x^3)), data.frame(x=Seq)) if (Best_model == 17) yp = predict(lm(y ~ x), data.frame(x=Seq)) if (Best_model == 21) yp = predict(lm(y ~ x + I(x^3)), data.frame(x=Seq)) if (Best_model == 25) yp = predict(lm(y ~ x + I(x^2)), data.frame(x=Seq)) if (Best_model == 29) yp = predict(lm(y ~ x + I(x^2) + I(x^3)), data.frame(x=Seq)) } else { if (Best_model == 1) yp = rep(mean(y), length(x)) if (Best_model == 5) yp = lm(y ~ I(x^3))$fit #predicted values for a range of ages if (Best_model == 9) yp = lm(y ~ I(x^2))$fit if (Best_model == 13) yp = lm(y ~ I(x^2) + I(x^3))$fit if (Best_model == 17) yp = lm(y ~ x)$fit if (Best_model == 21) yp = lm(y ~ x + I(x^3))$fit if (Best_model == 25) yp = lm(y ~ x + I(x^2))$fit if (Best_model == 29) yp = lm(y ~ x + I(x^2) + I(x^3))$fit } return(yp) } ################################################################### #FUNCTION FOR TESTING SPLINE MODELS WITH AGE age.test.splines.1.f = function(Mat, Ages, DEGREE=8, VERBOSE=F) { #test smooth spline models with different degrees of freedom (2:8) and get the one with the lowest F-test p-value x = Ages Res = t(sapply(1:nrow(Mat), function(i) { if (VERBOSE == TRUE) print(i) y = Mat[i, ] null = lm(y ~ 1) res1 = sum(null$resi^2) df1 = null$df res2l = sapply(2:DEGREE, function(i) sum((predict(smooth.spline(x, y, df=i), x)$y - y)^2)) dfl = df1 - 1:(DEGREE-1) pvals = sapply(1:(DEGREE-1), function(i) { res2 = res2l[[i]] df2 = dfl[[i]] fstat = ((res1 - res2)/(df1 - df2))/(res2/df2) pf(q = fstat, df1 = df1-df2, df2 = df2, lower.tail = F) }) c(which.min(pvals)+1, pvals[which.min(pvals)]) })) rownames(Res) = rownames(Mat) colnames(Res) = c("Model", "P-value") Res } ################################################################### #FUNCTIONS FOR TESTING DIFFERENTIAL EXPRESSION species.test.multiregression.1.f = function( #F-test for species difference, where the regression curve is chosen for refspecies. E, #expression matrix x, #ages RefSpecies, #indicator of species 1 Species2, #indicator of species 2 DEGREE = 3, #degree of the polynomial regression model to use. obsolete. EACH = TRUE, #if EACH is TRUE a list of submodels are tested for species difference, where the second species can have its own parameters. If EACH is FALSE, only one model is tested, where the second species has its own parameters for each element of the model. Ready.Bm = c(), #vector of previously calculated best expression-age regression models. if left empty, calculates the models de novo. SIG.MODELS = TRUE #should we only use significant expression-age models, or just the best model irrespective of significance ) { mff = age.test.polynomials.1.f; mo1 = multireg.models.1.f; if (EACH) mo2 = ancova.models.02.f else mo2 = ancova.models.01.f if (l(Ready.Bm) != 0) { #find best model or get it BmMat = Ready.Bm } else { BmMat = mff(E[,RefSpecies], x[RefSpecies], verbose=F, PCUTOFF=ifelse(SIG.MODELS, 0.05, 1)) } Sp = factor(c(rep(1, l(RefSpecies)), rep(0, l(Species2)))) XYZ = c(RefSpecies, Species2) Res = t(sapply(1:nrow(E), function(i) { y = E[i,] Bm = BmMat[i] null = mo1(Bm, x[XYZ], y[XYZ]) altlist = mo2(Bm, x[XYZ], y[XYZ], Sp) if (EACH & Bm != 1) pval = sapply(altlist, function(alt) anova(alt, null)$Pr[2]) else pval = anova(altlist, null)$Pr[2] c(min(pval), Bm, which.min(pval)) })) rownames(Res) = rownames(E) colnames(Res) = c("P-value", "Multireg_Model", "Ancova_Model") return(Res) } multireg.models.1.f = function(BestModels, x, y) { if (BestModels == 1) m2 = lm(y ~ rep(1, l(x)) * 1) if (BestModels == 2) m2 = lm(y ~ I(x^3) * 1) if (BestModels == 3) m2 = lm(y ~ I(x^2) * 1) if (BestModels == 4) m2 = lm(y ~ I(x^2) * 1 + I(x^3) * 1) if (BestModels == 5) m2 = lm(y ~ I(x) * 1) if (BestModels == 6) m2 = lm(y ~ I(x) * 1 + I(x^3) * 1) if (BestModels == 7) m2 = lm(y ~ I(x) * 1 + I(x^2) * 1) if (BestModels == 8) m2 = lm(y ~ I(x) * 1 + I(x^2) * 1 + I(x^3) * 1) m2 } ancova.models.01.f = function(BestModels, x, y, Sp) { #each species has specific slope and intercept if (BestModels == 1) m2 = lm(y ~ rep(1, l(x)) * Sp) if (BestModels == 2) m2 = lm(y ~ I(x^3) * Sp) if (BestModels == 3) m2 = lm(y ~ I(x^2) * Sp) if (BestModels == 4) m2 = lm(y ~ I(x^2) * Sp + I(x^3) * Sp) if (BestModels == 5) m2 = lm(y ~ I(x) * Sp) if (BestModels == 6) m2 = lm(y ~ I(x) * Sp + I(x^3) * Sp) if (BestModels == 7) m2 = lm(y ~ I(x) * Sp + I(x^2) * Sp) if (BestModels == 8) m2 = lm(y ~ I(x) * Sp + I(x^2) * Sp + I(x^3) * Sp) m2 } ancova.models.02.f = function(BestModels, x, y, Sp) { #a list of models where each species *may have* a specific slope and intercept Sp2 = rep(1, l(Sp)) if (BestModels == 1) m2 = lm(y ~ rep(1, l(x)) * Sp) if (BestModels == 2) m2 = lapply(1:2, function(h) {modlist = list(Sp2,Sp)[(Models[[1]]+1)[h,]]; lm(y ~ I(x^3) * modlist[[1]] + Sp)}) if (BestModels == 3) m2 = lapply(1:2, function(h) {modlist = list(Sp2,Sp)[(Models[[1]]+1)[h,]]; lm(y ~ I(x^2) * modlist[[1]] + Sp)}) if (BestModels == 4) m2 = lapply(1:4, function(h) {modlist = list(Sp2,Sp)[(Models[[2]]+1)[h,]]; lm(y ~ I(x^2) * modlist[[1]] + I(x^3) * modlist[[2]] + Sp)}) if (BestModels == 5) m2 = lapply(1:2, function(h) {modlist = list(Sp2,Sp)[(Models[[1]]+1)[h,]]; lm(y ~ I(x) * modlist[[1]] + Sp)}) if (BestModels == 6) m2 = lapply(1:4, function(h) {modlist = list(Sp2,Sp)[(Models[[2]]+1)[h,]]; lm(y ~ I(x) * modlist[[1]] + I(x^3) * modlist[[2]] + Sp)}) if (BestModels == 7) m2 = lapply(1:4, function(h) {modlist = list(Sp2,Sp)[(Models[[2]]+1)[h,]]; lm(y ~ I(x) * modlist[[1]] + I(x^2) * modlist[[2]] + Sp)}) if (BestModels == 8) m2 = lapply(1:8, function(h) {modlist = list(Sp2,Sp)[(Models[[3]]+1)[h,]]; lm(y ~ I(x) * modlist[[1]] + I(x^2) * modlist[[2]] + I(x^3) * modlist[[3]] + Sp)}) m2 } ################################################################### #LINEAGE TEST - FUNCTION TO ASSIGN EXPRESSION DIFFERENCES TO A LINEAGE lineage.test.1.f = function(E, x = Ages, Species1, Species2, Species3, Species1name, Species2name, Species3name, bmmat1, bmmat2) { #compares two species' curves with an outgroup's expression values, and decides on which of the two is closer to the outgroup TRes = t(sapply(1:nrow(E), function(i) { x3 = x[Species3]; y3 = E[i,Species3]; x1 = x[Species1]; y1 = E[i,Species1]; bm1 = ifelse(bmmat1[i,2] < 0.05, bmmat1[i,1], 1) yp1 = polynomial.model.predict.f(bm1, x1, y1, Seq=x3, Model=3) x2 = x[Species2]; y2 = E[i,Species2]; bm2 = ifelse(bmmat2[i,2] < 0.05, bmmat2[i,1], 1) yp2 = polynomial.model.predict.f(bm2, x2, y2, Seq=x3, Model=3) sum1 = sum(abs(yp1 - y3) > abs(yp2 - y3)) pval1 = suppressWarnings( wilcox.test(abs(yp1 - y3), abs(yp2 - y3), paired=T, alt="g")$p.val ) Res = c(yp1, yp2, y3, sum1, pval1) names(Res) = c(rep(Species1name, l(Species3)), rep(Species2name, l(Species3)), rep(Species3name, l(Species3)), "sum1", "pval") Res })) rownames(TRes) = rownames(E) TRes } ################################################################### #HETEROCHRONY TEST heterochrony.test.1.f = function( E, #expression matrix x, #ages RefSpecies, #indicator of species 1 Species2, #indicator of species 2 method = "spline", #which method to use to calculate best fitting curves, either "spline" or "lm" (polynomial regression). Ready.Bm = c(), #matrix of previously calculated best expression-age regression models and p-values. if left empty, calculates the models de novo. Be careful to maintain that previously calculated models are of the same degree as specified in DEGREE!!! VERBOSE = FALSE ) { #C is the age-shift estimate, M is the expression-shift estimate, Res9 = lapply(1:nrow(E), function(i) { if (VERBOSE == TRUE) print(i) y = E[i,] y1 = y[RefSpecies] x1 = x[RefSpecies] y2 = y[Species2] x2 = x[Species2] SIGNIFICANT = FALSE if (method == "spline") { if (l(Ready.Bm) != 0) { Bm1 = Ready.Bm[i,] } #get best model, or... else { Bm1 = age.test.splines.1.f(t(y1), x1, VERBOSE=F)[1,] } #find best model if (Bm1[2] < 0.05) { SIGNIFICANT = TRUE #if there is significant change with age dd = smooth.spline(x1, y1, df=Bm1[1]) yp2 = predict(dd, x2)$y exp_comp = function(x2, y2, pC, pM) { #the function to optimize xp2 = x2 + pC yp2 = predict(dd, xp2)$y y2 - yp2 + pM } } } if (method == "lm") { mff = age.test.polynomials.1.f; mpf = polynomial.model.predict.f if (l(Ready.Bm) != 0) { Bm1 = Ready.Bm[i] } #get best model, or... else { Bm1 = mff(t(y1), x1, verbose=F)[1] } #find best model if (Bm1 != 1) { SIGNIFICANT = TRUE #if there is significant change with age yp2 = mpf(Bm1, x1, y1, Sequence=x2) exp_comp = function(x2, y2, pC, pM) { #the function to optimize xp2 = x2 + pC yp2 = mpf(Bm1, x1, y1, Sequence=xp2) y2 - yp2 + pM } } } if (SIGNIFICANT == TRUE) { #if there is significant change with age ResList1 = list(NA, NA, NA, NA) names(ResList1) = c("C", "CM", "M", "null") #### null res2 = sum((yp2 - y2)^2) #the sum of squared residuals df2 = l(y2) #the degrees of freedom of the model ResList1[["null"]] = list(res2, df2) names(ResList1[["null"]]) = c("sumsq", "df") #### the A model (estimate only age-shift) Nls1 <- tryCatch(nls( ~ exp_comp(x2, y2, pC, pM=0), algorithm = "port", data = data.frame(cbind(x2, y2)), start = list(pC = 0), trace = F), error = function(e) NA) #estimate only pC cc = NA; names(cc)="pC"; res = res2 = df2 = pval = NA if (!is.na(Nls1[1])) { NlsConv = Nls1$convergence if (NlsConv==0) cc[1] = tryCatch(summary(Nls1)$coef[,1], error = function(e) NA); #the optimal parameter value(s) from nls res = tryCatch(sm(Nls1)$resi, error = function(e) NA) #the residuals from the model res2 = tryCatch(sum(res^2), error = function(e) NA) #the sum of squared residuals df2 = tryCatch(sm(Nls1)$df[2], error = function(e) NA) #the degrees of freedom of the model pval = f.pval(res1 = ResList1[["null"]][["sumsq"]], res2 = res2, df1 = ResList1[["null"]][["df"]], df2 = df2) #the F-test p-values } else { NlsConv = NA } ResList1[["C"]] = list(res2, df2, cc, as.numeric(chartr("01","10", as.character(NlsConv))), pval, res) #### the B model (estimate only expression-shift) Nls1 <- tryCatch(nls( ~ exp_comp(x2, y2, pC=0, pM), algorithm = "port", data = data.frame(cbind(x2, y2)), start = list(pM = 0), trace = F), error = function(e) NA) #estimate only pM cc = NA; names(cc)="pM"; res = res2 = df2 = pval = NA if (!is.na(Nls1[1])) { NlsConv = Nls1$convergence if (NlsConv==0) cc[1] = tryCatch(summary(Nls1)$coef[,1], error = function(e) NA); res = tryCatch(sm(Nls1)$resi, error = function(e) NA) #the residuals from the model res2 = tryCatch(sum(res^2), error = function(e) NA) #the sum of squared residuals df2 = tryCatch(sm(Nls1)$df[2], error = function(e) NA) #the degrees of freedom of the model pval = f.pval(res1 = ResList1[["null"]][[1]], res2 = res2, df1 = ResList1[["null"]][[2]], df2 = df2) #the F-test p-values } else { NlsConv = NA } ResList1[["M"]] = list(res2, df2, cc, as.numeric(chartr("01","10", as.character(NlsConv))), pval, res) #### the AB model (estimate both age-shift and expression-shift) Nls1 <- tryCatch(nls( ~ exp_comp(x2, y2, pC, pM), algorithm = "port", data = data.frame(cbind(x2, y2)), start = list(pC = 0, pM = 0), trace = F), error = function(e) NA) #estimate pC and pM simultaneously cc = c(NA, NA); names(cc) = c("pC","pM"); res = res2 = df2 = pval = NA if (!is.na(Nls1[1])) { NlsConv = Nls1$convergence if (NlsConv==0) cc[1:2] = tryCatch(summary(Nls1)$coef[,1], error = function(e) NA); res = tryCatch(sm(Nls1)$resi, error = function(e) NA) #the residuals from the model res2 = tryCatch(sum(res^2), error = function(e) NA) #the sum of squared residuals df2 = tryCatch(sm(Nls1)$df[2], error = function(e) NA) #the degrees of freedom of the model pval = f.pval(res1 = ResList1[["null"]][[1]], res2 = res2, df1 = ResList1[["null"]][[2]], df2 = df2) #the F-test p-values } else { NlsConv = NA } ResList1[["CM"]] = list(res2, df2, cc, as.numeric(chartr("01","10", as.character(NlsConv))), pval, res) names(ResList1[["C"]]) = names(ResList1[["CM"]]) = names(ResList1[["M"]]) = c("sumsq", "df", "coef", "conv", "pval", "resi") ResList1 } }) names(Res9) = rownames(E) return(Res9) } ################################################################### #FUNCTIONS TO SUMMARIZE HETEROCHRONY TEST RESULTS heterochrony.summary.1.f = function(Mat) { #function to summarize heterochrony.test.1.f output Res = t(sapply(Mat, function(x) { if (l(x) != 0 & sum(!is.na(c(x$C$pval, x$CM$pval, x$M$pval))) > 0) { a = c(x$C$pval, x$CM$pval, x$M$pval, which.min(c(x$C$pval, x$CM$pval, x$M$pval)), f.pval(res1 = x$C$sumsq, res2 = x$CM$sumsq, df1 = x$C$df, df2 = x$CM$df), f.pval(res1 = x$M$sumsq, res2 = x$CM$sumsq, df1 = x$M$df, df2 = x$CM$df), x$C$coef["pC"], x$CM$coef["pC"], ifelse(l(x$C$conv)==1, x$C$conv, NA), ifelse(l(x$CM$conv)==1, x$CM$conv, NA), x$CM$coef["pM"]) } else a = rep(NA, 11) a })) colnames(Res) = c("pvalC", "pvalCM", "pvalM", "minp", "pvalCxCM", "pvalMxCM", "estC", "estCM", "convC", "convCM", "mestM") Res } heterochrony.summary.2.f = function(Mat) { # determining wide-sense neotenic/accelerated genes from opt.match.5.f results # Wide-sense neotenic/accelerated # Condition 1: AB model best. Get "C" estimate and p-value from the AB model. # Condition 2: A model best. Get "C" estimate and p-value from the A model. # Condition 3: B model is best but AB model not better than A model (at p<0.05). Then the A model is good enough (J-test logic). Get "C" estimate and p-value from A model. # Condition 4: B model is best and the AB model doesn't converge. Non-convergence implies models A and B are similar, so model A is good enough (except cases where model B is significant at p<0.05 but A is not significant). Get "C" estimate and p-value from model A. COND1 = !is.na(Mat[,"minp"]) & Mat[,"minp"] == 2 & Mat[,"convCM"] == 1 COND2 = !is.na(Mat[,"minp"]) & Mat[,"minp"] == 1 & Mat[,"convC"] == 1 COND3 = !is.na(Mat[,"pvalCxCM"]) & Mat[,"minp"] == 3 & Mat[,"convC"] == 1 & Mat[,"pvalCxCM"] > 0.05 COND4 = is.na(Mat[,"pvalCM"]) & !is.na(Mat[,"pvalC"]) & Mat[,"minp"] == 3 & Mat[,"convC"] == 1 & !(Mat[,"pvalC"] > 0.05 & Mat[,"pvalM"] < 0.05) #cat(sapply(list(COND1, COND2, COND3, COND4), sum), "\n") Res = rbind(Mat[COND1, c("pvalCM", "estCM", "mestM")], cbind(Mat[c(COND2 | COND3 | COND4), c("pvalC", "estC")], rep(NA, sum(c(COND2 | COND3 | COND4))))) colnames(Res) = c("pval", "age_shift_est", "exprs_shift_est") Res[,"age_shift_est"] = 2^Res[,"age_shift_est"] Res } ##################################################################### #FUNCTION TO CALCULATE CURVES curve.estimate.f = function(ExpMat, Ages, Agesp, RefSpecies, Species2, Exp_a1) { Res = lapply(1:nrow(ExpMat), function(i) { y = ExpMat[i,] bm1 = Exp_a1[[1]][i,1] yp1 = polynomial.model.predict.f(bm1, Ages[RefSpecies], y[RefSpecies], Sequence=Agesp) #expression-age curve for the reference species bm2 = Exp_a1[[2]][i,1] yp2 = polynomial.model.predict.f(bm2, Ages[Species2], y[Species2], Sequence=Agesp) #expression-age curve for the second species list(yp1, yp2) }) names(Res) = rownames(ExpMat) Res } #OBJECTS FOR POLYNOMIAL REGRESSION TESTS LmList = alist() LmList[[5]] = function (y, par, m) { lm(y ~ I(par[[1]] * m[1]) + I(par[[2]] * m[2]) + I(par[[3]] * m[3]) + I(par[[4]] * m[4]) + I(par[[5]] * m[5])) } LmList[[4]] = function (y, par, m) { lm(y ~ I(par[[1]] * m[1]) + I(par[[2]] * m[2]) + I(par[[3]] * m[3]) + I(par[[4]] * m[4])) } LmList[[3]] = function (y, par, m) { lm(y ~ I(par[[1]] * m[1]) + I(par[[2]] * m[2]) + I(par[[3]] * m[3])) } LmList[[2]] = function (y, par, m) { lm(y ~ I(par[[1]] * m[1]) + I(par[[2]] * m[2])) } LmList[[1]] = function (y, par, m) { lm(y ~ I(par[[1]] * m[1])) } Models = list() Models[[6]] = matrix(0, 64, 6) { j = 1 { for (i1 in 0:1) { for (i2 in 0:1) { for (i3 in 0:1) { for (i4 in 0:1) { for (i5 in 0:1) { for (i6 in 0:1) { Models[[6]][j,] = c(i1,i2,i3,i4,i5,i6) j = j + 1 } } } } } } } } Models[[5]] = matrix(0, 32, 5) { j = 1 { for (i1 in 0:1) { for (i2 in 0:1) { for (i3 in 0:1) { for (i4 in 0:1) { for (i5 in 0:1) { Models[[5]][j,] = c(i1,i2,i3,i4, i5) j = j + 1 } } } } } } } Models[[4]] = matrix(0, 16, 4) { j = 1 { for (i1 in 0:1) { for (i2 in 0:1) { for (i3 in 0:1) { for (i4 in 0:1) { Models[[4]][j,] = c(i1,i2,i3,i4) j = j + 1 } } } } } } Models[[3]] = matrix(0, 8, 3) { j = 1 { for (i1 in 0:1) { for (i2 in 0:1) { for (i3 in 0:1) { Models[[3]][j,] = c(i1,i2,i3) j = j + 1 } } } } } Models[[2]] = matrix(0, 4, 2) { j = 1 { for (i1 in 0:1) { for (i2 in 0:1) { Models[[2]][j,] = c(i1,i2) j = j + 1 } } } } Models[[1]] = matrix(0, 2, 1) { j = 1 { for (i1 in 0:1) { Models[[1]][j,] = c(i1) j = j + 1 } } } #ADDITIONAL FUNCTIONS + SHORTCUTS normalize.rows.f = function (Mat) { if (nrow(Mat) < 1) { Mat = matrix(Mat, 1, ) }; t(apply(Mat, 1, function(y) (y-mean(y))/sd(y))) } l = length sm = summary luq = function(x) l(unique(x)) asnc = function(x) as.numeric(as.character(x)) lin = function(x, y) l(intersect(x, y)) rn = rownames cn = colnames bootstrap.summary.1.f = function(TabDiffList, BOOTS = 1000, Q1 = 0.025, Q2 = 0.975, NORMALIZE=T) { #function for normalizing and bootstrapping over the table Res = lapply(TabDiffList, function(x) { if (l(unlist(x)) > 1) { #if there are > 1 genes in the gene set if (NORMALIZE) xx = normalize.rows.f(x) else xx = x; Boots1 = sapply(1:BOOTS, function(i) { rx = sort(sample(nrow(xx), rep=T)); apply(xx[rx,,drop=F], 2, mean) }) Boots1_qnt = apply(Boots1, 1, function(xx) c(quantile(xx, 0.025), quantile(xx, 0.5), quantile(xx, 0.975))) return(Boots1_qnt) } else { return(t(t(c(NA, NA, NA)))) } }) return(Res) } f.pval = function(res1, res2, df1, df2) { #function to calculate F-test p-values for the difference between two models. res1 and res2: sums of squares, df1 and d2: degrees of freedoms. fstat = ((res1 - res2)/(df1 - df2))/(res2/df2) pf(q = fstat, df1 = df1-df2, df2 = df2, lower.tail = F) } llin = function( #function to calculate the number of common element in vectors. A proportion of commonality can be determined x = list(), #list of vectors Q = c() #proportion of reoccurrance ) { x = lapply(x, unique); if (l(Q) == 0) Q = 1 YYY = table(unlist(x)) >= l(x)*Q #print(sum(YYY)) names(table(unlist(x)) [YYY]) } slin = function( #function to calculate the number of overlapping elements among pairs x = list() #list of vectors ) { L = l(x) Mat = sapply(1:L, function(i) { sapply(1:L, function(j) lin(x[[i]], x[[j]])) }) if (l(names(x))) { rownames(Mat) = colnames(Mat) = names(x) } else { rownames(Mat) = colnames(Mat) = 1:L } Mat } mas5calls.AffyBatch.2 = function (object, ids = NULL, verbose = TRUE, tau = 0.015, alpha1 = 0.04, alpha2 = 0.06, ignore.saturated = TRUE) #this is just part of the original mas5calls function. Only difference: doesn't calculate the presence/absence calls { if (alpha1 < 0) { stop("alpha1 must be > 0 ") } if (alpha1 > alpha2) { stop("alpha2 must be > alpha1 ") } if (alpha2 > 1) { stop("alpha2 must be <1 ") } if (verbose) cat("Getting probe level data...\n") pms <- as.matrix(pm(object)) mms <- as.matrix(mm(object)) if (ignore.saturated) { sat <- 46000 } else { sat <- -1 } pns <- probeNames(object) o <- order(pns) pns <- pns[o] pms <- pms[o, , drop = FALSE] mms <- mms[o, , drop = FALSE] unique.pns <- sort(unique(pns)) if (verbose) cat("Computing p-values\n") p <- sapply(1:length(pms[1, ]), function(x) { .C("DetectionPValue", as.double(pms[, x]), as.double(mms[, x]), as.character(pns), as.integer(length(mms[, x])), as.double(tau), as.double(sat), dpval = double(length(unique.pns)), length(unique.pns), PACKAGE = "affy")$dpval }) rownames(p) <- unique.pns colnames(p) <- sampleNames(object) if (!is.null(ids)) { p <- p[ids, , drop = FALSE] } return(p) }