/* Written by SL Kosakovsky Pond (sergeilkp@mac.com) October 2006 */ /* This analysis uses a 4 bin general bivariate discrete distributions, with 2 negatively selected classes, a positively selected class and a neutral class, to compare the distributions of dS and dN in two data sets. The tests done: - Are the distributions the same? - Are there the same proportions of sites with dN>dS in the data sets? - Do the sites under positive selection share the same dN/dS? - Two previous tests combined */ RequireVersion ("0.9920061001"); /*---------------------------------------------------------------------------------------------------------------------------------------------------*/ function BuildCodonFrequencies (obsF) { PIStop = 1.0; result = {ModelMatrixDimension,1}; hshift = 0; for (h=0; h<64; h=h+1) { first = h$16; second = h%16$4; third = h%4; if (_Genetic_Code[h]==10) { hshift = hshift+1; PIStop = PIStop-obsF[first][0]*obsF[second][1]*obsF[third][2]; continue; } result[h-hshift][0]=obsF[first][0]*obsF[second][1]*obsF[third][2]; } return result*(1.0/PIStop); } /*---------------------------------------------------------------------------------------------------------------------------------------------------*/ function ReportDistributionString (rc,freqStrMx,infix, skip0) { distroString = ""; distroString * 1024; reportMx = {rc,4}; distroString * (" dN/dS dS dN Prob\n"); for (mi=0; mi 0) { distroString * (Format(reportMx[mi][2],8,3)+Format(reportMx[mi][0],8,3)+Format(reportMx[mi][1],8,3)+Format(reportMx[mi][3],10,3)+"\n"); } } distroString * 0; return distroString; } /*---------------------------------------------------------------------------------------------------------------------------------------------------*/ function DefineNucleotideBiases (dummy) { ModelTitle = "MG94x"+modelDesc[0]; modelConstraintString = ""; for (customLoopCounter2=0; customLoopCounter2<6; customLoopCounter2=customLoopCounter2+1) { if (rateBiasTerms[customLoopCounter2] != "1") { ExecuteCommands ("global " + rateBiasTerms[customLoopCounter2] + "=1;"); } } for (customLoopCounter2=1; customLoopCounter2<6; customLoopCounter2=customLoopCounter2+1) { for (customLoopCounter=0; customLoopCounter0.0000001;global NS_"+infix+"0 = 0.1;"); for (mi=1; mi0.0000001;\nglobal NS_"+infix+mi+";\n"); if (randomizeInitValues) { categDef1*("global P_"+infix+mi+" = Random(0.05,0.95);\nP_"+infix+mi+":<1;\n"); categDef1*("global S_"+infix+mi+" = Random(0.05,1);"); } else { categDef1*("global P_"+infix+mi+" = 1/"+(resp+1-mi)+";\nP_"+infix+mi+":<1;\n"); } } freqStrMx = {resp,1}; if (resp>1) { freqStrMx[0] = "P_"+infix+"1"; for (mi=1; mi1;R_"+infix+mi+"="+Random(1.05,10)+";"); } } else { for (mi=respN+respP; mi1;R_"+infix+mi+"=2+"+mi+";"); } } } else { if (randomizeInitValues) { for (mi=0; mi 1) { lfParts = lfParts + ",filteredData_" + fileID + "," + treeID; } } } lfParts * 0; ExecuteCommands (lfParts + "," + lfDef1 + ");"); sumPath = resToPath + ".summary"; treeBranchParameters = 0; for (fileID = 1; fileID <= fileCount; fileID = fileID + 1) { ExecuteCommands ("treeBranchParameters = treeBranchParameters + BranchCount(tree_" + fileID + "_0) + TipCount(tree_" + fileID + "_0);"); } /*-------------- INDEPENDENT DISTRIBUTIONS ------------------*/ sop = Max(OPTIMIZATION_PRECISION,0.001); fprintf (stdout, "Running simpler distribution approximations to ensure good convergence...\n"); OPTIMIZATION_PRECISION = 0.1; P_1_1 := 0;P_1_2 := 0;P_1_3 := 0; P_2_1 := 0;P_2_2 := 0;P_2_3 := 0; S_1_0 := 1;S_1_1 := 1;S_1_2 := 1;S_1_3 := 1; S_2_0 := 1;S_2_1 := 1;S_2_2 := 1;S_2_3 := 1; R_1_0 := 1;R_1_1 := 1;R_1_2 := 1; R_2_0 := 1;R_2_1 := 1;R_2_2 := 1; Optimize (res,lf); USE_LAST_RESULTS = 1; SKIP_CONJUGATE_GRADIENT = 1; d1 = ReportDistributionString(4,freqStrMx_1,"1_",1); d2 = ReportDistributionString(4,freqStrMx_2,"2_",1); fprintf (stdout, "\n*** Done with pass 1. Log(L) = ", Format(res[1][0],10,3), " *** \n"); fprintf (stdout, "Approximate rates for data set 1:\n", d1); fprintf (stdout, "Approximate rates for data set 2:\n", d2); codonFactor_1 := codonFactor_1__; codonFactor_2 := codonFactor_2__; AC_1 := AC_1__; AT_1 := AT_1__; CG_1 := CG_1__; CT_1 := CT_1__; GT_1 := GT_1__; AC_2 := AC_2__; AT_2 := AT_2__; CG_2 := CG_2__; CT_2 := CT_2__; GT_2 := GT_2__; fprintf (stdout, "\nGateaux sampling positively selected directions\n"); P_1_1 = 2/filteredData_1.sites; S_1_3 = 1; baseLineLL = res[1][0]; LFCompute (lf,LF_START_COMPUTE); bestDiff = -1e100; bestAlpha = 1; bestBeta = 1; bestLL = -1e100; step = 0.1; step2 = 0.25; v1 = 0.05; for (v1c = 0; v1c < 10; v1c = v1c + 1) { v2 = v1+step; for (v2c = 0; v2c < 20; v2c = v2c+1) { checkASample ("1_0"); v2 = v2 + step2; } v1 = v1+step; } S_1_0 = bestAlpha; R_1_0 = bestBeta; bestAlpha = 1; bestBeta = 1; if (bestDiff <= 0) { P_1_1 = 0; } saveBD = bestDiff; P_2_1 = 1/filteredData_2.sites; S_2_3 = 1; v1 = 0.05; for (v1c = 0; v1c < 10; v1c = v1c + 1) { v2 = v1+step; for (v2c = 0; v2c < 20; v2c = v2c+1) { checkASample ("2_0"); v2 = v2 + step2; } v1 = v1+step; } LFCompute (lf,LF_DONE_COMPUTE); S_2_0 = bestAlpha; R_2_0 = bestBeta; if (bestDiff <= saveBD) { P_2_1 = 0; } if (bestDiff > 0) { fprintf (stdout, "\nFound a likelihood improvement in the direction (", S_1_0, ",", S_1_0*R_1_0, "), (", S_2_0, ",", S_2_0*R_2_0, "), ",bestDiff," likelihood points\n"); } Optimize (res,lf); d1 = ReportDistributionString(4,freqStrMx_1,"1_",1); d2 = ReportDistributionString(4,freqStrMx_2,"2_",1); fprintf (stdout, "\n*** Done with pass 2. Log(L) = ", Format(res[1][0],10,3), " *** \n"); fprintf (stdout, "Approximate rates for data set 1:\n", d1); fprintf (stdout, "Approximate rates for data set 2:\n", d2); fprintf (stdout, "\nGateaux sampling neutral directions\n"); baseLineLL = res[1][0]; LFCompute (lf,LF_START_COMPUTE); P_1_2 = 1/(filteredData_1.sites*(1-P_1_1)); bestDiff = -1e100; bestAlpha = 1; step = 0.02; v1 = 0; for (v1c = 0; v1c < 50; v1c = v1c + 1) { v1 = v1 + step; v2 = v1; checkASample ("1_1"); } S_1_1 = bestAlpha; bestAlpha = 1; if (bestDiff <= 0) { P_1_2 = 0; } saveBD = bestDiff; R_1_1 := 1; P_2_2 = 1/(filteredData_2.sites*(1-P_2_1)); v1 = 0; for (v1c = 0; v1c < 50; v1c = v1c + 1) { v1 = v1 + step; v2 = v1; checkASample ("2_1"); } LFCompute (lf,LF_DONE_COMPUTE); S_2_1 = bestAlpha; R_2_1 := 1; if (bestDiff <= saveBD) { P_2_2 = 0; } if (bestDiff > 0) { fprintf (stdout, "\nFound a likelihood improvement in the direction (", S_1_1, ",", S_1_1, "), (", S_2_1, ",", S_2_1, "), ",bestDiff," likelihood points\n"); } Optimize (res,lf); d1 = ReportDistributionString(4,freqStrMx_1,"1_",1); d2 = ReportDistributionString(4,freqStrMx_2,"2_",1); fprintf (stdout, "\n*** Done with pass 3. Log(L) = ", Format(res[1][0],10,3), " *** \n"); fprintf (stdout, "Approximate rates for data set 1:\n", d1); fprintf (stdout, "Approximate rates for data set 2:\n", d2); fprintf (stdout, "\nGateaux sampling negative selected directions\n"); baseLineLL = res[1][0]; LFCompute (lf,LF_START_COMPUTE); bestDiff = -1e100; bestAlpha = 1; bestBeta = 1; step = 1/16; v1 = 0; P_1_3 = 2/(filteredData_1.sites*(1-P_1_1)*(1-P_1_2)); S_1_2 = 1; for (v1c = 0; v1c < 15; v1c = v1c + 1) { v1 = v1+step; v2 = step/2; for (v2c = 0; v2c < v1c; v2c = v2c+1) { checkASample ("1_2"); v2 = v2 + step; } } S_1_2 = bestAlpha; R_1_2 = bestBeta; bestAlpha = 1; bestBeta = 1; if (bestDiff <= 0) { P_1_3 = 0; } saveBD = bestDiff; v1 = 0; P_2_3 = 2/(filteredData_2.sites*(1-P_2_1)*(1-P_2_2)); S_2_2 = 1; for (v1c = 0; v1c < 15; v1c = v1c + 1) { v1 = v1+step; v2 = step/2; for (v2c = 0; v2c < v1c; v2c = v2c+1) { checkASample ("2_2"); v2 = v2 + step; } } LFCompute (lf,LF_DONE_COMPUTE); S_2_2 = bestAlpha; R_2_2 = bestBeta; if (bestDiff <= saveBD) { P_2_2 = 0; } if (bestDiff > 0) { fprintf (stdout, "\nFound a likelihood improvement in the direction (", S_1_2, ",", S_1_2*R_1_2, "), (", S_2_2, ",", S_2_2*R_2_2, "), ",bestDiff," likelihood points\n"); } AC_1 = AC_1; AT_1 = AT_1; CG_1 = CG_1; CT_1 = CT_1; GT_1 = GT_1; AC_2 = AC_2; AT_2 = AT_2; CG_2 = CG_2; CT_2 = CT_2; GT_2 = GT_2; if (Abs(modelConstraintString_1)) { ExecuteCommands (modelConstraintString_1); } if (Abs(modelConstraintString_2)) { ExecuteCommands (modelConstraintString_2); } codonFactor_1 = codonFactor_1; codonFactor_2 = codonFactor_2; GetString (paramList, lf, -1); degF = Columns(paramList["Global Independent"]) + 14 - 2*branchLengths + treeBranchParameters; OPTIMIZATION_PRECISION = sop; fprintf (stdout, "Running an independent distributions model fit (", degF, " parameters)...\n"); Optimize (res,lf); LogL = res[1][0]; AIC = 2*(degF-LogL); AICc = 2*(degF*totalCodonCount/(totalCodonCount-degF-1) - LogL); fprintf (stdout, "\n\nIndependent distributions model fit summary\n", "\nLog likelihood:", Format (LogL, 15, 5), "\nParameters :", Format (degF, 15, 0), "\nAIC :", Format (AIC, 15, 5), "\nc-AIC :", Format (AICc, 15, 5),"\n" ); fprintf (sumPath,CLEAR_FILE,"Independent distributions model fit summary\n", "\nLog likelihood:", Format (LogL, 15, 5), "\nParameters :", Format (degF, 15, 0), "\nAIC :", Format (AIC, 15, 5), "\nc-AIC :", Format (AICc, 15, 5),"\n"); d1 = ReportDistributionString(4,freqStrMx_1,"1_",0); d2 = ReportDistributionString(4,freqStrMx_2,"2_",0); fprintf (stdout, "Inferred rates for data set 1:\n", d1); fprintf (sumPath, "Inferred rates for data set 1:\n", d1); fprintf (stdout, "Inferred rates for data set 2:\n", d2); fprintf (sumPath, "Inferred rates for data set 2:\n", d2); LIKELIHOOD_FUNCTION_OUTPUT = 6; fprintf (resToPath,CLEAR_FILE,lf); /*-------------- SHARED POSITIVE SELECTION STRENGTHS ------------------*/ for (k=0; k=respP+respN) { ExecuteCommands ("R_1_"+k+"=0.5(R_1_"+k+"+R_2_"+k+");"); } ExecuteCommands ("S_2_"+k+":=S_1_"+k+";R_2_"+k+":=R_1_"+k+";"); } for (k=1; k<=resp; k=k+1) { ExecuteCommands ("P_1_"+k+"=0.5(P_1_"+k+"+P_2_"+k+");"); ExecuteCommands ("P_2_"+k+":=P_1_"+k+";"); } GetString (paramList, lf, -1); degFJ = Columns(paramList["Global Independent"]) + 14 - 2*branchLengths + treeBranchParameters; fprintf (stdout, "\nRunning a shared distributions model fit (", degFJ, " parameters)...\n"); Optimize (res_J,lf); LogLJ = res_J[1][0]; AICJ = 2*(degFJ-LogLJ); AICcJ = 2*(degFJ*totalCodonCount/(totalCodonCount-degFJ-1) - LogLJ); fprintf (stdout, "\n\nShared distributions model fit summary\n", "\nLog likelihood:", Format (LogLJ, 15, 5), "\nParameters :", Format (degFJ, 15, 0), "\nAIC :", Format (AICJ, 15, 5), "\nc-AIC :", Format (AICcJ, 15, 5),"\n" ); fprintf (sumPath, "\n\nShared distributions model fit summary\n", "\nLog likelihood:", Format (LogLJ, 15, 5), "\nParameters :", Format (degFJ, 15, 0), "\nAIC :", Format (AICJ, 15, 5), "\nc-AIC :", Format (AICcJ, 15, 5),"\n" ); d1 = ReportDistributionString(4,freqStrMx_1,"1_",0); fprintf (stdout, "Inferred joint rates:\n", d1); fprintf (sumPath, "Inferred joint rates:\n", d1); fpath = resToPath + ".JointAll"; LIKELIHOOD_FUNCTION_OUTPUT = 6; fprintf (fpath,CLEAR_FILE,lf); USE_LAST_RESULTS = 0; SKIP_CONJUGATE_GRADIENT = 0; LIKELIHOOD_FUNCTION_OUTPUT = 2; fprintf (stdout, "\nDistribution comparison tests\n", "\n\tAre the distributions different?", "\n\t\tLR = ", Format (2*(LogL-LogLJ),10,3), " DF = ", degF-degFJ, " p = ", Format(1-CChi2(2*(LogL-LogLJ), degF-degFJ),8,3), "\n"); fprintf (stdout, "\n\tAre selective regimes (dN/dS and proportions) different?", "\n\t\tLR = ", Format (2*(LogL-LogLPSH),10,3), " DF = ", degF-degFPSH, " p = ", Format(1-CChi2(2*(LogL-LogLPSH), degF-degFPSH),8,3), "\n"); fprintf (stdout, "\n\tAre selection strengths (dN/dS) different?", "\n\t\tLR = ", Format (2*(LogL-LogLPSS),10,3), " DF = ", degF-degFPSS, " p = ", Format(1-CChi2(2*(LogL-LogLPSS), degF-degFPSH),8,3), "\n"); fprintf (stdout, "\n\tAre the proportions of codons under selection different?", "\n\t\tLR = ", Format (2*(LogL-LogLPSP),10,3), " DF = ", degF-degFPSP, " p = ", Format(1-CChi2(2*(LogL-LogLPSP), degF-degFPSP),8,3), "\n"); fprintf (sumPath, "\n\nDistribution comparison tests\n", "\n\tAre the distributions different?", "\n\t\tLR = ", Format (2*(LogL-LogLJ),10,3), " DF = ", degF-degFJ, " p = ", Format(1-CChi2(2*(LogL-LogLJ), degF-degFJ),8,3), "\n"); fprintf (sumPath, "\n\tAre selective regimes (dN/dS and proportions) different?", "\n\t\tLR = ", Format (2*(LogL-LogLPSH),10,3), " DF = ", degF-degFPSH, " p = ", Format(1-CChi2(2*(LogL-LogLPSH), degF-degFPSH),8,3), "\n"); fprintf (sumPath, "\n\tAre selection strengths (dN/dS) different?", "\n\t\tLR = ", Format (2*(LogL-LogLPSS),10,3), " DF = ", degF-degFPSS, " p = ", Format(1-CChi2(2*(LogL-LogLPSS), degF-degFPSH),8,3), "\n"); fprintf (sumPath, "\n\tAre the proportions of codons under selection different?", "\n\t\tLR = ", Format (2*(LogL-LogLPSP),10,3), " DF = ", degF-degFPSP, " p = ", Format(1-CChi2(2*(LogL-LogLPSP), degF-degFPSP),8,3), "\n"); /*------------------------------------------------------------------------------------------------------------*/ function checkASample (whichRate) { ExecuteCommands ("S_"+whichRate+"=v1;R_"+whichRate+"=v2/v1;"); LFCompute (lf,res_n); localDiff = res_n-baseLineLL; if (localDiff > bestDiff) { bestDiff = localDiff; bestAlpha = v1; bestBeta = v2/v1; } return 0; }