Files
linguist/samples/HyPhy/dNdSDistributionComparison.bf
2015-05-02 13:24:43 -07:00

1026 lines
29 KiB
Beef

/* Written by SL Kosakovsky Pond (sergeilkp@mac.com) October 2006 */
/* This analysis uses a 4 bin general bivariate discrete distributions,
with 2 negatively selected classes, a positively selected class and a neutral class,
to compare the distributions of dS and dN in two data sets.
The tests done:
- Are the distributions the same?
- Are there the same proportions of sites with dN>dS in the data sets?
- Do the sites under positive selection share the same dN/dS?
- Two previous tests combined
*/
RequireVersion ("0.9920061001");
/*---------------------------------------------------------------------------------------------------------------------------------------------------*/
function BuildCodonFrequencies (obsF)
{
PIStop = 1.0;
result = {ModelMatrixDimension,1};
hshift = 0;
for (h=0; h<64; h=h+1)
{
first = h$16;
second = h%16$4;
third = h%4;
if (_Genetic_Code[h]==10)
{
hshift = hshift+1;
PIStop = PIStop-obsF[first][0]*obsF[second][1]*obsF[third][2];
continue;
}
result[h-hshift][0]=obsF[first][0]*obsF[second][1]*obsF[third][2];
}
return result*(1.0/PIStop);
}
/*---------------------------------------------------------------------------------------------------------------------------------------------------*/
function ReportDistributionString (rc,freqStrMx,infix, skip0)
{
distroString = "";
distroString * 1024;
reportMx = {rc,4};
distroString * (" dN/dS dS dN Prob\n");
for (mi=0; mi<rc; mi=mi+1)
{
ExecuteCommands ("reportMx[mi][0]=S_"+infix+mi+"/c_"+infix+"scale;");
ExecuteCommands ("reportMx[mi][1]=NS_"+infix+mi+"/c_"+infix+"scale;");
reportMx[mi][2] = reportMx[mi][1]/reportMx[mi][0];
ExecuteCommands ("reportMx[mi][3]="+freqStrMx[mi]+";");
if (skip0 == 0 || reportMx[mi][3] > 0)
{
distroString * (Format(reportMx[mi][2],8,3)+Format(reportMx[mi][0],8,3)+Format(reportMx[mi][1],8,3)+Format(reportMx[mi][3],10,3)+"\n");
}
}
distroString * 0;
return distroString;
}
/*---------------------------------------------------------------------------------------------------------------------------------------------------*/
function DefineNucleotideBiases (dummy)
{
ModelTitle = "MG94x"+modelDesc[0];
modelConstraintString = "";
for (customLoopCounter2=0; customLoopCounter2<6; customLoopCounter2=customLoopCounter2+1)
{
if (rateBiasTerms[customLoopCounter2] != "1")
{
ExecuteCommands ("global " + rateBiasTerms[customLoopCounter2] + "=1;");
}
}
for (customLoopCounter2=1; customLoopCounter2<6; customLoopCounter2=customLoopCounter2+1)
{
for (customLoopCounter=0; customLoopCounter<customLoopCounter2; customLoopCounter=customLoopCounter+1)
{
if (modelDesc[customLoopCounter2]==modelDesc[customLoopCounter])
{
ModelTitle = ModelTitle+modelDesc[customLoopCounter2];
if (rateBiasTerms[customLoopCounter2] == "1")
{
modelConstraintString = modelConstraintString + rateBiasTerms[customLoopCounter]+":="+rateBiasTerms[customLoopCounter2]+";";
}
else
{
modelConstraintString = modelConstraintString + rateBiasTerms[customLoopCounter2]+":="+rateBiasTerms[customLoopCounter]+";";
}
break;
}
}
if (customLoopCounter==customLoopCounter2)
{
ModelTitle = ModelTitle+modelDesc[customLoopCounter2];
}
}
if (Abs(modelConstraintString))
{
ExecuteCommands (modelConstraintString);
}
return 0;
}
/*---------------------------------------------------------------------------------------------------------------------------------------------------*/
function defineRateDistribution (infix)
{
ExecuteCommands ("global S_"+infix+"0 = 0.5;S_"+infix+"0:>0.0000001;global NS_"+infix+"0 = 0.1;");
for (mi=1; mi<resp; mi=mi+1)
{
categDef1 * ("global S_"+infix+mi+"=0.5;S_"+infix+mi+":>0.0000001;\nglobal NS_"+infix+mi+";\n");
if (randomizeInitValues)
{
categDef1*("global P_"+infix+mi+" = Random(0.05,0.95);\nP_"+infix+mi+":<1;\n");
categDef1*("global S_"+infix+mi+" = Random(0.05,1);");
}
else
{
categDef1*("global P_"+infix+mi+" = 1/"+(resp+1-mi)+";\nP_"+infix+mi+":<1;\n");
}
}
freqStrMx = {resp,1};
if (resp>1)
{
freqStrMx[0] = "P_"+infix+"1";
for (mi=1; mi<resp-1; mi=mi+1)
{
freqStrMx[mi] = "";
for (mi2=1;mi2<=mi;mi2=mi2+1)
{
freqStrMx[mi] = freqStrMx[mi]+"(1-P_"+infix+mi2+")";
}
freqStrMx[mi] = freqStrMx[mi]+"P_"+infix+(mi+1);
}
freqStrMx[mi] = "";
for (mi2=1;mi2<mi;mi2=mi2+1)
{
freqStrMx[mi] = freqStrMx[mi]+"(1-P_"+infix+mi2+")";
}
freqStrMx[mi] = freqStrMx[mi]+"(1-P_"+infix+mi+")";
}
else
{
freqStrMx[0] = "1";
}
categDef1*( "\n\nglobal c_"+infix+"scale:=S_"+infix+"0*" + freqStrMx[0]);
for (mi=1; mi<resp; mi=mi+1)
{
categDef1*( "+S_"+infix+mi+"*" + freqStrMx[mi]);
}
categDef1 * ";";
for (mi=0; mi<resp; mi=mi+1)
{
categDef1 * ("\nglobal R_"+infix+mi+"=1;NS_"+infix+mi+":=R_"+infix+mi+"*S_"+infix+mi+";\n");
}
if (stratDNDS)
{
for (mi=respP; mi<respN+respP; mi=mi+1)
{
categDef1 * ("\nR_"+infix+mi+":=1;");
}
if (randomizeInitValues)
{
for (mi=respP+respN; mi<resp; mi=mi+1)
{
categDef1 * ("\nR_"+infix+mi+":<1;R_"+infix+mi+"="+Random(0.05,0.95)+";");
}
for (mi=0; mi<respP; mi=mi+1)
{
categDef1 * ("\nR_"+infix+mi+":>1;R_"+infix+mi+"="+Random(1.05,10)+";");
}
}
else
{
for (mi=respN+respP; mi<resp; mi=mi+1)
{
categDef1 * ("\nR_"+infix+mi+":<1;R_"+infix+mi+"=1/"+(2+mi-respN-respP)+";");
}
for (mi=0; mi<respP; mi=mi+1)
{
categDef1 * ("\nR_"+infix+mi+":>1;R_"+infix+mi+"=2+"+mi+";");
}
}
}
else
{
if (randomizeInitValues)
{
for (mi=0; mi<resp; mi=mi+1)
{
categDef1 * ("\nR_"+infix+mi+"="+Random(0.05,1.75)+";");
}
}
else
{
for (mi=0; mi<resp; mi=mi+1)
{
categDef1 * ("\nR_"+infix+mi+"="+(0.1+0.3*mi)+";");
}
}
}
categDef1 * 0;
return 0;
}
/*---------------------------------------------------------------------------------------------------------------------------------------------------*/
ChoiceList (branchLengths,"Branch Lengths",1,SKIP_NONE,
"Codon Model","Jointly optimize rate parameters and branch lengths (slow and thorough)",
"Nucleotide Model","Estimate branch lengths once, using an appropriate nucleotide model (quick and dirty)."
);
if (branchLengths<0)
{
return;
}
fileCount = 2;
#include "TemplateModels/chooseGeneticCode.def";
ExecuteAFile ("2RatesAnalyses/MG94xREVxBivariate.mdl");
fprintf (stdout, "\nReading input file 1/2\n");
SetDialogPrompt ("Please specify codon data #1:");
DataSet ds_1 = ReadDataFile (PROMPT_FOR_FILE);
fprintf (stdout, "\nData file 1: ", LAST_FILE_PATH, "\n\t", ds_1.species, " sequences and ", ds_1.sites, " nucleotides\n");
DataSetFilter filteredData_1 = CreateFilter (ds_1,3,"","",GeneticCodeExclusions);
HarvestFrequencies (observedFreq_1,filteredData_1,3,1,1);
HarvestFrequencies (observedFreqSingle_1,filteredData_1,1,1,1);
done = 0;
while (!done)
{
fprintf (stdout,"\nPlease enter a 6 character model designation (e.g:010010 defines HKY85):");
fscanf (stdin,"String", modelDesc);
if (Abs(modelDesc)==6)
{
done = 1;
}
}
ModelTitle = "MG94x"+modelDesc[0];
rateBiasTerms = {{"AC_1","1","AT_1","CG_1","CT_1","GT_1"}};
DefineNucleotideBiases (0);
modelConstraintString_1 = modelConstraintString;
ModelTitle_1 = ModelTitle;
fprintf (stdout, "Using model ", ModelTitle_1, " on alignment 1\n\n");
#include "queryTree.bf";
treeString_1 = treeString;
fprintf (stdout, "\nReading input file 2/2\n");
SetDialogPrompt ("Please specify codon data #2:");
DataSet ds_2 = ReadDataFile (PROMPT_FOR_FILE);
fprintf (stdout, "\nData file 2: ", LAST_FILE_PATH, "\n\t", ds_2.species, " sequences and ", ds_2.sites, " nucleotides\n");
DataSetFilter filteredData_2 = CreateFilter (ds_2,3,"","",GeneticCodeExclusions);
HarvestFrequencies (observedFreq_2,filteredData_2,3,1,1);
HarvestFrequencies (observedFreqSingle_2,filteredData_2,1,1,1);
done = 0;
while (!done)
{
fprintf (stdout,"\nPlease enter a 6 character model designation (e.g:010010 defines HKY85):");
fscanf (stdin,"String", modelDesc);
if (Abs(modelDesc)==6)
{
done = 1;
}
}
ModelTitle = "MG94x"+modelDesc[0];
rateBiasTerms = {{"AC_2","1","AT_2","CG_2","CT_2","GT_2"}};
DefineNucleotideBiases (0);
modelConstraintString_2 = modelConstraintString;
ModelTitle_2 = ModelTitle;
fprintf (stdout, "Using model ", ModelTitle_2, " on alignment 2\n\n");
#include "queryTree.bf";
treeString_2 = treeString;
totalCodonCount = filteredData_1.sites + filteredData_2.sites;
ChoiceList (randomizeInitValues, "Initial Value Options",1,SKIP_NONE,
"Default", "Use default inital values for rate distribution parameters.",
"Randomized", "Select initial values for rate distribution parameters at random.");
if (randomizeInitValues < 0)
{
return;
}
stratDNDS = 1;
resp = 4;
respM = 2;
respN = 1;
respP = 1;
fprintf (stdout, "\nUsing\n\t", respM, " negatively selected classes\n\t", respN, " neutrally evolving classes\n\t",
respP, " positively selected classes\n\n");
categDef1 = "";
categDef1 * 1024;
lfDef = {};
for (fileID = 0; fileID < fileCount; fileID = fileID + 1)
{
lfDef[fileID] = "";
lfDef[fileID] * 1024;
lfDef[fileID] * "Log(";
}
defineRateDistribution ("1_");
categDef_1 = categDef1;
freqStrMx_1 = freqStrMx;
for (mi=0; mi<resp; mi=mi+1)
{
fileID = 0;
if (mi)
{
lfDef[fileID] * "+";
}
lfDef[fileID]*(freqStrMx_1[mi]+"*SITE_LIKELIHOOD["+(fileID*resp+mi)+"]");
}
defineRateDistribution ("2_");
categDef_2 = categDef1;
freqStrMx_2 = freqStrMx;
for (mi=0; mi<resp; mi=mi+1)
{
fileID = 1;
if (mi)
{
lfDef[fileID] * "+";
}
lfDef[fileID]*(freqStrMx_2[mi]+"*SITE_LIKELIHOOD["+(fileID*resp+mi)+"]");
}
lfDef1 = "";
lfDef1 * 128;
lfDef1 * "\"";
for (fileID = 0; fileID < fileCount; fileID = fileID + 1)
{
lfDef[fileID] * ")";
lfDef[fileID] * 0;
lfDef1 * lfDef[fileID];
if (fileID < fileCount - 1)
{
lfDef1 * "+";
}
}
lfDef1 * "\"";
lfDef1 * 0;
ExecuteCommands (categDef_1);
ExecuteCommands (categDef_2);
ModelMatrixDimension = 64;
for (h = 0 ;h<64; h=h+1)
{
if (_Genetic_Code[h]==10)
{
ModelMatrixDimension = ModelMatrixDimension-1;
}
}
SetDialogPrompt ("Save resulting fits to:");
fprintf (PROMPT_FOR_FILE, CLEAR_FILE);
resToPath = LAST_FILE_PATH;
vectorOfFrequencies_1 = BuildCodonFrequencies (observedFreq_1);
vectorOfFrequencies_2 = BuildCodonFrequencies (observedFreq_2);
fprintf (stdout, "Fitting a nucleotide model to approximate branch lengths and get starting nucleotide bias estimates...\n");
nucModelMatrix_1 = {{*,AC_1*t,t,AT_1*t}{AC_1*t,*,CG_1*t,CT_1*t}{t,CG_1*t,*,GT_1*t}{AT_1*t,CT_1*t,GT_1*t,*}};
Model nucModel_1 = (nucModelMatrix_1,observedFreqSingle_1);
DataSetFilter nucFilter_1 = CreateFilter (filteredData_1,1);
Tree nucTree_1 = treeString_1;
nucModelMatrix_2 = {{*,AC_2*t,t,AT_2*t}{AC_2*t,*,CG_2*t,CT_2*t}{t,CG_2*t,*,GT_2*t}{AT_2*t,CT_2*t,GT_2*t,*}};
Model nucModel_2 = (nucModelMatrix_2,observedFreqSingle_2);
DataSetFilter nucFilter_2 = CreateFilter (filteredData_2,1);
Tree nucTree_2 = treeString_2;
LikelihoodFunction nuc_lf = (nucFilter_1, nucTree_1, nucFilter_2, nucTree_2);
Optimize (nuc_res, nuc_lf);
global codonFactor_1 = 0.33;
global codonFactor_2 = 0.33;
lfParts = "";
lfParts * 128;
lfParts * "LikelihoodFunction lf = (filteredData_1,tree_1_0";
for (fileID = 1; fileID <= fileCount; fileID = fileID + 1)
{
rateMultipliers = {{"" ,"AC_"+fileID+"*","" ,"AT_"+fileID+"*"}
{"AC_"+fileID+"*","" ,"CG_"+fileID+"*","CT_"+fileID+"*"}
{"" ,"CG_"+fileID+"*","" ,"GT_"+fileID+"*"}
{"AT_"+fileID+"*","CT_"+fileID+"*","GT_"+fileID+"*","" }
};
for (part = 0; part < resp; part = part + 1)
{
ExecuteCommands ("PopulateModelMatrix(\"rate_matrix_"+fileID+"_"+part+"\",observedFreq_"+fileID+",\"S_"+fileID+"_"+part+"/c_"+fileID+"_scale\",\"NS_"+fileID+"_"+part+"/c_"+fileID+"_scale\");");
ExecuteCommands ("Model MG94model_"+fileID+"_"+part+"= (rate_matrix_"+fileID+"_"+part+",vectorOfFrequencies_"+fileID+",0);");
treeID = "tree_"+fileID+"_"+part;
ExecuteCommands ("Tree "+treeID+"=treeString_" + fileID +";");
if (branchLengths)
{
ExecuteCommands ("ReplicateConstraint (\"this1.?.synRate:=this2.?.t__/codonFactor_"+fileID+"\","+treeID+",nucTree_"+fileID+");");
}
else
{
if (part == 0)
{
ExecuteCommands ("bnames = BranchName(nucTree_" + fileID + ",-1);");
nlfDef = "";
nlfDef * 128;
for (lc = 0; lc < Columns (bnames); lc=lc+1)
{
nlfDef * (treeID + "." + bnames[lc] + ".synRate = nucTree_" + fileID + "." + bnames[lc] + ".t/codonFactor_"+fileID+";");
}
nlfDef * 0;
ExecuteCommands (nlfDef);
}
else
{
ExecuteCommands ("ReplicateConstraint (\"this1.?.synRate:=this2.?.synRate\","+treeID+",tree_"+fileID+"_0);");
}
}
if (part || fileID > 1)
{
lfParts = lfParts + ",filteredData_" + fileID + "," + treeID;
}
}
}
lfParts * 0;
ExecuteCommands (lfParts + "," + lfDef1 + ");");
sumPath = resToPath + ".summary";
treeBranchParameters = 0;
for (fileID = 1; fileID <= fileCount; fileID = fileID + 1)
{
ExecuteCommands ("treeBranchParameters = treeBranchParameters + BranchCount(tree_" + fileID + "_0) + TipCount(tree_" + fileID + "_0);");
}
/*-------------- INDEPENDENT DISTRIBUTIONS ------------------*/
sop = Max(OPTIMIZATION_PRECISION,0.001);
fprintf (stdout, "Running simpler distribution approximations to ensure good convergence...\n");
OPTIMIZATION_PRECISION = 0.1;
P_1_1 := 0;P_1_2 := 0;P_1_3 := 0;
P_2_1 := 0;P_2_2 := 0;P_2_3 := 0;
S_1_0 := 1;S_1_1 := 1;S_1_2 := 1;S_1_3 := 1;
S_2_0 := 1;S_2_1 := 1;S_2_2 := 1;S_2_3 := 1;
R_1_0 := 1;R_1_1 := 1;R_1_2 := 1;
R_2_0 := 1;R_2_1 := 1;R_2_2 := 1;
Optimize (res,lf);
USE_LAST_RESULTS = 1;
SKIP_CONJUGATE_GRADIENT = 1;
d1 = ReportDistributionString(4,freqStrMx_1,"1_",1);
d2 = ReportDistributionString(4,freqStrMx_2,"2_",1);
fprintf (stdout, "\n*** Done with pass 1. Log(L) = ", Format(res[1][0],10,3), " *** \n");
fprintf (stdout, "Approximate rates for data set 1:\n", d1);
fprintf (stdout, "Approximate rates for data set 2:\n", d2);
codonFactor_1 := codonFactor_1__;
codonFactor_2 := codonFactor_2__;
AC_1 := AC_1__; AT_1 := AT_1__; CG_1 := CG_1__; CT_1 := CT_1__; GT_1 := GT_1__;
AC_2 := AC_2__; AT_2 := AT_2__; CG_2 := CG_2__; CT_2 := CT_2__; GT_2 := GT_2__;
fprintf (stdout, "\nGateaux sampling positively selected directions\n");
P_1_1 = 2/filteredData_1.sites;
S_1_3 = 1;
baseLineLL = res[1][0];
LFCompute (lf,LF_START_COMPUTE);
bestDiff = -1e100;
bestAlpha = 1;
bestBeta = 1;
bestLL = -1e100;
step = 0.1;
step2 = 0.25;
v1 = 0.05;
for (v1c = 0; v1c < 10; v1c = v1c + 1)
{
v2 = v1+step;
for (v2c = 0; v2c < 20; v2c = v2c+1)
{
checkASample ("1_0");
v2 = v2 + step2;
}
v1 = v1+step;
}
S_1_0 = bestAlpha;
R_1_0 = bestBeta;
bestAlpha = 1;
bestBeta = 1;
if (bestDiff <= 0)
{
P_1_1 = 0;
}
saveBD = bestDiff;
P_2_1 = 1/filteredData_2.sites;
S_2_3 = 1;
v1 = 0.05;
for (v1c = 0; v1c < 10; v1c = v1c + 1)
{
v2 = v1+step;
for (v2c = 0; v2c < 20; v2c = v2c+1)
{
checkASample ("2_0");
v2 = v2 + step2;
}
v1 = v1+step;
}
LFCompute (lf,LF_DONE_COMPUTE);
S_2_0 = bestAlpha;
R_2_0 = bestBeta;
if (bestDiff <= saveBD)
{
P_2_1 = 0;
}
if (bestDiff > 0)
{
fprintf (stdout, "\nFound a likelihood improvement in the direction (", S_1_0, ",", S_1_0*R_1_0, "), (", S_2_0, ",", S_2_0*R_2_0, "), ",bestDiff," likelihood points\n");
}
Optimize (res,lf);
d1 = ReportDistributionString(4,freqStrMx_1,"1_",1);
d2 = ReportDistributionString(4,freqStrMx_2,"2_",1);
fprintf (stdout, "\n*** Done with pass 2. Log(L) = ", Format(res[1][0],10,3), " *** \n");
fprintf (stdout, "Approximate rates for data set 1:\n", d1);
fprintf (stdout, "Approximate rates for data set 2:\n", d2);
fprintf (stdout, "\nGateaux sampling neutral directions\n");
baseLineLL = res[1][0];
LFCompute (lf,LF_START_COMPUTE);
P_1_2 = 1/(filteredData_1.sites*(1-P_1_1));
bestDiff = -1e100;
bestAlpha = 1;
step = 0.02;
v1 = 0;
for (v1c = 0; v1c < 50; v1c = v1c + 1)
{
v1 = v1 + step;
v2 = v1;
checkASample ("1_1");
}
S_1_1 = bestAlpha;
bestAlpha = 1;
if (bestDiff <= 0)
{
P_1_2 = 0;
}
saveBD = bestDiff;
R_1_1 := 1;
P_2_2 = 1/(filteredData_2.sites*(1-P_2_1));
v1 = 0;
for (v1c = 0; v1c < 50; v1c = v1c + 1)
{
v1 = v1 + step;
v2 = v1;
checkASample ("2_1");
}
LFCompute (lf,LF_DONE_COMPUTE);
S_2_1 = bestAlpha;
R_2_1 := 1;
if (bestDiff <= saveBD)
{
P_2_2 = 0;
}
if (bestDiff > 0)
{
fprintf (stdout, "\nFound a likelihood improvement in the direction (", S_1_1, ",", S_1_1, "), (", S_2_1, ",", S_2_1, "), ",bestDiff," likelihood points\n");
}
Optimize (res,lf);
d1 = ReportDistributionString(4,freqStrMx_1,"1_",1);
d2 = ReportDistributionString(4,freqStrMx_2,"2_",1);
fprintf (stdout, "\n*** Done with pass 3. Log(L) = ", Format(res[1][0],10,3), " *** \n");
fprintf (stdout, "Approximate rates for data set 1:\n", d1);
fprintf (stdout, "Approximate rates for data set 2:\n", d2);
fprintf (stdout, "\nGateaux sampling negative selected directions\n");
baseLineLL = res[1][0];
LFCompute (lf,LF_START_COMPUTE);
bestDiff = -1e100;
bestAlpha = 1;
bestBeta = 1;
step = 1/16;
v1 = 0;
P_1_3 = 2/(filteredData_1.sites*(1-P_1_1)*(1-P_1_2));
S_1_2 = 1;
for (v1c = 0; v1c < 15; v1c = v1c + 1)
{
v1 = v1+step;
v2 = step/2;
for (v2c = 0; v2c < v1c; v2c = v2c+1)
{
checkASample ("1_2");
v2 = v2 + step;
}
}
S_1_2 = bestAlpha;
R_1_2 = bestBeta;
bestAlpha = 1;
bestBeta = 1;
if (bestDiff <= 0)
{
P_1_3 = 0;
}
saveBD = bestDiff;
v1 = 0;
P_2_3 = 2/(filteredData_2.sites*(1-P_2_1)*(1-P_2_2));
S_2_2 = 1;
for (v1c = 0; v1c < 15; v1c = v1c + 1)
{
v1 = v1+step;
v2 = step/2;
for (v2c = 0; v2c < v1c; v2c = v2c+1)
{
checkASample ("2_2");
v2 = v2 + step;
}
}
LFCompute (lf,LF_DONE_COMPUTE);
S_2_2 = bestAlpha;
R_2_2 = bestBeta;
if (bestDiff <= saveBD)
{
P_2_2 = 0;
}
if (bestDiff > 0)
{
fprintf (stdout, "\nFound a likelihood improvement in the direction (", S_1_2, ",", S_1_2*R_1_2, "), (", S_2_2, ",", S_2_2*R_2_2, "), ",bestDiff," likelihood points\n");
}
AC_1 = AC_1; AT_1 = AT_1; CG_1 = CG_1; CT_1 = CT_1; GT_1 = GT_1;
AC_2 = AC_2; AT_2 = AT_2; CG_2 = CG_2; CT_2 = CT_2; GT_2 = GT_2;
if (Abs(modelConstraintString_1))
{
ExecuteCommands (modelConstraintString_1);
}
if (Abs(modelConstraintString_2))
{
ExecuteCommands (modelConstraintString_2);
}
codonFactor_1 = codonFactor_1;
codonFactor_2 = codonFactor_2;
GetString (paramList, lf, -1);
degF = Columns(paramList["Global Independent"]) + 14 - 2*branchLengths + treeBranchParameters;
OPTIMIZATION_PRECISION = sop;
fprintf (stdout, "Running an independent distributions model fit (", degF, " parameters)...\n");
Optimize (res,lf);
LogL = res[1][0];
AIC = 2*(degF-LogL);
AICc = 2*(degF*totalCodonCount/(totalCodonCount-degF-1) - LogL);
fprintf (stdout, "\n\nIndependent distributions model fit summary\n",
"\nLog likelihood:", Format (LogL, 15, 5),
"\nParameters :", Format (degF, 15, 0),
"\nAIC :", Format (AIC, 15, 5),
"\nc-AIC :", Format (AICc, 15, 5),"\n"
);
fprintf (sumPath,CLEAR_FILE,"Independent distributions model fit summary\n",
"\nLog likelihood:", Format (LogL, 15, 5),
"\nParameters :", Format (degF, 15, 0),
"\nAIC :", Format (AIC, 15, 5),
"\nc-AIC :", Format (AICc, 15, 5),"\n");
d1 = ReportDistributionString(4,freqStrMx_1,"1_",0);
d2 = ReportDistributionString(4,freqStrMx_2,"2_",0);
fprintf (stdout, "Inferred rates for data set 1:\n", d1);
fprintf (sumPath, "Inferred rates for data set 1:\n", d1);
fprintf (stdout, "Inferred rates for data set 2:\n", d2);
fprintf (sumPath, "Inferred rates for data set 2:\n", d2);
LIKELIHOOD_FUNCTION_OUTPUT = 6;
fprintf (resToPath,CLEAR_FILE,lf);
/*-------------- SHARED POSITIVE SELECTION STRENGTHS ------------------*/
for (k=0; k<respP; k=k+1)
{
ExecuteCommands ("R_1_"+k+"=0.5(R_1_"+k+"+R_2_"+k+");");
ExecuteCommands ("R_2_"+k+":=R_1_"+k+";");
}
GetString (paramList, lf, -1);
degFPSS = Columns(paramList["Global Independent"]) + 14 - 2*branchLengths + treeBranchParameters;
fprintf (stdout, "\nRunning a shared positive selection strengths model (",degFPSS," parameters)...\n");
Optimize (res_PSS,lf);
LogLPSS = res_PSS[1][0];
AICPSS = 2*(degFPSS-LogLPSS);
AICcPSS = 2*(degFPSS*totalCodonCount/(totalCodonCount-degFPSS-1) - LogLPSS);
fprintf (stdout, "\n\nShared positive selection strengths model fit summary\n",
"\nLog likelihood:", Format (LogLPSS, 15, 5),
"\nParameters :", Format (degFPSS, 15, 0),
"\nAIC :", Format (AICPSS, 15, 5),
"\nc-AIC :", Format (AICcPSS, 15, 5),"\n"
);
fprintf (sumPath, "\n\nShared positive strengths regime model fit summary\n",
"\nLog likelihood:", Format (LogLPSS, 15, 5),
"\nParameters :", Format (degFPSS, 15, 0),
"\nAIC :", Format (AICPSS, 15, 5),
"\nc-AIC :", Format (AICcPSS, 15, 5),"\n"
);
d1 = ReportDistributionString(4,freqStrMx_1,"1_",0);
d2 = ReportDistributionString(4,freqStrMx_2,"2_",0);
fprintf (stdout, "Inferred rates for data set 1:\n", d1);
fprintf (sumPath, "Inferred rates for data set 1:\n", d1);
fprintf (stdout, "Inferred rates for data set 2:\n", d2);
fprintf (sumPath, "Inferred rates for data set 2:\n", d2);
fpath = resToPath + ".SharedStrength";
LIKELIHOOD_FUNCTION_OUTPUT = 6;
fprintf (fpath,CLEAR_FILE,lf);
/*-------------- SHARED POSITIVE SELECTION PROPORTIONS ------------------*/
/* RELOAD INDEPENDENT FIT */
DeleteObject (lf);
ExecuteAFile (resToPath);
for (k=1; k<=respP; k=k+1)
{
ExecuteCommands ("P_1_"+k+"=0.5(P_1_"+k+"+P_2_"+k+");");
ExecuteCommands ("P_2_"+k+":=P_1_"+k+";");
}
GetString (paramList, lf, -1);
degFPSP = Columns(paramList["Global Independent"]) + 14 - 2*branchLengths + treeBranchParameters;
fprintf (stdout, "\nRunning a shared positive selection proportions model (",degFPSP," parameters) ...\n");
Optimize (res_PSP,lf);
LogLPSP = res_PSP[1][0];
AICPSP = 2*(degFPSP-LogLPSP);
AICcPSP = 2*(degFPSP*totalCodonCount/(totalCodonCount-degFPSP-1) - LogLPSP);
fprintf (stdout, "\n\nShared positive selection proportions model fit summary\n",
"\nLog likelihood:", Format (LogLPSP, 15, 5),
"\nParameters :", Format (degFPSP, 15, 0),
"\nAIC :", Format (AICPSP, 15, 5),
"\nc-AIC :", Format (AICcPSP, 15, 5),"\n"
);
fprintf (sumPath, "\n\nShared positive proportions regime model fit summary\n",
"\nLog likelihood:", Format (LogLPSP, 15, 5),
"\nParameters :", Format (degFPSP, 15, 0),
"\nAIC :", Format (AICPSP, 15, 5),
"\nc-AIC :", Format (AICcPSP, 15, 5),"\n"
);
d1 = ReportDistributionString(4,freqStrMx_1,"1_",0);
d2 = ReportDistributionString(4,freqStrMx_2,"2_",0);
fprintf (stdout, "Inferred rates for data set 1:\n", d1);
fprintf (sumPath, "Inferred rates for data set 1:\n", d1);
fprintf (stdout, "Inferred rates for data set 2:\n", d2);
fprintf (sumPath, "Inferred rates for data set 2:\n", d2);
fpath = resToPath + ".SharedProportion";
LIKELIHOOD_FUNCTION_OUTPUT = 6;
fprintf (fpath,CLEAR_FILE,lf);
/*-------------- SHARED POSITIVE SELECTION REGIME ------------------*/
/* RELOAD INDEPENDENT FIT */
DeleteObject (lf);
ExecuteAFile (resToPath);
for (k=0; k<respP; k=k+1)
{
ExecuteCommands ("R_1_"+k+"=0.5(R_1_"+k+"+R_2_"+k+");");
ExecuteCommands ("R_2_"+k+":=R_1_"+k+";");
}
for (k=1; k<=respP; k=k+1)
{
ExecuteCommands ("P_1_"+k+"=0.5(P_1_"+k+"+P_2_"+k+");");
ExecuteCommands ("P_2_"+k+":=P_1_"+k+";");
}
GetString (paramList, lf, -1);
degFPSH = Columns(paramList["Global Independent"]) + 14 - 2*branchLengths + treeBranchParameters;
fprintf (stdout, "\nRunning a shared positive selection regime model (", degFPSH, " parameters)...\n");
Optimize (res_PSH,lf);
LogLPSH = res_PSH[1][0];
AICPSH = 2*(degFPSH-LogLPSH);
AICcPSH = 2*(degFPSH*totalCodonCount/(totalCodonCount-degFPSH-1) - LogLPSH);
fprintf (stdout, "\n\nShared positive selection regime model fit summary\n",
"\nLog likelihood:", Format (LogLPSH, 15, 5),
"\nParameters :", Format (degFPSH, 15, 0),
"\nAIC :", Format (AICPSH, 15, 5),
"\nc-AIC :", Format (AICcPSH, 15, 5),"\n"
);
fprintf (sumPath, "\n\nShared positive selection regime model fit summary\n",
"\nLog likelihood:", Format (LogLPSH, 15, 5),
"\nParameters :", Format (degFPSH, 15, 0),
"\nAIC :", Format (AICPSH, 15, 5),
"\nc-AIC :", Format (AICcPSH, 15, 5),"\n"
);
d1 = ReportDistributionString(4,freqStrMx_1,"1_",0);
d2 = ReportDistributionString(4,freqStrMx_2,"2_",0);
fprintf (stdout, "Inferred rates for data set 1:\n", d1);
fprintf (sumPath, "Inferred rates for data set 1:\n", d1);
fprintf (stdout, "Inferred rates for data set 2:\n", d2);
fprintf (sumPath, "Inferred rates for data set 2:\n", d2);
fpath = resToPath + ".SharedPositiveSelection";
LIKELIHOOD_FUNCTION_OUTPUT = 6;
fprintf (fpath,CLEAR_FILE,lf);
/*-------------- SHARED MODEL ------------------*/
DeleteObject (lf);
ExecuteAFile (resToPath);
for (k=0; k<resp; k=k+1)
{
ExecuteCommands ("S_1_"+k+"=0.5(S_1_"+k+"+S_2_"+k+");");
if (k<respP || k>=respP+respN)
{
ExecuteCommands ("R_1_"+k+"=0.5(R_1_"+k+"+R_2_"+k+");");
}
ExecuteCommands ("S_2_"+k+":=S_1_"+k+";R_2_"+k+":=R_1_"+k+";");
}
for (k=1; k<=resp; k=k+1)
{
ExecuteCommands ("P_1_"+k+"=0.5(P_1_"+k+"+P_2_"+k+");");
ExecuteCommands ("P_2_"+k+":=P_1_"+k+";");
}
GetString (paramList, lf, -1);
degFJ = Columns(paramList["Global Independent"]) + 14 - 2*branchLengths + treeBranchParameters;
fprintf (stdout, "\nRunning a shared distributions model fit (", degFJ, " parameters)...\n");
Optimize (res_J,lf);
LogLJ = res_J[1][0];
AICJ = 2*(degFJ-LogLJ);
AICcJ = 2*(degFJ*totalCodonCount/(totalCodonCount-degFJ-1) - LogLJ);
fprintf (stdout, "\n\nShared distributions model fit summary\n",
"\nLog likelihood:", Format (LogLJ, 15, 5),
"\nParameters :", Format (degFJ, 15, 0),
"\nAIC :", Format (AICJ, 15, 5),
"\nc-AIC :", Format (AICcJ, 15, 5),"\n"
);
fprintf (sumPath, "\n\nShared distributions model fit summary\n",
"\nLog likelihood:", Format (LogLJ, 15, 5),
"\nParameters :", Format (degFJ, 15, 0),
"\nAIC :", Format (AICJ, 15, 5),
"\nc-AIC :", Format (AICcJ, 15, 5),"\n"
);
d1 = ReportDistributionString(4,freqStrMx_1,"1_",0);
fprintf (stdout, "Inferred joint rates:\n", d1);
fprintf (sumPath, "Inferred joint rates:\n", d1);
fpath = resToPath + ".JointAll";
LIKELIHOOD_FUNCTION_OUTPUT = 6;
fprintf (fpath,CLEAR_FILE,lf);
USE_LAST_RESULTS = 0;
SKIP_CONJUGATE_GRADIENT = 0;
LIKELIHOOD_FUNCTION_OUTPUT = 2;
fprintf (stdout, "\nDistribution comparison tests\n",
"\n\tAre the distributions different?",
"\n\t\tLR = ", Format (2*(LogL-LogLJ),10,3),
" DF = ", degF-degFJ,
" p = ", Format(1-CChi2(2*(LogL-LogLJ), degF-degFJ),8,3), "\n");
fprintf (stdout, "\n\tAre selective regimes (dN/dS and proportions) different?",
"\n\t\tLR = ", Format (2*(LogL-LogLPSH),10,3),
" DF = ", degF-degFPSH,
" p = ", Format(1-CChi2(2*(LogL-LogLPSH), degF-degFPSH),8,3), "\n");
fprintf (stdout, "\n\tAre selection strengths (dN/dS) different?",
"\n\t\tLR = ", Format (2*(LogL-LogLPSS),10,3),
" DF = ", degF-degFPSS,
" p = ", Format(1-CChi2(2*(LogL-LogLPSS), degF-degFPSH),8,3), "\n");
fprintf (stdout, "\n\tAre the proportions of codons under selection different?",
"\n\t\tLR = ", Format (2*(LogL-LogLPSP),10,3),
" DF = ", degF-degFPSP,
" p = ", Format(1-CChi2(2*(LogL-LogLPSP), degF-degFPSP),8,3), "\n");
fprintf (sumPath, "\n\nDistribution comparison tests\n",
"\n\tAre the distributions different?",
"\n\t\tLR = ", Format (2*(LogL-LogLJ),10,3),
" DF = ", degF-degFJ,
" p = ", Format(1-CChi2(2*(LogL-LogLJ), degF-degFJ),8,3), "\n");
fprintf (sumPath, "\n\tAre selective regimes (dN/dS and proportions) different?",
"\n\t\tLR = ", Format (2*(LogL-LogLPSH),10,3),
" DF = ", degF-degFPSH,
" p = ", Format(1-CChi2(2*(LogL-LogLPSH), degF-degFPSH),8,3), "\n");
fprintf (sumPath, "\n\tAre selection strengths (dN/dS) different?",
"\n\t\tLR = ", Format (2*(LogL-LogLPSS),10,3),
" DF = ", degF-degFPSS,
" p = ", Format(1-CChi2(2*(LogL-LogLPSS), degF-degFPSH),8,3), "\n");
fprintf (sumPath, "\n\tAre the proportions of codons under selection different?",
"\n\t\tLR = ", Format (2*(LogL-LogLPSP),10,3),
" DF = ", degF-degFPSP,
" p = ", Format(1-CChi2(2*(LogL-LogLPSP), degF-degFPSP),8,3), "\n");
/*------------------------------------------------------------------------------------------------------------*/
function checkASample (whichRate)
{
ExecuteCommands ("S_"+whichRate+"=v1;R_"+whichRate+"=v2/v1;");
LFCompute (lf,res_n);
localDiff = res_n-baseLineLL;
if (localDiff > bestDiff)
{
bestDiff = localDiff;
bestAlpha = v1;
bestBeta = v2/v1;
}
return 0;
}