mirror of
https://github.com/KevinMidboe/linguist.git
synced 2025-10-29 01:30:22 +00:00
1026 lines
29 KiB
Beef
1026 lines
29 KiB
Beef
/* Written by SL Kosakovsky Pond (sergeilkp@mac.com) October 2006 */
|
|
/* This analysis uses a 4 bin general bivariate discrete distributions,
|
|
with 2 negatively selected classes, a positively selected class and a neutral class,
|
|
to compare the distributions of dS and dN in two data sets.
|
|
|
|
The tests done:
|
|
|
|
- Are the distributions the same?
|
|
- Are there the same proportions of sites with dN>dS in the data sets?
|
|
- Do the sites under positive selection share the same dN/dS?
|
|
- Two previous tests combined
|
|
*/
|
|
|
|
RequireVersion ("0.9920061001");
|
|
|
|
/*---------------------------------------------------------------------------------------------------------------------------------------------------*/
|
|
|
|
function BuildCodonFrequencies (obsF)
|
|
{
|
|
PIStop = 1.0;
|
|
result = {ModelMatrixDimension,1};
|
|
hshift = 0;
|
|
|
|
for (h=0; h<64; h=h+1)
|
|
{
|
|
first = h$16;
|
|
second = h%16$4;
|
|
third = h%4;
|
|
if (_Genetic_Code[h]==10)
|
|
{
|
|
hshift = hshift+1;
|
|
PIStop = PIStop-obsF[first][0]*obsF[second][1]*obsF[third][2];
|
|
continue;
|
|
}
|
|
result[h-hshift][0]=obsF[first][0]*obsF[second][1]*obsF[third][2];
|
|
}
|
|
return result*(1.0/PIStop);
|
|
}
|
|
|
|
/*---------------------------------------------------------------------------------------------------------------------------------------------------*/
|
|
|
|
function ReportDistributionString (rc,freqStrMx,infix, skip0)
|
|
{
|
|
distroString = "";
|
|
distroString * 1024;
|
|
|
|
reportMx = {rc,4};
|
|
|
|
distroString * (" dN/dS dS dN Prob\n");
|
|
for (mi=0; mi<rc; mi=mi+1)
|
|
{
|
|
ExecuteCommands ("reportMx[mi][0]=S_"+infix+mi+"/c_"+infix+"scale;");
|
|
ExecuteCommands ("reportMx[mi][1]=NS_"+infix+mi+"/c_"+infix+"scale;");
|
|
reportMx[mi][2] = reportMx[mi][1]/reportMx[mi][0];
|
|
ExecuteCommands ("reportMx[mi][3]="+freqStrMx[mi]+";");
|
|
if (skip0 == 0 || reportMx[mi][3] > 0)
|
|
{
|
|
distroString * (Format(reportMx[mi][2],8,3)+Format(reportMx[mi][0],8,3)+Format(reportMx[mi][1],8,3)+Format(reportMx[mi][3],10,3)+"\n");
|
|
}
|
|
}
|
|
|
|
distroString * 0;
|
|
return distroString;
|
|
}
|
|
|
|
/*---------------------------------------------------------------------------------------------------------------------------------------------------*/
|
|
|
|
function DefineNucleotideBiases (dummy)
|
|
{
|
|
ModelTitle = "MG94x"+modelDesc[0];
|
|
modelConstraintString = "";
|
|
|
|
for (customLoopCounter2=0; customLoopCounter2<6; customLoopCounter2=customLoopCounter2+1)
|
|
{
|
|
if (rateBiasTerms[customLoopCounter2] != "1")
|
|
{
|
|
ExecuteCommands ("global " + rateBiasTerms[customLoopCounter2] + "=1;");
|
|
}
|
|
}
|
|
|
|
for (customLoopCounter2=1; customLoopCounter2<6; customLoopCounter2=customLoopCounter2+1)
|
|
{
|
|
for (customLoopCounter=0; customLoopCounter<customLoopCounter2; customLoopCounter=customLoopCounter+1)
|
|
{
|
|
if (modelDesc[customLoopCounter2]==modelDesc[customLoopCounter])
|
|
{
|
|
ModelTitle = ModelTitle+modelDesc[customLoopCounter2];
|
|
if (rateBiasTerms[customLoopCounter2] == "1")
|
|
{
|
|
modelConstraintString = modelConstraintString + rateBiasTerms[customLoopCounter]+":="+rateBiasTerms[customLoopCounter2]+";";
|
|
}
|
|
else
|
|
{
|
|
modelConstraintString = modelConstraintString + rateBiasTerms[customLoopCounter2]+":="+rateBiasTerms[customLoopCounter]+";";
|
|
}
|
|
break;
|
|
}
|
|
}
|
|
if (customLoopCounter==customLoopCounter2)
|
|
{
|
|
ModelTitle = ModelTitle+modelDesc[customLoopCounter2];
|
|
}
|
|
}
|
|
|
|
|
|
if (Abs(modelConstraintString))
|
|
{
|
|
ExecuteCommands (modelConstraintString);
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
/*---------------------------------------------------------------------------------------------------------------------------------------------------*/
|
|
|
|
function defineRateDistribution (infix)
|
|
{
|
|
ExecuteCommands ("global S_"+infix+"0 = 0.5;S_"+infix+"0:>0.0000001;global NS_"+infix+"0 = 0.1;");
|
|
|
|
for (mi=1; mi<resp; mi=mi+1)
|
|
{
|
|
categDef1 * ("global S_"+infix+mi+"=0.5;S_"+infix+mi+":>0.0000001;\nglobal NS_"+infix+mi+";\n");
|
|
if (randomizeInitValues)
|
|
{
|
|
categDef1*("global P_"+infix+mi+" = Random(0.05,0.95);\nP_"+infix+mi+":<1;\n");
|
|
categDef1*("global S_"+infix+mi+" = Random(0.05,1);");
|
|
}
|
|
else
|
|
{
|
|
categDef1*("global P_"+infix+mi+" = 1/"+(resp+1-mi)+";\nP_"+infix+mi+":<1;\n");
|
|
}
|
|
}
|
|
|
|
freqStrMx = {resp,1};
|
|
if (resp>1)
|
|
{
|
|
freqStrMx[0] = "P_"+infix+"1";
|
|
|
|
for (mi=1; mi<resp-1; mi=mi+1)
|
|
{
|
|
freqStrMx[mi] = "";
|
|
for (mi2=1;mi2<=mi;mi2=mi2+1)
|
|
{
|
|
freqStrMx[mi] = freqStrMx[mi]+"(1-P_"+infix+mi2+")";
|
|
}
|
|
freqStrMx[mi] = freqStrMx[mi]+"P_"+infix+(mi+1);
|
|
}
|
|
freqStrMx[mi] = "";
|
|
|
|
for (mi2=1;mi2<mi;mi2=mi2+1)
|
|
{
|
|
freqStrMx[mi] = freqStrMx[mi]+"(1-P_"+infix+mi2+")";
|
|
}
|
|
freqStrMx[mi] = freqStrMx[mi]+"(1-P_"+infix+mi+")";
|
|
}
|
|
else
|
|
{
|
|
freqStrMx[0] = "1";
|
|
}
|
|
|
|
categDef1*( "\n\nglobal c_"+infix+"scale:=S_"+infix+"0*" + freqStrMx[0]);
|
|
|
|
for (mi=1; mi<resp; mi=mi+1)
|
|
{
|
|
categDef1*( "+S_"+infix+mi+"*" + freqStrMx[mi]);
|
|
}
|
|
|
|
categDef1 * ";";
|
|
|
|
for (mi=0; mi<resp; mi=mi+1)
|
|
{
|
|
categDef1 * ("\nglobal R_"+infix+mi+"=1;NS_"+infix+mi+":=R_"+infix+mi+"*S_"+infix+mi+";\n");
|
|
}
|
|
|
|
if (stratDNDS)
|
|
{
|
|
for (mi=respP; mi<respN+respP; mi=mi+1)
|
|
{
|
|
categDef1 * ("\nR_"+infix+mi+":=1;");
|
|
}
|
|
|
|
if (randomizeInitValues)
|
|
{
|
|
for (mi=respP+respN; mi<resp; mi=mi+1)
|
|
{
|
|
categDef1 * ("\nR_"+infix+mi+":<1;R_"+infix+mi+"="+Random(0.05,0.95)+";");
|
|
}
|
|
|
|
for (mi=0; mi<respP; mi=mi+1)
|
|
{
|
|
categDef1 * ("\nR_"+infix+mi+":>1;R_"+infix+mi+"="+Random(1.05,10)+";");
|
|
}
|
|
}
|
|
else
|
|
{
|
|
for (mi=respN+respP; mi<resp; mi=mi+1)
|
|
{
|
|
categDef1 * ("\nR_"+infix+mi+":<1;R_"+infix+mi+"=1/"+(2+mi-respN-respP)+";");
|
|
}
|
|
|
|
for (mi=0; mi<respP; mi=mi+1)
|
|
{
|
|
categDef1 * ("\nR_"+infix+mi+":>1;R_"+infix+mi+"=2+"+mi+";");
|
|
}
|
|
}
|
|
}
|
|
else
|
|
{
|
|
if (randomizeInitValues)
|
|
{
|
|
for (mi=0; mi<resp; mi=mi+1)
|
|
{
|
|
categDef1 * ("\nR_"+infix+mi+"="+Random(0.05,1.75)+";");
|
|
}
|
|
}
|
|
else
|
|
{
|
|
for (mi=0; mi<resp; mi=mi+1)
|
|
{
|
|
categDef1 * ("\nR_"+infix+mi+"="+(0.1+0.3*mi)+";");
|
|
}
|
|
}
|
|
|
|
}
|
|
|
|
categDef1 * 0;
|
|
return 0;
|
|
}
|
|
/*---------------------------------------------------------------------------------------------------------------------------------------------------*/
|
|
|
|
ChoiceList (branchLengths,"Branch Lengths",1,SKIP_NONE,
|
|
"Codon Model","Jointly optimize rate parameters and branch lengths (slow and thorough)",
|
|
"Nucleotide Model","Estimate branch lengths once, using an appropriate nucleotide model (quick and dirty)."
|
|
);
|
|
|
|
if (branchLengths<0)
|
|
{
|
|
return;
|
|
}
|
|
|
|
fileCount = 2;
|
|
#include "TemplateModels/chooseGeneticCode.def";
|
|
|
|
ExecuteAFile ("2RatesAnalyses/MG94xREVxBivariate.mdl");
|
|
fprintf (stdout, "\nReading input file 1/2\n");
|
|
SetDialogPrompt ("Please specify codon data #1:");
|
|
DataSet ds_1 = ReadDataFile (PROMPT_FOR_FILE);
|
|
fprintf (stdout, "\nData file 1: ", LAST_FILE_PATH, "\n\t", ds_1.species, " sequences and ", ds_1.sites, " nucleotides\n");
|
|
DataSetFilter filteredData_1 = CreateFilter (ds_1,3,"","",GeneticCodeExclusions);
|
|
HarvestFrequencies (observedFreq_1,filteredData_1,3,1,1);
|
|
HarvestFrequencies (observedFreqSingle_1,filteredData_1,1,1,1);
|
|
done = 0;
|
|
while (!done)
|
|
{
|
|
fprintf (stdout,"\nPlease enter a 6 character model designation (e.g:010010 defines HKY85):");
|
|
fscanf (stdin,"String", modelDesc);
|
|
if (Abs(modelDesc)==6)
|
|
{
|
|
done = 1;
|
|
}
|
|
}
|
|
ModelTitle = "MG94x"+modelDesc[0];
|
|
rateBiasTerms = {{"AC_1","1","AT_1","CG_1","CT_1","GT_1"}};
|
|
DefineNucleotideBiases (0);
|
|
modelConstraintString_1 = modelConstraintString;
|
|
ModelTitle_1 = ModelTitle;
|
|
fprintf (stdout, "Using model ", ModelTitle_1, " on alignment 1\n\n");
|
|
#include "queryTree.bf";
|
|
treeString_1 = treeString;
|
|
|
|
|
|
fprintf (stdout, "\nReading input file 2/2\n");
|
|
SetDialogPrompt ("Please specify codon data #2:");
|
|
DataSet ds_2 = ReadDataFile (PROMPT_FOR_FILE);
|
|
fprintf (stdout, "\nData file 2: ", LAST_FILE_PATH, "\n\t", ds_2.species, " sequences and ", ds_2.sites, " nucleotides\n");
|
|
DataSetFilter filteredData_2 = CreateFilter (ds_2,3,"","",GeneticCodeExclusions);
|
|
HarvestFrequencies (observedFreq_2,filteredData_2,3,1,1);
|
|
HarvestFrequencies (observedFreqSingle_2,filteredData_2,1,1,1);
|
|
done = 0;
|
|
while (!done)
|
|
{
|
|
fprintf (stdout,"\nPlease enter a 6 character model designation (e.g:010010 defines HKY85):");
|
|
fscanf (stdin,"String", modelDesc);
|
|
if (Abs(modelDesc)==6)
|
|
{
|
|
done = 1;
|
|
}
|
|
}
|
|
ModelTitle = "MG94x"+modelDesc[0];
|
|
rateBiasTerms = {{"AC_2","1","AT_2","CG_2","CT_2","GT_2"}};
|
|
DefineNucleotideBiases (0);
|
|
modelConstraintString_2 = modelConstraintString;
|
|
ModelTitle_2 = ModelTitle;
|
|
fprintf (stdout, "Using model ", ModelTitle_2, " on alignment 2\n\n");
|
|
#include "queryTree.bf";
|
|
treeString_2 = treeString;
|
|
|
|
totalCodonCount = filteredData_1.sites + filteredData_2.sites;
|
|
|
|
ChoiceList (randomizeInitValues, "Initial Value Options",1,SKIP_NONE,
|
|
"Default", "Use default inital values for rate distribution parameters.",
|
|
"Randomized", "Select initial values for rate distribution parameters at random.");
|
|
|
|
|
|
if (randomizeInitValues < 0)
|
|
{
|
|
return;
|
|
}
|
|
|
|
stratDNDS = 1;
|
|
resp = 4;
|
|
respM = 2;
|
|
respN = 1;
|
|
respP = 1;
|
|
|
|
fprintf (stdout, "\nUsing\n\t", respM, " negatively selected classes\n\t", respN, " neutrally evolving classes\n\t",
|
|
respP, " positively selected classes\n\n");
|
|
|
|
categDef1 = "";
|
|
categDef1 * 1024;
|
|
|
|
lfDef = {};
|
|
|
|
for (fileID = 0; fileID < fileCount; fileID = fileID + 1)
|
|
{
|
|
lfDef[fileID] = "";
|
|
lfDef[fileID] * 1024;
|
|
lfDef[fileID] * "Log(";
|
|
}
|
|
|
|
defineRateDistribution ("1_");
|
|
categDef_1 = categDef1;
|
|
freqStrMx_1 = freqStrMx;
|
|
|
|
for (mi=0; mi<resp; mi=mi+1)
|
|
{
|
|
fileID = 0;
|
|
if (mi)
|
|
{
|
|
lfDef[fileID] * "+";
|
|
}
|
|
lfDef[fileID]*(freqStrMx_1[mi]+"*SITE_LIKELIHOOD["+(fileID*resp+mi)+"]");
|
|
}
|
|
|
|
defineRateDistribution ("2_");
|
|
categDef_2 = categDef1;
|
|
freqStrMx_2 = freqStrMx;
|
|
|
|
for (mi=0; mi<resp; mi=mi+1)
|
|
{
|
|
fileID = 1;
|
|
if (mi)
|
|
{
|
|
lfDef[fileID] * "+";
|
|
}
|
|
lfDef[fileID]*(freqStrMx_2[mi]+"*SITE_LIKELIHOOD["+(fileID*resp+mi)+"]");
|
|
}
|
|
|
|
lfDef1 = "";
|
|
lfDef1 * 128;
|
|
lfDef1 * "\"";
|
|
|
|
for (fileID = 0; fileID < fileCount; fileID = fileID + 1)
|
|
{
|
|
lfDef[fileID] * ")";
|
|
lfDef[fileID] * 0;
|
|
lfDef1 * lfDef[fileID];
|
|
if (fileID < fileCount - 1)
|
|
{
|
|
lfDef1 * "+";
|
|
}
|
|
}
|
|
|
|
lfDef1 * "\"";
|
|
lfDef1 * 0;
|
|
|
|
ExecuteCommands (categDef_1);
|
|
ExecuteCommands (categDef_2);
|
|
|
|
ModelMatrixDimension = 64;
|
|
for (h = 0 ;h<64; h=h+1)
|
|
{
|
|
if (_Genetic_Code[h]==10)
|
|
{
|
|
ModelMatrixDimension = ModelMatrixDimension-1;
|
|
}
|
|
}
|
|
|
|
SetDialogPrompt ("Save resulting fits to:");
|
|
fprintf (PROMPT_FOR_FILE, CLEAR_FILE);
|
|
resToPath = LAST_FILE_PATH;
|
|
|
|
vectorOfFrequencies_1 = BuildCodonFrequencies (observedFreq_1);
|
|
vectorOfFrequencies_2 = BuildCodonFrequencies (observedFreq_2);
|
|
|
|
fprintf (stdout, "Fitting a nucleotide model to approximate branch lengths and get starting nucleotide bias estimates...\n");
|
|
|
|
nucModelMatrix_1 = {{*,AC_1*t,t,AT_1*t}{AC_1*t,*,CG_1*t,CT_1*t}{t,CG_1*t,*,GT_1*t}{AT_1*t,CT_1*t,GT_1*t,*}};
|
|
Model nucModel_1 = (nucModelMatrix_1,observedFreqSingle_1);
|
|
DataSetFilter nucFilter_1 = CreateFilter (filteredData_1,1);
|
|
Tree nucTree_1 = treeString_1;
|
|
|
|
nucModelMatrix_2 = {{*,AC_2*t,t,AT_2*t}{AC_2*t,*,CG_2*t,CT_2*t}{t,CG_2*t,*,GT_2*t}{AT_2*t,CT_2*t,GT_2*t,*}};
|
|
Model nucModel_2 = (nucModelMatrix_2,observedFreqSingle_2);
|
|
DataSetFilter nucFilter_2 = CreateFilter (filteredData_2,1);
|
|
Tree nucTree_2 = treeString_2;
|
|
|
|
LikelihoodFunction nuc_lf = (nucFilter_1, nucTree_1, nucFilter_2, nucTree_2);
|
|
Optimize (nuc_res, nuc_lf);
|
|
|
|
global codonFactor_1 = 0.33;
|
|
global codonFactor_2 = 0.33;
|
|
|
|
lfParts = "";
|
|
lfParts * 128;
|
|
lfParts * "LikelihoodFunction lf = (filteredData_1,tree_1_0";
|
|
|
|
for (fileID = 1; fileID <= fileCount; fileID = fileID + 1)
|
|
{
|
|
rateMultipliers = {{"" ,"AC_"+fileID+"*","" ,"AT_"+fileID+"*"}
|
|
{"AC_"+fileID+"*","" ,"CG_"+fileID+"*","CT_"+fileID+"*"}
|
|
{"" ,"CG_"+fileID+"*","" ,"GT_"+fileID+"*"}
|
|
{"AT_"+fileID+"*","CT_"+fileID+"*","GT_"+fileID+"*","" }
|
|
};
|
|
|
|
for (part = 0; part < resp; part = part + 1)
|
|
{
|
|
ExecuteCommands ("PopulateModelMatrix(\"rate_matrix_"+fileID+"_"+part+"\",observedFreq_"+fileID+",\"S_"+fileID+"_"+part+"/c_"+fileID+"_scale\",\"NS_"+fileID+"_"+part+"/c_"+fileID+"_scale\");");
|
|
ExecuteCommands ("Model MG94model_"+fileID+"_"+part+"= (rate_matrix_"+fileID+"_"+part+",vectorOfFrequencies_"+fileID+",0);");
|
|
|
|
treeID = "tree_"+fileID+"_"+part;
|
|
ExecuteCommands ("Tree "+treeID+"=treeString_" + fileID +";");
|
|
if (branchLengths)
|
|
{
|
|
ExecuteCommands ("ReplicateConstraint (\"this1.?.synRate:=this2.?.t__/codonFactor_"+fileID+"\","+treeID+",nucTree_"+fileID+");");
|
|
}
|
|
else
|
|
{
|
|
if (part == 0)
|
|
{
|
|
ExecuteCommands ("bnames = BranchName(nucTree_" + fileID + ",-1);");
|
|
nlfDef = "";
|
|
nlfDef * 128;
|
|
for (lc = 0; lc < Columns (bnames); lc=lc+1)
|
|
{
|
|
nlfDef * (treeID + "." + bnames[lc] + ".synRate = nucTree_" + fileID + "." + bnames[lc] + ".t/codonFactor_"+fileID+";");
|
|
}
|
|
nlfDef * 0;
|
|
ExecuteCommands (nlfDef);
|
|
}
|
|
else
|
|
{
|
|
ExecuteCommands ("ReplicateConstraint (\"this1.?.synRate:=this2.?.synRate\","+treeID+",tree_"+fileID+"_0);");
|
|
}
|
|
}
|
|
if (part || fileID > 1)
|
|
{
|
|
lfParts = lfParts + ",filteredData_" + fileID + "," + treeID;
|
|
}
|
|
}
|
|
}
|
|
|
|
lfParts * 0;
|
|
ExecuteCommands (lfParts + "," + lfDef1 + ");");
|
|
|
|
sumPath = resToPath + ".summary";
|
|
|
|
treeBranchParameters = 0;
|
|
for (fileID = 1; fileID <= fileCount; fileID = fileID + 1)
|
|
{
|
|
ExecuteCommands ("treeBranchParameters = treeBranchParameters + BranchCount(tree_" + fileID + "_0) + TipCount(tree_" + fileID + "_0);");
|
|
}
|
|
|
|
/*-------------- INDEPENDENT DISTRIBUTIONS ------------------*/
|
|
|
|
sop = Max(OPTIMIZATION_PRECISION,0.001);
|
|
|
|
fprintf (stdout, "Running simpler distribution approximations to ensure good convergence...\n");
|
|
|
|
OPTIMIZATION_PRECISION = 0.1;
|
|
|
|
P_1_1 := 0;P_1_2 := 0;P_1_3 := 0;
|
|
P_2_1 := 0;P_2_2 := 0;P_2_3 := 0;
|
|
|
|
S_1_0 := 1;S_1_1 := 1;S_1_2 := 1;S_1_3 := 1;
|
|
S_2_0 := 1;S_2_1 := 1;S_2_2 := 1;S_2_3 := 1;
|
|
R_1_0 := 1;R_1_1 := 1;R_1_2 := 1;
|
|
R_2_0 := 1;R_2_1 := 1;R_2_2 := 1;
|
|
|
|
Optimize (res,lf);
|
|
USE_LAST_RESULTS = 1;
|
|
SKIP_CONJUGATE_GRADIENT = 1;
|
|
|
|
d1 = ReportDistributionString(4,freqStrMx_1,"1_",1);
|
|
d2 = ReportDistributionString(4,freqStrMx_2,"2_",1);
|
|
|
|
fprintf (stdout, "\n*** Done with pass 1. Log(L) = ", Format(res[1][0],10,3), " *** \n");
|
|
fprintf (stdout, "Approximate rates for data set 1:\n", d1);
|
|
fprintf (stdout, "Approximate rates for data set 2:\n", d2);
|
|
|
|
codonFactor_1 := codonFactor_1__;
|
|
codonFactor_2 := codonFactor_2__;
|
|
|
|
AC_1 := AC_1__; AT_1 := AT_1__; CG_1 := CG_1__; CT_1 := CT_1__; GT_1 := GT_1__;
|
|
AC_2 := AC_2__; AT_2 := AT_2__; CG_2 := CG_2__; CT_2 := CT_2__; GT_2 := GT_2__;
|
|
|
|
|
|
|
|
fprintf (stdout, "\nGateaux sampling positively selected directions\n");
|
|
|
|
P_1_1 = 2/filteredData_1.sites;
|
|
S_1_3 = 1;
|
|
|
|
baseLineLL = res[1][0];
|
|
LFCompute (lf,LF_START_COMPUTE);
|
|
bestDiff = -1e100;
|
|
bestAlpha = 1;
|
|
bestBeta = 1;
|
|
bestLL = -1e100;
|
|
|
|
step = 0.1;
|
|
step2 = 0.25;
|
|
v1 = 0.05;
|
|
|
|
for (v1c = 0; v1c < 10; v1c = v1c + 1)
|
|
{
|
|
v2 = v1+step;
|
|
for (v2c = 0; v2c < 20; v2c = v2c+1)
|
|
{
|
|
checkASample ("1_0");
|
|
v2 = v2 + step2;
|
|
}
|
|
v1 = v1+step;
|
|
}
|
|
|
|
S_1_0 = bestAlpha;
|
|
R_1_0 = bestBeta;
|
|
bestAlpha = 1;
|
|
bestBeta = 1;
|
|
|
|
if (bestDiff <= 0)
|
|
{
|
|
P_1_1 = 0;
|
|
}
|
|
saveBD = bestDiff;
|
|
|
|
P_2_1 = 1/filteredData_2.sites;
|
|
S_2_3 = 1;
|
|
|
|
v1 = 0.05;
|
|
for (v1c = 0; v1c < 10; v1c = v1c + 1)
|
|
{
|
|
v2 = v1+step;
|
|
for (v2c = 0; v2c < 20; v2c = v2c+1)
|
|
{
|
|
checkASample ("2_0");
|
|
v2 = v2 + step2;
|
|
}
|
|
v1 = v1+step;
|
|
}
|
|
|
|
LFCompute (lf,LF_DONE_COMPUTE);
|
|
|
|
S_2_0 = bestAlpha;
|
|
R_2_0 = bestBeta;
|
|
|
|
if (bestDiff <= saveBD)
|
|
{
|
|
P_2_1 = 0;
|
|
}
|
|
|
|
if (bestDiff > 0)
|
|
{
|
|
fprintf (stdout, "\nFound a likelihood improvement in the direction (", S_1_0, ",", S_1_0*R_1_0, "), (", S_2_0, ",", S_2_0*R_2_0, "), ",bestDiff," likelihood points\n");
|
|
}
|
|
|
|
|
|
Optimize (res,lf);
|
|
|
|
d1 = ReportDistributionString(4,freqStrMx_1,"1_",1);
|
|
d2 = ReportDistributionString(4,freqStrMx_2,"2_",1);
|
|
|
|
fprintf (stdout, "\n*** Done with pass 2. Log(L) = ", Format(res[1][0],10,3), " *** \n");
|
|
fprintf (stdout, "Approximate rates for data set 1:\n", d1);
|
|
fprintf (stdout, "Approximate rates for data set 2:\n", d2);
|
|
|
|
|
|
fprintf (stdout, "\nGateaux sampling neutral directions\n");
|
|
|
|
baseLineLL = res[1][0];
|
|
LFCompute (lf,LF_START_COMPUTE);
|
|
|
|
P_1_2 = 1/(filteredData_1.sites*(1-P_1_1));
|
|
bestDiff = -1e100;
|
|
bestAlpha = 1;
|
|
step = 0.02;
|
|
v1 = 0;
|
|
for (v1c = 0; v1c < 50; v1c = v1c + 1)
|
|
{
|
|
v1 = v1 + step;
|
|
v2 = v1;
|
|
|
|
checkASample ("1_1");
|
|
}
|
|
|
|
|
|
S_1_1 = bestAlpha;
|
|
bestAlpha = 1;
|
|
|
|
if (bestDiff <= 0)
|
|
{
|
|
P_1_2 = 0;
|
|
}
|
|
saveBD = bestDiff;
|
|
R_1_1 := 1;
|
|
|
|
P_2_2 = 1/(filteredData_2.sites*(1-P_2_1));
|
|
|
|
v1 = 0;
|
|
for (v1c = 0; v1c < 50; v1c = v1c + 1)
|
|
{
|
|
v1 = v1 + step;
|
|
v2 = v1;
|
|
|
|
checkASample ("2_1");
|
|
}
|
|
|
|
|
|
LFCompute (lf,LF_DONE_COMPUTE);
|
|
|
|
S_2_1 = bestAlpha;
|
|
R_2_1 := 1;
|
|
|
|
if (bestDiff <= saveBD)
|
|
{
|
|
P_2_2 = 0;
|
|
}
|
|
|
|
if (bestDiff > 0)
|
|
{
|
|
fprintf (stdout, "\nFound a likelihood improvement in the direction (", S_1_1, ",", S_1_1, "), (", S_2_1, ",", S_2_1, "), ",bestDiff," likelihood points\n");
|
|
}
|
|
|
|
Optimize (res,lf);
|
|
d1 = ReportDistributionString(4,freqStrMx_1,"1_",1);
|
|
d2 = ReportDistributionString(4,freqStrMx_2,"2_",1);
|
|
|
|
fprintf (stdout, "\n*** Done with pass 3. Log(L) = ", Format(res[1][0],10,3), " *** \n");
|
|
fprintf (stdout, "Approximate rates for data set 1:\n", d1);
|
|
fprintf (stdout, "Approximate rates for data set 2:\n", d2);
|
|
|
|
fprintf (stdout, "\nGateaux sampling negative selected directions\n");
|
|
|
|
|
|
baseLineLL = res[1][0];
|
|
LFCompute (lf,LF_START_COMPUTE);
|
|
bestDiff = -1e100;
|
|
bestAlpha = 1;
|
|
bestBeta = 1;
|
|
|
|
|
|
step = 1/16;
|
|
v1 = 0;
|
|
P_1_3 = 2/(filteredData_1.sites*(1-P_1_1)*(1-P_1_2));
|
|
S_1_2 = 1;
|
|
for (v1c = 0; v1c < 15; v1c = v1c + 1)
|
|
{
|
|
v1 = v1+step;
|
|
v2 = step/2;
|
|
for (v2c = 0; v2c < v1c; v2c = v2c+1)
|
|
{
|
|
checkASample ("1_2");
|
|
v2 = v2 + step;
|
|
}
|
|
}
|
|
|
|
|
|
S_1_2 = bestAlpha;
|
|
R_1_2 = bestBeta;
|
|
bestAlpha = 1;
|
|
bestBeta = 1;
|
|
|
|
if (bestDiff <= 0)
|
|
{
|
|
P_1_3 = 0;
|
|
}
|
|
saveBD = bestDiff;
|
|
|
|
|
|
v1 = 0;
|
|
P_2_3 = 2/(filteredData_2.sites*(1-P_2_1)*(1-P_2_2));
|
|
S_2_2 = 1;
|
|
for (v1c = 0; v1c < 15; v1c = v1c + 1)
|
|
{
|
|
v1 = v1+step;
|
|
v2 = step/2;
|
|
for (v2c = 0; v2c < v1c; v2c = v2c+1)
|
|
{
|
|
checkASample ("2_2");
|
|
v2 = v2 + step;
|
|
}
|
|
}
|
|
|
|
LFCompute (lf,LF_DONE_COMPUTE);
|
|
|
|
S_2_2 = bestAlpha;
|
|
R_2_2 = bestBeta;
|
|
|
|
if (bestDiff <= saveBD)
|
|
{
|
|
P_2_2 = 0;
|
|
}
|
|
|
|
if (bestDiff > 0)
|
|
{
|
|
fprintf (stdout, "\nFound a likelihood improvement in the direction (", S_1_2, ",", S_1_2*R_1_2, "), (", S_2_2, ",", S_2_2*R_2_2, "), ",bestDiff," likelihood points\n");
|
|
}
|
|
|
|
AC_1 = AC_1; AT_1 = AT_1; CG_1 = CG_1; CT_1 = CT_1; GT_1 = GT_1;
|
|
AC_2 = AC_2; AT_2 = AT_2; CG_2 = CG_2; CT_2 = CT_2; GT_2 = GT_2;
|
|
|
|
if (Abs(modelConstraintString_1))
|
|
{
|
|
ExecuteCommands (modelConstraintString_1);
|
|
}
|
|
if (Abs(modelConstraintString_2))
|
|
{
|
|
ExecuteCommands (modelConstraintString_2);
|
|
}
|
|
|
|
codonFactor_1 = codonFactor_1;
|
|
codonFactor_2 = codonFactor_2;
|
|
|
|
|
|
GetString (paramList, lf, -1);
|
|
degF = Columns(paramList["Global Independent"]) + 14 - 2*branchLengths + treeBranchParameters;
|
|
|
|
OPTIMIZATION_PRECISION = sop;
|
|
fprintf (stdout, "Running an independent distributions model fit (", degF, " parameters)...\n");
|
|
Optimize (res,lf);
|
|
LogL = res[1][0];
|
|
|
|
AIC = 2*(degF-LogL);
|
|
AICc = 2*(degF*totalCodonCount/(totalCodonCount-degF-1) - LogL);
|
|
|
|
fprintf (stdout, "\n\nIndependent distributions model fit summary\n",
|
|
"\nLog likelihood:", Format (LogL, 15, 5),
|
|
"\nParameters :", Format (degF, 15, 0),
|
|
"\nAIC :", Format (AIC, 15, 5),
|
|
"\nc-AIC :", Format (AICc, 15, 5),"\n"
|
|
);
|
|
|
|
fprintf (sumPath,CLEAR_FILE,"Independent distributions model fit summary\n",
|
|
"\nLog likelihood:", Format (LogL, 15, 5),
|
|
"\nParameters :", Format (degF, 15, 0),
|
|
"\nAIC :", Format (AIC, 15, 5),
|
|
"\nc-AIC :", Format (AICc, 15, 5),"\n");
|
|
|
|
d1 = ReportDistributionString(4,freqStrMx_1,"1_",0);
|
|
d2 = ReportDistributionString(4,freqStrMx_2,"2_",0);
|
|
|
|
fprintf (stdout, "Inferred rates for data set 1:\n", d1);
|
|
fprintf (sumPath, "Inferred rates for data set 1:\n", d1);
|
|
fprintf (stdout, "Inferred rates for data set 2:\n", d2);
|
|
fprintf (sumPath, "Inferred rates for data set 2:\n", d2);
|
|
|
|
LIKELIHOOD_FUNCTION_OUTPUT = 6;
|
|
fprintf (resToPath,CLEAR_FILE,lf);
|
|
|
|
|
|
/*-------------- SHARED POSITIVE SELECTION STRENGTHS ------------------*/
|
|
|
|
for (k=0; k<respP; k=k+1)
|
|
{
|
|
ExecuteCommands ("R_1_"+k+"=0.5(R_1_"+k+"+R_2_"+k+");");
|
|
ExecuteCommands ("R_2_"+k+":=R_1_"+k+";");
|
|
}
|
|
|
|
GetString (paramList, lf, -1);
|
|
degFPSS = Columns(paramList["Global Independent"]) + 14 - 2*branchLengths + treeBranchParameters;
|
|
|
|
fprintf (stdout, "\nRunning a shared positive selection strengths model (",degFPSS," parameters)...\n");
|
|
Optimize (res_PSS,lf);
|
|
LogLPSS = res_PSS[1][0];
|
|
|
|
AICPSS = 2*(degFPSS-LogLPSS);
|
|
AICcPSS = 2*(degFPSS*totalCodonCount/(totalCodonCount-degFPSS-1) - LogLPSS);
|
|
|
|
fprintf (stdout, "\n\nShared positive selection strengths model fit summary\n",
|
|
"\nLog likelihood:", Format (LogLPSS, 15, 5),
|
|
"\nParameters :", Format (degFPSS, 15, 0),
|
|
"\nAIC :", Format (AICPSS, 15, 5),
|
|
"\nc-AIC :", Format (AICcPSS, 15, 5),"\n"
|
|
);
|
|
|
|
fprintf (sumPath, "\n\nShared positive strengths regime model fit summary\n",
|
|
"\nLog likelihood:", Format (LogLPSS, 15, 5),
|
|
"\nParameters :", Format (degFPSS, 15, 0),
|
|
"\nAIC :", Format (AICPSS, 15, 5),
|
|
"\nc-AIC :", Format (AICcPSS, 15, 5),"\n"
|
|
);
|
|
|
|
d1 = ReportDistributionString(4,freqStrMx_1,"1_",0);
|
|
d2 = ReportDistributionString(4,freqStrMx_2,"2_",0);
|
|
|
|
fprintf (stdout, "Inferred rates for data set 1:\n", d1);
|
|
fprintf (sumPath, "Inferred rates for data set 1:\n", d1);
|
|
fprintf (stdout, "Inferred rates for data set 2:\n", d2);
|
|
fprintf (sumPath, "Inferred rates for data set 2:\n", d2);
|
|
fpath = resToPath + ".SharedStrength";
|
|
LIKELIHOOD_FUNCTION_OUTPUT = 6;
|
|
fprintf (fpath,CLEAR_FILE,lf);
|
|
|
|
/*-------------- SHARED POSITIVE SELECTION PROPORTIONS ------------------*/
|
|
|
|
/* RELOAD INDEPENDENT FIT */
|
|
DeleteObject (lf);
|
|
ExecuteAFile (resToPath);
|
|
|
|
for (k=1; k<=respP; k=k+1)
|
|
{
|
|
ExecuteCommands ("P_1_"+k+"=0.5(P_1_"+k+"+P_2_"+k+");");
|
|
ExecuteCommands ("P_2_"+k+":=P_1_"+k+";");
|
|
}
|
|
|
|
GetString (paramList, lf, -1);
|
|
degFPSP = Columns(paramList["Global Independent"]) + 14 - 2*branchLengths + treeBranchParameters;
|
|
fprintf (stdout, "\nRunning a shared positive selection proportions model (",degFPSP," parameters) ...\n");
|
|
Optimize (res_PSP,lf);
|
|
LogLPSP = res_PSP[1][0];
|
|
|
|
|
|
AICPSP = 2*(degFPSP-LogLPSP);
|
|
AICcPSP = 2*(degFPSP*totalCodonCount/(totalCodonCount-degFPSP-1) - LogLPSP);
|
|
|
|
fprintf (stdout, "\n\nShared positive selection proportions model fit summary\n",
|
|
"\nLog likelihood:", Format (LogLPSP, 15, 5),
|
|
"\nParameters :", Format (degFPSP, 15, 0),
|
|
"\nAIC :", Format (AICPSP, 15, 5),
|
|
"\nc-AIC :", Format (AICcPSP, 15, 5),"\n"
|
|
);
|
|
|
|
fprintf (sumPath, "\n\nShared positive proportions regime model fit summary\n",
|
|
"\nLog likelihood:", Format (LogLPSP, 15, 5),
|
|
"\nParameters :", Format (degFPSP, 15, 0),
|
|
"\nAIC :", Format (AICPSP, 15, 5),
|
|
"\nc-AIC :", Format (AICcPSP, 15, 5),"\n"
|
|
);
|
|
|
|
d1 = ReportDistributionString(4,freqStrMx_1,"1_",0);
|
|
d2 = ReportDistributionString(4,freqStrMx_2,"2_",0);
|
|
|
|
fprintf (stdout, "Inferred rates for data set 1:\n", d1);
|
|
fprintf (sumPath, "Inferred rates for data set 1:\n", d1);
|
|
fprintf (stdout, "Inferred rates for data set 2:\n", d2);
|
|
fprintf (sumPath, "Inferred rates for data set 2:\n", d2);
|
|
fpath = resToPath + ".SharedProportion";
|
|
LIKELIHOOD_FUNCTION_OUTPUT = 6;
|
|
fprintf (fpath,CLEAR_FILE,lf);
|
|
|
|
|
|
/*-------------- SHARED POSITIVE SELECTION REGIME ------------------*/
|
|
|
|
/* RELOAD INDEPENDENT FIT */
|
|
DeleteObject (lf);
|
|
ExecuteAFile (resToPath);
|
|
|
|
for (k=0; k<respP; k=k+1)
|
|
{
|
|
ExecuteCommands ("R_1_"+k+"=0.5(R_1_"+k+"+R_2_"+k+");");
|
|
ExecuteCommands ("R_2_"+k+":=R_1_"+k+";");
|
|
}
|
|
for (k=1; k<=respP; k=k+1)
|
|
{
|
|
ExecuteCommands ("P_1_"+k+"=0.5(P_1_"+k+"+P_2_"+k+");");
|
|
ExecuteCommands ("P_2_"+k+":=P_1_"+k+";");
|
|
}
|
|
|
|
GetString (paramList, lf, -1);
|
|
degFPSH = Columns(paramList["Global Independent"]) + 14 - 2*branchLengths + treeBranchParameters;
|
|
|
|
fprintf (stdout, "\nRunning a shared positive selection regime model (", degFPSH, " parameters)...\n");
|
|
Optimize (res_PSH,lf);
|
|
LogLPSH = res_PSH[1][0];
|
|
|
|
AICPSH = 2*(degFPSH-LogLPSH);
|
|
AICcPSH = 2*(degFPSH*totalCodonCount/(totalCodonCount-degFPSH-1) - LogLPSH);
|
|
|
|
fprintf (stdout, "\n\nShared positive selection regime model fit summary\n",
|
|
"\nLog likelihood:", Format (LogLPSH, 15, 5),
|
|
"\nParameters :", Format (degFPSH, 15, 0),
|
|
"\nAIC :", Format (AICPSH, 15, 5),
|
|
"\nc-AIC :", Format (AICcPSH, 15, 5),"\n"
|
|
);
|
|
|
|
fprintf (sumPath, "\n\nShared positive selection regime model fit summary\n",
|
|
"\nLog likelihood:", Format (LogLPSH, 15, 5),
|
|
"\nParameters :", Format (degFPSH, 15, 0),
|
|
"\nAIC :", Format (AICPSH, 15, 5),
|
|
"\nc-AIC :", Format (AICcPSH, 15, 5),"\n"
|
|
);
|
|
|
|
d1 = ReportDistributionString(4,freqStrMx_1,"1_",0);
|
|
d2 = ReportDistributionString(4,freqStrMx_2,"2_",0);
|
|
|
|
fprintf (stdout, "Inferred rates for data set 1:\n", d1);
|
|
fprintf (sumPath, "Inferred rates for data set 1:\n", d1);
|
|
fprintf (stdout, "Inferred rates for data set 2:\n", d2);
|
|
fprintf (sumPath, "Inferred rates for data set 2:\n", d2);
|
|
fpath = resToPath + ".SharedPositiveSelection";
|
|
LIKELIHOOD_FUNCTION_OUTPUT = 6;
|
|
fprintf (fpath,CLEAR_FILE,lf);
|
|
|
|
/*-------------- SHARED MODEL ------------------*/
|
|
|
|
DeleteObject (lf);
|
|
ExecuteAFile (resToPath);
|
|
|
|
for (k=0; k<resp; k=k+1)
|
|
{
|
|
ExecuteCommands ("S_1_"+k+"=0.5(S_1_"+k+"+S_2_"+k+");");
|
|
if (k<respP || k>=respP+respN)
|
|
{
|
|
ExecuteCommands ("R_1_"+k+"=0.5(R_1_"+k+"+R_2_"+k+");");
|
|
}
|
|
ExecuteCommands ("S_2_"+k+":=S_1_"+k+";R_2_"+k+":=R_1_"+k+";");
|
|
}
|
|
for (k=1; k<=resp; k=k+1)
|
|
{
|
|
ExecuteCommands ("P_1_"+k+"=0.5(P_1_"+k+"+P_2_"+k+");");
|
|
ExecuteCommands ("P_2_"+k+":=P_1_"+k+";");
|
|
}
|
|
|
|
GetString (paramList, lf, -1);
|
|
degFJ = Columns(paramList["Global Independent"]) + 14 - 2*branchLengths + treeBranchParameters;
|
|
fprintf (stdout, "\nRunning a shared distributions model fit (", degFJ, " parameters)...\n");
|
|
|
|
Optimize (res_J,lf);
|
|
LogLJ = res_J[1][0];
|
|
|
|
AICJ = 2*(degFJ-LogLJ);
|
|
AICcJ = 2*(degFJ*totalCodonCount/(totalCodonCount-degFJ-1) - LogLJ);
|
|
|
|
fprintf (stdout, "\n\nShared distributions model fit summary\n",
|
|
"\nLog likelihood:", Format (LogLJ, 15, 5),
|
|
"\nParameters :", Format (degFJ, 15, 0),
|
|
"\nAIC :", Format (AICJ, 15, 5),
|
|
"\nc-AIC :", Format (AICcJ, 15, 5),"\n"
|
|
);
|
|
|
|
fprintf (sumPath, "\n\nShared distributions model fit summary\n",
|
|
"\nLog likelihood:", Format (LogLJ, 15, 5),
|
|
"\nParameters :", Format (degFJ, 15, 0),
|
|
"\nAIC :", Format (AICJ, 15, 5),
|
|
"\nc-AIC :", Format (AICcJ, 15, 5),"\n"
|
|
);
|
|
|
|
d1 = ReportDistributionString(4,freqStrMx_1,"1_",0);
|
|
|
|
fprintf (stdout, "Inferred joint rates:\n", d1);
|
|
fprintf (sumPath, "Inferred joint rates:\n", d1);
|
|
|
|
fpath = resToPath + ".JointAll";
|
|
LIKELIHOOD_FUNCTION_OUTPUT = 6;
|
|
fprintf (fpath,CLEAR_FILE,lf);
|
|
|
|
USE_LAST_RESULTS = 0;
|
|
SKIP_CONJUGATE_GRADIENT = 0;
|
|
LIKELIHOOD_FUNCTION_OUTPUT = 2;
|
|
|
|
fprintf (stdout, "\nDistribution comparison tests\n",
|
|
"\n\tAre the distributions different?",
|
|
"\n\t\tLR = ", Format (2*(LogL-LogLJ),10,3),
|
|
" DF = ", degF-degFJ,
|
|
" p = ", Format(1-CChi2(2*(LogL-LogLJ), degF-degFJ),8,3), "\n");
|
|
|
|
fprintf (stdout, "\n\tAre selective regimes (dN/dS and proportions) different?",
|
|
"\n\t\tLR = ", Format (2*(LogL-LogLPSH),10,3),
|
|
" DF = ", degF-degFPSH,
|
|
" p = ", Format(1-CChi2(2*(LogL-LogLPSH), degF-degFPSH),8,3), "\n");
|
|
|
|
fprintf (stdout, "\n\tAre selection strengths (dN/dS) different?",
|
|
"\n\t\tLR = ", Format (2*(LogL-LogLPSS),10,3),
|
|
" DF = ", degF-degFPSS,
|
|
" p = ", Format(1-CChi2(2*(LogL-LogLPSS), degF-degFPSH),8,3), "\n");
|
|
|
|
fprintf (stdout, "\n\tAre the proportions of codons under selection different?",
|
|
"\n\t\tLR = ", Format (2*(LogL-LogLPSP),10,3),
|
|
" DF = ", degF-degFPSP,
|
|
" p = ", Format(1-CChi2(2*(LogL-LogLPSP), degF-degFPSP),8,3), "\n");
|
|
|
|
fprintf (sumPath, "\n\nDistribution comparison tests\n",
|
|
"\n\tAre the distributions different?",
|
|
"\n\t\tLR = ", Format (2*(LogL-LogLJ),10,3),
|
|
" DF = ", degF-degFJ,
|
|
" p = ", Format(1-CChi2(2*(LogL-LogLJ), degF-degFJ),8,3), "\n");
|
|
|
|
fprintf (sumPath, "\n\tAre selective regimes (dN/dS and proportions) different?",
|
|
"\n\t\tLR = ", Format (2*(LogL-LogLPSH),10,3),
|
|
" DF = ", degF-degFPSH,
|
|
" p = ", Format(1-CChi2(2*(LogL-LogLPSH), degF-degFPSH),8,3), "\n");
|
|
|
|
fprintf (sumPath, "\n\tAre selection strengths (dN/dS) different?",
|
|
"\n\t\tLR = ", Format (2*(LogL-LogLPSS),10,3),
|
|
" DF = ", degF-degFPSS,
|
|
" p = ", Format(1-CChi2(2*(LogL-LogLPSS), degF-degFPSH),8,3), "\n");
|
|
|
|
fprintf (sumPath, "\n\tAre the proportions of codons under selection different?",
|
|
"\n\t\tLR = ", Format (2*(LogL-LogLPSP),10,3),
|
|
" DF = ", degF-degFPSP,
|
|
" p = ", Format(1-CChi2(2*(LogL-LogLPSP), degF-degFPSP),8,3), "\n");
|
|
/*------------------------------------------------------------------------------------------------------------*/
|
|
|
|
function checkASample (whichRate)
|
|
{
|
|
ExecuteCommands ("S_"+whichRate+"=v1;R_"+whichRate+"=v2/v1;");
|
|
LFCompute (lf,res_n);
|
|
localDiff = res_n-baseLineLL;
|
|
if (localDiff > bestDiff)
|
|
{
|
|
bestDiff = localDiff;
|
|
bestAlpha = v1;
|
|
bestBeta = v2/v1;
|
|
}
|
|
return 0;
|
|
}
|