From 585d74ecc995d2fda7107d72f212c6c34863647f Mon Sep 17 00:00:00 2001 From: Mike Doud Date: Sat, 2 May 2015 13:24:43 -0700 Subject: [PATCH] add better HyPhy samples --- samples/HyPhy/AAModelComparison.bf | 252 + samples/HyPhy/CodonModelCompare.bf | 1046 ++ samples/HyPhy/MFPositiveSelection.bf | 1113 ++ samples/HyPhy/MolecularClock.bf | 147 + samples/HyPhy/dNdSDistributionComparison.bf | 1025 ++ samples/HyPhy/hyphy_cmds.bf | 11003 ++++++++++++++++++ 6 files changed, 14586 insertions(+) create mode 100644 samples/HyPhy/AAModelComparison.bf create mode 100644 samples/HyPhy/CodonModelCompare.bf create mode 100644 samples/HyPhy/MFPositiveSelection.bf create mode 100644 samples/HyPhy/MolecularClock.bf create mode 100644 samples/HyPhy/dNdSDistributionComparison.bf create mode 100644 samples/HyPhy/hyphy_cmds.bf diff --git a/samples/HyPhy/AAModelComparison.bf b/samples/HyPhy/AAModelComparison.bf new file mode 100644 index 00000000..602a9e56 --- /dev/null +++ b/samples/HyPhy/AAModelComparison.bf @@ -0,0 +1,252 @@ +if (Rows (modelMatrixList) == 0) +{ + modelMatrixList = + { + {"Equal Input", "EIAA.mdl", "19"} + {"Dayhoff","Dayhoff.mdl","0"} + {"Dayhoff+F","Dayhoff_F.mdl","19"} + {"JTT", "Jones.mdl", "0"} + {"JTT+F", "Jones_F.mdl", "19"} + {"WAG", "WAG.mdl", "0"} + {"WAG+F", "WAG_F.mdl", "19"} + {"rtREV", "rtREV.mdl", "0"} + {"rtREV+F", "rtREV_F.mdl", "19"} + {"mtMAM", "mtMAM.mdl", "0"} + {"mtMAM+F", "mtMAM_F.mdl", "19"} + {"mtREV 24", "mtREV_24.mdl", "0"} + {"mtREV 24+F", "mtREV_24_F.mdl", "19"} + {"HIV within", "HIVwithin.mdl", "0"} + {"HIV within+F", "HIVwithin+F.mdl", "19"} + {"HIV between", "HIVbetween.mdl", "0"} + {"HIV between+F", "HIVbetween+F.mdl", "19"} + {"REV-1 step", "reducedREV.mdl", "19"} + {"REV", "mtREV.mdl", "19"} + }; +} + +/*___________________________________________________________________________________________________________*/ + +function runAModel (modelID, fileName, xtraP, midx) +{ + ExecuteCommands ("#include \"TemplateModels/"+fileName+"\";"); + Tree givenTree = treeString; + LikelihoodFunction lf = (filteredData,givenTree); + + GetString (lf_info, lf, -1); + locals = lf_info["Local Independent"]; + + if (Columns (branchLengthStash)) + { + USE_LAST_RESULTS = 1; + for (_iv = 0; _iv < Columns (locals); _iv = _iv+1) + { + ExecuteCommands (locals[_iv] + "=1;\n"); + } + currentBL = BranchLength (givenTree,0); + currentBN = BranchName (givenTree,-1); + for (_iv = 0; _iv < Columns (currentBN); _iv = _iv+1) + { + ExecuteCommands ("givenTree."+currentBN[_iv]+".t="+branchLengthStash[_iv]/currentBL+";"); + + } + } + else + { + for (_iv = 0; _iv < Columns (locals); _iv = _iv+1) + { + ExecuteCommands (locals[_iv] + "=0.1;\n"); + } + USE_LAST_RESULTS = 1; + } + + Optimize (res,lf); + + fprintf (stdout, "| ", modelID); + for (k=0; k1) + { + cAIC = 2(-res[1][0]+params*(filteredData.sites/(filteredData.sites-params-1))); + } + else + { + cAIC = 0; + } + + branchLengths = BranchLength (givenTree,-1); + TL = 0; + for (k=Rows(branchLengths)*Columns(branchLengths)-1; k>=0; k=k-1) + { + TL = TL + branchLengths[k]; + } + + fprintf (stdout, "| ", Format (res[1][0],14,3), " | ", Format (params,5,0), " | ", + Format (AIC, 9,3), " | ",); + + if (cAIC > 0) + { + fprintf (stdout, Format (cAIC,11,3), " | "); + } + else + { + fprintf (stdout, " N/A | "); + } + + fprintf (stdout, Format (TL,11,3), " |\n", sepString); + + resultMatrix[midx][0] = res[1][0]; + resultMatrix[midx][1] = params; + resultMatrix[midx][2] = AIC; + resultMatrix[midx][3] = cAIC; + resultMatrix[midx][4] = TL; + + if (AIC < bestAIC) + { + bestAIC = AIC; + bestAICidx = midx; + branchLengthStash = BranchLength (givenTree,-1); + } + + if (cAIC > 0) + { + if (bestCAIC > cAIC) + { + bestCAIC = cAIC; + bestCAICidx = midx; + + } + } + + return 0; +} + + +/*___________________________________________________________________________________________________________*/ + + + +maxModelWidth = 7; +skipCodeSelectionStep = 0; + +ChoiceList (doREV, "Include REV?", 1, SKIP_NONE, "Yes", "Include REV and reduced REV models. CAUTION: these models take a long time to fit.", + "No", "Only use empirical models"); + +if (doREV < 0) +{ + return 0; +} + +if (doREV == 0) +{ + #include "TemplateModels/chooseGeneticCode.def"; + skipCodeSelectionStep = 1; +} + +modelCount = Rows (modelMatrixList) - 2*doREV; + +for (k=0; k=0) +{ + fprintf (stdout, "\n\nBest c-AIC model:\n\t", modelMatrixList[bestCAICidx][0], " with the score of ", bestCAIC); +} + +labelMatrix = {{"Log-likelihood","Parameters","AIC","c-AIC","Total tree length",""}}; + +aaString = "Model"; + +for (fC = 0; fC < modelCount; fC = fC+1) +{ + aaString = aaString + ";" + modelMatrixList[fC][0]; +} + +USE_LAST_RESULTS = 0; + +labelMatrix[5] = aaString; +skipCodeSelectionStep = 0; +OpenWindow (CHARTWINDOW,{{"Model Fits"} + {"labelMatrix"}, + {"resultMatrix"}, + {"Bar Chart"}, + {"Index"}, + {"c-AIC"}, + {"Model Index"}, + {""}, + {"AIC"} + }, + "SCREEN_WIDTH-60;SCREEN_HEIGHT-60;30;30"); diff --git a/samples/HyPhy/CodonModelCompare.bf b/samples/HyPhy/CodonModelCompare.bf new file mode 100644 index 00000000..e24b730d --- /dev/null +++ b/samples/HyPhy/CodonModelCompare.bf @@ -0,0 +1,1046 @@ +/* + This file takes a nucleotide data set and a tree (either from the data file or from a separate file) and computes maximum likelihood estimates for every possible 4x4 reversible model on that data and tree. + + We use the string (v1,v2,v3,v4,v5,v6), where and v1..6 = 0..5 + to encode a 4x4 symmetric transition matrix with entries + [* v1 v2 v3] + [- * v4 v5] + [- - * v6] + [- - - * ] + + For instance: (010010) encodes HKY85. + + For each model the following information is reported: + - Model string. (e.g. (012345) for the GRM) + - Number of model parameters + - Max ln-likelihood for the model + - Likelihood ratio statistic (as a sub-model of the GRM) + - AIC + - P-Value for the Likelihood Ratio Test. + + + Sergei L. Kosakovsky Pond, Summer 2002. + +*/ + +function ReceiveJobs (sendOrNot) +{ + if (MPI_NODE_COUNT>1) + { + MPIReceive (-1, fromNode, result_String); + jobModelNum = MPINodeState[fromNode-1][1]; + vv1 = MPINodeState[fromNode-1][2]; + vv2 = MPINodeState[fromNode-1][3]; + vv3 = MPINodeState[fromNode-1][4]; + vv4 = MPINodeState[fromNode-1][5]; + vv5 = MPINodeState[fromNode-1][6]; + vv6 = MPINodeState[fromNode-1][7]; + if (sendOrNot) + { + MPISend (fromNode,lf); + MPINodeState[fromNode-1][1] = modelNum; + MPINodeState[fromNode-1][2] = v1; + MPINodeState[fromNode-1][3] = v2; + MPINodeState[fromNode-1][4] = v3; + MPINodeState[fromNode-1][5] = v4; + MPINodeState[fromNode-1][6] = v5; + MPINodeState[fromNode-1][7] = v6; + } + else + { + MPINodeState[fromNode-1][0] = 0; + MPINodeState[fromNode-1][1] = -1; + } + + ExecuteCommands (result_String); + } + else + { + jobModelNum = modelNum; + } + + if (jobModelNum == 0) + { + stdl = lf_MLES[1][0]; + fullnp = lf_MLES[1][1]+addOn; + fprintf(stdout,"\n(012345) Full Model ln-lik = ",stdl,". Parameter Count=",Format(fullnp,0,0)," AIC = ", 2*(fullnp-stdl),"\n\n"); + + + resultCache [0][0] = 1; + resultCache [0][1] = 2; + resultCache [0][2] = 3; + resultCache [0][3] = 4; + resultCache [0][4] = 5; + resultCache [0][5] = lf_MLES[1][0]; + resultCache [0][6] = lf_MLES[1][1]+addOn; + resultCache [0][7] = 0; + resultCache [0][8] = 0; + + fprintf (stdout,"\n# | Model | # prm | lnL | LRT | AIC | P-Value |"); + fprintf (stdout,"\n----|----------|-------|-----------|----------------|------------|------------------|"); + + if (MPI_NODE_COUNT>1) + { + for (h=1; h<203; h=h+1) + { + lnL = resultCache[h][5]; + + if (lnL<0) + { + np = resultCache[h][6]; + LRT = -2*(lnL-stdl); + if (LRT<0) + { + LRT = 0; + } + AIC = -2*lnL+2*np; + PRINT_DIGITS = 3; + fprintf (stdout,"\n",h); + PRINT_DIGITS = 1; + fprintf (stdout," | (",0, resultCache[h][0], resultCache[h][1], resultCache[h][2], resultCache[h][3], resultCache[h][4],") | "); + fprintf (stdout,Format (np,5,0)); + PRINT_DIGITS = 8; + fprintf (stdout, " | ",lnL," | ",Format(LRT,14,3), " | ", AIC, " | ", ); + + PRINT_DIGITS = 15; + if (LRT==0) + { + pValue = 1; + } + else + { + pValue = 1-CChi2(LRT,fullnp-np); + } + fprintf (stdout,pValue," |"); + resultCache [jobModelNum][7] = pValue; + if (pValue1)&&(resultCache[0][5]>=0)) + { + resultCache [jobModelNum][0] = vv2; + resultCache [jobModelNum][1] = vv3; + resultCache [jobModelNum][2] = vv4; + resultCache [jobModelNum][3] = vv5; + resultCache [jobModelNum][4] = vv6; + resultCache [jobModelNum][5] = lf_MLES[1][0]; + resultCache [jobModelNum][6] = lf_MLES[1][1]+addOn; + return fromNode - 1; + } + } + + np = lf_MLES[1][1]+addOn; + lnL = lf_MLES[1][0]; + LRT = -2*(lnL-stdl); + if (LRT<0) + { + LRT = 0; + } + AIC = -2*lnL+2*np; + PRINT_DIGITS = 3; + fprintf (stdout,"\n",jobModelNum); + PRINT_DIGITS = 1; + fprintf (stdout," | (",vv1,vv2,vv3,vv4,vv5,vv6,") | "); + fprintf (stdout,Format (np,5,0)); + PRINT_DIGITS = 8; + fprintf (stdout, " | ",lnL," | ",Format(LRT,14,3), " | ", AIC, " | ", ); + + + PRINT_DIGITS = 15; + if (LRT==0) + { + pValue = 1; + } + else + { + pValue = 1-CChi2(LRT,fullnp-np); + } + fprintf (stdout,pValue," |"); + + resultCache [jobModelNum][0] = vv2; + resultCache [jobModelNum][1] = vv3; + resultCache [jobModelNum][2] = vv4; + resultCache [jobModelNum][3] = vv5; + resultCache [jobModelNum][4] = vv6; + resultCache [jobModelNum][5] = lf_MLES[1][0]; + resultCache [jobModelNum][6] = lf_MLES[1][1]+addOn; + resultCache [jobModelNum][7] = pValue; + if (pValue 0) +{ + #include "TemplateModels/defineGamma.mdl"; +} + +ChoiceList (branchLengths,"Estimate Branch Lengths",1,SKIP_NONE, + "Every Time","Branch lengths are reestimated for every model.", + "Once","Branch lenghts obtained from the nucleotide GTR model and reused for subsequent models." + ); + +if (branchLengths<0) +{ + return; +} + +rejectAt = 0; + +while ((rejectAt<=0)||(rejectAt>=1)) +{ + fprintf (stdout, "\nModel rejection level (e.g. 0.05):"); + fscanf (stdin,"Number", rejectAt); +} + +SetDialogPrompt ("Save results to:"); + +fprintf (PROMPT_FOR_FILE, CLEAR_FILE); +BASE_PATH = LAST_FILE_PATH; + +KEEP_OPTIMAL_ORDER = 1; +MESSAGE_LOGGING = 0; + +global AC=1; +global AT=1; +global CG=1; +global CT=1; +global GT=1; +global R=1; + + +r = setElement (0,2,1); +r = setElement (0,3,2); +r = setElement (1,2,3); +r = setElement (1,3,4); +r = setElement (2,3,5); + +MG94custom = 0; +MULTIPLY_BY_FREQS = PopulateModelMatrix ("MG94custom", observedFreq); +vectorOfFrequencies = BuildCodonFrequencies (observedFreq); +Model MG94customModel = (MG94custom,vectorOfFrequencies,0); + +USE_POSITION_SPECIFIC_FREQS = 1; +Tree tr = treeString; + +addOn = 0; + +if (branchLengths) +{ + global TreeScaler = 1; + GTRMatrix = {{*,AC*nt,nt,AT*nt}{AC*nt,*,CG*nt,CT*nt}{nt,CG*nt,*,GT*nt}{AT*nt,CT*nt,GT*nt,*}}; + DataSetFilter nucFilter = CreateFilter (filteredData,1); + HarvestFrequencies (nucFreq,nucFilter,1,1,1); + Model GTRModel = (GTRMatrix,nucFreq,1); + givenTreeString = Format (tr,0,0); + Tree nucTree = givenTreeString; + LikelihoodFunction lfn = (nucFilter, nucTree); + Optimize (nres,lfn); + ReplicateConstraint ("this1.?.synRate:=this2.?.nt__/TreeScaler",tr,nucTree); + addOn = nres[1][1]-nres[1][2]-1; +} + +LikelihoodFunction lf = (filteredData, tr); + +resultCache = {203,9}; + +modelNum = 0; +rejectCount = 0; + +if (MPI_NODE_COUNT>1) +{ + MPINodeState = {MPI_NODE_COUNT-1,8}; + OPTIMIZE_SUMMATION_ORDER = 0; + MPISend (1,lf); + MPINodeState[0][0] = 1; + MPINodeState[0][1] = modelNum; +} +else +{ + Optimize (lf_MLES,lf); + vv1 = 0; + vv2 = 0; + vv3 = 0; + vv4 = 0; + vv5 = 0; + vv6 = 0; + dummy = ReceiveJobs (0); +} + +rateBiasTerms = {{"AC","1","AT","CG","CT","GT"}}; + +for (v2=0; v2<=1; v2=v2+1) +{ + for (v3=0; v3<=v2+1; v3=v3+1) + { + if (v3>v2) + { + ub4 = v3; + } + else + { + ub4 = v2; + } + for (v4=0; v4<=ub4+1; v4=v4+1) + { + if (v4>=ub4) + { + ub5 = v4; + } + else + { + ub5 = ub4; + } + for (v5=0; v5<=ub5+1; v5=v5+1) + { + if (v5>ub5) + { + ub6 = v5; + } + else + { + ub6 = ub5; + } + for (v6=0; v6<=ub6+1; v6=v6+1) + { + if (v6==5) + { + break; + } + + R = 1; + + paramCount = 0; + + modelDesc = "0"+Format(v2,1,0); + modelDesc = modelDesc+Format(v3,1,0); + modelDesc = modelDesc+Format(v4,1,0); + modelDesc = modelDesc+Format(v5,1,0); + modelDesc = modelDesc+Format(v6,1,0); + + modelConstraintString = ""; + + AC = 1; + AT = 1; + CG = 1; + CT = 1; + GT = 1; + + for (customLoopCounter2=1; customLoopCounter2<6; customLoopCounter2=customLoopCounter2+1) + { + for (customLoopCounter=0; customLoopCounter1) + { + for (mpiNode = 0; mpiNode < MPI_NODE_COUNT-1; mpiNode = mpiNode+1) + { + if (MPINodeState[mpiNode][0]==0) + { + break; + } + } + + if (mpiNode==MPI_NODE_COUNT-1) + /* all nodes busy */ + { + mpiNode = ReceiveJobs (1); + } + else + { + MPISend (mpiNode+1,lf); + MPINodeState[mpiNode][0] = 1; + MPINodeState[mpiNode][1] = modelNum; + MPINodeState[mpiNode][2] = v1; + MPINodeState[mpiNode][3] = v2; + MPINodeState[mpiNode][4] = v3; + MPINodeState[mpiNode][5] = v4; + MPINodeState[mpiNode][6] = v5; + MPINodeState[mpiNode][7] = v6; + } + } + else + { + Optimize (lf_MLES,lf); + vv1 = v1; + vv2 = v2; + vv3 = v3; + vv4 = v4; + vv5 = v5; + vv6 = v6; + dummy = ReceiveJobs (0); + } + } + } + } + } + +} + +if (MPI_NODE_COUNT>1) +{ + while (1) + { + for (nodeCounter = 0; nodeCounter < MPI_NODE_COUNT-1; nodeCounter = nodeCounter+1) + { + if (MPINodeState[nodeCounter][0]==1) + { + fromNode = ReceiveJobs (0); + break; + } + } + if (nodeCounter == MPI_NODE_COUNT-1) + { + break; + } + } + OPTIMIZE_SUMMATION_ORDER = 1; +} + +function checkEmbedding (_m1, _m2) +{ + for (r=0; r<6; r=r+1) + { + if (_m2[r]<_m1[r]) + { + /*fprintf (stdout,_m1," ", _m2, " Reject 1 at position ",r,"\n");*/ + return 0; + } + if (_m2[r]>_m1[r]) + { + for (r2 = 0; r2 < 6; r2 = r2+1) + { + if ((_m2[r2]==_m2[r])&&(_m1[r2]!=_m1[r])) + { + /*fprintf (stdout,_m1," ", _m2, " Reject 2 at positions ",r,r2,"\n");*/ + return 0; + } + } + } + } + return 1; +} + +PRINT_DIGITS = 0; + +fprintf (stdout, "\n\n--------------------------\n (*) => p-Value < ", rejectAt, "\nRejected ", rejectCount, " models.\n"); + + +if (rejectCount<202) +{ + + fprintf (stdout, "\nPerforming nested tests on the remaining models...\n"); + + done = 0; + while (!done) + { + done = 1; + for (v2=1; v2<203; v2=v2+1) + { + if (resultCache[v2][8]) + { + modelString = "0"; + for (v3 = 0; v3<5; v3=v3+1) + { + modelString = modelString + resultCache [v2][v3]; + } + for (v3 = v2+1; v3<203; v3 = v3+1) + { + if (resultCache[v3][8]) + { + modelString2 = "0"; + for (v4 = 0; v4<5; v4=v4+1) + { + modelString2 = modelString2 + resultCache [v3][v4]; + } + if (checkEmbedding (modelString, modelString2)) + { + fprintf (stdout,"H: (", modelString,") A: (", modelString2, "). "); + done = 0; + LRT = 2*(resultCache[v3][5]-resultCache[v2][5]); + npd = resultCache[v3][6]-resultCache[v2][6]; + if (LRT<0) + { + pValue = 1; + } + else + { + pValue = 1-CChi2(LRT,npd); + } + fprintf (stdout," P-Value=", Format (pValue,10,3)); + if (pValue1)", + "0 & 2 (Normal>1)", + "3 Normal", + "RE: Lognormal", + "RE: Gamma", + "RE: Discrete"}}; + +ParameterCount = {{0, + 1, + 3, + 4, + 2, + 4, + 2, + 4, + 5, + 5, + 5, + 5, + 6, + 1, + 2, + 4 + }}; + +MAXIMUM_ITERATIONS_PER_VARIABLE = 2000; +OPTIMIZATION_PRECISION = 0.001; + +function SetWDistribution (resp) +{ + if (rateType == 0) + { + global P = .5; + P:<1; + categFreqMatrix = {{P,1-P}}; + categRateMatrix = {{0,1}}; + category c = (2, categFreqMatrix , MEAN, ,categRateMatrix, 0, 1e25); + } + else + { + if (rateType == 1) + { + global P1 = 1/3; + global P2 = 0; + + P1:<1; + P2:<1; + + global W = 1; + categFreqMatrix = {{P1,(1-P1)*P2, (1-P1)*(1-P2)}} ; + categRateMatrix = {{0,1,W}}; + category c = (3, categFreqMatrix , MEAN, ,categRateMatrix, 0, 1e25); + } + else + { + if (rateType == 2) + { + global P1 = 1/3; + global P2 = .5; + P1:<1; + P2:<1; + global W1 = .25; + global R1 = 4; + global R2 = 3; + R1:>1; + R2:>1; + categFreqMatrix = {{P1,(1-P1)*P2, (1-P1)*(1-P2)}} ; + categRateMatrix = {{W1,W1*R1,W1*R1*R2}}; + category c = (3, categFreqMatrix , MEAN, ,categRateMatrix, 0, 1e25); + } + else + { + if (rateType == 3) + { + global P1 = 1/5; + global P2 = 1/4; + global P3 = 1/3; + global P4 = 1/2; + + P1:<1; + P2:<1; + P3:<1; + P4:<1; + + categFreqMatrix = {{P1, + (1-P1)P2, + (1-P1)(1-P2)*P3, + (1-P1)(1-P2)(1-P3)P4, + (1-P1)(1-P2)(1-P3)(1-P4)}} ; + categRateMatrix = {{0,1/3,2/3,1,3}}; + category c = (5, categFreqMatrix , MEAN, ,categRateMatrix, 0, 1e25); + } + else + { + if (rateType == 4) + { + global alpha = .5; + global beta = 1; + alpha:>0.01;alpha:<100; + beta:>0.01; + beta:<200; + category c = (resp, EQUAL, MEAN, GammaDist(_x_,alpha,beta), CGammaDist(_x_,alpha,beta), 0 , + 1e25,CGammaDist(_x_,alpha+1,beta)*alpha/beta); + } + else + { + if (rateType == 5) + { + global alpha = .5; + global beta = 1; + global alpha2= .75; + global P = .5; + alpha:>0.01;alpha:<100; + beta:>0.01; + beta:<200; + P:<1; + alpha2:>0.01;alpha2:<100; + category c = (resp, EQUAL, MEAN, P*GammaDist(_x_,alpha,beta) + (1-P)*GammaDist(_x_,alpha2,alpha2) + , P*CGammaDist(_x_,alpha,beta) + (1-P)*CGammaDist(_x_,alpha2,alpha2), + 0 , 1e25, + P*CGammaDist(_x_,alpha+1,beta)*alpha/beta + (1-P)*CGammaDist(_x_,alpha2+1,alpha2)); + } + else + { + if (rateType == 6) + { + global betaP = 1; + global betaQ = 1; + betaP:>0.05;betaP:<85; + betaQ:>0.05;betaQ:<85; + category c = (resp, EQUAL, MEAN, _x_^(betaP-1)*(1-_x_)^(betaQ-1)/Beta(betaP,betaQ), IBeta(_x_,betaP,betaQ), 0 , + 1,IBeta(_x_,betaP+1,betaQ)*betaP/(betaP+betaQ)); + } + else + { + if (rateType == 7) + { + global W = 2; + /*W:>1;*/ + global P = 1-1/(resp+1); + global betaP = 1; + global betaQ = 2; + betaP:>0.05; + betaQ:>0.05; + betaP:<85; + betaQ:<85; + P:>0.0000001; + P:<0.9999999; + categFreqMatrix = {resp+1,1}; + for (k=0; k=W), + 0,1e25, + P*IBeta(Min(_x_,1),betaP+1,betaQ)*betaP/(betaP+betaQ)+(1-P)*W*(_x_>=W)); + } + else + { + if (rateType == 8) + { + global P = .5; + global betaP = 1; + global betaQ = 2; + betaP:>0.05;betaP:<85; + betaQ:>0.05;betaQ:<85; + global alpha = .5; + global beta = 1; + alpha:>0.01;alpha:<100; + beta:>0.01; + beta:<200; + P:<1; + category c = (resp, EQUAL, MEAN, + P*_x_^(betaP-1)*(1-Min(_x_,1))^(betaQ-1)/Beta(betaP,betaQ)+(1-P)*GammaDist(_x_,alpha,beta), + P*IBeta(Min(_x_,1),betaP,betaQ)+(1-P)*CGammaDist(_x_,alpha,beta), + 0,1e25, + P*betaP/(betaP+betaQ)*IBeta(Min(_x_,1),betaP+1,betaQ)+(1-P)*alpha/beta*CGammaDist(_x_,alpha+1,beta)); + } + else + { + if (rateType == 9) + { + global P = .5; + P:<1; + global betaP = 1; + betaP:>0.05;betaP:<85; + global betaQ = 2; + betaQ:>0.05;betaQ:<85; + global alpha = .5; + alpha:>0.01;alpha:<100; + global beta = 1; + beta:>0.01;beta:<500; + category c = (resp, EQUAL, MEAN, + P*_x_^(betaP-1)*(1-Min(_x_,1))^(betaQ-1)/Beta(betaP,betaQ)+(1-P)*(_x_>1)*GammaDist(Max(1e-20,_x_-1),alpha,beta), + P*IBeta(Min(_x_,1),betaP,betaQ)+(1-P)*CGammaDist(Max(_x_-1,0),alpha,beta), + 0,1e25, + P*betaP/(betaP+betaQ)*IBeta(Min(_x_,1),betaP+1,betaQ)+ + (1-P)*(alpha/beta*CGammaDist(Max(0,_x_-1),alpha+1,beta)+CGammaDist(Max(0,_x_-1),alpha,beta))); + } + else + { + if (rateType == 10) + { + global P = .5; + global betaP = 1; + global betaQ = 2; + betaP:>0.05; + betaQ:>0.05; + betaP:<85; + betaQ:<85; + global mu = 3; + global sigma = .01; + sigma:>0.0001; + sqrt2pi = Sqrt(8*Arctan(1)); + P:<1; + + category c = (resp, EQUAL, MEAN, + P*_x_^(betaP-1)*(1-Min(_x_,1))^(betaQ-1)/Beta(betaP,betaQ)+ + (1-P)*(_x_>=1)*Exp(-(_x_-mu)(_x_-mu)/(2*sigma*sigma))/(sqrt2pi__*sigma)/ZCDF((mu-1)/sigma), + P*IBeta(Min(_x_,1),betaP,betaQ)+(1-P)*(_x_>=1)*(1-ZCDF((mu-_x_)/sigma)/ZCDF((mu-1)/sigma)), + 0,1e25, + P*betaP/(betaP+betaQ)*IBeta(Min(_x_,1),betaP+1,betaQ)+ + (1-P)*(_x_>=1)*(mu*(1-ZCDF((1-mu)/sigma)-ZCDF((mu-_x_)/sigma))+ + sigma*(Exp((mu-1)(1-mu)/(2*sigma*sigma))-Exp((_x_-mu)(mu-_x_)/(2*sigma*sigma)))/sqrt2pi__)/ZCDF((mu-1)/sigma)); + } + else + { + if (rateType == 11) + { + global P = 1/3; + global P1 = .5; + + global mu = 3; + global sigma = .5; + sigma:>0.0001; + global sigma1 = 1; + sigma1:>0.0001; + + sqrt2pi = Sqrt(8*Arctan(1)); + P:<1; + P1:<1; + + categFreqMatrix = {resp+1,1}; + for (k=1; k<=resp; k=k+1) + { + categFreqMatrix[k]:=(1-P)/resp__; + } + categFreqMatrix[0]:=P; + + category c = (resp+1, categFreqMatrix, MEAN, + (1-P)((1-P1)*Exp(-(_x_-mu)(_x_-mu)/(2*sigma1*sigma1))/(sqrt2pi__*sigma1)/ZCDF(mu/sigma1)+ + P1*Exp(-(_x_-1)(_x_-1)/(2*sigma*sigma))/(sqrt2pi__*sigma)/ZCDF(1/sigma)), + P+(1-P)(_x_>1e-20)((1-P1)(1-ZCDF((mu-_x_)/sigma1)/ZCDF(mu/sigma1))+ + P1*(1-ZCDF((1-_x_)/sigma)/ZCDF(1/sigma))), + 0,1e25, + (1-P)((1-P1)(mu*(1-ZCDF(-mu/sigma1)-ZCDF((mu-_x_)/sigma1))+ + sigma1*(Exp(-mu*mu/(2*sigma1*sigma1))-Exp((_x_-mu)(mu-_x_)/(2*sigma1*sigma1)))/sqrt2pi__)/ZCDF(mu/sigma1)+ + P(1-ZCDF(-1/sigma)-ZCDF((1-_x_)/sigma)+ + sigma*(Exp(-1/(2*sigma*sigma))-Exp((_x_-1)(1-_x_)/(2*sigma*sigma)))/sqrt2pi__)/ZCDF(1/sigma)) + ); + } + else + { + if (rateType == 12) + { + global P = 1/3; + global P1 = .5; + + global mu = 3; + global sigma = .25; + global sigma1 = .5; + global sigma2 = 1; + sigma:>0.0001; + sigma1:>0.0001; + sigma2:>0.0001; + + sqrt2pi = Sqrt(8*Arctan(1)); + P:<1; + P1:<1; + + category c = (resp, EQUAL , MEAN, + 2*P*Exp(-_x_^2/(2*sigma*sigma))+ + (1-P)((1-P1)*Exp((_x_-mu)(mu-_x_)/(2*sigma2*sigma2))/(sqrt2pi__*sigma2)/ZCDF(mu/sigma2)+ + P1*Exp((1-_x_)(_x_-1)/(2*sigma1*sigma1))/(sqrt2pi__*sigma1)/ZCDF(1/sigma1)), + P*(1-2*ZCDF(-_x_/sigma))+ + (1-P)((1-P1)(1-ZCDF((mu-_x_)/sigma2)/ZCDF(mu/sigma2))+ + P1*(1-ZCDF((1-_x_)/sigma1)/ZCDF(1/sigma1))), + 0,1e25, + 2*P*sigma*(1-Exp(-_x_*_x_/(2*sigma*sigma)))/sqrt2pi__+ + (1-P)((1-P1)(mu*(1-ZCDF(-mu/sigma2)-ZCDF((mu-_x_)/sigma2))+ + sigma2*(Exp(-mu*mu/(2*sigma2*sigma2))-Exp((_x_-mu)(mu-_x_)/(2*sigma2*sigma2)))/sqrt2pi__)/ZCDF(mu/sigma2)+ + P1(1-ZCDF(-1/sigma1)-ZCDF((1-_x_)/sigma1)+ + sigma1*(Exp(-1/(2*sigma1*sigma1))-Exp((_x_-1)(1-_x_)/(2*sigma1*sigma1)))/sqrt2pi__)/ZCDF(mu/sigma1)) + ); + } + else + { + if (rateType == 13) + { + global sigma = .1; + sigma:>0.0001;sigma:<10; + sqrt2pi = Sqrt(8*Arctan(1)); + global _x_:<1e200; + category c = (resp, EQUAL, MEAN, + Exp (-Log(_x_)*Log(_x_) / (2*sigma*sigma)) / (_x_*sigma*sqrt2pi__), /*density*/ + ZCDF (Log(_x_)/sigma), /*CDF*/ + 1e-200, /*left bound*/ + 1e200, /*right bound*/ + Exp (.5*sigma^2)*ZCDF (Log(_x_)/sigma-sigma), + CONSTANT_ON_PARTITION + ); + } + else + { + if (rateType == 14) + { + global alpha = .5; + global beta = 1; + alpha:>0.01;alpha:<100; + beta:>0.01; + beta:<200; + category c = (resp, EQUAL, MEAN, GammaDist(_x_,alpha,beta), CGammaDist(_x_,alpha,beta), 0 , + 1e25,CGammaDist(_x_,alpha+1,beta)*alpha/beta,CONSTANT_ON_PARTITION); + } + else + { + global P1 = 1/3; + global P2 = .5; + P1:<1; + P2:<1; + global W1 = .25; + global R1 = 4; + global R2 = 3; + R1:>1; + R2:>1; + categFreqMatrix = {{P1,(1-P1)*P2, (1-P1)*(1-P2)}} ; + categRateMatrix = {{W1,W1*R1,W1*R1*R2}}; + category c = (3, categFreqMatrix , MEAN, ,categRateMatrix, 0, 1e25, ,CONSTANT_ON_PARTITION); + } + } + } + } + } + } + } + } + } + } + } + } + } + } + } + return 0; +} + +/* ____________________________________________________________________________________________________________________*/ + +function FrameText (frameChar,vertChar,parOff,theText) +{ + h = Abs (theText)+4; + fprintf (stdout,"\n"); + for (k=0; k1) break; + } + if (k 1 */ + { + ConstructCategoryMatrix(marginals,lf,COMPLETE); + + CC = Columns (marginals); + if (rateType>=13) + /* subset rate variation */ + { + CC = CC/numberOfSubsets; + + subsetMarginals = {D,numberOfSubsets}; + for (v=0; v1 (Posterior cutoff = ",sigLevel,")\n\n"); + for (v=0; v=sigLevel) + { + fprintf (LAST_FILE_PATH,Format (v+1,0,0)," (",positiveProb,")\n"); + } + } + fprintf (LAST_FILE_PATH,"\n\n------------------------------------------------\n\n Subsets with dN/dS<=1 (Posterior cutoff = ",sigLevel,")\n\n"); + for (v=0; v1 (Posterior cutoff = ",sigLevel,")\n\n"); + for (v=0; v=sigLevel) + { + fprintf (LAST_FILE_PATH,Format (v+1,0,0)," (",positiveProb,")\n"); + } + } + fprintf (LAST_FILE_PATH,"\n\n------------------------------------------------\n\n Sites with dN/dS<=1 (Posterior cutoff = ",sigLevel,")\n\n"); + for (v=0; v1."); + } + fprintf (LAST_FILE_PATH,"\n\n------------------------------------------------\n\n"); + return E; +} + +/* ____________________________________________________________________________________________________________________*/ + +function PopulateModelMatrix (ModelMatrixName&, EFV) +{ + ModelMatrixName = {ModelMatrixDimension,ModelMatrixDimension}; + + hshift = 0; + + if (modelType==0) + { + for (h=0; h<64; h=h+1) + { + if (_Genetic_Code[h]==10) + { + hshift = hshift+1; + continue; + } + vshift = hshift; + for (v = h+1; v<64; v=v+1) + { + diff = v-h; + if (_Genetic_Code[v]==10) + { + vshift = vshift+1; + continue; + } + if ((h$4==v$4)||((diff%4==0)&&(h$16==v$16))||(diff%16==0)) + { + if (h$4==v$4) + { + transition = v%4; + transition2= h%4; + } + else + { + if(diff%16==0) + { + transition = v$16; + transition2= h$16; + } + else + { + transition = v%16$4; + transition2= h%16$4; + } + } + if (_Genetic_Code[0][h]==_Genetic_Code[0][v]) + { + ModelMatrixName[h-hshift][v-vshift] := t*EFV__[transition__]; + ModelMatrixName[v-vshift][h-hshift] := t*EFV__[transition2__]; + } + else + { + ModelMatrixName[h-hshift][v-vshift] := c*t*EFV__[transition__]; + ModelMatrixName[v-vshift][h-hshift] := c*t*EFV__[transition2__]; + } + } + } + } + } + else + { + if (modelType==1) + { + for (h=0; h<64; h=h+1) + { + if (_Genetic_Code[h]==10) + { + hshift = hshift+1; + continue; + } + vshift = hshift; + for (v = h+1; v<64; v=v+1) + { + diff = v-h; + if (_Genetic_Code[v]==10) + { + vshift = vshift+1; + continue; + } + nucPosInCodon = 2; + if ((h$4==v$4)||((diff%4==0)&&(h$16==v$16))||(diff%16==0)) + { + if (h$4==v$4) + { + transition = v%4; + transition2= h%4; + } + else + { + if(diff%16==0) + { + transition = v$16; + transition2= h$16; + nucPosInCodon = 0; + } + else + { + transition = v%16$4; + transition2= h%16$4; + nucPosInCodon = 1; + } + } + if (_Genetic_Code[0][h]==_Genetic_Code[0][v]) + { + ModelMatrixName[h-hshift][v-vshift] := t*EFV__[transition__][nucPosInCodon__]; + ModelMatrixName[v-vshift][h-hshift] := t*EFV__[transition2__][nucPosInCodon__]; + } + else + { + ModelMatrixName[h-hshift][v-vshift] := c*t*EFV__[transition__][nucPosInCodon__]; + ModelMatrixName[v-vshift][h-hshift] := c*t*EFV__[transition2__][nucPosInCodon__]; + } + } + } + } + } + else + { + for (h=0; h<64; h=h+1) + { + if (_Genetic_Code[h]==10) + { + hshift = hshift+1; + continue; + } + vshift = hshift; + for (v = h+1; v<64; v=v+1) + { + diff = v-h; + if (_Genetic_Code[v]==10) + { + vshift = vshift+1; + continue; + } + if ((h$4==v$4)||((diff%4==0)&&(h$16==v$16))||(diff%16==0)) + { + if (h$4==v$4) + { + transition = v%4; + transition2= h%4; + } + else + { + if(diff%16==0) + { + transition = v$16; + transition2= h$16; + } + else + { + transition = v%16$4; + transition2= h%16$4; + } + } + if (_Genetic_Code[0][h]==_Genetic_Code[0][v]) + { + if (Abs(transition-transition2)%2) + { + ModelMatrixName[h-hshift][v-vshift] := kappa*t; + ModelMatrixName[v-vshift][h-hshift] := kappa*t; + } + else + { + ModelMatrixName[h-hshift][v-vshift] := t; + ModelMatrixName[v-vshift][h-hshift] := t; + } + + } + else + { + if (Abs(transition-transition2)%2) + { + ModelMatrixName[h-hshift][v-vshift] := kappa*c*t; + ModelMatrixName[v-vshift][h-hshift] := kappa*c*t; + } + else + { + ModelMatrixName[h-hshift][v-vshift] := c*t; + ModelMatrixName[v-vshift][h-hshift] := c*t; + } + } + } + } + } + } + } + return (modelType>1); +} + +/* ____________________________________________________________________________________________________________________*/ + +function PopulateModelMatrix2 (ModelMatrixName&, EFV) +{ + ModelMatrixName = {ModelMatrixDimension,ModelMatrixDimension}; + + hshift = 0; + + for (h=0; h<64; h=h+1) + { + if (_Genetic_Code[h]==10) + { + hshift = hshift+1; + continue; + } + vshift = hshift; + for (v = h+1; v<64; v=v+1) + { + diff = v-h; + if (_Genetic_Code[v]==10) + { + vshift = vshift+1; + continue; + } + if ((h$4==v$4)||((diff%4==0)&&(h$16==v$16))||(diff%16==0)) + { + if (h$4==v$4) + { + transition = v%4; + transition2= h%4; + } + else + { + if(diff%16==0) + { + transition = v$16; + transition2= h$16; + } + else + { + transition = v%16$4; + transition2= h%16$4; + } + } + if (_Genetic_Code[0][h]==_Genetic_Code[0][v]) + { + if (Abs(transition-transition2)%2) + { + ExecuteCommands ("ModelMatrixName[h-hshift][v-vshift] := kappa"+l+"*t;ModelMatrixName[v-vshift][h-hshift] := kappa"+l+"*t;"); + } + else + { + ModelMatrixName[h-hshift][v-vshift] := t; + ModelMatrixName[v-vshift][h-hshift] := t; + } + + } + else + { + if (Abs(transition-transition2)%2) + { + ExecuteCommands ("ModelMatrixName[h-hshift][v-vshift] := c*kappa"+l+"*t;ModelMatrixName[v-vshift][h-hshift] := c*kappa"+l+"*t;"); + } + else + { + ModelMatrixName[h-hshift][v-vshift] := c*t; + ModelMatrixName[v-vshift][h-hshift] := c*t; + } + } + } + } + } + return 1; +} +/* ____________________________________________________________________________________________________________________*/ + +function spawnLikelihood (kappaSharedOrNot) +{ + if (kappaSharedOrNot) + { + for (l=0; l=lowerSeqBound)&&(speciesIndex1)","Beta & (Normal>1)", + "0 & 2 (Normal>1)","0 & 2 (Normal>1)", + "3 Normal","3 Normal", + "RE:Log normal","Random Effects: Log normal", + "RE:Gamma","Random Effects: Gamma", + "RE:Discrete","Random Effects: 3 bin Discrete"); + + if (modelTypes[0]<0) + { + return; + } + for (rateType = 0; rateType < Rows(modelTypes)*Columns(modelTypes); rateType = rateType + 1) + { + modelType = modelTypes[rateType]; + chosenModelList[modelType] = 1; + } +} + +ChoiceList (shareType,"Choose parameter sharing mode",1,SKIP_NONE, + "All","Share dN/dS, transversion/transition ratio (if applicable) and base frequencies for all subsets.", + "dN/dS Only","Share only dN/dS. Transversion/transition ratio (if applicable) and base frequencies are separate for each subset." +); + +if (shareType<0) +{ + return; +} + +ChoiceList (modelType,"Choose a model",1,SKIP_NONE, + "MG94 1x4","Muse-Gaut 94 model with 4(-1) nucleotide frequency parameters (intra-codon position independent).", + "MG94 3x4","Muse-Gaut 94 model with 12(-3) nucleotide frequency parameters (intra-codon position specific).", + "GY94 1x4","Goldman-Yang 94 model with 4(-1) nucleotide frequency parameters (intra-codon position independent).", + "GY94 3x4","Goldman-Yang 94 model with 12(-3) nucleotide frequency parameters (intra-codon position specific)." +); + +if (modelType<0) +{ + return; +} + +if ((modelType==0)||(modelType==2)) +{ + if (shareType==0) + { + HarvestFrequencies (observedFreq,filteredData,1,1,0); + vectorOfFrequencies = BuildCodonFrequencies4 (observedFreq); + } + else + { + for (v=0; v1) +{ + global kappa = 2.; +} + +fprintf (stdout, "\n\n\nChoose the cutoff (0 to 1) for posterior of dN/dS>1 for a site to be considered under selective pressure:"); +fscanf (stdin, "Number",psigLevel); +if ((psigLevel <= 0)||(psigLevel>1)) +{ + psigLevel = .95; +} +fprintf (stdout, "\n>Using ", psigLevel , " cutoff\n"); + +fprintf (stdout, "\nChoose the number of categories in discretized distributions:"); +fscanf (stdin, "Number",categCount); +categCount = categCount$1; +if (categCount<=0) +{ + categCount = 8; +} + +fprintf (stdout, "\n>Using ", Format (categCount,0,0), " categories.\n"); + +SetDialogPrompt ("Write detailed results to:"); + +fprintf (PROMPT_FOR_FILE,CLEAR_FILE); + +global c = 1.; + +dummyVar = FrameText ("-","|",2,"SUMMARY TABLE"); +tableSeparator = "+-------------------------+----------------+---------------+-----+\n"; +fprintf (stdout, "\n\"p\" is the number of parameters in addition to the branch lengths.\nDetailed results including sites with dN/dS>1 will be written to\n",LAST_FILE_PATH,"\n\n"); +fprintf (stdout, tableSeparator, + "| MODEL (Number & Desc) | Log likelihood | dN/dS | p |\n", + tableSeparator); + +cachedBranchLengths = {{-1,-1}}; + +if (chosenModelList[0]>0) +{ + timer = Time(1); + fprintf (LAST_FILE_PATH,"\n*** RUNNING SINGLE RATE MODEL ***\n#################################\n"); + dummy = spawnLikelihood (shareType); + Optimize (res,lf); + fprintf (LAST_FILE_PATH,"\n>Done in ", Time(1)-timer, " seconds \n\n"); + fprintf (LAST_FILE_PATH,lf,"\n\n-----------------------------------\n\ndN/dS = ",c,"\n\n"); + + fprintf (stdout, "| 0. Single Rate Model | ",Format (res[1][0],14,6)," | ",Format (c,13,8)," | 0 |\n", + tableSeparator); + + timer = res[1][1]-res[1][2]; + cachedBranchLengths = {timer,1}; + + for (rateType = timer; rateType < Columns(cachedBranchLengths); rateType = rateType+1) + { + cachedBranchLengths[rateType-timer][0] = res [0][rateType]; + } +} + +for (rateType = 0; rateType < 16; rateType = rateType + 1) +{ + if (chosenModelList[rateType+1]==0) + { + continue; + } + timer = Time(1); + dummy = SetWDistribution (categCount); + dummy = spawnLikelihood (shareType); + + fprintf (LAST_FILE_PATH,"\n*** RUNNING MODEL ", Format(rateType+1,0,0), " (",ModelNames[rateType],") ***\n######################################\n"); + /*if (cachedBranchLengths[0][0]>=0.0) + { + v = ParameterCount[rateType]; + if (modelType>1) + { + v=v+1; + } + for (h=0; hDone in ",Time(1)-timer, " seconds \n\n", lf); + fprintf (stdout, "| "); + if (rateType<9) + { + fprintf (stdout," "); + } + fprintf (stdout, Format (rateType+1,0,0), ". ", ModelNames[rateType]); + for (dummy = Abs(ModelNames[rateType])+5; dummy<25; dummy = dummy+1) + { + fprintf (stdout," "); + } + dummy = GetDistributionParameters(psigLevel); + fprintf (stdout,"| ",Format (res[1][0],14,6)," | ",Format (dummy,13,8)," | ", + Format(ParameterCount[rateType],0,0)," |\n",tableSeparator); + + if (modelType>1) + { + kappa = 2.; + } +} diff --git a/samples/HyPhy/MolecularClock.bf b/samples/HyPhy/MolecularClock.bf new file mode 100644 index 00000000..92606d2d --- /dev/null +++ b/samples/HyPhy/MolecularClock.bf @@ -0,0 +1,147 @@ +#include "molclockBootstrap.bf"; + +RESTORE_GLOBALS = 1; +_DO_TREE_REBALANCE_ = 0; +VERBOSITY_LEVEL = -1; + +function RestoreGlobalValues (lfIndex) +{ + if (lfIndex==0) + { + for (i=0;i1) +{ + ChoiceList (parameter2Constrain, "Parameter(s) to constrain:",1,SKIP_NONE,LAST_MODEL_PARAMETER_LIST); + + if (parameter2Constrain<0) + { + return; + } + if (parameter2Constrain==0) + { + parameter2ConstrainString = ""; + for (parameter2Constrain=Rows("LAST_MODEL_PARAMETER_LIST")-1; parameter2Constrain; parameter2Constrain = parameter2Constrain-1) + { + GetString (funnyString,LAST_MODEL_PARAMETER_LIST,parameter2Constrain); + parameter2ConstrainString = parameter2ConstrainString + funnyString + ","; + } + GetString (funnyString,LAST_MODEL_PARAMETER_LIST,0); + parameter2ConstrainString = parameter2ConstrainString + funnyString; + } + else + { + GetString (parameter2ConstrainString,LAST_MODEL_PARAMETER_LIST,parameter2Constrain-1); + } +} +else +{ + GetString (parameter2ConstrainString,LAST_MODEL_PARAMETER_LIST,0); +} + +timer = Time(0); + +LikelihoodFunction lf = (filteredData,givenTree); + +Optimize (res,lf); + +separator = "*-----------------------------------------------------------*"; + +fprintf (stdout, "\n", separator, "\nRESULTS WITHOUT THE CLOCK:\n",lf); + +fullModelLik = res[1][0]; + +fullVars = res[1][1]; + +/* now specify the constraint */ + +Tree clockTree = treeString; + +ExecuteCommands ("MolecularClock (clockTree,"+parameter2ConstrainString+");"); + +LikelihoodFunction lfConstrained = (filteredData, clockTree); + +USE_LAST_RESULTS = 1; +Optimize (res1,lfConstrained); +USE_LAST_RESULTS = 0; + +SAVE_GLOBALS = res1[1][2]; + +if (SAVE_GLOBALS) +{ + globalSpoolMatrix = {1,SAVE_GLOBALS}; + + for (i=0;idS in the data sets? + - Do the sites under positive selection share the same dN/dS? + - Two previous tests combined +*/ + +RequireVersion ("0.9920061001"); + +/*---------------------------------------------------------------------------------------------------------------------------------------------------*/ + +function BuildCodonFrequencies (obsF) +{ + PIStop = 1.0; + result = {ModelMatrixDimension,1}; + hshift = 0; + + for (h=0; h<64; h=h+1) + { + first = h$16; + second = h%16$4; + third = h%4; + if (_Genetic_Code[h]==10) + { + hshift = hshift+1; + PIStop = PIStop-obsF[first][0]*obsF[second][1]*obsF[third][2]; + continue; + } + result[h-hshift][0]=obsF[first][0]*obsF[second][1]*obsF[third][2]; + } + return result*(1.0/PIStop); +} + +/*---------------------------------------------------------------------------------------------------------------------------------------------------*/ + +function ReportDistributionString (rc,freqStrMx,infix, skip0) +{ + distroString = ""; + distroString * 1024; + + reportMx = {rc,4}; + + distroString * (" dN/dS dS dN Prob\n"); + for (mi=0; mi 0) + { + distroString * (Format(reportMx[mi][2],8,3)+Format(reportMx[mi][0],8,3)+Format(reportMx[mi][1],8,3)+Format(reportMx[mi][3],10,3)+"\n"); + } + } + + distroString * 0; + return distroString; +} + +/*---------------------------------------------------------------------------------------------------------------------------------------------------*/ + +function DefineNucleotideBiases (dummy) +{ + ModelTitle = "MG94x"+modelDesc[0]; + modelConstraintString = ""; + + for (customLoopCounter2=0; customLoopCounter2<6; customLoopCounter2=customLoopCounter2+1) + { + if (rateBiasTerms[customLoopCounter2] != "1") + { + ExecuteCommands ("global " + rateBiasTerms[customLoopCounter2] + "=1;"); + } + } + + for (customLoopCounter2=1; customLoopCounter2<6; customLoopCounter2=customLoopCounter2+1) + { + for (customLoopCounter=0; customLoopCounter0.0000001;global NS_"+infix+"0 = 0.1;"); + + for (mi=1; mi0.0000001;\nglobal NS_"+infix+mi+";\n"); + if (randomizeInitValues) + { + categDef1*("global P_"+infix+mi+" = Random(0.05,0.95);\nP_"+infix+mi+":<1;\n"); + categDef1*("global S_"+infix+mi+" = Random(0.05,1);"); + } + else + { + categDef1*("global P_"+infix+mi+" = 1/"+(resp+1-mi)+";\nP_"+infix+mi+":<1;\n"); + } + } + + freqStrMx = {resp,1}; + if (resp>1) + { + freqStrMx[0] = "P_"+infix+"1"; + + for (mi=1; mi1;R_"+infix+mi+"="+Random(1.05,10)+";"); + } + } + else + { + for (mi=respN+respP; mi1;R_"+infix+mi+"=2+"+mi+";"); + } + } + } + else + { + if (randomizeInitValues) + { + for (mi=0; mi 1) + { + lfParts = lfParts + ",filteredData_" + fileID + "," + treeID; + } + } +} + +lfParts * 0; +ExecuteCommands (lfParts + "," + lfDef1 + ");"); + +sumPath = resToPath + ".summary"; + +treeBranchParameters = 0; +for (fileID = 1; fileID <= fileCount; fileID = fileID + 1) +{ + ExecuteCommands ("treeBranchParameters = treeBranchParameters + BranchCount(tree_" + fileID + "_0) + TipCount(tree_" + fileID + "_0);"); +} + +/*-------------- INDEPENDENT DISTRIBUTIONS ------------------*/ + +sop = Max(OPTIMIZATION_PRECISION,0.001); + +fprintf (stdout, "Running simpler distribution approximations to ensure good convergence...\n"); + +OPTIMIZATION_PRECISION = 0.1; + +P_1_1 := 0;P_1_2 := 0;P_1_3 := 0; +P_2_1 := 0;P_2_2 := 0;P_2_3 := 0; + +S_1_0 := 1;S_1_1 := 1;S_1_2 := 1;S_1_3 := 1; +S_2_0 := 1;S_2_1 := 1;S_2_2 := 1;S_2_3 := 1; +R_1_0 := 1;R_1_1 := 1;R_1_2 := 1; +R_2_0 := 1;R_2_1 := 1;R_2_2 := 1; + +Optimize (res,lf); +USE_LAST_RESULTS = 1; +SKIP_CONJUGATE_GRADIENT = 1; + +d1 = ReportDistributionString(4,freqStrMx_1,"1_",1); +d2 = ReportDistributionString(4,freqStrMx_2,"2_",1); + +fprintf (stdout, "\n*** Done with pass 1. Log(L) = ", Format(res[1][0],10,3), " *** \n"); +fprintf (stdout, "Approximate rates for data set 1:\n", d1); +fprintf (stdout, "Approximate rates for data set 2:\n", d2); + +codonFactor_1 := codonFactor_1__; +codonFactor_2 := codonFactor_2__; + +AC_1 := AC_1__; AT_1 := AT_1__; CG_1 := CG_1__; CT_1 := CT_1__; GT_1 := GT_1__; +AC_2 := AC_2__; AT_2 := AT_2__; CG_2 := CG_2__; CT_2 := CT_2__; GT_2 := GT_2__; + + + +fprintf (stdout, "\nGateaux sampling positively selected directions\n"); + +P_1_1 = 2/filteredData_1.sites; +S_1_3 = 1; + +baseLineLL = res[1][0]; +LFCompute (lf,LF_START_COMPUTE); +bestDiff = -1e100; +bestAlpha = 1; +bestBeta = 1; +bestLL = -1e100; + +step = 0.1; +step2 = 0.25; +v1 = 0.05; + +for (v1c = 0; v1c < 10; v1c = v1c + 1) +{ + v2 = v1+step; + for (v2c = 0; v2c < 20; v2c = v2c+1) + { + checkASample ("1_0"); + v2 = v2 + step2; + } + v1 = v1+step; +} + +S_1_0 = bestAlpha; +R_1_0 = bestBeta; +bestAlpha = 1; +bestBeta = 1; + +if (bestDiff <= 0) +{ + P_1_1 = 0; +} +saveBD = bestDiff; + +P_2_1 = 1/filteredData_2.sites; +S_2_3 = 1; + +v1 = 0.05; +for (v1c = 0; v1c < 10; v1c = v1c + 1) +{ + v2 = v1+step; + for (v2c = 0; v2c < 20; v2c = v2c+1) + { + checkASample ("2_0"); + v2 = v2 + step2; + } + v1 = v1+step; +} + +LFCompute (lf,LF_DONE_COMPUTE); + +S_2_0 = bestAlpha; +R_2_0 = bestBeta; + +if (bestDiff <= saveBD) +{ + P_2_1 = 0; +} + +if (bestDiff > 0) +{ + fprintf (stdout, "\nFound a likelihood improvement in the direction (", S_1_0, ",", S_1_0*R_1_0, "), (", S_2_0, ",", S_2_0*R_2_0, "), ",bestDiff," likelihood points\n"); +} + + +Optimize (res,lf); + +d1 = ReportDistributionString(4,freqStrMx_1,"1_",1); +d2 = ReportDistributionString(4,freqStrMx_2,"2_",1); + +fprintf (stdout, "\n*** Done with pass 2. Log(L) = ", Format(res[1][0],10,3), " *** \n"); +fprintf (stdout, "Approximate rates for data set 1:\n", d1); +fprintf (stdout, "Approximate rates for data set 2:\n", d2); + + +fprintf (stdout, "\nGateaux sampling neutral directions\n"); + +baseLineLL = res[1][0]; +LFCompute (lf,LF_START_COMPUTE); + +P_1_2 = 1/(filteredData_1.sites*(1-P_1_1)); +bestDiff = -1e100; +bestAlpha = 1; +step = 0.02; +v1 = 0; +for (v1c = 0; v1c < 50; v1c = v1c + 1) +{ + v1 = v1 + step; + v2 = v1; + + checkASample ("1_1"); +} + + +S_1_1 = bestAlpha; +bestAlpha = 1; + +if (bestDiff <= 0) +{ + P_1_2 = 0; +} +saveBD = bestDiff; +R_1_1 := 1; + +P_2_2 = 1/(filteredData_2.sites*(1-P_2_1)); + +v1 = 0; +for (v1c = 0; v1c < 50; v1c = v1c + 1) +{ + v1 = v1 + step; + v2 = v1; + + checkASample ("2_1"); +} + + +LFCompute (lf,LF_DONE_COMPUTE); + +S_2_1 = bestAlpha; +R_2_1 := 1; + +if (bestDiff <= saveBD) +{ + P_2_2 = 0; +} + +if (bestDiff > 0) +{ + fprintf (stdout, "\nFound a likelihood improvement in the direction (", S_1_1, ",", S_1_1, "), (", S_2_1, ",", S_2_1, "), ",bestDiff," likelihood points\n"); +} + +Optimize (res,lf); +d1 = ReportDistributionString(4,freqStrMx_1,"1_",1); +d2 = ReportDistributionString(4,freqStrMx_2,"2_",1); + +fprintf (stdout, "\n*** Done with pass 3. Log(L) = ", Format(res[1][0],10,3), " *** \n"); +fprintf (stdout, "Approximate rates for data set 1:\n", d1); +fprintf (stdout, "Approximate rates for data set 2:\n", d2); + +fprintf (stdout, "\nGateaux sampling negative selected directions\n"); + + +baseLineLL = res[1][0]; +LFCompute (lf,LF_START_COMPUTE); +bestDiff = -1e100; +bestAlpha = 1; +bestBeta = 1; + + +step = 1/16; +v1 = 0; +P_1_3 = 2/(filteredData_1.sites*(1-P_1_1)*(1-P_1_2)); +S_1_2 = 1; +for (v1c = 0; v1c < 15; v1c = v1c + 1) +{ + v1 = v1+step; + v2 = step/2; + for (v2c = 0; v2c < v1c; v2c = v2c+1) + { + checkASample ("1_2"); + v2 = v2 + step; + } +} + + +S_1_2 = bestAlpha; +R_1_2 = bestBeta; +bestAlpha = 1; +bestBeta = 1; + +if (bestDiff <= 0) +{ + P_1_3 = 0; +} +saveBD = bestDiff; + + +v1 = 0; +P_2_3 = 2/(filteredData_2.sites*(1-P_2_1)*(1-P_2_2)); +S_2_2 = 1; +for (v1c = 0; v1c < 15; v1c = v1c + 1) +{ + v1 = v1+step; + v2 = step/2; + for (v2c = 0; v2c < v1c; v2c = v2c+1) + { + checkASample ("2_2"); + v2 = v2 + step; + } +} + +LFCompute (lf,LF_DONE_COMPUTE); + +S_2_2 = bestAlpha; +R_2_2 = bestBeta; + +if (bestDiff <= saveBD) +{ + P_2_2 = 0; +} + +if (bestDiff > 0) +{ + fprintf (stdout, "\nFound a likelihood improvement in the direction (", S_1_2, ",", S_1_2*R_1_2, "), (", S_2_2, ",", S_2_2*R_2_2, "), ",bestDiff," likelihood points\n"); +} + +AC_1 = AC_1; AT_1 = AT_1; CG_1 = CG_1; CT_1 = CT_1; GT_1 = GT_1; +AC_2 = AC_2; AT_2 = AT_2; CG_2 = CG_2; CT_2 = CT_2; GT_2 = GT_2; + +if (Abs(modelConstraintString_1)) +{ + ExecuteCommands (modelConstraintString_1); +} +if (Abs(modelConstraintString_2)) +{ + ExecuteCommands (modelConstraintString_2); +} + +codonFactor_1 = codonFactor_1; +codonFactor_2 = codonFactor_2; + + +GetString (paramList, lf, -1); +degF = Columns(paramList["Global Independent"]) + 14 - 2*branchLengths + treeBranchParameters; + +OPTIMIZATION_PRECISION = sop; +fprintf (stdout, "Running an independent distributions model fit (", degF, " parameters)...\n"); +Optimize (res,lf); +LogL = res[1][0]; + +AIC = 2*(degF-LogL); +AICc = 2*(degF*totalCodonCount/(totalCodonCount-degF-1) - LogL); + +fprintf (stdout, "\n\nIndependent distributions model fit summary\n", + "\nLog likelihood:", Format (LogL, 15, 5), + "\nParameters :", Format (degF, 15, 0), + "\nAIC :", Format (AIC, 15, 5), + "\nc-AIC :", Format (AICc, 15, 5),"\n" +); + +fprintf (sumPath,CLEAR_FILE,"Independent distributions model fit summary\n", + "\nLog likelihood:", Format (LogL, 15, 5), + "\nParameters :", Format (degF, 15, 0), + "\nAIC :", Format (AIC, 15, 5), + "\nc-AIC :", Format (AICc, 15, 5),"\n"); + +d1 = ReportDistributionString(4,freqStrMx_1,"1_",0); +d2 = ReportDistributionString(4,freqStrMx_2,"2_",0); + +fprintf (stdout, "Inferred rates for data set 1:\n", d1); +fprintf (sumPath, "Inferred rates for data set 1:\n", d1); +fprintf (stdout, "Inferred rates for data set 2:\n", d2); +fprintf (sumPath, "Inferred rates for data set 2:\n", d2); + +LIKELIHOOD_FUNCTION_OUTPUT = 6; +fprintf (resToPath,CLEAR_FILE,lf); + + +/*-------------- SHARED POSITIVE SELECTION STRENGTHS ------------------*/ + +for (k=0; k=respP+respN) + { + ExecuteCommands ("R_1_"+k+"=0.5(R_1_"+k+"+R_2_"+k+");"); + } + ExecuteCommands ("S_2_"+k+":=S_1_"+k+";R_2_"+k+":=R_1_"+k+";"); +} +for (k=1; k<=resp; k=k+1) +{ + ExecuteCommands ("P_1_"+k+"=0.5(P_1_"+k+"+P_2_"+k+");"); + ExecuteCommands ("P_2_"+k+":=P_1_"+k+";"); +} + +GetString (paramList, lf, -1); +degFJ = Columns(paramList["Global Independent"]) + 14 - 2*branchLengths + treeBranchParameters; +fprintf (stdout, "\nRunning a shared distributions model fit (", degFJ, " parameters)...\n"); + +Optimize (res_J,lf); +LogLJ = res_J[1][0]; + +AICJ = 2*(degFJ-LogLJ); +AICcJ = 2*(degFJ*totalCodonCount/(totalCodonCount-degFJ-1) - LogLJ); + +fprintf (stdout, "\n\nShared distributions model fit summary\n", + "\nLog likelihood:", Format (LogLJ, 15, 5), + "\nParameters :", Format (degFJ, 15, 0), + "\nAIC :", Format (AICJ, 15, 5), + "\nc-AIC :", Format (AICcJ, 15, 5),"\n" +); + +fprintf (sumPath, "\n\nShared distributions model fit summary\n", + "\nLog likelihood:", Format (LogLJ, 15, 5), + "\nParameters :", Format (degFJ, 15, 0), + "\nAIC :", Format (AICJ, 15, 5), + "\nc-AIC :", Format (AICcJ, 15, 5),"\n" +); + +d1 = ReportDistributionString(4,freqStrMx_1,"1_",0); + +fprintf (stdout, "Inferred joint rates:\n", d1); +fprintf (sumPath, "Inferred joint rates:\n", d1); + +fpath = resToPath + ".JointAll"; +LIKELIHOOD_FUNCTION_OUTPUT = 6; +fprintf (fpath,CLEAR_FILE,lf); + +USE_LAST_RESULTS = 0; +SKIP_CONJUGATE_GRADIENT = 0; +LIKELIHOOD_FUNCTION_OUTPUT = 2; + +fprintf (stdout, "\nDistribution comparison tests\n", + "\n\tAre the distributions different?", + "\n\t\tLR = ", Format (2*(LogL-LogLJ),10,3), + " DF = ", degF-degFJ, + " p = ", Format(1-CChi2(2*(LogL-LogLJ), degF-degFJ),8,3), "\n"); + +fprintf (stdout, "\n\tAre selective regimes (dN/dS and proportions) different?", + "\n\t\tLR = ", Format (2*(LogL-LogLPSH),10,3), + " DF = ", degF-degFPSH, + " p = ", Format(1-CChi2(2*(LogL-LogLPSH), degF-degFPSH),8,3), "\n"); + +fprintf (stdout, "\n\tAre selection strengths (dN/dS) different?", + "\n\t\tLR = ", Format (2*(LogL-LogLPSS),10,3), + " DF = ", degF-degFPSS, + " p = ", Format(1-CChi2(2*(LogL-LogLPSS), degF-degFPSH),8,3), "\n"); + +fprintf (stdout, "\n\tAre the proportions of codons under selection different?", + "\n\t\tLR = ", Format (2*(LogL-LogLPSP),10,3), + " DF = ", degF-degFPSP, + " p = ", Format(1-CChi2(2*(LogL-LogLPSP), degF-degFPSP),8,3), "\n"); + +fprintf (sumPath, "\n\nDistribution comparison tests\n", + "\n\tAre the distributions different?", + "\n\t\tLR = ", Format (2*(LogL-LogLJ),10,3), + " DF = ", degF-degFJ, + " p = ", Format(1-CChi2(2*(LogL-LogLJ), degF-degFJ),8,3), "\n"); + +fprintf (sumPath, "\n\tAre selective regimes (dN/dS and proportions) different?", + "\n\t\tLR = ", Format (2*(LogL-LogLPSH),10,3), + " DF = ", degF-degFPSH, + " p = ", Format(1-CChi2(2*(LogL-LogLPSH), degF-degFPSH),8,3), "\n"); + +fprintf (sumPath, "\n\tAre selection strengths (dN/dS) different?", + "\n\t\tLR = ", Format (2*(LogL-LogLPSS),10,3), + " DF = ", degF-degFPSS, + " p = ", Format(1-CChi2(2*(LogL-LogLPSS), degF-degFPSH),8,3), "\n"); + +fprintf (sumPath, "\n\tAre the proportions of codons under selection different?", + "\n\t\tLR = ", Format (2*(LogL-LogLPSP),10,3), + " DF = ", degF-degFPSP, + " p = ", Format(1-CChi2(2*(LogL-LogLPSP), degF-degFPSP),8,3), "\n"); +/*------------------------------------------------------------------------------------------------------------*/ + +function checkASample (whichRate) +{ + ExecuteCommands ("S_"+whichRate+"=v1;R_"+whichRate+"=v2/v1;"); + LFCompute (lf,res_n); + localDiff = res_n-baseLineLL; + if (localDiff > bestDiff) + { + bestDiff = localDiff; + bestAlpha = v1; + bestBeta = v2/v1; + } + return 0; +} diff --git a/samples/HyPhy/hyphy_cmds.bf b/samples/HyPhy/hyphy_cmds.bf new file mode 100644 index 00000000..7a2189f9 --- /dev/null +++ b/samples/HyPhy/hyphy_cmds.bf @@ -0,0 +1,11003 @@ +INTEGRATION_PRECISION_FACTOR = 5.0e-6; +END_OF_FILE = 0; +LIKELIHOOD_FUNCTION_OUTPUT = 5; +ACCEPT_BRANCH_LENGTHS = 1; +#include "/home/oashenbe/.local/lib/python2.7/site-packages/phyloExpCM/data//NTsCodonsAAs.ibf"; +fprintf(stdout, "Running HYPHY script hyphy_cmds.bf...\n"); +DataSet data = ReadDataFile("_codenames_Aligned_NPs_Swine.fasta"); +assert(data.sites % 3 == 0, "Sequence lengths not multiples of 3"); +totalcodons = data.sites $ 3; +fprintf(stdout, "Read from _codenames_Aligned_NPs_Swine.fasta a set of ", data.species, " sequences consisting of ", data.sites, " nucleotides corresponding to ", totalcodons, " codons each.\n"); +fprintf(stdout, "The analysis will include the following 498 codon positions (sequential numbering starting with 1):\n1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419, 420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, 442, 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, 464, 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479, 480, 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493, 494, 495, 496, 497, 498\n"); +assert(totalcodons >= 498, "Largest included site exceeds sequence length"); +DataSetFilter codonfilter = CreateFilter(data, 3, "0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255,256,257,258,259,260,261,262,263,264,265,266,267,268,269,270,271,272,273,274,275,276,277,278,279,280,281,282,283,284,285,286,287,288,289,290,291,292,293,294,295,296,297,298,299,300,301,302,303,304,305,306,307,308,309,310,311,312,313,314,315,316,317,318,319,320,321,322,323,324,325,326,327,328,329,330,331,332,333,334,335,336,337,338,339,340,341,342,343,344,345,346,347,348,349,350,351,352,353,354,355,356,357,358,359,360,361,362,363,364,365,366,367,368,369,370,371,372,373,374,375,376,377,378,379,380,381,382,383,384,385,386,387,388,389,390,391,392,393,394,395,396,397,398,399,400,401,402,403,404,405,406,407,408,409,410,411,412,413,414,415,416,417,418,419,420,421,422,423,424,425,426,427,428,429,430,431,432,433,434,435,436,437,438,439,440,441,442,443,444,445,446,447,448,449,450,451,452,453,454,455,456,457,458,459,460,461,462,463,464,465,466,467,468,469,470,471,472,473,474,475,476,477,478,479,480,481,482,483,484,485,486,487,488,489,490,491,492,493,494,495,496,497,498,499,500,501,502,503,504,505,506,507,508,509,510,511,512,513,514,515,516,517,518,519,520,521,522,523,524,525,526,527,528,529,530,531,532,533,534,535,536,537,538,539,540,541,542,543,544,545,546,547,548,549,550,551,552,553,554,555,556,557,558,559,560,561,562,563,564,565,566,567,568,569,570,571,572,573,574,575,576,577,578,579,580,581,582,583,584,585,586,587,588,589,590,591,592,593,594,595,596,597,598,599,600,601,602,603,604,605,606,607,608,609,610,611,612,613,614,615,616,617,618,619,620,621,622,623,624,625,626,627,628,629,630,631,632,633,634,635,636,637,638,639,640,641,642,643,644,645,646,647,648,649,650,651,652,653,654,655,656,657,658,659,660,661,662,663,664,665,666,667,668,669,670,671,672,673,674,675,676,677,678,679,680,681,682,683,684,685,686,687,688,689,690,691,692,693,694,695,696,697,698,699,700,701,702,703,704,705,706,707,708,709,710,711,712,713,714,715,716,717,718,719,720,721,722,723,724,725,726,727,728,729,730,731,732,733,734,735,736,737,738,739,740,741,742,743,744,745,746,747,748,749,750,751,752,753,754,755,756,757,758,759,760,761,762,763,764,765,766,767,768,769,770,771,772,773,774,775,776,777,778,779,780,781,782,783,784,785,786,787,788,789,790,791,792,793,794,795,796,797,798,799,800,801,802,803,804,805,806,807,808,809,810,811,812,813,814,815,816,817,818,819,820,821,822,823,824,825,826,827,828,829,830,831,832,833,834,835,836,837,838,839,840,841,842,843,844,845,846,847,848,849,850,851,852,853,854,855,856,857,858,859,860,861,862,863,864,865,866,867,868,869,870,871,872,873,874,875,876,877,878,879,880,881,882,883,884,885,886,887,888,889,890,891,892,893,894,895,896,897,898,899,900,901,902,903,904,905,906,907,908,909,910,911,912,913,914,915,916,917,918,919,920,921,922,923,924,925,926,927,928,929,930,931,932,933,934,935,936,937,938,939,940,941,942,943,944,945,946,947,948,949,950,951,952,953,954,955,956,957,958,959,960,961,962,963,964,965,966,967,968,969,970,971,972,973,974,975,976,977,978,979,980,981,982,983,984,985,986,987,988,989,990,991,992,993,994,995,996,997,998,999,1000,1001,1002,1003,1004,1005,1006,1007,1008,1009,1010,1011,1012,1013,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023,1024,1025,1026,1027,1028,1029,1030,1031,1032,1033,1034,1035,1036,1037,1038,1039,1040,1041,1042,1043,1044,1045,1046,1047,1048,1049,1050,1051,1052,1053,1054,1055,1056,1057,1058,1059,1060,1061,1062,1063,1064,1065,1066,1067,1068,1069,1070,1071,1072,1073,1074,1075,1076,1077,1078,1079,1080,1081,1082,1083,1084,1085,1086,1087,1088,1089,1090,1091,1092,1093,1094,1095,1096,1097,1098,1099,1100,1101,1102,1103,1104,1105,1106,1107,1108,1109,1110,1111,1112,1113,1114,1115,1116,1117,1118,1119,1120,1121,1122,1123,1124,1125,1126,1127,1128,1129,1130,1131,1132,1133,1134,1135,1136,1137,1138,1139,1140,1141,1142,1143,1144,1145,1146,1147,1148,1149,1150,1151,1152,1153,1154,1155,1156,1157,1158,1159,1160,1161,1162,1163,1164,1165,1166,1167,1168,1169,1170,1171,1172,1173,1174,1175,1176,1177,1178,1179,1180,1181,1182,1183,1184,1185,1186,1187,1188,1189,1190,1191,1192,1193,1194,1195,1196,1197,1198,1199,1200,1201,1202,1203,1204,1205,1206,1207,1208,1209,1210,1211,1212,1213,1214,1215,1216,1217,1218,1219,1220,1221,1222,1223,1224,1225,1226,1227,1228,1229,1230,1231,1232,1233,1234,1235,1236,1237,1238,1239,1240,1241,1242,1243,1244,1245,1246,1247,1248,1249,1250,1251,1252,1253,1254,1255,1256,1257,1258,1259,1260,1261,1262,1263,1264,1265,1266,1267,1268,1269,1270,1271,1272,1273,1274,1275,1276,1277,1278,1279,1280,1281,1282,1283,1284,1285,1286,1287,1288,1289,1290,1291,1292,1293,1294,1295,1296,1297,1298,1299,1300,1301,1302,1303,1304,1305,1306,1307,1308,1309,1310,1311,1312,1313,1314,1315,1316,1317,1318,1319,1320,1321,1322,1323,1324,1325,1326,1327,1328,1329,1330,1331,1332,1333,1334,1335,1336,1337,1338,1339,1340,1341,1342,1343,1344,1345,1346,1347,1348,1349,1350,1351,1352,1353,1354,1355,1356,1357,1358,1359,1360,1361,1362,1363,1364,1365,1366,1367,1368,1369,1370,1371,1372,1373,1374,1375,1376,1377,1378,1379,1380,1381,1382,1383,1384,1385,1386,1387,1388,1389,1390,1391,1392,1393,1394,1395,1396,1397,1398,1399,1400,1401,1402,1403,1404,1405,1406,1407,1408,1409,1410,1411,1412,1413,1414,1415,1416,1417,1418,1419,1420,1421,1422,1423,1424,1425,1426,1427,1428,1429,1430,1431,1432,1433,1434,1435,1436,1437,1438,1439,1440,1441,1442,1443,1444,1445,1446,1447,1448,1449,1450,1451,1452,1453,1454,1455,1456,1457,1458,1459,1460,1461,1462,1463,1464,1465,1466,1467,1468,1469,1470,1471,1472,1473,1474,1475,1476,1477,1478,1479,1480,1481,1482,1483,1484,1485,1486,1487,1488,1489,1490,1491,1492,1493", "", "TAA,TAG,TGA"); +assert(data.species == codonfilter.species, "species number mismatch"); +assert(codonfilter.sites == 498, "Codon filtered data does not contain the right number of sites"); +fprintf(stdout, "Created a codon filter of ", codonfilter.sites, " sites.\n"); +assert(totalcodons - (totalcodons - 498) - 0 == codonfilter.sites, "Codon filtered data is not the expected length. Do sequences contain stop codons?"); +CheckCodonFilter("codonfilter"); +fprintf(stdout, "Reading tree string from _codenames_codonphyml_Swine_tree.newick.\n"); +fscanf("_codenames_codonphyml_Swine_tree.newick", String, treestring); +fprintf(stdout, "Using the Goldman Yang 1994 (GY94) codon model...\n"); +#include "/home/oashenbe/.local/lib/python2.7/site-packages/phyloExpCM/data//CF3x4.ibf"; +#include "/home/oashenbe/.local/lib/python2.7/site-packages/phyloExpCM/data//GY94.ibf"; +CreateGY94Model("CF3x4", "global", "global", 4, 4, 1); +UseModel(model); +ExecuteCommands("Tree tree = treestring;") +assert(codonfilter.species == TipCount(tree), "Number of species and number of tips differ"); +LikelihoodFunction likelihood = (codonfilter, tree); +fprintf(stdout, "\nNow optimizing the likelihood function...\n"); +Optimize(mlestimates, likelihood) +fprintf(stdout, "Completed likelihood optimization. Optimized ", mlestimates[1][1], " indpendent parameters and ", mlestimates[1][2], " shared parameters to obtain a log likelihood of ", mlestimates[1][0], ".\n"); +fprintf(stdout, "Writing the results to hyphy_output.txt.\n"); +fprintf("hyphy_output.txt", "Log likelihood: ", mlestimates[1][0], "\nindependent parameters (includes branch lengths): ", mlestimates[1][1], "\nshared parameters: ", mlestimates[1][2], "\nnumber of branch lengths: ", TipCount(tree) + BranchCount(tree), "\nnumber of tip nodes: ", TipCount(tree), "\nnumber of internal branches: ", BranchCount(tree), "\n",likelihood); +fprintf(stdout, "\nNow computing per-site likelihoods.\n"); +fprintf(stdout, "\nFirst fixing all global variables to the maximum-likelihood values estimated on the entire tree.\n"); +GetString(associativearray, likelihood, -1); +globalindependentvariables = associativearray["Global Independent"]; +for (ivariable=0; ivariable