diff --git a/samples/HyPhy/454.bf b/samples/HyPhy/454.bf new file mode 100644 index 00000000..40c867bd --- /dev/null +++ b/samples/HyPhy/454.bf @@ -0,0 +1,2988 @@ + + + + + +
+ + + + + +
+ You can clone with + HTTPS, SSH, or Subversion. + + + +
+ + + + Clone in Desktop + + + + + + + Download ZIP + +| + | ExecuteAFile ("Utility/GrabBag.bf"); | +
| + | ExecuteAFile ("Utility/DBTools.ibf"); | +
| + | + |
| + | + |
| + | alignOptions = {}; | +
| + | + |
| + | SetDialogPrompt ("Sequence File:"); | +
| + | DataSet unal = ReadDataFile (PROMPT_FOR_FILE); | +
| + | BASE_FILE_PATH = LAST_FILE_PATH; | +
| + | DB_FILE_PATH = LAST_FILE_PATH + ".db"; | +
| + | + |
| + | ANALYSIS_DB_ID = _openCacheDB (DB_FILE_PATH); | +
| + | + |
| + | haveTable = _TableExists (ANALYSIS_DB_ID, "SETTINGS"); | +
| + | if (haveTable) | +
| + | { | +
| + | existingSettings = _ExecuteSQL (ANALYSIS_DB_ID, "SELECT * FROM SETTINGS"); | +
| + | } | +
| + | + |
| + | if (Abs(existingSettings)) | +
| + | { | +
| + | existingSettings = existingSettings [0]; | +
| + | ExecuteCommands ("_Genetic_Code = " + existingSettings["GENETIC_CODE"]); | +
| + | ExecuteCommands (existingSettings["OPTIONS"] ^ {{"_hyphyAssociativeArray","alignOptions"}}); | +
| + | masterReferenceSequence = existingSettings["REFERENCE"]; | +
| + | dbSequences = _ExecuteSQL (ANALYSIS_DB_ID, "SELECT SEQUENCE_ID FROM SEQUENCES WHERE STAGE = 0"); | +
| + | unalSequenceCount = Abs(dbSequences); | +
| + | toDoSequences = {unalSequenceCount,1}; | +
| + | for (k = 0; k < unalSequenceCount; k=k+1) | +
| + | { | +
| + | toDoSequences[k] = (dbSequences[k])["SEQUENCE_ID"]; | +
| + | } | +
| + | dbSequences = 0; | +
| + | fprintf (stdout, "[PHASE 1] Reloaded ", unalSequenceCount, " unprocessed sequences\n"); | +
| + | } | +
| + | else | +
| + | { | +
| + | tableDefines = {}; | +
| + | tableDefines ["SETTINGS"] = {}; | +
| + | (tableDefines ["SETTINGS"])["RUN_DATE"] = "DATE"; | +
| + | (tableDefines ["SETTINGS"])["OPTIONS"] = "TEXT"; | +
| + | (tableDefines ["SETTINGS"])["REFERENCE"] = "TEXT"; | +
| + | (tableDefines ["SETTINGS"])["GENETIC_CODE"] = "TEXT"; | +
| + | (tableDefines ["SETTINGS"])["THRESHOLD"] = "REAL"; | +
| + | + |
| + | tableDefines ["SEQUENCES"] = {}; | +
| + | (tableDefines ["SEQUENCES"])["SEQUENCE_ID"] = "TEXT UNIQUE"; | +
| + | (tableDefines ["SEQUENCES"])["LENGTH"] = "INTEGER"; | +
| + | (tableDefines ["SEQUENCES"])["STAGE"] = "INTEGER"; | +
| + | /* | +
| + | 0 - initial import | +
| + | 1 - in frame without a fix | +
| + | 2 - one frame shift / fixed | +
| + | 3 - out-of-frame; not fixed / not aligned | +
| + | */ | +
| + | (tableDefines ["SEQUENCES"])["RAW"] = "TEXT"; | +
| + | (tableDefines ["SEQUENCES"])["ALIGNED_AA"] = "TEXT"; /* aligned aa. sequence */ | +
| + | (tableDefines ["SEQUENCES"])["ALIGNED"] = "TEXT"; /* aligned nucleotide sequence */ | +
| + | (tableDefines ["SEQUENCES"])["OFFSET"] = "INTEGER"; /* start offset w.r.t the reference sequence */ | +
| + | (tableDefines ["SEQUENCES"])["END_OFFSET"] = "INTEGER"; /* end offset w.r.t the reference sequence */ | +
| + | (tableDefines ["SEQUENCES"])["SCORE"] = "REAL"; | +
| + | (tableDefines ["SEQUENCES"])["FRAME"] = "INTEGER"; | +
| + | + |
| + | _CreateTableIfNeeded (ANALYSIS_DB_ID, "SETTINGS", tableDefines["SETTINGS"], 0); | +
| + | _CreateTableIfNeeded (ANALYSIS_DB_ID, "SEQUENCES", tableDefines["SEQUENCES"], 1); | +
| + | + |
| + | /* START ALIGNMENT SETTINGS */ | +
| + | LoadFunctionLibrary ("SeqAlignShared.ibf"); | +
| + | + |
| + | DataSetFilter filteredData = CreateFilter (unal,1); | +
| + | + |
| + | GetInformation (UnalignedSeqs,filteredData); | +
| + | /* preprocess sequences */ | +
| + | + |
| + | unalSequenceCount = Rows(UnalignedSeqs)*Columns(UnalignedSeqs); | +
| + | GetString (sequenceNames, unal, -1); | +
| + | + |
| + | longestSequence = 0; | +
| + | longestSequenceIDX = 0; | +
| + | + |
| + | seqRecord = {}; | +
| + | + |
| + | fprintf (stdout, "[PHASE 1] Initial Processing of ", unalSequenceCount, " sequences\n"); | +
| + | + |
| + | for (seqCounter = 0; seqCounter < unalSequenceCount; seqCounter = seqCounter+1) | +
| + | { | +
| + | UnalignedSeqs[seqCounter] = UnalignedSeqs[seqCounter]^{{"[^a-zA-Z]",""}}; | +
| + | UnalignedSeqs[seqCounter] = UnalignedSeqs[seqCounter]^{{"^N+",""}}; | +
| + | UnalignedSeqs[seqCounter] = UnalignedSeqs[seqCounter]^{{"N+$",""}}; | +
| + | + |
| + | seqRecord ["SEQUENCE_ID"] = sequenceNames[seqCounter]; | +
| + | seqRecord ["LENGTH"] = Abs (UnalignedSeqs[seqCounter]); | +
| + | seqRecord ["STAGE"] = 0; | +
| + | seqRecord ["RAW"] = UnalignedSeqs[seqCounter]; | +
| + | + |
| + | if (doLongestSequence) | +
| + | { | +
| + | if (doLongestSequence == 1 || seqCounter != unalSequenceCount-1) | +
| + | { | +
| + | if (Abs (UnalignedSeqs[seqCounter]) > longestSequence) | +
| + | { | +
| + | longestSequence = Abs (UnalignedSeqs[seqCounter]); | +
| + | longestSequenceIDX = seqCounter; | +
| + | } | +
| + | } | +
| + | } | +
| + | + |
| + | _InsertRecord (ANALYSIS_DB_ID, "SEQUENCES", seqRecord); | +
| + | SetParameter (STATUS_BAR_STATUS_STRING, "Initial processing ("+seqCounter+"/"+unalSequenceCount+" done)",0); | +
| + | } | +
| + | + |
| + | if (refSeq == 0) | +
| + | { | +
| + | masterReferenceSequence = UnalignedSeqs[0]; | +
| + | } | +
| + | + |
| + | + |
| + | if (doLongestSequence) | +
| + | { | +
| + | fprintf (stdout, "\nSelected sequence ", sequenceNames[longestSequenceIDX], " as reference."); | +
| + | masterReferenceSequence = UnalignedSeqs[longestSequenceIDX]; | +
| + | } | +
| + | + |
| + | incFileName = HYPHY_LIB_DIRECTORY+"TemplateBatchFiles"+DIRECTORY_SEPARATOR+"TemplateModels"+DIRECTORY_SEPARATOR+"chooseGeneticCode.def"; | +
| + | ExecuteCommands ("#include \""+incFileName+"\";"); | +
| + | + |
| + | doLongestSequence = (refSeq==1); | +
| + | aRecord = {}; | +
| + | aRecord["RUN_DATE"] = _ExecuteSQL(ANALYSIS_DB_ID,"SELECT DATE('NOW') AS CURRENT_DATE"); | +
| + | aRecord["RUN_DATE"] = ((aRecord["RUN_DATE"])[0])["CURRENT_DATE"]; | +
| + | aRecord["OPTIONS"] = "" + alignOptions; | +
| + | aRecord["REFERENCE"] = masterReferenceSequence; | +
| + | aRecord["GENETIC_CODE"] = "" + _Genetic_Code; | +
| + | _InsertRecord (ANALYSIS_DB_ID, "SETTINGS", aRecord); | +
| + | toDoSequences = sequenceNames; | +
| + | UnalignedSequences = 0; | +
| + | + |
| + | } | +
| + | + |
| + | skipOutliers = 1; | +
| + | doRC = 1; | +
| + | + |
| + | /* build codon translation table */ | +
| + | + |
| + | codonToAAMap = {}; | +
| + | codeToAA = "FLIMVSPTAYXHQNKDECWRG"; | +
| + | + |
| + | nucChars = "ACGT"; | +
| + | + |
| + | for (p1=0; p1<64; p1=p1+1) | +
| + | { | +
| + | codon = nucChars[p1$16]+nucChars[p1%16$4]+nucChars[p1%4]; | +
| + | ccode = _Genetic_Code[p1]; | +
| + | codonToAAMap[codon] = codeToAA[ccode]; | +
| + | } | +
| + | + |
| + | /* determine reading frames */ | +
| + | ProteinSequences = {}; | +
| + | AllTranslations = {}; | +
| + | ReadingFrames = {}; | +
| + | StopCodons = {}; | +
| + | StopPositions = {}; | +
| + | RC = {}; | +
| + | + |
| + | fprintf (stdout, "\n[PHASE 2] Detecting reading frames for each unprocessed sequence...\n"); | +
| + | frameCounter = {3,2}; | +
| + | stillHasStops = {}; | +
| + | aRecord = {}; | +
| + | + |
| + | for (seqCounter = 0; seqCounter < unalSequenceCount; seqCounter = seqCounter+1) | +
| + | { | +
| + | rawSeq = (_ExecuteSQL(ANALYSIS_DB_ID,"SELECT RAW FROM SEQUENCES WHERE SEQUENCE_ID = '" + toDoSequences[seqCounter] + "'")); | +
| + | aSeq = (rawSeq[0])["RAW"]; | +
| + | seqLen = Abs(aSeq)-2; | +
| + | + |
| + | minStops = 1e20; | +
| + | tString = ""; | +
| + | rFrame = 0; | +
| + | rrc = 0; | +
| + | allTran = {3,2}; | +
| + | stopPosn = {6,2}; | +
| + | + |
| + | for (rc = 0; rc<=doRC; rc = rc+1) | +
| + | { | +
| + | if (rc) | +
| + | { | +
| + | aSeq = nucleotideReverseComplement (aSeq) | +
| + | } | +
| + | for (offset = 0; offset < 3; offset = offset+1) | +
| + | { | +
| + | translString = ""; | +
| + | translString * (seqLen/3+1); | +
| + | for (seqPos = offset; seqPos < seqLen; seqPos = seqPos+3) | +
| + | { | +
| + | codon = aSeq[seqPos][seqPos+2]; | +
| + | prot = codonToAAMap[codon]; | +
| + | if (Abs(prot)) | +
| + | { | +
| + | translString * prot; | +
| + | } | +
| + | else | +
| + | { | +
| + | translString * "?"; | +
| + | } | +
| + | } | +
| + | translString * 0; | +
| + | translString = translString^{{"X$","?"}}; | +
| + | stopPos = translString||"X"; | +
| + | if (stopPos[0]>=0) | +
| + | { | +
| + | stopCount = Rows(stopPos)$2; | +
| + | stopPosn[3*rc+offset][0] = stopPos[0]; | +
| + | stopPosn[3*rc+offset][1] = stopPos[stopCount*2-1]; | +
| + | } | +
| + | else | +
| + | { | +
| + | stopCount = 0; | +
| + | } | +
| + | if (stopCount<minStops) | +
| + | { | +
| + | minStops = stopCount; | +
| + | rFrame = offset; | +
| + | rrc = rc; | +
| + | tString = translString; | +
| + | } | +
| + | allTran[offset][rc] = translString; | +
| + | } | +
| + | } | +
| + | + |
| + | ReadingFrames [seqCounter] = rFrame; | +
| + | ProteinSequences [seqCounter] = tString; | +
| + | frameCounter [rFrame][rrc] = frameCounter[rFrame][rrc]+1; | +
| + | StopPositions [seqCounter] = stopPosn; | +
| + | AllTranslations [seqCounter] = allTran; | +
| + | RC [seqCounter] = rrc; | +
| + | + |
| + | SetParameter (STATUS_BAR_STATUS_STRING, "Reading frame analysis ("+seqCounter+"/"+unalSequenceCount+" done)",0); | +
| + | } | +
| + | + |
| + | _closeCacheDB (ANALYSIS_DB_ID); | +
| + | return 0; | +
| + | + |
| + | s1 = ProteinSequences[0]; | +
| + | fprintf (stdout, "\nFound:\n\t", frameCounter[0], " sequences in reading frame 1\n\t",frameCounter[1], " sequences in reading frame 2\n\t",frameCounter[2], " sequences in reading frame 3\n\nThere were ", Abs(stillHasStops), " sequences with apparent frameshift/sequencing errors\n"); | +
| + | + |
| + | + |
| + | skipSeqs = {}; | +
| + | + |
| + | for (k=0; k<Abs(stillHasStops); k=k+1) | +
| + | { | +
| + | seqCounter = stillHasStops[k]; | +
| + | seqName = sequenceNames[seqCounter]; | +
| + | fprintf (stdout,"Sequence ", seqCounter+1, " (", seqName, ") seems to have"); | +
| + | stopPosn = StopPositions[seqCounter]; | +
| + | + |
| + | fStart = -1; | +
| + | fEnd = -1; | +
| + | fMin = 1e10; | +
| + | frame1 = 0; | +
| + | frame2 = 0; | +
| + | + |
| + | checkFramePosition (stopPosn[0][1],stopPosn[1][0],0,1); | +
| + | checkFramePosition (stopPosn[1][1],stopPosn[0][0],1,0); | +
| + | checkFramePosition (stopPosn[0][1],stopPosn[2][0],0,2); | +
| + | checkFramePosition (stopPosn[2][1],stopPosn[0][0],2,0); | +
| + | checkFramePosition (stopPosn[2][1],stopPosn[1][0],2,1); | +
| + | checkFramePosition (stopPosn[1][1],stopPosn[2][0],1,2); | +
| + | + |
| + | if (fStart>=0) | +
| + | { | +
| + | allTran = AllTranslations[seqCounter]; | +
| + | useq = UnalignedSeqs[seqCounter]; | +
| + | fprintf (stdout, " a shift from frame ", frame2+1, " to frame ", frame1+1, " between a.a. positions ", fStart, " and ", fEnd, "."); | +
| + | fStart2 = Max(fStart-1,0); | +
| + | fEnd2 = Min(fEnd+1,Min(Abs(allTran[frame1]),Abs(allTran[frame2]))-1); | +
| + | tempString = allTran[frame2]; | +
| + | fprintf (stdout, "\n\tRegion ", fStart2, "-", fEnd2, " in frame ", frame2+1, ":\n\t", tempString[fStart2][fEnd2]); | +
| + | fprintf (stdout, "\n\t", useq[3*fStart2+frame2][3*fEnd2+frame2-1]); | +
| + | tempString = allTran[frame1]; | +
| + | fprintf (stdout, "\n\tRegion ", fStart2, "-", fEnd2, " in frame ", frame1+1, ":\n\t", tempString[fStart2][fEnd2]); | +
| + | fprintf (stdout, "\n\t", useq[3*fStart2+frame1][3*fEnd2+frame1-1]); | +
| + | fprintf (stdout, "\n\t\tAttempting to resolve by alignment to reference. "); | +
| + | + |
| + | f1s = allTran[frame1]; | +
| + | f2s = allTran[frame2]; | +
| + | f1l = Abs(f1s); | +
| + | + |
| + | bestScore = -1e10; | +
| + | bestSplice = -1; | +
| + | + |
| + | for (k2=fStart; k2<fEnd; k2=k2+1) | +
| + | { | +
| + | s2 = f2s[0][k2]+f1s[k2+1][Abs(f1s)]; | +
| + | inStr = {{s1,s2}}; | +
| + | AlignSequences(aligned, inStr, alignOptions); | +
| + | aligned = aligned[0]; | +
| + | aligned = aligned[0]; | +
| + | if (aligned > bestScore) | +
| + | { | +
| + | bestScore = aligned; | +
| + | bestSplice = k2; | +
| + | bestString = s2; | +
| + | } | +
| + | } | +
| + | fprintf (stdout, "Best splice site appears to be at a.a. position ", bestSplice, "\n"); | +
| + | /* update best spliced string */ | +
| + | + |
| + | ProteinSequences[seqCounter] = bestString; | +
| + | ReadingFrames[seqCounter] = 0; | +
| + | + |
| + | UnalignedSeqs[seqCounter] = useq[frame2][frame2+3*bestSplice+2] + useq[frame1+3*bestSplice+3][Abs(useq)-1] + "---"; | +
| + | } | +
| + | else | +
| + | { | +
| + | + |
| + | fprintf (stdout, " multiple frameshifts\n"); | +
| + | skipSeqs[seqCounter] = 1; | +
| + | } | +
| + | } | +
| + | + |
| + | SeqAlignments = {}; | +
| + | startingPosition = {unalSequenceCount,2}; | +
| + | refLength = Abs(ProteinSequences[0]); | +
| + | refInsertions = {refLength,1}; | +
| + | + |
| + | fprintf (stdout,"\nPerforming pairwise alignment with reference sequences\n"); | +
| + | + |
| + | alignmentScores = {}; | +
| + | + |
| + | for (seqCounter = 1; seqCounter < unalSequenceCount; seqCounter = seqCounter+1) | +
| + | { | +
| + | if (skipSeqs[seqCounter] == 0) | +
| + | { | +
| + | s2 = ProteinSequences[seqCounter]; | +
| + | inStr = {{s1,s2}}; | +
| + | AlignSequences(aligned, inStr, alignOptions); | +
| + | aligned = aligned[0]; | +
| + | SeqAlignments[seqCounter] = aligned; | +
| + | alignmentScores[Abs(alignmentScores)] = aligned[0]/Abs(aligned[1]); | +
| + | aligned = aligned[1]; | +
| + | myStartingPosition = aligned$"[^-]"; | +
| + | myEndingPosition = Abs (aligned)-1; | +
| + | while (aligned[myEndingPosition]=="-") | +
| + | { | +
| + | myEndingPosition = myEndingPosition - 1; | +
| + | } | +
| + | myStartingPosition = myStartingPosition[0]; | +
| + | startingPosition[seqCounter][0] = myStartingPosition; | +
| + | startingPosition[seqCounter][1] = myEndingPosition; | +
| + | aligned = aligned[myStartingPosition][myEndingPosition]; | +
| + | refInsert = aligned||"-+"; | +
| + | if (refInsert[0]>0) | +
| + | { | +
| + | insCount = Rows (refInsert)/2; | +
| + | offset = 0; | +
| + | for (insN = 0; insN < insCount; insN = insN+1) | +
| + | { | +
| + | insPos = refInsert[insN*2]; | +
| + | insLength = refInsert[insN*2+1]-insPos+1; | +
| + | insPos = insPos-offset; | +
| + | if (refInsertions[insPos]<insLength) | +
| + | { | +
| + | refInsertions[insPos]=insLength; | +
| + | } | +
| + | offset = offset + insLength; | +
| + | } | +
| + | } | +
| + | } | +
| + | } | +
| + | + |
| + | alignmentScoresM = avlToMatrix ("alignmentScores"); | +
| + | ExecuteAFile ("Utility/DescriptiveStatistics.bf"); | +
| + | distInfo = GatherDescriptiveStats (alignmentScoresM); | +
| + | /*lowerCuttoff = 0.25;*/ | +
| + | distInfo["Mean"] - 2*distInfo["Std.Dev"]; | +
| + | /* produce a fully gapped reference sequence */ | +
| + | + |
| + | fprintf (stdout,"\nMerging pairwise alignments into a MSA\n"); | +
| + | + |
| + | fullRefSeq = ""; | +
| + | fullRefSeq * refLength; | +
| + | fullRefSeq * (s1[0]); | +
| + | + |
| + | + |
| + | s1N = UnalignedSeqs[0]; | +
| + | + |
| + | fullRefSeqN = ""; | +
| + | fullRefSeqN * (3*refLength); | +
| + | fullRefSeqN * (s1N[0][2]); | +
| + | + |
| + | frameShift = ReadingFrames[0]; | +
| + | + |
| + | for (seqCounter=1;seqCounter<refLength;seqCounter=seqCounter+1) | +
| + | { | +
| + | gapCount = refInsertions[seqCounter]; | +
| + | for (k=0; k<gapCount;k=k+1) | +
| + | { | +
| + | fullRefSeq*("-"); | +
| + | fullRefSeqN*("---"); | +
| + | } | +
| + | fullRefSeq * (s1[seqCounter]); | +
| + | fullRefSeqN * (s1N[frameShift+seqCounter*3][frameShift+seqCounter*3+2]); | +
| + | } | +
| + | + |
| + | fullRefSeq * 0; | +
| + | fullRefSeqN * 0; | +
| + | + |
| + | return 0; | +
| + | + |
| + | refLength = Abs(fullRefSeq); | +
| + | + |
| + | SetDialogPrompt ("Save alignment to:"); | +
| + | + |
| + | seqName=sequenceNames[0]; | +
| + | fprintf (PROMPT_FOR_FILE,CLEAR_FILE,">",seqName,"\n",fullRefSeq); | +
| + | fName = LAST_FILE_PATH; | +
| + | fNameC = fName+".nuc"; | +
| + | fprintf (fNameC,CLEAR_FILE,">",seqName,"\n",fullRefSeqN); | +
| + | + |
| + | alCounter = 0; | +
| + | + |
| + | for (seqCounter = 1; seqCounter < unalSequenceCount; seqCounter = seqCounter+1) | +
| + | { | +
| + | if (skipSeqs[seqCounter] == 0) | +
| + | { | +
| + | if (skipOutliers == 0 && alignmentScoresM[alCounter] < lowerCuttoff) | +
| + | { | +
| + | seqName=sequenceNames[seqCounter]; | +
| + | fprintf (stdout, "Sequence ", seqName ," was skipped because of a poor alignment score.\n"); | +
| + | skipSeqs[seqCounter] = 1; | +
| + | alCounter = alCounter + 1; | +
| + | continue; | +
| + | } | +
| + | alCounter = alCounter + 1; | +
| + | seqName=sequenceNames[seqCounter]; | +
| + | aligned = SeqAlignments[seqCounter]; | +
| + | + |
| + | aligned1 = aligned[1]; | +
| + | aligned2 = aligned[2]; | +
| + | + |
| + | s2 = startingPosition[seqCounter][0]; | +
| + | e2 = startingPosition[seqCounter][1]; | +
| + | + |
| + | gappedSeq = ""; | +
| + | gappedSeq * Abs(aligned2); | +
| + | + |
| + | + |
| + | k=0; | +
| + | + |
| + | while (k<refLength) | +
| + | { | +
| + | while (fullRefSeq[k]!=aligned1[s2]) | +
| + | { | +
| + | gappedSeq*("-"); | +
| + | k=k+1; | +
| + | } | +
| + | gappedSeq*(aligned2[s2]); | +
| + | s2=s2+1; | +
| + | k=k+1; | +
| + | } | +
| + | + |
| + | gappedSeq * 0; | +
| + | + |
| + | gappedSeqN = ""; | +
| + | gappedSeqN * (3*Abs(aligned2)); | +
| + | + |
| + | frameShift = ReadingFrames[seqCounter]; | +
| + | + |
| + | s1N = UnalignedSeqs[seqCounter]; | +
| + | s2N = ProteinSequences[seqCounter]; | +
| + | s2 = startingPosition[seqCounter][0]; | +
| + | k = 0; | +
| + | e2 = Abs(gappedSeq); | +
| + | k = 0; | +
| + | while (k<e2) | +
| + | { | +
| + | while ((s2N[s2]!=gappedSeq[k])&&(k<e2)) | +
| + | { | +
| + | gappedSeqN * ("---"); | +
| + | k=k+1; | +
| + | } | +
| + | if (k<e2) | +
| + | { | +
| + | gappedSeqN * s1N[frameShift+s2*3][frameShift+s2*3+2]; | +
| + | s2 = s2+1; | +
| + | k=k+1; | +
| + | } | +
| + | } | +
| + | gappedSeqN * 0; | +
| + | + |
| + | if (refSeq2 && seqCounter == unalSequenceCount-1) | +
| + | { | +
| + | fscanf (fName, "Raw", soFar); | +
| + | fprintf (fName, CLEAR_FILE,">",seqName,"\n",gappedSeq,"\n",soFar); | +
| + | fscanf (fNameC, "Raw", soFar); | +
| + | fprintf (fNameC,CLEAR_FILE,">",seqName,"\n",gappedSeqN,"\n",soFar); | +
| + | + |
| + | } | +
| + | else | +
| + | { | +
| + | fprintf (fName,"\n>",seqName,"\n",gappedSeq); | +
| + | fprintf (fNameC,"\n>",seqName,"\n",gappedSeqN); | +
| + | } | +
| + | } | +
| + | } | +
| + | + |
| + | if (Abs(skipSeqs)) | +
| + | { | +
| + | fName = fName+".bad"; | +
| + | for (seqCounter = 1; seqCounter < unalSequenceCount; seqCounter = seqCounter+1) | +
| + | { | +
| + | if (skipSeqs[seqCounter]) | +
| + | { | +
| + | seqName=sequenceNames[seqCounter]; | +
| + | fprintf (fName,">",seqName,"\n",UnalignedSeqs[seqCounter],"\n"); | +
| + | } | +
| + | } | +
| + | } | +
| + | + |
| + | function checkFramePosition (pos1, pos2, fr1, fr2) | +
| + | { | +
| + | fSpan = pos2-pos1; | +
| + | + |
| + | if (fSpan>1) /* first followed by second*/ | +
| + | { | +
| + | if (fSpan < fMin) | +
| + | { | +
| + | fMin = fSpan; | +
| + | frame1 = fr1; | +
| + | frame2 = fr2; | +
| + | fStart = pos1+1; | +
| + | fEnd = pos2; | +
| + | } | +
| + | } | +
| + | return 0; | +
| + | } | +
| + | + |
+ You can clone with + HTTPS, SSH, or Subversion. + + + +
+ + + + Clone in Desktop + + + + + + + Download ZIP + +![]()
Cannot retrieve contributors at this time
+| + | if (Rows (modelMatrixList) == 0) | +
| + | { | +
| + | modelMatrixList = | +
| + | { | +
| + | {"Equal Input", "EIAA.mdl", "19"} | +
| + | {"Dayhoff","Dayhoff.mdl","0"} | +
| + | {"Dayhoff+F","Dayhoff_F.mdl","19"} | +
| + | {"JTT", "Jones.mdl", "0"} | +
| + | {"JTT+F", "Jones_F.mdl", "19"} | +
| + | {"WAG", "WAG.mdl", "0"} | +
| + | {"WAG+F", "WAG_F.mdl", "19"} | +
| + | {"rtREV", "rtREV.mdl", "0"} | +
| + | {"rtREV+F", "rtREV_F.mdl", "19"} | +
| + | {"mtMAM", "mtMAM.mdl", "0"} | +
| + | {"mtMAM+F", "mtMAM_F.mdl", "19"} | +
| + | {"mtREV 24", "mtREV_24.mdl", "0"} | +
| + | {"mtREV 24+F", "mtREV_24_F.mdl", "19"} | +
| + | {"HIV within", "HIVwithin.mdl", "0"} | +
| + | {"HIV within+F", "HIVwithin+F.mdl", "19"} | +
| + | {"HIV between", "HIVbetween.mdl", "0"} | +
| + | {"HIV between+F", "HIVbetween+F.mdl", "19"} | +
| + | {"REV-1 step", "reducedREV.mdl", "19"} | +
| + | {"REV", "mtREV.mdl", "19"} | +
| + | }; | +
| + | } | +
| + | + |
| + | /*___________________________________________________________________________________________________________*/ | +
| + | + |
| + | function runAModel (modelID, fileName, xtraP, midx) | +
| + | { | +
| + | ExecuteCommands ("#include \"TemplateModels/"+fileName+"\";"); | +
| + | Tree givenTree = treeString; | +
| + | LikelihoodFunction lf = (filteredData,givenTree); | +
| + | + |
| + | GetString (lf_info, lf, -1); | +
| + | locals = lf_info["Local Independent"]; | +
| + | + |
| + | if (Columns (branchLengthStash)) | +
| + | { | +
| + | USE_LAST_RESULTS = 1; | +
| + | for (_iv = 0; _iv < Columns (locals); _iv = _iv+1) | +
| + | { | +
| + | ExecuteCommands (locals[_iv] + "=1;\n"); | +
| + | } | +
| + | currentBL = BranchLength (givenTree,0); | +
| + | currentBN = BranchName (givenTree,-1); | +
| + | for (_iv = 0; _iv < Columns (currentBN); _iv = _iv+1) | +
| + | { | +
| + | ExecuteCommands ("givenTree."+currentBN[_iv]+".t="+branchLengthStash[_iv]/currentBL+";"); | +
| + | + |
| + | } | +
| + | } | +
| + | else | +
| + | { | +
| + | for (_iv = 0; _iv < Columns (locals); _iv = _iv+1) | +
| + | { | +
| + | ExecuteCommands (locals[_iv] + "=0.1;\n"); | +
| + | } | +
| + | USE_LAST_RESULTS = 1; | +
| + | } | +
| + | + |
| + | Optimize (res,lf); | +
| + | + |
| + | fprintf (stdout, "| ", modelID); | +
| + | for (k=0; k<maxModelWidth-Abs(modelID)-1; k=k+1) | +
| + | { | +
| + | fprintf (stdout, " "); | +
| + | } | +
| + | + |
| + | params = res[1][1]+xtraP; | +
| + | AIC = 2(-res[1][0]+params); | +
| + | + |
| + | if (filteredData.sites-params>1) | +
| + | { | +
| + | cAIC = 2(-res[1][0]+params*(filteredData.sites/(filteredData.sites-params-1))); | +
| + | } | +
| + | else | +
| + | { | +
| + | cAIC = 0; | +
| + | } | +
| + | + |
| + | branchLengths = BranchLength (givenTree,-1); | +
| + | TL = 0; | +
| + | for (k=Rows(branchLengths)*Columns(branchLengths)-1; k>=0; k=k-1) | +
| + | { | +
| + | TL = TL + branchLengths[k]; | +
| + | } | +
| + | + |
| + | fprintf (stdout, "| ", Format (res[1][0],14,3), " | ", Format (params,5,0), " | ", | +
| + | Format (AIC, 9,3), " | ",); | +
| + | + |
| + | if (cAIC > 0) | +
| + | { | +
| + | fprintf (stdout, Format (cAIC,11,3), " | "); | +
| + | } | +
| + | else | +
| + | { | +
| + | fprintf (stdout, " N/A | "); | +
| + | } | +
| + | + |
| + | fprintf (stdout, Format (TL,11,3), " |\n", sepString); | +
| + | + |
| + | resultMatrix[midx][0] = res[1][0]; | +
| + | resultMatrix[midx][1] = params; | +
| + | resultMatrix[midx][2] = AIC; | +
| + | resultMatrix[midx][3] = cAIC; | +
| + | resultMatrix[midx][4] = TL; | +
| + | + |
| + | if (AIC < bestAIC) | +
| + | { | +
| + | bestAIC = AIC; | +
| + | bestAICidx = midx; | +
| + | branchLengthStash = BranchLength (givenTree,-1); | +
| + | } | +
| + | + |
| + | if (cAIC > 0) | +
| + | { | +
| + | if (bestCAIC > cAIC) | +
| + | { | +
| + | bestCAIC = cAIC; | +
| + | bestCAICidx = midx; | +
| + | + |
| + | } | +
| + | } | +
| + | + |
| + | return 0; | +
| + | } | +
| + | + |
| + | + |
| + | /*___________________________________________________________________________________________________________*/ | +
| + | + |
| + | + |
| + | + |
| + | maxModelWidth = 7; | +
| + | skipCodeSelectionStep = 0; | +
| + | + |
| + | ChoiceList (doREV, "Include REV?", 1, SKIP_NONE, "Yes", "Include REV and reduced REV models. CAUTION: these models take a long time to fit.", | +
| + | "No", "Only use empirical models"); | +
| + | + |
| + | if (doREV < 0) | +
| + | { | +
| + | return 0; | +
| + | } | +
| + | + |
| + | if (doREV == 0) | +
| + | { | +
| + | #include "TemplateModels/chooseGeneticCode.def"; | +
| + | skipCodeSelectionStep = 1; | +
| + | } | +
| + | + |
| + | modelCount = Rows (modelMatrixList) - 2*doREV; | +
| + | + |
| + | for (k=0; k<modelCount; k=k+1) | +
| + | { | +
| + | maxModelWidth = Max(maxModelWidth,Abs (modelMatrixList[k][0])+2); | +
| + | } | +
| + | + |
| + | sepString = ""; | +
| + | capString = ""; | +
| + | sepString * 256; | +
| + | sepString * "+"; | +
| + | + |
| + | capString * 256; | +
| + | capString * "| Model"; | +
| + | + |
| + | for (k=0; k<maxModelWidth; k=k+1) | +
| + | { | +
| + | sepString * "-"; | +
| + | } | +
| + | + |
| + | for (k=0; k<maxModelWidth-6; k=k+1) | +
| + | { | +
| + | capString * " "; | +
| + | } | +
| + | + |
| + | capString * "| Log Likelihood | #prms | AIC Score | c-AIC Score | Tree Length |\n"; | +
| + | sepString * "+----------------+-------+-----------+-------------+-------------+\n"; | +
| + | sepString * 0; | +
| + | capString * 0; | +
| + | + |
| + | branchLengthStash = 0; | +
| + | + |
| + | SKIP_MODEL_PARAMETER_LIST = 0; | +
| + | + |
| + | #include "TemplateModels/modelParameters2.mdl"; | +
| + | if (modelType == 1) | +
| + | { | +
| + | #include "TemplateModels/defineGamma.mdl"; | +
| + | } | +
| + | + |
| + | if (modelType == 2) | +
| + | { | +
| + | #include "TemplateModels/defineHM.mdl"; | +
| + | } | +
| + | SKIP_MODEL_PARAMETER_LIST = 1; | +
| + | + |
| + | SetDialogPrompt ("Please load an amino-acid data file:"); | +
| + | + |
| + | DataSet ds = ReadDataFile (PROMPT_FOR_FILE); | +
| + | DataSetFilter filteredData = CreateFilter (ds,1); | +
| + | + |
| + | fprintf (stdout,"\nRunning aminoacid model comparisons on ", LAST_FILE_PATH, "\n\nThe alignment has ",ds.species, " sequences and ", ds.sites, " sites\n"); | +
| + | + |
| + | _DO_TREE_REBALANCE_ = 1; | +
| + | + |
| + | #include "queryTree.bf"; | +
| + | + |
| + | resultMatrix = {modelCount, 5}; | +
| + | + |
| + | fprintf (stdout, "\n",sepString,capString,sepString); | +
| + | + |
| + | bestAIC = 1e100; | +
| + | bestCAIC = 1e100; | +
| + | bestAICidx = 0; | +
| + | bestCAICidx = -1; | +
| + | + |
| + | for (mid=0; mid<modelCount; mid=mid+1) | +
| + | { | +
| + | runAModel (modelMatrixList[mid][0], modelMatrixList[mid][1], 0+modelMatrixList[mid][2], mid); | +
| + | } | +
| + | + |
| + | fprintf (stdout, "\n\nBest AIC model:\n\t", modelMatrixList[bestAICidx][0], " with the score of ", bestAIC); | +
| + | + |
| + | if (bestCAICidx>=0) | +
| + | { | +
| + | fprintf (stdout, "\n\nBest c-AIC model:\n\t", modelMatrixList[bestCAICidx][0], " with the score of ", bestCAIC); | +
| + | } | +
| + | + |
| + | labelMatrix = {{"Log-likelihood","Parameters","AIC","c-AIC","Total tree length",""}}; | +
| + | + |
| + | aaString = "Model"; | +
| + | + |
| + | for (fC = 0; fC < modelCount; fC = fC+1) | +
| + | { | +
| + | aaString = aaString + ";" + modelMatrixList[fC][0]; | +
| + | } | +
| + | + |
| + | USE_LAST_RESULTS = 0; | +
| + | + |
| + | labelMatrix[5] = aaString; | +
| + | skipCodeSelectionStep = 0; | +
| + | OpenWindow (CHARTWINDOW,{{"Model Fits"} | +
| + | {"labelMatrix"}, | +
| + | {"resultMatrix"}, | +
| + | {"Bar Chart"}, | +
| + | {"Index"}, | +
| + | {"c-AIC"}, | +
| + | {"Model Index"}, | +
| + | {""}, | +
| + | {"AIC"} | +
| + | }, | +
| + | "SCREEN_WIDTH-60;SCREEN_HEIGHT-60;30;30"); | +
+ You can clone with + HTTPS, SSH, or Subversion. + + + +
+ + + + Clone in Desktop + + + + + + + Download ZIP + +![]()
Cannot retrieve contributors at this time
+| + | NICETY_LEVEL = 3; | +
| + | + |
| + | #include "TemplateModels/chooseGeneticCode.def"; | +
| + | #include "simpleBootstrap.bf"; | +
| + | + |
| + | SetDialogPrompt ("Please specify a codon data file:"); | +
| + | + |
| + | COUNT_GAPS_IN_FREQUENCIES = 0; | +
| + | VERBOSITY_LEVEL = 1; | +
| + | + |
| + | DataSet ds = ReadDataFile (PROMPT_FOR_FILE); | +
| + | DataSetFilter filteredData = CreateFilter (ds,3,"","",GeneticCodeExclusions); | +
| + | + |
| + | fprintf (stdout,"\n______________READ THE FOLLOWING DATA______________\n",ds); | +
| + | + |
| + | SelectTemplateModel(filteredData); | +
| + | + |
| + | _DO_TREE_REBALANCE_ = 1; | +
| + | #include "queryTree.bf"; | +
| + | + |
| + | if (modelType) | +
| + | { | +
| + | ChoiceList (branchLengths, "Branch Lengths", 1, SKIP_NONE, | +
| + | "Estimate", "Estimate branch lengths by ML", | +
| + | "Proportional to input tree", "Branch lengths are proportional to those in input tree"); | +
| + | + |
| + | if (branchLengths < 0) | +
| + | { | +
| + | return; | +
| + | } | +
| + | + |
| + | if (branchLengths == 1) | +
| + | { | +
| + | global treeScaler = 1; | +
| + | ReplicateConstraint ("this1.?.?:=treeScaler*this2.?.?__", givenTree, givenTree); | +
| + | } | +
| + | } | +
| + | + |
| + | LikelihoodFunction lf = (filteredData,givenTree); | +
| + | + |
| + | Optimize (res,lf); | +
| + | + |
| + | fprintf (stdout, "\n______________RESULTS______________\n",lf); | +
| + | + |
| + | /* compute syn and non-syn stencils for current genetic code */ | +
| + | + |
| + | #include "categoryEcho.bf"; | +
| + | + |
| + | GetString (sendMeBack,lf,-1); | +
| + | sendMeBack["LogL"] = res[1][0]; | +
| + | sendMeBack["NP"] = res[1][1]; | +
| + | + |
| + | return sendMeBack; | +
+ You can clone with + HTTPS, SSH, or Subversion. + + + +
+ + + + Clone in Desktop + + + + + + + Download ZIP + +![]()
Cannot retrieve contributors at this time
+| + | SetDialogPrompt ("Please specify a di-nucleotide (e.g. stem RNA) file:"); | +
| + | + |
| + | DataSet ds = ReadDataFile (PROMPT_FOR_FILE); | +
| + | DataSetFilter filteredData = CreateFilter (ds,2); | +
| + | + |
| + | fprintf (stdout,"\n______________READ THE FOLLOWING DATA______________\n",ds); | +
| + | + |
| + | SelectTemplateModel(filteredData); | +
| + | + |
| + | _DO_TREE_REBALANCE_ = 1; | +
| + | + |
| + | #include "queryTree.bf"; | +
| + | + |
| + | LikelihoodFunction lf = (filteredData,givenTree); | +
| + | + |
| + | timer = Time(0); | +
| + | + |
| + | USE_ADAPTIVE_VARIABLE_STEP = 0; | +
| + | + |
| + | Optimize (res,lf); | +
| + | + |
| + | timer = Time(0)-timer; | +
| + | + |
| + | + |
| + | fprintf (stdout, "\n______________RESULTS______________\nTime taken = ", timer, " seconds\nAIC Score = ", | +
| + | 2(res[1][1]-res[1][0]),"\n",lf); | +
| + | + |
| + | #include "categoryEcho.bf"; | +
+ You can clone with + HTTPS, SSH, or Subversion. + + + +
+ + + + Clone in Desktop + + + + + + + Download ZIP + +![]()
Cannot retrieve contributors at this time
+| + | _Genetic_Code = 0; | +
| + | + |
| + | ExecuteAFile("simpleBootstrap.bf"); | +
| + | + |
| + | SetDialogPrompt ("Please specify a nucleotide or amino-acid data file:"); | +
| + | + |
| + | DataSet ds = ReadDataFile (PROMPT_FOR_FILE); | +
| + | DataSetFilter filteredData = CreateFilter (ds,1); | +
| + | + |
| + | fprintf (stdout,"\n______________READ THE FOLLOWING DATA______________\n",ds); | +
| + | + |
| + | SelectTemplateModel(filteredData); | +
| + | _DO_TREE_REBALANCE_ = 1; | +
| + | + |
| + | ExecuteAFile ("queryTree.bf"); | +
| + | + |
| + | LikelihoodFunction lf = (filteredData,givenTree); | +
| + | timer = Time(0); | +
| + | Optimize (res,lf); | +
| + | timer = Time(0)-timer; | +
| + | + |
| + | fprintf (stdout, "\n______________RESULTS______________\nTime taken = ", timer, " seconds\nAIC Score = ", | +
| + | 2(res[1][1]-res[1][0]),"\n",lf); | +
| + | + |
| + | ExecuteAFile ("categoryEcho.bf"); | +
| + | GetString (lfInfo, lf, -1); | +
| + | + |
| + | return {"Log(L)": res[1][0], "DF": res[1][1], "Tree": Format (givenTree,1,1)} | +
+ You can clone with + HTTPS, SSH, or Subversion. + + + +
+ + + + Clone in Desktop + + + + + + + Download ZIP + +![]()
Cannot retrieve contributors at this time
+| + | /* 1. include a file to define the genetic code | +
| + | Note the use of base directory and path forming variables to make this analysis | +
| + | independent of directory placement | +
| + | */ | +
| + | + |
| + | incFileName = HYPHY_LIB_DIRECTORY+"TemplateBatchFiles"+DIRECTORY_SEPARATOR+"TemplateModels"+DIRECTORY_SEPARATOR+"chooseGeneticCode.def"; | +
| + | ExecuteCommands ("#include \""+incFileName+"\";"); | +
| + | + |
| + | /* 2. load a codon partition */ | +
| + | + |
| + | SetDialogPrompt ("Please locate a coding alignment:"); | +
| + | DataSet ds = ReadDataFile (PROMPT_FOR_FILE); | +
| + | DataSetFilter filteredData = CreateFilter (ds,3,"","",GeneticCodeExclusions); | +
| + | coding_path = LAST_FILE_PATH; | +
| + | + |
| + | fprintf (stdout, "\nLoaded a ", filteredData.species, " sequence alignment with ", filteredData.sites, " codons from\n",coding_path,"\n"); | +
| + | + |
| + | /* 3. include a file to prompt for a tree */ | +
| + | + |
| + | incFileName = HYPHY_LIB_DIRECTORY+"TemplateBatchFiles"+DIRECTORY_SEPARATOR+"queryTree.bf"; | +
| + | ExecuteCommands ("#include \""+incFileName+"\";"); | +
| + | + |
| + | /* 4. Compute nucleotide counts by position for the F3x4 estimator */ | +
| + | + |
| + | COUNT_GAPS_IN_FREQUENCIES = 0; | +
| + | HarvestFrequencies (baseFreqs,filteredData,3,1,1); | +
| + | COUNT_GAPS_IN_FREQUENCIES = 1; | +
| + | + |
| + | fprintf (stdout, "\nBase composition:\n\tA: ", Format (baseFreqs[0][0],10,5),",",Format (baseFreqs[0][1],10,5),",",Format (baseFreqs[0][2],10,5), | +
| + | "\n\tC: ", Format (baseFreqs[1][0],10,5),",",Format (baseFreqs[1][1],10,5),",",Format (baseFreqs[1][2],10,5), | +
| + | "\n\tG: ", Format (baseFreqs[2][0],10,5),",",Format (baseFreqs[2][1],10,5),",",Format (baseFreqs[2][2],10,5), | +
| + | "\n\tT: ", Format (baseFreqs[3][0],10,5),",",Format (baseFreqs[3][1],10,5),",",Format (baseFreqs[3][2],10,5), "\n"); | +
| + | + |
| + | + |
| + | + |
| + | /* 6. define the GY94 rate matrix; for now each branch will have it's own | +
| + | dS and dN, we will constrain them later */ | +
| + | + |
| + | global kappa_inv = 1; | +
| + | + |
| + | ModelMatrixDimension = 64; | +
| + | for (h = 0; h<64; h=h+1) | +
| + | { | +
| + | if (_Genetic_Code[h]==10) /* stop codon */ | +
| + | { | +
| + | ModelMatrixDimension = ModelMatrixDimension-1; | +
| + | } | +
| + | } | +
| + | + |
| + | GY_Matrix = {ModelMatrixDimension,ModelMatrixDimension}; | +
| + | hshift = 0; | +
| + | for (h=0; h<64; h=h+1) | +
| + | { | +
| + | if (_Genetic_Code[h]==10) | +
| + | { | +
| + | hshift = hshift+1; | +
| + | } | +
| + | else | +
| + | { | +
| + | vshift = hshift; | +
| + | for (v = h+1; v<64; v=v+1) | +
| + | { | +
| + | diff = v-h; | +
| + | if (_Genetic_Code[v]==10) | +
| + | { | +
| + | vshift = vshift+1; | +
| + | } | +
| + | else | +
| + | { | +
| + | if ((h$4==v$4)||((diff%4==0)&&(h$16==v$16))||(diff%16==0)) /* one step */ | +
| + | { | +
| + | if (h$4==v$4) | +
| + | { | +
| + | transition = v%4; | +
| + | transition2= h%4; | +
| + | } | +
| + | else | +
| + | { | +
| + | if(diff%16==0) | +
| + | { | +
| + | transition = v$16; | +
| + | transition2= h$16; | +
| + | } | +
| + | else | +
| + | { | +
| + | transition = v%16$4; | +
| + | transition2= h%16$4; | +
| + | } | +
| + | } | +
| + | if (_Genetic_Code[0][h]==_Genetic_Code[0][v]) /* synonymous */ | +
| + | { | +
| + | if (Abs(transition-transition2)%2) /* transversion */ | +
| + | { | +
| + | GY_Matrix[h-hshift][v-vshift] := kappa_inv*synRate; | +
| + | GY_Matrix[v-vshift][h-hshift] := kappa_inv*synRate; | +
| + | } | +
| + | else | +
| + | { | +
| + | GY_Matrix[h-hshift][v-vshift] := synRate; | +
| + | GY_Matrix[v-vshift][h-hshift] := synRate; | +
| + | } | +
| + | } | +
| + | else | +
| + | { | +
| + | if (Abs(transition-transition2)%2) /* transversion */ | +
| + | { | +
| + | GY_Matrix[h-hshift][v-vshift] := kappa_inv*nonSynRate; | +
| + | GY_Matrix[v-vshift][h-hshift] := kappa_inv*nonSynRate; | +
| + | } | +
| + | else | +
| + | { | +
| + | GY_Matrix[h-hshift][v-vshift] := nonSynRate; | +
| + | GY_Matrix[v-vshift][h-hshift] := nonSynRate; | +
| + | } | +
| + | } | +
| + | } | +
| + | } | +
| + | } | +
| + | } | +
| + | } | +
| + | + |
| + | /*8. build codon frequencies (use the F3x4 estimator) */ | +
| + | + |
| + | PIStop = 1.0; | +
| + | codonFreqs = {ModelMatrixDimension,1}; | +
| + | hshift = 0; | +
| + | + |
| + | for (h=0; h<64; h=h+1) | +
| + | { | +
| + | first = h$16; | +
| + | second = h%16$4; | +
| + | third = h%4; | +
| + | if (_Genetic_Code[h]==10) | +
| + | { | +
| + | hshift = hshift+1; | +
| + | PIStop = PIStop-baseFreqs[first][0]*baseFreqs[second][1]*baseFreqs[third][2]; | +
| + | continue; | +
| + | } | +
| + | codonFreqs[h-hshift]=baseFreqs[first][0]*baseFreqs[second][1]*baseFreqs[third][2]; | +
| + | } | +
| + | + |
| + | codonFreqs = codonFreqs*(1.0/PIStop); | +
| + | + |
| + | /*9. define the codon model */ | +
| + | + |
| + | Model GY_Model = (GY_Matrix,codonFreqs,1); | +
| + | + |
| + | /*10. Define the tree and pick the foreground branch, displaying a tree window to facilitate selection; | +
| + | the latter step is executed for 2 of 3 model choices */ | +
| + | + |
| + | Tree givenTree1 = treeString; | +
| + | Tree givenTree2 = treeString; | +
| + | Tree givenTree3 = treeString; | +
| + | Tree givenTree4 = treeString; | +
| + | + |
| + | USE_LAST_RESULTS = 0; | +
| + | OPTIMIZATION_METHOD = 4; | +
| + | + |
| + | /* Approximate kappa and branch lengths using an HKY85 fit */ | +
| + | + |
| + | HKY85_Matrix = {{*,t*kappa_inv,t,t*kappa_inv} | +
| + | {t*kappa_inv,*,kappa_inv*t,t} | +
| + | {t,t*kappa_inv,*,kappa_inv*t} | +
| + | {t*kappa_inv,t,kappa_inv*t,*}}; | +
| + | + |
| + | HarvestFrequencies (nucFreqs,ds,1,1,1); | +
| + | Model HKY85_Model = (HKY85_Matrix,nucFreqs,1); | +
| + | + |
| + | Tree nucTree = treeString; | +
| + | DataSetFilter nucData = CreateFilter (ds,1); | +
| + | + |
| + | fprintf (stdout, "Obtaining nucleotide branch lengths and kappa to be used as starting values...\n"); | +
| + | LikelihoodFunction nuc_lf = (nucData,nucTree); | +
| + | Optimize (nuc_mle,nuc_lf); | +
| + | fprintf (stdout, "\n", Format (nucTree,1,1), "\nkappa=", Format (1/kappa_inv,8,3), "\n"); | +
| + | + |
| + | USE_LAST_RESULTS = 1; | +
| + | + |
| + | mxTreeSpec = {5,1}; | +
| + | mxTreeSpec [0] = "nucTree"; | +
| + | mxTreeSpec [1] = "8240"; | +
| + | mxTreeSpec [2] = "10,40,-10,-175,1"; | +
| + | mxTreeSpec [3] = ""; | +
| + | mxTreeSpec [4] = ""; | +
| + | OpenWindow (TREEWINDOW, mxTreeSpec,"(SCREEN_WIDTH-50)/2;(SCREEN_HEIGHT-50)/2;30+(SCREEN_WIDTH-30)/2;45"); | +
| + | + |
| + | leafNodes = TipCount (givenTree); | +
| + | internalNodes = BranchCount(givenTree); | +
| + | + |
| + | choiceMatrix = {internalNodes+leafNodes,2}; | +
| + | for (bc=0; bc<internalNodes; bc=bc+1) | +
| + | { | +
| + | choiceMatrix[bc][0] = BranchName(givenTree,bc); | +
| + | choiceMatrix[bc][1] = "Internal Branch Rooting " + givenTree[bc]; | +
| + | } | +
| + | for (bc=0; bc<leafNodes; bc=bc+1) | +
| + | { | +
| + | choiceMatrix[bc+internalNodes][0] = TipName(givenTree,bc); | +
| + | choiceMatrix[bc+internalNodes][1] = "Leaf node " + choiceMatrix[bc+internalNodes][0]; | +
| + | } | +
| + | + |
| + | ChoiceList (stOption,"Choose the foreground branch",0,NO_SKIP,choiceMatrix); | +
| + | + |
| + | if (stOption[0] < 0) | +
| + | { | +
| + | return -1; | +
| + | } | +
| + | + |
| + | fprintf (stdout, "\n\n", Columns (stOption)," foreground branch(es) set to: ", "\n"); | +
| + | for (bc = 0; bc < Columns (stOption); bc = bc + 1) | +
| + | { | +
| + | fprintf (stdout, choiceMatrix[stOption[bc]][0], "\n"); | +
| + | } | +
| + | OpenWindow (CLOSEWINDOW, "Tree nucTree"); | +
| + | + |
| + | /* 15. Constrain dS and dN in the tree to based upon different models */ | +
| + | + |
| + | global omega_1 = 0.25; | +
| + | global omega_2 = 0.5; | +
| + | global omega_3 = 1.5; | +
| + | + |
| + | omega_1 :< 1; | +
| + | omega_2 :< 1; | +
| + | omega_3 :> 1; | +
| + | + |
| + | ClearConstraints (givenTree1); | +
| + | ClearConstraints (givenTree2); | +
| + | ClearConstraints (givenTree3); | +
| + | ClearConstraints (givenTree4); | +
| + | + |
| + | /* 16. define and optimize the likelihood function */ | +
| + | + |
| + | bNames = BranchName (givenTree,-1); | +
| + | nucBL = BranchLength (nucTree,-1); | +
| + | + |
| + | for (bc=0; bc<Columns(bNames)-1; bc=bc+1) | +
| + | { | +
| + | ExecuteCommands ("givenTree1."+bNames[bc]+".synRate =nucTree."+bNames[bc]+".t;"); | +
| + | ExecuteCommands ("givenTree1."+bNames[bc]+".nonSynRate =nucTree."+bNames[bc]+".t;"); | +
| + | ExecuteCommands ("givenTree2."+bNames[bc]+".synRate =nucTree."+bNames[bc]+".t;"); | +
| + | ExecuteCommands ("givenTree2."+bNames[bc]+".nonSynRate =nucTree."+bNames[bc]+".t;"); | +
| + | ExecuteCommands ("givenTree3."+bNames[bc]+".synRate =nucTree."+bNames[bc]+".t;"); | +
| + | ExecuteCommands ("givenTree3."+bNames[bc]+".nonSynRate =nucTree."+bNames[bc]+".t;"); | +
| + | ExecuteCommands ("givenTree4."+bNames[bc]+".synRate =nucTree."+bNames[bc]+".t;"); | +
| + | ExecuteCommands ("givenTree4."+bNames[bc]+".nonSynRate =nucTree."+bNames[bc]+".t;"); | +
| + | } | +
| + | + |
| + | codBL = BranchLength (givenTree1,-1); | +
| + | + |
| + | for (bc=0; bc<Columns(bNames)-1; bc=bc+1) | +
| + | { | +
| + | if (nucBL[bc]>0) | +
| + | { | +
| + | scalingFactor = nucBL[bc]/codBL[bc]; | +
| + | ExecuteCommands ("givenTree1."+bNames[bc]+".synRate=nucTree."+bNames[bc]+".t*"+scalingFactor+";"); | +
| + | ExecuteCommands ("givenTree1."+bNames[bc]+".nonSynRate=nucTree."+bNames[bc]+".t*"+scalingFactor+";"); | +
| + | ExecuteCommands ("givenTree2."+bNames[bc]+".synRate=nucTree."+bNames[bc]+".t*"+scalingFactor+";"); | +
| + | ExecuteCommands ("givenTree2."+bNames[bc]+".nonSynRate=nucTree."+bNames[bc]+".t*"+scalingFactor+";"); | +
| + | ExecuteCommands ("givenTree3."+bNames[bc]+".synRate=nucTree."+bNames[bc]+".t*"+scalingFactor+";"); | +
| + | ExecuteCommands ("givenTree3."+bNames[bc]+".nonSynRate=nucTree."+bNames[bc]+".t*"+scalingFactor+";"); | +
| + | ExecuteCommands ("givenTree4."+bNames[bc]+".synRate =nucTree."+bNames[bc]+".t;"); | +
| + | ExecuteCommands ("givenTree4."+bNames[bc]+".nonSynRate =nucTree."+bNames[bc]+".t;"); | +
| + | } | +
| + | } | +
| + | + |
| + | for (bc = 0; bc < Columns (stOption); bc = bc + 1) | +
| + | { | +
| + | bName = choiceMatrix[stOption[bc]][0]; | +
| + | ExecuteCommands ("givenTree1."+bName+".nonSynRate:=omega_1*givenTree1."+bName+".synRate"); | +
| + | ExecuteCommands ("givenTree2."+bName+".nonSynRate:=omega_2*givenTree2."+bName+".synRate"); | +
| + | ExecuteCommands ("givenTree3."+bName+".nonSynRate:=givenTree3."+bName+".synRate"); | +
| + | ExecuteCommands ("givenTree4."+bName+".nonSynRate:=omega_3*givenTree4."+bName+".synRate"); | +
| + | } | +
| + | + |
| + | + |
| + | OPTIMIZATION_PRECISION = 0.001; | +
| + | + |
| + | global P_1 = 1/4; | +
| + | global P_2 = 1/3; | +
| + | global P_3 = 1/2; | +
| + | P_1 :< 1; | +
| + | P_2 :< 1; | +
| + | P_3 :< 1; | +
| + | + |
| + | + |
| + | fprintf (stdout, "\nFitting the model with selection...\n"); | +
| + | + |
| + | LikelihoodFunction lf = (filteredData, givenTree1, filteredData, givenTree2,filteredData, givenTree3,filteredData, givenTree4, | +
| + | "Log(P_1*SITE_LIKELIHOOD[0]+(1-P_1)*P_2*SITE_LIKELIHOOD[1]+(1-P_1)(1-P_2)*P_3*SITE_LIKELIHOOD[2]+(1-P_1)(1-P_2)(1-P_3)*SITE_LIKELIHOOD[3])"); | +
| + | + |
| + | + |
| + | Optimize (mles,lf); | +
| + | fprintf (stdout, lf); | +
| + | + |
| + | + |