Skip to content
  • Watch
  • Fork

/hyphy

HTTPS clone URL

SSH clone URL

Subversion checkout URL

You can clone with HTTPS, SSH, or Subversion.

Clone in Desktop Download ZIP
538 lines (451 sloc) 14.93 kb
/*
Script set-up: load helper libraries, read the unaligned sequence file and
open the cache database stored next to it (same path with a ".db" suffix).
The _openCacheDB / _TableExists / _ExecuteSQL helpers are presumably supplied
by Utility/DBTools.ibf -- confirm against that file.
*/
ExecuteAFile ("Utility/GrabBag.bf");
ExecuteAFile ("Utility/DBTools.ibf");

alignOptions = {};

SetDialogPrompt ("Sequence File:");
DataSet unal = ReadDataFile (PROMPT_FOR_FILE);

/* LAST_FILE_PATH is read right after the file prompt so both paths refer to the chosen input file */
BASE_FILE_PATH = LAST_FILE_PATH;
DB_FILE_PATH = LAST_FILE_PATH + ".db";
ANALYSIS_DB_ID = _openCacheDB (DB_FILE_PATH);

/*
Start from an explicitly empty result. The Abs(existingSettings) test that
follows this section must see "no settings" when the SETTINGS table does not
exist, rather than depending on the value of an unassigned (or stale) variable.
*/
existingSettings = {};
haveTable = _TableExists (ANALYSIS_DB_ID, "SETTINGS");
if (haveTable)
{
    existingSettings = _ExecuteSQL (ANALYSIS_DB_ID, "SELECT * FROM SETTINGS");
}
/*
PHASE 1: either resume a cached run or import the input file.

If the SETTINGS query returned at least one row, the genetic code, alignment
options and reference sequence are restored from it and the IDs of sequences
still at STAGE 0 are collected into toDoSequences.

Otherwise the SETTINGS and SEQUENCES tables are created, every input sequence
is cleaned up and inserted with STAGE 0, a reference sequence is chosen, the
genetic code is selected, and the run settings are stored.

Both branches leave unalSequenceCount, toDoSequences and
masterReferenceSequence for the code that follows.
*/
if (Abs(existingSettings))
{
/* Only the first SETTINGS row is consulted. */
existingSettings = existingSettings [0];
/* The stored genetic code text is executed as the right-hand side of an assignment. */
ExecuteCommands ("_Genetic_Code = " + existingSettings["GENETIC_CODE"]);
/* The stored OPTIONS text is executed after replacing "_hyphyAssociativeArray" with
"alignOptions"; presumably this re-creates alignOptions from its serialised form -- confirm. */
ExecuteCommands (existingSettings["OPTIONS"] ^ {{"_hyphyAssociativeArray","alignOptions"}});
masterReferenceSequence = existingSettings["REFERENCE"];
/* STAGE = 0 marks sequences that were imported but not processed further. */
dbSequences = _ExecuteSQL (ANALYSIS_DB_ID, "SELECT SEQUENCE_ID FROM SEQUENCES WHERE STAGE = 0");
unalSequenceCount = Abs(dbSequences);
toDoSequences = {unalSequenceCount,1};
for (k = 0; k < unalSequenceCount; k=k+1)
{
toDoSequences[k] = (dbSequences[k])["SEQUENCE_ID"];
}
/* The query result is no longer needed once the IDs have been copied. */
dbSequences = 0;
fprintf (stdout, "[PHASE 1] Reloaded ", unalSequenceCount, " unprocessed sequences\n");
}
else
{
/* Column name -> SQL type for each table created below. */
tableDefines = {};
tableDefines ["SETTINGS"] = {};
(tableDefines ["SETTINGS"])["RUN_DATE"] = "DATE";
(tableDefines ["SETTINGS"])["OPTIONS"] = "TEXT";
(tableDefines ["SETTINGS"])["REFERENCE"] = "TEXT";
(tableDefines ["SETTINGS"])["GENETIC_CODE"] = "TEXT";
/* NOTE(review): THRESHOLD is declared here but the SETTINGS record built below never sets it. */
(tableDefines ["SETTINGS"])["THRESHOLD"] = "REAL";
tableDefines ["SEQUENCES"] = {};
(tableDefines ["SEQUENCES"])["SEQUENCE_ID"] = "TEXT UNIQUE";
(tableDefines ["SEQUENCES"])["LENGTH"] = "INTEGER";
(tableDefines ["SEQUENCES"])["STAGE"] = "INTEGER";
/*
0 - initial import
1 - in frame without a fix
2 - one frame shift / fixed
3 - out-of-frame; not fixed / not aligned
*/
(tableDefines ["SEQUENCES"])["RAW"] = "TEXT";
(tableDefines ["SEQUENCES"])["ALIGNED_AA"] = "TEXT"; /* aligned aa. sequence */
(tableDefines ["SEQUENCES"])["ALIGNED"] = "TEXT"; /* aligned nucleotide sequence */
(tableDefines ["SEQUENCES"])["OFFSET"] = "INTEGER"; /* start offset w.r.t the reference sequence */
(tableDefines ["SEQUENCES"])["END_OFFSET"] = "INTEGER"; /* end offset w.r.t the reference sequence */
(tableDefines ["SEQUENCES"])["SCORE"] = "REAL";
(tableDefines ["SEQUENCES"])["FRAME"] = "INTEGER";
/* NOTE(review): the meaning of the last argument (0 vs 1) is defined by _CreateTableIfNeeded, which is not visible here. */
_CreateTableIfNeeded (ANALYSIS_DB_ID, "SETTINGS", tableDefines["SETTINGS"], 0);
_CreateTableIfNeeded (ANALYSIS_DB_ID, "SEQUENCES", tableDefines["SEQUENCES"], 1);
/* START ALIGNMENT SETTINGS */
/* NOTE(review): refSeq (and possibly doLongestSequence) are not assigned anywhere in this
file before being read below; presumably this library sets them -- confirm. */
LoadFunctionLibrary ("SeqAlignShared.ibf");
DataSetFilter filteredData = CreateFilter (unal,1);
GetInformation (UnalignedSeqs,filteredData);
/* preprocess sequences */
unalSequenceCount = Rows(UnalignedSeqs)*Columns(UnalignedSeqs);
GetString (sequenceNames, unal, -1);
/* Length and index of the longest cleaned sequence seen so far. */
longestSequence = 0;
longestSequenceIDX = 0;
/* One record is reused for every insert; all four fields are overwritten per sequence. */
seqRecord = {};
fprintf (stdout, "[PHASE 1] Initial Processing of ", unalSequenceCount, " sequences\n");
for (seqCounter = 0; seqCounter < unalSequenceCount; seqCounter = seqCounter+1)
{
/* Clean-up: drop every non-letter character, then leading and trailing runs of N. */
UnalignedSeqs[seqCounter] = UnalignedSeqs[seqCounter]^{{"[^a-zA-Z]",""}};
UnalignedSeqs[seqCounter] = UnalignedSeqs[seqCounter]^{{"^N+",""}};
UnalignedSeqs[seqCounter] = UnalignedSeqs[seqCounter]^{{"N+$",""}};
seqRecord ["SEQUENCE_ID"] = sequenceNames[seqCounter];
seqRecord ["LENGTH"] = Abs (UnalignedSeqs[seqCounter]);
seqRecord ["STAGE"] = 0;
seqRecord ["RAW"] = UnalignedSeqs[seqCounter];
/* NOTE(review): within this file doLongestSequence is assigned only after this loop
(from refSeq, further down), so unless the library loaded above defines it this
longest-sequence search never runs -- confirm the intended ordering. */
if (doLongestSequence)
{
/* When doLongestSequence is not 1, the last sequence is excluded from the search. */
if (doLongestSequence == 1 || seqCounter != unalSequenceCount-1)
{
if (Abs (UnalignedSeqs[seqCounter]) > longestSequence)
{
longestSequence = Abs (UnalignedSeqs[seqCounter]);
longestSequenceIDX = seqCounter;
}
}
}
_InsertRecord (ANALYSIS_DB_ID, "SEQUENCES", seqRecord);
/* The counter shown is the zero-based index of the sequence just processed. */
SetParameter (STATUS_BAR_STATUS_STRING, "Initial processing ("+seqCounter+"/"+unalSequenceCount+" done)",0);
}
/* Reference selection: the first sequence when refSeq is 0 ... */
if (refSeq == 0)
{
masterReferenceSequence = UnalignedSeqs[0];
}
/* ... overridden by the longest sequence when doLongestSequence is set. */
if (doLongestSequence)
{
fprintf (stdout, "\nSelected sequence ", sequenceNames[longestSequenceIDX], " as reference.");
masterReferenceSequence = UnalignedSeqs[longestSequenceIDX];
}
/* Include the genetic-code chooser; _Genetic_Code is read below, so presumably this file sets it -- confirm. */
incFileName = HYPHY_LIB_DIRECTORY+"TemplateBatchFiles"+DIRECTORY_SEPARATOR+"TemplateModels"+DIRECTORY_SEPARATOR+"chooseGeneticCode.def";
ExecuteCommands ("#include \""+incFileName+"\";");
/* NOTE(review): this assignment comes after both uses of doLongestSequence above. */
doLongestSequence = (refSeq==1);
/* Persist the run settings so that a later invocation can take the resume branch. */
aRecord = {};
aRecord["RUN_DATE"] = _ExecuteSQL(ANALYSIS_DB_ID,"SELECT DATE('NOW') AS CURRENT_DATE");
aRecord["RUN_DATE"] = ((aRecord["RUN_DATE"])[0])["CURRENT_DATE"];
aRecord["OPTIONS"] = "" + alignOptions;
aRecord["REFERENCE"] = masterReferenceSequence;
aRecord["GENETIC_CODE"] = "" + _Genetic_Code;
_InsertRecord (ANALYSIS_DB_ID, "SETTINGS", aRecord);
/* On a fresh import every sequence is still to be processed. */
toDoSequences = sequenceNames;
/* NOTE(review): UnalignedSequences is not referenced anywhere else in this file; the matrix
filled above is named UnalignedSeqs. Possibly a typo for releasing that matrix -- confirm
before changing, since later code still reads UnalignedSeqs. */
UnalignedSequences = 0;
}
/*
Global switches read further down:
  doRC         - upper bound of the strand loop in the reading-frame scan, so
                 with 1 both the sequence and its reverse complement are tried
  skipOutliers - consulted when poorly scoring alignments are filtered
*/
doRC = 1;
skipOutliers = 1;

/*
Build codonToAAMap: a three-letter codon string -> one-letter amino-acid code.
Codon index i in 0..63 is decoded as three base-4 digits over "ACGT", and
_Genetic_Code[i] selects the matching character of codeToAA.
*/
nucChars = "ACGT";
codeToAA = "FLIMVSPTAYXHQNKDECWRG";
codonToAAMap = {};

for (codonIndex = 0; codonIndex < 64; codonIndex = codonIndex + 1)
{
    firstBase  = nucChars[codonIndex$16];
    secondBase = nucChars[codonIndex%16$4];
    thirdBase  = nucChars[codonIndex%4];
    codon      = firstBase + secondBase + thirdBase;

    aaIndex    = _Genetic_Code[codonIndex];
    codonToAAMap[codon] = codeToAA[aaIndex];
}
/* determine reading frames */
/*
PHASE 2. For every sequence ID in toDoSequences the raw nucleotide string is
fetched from the SEQUENCES table and translated at offsets 0, 1 and 2, on the
given strand and (when doRC is 1) on its reverse complement. The translation
with the fewest internal stop codons ("X") wins.

Results, all keyed by the sequence counter:
  ReadingFrames    - winning offset (0..2)
  RC               - 1 when the winning translation is on the reverse complement
  ProteinSequences - winning translation
  AllTranslations  - every translation, indexed [offset][strand]
  StopPositions    - first/last stop position per strand and offset, row 3*strand+offset
frameCounter[offset][strand] counts how many sequences chose each combination.

Changes from the previous revision:
  - the reverse-complement assignment now ends with ';'
  - single quotes in the sequence ID are doubled before the ID is spliced into
    the SQL text, so an ID containing ' no longer breaks the query
*/
ProteinSequences = {};
AllTranslations = {};
ReadingFrames = {};
StopCodons = {};
StopPositions = {};
RC = {};
fprintf (stdout, "\n[PHASE 2] Detecting reading frames for each unprocessed sequence...\n");
frameCounter = {3,2};
stillHasStops = {};
aRecord = {};
for (seqCounter = 0; seqCounter < unalSequenceCount; seqCounter = seqCounter+1)
{
    /* SQL string literals escape ' by doubling it; "" + ... forces a string operand for the replace */
    _sqlSequenceID = ("" + toDoSequences[seqCounter]) ^ {{"'","''"}};
    rawSeq = (_ExecuteSQL(ANALYSIS_DB_ID,"SELECT RAW FROM SEQUENCES WHERE SEQUENCE_ID = '" + _sqlSequenceID + "'"));
    aSeq = (rawSeq[0])["RAW"];
    /* last start position from which a full codon (seqPos .. seqPos+2) still fits */
    seqLen = Abs(aSeq)-2;
    minStops = 1e20;
    tString = "";
    rFrame = 0;
    rrc = 0;
    allTran = {3,2};
    stopPosn = {6,2};
    for (rc = 0; rc<=doRC; rc = rc+1)
    {
        if (rc)
        {
            /* second pass: scan the opposite strand */
            aSeq = nucleotideReverseComplement (aSeq);
        }
        for (offset = 0; offset < 3; offset = offset+1)
        {
            /* string-buffer idiom: "* number" opens the buffer, "* string" appends, "* 0" closes it */
            translString = "";
            translString * (seqLen/3+1);
            for (seqPos = offset; seqPos < seqLen; seqPos = seqPos+3)
            {
                codon = aSeq[seqPos][seqPos+2];
                prot = codonToAAMap[codon];
                if (Abs(prot))
                {
                    translString * prot;
                }
                else
                {
                    /* codon absent from the table (e.g. it contains a character outside ACGT) */
                    translString * "?";
                }
            }
            translString * 0;
            /* a stop as the very last residue is not counted as an internal stop */
            translString = translString^{{"X$","?"}};
            /* stopPos holds the positions of every "X" match; its first entry is negative when there is none */
            stopPos = translString||"X";
            if (stopPos[0]>=0)
            {
                stopCount = Rows(stopPos)$2;
                stopPosn[3*rc+offset][0] = stopPos[0];
                stopPosn[3*rc+offset][1] = stopPos[stopCount*2-1];
            }
            else
            {
                stopCount = 0;
            }
            /* strict '<' keeps the earliest strand/offset on ties */
            if (stopCount<minStops)
            {
                minStops = stopCount;
                rFrame = offset;
                rrc = rc;
                tString = translString;
            }
            allTran[offset][rc] = translString;
        }
    }
    ReadingFrames [seqCounter] = rFrame;
    ProteinSequences [seqCounter] = tString;
    frameCounter [rFrame][rrc] = frameCounter[rFrame][rrc]+1;
    StopPositions [seqCounter] = stopPosn;
    AllTranslations [seqCounter] = allTran;
    RC [seqCounter] = rrc;
    SetParameter (STATUS_BAR_STATUS_STRING, "Reading frame analysis ("+seqCounter+"/"+unalSequenceCount+" done)",0);
}
_closeCacheDB (ANALYSIS_DB_ID);
/* The script ends here: none of the top-level statements after this return are executed. */
return 0;
/*
Frameshift resolution.

NOTE(review): this section follows an unconditional top-level "return 0;", so
as written it is not reached. In addition, nothing in this file ever adds an
entry to stillHasStops, and sequenceNames / UnalignedSeqs are only filled in by
the fresh-import branch of phase 1.

For each sequence listed in stillHasStops the stop positions of frames 0..2 are
compared pairwise (checkFramePosition) to find the shortest stop-free window
between the last stop of one frame and the first stop of another. If such a
window exists, every splice point inside it is tried: the prefix of one frame's
translation is joined to the suffix of the other, aligned against the reference
protein s1, and the best-scoring splice replaces the sequence's protein and
nucleotide strings. Otherwise the sequence is marked in skipSeqs.
*/
/* the translation of the first sequence is used as the reference protein */
s1 = ProteinSequences[0];
fprintf (stdout, "\nFound:\n\t", frameCounter[0], " sequences in reading frame 1\n\t",frameCounter[1], " sequences in reading frame 2\n\t",frameCounter[2], " sequences in reading frame 3\n\nThere were ", Abs(stillHasStops), " sequences with apparent frameshift/sequencing errors\n");
skipSeqs = {};
for (k=0; k<Abs(stillHasStops); k=k+1)
{
seqCounter = stillHasStops[k];
seqName = sequenceNames[seqCounter];
fprintf (stdout,"Sequence ", seqCounter+1, " (", seqName, ") seems to have");
stopPosn = StopPositions[seqCounter];
/* These globals are the search state that checkFramePosition updates in place. */
fStart = -1;
fEnd = -1;
fMin = 1e10;
frame1 = 0;
frame2 = 0;
/* Arguments: last stop of one frame, first stop of another frame, then the two frame indices. */
checkFramePosition (stopPosn[0][1],stopPosn[1][0],0,1);
checkFramePosition (stopPosn[1][1],stopPosn[0][0],1,0);
checkFramePosition (stopPosn[0][1],stopPosn[2][0],0,2);
checkFramePosition (stopPosn[2][1],stopPosn[0][0],2,0);
checkFramePosition (stopPosn[2][1],stopPosn[1][0],2,1);
checkFramePosition (stopPosn[1][1],stopPosn[2][0],1,2);
/* fStart stays at -1 when no frame pair produced a usable window. */
if (fStart>=0)
{
allTran = AllTranslations[seqCounter];
useq = UnalignedSeqs[seqCounter];
fprintf (stdout, " a shift from frame ", frame2+1, " to frame ", frame1+1, " between a.a. positions ", fStart, " and ", fEnd, ".");
/* Widen the reported window by one residue on each side, clamped to the translations. */
fStart2 = Max(fStart-1,0);
fEnd2 = Min(fEnd+1,Min(Abs(allTran[frame1]),Abs(allTran[frame2]))-1);
tempString = allTran[frame2];
fprintf (stdout, "\n\tRegion ", fStart2, "-", fEnd2, " in frame ", frame2+1, ":\n\t", tempString[fStart2][fEnd2]);
fprintf (stdout, "\n\t", useq[3*fStart2+frame2][3*fEnd2+frame2-1]);
tempString = allTran[frame1];
fprintf (stdout, "\n\tRegion ", fStart2, "-", fEnd2, " in frame ", frame1+1, ":\n\t", tempString[fStart2][fEnd2]);
fprintf (stdout, "\n\t", useq[3*fStart2+frame1][3*fEnd2+frame1-1]);
fprintf (stdout, "\n\t\tAttempting to resolve by alignment to reference. ");
f1s = allTran[frame1];
f2s = allTran[frame2];
f1l = Abs(f1s);
bestScore = -1e10;
bestSplice = -1;
/* Try every splice point in the window: frame2 residues up to k2, frame1 residues after it. */
for (k2=fStart; k2<fEnd; k2=k2+1)
{
s2 = f2s[0][k2]+f1s[k2+1][Abs(f1s)];
inStr = {{s1,s2}};
AlignSequences(aligned, inStr, alignOptions);
/* After the two index steps below, aligned is compared as a score. */
aligned = aligned[0];
aligned = aligned[0];
if (aligned > bestScore)
{
bestScore = aligned;
bestSplice = k2;
bestString = s2;
}
}
fprintf (stdout, "Best splice site appears to be at a.a. position ", bestSplice, "\n");
/* update best spliced string */
ProteinSequences[seqCounter] = bestString;
ReadingFrames[seqCounter] = 0;
/* Nucleotides are spliced at the matching codon boundary and a "---" codon is appended. */
UnalignedSeqs[seqCounter] = useq[frame2][frame2+3*bestSplice+2] + useq[frame1+3*bestSplice+3][Abs(useq)-1] + "---";
}
else
{
fprintf (stdout, " multiple frameshifts\n");
skipSeqs[seqCounter] = 1;
}
}
/*
Pairwise alignment of every non-skipped protein (index 1 and up) against the
reference protein s1.

NOTE(review): this section follows an unconditional top-level "return 0;", so
as written it is not reached.

Per sequence this stores:
  SeqAlignments[i]       - the first element of the AlignSequences result
  alignmentScores        - element [0] of that result divided by the length of element [1]
  startingPosition[i][*] - first and last non-gap column of element [1]
refInsertions[p] ends up holding the longest run of "-" found at position p
(after subtracting the gap columns already seen) across all sequences.
*/
SeqAlignments = {};
startingPosition = {unalSequenceCount,2};
refLength = Abs(ProteinSequences[0]);
refInsertions = {refLength,1};
fprintf (stdout,"\nPerforming pairwise alignment with reference sequences\n");
alignmentScores = {};
for (seqCounter = 1; seqCounter < unalSequenceCount; seqCounter = seqCounter+1)
{
if (skipSeqs[seqCounter] == 0)
{
s2 = ProteinSequences[seqCounter];
inStr = {{s1,s2}};
AlignSequences(aligned, inStr, alignOptions);
aligned = aligned[0];
SeqAlignments[seqCounter] = aligned;
/* Score normalised by alignment length; entries are appended in processing order. */
alignmentScores[Abs(alignmentScores)] = aligned[0]/Abs(aligned[1]);
/* From here on "aligned" is element [1], presumably the gapped string for s1 (the first input) -- confirm. */
aligned = aligned[1];
/* Locate the first non-gap character ... */
myStartingPosition = aligned$"[^-]";
/* ... and walk back from the end over trailing gaps. */
myEndingPosition = Abs (aligned)-1;
while (aligned[myEndingPosition]=="-")
{
myEndingPosition = myEndingPosition - 1;
}
myStartingPosition = myStartingPosition[0];
startingPosition[seqCounter][0] = myStartingPosition;
startingPosition[seqCounter][1] = myEndingPosition;
/* Keep only the span between the first and last non-gap characters. */
aligned = aligned[myStartingPosition][myEndingPosition];
/* Start/end positions of every run of "-" inside that span. */
refInsert = aligned||"-+";
if (refInsert[0]>0)
{
insCount = Rows (refInsert)/2;
/* offset = gap columns seen so far, used to convert alignment columns to ungapped positions */
offset = 0;
for (insN = 0; insN < insCount; insN = insN+1)
{
insPos = refInsert[insN*2];
insLength = refInsert[insN*2+1]-insPos+1;
insPos = insPos-offset;
/* keep the longest insertion recorded at this position */
if (refInsertions[insPos]<insLength)
{
refInsertions[insPos]=insLength;
}
offset = offset + insLength;
}
}
}
}
/*
Alignment-score statistics and construction of the fully gapped reference.

NOTE(review): this section follows an unconditional top-level "return 0;"
earlier in the script, so as written it is not reached.

1. The collected alignment scores are converted to a matrix and summarised
   with GatherDescriptiveStats.
2. lowerCuttoff is set to mean - 2 * standard deviation. The previous revision
   evaluated this expression and discarded the result, leaving lowerCuttoff
   unassigned although the outlier filter further down reads it. (A fixed
   cutoff of 0.25 was used at some earlier point.)
3. The reference protein s1 and the reference nucleotides are expanded into
   fullRefSeq / fullRefSeqN by emitting refInsertions[p] gap columns ("-" and
   "---" respectively) before reference position p.
*/
alignmentScoresM = avlToMatrix ("alignmentScores");
ExecuteAFile ("Utility/DescriptiveStatistics.bf");
distInfo = GatherDescriptiveStats (alignmentScoresM);
lowerCuttoff = distInfo["Mean"] - 2*distInfo["Std.Dev"];

/* produce a fully gapped reference sequence */
fprintf (stdout,"\nMerging pairwise alignments into a MSA\n");

/* string-buffer idiom: "* number" opens the buffer, "* string" appends, "* 0" closes it */
fullRefSeq = "";
fullRefSeq * refLength;
fullRefSeq * (s1[0]);

s1N = UnalignedSeqs[0];
fullRefSeqN = "";
fullRefSeqN * (3*refLength);
/* NOTE(review): the first codon is taken from positions 0..2 without the frameShift applied to later codons -- confirm */
fullRefSeqN * (s1N[0][2]);

frameShift = ReadingFrames[0];
for (seqCounter=1;seqCounter<refLength;seqCounter=seqCounter+1)
{
    /* gap columns required in front of this reference residue */
    gapCount = refInsertions[seqCounter];
    for (k=0; k<gapCount;k=k+1)
    {
        fullRefSeq*("-");
        fullRefSeqN*("---");
    }
    fullRefSeq * (s1[seqCounter]);
    fullRefSeqN * (s1N[frameShift+seqCounter*3][frameShift+seqCounter*3+2]);
}
fullRefSeq * 0;
fullRefSeqN * 0;
/* A second top-level return: the output-writing code after it is not executed. */
return 0;
/*
Output stage: write the merged alignment.

NOTE(review): this section follows an unconditional top-level "return 0;", so
as written it is not reached.

The gapped reference is written first: the protein version to a file chosen by
the user, the nucleotide version to the same path with ".nuc" appended. Every
non-skipped sequence is then projected onto the gapped reference and appended
to both files. Finally, if skipSeqs has any entries, the raw nucleotides of the
skipped sequences are written to the protein path with ".bad" appended.
*/
refLength = Abs(fullRefSeq);
SetDialogPrompt ("Save alignment to:");
seqName=sequenceNames[0];
fprintf (PROMPT_FOR_FILE,CLEAR_FILE,">",seqName,"\n",fullRefSeq);
/* remember the path that was just chosen; later writes go to it by name */
fName = LAST_FILE_PATH;
fNameC = fName+".nuc";
fprintf (fNameC,CLEAR_FILE,">",seqName,"\n",fullRefSeqN);
/* alCounter indexes alignmentScoresM, which has one entry per aligned (non-skipped) sequence */
alCounter = 0;
for (seqCounter = 1; seqCounter < unalSequenceCount; seqCounter = seqCounter+1)
{
if (skipSeqs[seqCounter] == 0)
{
/* NOTE(review): the filter only fires when skipOutliers is 0, while the script sets
skipOutliers = 1 near the top -- confirm the intended polarity. lowerCuttoff must have
been assigned before this point. */
if (skipOutliers == 0 && alignmentScoresM[alCounter] < lowerCuttoff)
{
seqName=sequenceNames[seqCounter];
fprintf (stdout, "Sequence ", seqName ," was skipped because of a poor alignment score.\n");
skipSeqs[seqCounter] = 1;
alCounter = alCounter + 1;
continue;
}
alCounter = alCounter + 1;
seqName=sequenceNames[seqCounter];
aligned = SeqAlignments[seqCounter];
/* aligned1 / aligned2: presumably the gapped reference and the gapped query of the pairwise alignment -- confirm */
aligned1 = aligned[1];
aligned2 = aligned[2];
s2 = startingPosition[seqCounter][0];
e2 = startingPosition[seqCounter][1];
gappedSeq = "";
gappedSeq * Abs(aligned2);
k=0;
/* Walk the fully gapped reference (index k) and the pairwise alignment (index s2) in
step; wherever the two reference strings disagree, a gap is emitted. */
while (k<refLength)
{
/* NOTE(review): this inner loop has no k<refLength bound of its own. */
while (fullRefSeq[k]!=aligned1[s2])
{
gappedSeq*("-");
k=k+1;
}
gappedSeq*(aligned2[s2]);
s2=s2+1;
k=k+1;
}
gappedSeq * 0;
/* Build the nucleotide version: three characters per protein column. */
gappedSeqN = "";
gappedSeqN * (3*Abs(aligned2));
frameShift = ReadingFrames[seqCounter];
s1N = UnalignedSeqs[seqCounter];
s2N = ProteinSequences[seqCounter];
s2 = startingPosition[seqCounter][0];
k = 0;
e2 = Abs(gappedSeq);
k = 0;
while (k<e2)
{
/* columns where the gapped protein differs from the next ungapped residue become "---" */
while ((s2N[s2]!=gappedSeq[k])&&(k<e2))
{
gappedSeqN * ("---");
k=k+1;
}
if (k<e2)
{
/* copy the codon that encodes residue s2, honouring the sequence's reading frame */
gappedSeqN * s1N[frameShift+s2*3][frameShift+s2*3+2];
s2 = s2+1;
k=k+1;
}
}
gappedSeqN * 0;
/* NOTE(review): refSeq2 is not assigned anywhere in the visible code. When it is set, the
last sequence is written at the top of each file: the current contents are read back,
the file is cleared, and the contents are re-appended after the new record. */
if (refSeq2 && seqCounter == unalSequenceCount-1)
{
fscanf (fName, "Raw", soFar);
fprintf (fName, CLEAR_FILE,">",seqName,"\n",gappedSeq,"\n",soFar);
fscanf (fNameC, "Raw", soFar);
fprintf (fNameC,CLEAR_FILE,">",seqName,"\n",gappedSeqN,"\n",soFar);
}
else
{
/* default: append the record to both files */
fprintf (fName,"\n>",seqName,"\n",gappedSeq);
fprintf (fNameC,"\n>",seqName,"\n",gappedSeqN);
}
}
}
/* Dump the sequences that were skipped to "<protein path>.bad". */
if (Abs(skipSeqs))
{
fName = fName+".bad";
for (seqCounter = 1; seqCounter < unalSequenceCount; seqCounter = seqCounter+1)
{
if (skipSeqs[seqCounter])
{
seqName=sequenceNames[seqCounter];
fprintf (fName,">",seqName,"\n",UnalignedSeqs[seqCounter],"\n");
}
}
}
/*
checkFramePosition: candidate test for a single frameshift window.

pos1     - position of the last stop codon in frame fr1
pos2     - position of the first stop codon in frame fr2
fr1, fr2 - the two frame indices being compared

The window length is pos2 - pos1. When it is greater than 1 and smaller than
the best length so far (global fMin), the globals fMin, frame1, frame2, fStart
and fEnd are overwritten with this candidate; fStart is the position right
after pos1 and fEnd is pos2. Otherwise nothing but fSpan changes.

Always returns 0.
*/
function checkFramePosition (pos1, pos2, fr1, fr2)
{
    fSpan = pos2-pos1;

    /* reject: the first frame's stops are not followed by a gap before the second's, or a shorter window is already known */
    if (fSpan <= 1 || fSpan >= fMin)
    {
        return 0;
    }

    fStart = pos1+1;
    fEnd   = pos2;
    frame1 = fr1;
    frame2 = fr2;
    fMin   = fSpan;
    return 0;
}
Jump to Line
Something went wrong with that request. Please try again.