Hi Sergei, I am currently using the SimulateData function to simulate codon data from a predefined root sequence. The problem is that the sequences returned are too long by a factor 3, and the 'extra' sequence appears to be junk. I suspect this is a problem with my defintion of the characters matrix, and how the sequence storage matrix is created. I have pasted some test code below that simulates both nucleotides and codons from the same root. the section where i think the problem lies is commented. thanks ./w
Code:function CodonFrequencies ( nucFreqMatrix, mSize )
{
PIStop = 1.0;
result = { mSize, 1 };
hshift = 0;
for ( h = 0; h < 64; h = h + 1 ) {
first = h$16;
second = h%16$4;
third = h%4;
if ( _Genetic_Code[ h ] == 10 ) {
hshift = hshift + 1;
PIStop = PIStop - nucFreqMatrix[first][0]*nucFreqMatrix[second][1]*nucFreqMatrix[third][2];
}
else {
result[ h-hshift ][0] = nucFreqMatrix[first][0]*nucFreqMatrix[second][1]*nucFreqMatrix[third][2];
}
}
return result*( 1.0/PIStop );
}
function SetWDistribution ( omegacats )
{
if ( omegacats == 2 ) {
global P = 0.5;
P :< 1;
global W = 0.1;
W :< 1;
categFreqMatrix = { { P, 1-P } };
categRateMatrix = { { W, 1 } };
category c = ( 2, categFreqMatrix , MEAN, ,categRateMatrix, 0, 1e25 );
}
else {
global P1 = 1/3;
P1 :< 1;
global P2 = 1/2;
P2 :< 1;
global W1 = 0.1;
W1 :< 1;
global W2 = 2.0;
W2 :> 1;
categFreqMatrix = { { P1, (1-P1)*P2, (1-P1)*(1-P2) } };
categRateMatrix = { { W1, 1, W2 } };
category c = ( 3, categFreqMatrix , MEAN, ,categRateMatrix, 0, 1e25 );
}
return 0;
}
function PopulateModelMatrix (ModelMatrixName&, EFV )
{
ModelMatrixName = {ModelMatrixDimension,ModelMatrixDimension};
hshift = 0;
for (h=0; h<64; h=h+1) {
if (_Genetic_Code[h]==10) {
hshift = hshift+1;
continue;
}
vshift = hshift;
for (v = h+1; v<64; v=v+1) {
diff = v-h;
if (_Genetic_Code[v]==10) {
vshift = vshift+1;
continue;
}
nucPosInCodon = 2;
if ((h$4==v$4)||((diff%4==0)&&(h$16==v$16))||(diff%16==0)) {
if (h$4==v$4) {
transition = v%4;
transition2= h%4;
}
else {
if(diff%16==0) {
transition = v$16;
transition2= h$16;
nucPosInCodon = 0;
}
else {
transition = v%16$4;
transition2= h%16$4;
nucPosInCodon = 1;
}
}
if (_Genetic_Code[0][h]==_Genetic_Code[0][v]) {
ModelMatrixName[h-hshift][v-vshift] := t*EFV__[transition__][nucPosInCodon__];
ModelMatrixName[v-vshift][h-hshift] := t*EFV__[transition2__][nucPosInCodon__];
}
else {
ModelMatrixName[h-hshift][v-vshift] := c*t*EFV__[transition__][nucPosInCodon__];
ModelMatrixName[v-vshift][h-hshift] := c*t*EFV__[transition2__][nucPosInCodon__];
}
}
}
}
return 0;
}
/* ============= END OF FUNCTIONS ============== */
/* some constants */
INSTALL_DIR = "/usr/local/HYPHY_Source/";
DATA_FILE_PRINT_FORMAT = 4;
DEFAULT_CODE_TABLE = 0;
INPUTFILE = "10taxa.nex";
TREEFILE = "10taxa.tre";
/* set the genetic code */
skipCodeSelectionStep = 1;
incFileName = INSTALL_DIR + "TemplateBatchFiles" + DIRECTORY_SEPARATOR + "TemplateModels" + DIRECTORY_SEPARATOR + "chooseGeneticCode.def";
ExecuteCommands ( "#include \""+incFileName+"\";");
dummy = ApplyGeneticCodeTable ( DEFAULT_CODE_TABLE );
ModelMatrixDimension = 64;
for ( h = 0; h < 64; h = h + 1 ) {
if ( _Genetic_Code[ h ] == 10 ) {
ModelMatrixDimension = ModelMatrixDimension - 1;
}
}
/* read data, tree and calculate frequencies */
DataSet ds = ReadDataFile( INPUTFILE );
fscanf ( TREEFILE, "String", TREE );
DataSetFilter FullData = CreateFilter ( ds, 3, "", "", GeneticCodeExclusions );
DataSetFilter nucData = CreateFilter (ds,1);
HarvestFrequencies ( nucleotidesByPos, FullData, 3, 1, 1 );
HarvestFrequencies ( nucFreqs, nucData, 1, 1, 1 );
vectorOfFrequencies = CodonFrequencies ( nucleotidesByPos, ModelMatrixDimension );
/* estimate and store nucleotide branch lengths */
global kappa_inv = 1;
HKY85_Matrix = { { *, t*kappa_inv, t, t*kappa_inv }
{ t*kappa_inv, *, kappa_inv*t, t }
{ t, t*kappa_inv, *, kappa_inv*t }
{ t*kappa_inv, t, kappa_inv*t, * } };
Model HKY85_Model = ( HKY85_Matrix, nucFreqs, 1 );
Tree nucTree = TREE;
LikelihoodFunction lf = ( nucData, nucTree );
Optimize ( res, lf );
tvec = { 2*FullData.species-2, 1 };
branchNames = BranchName ( nucTree, -1 );
for ( k = 0; k < 2*FullData.species-2; k = k + 1 ) {
thisbranch = branchNames [ k ];
ExecuteCommands ( "tvec [ k ] = nucTree." + thisbranch + ".t;" );
}
/* simulate nucleotide data using stringBuffer as root */
charactersUniversalCode = { {"A","C","G","T" }
{"1","","",""} };
stringBuffer = "CCGATTTCTCCGATTGACACGGCTCCTGTCAAGCTTAAAACGGGAATGGACGGCTCCAAGGTCAAACAGTGGGCGCTAACACATGAAAAGATGAAGGCGTTGACAGCGATTTGCGAAGAGGTGGAGAAAGAGGGGCATATTACTAAAATTGGGCTACAGAACCCATATATTACTCCAGTCTTCGCAATAAAGAAGAAAAATGCTACCCAATGGCGCAAACTTGTCGACTTCCGAGAACTTCACAAACGCCAGCAAGTCTTCTGGGAGTTACAACTGGGGCTTCCACACCCTGCGGGTTTGAATAAGAGGAAGTCGGTCACGGAACTTGATGTGAGAGACGCGTATCTCCTAGTACCCTTCCATGAAGATTTTAGAAAGTATACCGCTTTTACAATTCCGAGTATTAATAATGGAACACCGAAGATTCGTTACCATTTTAACGTACTACCTCCAGGCTGGAAAGGTAGTCCGGCAATATTACCATCTTCAATGACGAATATTCTGGAGCCGTTTCGAGCAAAAAATCCAGAGATCGTCATATATCAATATATAGACGATTTATTTGTTGGTAGTGACTTGGAGATAGAGCAACAAAGAGCCAAGATAGAGGAACTCCGCGCACATCTACTCAAATCTGACTTTACCACTCCTGATAAAAAACACCAAAAAGAGCCTAGTTTTCTCTGGATCGGTTACGAAATACACCCGGATAAGTGGACAGTGCAGCCCAAACAGTTCCTAGATAAATACAGTTTGACAGTAAAGGACATTCAGAAAGTGGTAGGAATACTAAAGTGGGCGTCTCAATTATATGCCGGAATAAGAGTGCGACAGCTATTTAAACTTCTTAAAGAAGCCAATGCGCTGACCGAT";
DataSet sim = Simulate ( nucTree, nucFreqs, charactersUniversalCode, stringBuffer, 0 );
DataSetFilter simFilter = CreateFilter ( sim, 3, "", "", GeneticCodeExclusions );
fprintf ( stdout, simFilter, "\n" );
/* set up codon model and simulate codon data using stringBuffer as root */
modelMatrix = 0;
SetWDistribution ( 2 );
global R := 1.68923; /* estimated for data in previous optimization, alternatively optimize before simulate to use estimated omega and scaling parameters for simulation */
PopulateModelMatrix ( "modelMatrix", nucleotidesByPos );
Model codonModel = ( modelMatrix, vectorOfFrequencies, 0 );
Tree codonTree = TREE;
for ( k = 0; k < 2*FullData.species-2; k = k + 1 ) {
thisbranch = branchNames [ k ];
ExecuteCommands ( "codonTree." + thisbranch + ".t :=R*" + tvec [ k ] + ";" );
}
LikelihoodFunction lf = ( FullData, codonTree );
charactersUniversalCode = { {"A","C","G","T" }
{"3",GeneticCodeExclusions,"",""} };
DataSet sim = Simulate ( codonTree, vectorOfFrequencies, charactersUniversalCode, stringBuffer, 0 );
DataSetFilter simFilter = CreateFilter ( sim, 3, "", "", GeneticCodeExclusions );
/* the dataset filter created is too large by a factor 3. I think the problem is with the second line of the charactersUniversalCode matrix, yet as I understand it, this can't be 1 since then i'm not simulating codons. the dataset created has believable simulated data up to the length of the stringBuffer (873 nucs), everything after that appears to be junk. Maybe the matrix storing the simulated sequences is too large, yet the simulation procedure is working to the correct length?*/
/* the following fixes the problem, but is not ideal since i want to later concatenate datasets. currently i have to simulate datasets, create filters, write these to file, read from file as new datasets and then concatenate the new datasets.*/
/* ExecuteCommands ( "DataSetFilter simFilter = CreateFilter ( sim, 3, \"0-" + ( (aminoAcidSites*3) - 1) + "\" , \"\", GeneticCodeExclusions );" ); */
fprintf ( stdout, simFilter, "\n" );