Hi Sergei, I am trying to simulate data under a simple partitioned model. the test scenario has four partitions (each of 50 aa), where the same model and tree is set for each partition (model0 - model3, tree0 -tree3). the script returns a dataset, yet when looking at the category classes under which sites were simulated i get the ff:
site # omega_class
38 0.0250114
39 0.0250114
40 1
41 0.0250114
42 0.0250114
43 0.0250114
44 0.0250114
45 0.0250114
46 0.0250114
47 1
48 0.0250114
49 0.0250114
50 0
51 0
52 0
53 0
which up until site 50 makes sense, since the catvar is (0.025, 1) @ frequencies (0.96, 0.04). However, sites 50-149 return 0's consistent with partitions 1 & 2. Partition 3 again returns correct values. any suggestions/comments would be appreciated. thanks ./w
batchfile requires a 600 bp sequence file in order for manual partitioning to work.
Code:_Genetic_Code = {
{14,/*AAA*/ 13,/*AAC*/ 14,/*AAG*/ 13,/*AAT*/
7, /*ACA*/ 7, /*ACC*/ 7, /*ACG*/ 7, /*ACT*/
19, /*AGA*/ 5, /*AGC*/19, /*AGG*/ 5, /*AGT*/
2, /*ATA*/ 2, /*ATC*/ 3, /*ATG*/ 2, /*ATT*/
12,/*CAA*/ 11,/*CAC*/ 12,/*CAG*/ 11,/*CAT*/
6, /*CCA*/ 6, /*CCC*/ 6, /*CCG*/ 6, /*CCT*/
19,/*CGA*/ 19,/*CGC*/ 19,/*CGG*/ 19,/*CGT*/
1, /*CTA*/ 1, /*CTG*/ 1, /*CTC*/ 1, /*CTT*/
16,/*GAA*/ 15,/*GAC*/ 16,/*GAG*/ 15,/*GAT*/
8, /*GCA*/ 8, /*GCC*/ 8, /*GCG*/ 8, /*GCT*/
20,/*GGA*/ 20,/*GGC*/ 20,/*GGG*/ 20,/*GGT*/
4, /*GTA*/ 4, /*GTC*/ 4, /*GTG*/ 4, /*GTT*/
10,/*TAA*/ 9, /*TAC*/10,/*TAG*/ 9, /*TAT*/
5, /*TCA*/ 5, /*TCC*/ 5, /*TCG*/ 5, /*TCT*/
10,/*TGA*/ 17,/*TGC*/ 18,/*TGG*/ 17,/*TGT*/
1, /*TTA*/ 0, /*TTC*/ 1, /*TTG*/ 0 /*TTT*/ }
};
GeneticCodeExclusions = "TAA,TAG,TGA";
function CodonFrequencies ( nucFreqMatrix, mSize ) /* Modified from Kosakovsky Pond */
{
PIStop = 1.0;
result = { mSize, 1 };
hshift = 0;
for ( h = 0; h < 64; h = h + 1 ) {
first = h$16;
second = h%16$4;
third = h%4;
if ( _Genetic_Code[ h ] == 10 ) { /* stop codon */
hshift = hshift + 1;
PIStop = PIStop - nucFreqMatrix[first][0]*nucFreqMatrix[second][1]*nucFreqMatrix[third][2];
}
else {
result[ h-hshift ][0] = nucFreqMatrix[first][0]*nucFreqMatrix[second][1]*nucFreqMatrix[third][2];
}
}
return result*( 1.0/PIStop );
}
function PopulateModelMatrix (ModelMatrixName& )
{
ModelMatrixName = {ModelMatrixDimension,ModelMatrixDimension};
hshift = 0;
for (h=0; h<64; h=h+1)
{
if (_Genetic_Code[h]==10)
{
hshift = hshift+1;
continue;
}
vshift = hshift;
for (v = h+1; v<64; v=v+1)
{
diff = v-h;
if (_Genetic_Code[v]==10)
{
vshift = vshift+1;
continue;
}
if ((h$4==v$4)||((diff%4==0)&&(h$16==v$16))||(diff%16==0))
{
if (h$4==v$4)
{
transition = v%4;
transition2= h%4;
}
else
{
if(diff%16==0)
{
transition = v$16;
transition2= h$16;
}
else
{
transition = v%16$4;
transition2= h%16$4;
}
}
if (_Genetic_Code[0][h]==_Genetic_Code[0][v])
{
ModelMatrixName[h-hshift][v-vshift] := t;
ModelMatrixName[v-vshift][h-hshift] := t;
}
else
{
ModelMatrixName[h-hshift][v-vshift] := c*t;
ModelMatrixName[v-vshift][h-hshift] := c*t;
}
}
}
}
return 0;
}
/*============================================================================================================================================*/
LIKELIHOOD_FUNCTION_OUTPUT = 3;
PARTITION = 1;
ModelMatrixDimension = 64;
for ( h = 0; h < 64; h = h + 1 ) {
if ( _Genetic_Code[ h ] == 10 ) {
ModelMatrixDimension = ModelMatrixDimension - 1;
}
}
/* get the data and calculate various frequencies */
DataSet ds = ReadDataFile ( "10taxa_600bp.nex" );
fscanf ( "10taxa_600bp.tre", "String", TREE );
DataSetFilter FullData = CreateFilter ( ds, 3, "", "", GeneticCodeExclusions );
HarvestFrequencies ( nucleotidesByPos, FullData, 3, 1, 1 );
vectorOfFrequencies = CodonFrequencies ( nucleotidesByPos, ModelMatrixDimension );
/* set omega distribution */
global P = 0.5;
P:<1;
global W = 0.1;
W:<1;
categFreqMatrix = {{P,1-P}};
categRateMatrix = {{W,1}};
category c = (2, categFreqMatrix , MEAN, ,categRateMatrix, 0, 1e25);
modelMatrix = 0;
PopulateModelMatrix ( "modelMatrix" );
if ( PARTITION ) {
fprintf ( stdout, "Dataset partitioned\n" );
/* create four arb partitions */
DataSetFilter codon0 = CreateFilter ( ds, 3, "0-149", "", GeneticCodeExclusions );
DataSetFilter codon1 = CreateFilter ( ds, 3, "150-299", "", GeneticCodeExclusions );
DataSetFilter codon2 = CreateFilter ( ds, 3, "300-449", "", GeneticCodeExclusions );
DataSetFilter codon3 = CreateFilter ( ds, 3, "450-599", "", GeneticCodeExclusions );
stringBuffer = "";
stringBuffer * 128;
First = 1;
for ( i = 0; i < 4; i = i + 1 ) {
if ( i % 2 != 0 ) {
ExecuteCommands ( "Model model" + i + "= ( modelMatrix, vectorOfFrequencies, 1 );" );
}
else {
ExecuteCommands ( "Model model" + i + "= ( modelMatrix, vectorOfFrequencies, 1 );" );
}
ExecuteCommands ( "Tree tree" + i + " = TREE;" );
if ( First ) {
stringBuffer = "LikelihoodFunction lf = ( codon" + i + ", tree" + i;
First = 0;
}
else {
stringBuffer = stringBuffer + ", codon" + i + ", tree" + i;
}
}
stringBuffer = stringBuffer + ");";
ExecuteCommands ( stringBuffer );
fprintf ( stdout, stringBuffer, "\n" );
stringBuffer * 0;
}
else {
fprintf ( stdout, "Dataset unpartitioned\n" );
Model model = ( modelMatrix, vectorOfFrequencies, 1 );
Tree tree = TREE;
LikelihoodFunction lf = ( FullData, tree );
}
Optimize ( res, lf );
fprintf ( stdout, "W1 < 1\t\t", Format (W, 3, 3), "\t", P, "\n" );
fprintf ( stdout, "W = 1\t\t1\t", 1-P, "\n" );
fprintf ( stdout, lf, "\n" );
fprintf ( stdout, "\n" );
DataSet simData = SimulateDataSet ( lf, GeneticCodeExclusions, simRates, catVarNames );
fprintf ( stdout, "*** Sites by category class ***\n" );
fprintf ( stdout, "Category variable = ", catVarNames[0], "\n" );
for ( i = 0; i < FullData.sites; i = i + 1 ) {
fprintf ( stdout, i, " ", simRates [ i ], "\n" );
}
DataSetFilter simDataFilter = CreateFilter ( simData, 3, "", "", GeneticCodeExclusions );
DATAFILE_PRINT_FORMAT = 4;
fprintf ( "simData.nex", simDataFilter, "\n" );