HyPhy message board
http://www.hyphy.org/cgi-bin/hyphy_forums/YaBB.pl
HYPHY Package >> HyPhy feedback >> SimulateDataSet
http://www.hyphy.org/cgi-bin/hyphy_forums/YaBB.pl?num=1176826805

Message started by wayne on Apr 17th, 2007 at 9:20am

Title: SimulateDataSet
Post by wayne on Apr 17th, 2007 at 9:20am
Hi Sergei, I am using SimulateDataSet to do exactly that. Problem is that I think I am misunderstanding how the function works. As I understand it the data filter in the likelihood function would provide a starting point (root) from which sites evolve along a tree according to the given model. However I seem to be getting some strange results from a script that reads data, sets a lf, optimizes the lf (base data), simulates a dataset with category variables constrained to chosen values, and finally optimizes the lf of the new simulated dataset. Essentially for a simple positive selection model i get:

*** Optimized parameter values: base data ***
W1 < 1: 0.035   P:      0.956
W  = 1: 1       P:      0.000
W2 > 1: 1.587   P:      0.044

*** Simulated parameter values (Constrained to these values) ***
W1 < 1: 0.100   P:      0.333
W  = 1: 1       P:      0.333
W2 > 1: 5.000   P:      0.333

*** Optimized parameter values: simulated data ***
W1 < 1: 0.041   P:      0.959
W  = 1: 1       P:      0.000
W2 > 1: 1.952   P:      0.041

So it seems there is an effect of the constrained values on the simulated dataset, yet the simulated data still "remembers" the base state. I want to use this to test how well another model is working. Is there something I'm missing as to how the SimulateDataSet function works? thanks ./w

Title: Re: SimulateDataSet
Post by Sergei on Apr 17th, 2007 at 10:27pm
Dear Wayne,

SimulateDataSet should behave as you expected; could you send me the code which generated the outcome you see?

Cheers,
Sergei

Title: Re: SimulateDataSet
Post by wayne on Apr 18th, 2007 at 1:07am
attached. cheers ./w


Code (]
_Genetic_Code = {
           
           {14,/*AAA*/ 13,/*AAC*/ 14,/*AAG*/  13,/*AAT*/
             7, /*ACA*/ 7, /*ACC*/ 7, /*ACG*/  7, /*ACT*/
            19, /*AGA*/ 5, /*AGC*/19, /*AGG*/  5, /*AGT*/
             2, /*ATA*/ 2, /*ATC*/      3, /*ATG*/  2, /*ATT*/
            12,/*CAA*/ 11,/*CAC*/ 12,/*CAG*/  11,/*CAT*/
             6, /*CCA*/ 6, /*CCC*/ 6, /*CCG*/  6, /*CCT*/
            19,/*CGA*/ 19,/*CGC*/ 19,/*CGG*/  19,/*CGT*/
             1, /*CTA*/ 1, /*CTG*/ 1, /*CTC*/  1, /*CTT*/
            16,/*GAA*/ 15,/*GAC*/ 16,/*GAG*/  15,/*GAT*/
             8, /*GCA*/ 8, /*GCC*/ 8, /*GCG*/  8, /*GCT*/
            20,/*GGA*/ 20,/*GGC*/ 20,/*GGG*/  20,/*GGT*/
             4, /*GTA*/ 4, /*GTC*/ 4, /*GTG*/  4, /*GTT*/
            10,/*TAA*/  9, /*TAC*/10,/*TAG*/   9, /*TAT*/
             5, /*TCA*/ 5, /*TCC*/ 5, /*TCG*/  5, /*TCT*/
            10,/*TGA*/ 17,/*TGC*/ 18,/*TGG*/  17,/*TGT*/
             1, /*TTA*/ 0, /*TTC*/ 1, /*TTG*/  0  /*TTT*/ }
     };

GeneticCodeExclusions = "TAA,TAG,TGA";

function CodonFrequencies ( nucFreqMatrix, mSize )
{
     PIStop = 1.0;
     result = { mSize, 1 };
     hshift = 0;
     for ( h = 0; h < 64; h = h + 1 )      {
           first = h$16;
           second = h%16$4;
           third = h%4;
           if ( _Genetic_Code[ h ):

== 10 ) {      /* stop codon */
                 hshift = hshift + 1;
                 PIStop = PIStop - nucFreqMatrix[first][0]*nucFreqMatrix[second][1]*nucFreqMatrix[third][2];
           }
           else {
                 result[ h-hshift ][0] = nucFreqMatrix[first][0]*nucFreqMatrix[second][1]*nucFreqMatrix[third][2];
           }
     }
     return result*( 1.0/PIStop );
}

function PopulateModelMatrix (ModelMatrixName& )
{
     ModelMatrixName = {ModelMatrixDimension,ModelMatrixDimension};
     hshift = 0;
     
     for (h=0; h<64; h=h+1)
     {
           if (_Genetic_Code[h]==10)
           {
                 hshift = hshift+1;
                 continue;
           }
           vshift = hshift;
           for (v = h+1; v<64; v=v+1)
           {
                 diff = v-h;
                 if (_Genetic_Code[v]==10)
                 {
                       vshift = vshift+1;
                       continue;
                 }
                 if ((h$4==v$4)||((diff%4==0)&&(h$16==v$16))||(diff%16==0))
                 {
                       if (_Genetic_Code[0][h]==_Genetic_Code[0][v])
                       {
                             ModelMatrixName[h-hshift][v-vshift] := t;
                             ModelMatrixName[v-vshift][h-hshift] := t;
                       }
                       else
                       {
                             ModelMatrixName[h-hshift][v-vshift] := c*t;
                             ModelMatrixName[v-vshift][h-hshift] := c*t;
                       }
                 }
           }
     }
     return 0;
}      

function SetWDistribution (resp)
{
     global P1 = 1/3;
     global P2 = 1/2;
     P1:<1;
     P2:<1;
     global W_1 = 0.1;
     W_1:<1;
     global W_2 = 5;
     W_2:>1;
     categFreqMatrix = [P1,(1-P1)*P2, (1-P1)*(1-P2)];
     categRateMatrix = [W_1,1,W_2];
     category c = (3, categFreqMatrix , MEAN, ,categRateMatrix, 0, 1e25);
     return 0;
}

function SetWDistributionSims ( resp )
{
     global P1 := 1/3;
     global P2 := 1/2;
     global W_1 := 0.1;
     global W_2 := 5;
     categFreqMatrix = [P1,(1-P1)*P2, (1-P1)*(1-P2)];
     categRateMatrix = [W_1,1,W_2];
     category c = (3, categFreqMatrix , MEAN, ,categRateMatrix, 0, 1e25);
     return 0;
}

/*======================================================================================================================*/

LIKELIHOOD_FUNCTION_OUTPUT = 3;

ModelMatrixDimension = 64;
for ( h = 0; h < 64; h = h + 1 ) {
     if ( _Genetic_Code[ h ] == 10 ) {
           ModelMatrixDimension = ModelMatrixDimension - 1;
     }
}

DataSet ds = ReadDataFile ( "30taxa.nex" );
fscanf ( "30taxa.tre", "String", TREE );
DataSetFilter FullData = CreateFilter ( ds, 3, "", "", GeneticCodeExclusions );
HarvestFrequencies ( nucleotidesByPos, FullData, 3, 1, 1 );
vectorOfFrequencies = CodonFrequencies ( nucleotidesByPos, ModelMatrixDimension );

/* Infer parameters from data */
SetWDistribution ( 1 );
modelMatrix = 0;
PopulateModelMatrix ( "modelMatrix" );
Model model = ( modelMatrix, vectorOfFrequencies, 1 );
Tree tree = TREE;
LikelihoodFunction lf = ( FullData, tree );
Optimize ( res, lf );
fprintf ( stdout, "\n*** Optimized parameter values: base data ***\n" );
fprintf ( stdout, "W1 < 1:\t", Format (W_1,0,3), "\tP:\t", Format (P1,0,3), "\n" );
fprintf ( stdout, "W  = 1:\t1\tP:\t", Format( (1-P1)*P2, 0, 3), "\n" );
fprintf ( stdout, "W2 > 1:\t", Format (W_2,0,3), "\tP:\t", Format((1-P1)*(1-P2),0,3), "\n" );
fprintf ( stdout, "\n" );

/* Simulate data with category variable constraints */
SetWDistributionSims ( 1 );
modelMatrix = 0;      
PopulateModelMatrix ( "modelMatrix" );
Model model = ( modelMatrix, vectorOfFrequencies, 1 );
Tree tree = TREE;
LikelihoodFunction lf = ( FullData, tree );
DataSet simData = SimulateDataSet ( lf, GeneticCodeExclusions, simRates, catVarNames );
fprintf ( stdout, "\n*** Simulated parameter values ***\n" );
fprintf ( stdout, "W1 < 1:\t", Format (W_1,0,3), "\tP:\t", Format (P1,0,3), "\n" );
fprintf ( stdout, "W  = 1:\t1\tP:\t", Format( (1-P1)*P2, 0, 3), "\n" );
fprintf ( stdout, "W2 > 1:\t", Format (W_2,0,3), "\tP:\t", Format((1-P1)*(1-P2),0,3), "\n" );
fprintf ( stdout, "\n" );

/* Infer parameters from the simulated data */
DataSetFilter simDataFilter = CreateFilter ( simData, 3, "", "", GeneticCodeExclusions );
HarvestFrequencies ( nucleotidesByPos, simDataFilter, 3, 1, 1 );
vectorOfFrequencies = CodonFrequencies ( nucleotidesByPos, ModelMatrixDimension );
SetWDistribution ( 1 );
modelMatrix = 0;      
PopulateModelMatrix ( "modelMatrix" );
Model model = ( modelMatrix, vectorOfFrequencies, 1 );
Tree tree = TREE;
LikelihoodFunction lf = ( FullData, tree );
Optimize ( res, lf );
fprintf ( stdout, "\n*** Optimized parameter values: simulated data ***\n" );
fprintf ( stdout, "W1 < 1:\t", Format (W_1,0,3), "\tP:\t", Format (P1,0,3), "\n" );
fprintf ( stdout, "W  = 1:\t1\tP:\t", Format( (1-P1)*P2, 0, 3), "\n" );
fprintf ( stdout, "W2 > 1:\t", Format (W_2,0,3), "\tP:\t", Format((1-P1)*(1-P2),0,3), "\n" );
fprintf ( stdout, "\n" );

Title: Re: SimulateDataSet
Post by Sergei on Apr 18th, 2007 at 8:42am
Dear Wayne,

In the last LikelihoodFunction construct (on line 163), you are still using the original data (hence the original estimates)

Replace LikelihoodFunction lf = ( FullData, tree ); with LikelihoodFunction lf = ( simDataFilter, tree );

Cheers,
Sergei

Title: Re: SimulateDataSet
Post by wayne on Apr 18th, 2007 at 11:21am
oops, i'm a little embarrassed. thanks for the heads up and apologies for wasting your time. cheers ./w

HyPhy message board » Powered by YaBB 2.5.2!
YaBB Forum Software © 2000-2024. All Rights Reserved.