HyPhy message board
http://www.hyphy.org/cgi-bin/hyphy_forums/YaBB.pl
HYPHY Package >> HyPhy feedback >> SimulateDataSet with partitions
http://www.hyphy.org/cgi-bin/hyphy_forums/YaBB.pl?num=1175777847

Message started by wayne on Apr 5th, 2007 at 5:57am

Title: SimulateDataSet with partitions
Post by wayne on Apr 5th, 2007 at 5:57am
Hi Sergei, I am trying to simulate data under a simple partitioned model. the test scenario has four partitions (each of 50 aa), where the same model and tree is set for each partition (model0 - model3, tree0 -tree3). the script returns a dataset, yet when looking at the category classes under which sites were simulated i get the ff:

site # omega_class

38 0.0250114
39 0.0250114
40 1
41 0.0250114
42 0.0250114
43 0.0250114
44 0.0250114
45 0.0250114
46 0.0250114
47 1
48 0.0250114
49 0.0250114
50 0
51 0
52 0
53 0

which up until site 50 makes sense, since the catvar is (0.025, 1) @ frequencies (0.96, 0.04). However, sites 50-149 return 0's consistent with partitions 1 & 2. Partition 3 again returns correct values. any suggestions/comments would be appreciated. thanks ./w

batchfile requires a 600 bp sequence file in order for manual partitioning to work.


Code (]_Genetic_Code = {
           
           {14,/*AAA*/ 13,/*AAC*/ 14,/*AAG*/  13,/*AAT*/
             7, /*ACA*/ 7, /*ACC*/ 7, /*ACG*/  7, /*ACT*/
            19, /*AGA*/ 5, /*AGC*/19, /*AGG*/  5, /*AGT*/
             2, /*ATA*/ 2, /*ATC*/      3, /*ATG*/  2, /*ATT*/
            12,/*CAA*/ 11,/*CAC*/ 12,/*CAG*/  11,/*CAT*/
             6, /*CCA*/ 6, /*CCC*/ 6, /*CCG*/  6, /*CCT*/
            19,/*CGA*/ 19,/*CGC*/ 19,/*CGG*/  19,/*CGT*/
             1, /*CTA*/ 1, /*CTG*/ 1, /*CTC*/  1, /*CTT*/
            16,/*GAA*/ 15,/*GAC*/ 16,/*GAG*/  15,/*GAT*/
             8, /*GCA*/ 8, /*GCC*/ 8, /*GCG*/  8, /*GCT*/
            20,/*GGA*/ 20,/*GGC*/ 20,/*GGG*/  20,/*GGT*/
             4, /*GTA*/ 4, /*GTC*/ 4, /*GTG*/  4, /*GTT*/
            10,/*TAA*/  9, /*TAC*/10,/*TAG*/   9, /*TAT*/
             5, /*TCA*/ 5, /*TCC*/ 5, /*TCG*/  5, /*TCT*/
            10,/*TGA*/ 17,/*TGC*/ 18,/*TGG*/  17,/*TGT*/
             1, /*TTA*/ 0, /*TTC*/ 1, /*TTG*/  0  /*TTT*/ }
     };

GeneticCodeExclusions = "TAA,TAG,TGA";

function CodonFrequencies ( nucFreqMatrix, mSize ) /* Modified from Kosakovsky Pond */
{
     PIStop = 1.0;
     result = { mSize, 1 };
     hshift = 0;
     for ( h = 0; h < 64; h = h + 1 )      {
           first = h$16;
           second = h%16$4;
           third = h%4;
           if ( _Genetic_Code[ h ):

== 10 ) {      /* stop codon */
                 hshift = hshift + 1;
                 PIStop = PIStop - nucFreqMatrix[first][0]*nucFreqMatrix[second][1]*nucFreqMatrix[third][2];
           }
           else {
                 result[ h-hshift ][0] = nucFreqMatrix[first][0]*nucFreqMatrix[second][1]*nucFreqMatrix[third][2];
           }
     }
     return result*( 1.0/PIStop );
}

function PopulateModelMatrix (ModelMatrixName& )
{
     ModelMatrixName = {ModelMatrixDimension,ModelMatrixDimension};
     hshift = 0;
     
     for (h=0; h<64; h=h+1)
     {
           if (_Genetic_Code[h]==10)
           {
                 hshift = hshift+1;
                 continue;
           }
           vshift = hshift;
           for (v = h+1; v<64; v=v+1)
           {
                 diff = v-h;
                 if (_Genetic_Code[v]==10)
                 {
                       vshift = vshift+1;
                       continue;
                 }
                 if ((h$4==v$4)||((diff%4==0)&&(h$16==v$16))||(diff%16==0))
                 {
                       if (h$4==v$4)
                       {
                             transition = v%4;
                             transition2= h%4;
                       }
                       else
                       {
                             if(diff%16==0)
                             {
                                   transition = v$16;
                                   transition2= h$16;
                             }
                             else
                             {
                                   transition = v%16$4;
                                   transition2= h%16$4;
                             }
                       }
                       if (_Genetic_Code[0][h]==_Genetic_Code[0][v])
                       {
                             ModelMatrixName[h-hshift][v-vshift] := t;
                             ModelMatrixName[v-vshift][h-hshift] := t;
                       }
                       else
                       {
                             ModelMatrixName[h-hshift][v-vshift] := c*t;
                             ModelMatrixName[v-vshift][h-hshift] := c*t;
                       }
                 }
           }
     }
     return 0;
}      

/*============================================================================================================================================*/

LIKELIHOOD_FUNCTION_OUTPUT = 3;
PARTITION = 1;

ModelMatrixDimension = 64;
for ( h = 0; h < 64; h = h + 1 ) {
     if ( _Genetic_Code[ h ] == 10 ) {
           ModelMatrixDimension = ModelMatrixDimension - 1;
     }
}

/* get the data and calculate various frequencies */
DataSet ds = ReadDataFile ( "10taxa_600bp.nex" );
fscanf ( "10taxa_600bp.tre", "String", TREE );
DataSetFilter FullData = CreateFilter ( ds, 3, "", "", GeneticCodeExclusions );
HarvestFrequencies ( nucleotidesByPos, FullData, 3, 1, 1 );
vectorOfFrequencies = CodonFrequencies ( nucleotidesByPos, ModelMatrixDimension );

/* set omega distribution */
global P = 0.5;
P:<1;            
global W = 0.1;
W:<1;
categFreqMatrix = [P,1-P];
categRateMatrix = [W,1];            
category c = (2, categFreqMatrix , MEAN, ,categRateMatrix, 0, 1e25);
modelMatrix = 0;      
PopulateModelMatrix ( "modelMatrix" );

if ( PARTITION ) {
     fprintf ( stdout, "Dataset partitioned\n" );
     /* create four arb partitions */
     DataSetFilter codon0 = CreateFilter ( ds, 3, "0-149", "", GeneticCodeExclusions );
     DataSetFilter codon1 = CreateFilter ( ds, 3, "150-299", "", GeneticCodeExclusions );
     DataSetFilter codon2 = CreateFilter ( ds, 3, "300-449", "", GeneticCodeExclusions );
     DataSetFilter codon3 = CreateFilter ( ds, 3, "450-599", "", GeneticCodeExclusions );
     
     stringBuffer = "";
     stringBuffer * 128;
     First = 1;
     for ( i = 0; i < 4; i = i + 1 ) {
           if ( i % 2 != 0 ) {
                 ExecuteCommands ( "Model model" + i + "= ( modelMatrix, vectorOfFrequencies, 1 );" );
           }
           else {
                 ExecuteCommands ( "Model model" + i + "= ( modelMatrix, vectorOfFrequencies, 1 );" );
           }
           ExecuteCommands ( "Tree tree" + i + " = TREE;" );
           if ( First ) {
                 stringBuffer = "LikelihoodFunction lf = ( codon" + i + ", tree" + i;
                 First = 0;
           }
           else {
                 stringBuffer = stringBuffer + ", codon" + i + ", tree" + i;
           }
     }
     stringBuffer = stringBuffer + ");";
     ExecuteCommands ( stringBuffer );
     fprintf ( stdout, stringBuffer, "\n" );
     stringBuffer * 0;
}
else {
     fprintf ( stdout, "Dataset unpartitioned\n" );
     Model model = ( modelMatrix, vectorOfFrequencies, 1 );
     Tree tree = TREE;
     LikelihoodFunction lf = ( FullData, tree );
}

Optimize ( res, lf );
fprintf ( stdout, "W1 < 1\t\t", Format (W, 3, 3), "\t", P, "\n" );
fprintf ( stdout, "W = 1\t\t1\t", 1-P, "\n" );

fprintf ( stdout, lf, "\n" );
fprintf ( stdout, "\n" );
DataSet simData = SimulateDataSet ( lf, GeneticCodeExclusions, simRates, catVarNames );
fprintf ( stdout, "*** Sites by category class ***\n" );
fprintf ( stdout, "Category variable = ", catVarNames[0], "\n" );
for ( i = 0; i < FullData.sites; i = i + 1 ) {
     fprintf ( stdout, i, " ", simRates [ i ], "\n" );
}

DataSetFilter simDataFilter = CreateFilter ( simData, 3, "", "", GeneticCodeExclusions );
DATAFILE_PRINT_FORMAT = 4;
fprintf ( "simData.nex", simDataFilter, "\n" );




Title: Re: SimulateDataSet with partitions
Post by Sergei on Apr 9th, 2007 at 9:23am
Dear Wayne,

Looks like there is a bug in HyPhy when it tries to populate simulated rates for multiple partitions (I am pretty sure the rates themselves are drawn accurately - just that the values are not written properly) - I'll check into it today and try to roll a fix into the next build.

Cheers,
Sergei

Title: Re: SimulateDataSet with partitions
Post by Sergei on Apr 9th, 2007 at 11:57am
Dear Wayne,

Good catch: HyPhy had an internal indexing bug which would not write simulated rates to appropriate cells in the rate matrix for codon data. I fixed the problem - look for the next build to work properly.

Thanks!
Sergei

Title: Re: SimulateDataSet with partitions
Post by wayne on Apr 10th, 2007 at 1:02am
thanks ./w

HyPhy message board » Powered by YaBB 2.5.2!
YaBB Forum Software © 2000-2024. All Rights Reserved.