Welcome, Guest. Please Login
YaBB - Yet another Bulletin Board
 
  HomeHelpSearchLogin  
 
SimulateDataSet with partitions (Read 2940 times)
wayne
Global Moderator
*****
Offline


I love YaBB 1G - SP1!

Posts: 57
San Diego, CA
Gender: male
SimulateDataSet with partitions
Apr 5th, 2007 at 5:57am
 
Hi Sergei, I am trying to simulate data under a simple partitioned model. the test scenario has four partitions (each of 50 aa), where the same model and tree is set for each partition (model0 - model3, tree0 -tree3). the script returns a dataset, yet when looking at the category classes under which sites were simulated i get the ff:

site # omega_class

38 0.0250114
39 0.0250114
40 1
41 0.0250114
42 0.0250114
43 0.0250114
44 0.0250114
45 0.0250114
46 0.0250114
47 1
48 0.0250114
49 0.0250114
50 0
51 0
52 0
53 0

which up until site 50 makes sense, since the catvar is (0.025, 1) @ frequencies (0.96, 0.04). However, sites 50-149 return 0's consistent with partitions 1 & 2. Partition 3 again returns correct values. any suggestions/comments would be appreciated. thanks ./w

batchfile requires a 600 bp sequence file in order for manual partitioning to work.

Code:
_Genetic_Code = {

		{14,/*AAA*/ 13,/*AAC*/ 14,/*AAG*/  13,/*AAT*/
		  7, /*ACA*/ 7, /*ACC*/ 7, /*ACG*/  7, /*ACT*/
		 19, /*AGA*/ 5, /*AGC*/19, /*AGG*/  5, /*AGT*/
		  2, /*ATA*/ 2, /*ATC*/	3, /*ATG*/  2, /*ATT*/
		 12,/*CAA*/ 11,/*CAC*/ 12,/*CAG*/  11,/*CAT*/
		  6, /*CCA*/ 6, /*CCC*/ 6, /*CCG*/  6, /*CCT*/
		 19,/*CGA*/ 19,/*CGC*/ 19,/*CGG*/  19,/*CGT*/
		  1, /*CTA*/ 1, /*CTG*/ 1, /*CTC*/  1, /*CTT*/
		 16,/*GAA*/ 15,/*GAC*/ 16,/*GAG*/  15,/*GAT*/
		  8, /*GCA*/ 8, /*GCC*/ 8, /*GCG*/  8, /*GCT*/
		 20,/*GGA*/ 20,/*GGC*/ 20,/*GGG*/  20,/*GGT*/
		  4, /*GTA*/ 4, /*GTC*/ 4, /*GTG*/  4, /*GTT*/
		 10,/*TAA*/  9, /*TAC*/10,/*TAG*/   9, /*TAT*/
		  5, /*TCA*/ 5, /*TCC*/ 5, /*TCG*/  5, /*TCT*/
		 10,/*TGA*/ 17,/*TGC*/ 18,/*TGG*/  17,/*TGT*/
		  1, /*TTA*/ 0, /*TTC*/ 1, /*TTG*/  0  /*TTT*/ }
	};

GeneticCodeExclusions = "TAA,TAG,TGA";

function CodonFrequencies ( nucFreqMatrix, mSize ) /* Modified from Kosakovsky Pond */
{
	PIStop = 1.0;
	result = { mSize, 1 };
	hshift = 0;
	for ( h = 0; h < 64; h = h + 1 )	{
		first = h$16;
		second = h%16$4;
		third = h%4;
		if ( _Genetic_Code[ h ] == 10 ) {	/* stop codon */
			hshift = hshift + 1;
			PIStop = PIStop - nucFreqMatrix[first][0]*nucFreqMatrix[second][1]*nucFreqMatrix[third][2];
		}
		else {
			result[ h-hshift ][0] = nucFreqMatrix[first][0]*nucFreqMatrix[second][1]*nucFreqMatrix[third][2];
		}
	}
	return result*( 1.0/PIStop );
}

function PopulateModelMatrix (ModelMatrixName& )
{
	ModelMatrixName = {ModelMatrixDimension,ModelMatrixDimension};
	hshift = 0;

	for (h=0; h<64; h=h+1)
	{
		if (_Genetic_Code[h]==10)
		{
			hshift = hshift+1;
			continue;
		}
		vshift = hshift;
		for (v = h+1; v<64; v=v+1)
		{
			diff = v-h;
			if (_Genetic_Code[v]==10)
			{
				vshift = vshift+1;
				continue;
			}
			if ((h$4==v$4)||((diff%4==0)&&(h$16==v$16))||(diff%16==0))
			{
				if (h$4==v$4)
				{
					transition = v%4;
					transition2= h%4;
				}
				else
				{
					if(diff%16==0)
					{
						transition = v$16;
						transition2= h$16;
					}
					else
					{
						transition = v%16$4;
						transition2= h%16$4;
					}
				}
				if (_Genetic_Code[0][h]==_Genetic_Code[0][v])
				{
					ModelMatrixName[h-hshift][v-vshift] := t;
					ModelMatrixName[v-vshift][h-hshift] := t;
				}
				else
				{
					ModelMatrixName[h-hshift][v-vshift] := c*t;
					ModelMatrixName[v-vshift][h-hshift] := c*t;
				}
			}
		}
	}
	return 0;
}

/*============================================================================================================================================*/

LIKELIHOOD_FUNCTION_OUTPUT = 3;
PARTITION = 1;

ModelMatrixDimension = 64;
for ( h = 0; h < 64; h = h + 1 ) {
	if ( _Genetic_Code[ h ] == 10 ) {
		ModelMatrixDimension = ModelMatrixDimension - 1;
	}
}

/* get the data and calculate various frequencies */
DataSet ds = ReadDataFile ( "10taxa_600bp.nex" );
fscanf ( "10taxa_600bp.tre", "String", TREE );
DataSetFilter FullData = CreateFilter ( ds, 3, "", "", GeneticCodeExclusions );
HarvestFrequencies ( nucleotidesByPos, FullData, 3, 1, 1 );
vectorOfFrequencies = CodonFrequencies ( nucleotidesByPos, ModelMatrixDimension );

/* set omega distribution */
global P = 0.5;
P:<1;
global W = 0.1;
W:<1;
categFreqMatrix = {{P,1-P}};
categRateMatrix = {{W,1}};
category c = (2, categFreqMatrix , MEAN, ,categRateMatrix, 0, 1e25);
modelMatrix = 0;
PopulateModelMatrix ( "modelMatrix" );

if ( PARTITION ) {
	fprintf ( stdout, "Dataset partitioned\n" );
	/* create four arb partitions */
	DataSetFilter codon0 = CreateFilter ( ds, 3, "0-149", "", GeneticCodeExclusions );
	DataSetFilter codon1 = CreateFilter ( ds, 3, "150-299", "", GeneticCodeExclusions );
	DataSetFilter codon2 = CreateFilter ( ds, 3, "300-449", "", GeneticCodeExclusions );
	DataSetFilter codon3 = CreateFilter ( ds, 3, "450-599", "", GeneticCodeExclusions );

	stringBuffer = "";
	stringBuffer * 128;
	First = 1;
	for ( i = 0; i < 4; i = i + 1 ) {
		if ( i % 2 != 0 ) {
			ExecuteCommands ( "Model model" + i + "= ( modelMatrix, vectorOfFrequencies, 1 );" );
		}
		else {
			ExecuteCommands ( "Model model" + i + "= ( modelMatrix, vectorOfFrequencies, 1 );" );
		}
		ExecuteCommands ( "Tree tree" + i + " = TREE;" );
		if ( First ) {
			stringBuffer = "LikelihoodFunction lf = ( codon" + i + ", tree" + i;
			First = 0;
		}
		else {
			stringBuffer = stringBuffer + ", codon" + i + ", tree" + i;
		}
	}
	stringBuffer = stringBuffer + ");";
	ExecuteCommands ( stringBuffer );
	fprintf ( stdout, stringBuffer, "\n" );
	stringBuffer * 0;
}
else {
	fprintf ( stdout, "Dataset unpartitioned\n" );
	Model model = ( modelMatrix, vectorOfFrequencies, 1 );
	Tree tree = TREE;
	LikelihoodFunction lf = ( FullData, tree );
}

Optimize ( res, lf );
fprintf ( stdout, "W1 < 1\t\t", Format (W, 3, 3), "\t", P, "\n" );
fprintf ( stdout, "W = 1\t\t1\t", 1-P, "\n" );

fprintf ( stdout, lf, "\n" );
fprintf ( stdout, "\n" );
DataSet simData = SimulateDataSet ( lf, GeneticCodeExclusions, simRates, catVarNames );
fprintf ( stdout, "*** Sites by category class ***\n" );
fprintf ( stdout, "Category variable = ", catVarNames[0], "\n" );
for ( i = 0; i < FullData.sites; i = i + 1 ) {
	fprintf ( stdout, i, " ", simRates [ i ], "\n" );
}

DataSetFilter simDataFilter = CreateFilter ( simData, 3, "", "", GeneticCodeExclusions );
DATAFILE_PRINT_FORMAT = 4;
fprintf ( "simData.nex", simDataFilter, "\n" );



 

Back to top
 

Assistant Project Scientist&&Antiviral Research Center&&Department of Pathology&&University of California, San Diego
WWW WWW  
IP Logged
 
Sergei
YaBB Administrator
*****
Offline


Datamonkeys are forever...

Posts: 1658
UCSD
Gender: male
Re: SimulateDataSet with partitions
Reply #1 - Apr 9th, 2007 at 9:23am
 
Dear Wayne,

Looks like there is a bug in HyPhy when it tries to populate simulated rates for multiple partitions (I am pretty sure the rates themselves are drawn accurately - just that the values are not written properly) - I'll check into it today and try to roll a fix into the next build.

Cheers,
Sergei
Back to top
 

Associate Professor
Division of Infectious Diseases
Division of Biomedical Informatics
School of Medicine
University of California San Diego
WWW WWW  
IP Logged
 
Sergei
YaBB Administrator
*****
Offline


Datamonkeys are forever...

Posts: 1658
UCSD
Gender: male
Re: SimulateDataSet with partitions
Reply #2 - Apr 9th, 2007 at 11:57am
 
Dear Wayne,

Good catch: HyPhy had an internal indexing bug which would not write simulated rates to appropriate cells in the rate matrix for codon data. I fixed the problem - look for the next build to work properly.

Thanks!
Sergei
Back to top
 

Associate Professor
Division of Infectious Diseases
Division of Biomedical Informatics
School of Medicine
University of California San Diego
WWW WWW  
IP Logged
 
wayne
Global Moderator
*****
Offline


I love YaBB 1G - SP1!

Posts: 57
San Diego, CA
Gender: male
Re: SimulateDataSet with partitions
Reply #3 - Apr 10th, 2007 at 1:02am
 
thanks ./w
Back to top
 

Assistant Project Scientist&&Antiviral Research Center&&Department of Pathology&&University of California, San Diego
WWW WWW  
IP Logged