Hi, thanks. was doing this. i've included some test code. unless there is something i'm not following correctly with the setting of branch lengths for simulation. even so the results shouldn't differ between the two models right?
running the attached code with the GY option i get:
Tree length of base data = 1.48502
Tree length of simulated data = 0.40802
whereas i get the following with MG:
Tree length of base data = 1.48502
Tree length of simulated data = 4.89443
there is a 3*multiplier on the calculated nucleotide branch lengths so i do expect the tree length of simulated data to be longer, yet this should be consistent between the two models, right? please let me know if you can spot my error. thanks and regards ./w
Code:/* set some parameters */
INPUTFILE = "Brumme_nef.30.nex";
TREEFILE = "Brumme_nef.30.tre";
SIMDATAFILE = "testsim.simdata";
INSTALL_DIR = "/usr/local/HYPHY_Source/";
fprintf ( SIMDATAFILE, CLEAR_FILE );
DATA_FILE_PRINT_FORMAT = 4;
BLMULTI = 3;
SIM1 = 0; /* if SIM1 = 1, then uses Simulate function, else uses SimulateDataSet function */
GY = 0; /* if true uses Goldman Yang parametisation, else Muse Gaut */
CODONS = { { "AAA", "AAC", "AAG", "AAT",
"ACA", "ACC", "ACG", "ACT",
"AGA", "AGC", "AGG", "AGT",
"ATA", "ATC", "ATG", "ATT",
"CAA", "CAC", "CAG", "CAT",
"CCA", "CCC", "CCG", "CCT",
"CGA", "CGC", "CGG", "CGT",
"CTA", "CTC", "CTG", "CTT",
"GAA", "GAC", "GAG", "GAT",
"GCA", "GCC", "GCG", "GCT",
"GGA", "GGC", "GGG", "GGT",
"GTA", "GTC", "GTG", "GTT",
"TAA", "TAC", "TAG", "TAT",
"TCA", "TCC", "TCG", "TCT",
"TGA", "TGC", "TGG", "TGT",
"TTA", "TTC", "TTG", "TTT" } };
AA = "FLIMVSPTAY*HQNKDECWRG";
DATA_FILE_PRINT_FORMAT = 4;
/* functions */
function GeneticCodeStuff ( nil )
{
skipCodeSelectionStep = 1;
incFileName = INSTALL_DIR + "TemplateBatchFiles" + DIRECTORY_SEPARATOR + "TemplateModels" + DIRECTORY_SEPARATOR + "chooseGeneticCode.def";
ExecuteCommands ( "#include \""+incFileName+"\";");
dummy = ApplyGeneticCodeTable ( DEFAULT_CODE_TABLE );
ModelMatrixDimension = 64;
for ( h = 0; h < 64; h = h + 1 ) {
if ( _Genetic_Code[ h ] == 10 ) {
ModelMatrixDimension = ModelMatrixDimension - 1;
}
}
/* count number of alternate codons per aa and fill matrix */
NumberAlternateCodons = { 21, 1 };
for ( a = 0; a < 21; a = a + 1 ) {
countAlternates = 0;
for ( h = 0; h < 64; h = h + 1 ) {
if ( _Genetic_Code [ h ] != 10 ) {
if ( a == _Genetic_Code [ h ] ) {
countAlternates = countAlternates + 1;
}
}
}
NumberAlternateCodons [ a ] = countAlternates;
}
return 0;
}
function CodonFrequencies ( nucFreqMatrix, mSize )
{
PIStop = 1.0;
result = { mSize, 1 };
hshift = 0;
for ( h = 0; h < 64; h = h + 1 ) {
first = h$16;
second = h%16$4;
third = h%4;
if ( _Genetic_Code[ h ] == 10 ) {
hshift = hshift + 1;
PIStop = PIStop - nucFreqMatrix[first][0]*nucFreqMatrix[second][1]*nucFreqMatrix[third][2];
}
else {
result[ h-hshift ][0] = nucFreqMatrix[first][0]*nucFreqMatrix[second][1]*nucFreqMatrix[third][2];
}
}
return result*( 1.0/PIStop );
}
function PopulateModelMatrix_GY ( ModelMatrixName&, EFV )
{
ModelMatrixName = {ModelMatrixDimension,ModelMatrixDimension};
hshift = 0;
for (h=0; h<64; h=h+1) {
if (_Genetic_Code[h]==10) {
hshift = hshift+1;
continue;
}
vshift = hshift;
for (v = h+1; v<64; v=v+1) {
diff = v-h;
if (_Genetic_Code[v]==10) {
vshift = vshift+1;
continue;
}
if ((h$4==v$4)||((diff%4==0)&&(h$16==v$16))||(diff%16==0)) {
if (h$4==v$4) {
transition = v%4;
transition2= h%4;
}
else {
if(diff%16==0) {
transition = v$16;
transition2= h$16;
}
else {
transition = v%16$4;
transition2= h%16$4;
}
}
if (_Genetic_Code[0][h]==_Genetic_Code[0][v]) {
if (Abs(transition-transition2)%2) {
ModelMatrixName[h-hshift][v-vshift] := kappa*khs_synRate*t;
ModelMatrixName[v-vshift][h-hshift] := kappa*khs_synRate*t;
}
else {
ModelMatrixName[h-hshift][v-vshift] := khs_synRate*t;
ModelMatrixName[v-vshift][h-hshift] := khs_synRate*t;
}
}
else {
if (Abs(transition-transition2)%2) {
ModelMatrixName[h-hshift][v-vshift] := kappa*c*khs_synRate*t;
ModelMatrixName[v-vshift][h-hshift] := kappa*c*khs_synRate*t;
}
else {
ModelMatrixName[h-hshift][v-vshift] := c*khs_synRate*t;
ModelMatrixName[v-vshift][h-hshift] := c*khs_synRate*t;
}
}
}
}
}
return 0;
}
function PopulateModelMatrix_MG ( ModelMatrixName&, EFV )
{
ModelMatrixName = {ModelMatrixDimension,ModelMatrixDimension};
hshift = 0;
for (h=0; h<64; h=h+1) {
if (_Genetic_Code[h]==10) {
hshift = hshift+1;
continue;
}
vshift = hshift;
for (v = h+1; v<64; v=v+1) {
diff = v-h;
if (_Genetic_Code[v]==10) {
vshift = vshift+1;
continue;
}
if ((h$4==v$4)||((diff%4==0)&&(h$16==v$16))||(diff%16==0)) {
if (h$4==v$4) {
transition = v%4;
transition2= h%4;
nucPositionInCodon = 2;
}
else {
if(diff%16==0) {
transition = v$16;
transition2= h$16;
nucPositionInCodon = 0;
}
else {
transition = v%16$4;
transition2= h%16$4;
nucPositionInCodon = 1;
}
}
if (_Genetic_Code[0][h]==_Genetic_Code[0][v]) {
if (Abs(transition-transition2)%2) {
ModelMatrixName[h-hshift][v-vshift] := kappa*khs_synRate*t*EFV__[transition__][nucPositionInCodon__];
ModelMatrixName[v-vshift][h-hshift] := kappa*khs_synRate*t*EFV__[transition2__][nucPositionInCodon__];
}
else {
ModelMatrixName[h-hshift][v-vshift] := khs_synRate*t*EFV__[transition__][nucPositionInCodon__];
ModelMatrixName[v-vshift][h-hshift] := khs_synRate*t*EFV__[transition2__][nucPositionInCodon__];
}
}
else {
if (Abs(transition-transition2)%2) {
ModelMatrixName[h-hshift][v-vshift] := kappa*c*khs_synRate*t*EFV__[transition__][nucPositionInCodon__];
ModelMatrixName[v-vshift][h-hshift] := kappa*c*khs_synRate*t*EFV__[transition2__][nucPositionInCodon__];
}
else {
ModelMatrixName[h-hshift][v-vshift] := c*khs_synRate*t*EFV__[transition__][nucPositionInCodon__];
ModelMatrixName[v-vshift][h-hshift] := c*khs_synRate*t*EFV__[transition2__][nucPositionInCodon__];
}
}
}
}
}
return 0;
}
/* end of functions */
DataSet ds = ReadDataFile( INPUTFILE );
fscanf ( TREEFILE, "String", TREE );
/* get some nucleotide branch lengths */
DataSetFilter nucData = CreateFilter ( ds, 1 );
HarvestFrequencies ( nucFreqs, nucData, 1, 1, 1 );
global kappa_inv = 1;
HKY85_Matrix = { { *, t*kappa_inv, t, t*kappa_inv }
{ t*kappa_inv, *, kappa_inv*t, t }
{ t, t*kappa_inv, *, kappa_inv*t }
{ t*kappa_inv, t, kappa_inv*t, * } };
Model HKY85_Model = ( HKY85_Matrix, nucFreqs, 1 );
Tree nucTree = TREE;
LikelihoodFunction lf = ( nucData, nucTree );
Optimize ( res, lf );
tvec = { 2*nucData.species-2, 1 };
branchNames = BranchName ( nucTree, -1 );
nucBL = BranchLength ( nucTree, -1);
sumBL = 0;
for ( k = 0; k < 2*nucData.species-2; k = k + 1 ) {
thisbranch = branchNames [ k ];
ExecuteCommands ( "tvec [ k ] = nucTree." + thisbranch + ".t;" );
sumBL = sumBL + nucBL [ k ];
}
fprintf ( stdout, "Tree length of base data = ", sumBL, "\n" );
/* get base codon frequencies for simulation*/
dummy = GeneticCodeStuff ( 0 );
DataSetFilter codonData = CreateFilter ( ds, 3, "", "", GeneticCodeExclusions );
HarvestFrequencies ( nucleotidesByPos, codonData, 3, 1, 1 );
vectorOfFrequencies = CodonFrequencies ( nucleotidesByPos, ModelMatrixDimension );
/* set up simulation parameters and model */
global khs_synRate = 1;
global kappa = kappa_inv;
category c = ( 3, {{0.30,0.50,0.20}} , MEAN, ,{{0.05,1,4}}, 0, 1e25 );
modelMatrix = 0;
if ( GY ) {
PopulateModelMatrix_GY ( "modelMatrix", nucleotidesByPos );
Model M2amodel = ( modelMatrix, vectorOfFrequencies, 1 );
}
else {
PopulateModelMatrix_MG ( "modelMatrix", nucleotidesByPos );
Model M2amodel = ( modelMatrix, vectorOfFrequencies, 0 );
}
Tree M2atree = TREE;
/* set branches to scaled nucleotide length */
branchNames = BranchName ( M2atree, -1 );
for ( k = 0; k < Columns ( branchNames) - 1; k = k + 1 ) {
thisbranch = branchNames [ k ];
ExecuteCommands ( "M2atree." + thisbranch + ".t :=" + BLMULTI + "*" + tvec [ k ] + ";" );
}
charactersUniversalCode = { {"A","C","G","T" }
{"3",GeneticCodeExclusions,"",""} };
if ( SIM1 ) {
DataSet simData = Simulate ( M2atree, vectorOfFrequencies, charactersUniversalCode, codonData.sites, 0);
}
else {
LikelihoodFunction lf = ( codonData, M2atree );
DataSet simData = SimulateDataSet(lf);
}
/* calculate simulated BLs using nuc model */
DataSetFilter simDataFilter = CreateFilter ( simData, 1 );
HarvestFrequencies ( simFreqs, simDataFilter, 1, 1, 1 );
global kappa_inv = 1;
HKY85_Matrix = { { *, t*kappa_inv, t, t*kappa_inv }
{ t*kappa_inv, *, kappa_inv*t, t }
{ t, t*kappa_inv, *, kappa_inv*t }
{ t*kappa_inv, t, kappa_inv*t, * } };
Model HKY85_Model = ( HKY85_Matrix, simFreqs, 1 );
Tree simTree = TREE;
LikelihoodFunction lf = ( simDataFilter, simTree );
Optimize ( res, lf );
BL = BranchLength ( simTree, -1);
sumBL = 0;
for ( k = 0; k < 2*simDataFilter.species-2; k = k + 1 ) {
sumBL = sumBL + BL [ k ];
}
fprintf ( stdout, "Tree length of simulated data = ", sumBL, "\n" );
fprintf ( SIMDATAFILE, simDataFilter, "\n" );