Hi Sergei,
Could you please confirm that my substitution matrix below is in fact an implementation of your dual GY94 model with 4-category independent general discrete distributions for each of the synonymous and non-synonymous substitution rates?
Code:/* ------------------------- SUBSTITUTION RATE MATRIX --------------------------- */
global kappa = 1;
/* synonomous rate variation */
global dS_1 := 0; /* invariant category */
global dS_2;
global dS_3;
global dS_4;
global dS_p1;
global dS_p2;
global dS_p3;
global dS_p4 := 1-(dS_p1+dS_p2+dS_p3);
dS_p1:>-0.0000000000000001; dS_p1:<1.00000000000000001;
dS_p2:>-0.0000000000000001; dS_p2:<1.00000000000000001;
dS_p3:>-0.0000000000000001; dS_p3:<1.00000000000000001;
dS_p4:>-0.0000000000000001; dS_p4:<1.00000000000000001;
categFreqMatrixdS = {{dS_p1, dS_p2, dS_p3, dS_p4}};
categRateMatrixdS = {{ dS_1, dS_2, dS_3, dS_4}};
category dS = (4, categFreqMatrixdS, MEAN, , categRateMatrixdS, 0, 1e25);
/* nonsynonomous rate variation */
global dN_1 := 0; /* invariant category */
global dN_2;
global dN_3;
global dN_4;
global dN_p1;
global dN_p2;
global dN_p3;
global dN_p4 := 1-(dN_p1+dN_p2+dN_p3);
dN_p1:>-0.0000000000000001; dN_p1:<1.00000000000000001;
dN_p2:>-0.0000000000000001; dN_p2:<1.00000000000000001;
dN_p3:>-0.0000000000000001; dN_p3:<1.00000000000000001;
dN_p4:>-0.0000000000000001; dN_p4:<1.00000000000000001;
categFreqMatrixdN = {{dN_p1, dN_p2, dN_p3, dN_p4}};
categRateMatrixdN = {{ dN_1, dN_2, dN_3, dN_4}};
category dN = (4, categFreqMatrixdN, MEAN, , categRateMatrixdN, 0, 1e25);
SubsMatrix = {61,61};
hshift = 0;
for (h=0; h<63; h=h+1)
{
if (UniversalGeneticCode[h] == 10) /* stop codon */
{
hshift = hshift + 1;
}
else
{
vshift = hshift;
for (v=h+1; v<64; v=v+1)
{
if (UniversalGeneticCode[v] == 10) /* stop codon */
{
vshift = vshift + 1;
}
else
{
/* We must determine how many subsitutions are needed to get from codon h to codon v.
Only pairs with exactly one substitution are assigned non-zero rates.
Two codons indexed by h and v differ in a single nucleotide position if and only if one of the following conditions
are met:
1). h$4==v$4 (same result of integer division by 4), the change is in the third position
2). (v-h)%16 is 0 (same remainder of integer division by 16), the change is in the first position
3). (h$16==v$16) and (v-h)%4 is 0 (same result of integer division by 16 and same remainder by integer division by 4), the change in the second position
*/
diff = v-h;
if ((h$4==v$4)||((diff%4==0)&&(h$16==v$16))||(diff%16==0)) /* differ by one subsitution only */
{
if (h$4==v$4) /* third position */
{
transition = v%4; /* get targets nucleotides for h->v and v->h substitutions */
transition2= h%4;
nucPosInCodon = 2; /* change is in the third position */
}
else
{
if(diff%16==0) /* first position */
{global scale = 1;
transition = v$16; /* get targets nucleotides for h->v and v->h substitutions */
transition2= h$16;
nucPosInCodon = 0; /* change is in the first position */
}
else
{
transition = v%16$4; /* get targets nucleotides for h->v and v->h substitutions */
transition2= h%16$4;
nucPosInCodon = 1; /* change is in the second position */
}
}
if (UniversalGeneticCode[h]==UniversalGeneticCode[v]) /* synonymous */
{
if (Abs(transition-transition2)%2) /* transversion: difference is not divisible by 2 */
{
SubsMatrix[h-hshift][v-vshift] := dS*t;
SubsMatrix[v-vshift][h-hshift] := dS*t;
}
else /* transition */
{
SubsMatrix[h-hshift][v-vshift] := kappa*dS*t;
SubsMatrix[v-vshift][h-hshift] := kappa*dS*t;
}
}
else /* non-synonymous */
{
if (Abs(transition-transition2)%2) /* transversion: difference is not divisible by 2 */
{
SubsMatrix[h-hshift][v-vshift] := dN*t;
SubsMatrix[v-vshift][h-hshift] := dN*t;
}
else /* transition */
{
SubsMatrix[h-hshift][v-vshift] := kappa*dN*t;
SubsMatrix[v-vshift][h-hshift] := kappa*dN*t;
}
}
}
}
}
}
}
I am working with over 500 sequences of 370 codons each and the code is very slow, even with fewer rate categories. Optimising branch lengths and model parameters using, for example, dSdNRateAnalysis.bf does not appear to be an option - it just takes far too long. Any suggestions?
Thanks
Miguel