Skip to content

Commit ca896a2

Browse files
committed
BugFix in Phase Sets (martin) + Migrating from HMM float to double for scaffolded samples
1 parent 247d0c3 commit ca896a2

13 files changed

Lines changed: 717 additions & 87 deletions

makefile

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,6 @@ BOOST_LIB_PO=/usr/lib/x86_64-linux-gnu/libboost_program_options.a
2020
#BOOST_LIB_PO=/software/lib64/libboost_program_options.a
2121

2222
#COMPILER & LINKER FLAGS
23-
2423
#Best performance is achieved with this. Use it if running on the same plateform you're compiling, it's definitely worth it!
2524
#CXXFLAG=-O3 -march=native
2625
#Good performance and portable on most intel CPUs

src/io/genotype_reader2.cpp

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,8 @@ void genotype_reader::readGenotypes0(string funphased) {
5858
bool mi = (gt_arr_main[i+0] == bcf_gt_missing || gt_arr_main[i+1] == bcf_gt_missing);
5959
bool he = !mi && a0 != a1;
6060
bool ho = !mi && a0 == a1;
61-
bool ps = (mi || he) && use_PS_field;
61+
//bool ps = (mi || he) && use_PS_field;
62+
bool ps = he && use_PS_field;
6263
bool ph = (bcf_gt_is_phased(gt_arr_main[i+0]) || bcf_gt_is_phased(gt_arr_main[i+1])) && he && PScodes.size() > 0;
6364
if (a0) VAR_SET_HAP0(MOD2(i_variant), G.vecG[DIV2(i)]->Variants[DIV2(i_variant)]);
6465
if (a1) VAR_SET_HAP1(MOD2(i_variant), G.vecG[DIV2(i)]->Variants[DIV2(i_variant)]);
@@ -134,7 +135,8 @@ void genotype_reader::readGenotypes1(string funphased, string freference) {
134135
bool mi = (gt_arr_main[i+0] == bcf_gt_missing || gt_arr_main[i+1] == bcf_gt_missing);
135136
bool he = !mi && a0 != a1;
136137
bool ho = !mi && a0 == a1;
137-
bool ps = (mi || he) && use_PS_field;
138+
//bool ps = (mi || he) && use_PS_field;
139+
bool ps = he && use_PS_field;
138140
bool ph = (bcf_gt_is_phased(gt_arr_main[i+0]) || bcf_gt_is_phased(gt_arr_main[i+1])) && he && PScodes.size() > 0;
139141
if (a0) VAR_SET_HAP0(MOD2(i_variant), G.vecG[DIV2(i)]->Variants[DIV2(i_variant)]);
140142
if (a1) VAR_SET_HAP1(MOD2(i_variant), G.vecG[DIV2(i)]->Variants[DIV2(i_variant)]);
@@ -234,7 +236,8 @@ void genotype_reader::readGenotypes2(string funphased, string fphased) {
234236
bool mi = (gt_arr_main[i+0] == bcf_gt_missing || gt_arr_main[i+1] == bcf_gt_missing);
235237
bool he = !mi && a0 != a1;
236238
bool ho = !mi && a0 == a1;
237-
bool ps = (mi || he) && use_PS_field;
239+
//bool ps = (mi || he) && use_PS_field;
240+
bool ps = he && use_PS_field;
238241
bool ph = (bcf_gt_is_phased(gt_arr_main[i+0]) || bcf_gt_is_phased(gt_arr_main[i+1])) && he && PScodes.size() > 0;
239242
if (a0) VAR_SET_HAP0(MOD2(i_variant), G.vecG[DIV2(i)]->Variants[DIV2(i_variant)]);
240243
if (a1) VAR_SET_HAP1(MOD2(i_variant), G.vecG[DIV2(i)]->Variants[DIV2(i_variant)]);
@@ -349,7 +352,8 @@ void genotype_reader::readGenotypes3(string funphased, string freference, string
349352
bool mi = (gt_arr_main[i+0] == bcf_gt_missing || gt_arr_main[i+1] == bcf_gt_missing);
350353
bool he = !mi && a0 != a1;
351354
bool ho = !mi && a0 == a1;
352-
bool ps = (mi || he) && use_PS_field;
355+
//bool ps = (mi || he) && use_PS_field;
356+
bool ps = he && use_PS_field;
353357
bool ph = (bcf_gt_is_phased(gt_arr_main[i+0]) || bcf_gt_is_phased(gt_arr_main[i+1])) && he && PScodes.size() > 0;
354358
if (a0) VAR_SET_HAP0(MOD2(i_variant), G.vecG[DIV2(i)]->Variants[DIV2(i_variant)]);
355359
if (a1) VAR_SET_HAP1(MOD2(i_variant), G.vecG[DIV2(i)]->Variants[DIV2(i_variant)]);
Lines changed: 268 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,268 @@
1+
////////////////////////////////////////////////////////////////////////////////
2+
// Copyright (C) 2018 Olivier Delaneau, University of Lausanne
3+
//
4+
// Permission is hereby granted, free of charge, to any person obtaining a copy
5+
// of this software and associated documentation files (the "Software"), to deal
6+
// in the Software without restriction, including without limitation the rights
7+
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8+
// copies of the Software, and to permit persons to whom the Software is
9+
// furnished to do so, subject to the following conditions:
10+
//
11+
// The above copyright notice and this permission notice shall be included in
12+
// all copies or substantial portions of the Software.
13+
//
14+
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15+
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16+
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17+
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18+
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19+
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
20+
// SOFTWARE.
21+
////////////////////////////////////////////////////////////////////////////////
22+
#include <models/haplotype_segment_double.h>
23+
24+
25+
haplotype_segment_double::haplotype_segment_double(genotype * _G, bitmatrix & _H, vector < unsigned int > & _idxH, coordinates & C, hmm_parameters & _M) : G(_G), H(_H), idxH(_idxH), M(_M) {
26+
segment_first = C.start_segment;
27+
segment_last = C.stop_segment;
28+
locus_first = C.start_locus;
29+
locus_last = C.stop_locus;
30+
ambiguous_first = C.start_ambiguous;
31+
ambiguous_last = C.stop_ambiguous;
32+
missing_first = C.start_missing;
33+
missing_last = C.stop_missing;
34+
transition_first = C.start_transition;
35+
n_cond_haps = idxH.size();
36+
prob1 = aligned_vector32 < double > (HAP_NUMBER * n_cond_haps, 1.0);
37+
prob2 = aligned_vector32 < double > (HAP_NUMBER * n_cond_haps, 1.0);
38+
probSumH1 = aligned_vector32 < double > (HAP_NUMBER, 1.0);
39+
probSumH2 = aligned_vector32 < double > (HAP_NUMBER, 1.0);
40+
probSumK1 = aligned_vector32 < double > (n_cond_haps, 1.0);
41+
probSumK2 = aligned_vector32 < double > (n_cond_haps, 1.0);
42+
probSumT1 = 1.0;
43+
probSumT2 = 1.0;
44+
Alpha = vector < aligned_vector32 < double > > (segment_last - segment_first + 1, aligned_vector32 < double > (HAP_NUMBER * n_cond_haps, 0.0));
45+
Beta = vector < aligned_vector32 < double > > (segment_last - segment_first + 1, aligned_vector32 < double > (HAP_NUMBER * n_cond_haps, 0.0));
46+
AlphaSum = vector < aligned_vector32 < double > > (segment_last - segment_first + 1, aligned_vector32 < double > (HAP_NUMBER, 0.0));
47+
AlphaSumSum = aligned_vector32 < double > (segment_last - segment_first + 1, 0.0);
48+
BetaSum = aligned_vector32 < double > (HAP_NUMBER, 0.0);
49+
n_missing = missing_last - missing_first + 1;
50+
//cout << G->name << " " << missing_first << " " << missing_last << " " << n_missing << endl;
51+
if (n_missing > 0) {
52+
AlphaMissing = vector < aligned_vector32 < double > > (n_missing, aligned_vector32 < double > (HAP_NUMBER * n_cond_haps, 0.0));
53+
AlphaSumMissing = vector < aligned_vector32 < double > > (n_missing, aligned_vector32 < double > (HAP_NUMBER, 0.0));
54+
ProbM1sums = aligned_vector32 < double >(HAP_NUMBER, 0.0);
55+
ProbM0sums = aligned_vector32 < double >(HAP_NUMBER, 0.0);
56+
ProbM = aligned_vector32 < double >(HAP_NUMBER, 0.0);
57+
}
58+
}
59+
60+
haplotype_segment_double::~haplotype_segment_double() {
61+
G = NULL;
62+
segment_first = 0;
63+
segment_last = 0;
64+
locus_first = 0;
65+
locus_last = 0;
66+
ambiguous_first = 0;
67+
ambiguous_last = 0;
68+
transition_first = 0;
69+
n_cond_haps = 0;
70+
curr_segment_index = 0;
71+
curr_segment_locus = 0;
72+
curr_abs_locus = 0;
73+
curr_rel_locus = 0;
74+
curr_rel_segment_index = 0;
75+
curr_abs_ambiguous = 0;
76+
curr_abs_transition = 0;
77+
probSumT1 = 0.0;
78+
probSumT2 = 0.0;
79+
prob1.clear();
80+
prob2.clear();
81+
probSumK1.clear();
82+
probSumK2.clear();
83+
probSumH1.clear();
84+
probSumH2.clear();
85+
Alpha.clear();
86+
Beta.clear();
87+
AlphaSum.clear();
88+
AlphaSumSum.clear();
89+
BetaSum.clear();
90+
}
91+
92+
void haplotype_segment_double::forward() {
93+
curr_segment_index = segment_first;
94+
curr_segment_locus = 0;
95+
curr_abs_ambiguous = ambiguous_first;
96+
curr_abs_missing = missing_first;
97+
98+
for (curr_abs_locus = locus_first ; curr_abs_locus <= locus_last ; curr_abs_locus++) {
99+
curr_rel_locus = curr_abs_locus - locus_first;
100+
curr_rel_missing = curr_abs_missing - missing_first;
101+
bool paired = (curr_rel_locus % 2 == 0);
102+
bool amb = VAR_GET_AMB(MOD2(curr_abs_locus), G->Variants[DIV2(curr_abs_locus)]);
103+
bool mis = VAR_GET_MIS(MOD2(curr_abs_locus), G->Variants[DIV2(curr_abs_locus)]);
104+
105+
106+
if (mis) MIS(paired);
107+
else if (amb) AMB(paired);
108+
else HOM(paired);
109+
110+
if (curr_rel_locus != 0) {
111+
if (curr_segment_locus == 0) COLLAPSE(true, paired);
112+
else RUN(true, paired);
113+
}
114+
SUM(paired);
115+
if (curr_segment_locus == (G->Lengths[curr_segment_index] - 1)) SUMK(paired);
116+
if (curr_segment_locus == G->Lengths[curr_segment_index] - 1) {
117+
//if (paired) copy(prob2.begin(), prob2.end(), Alpha[curr_segment_index - segment_first].begin());
118+
//else copy(prob1.begin(), prob1.end(), Alpha[curr_segment_index - segment_first].begin());
119+
Alpha[curr_segment_index - segment_first] = (paired?prob2:prob1); //does not compile with aligned vectors (needs to define operator = from std vector)
120+
AlphaSum[curr_segment_index - segment_first] = (paired?probSumH2:probSumH1);
121+
AlphaSumSum[curr_segment_index - segment_first] = (paired?probSumT2:probSumT1);
122+
}
123+
if (mis) {
124+
AlphaMissing[curr_rel_missing] = (paired?prob2:prob1);
125+
AlphaSumMissing[curr_rel_missing] = (paired?probSumH2:probSumH1);
126+
curr_abs_missing ++;
127+
}
128+
129+
curr_segment_locus ++;
130+
curr_abs_ambiguous += amb;
131+
if (curr_segment_locus >= G->Lengths[curr_segment_index]) {
132+
curr_segment_index++;
133+
curr_segment_locus = 0;
134+
}
135+
}
136+
}
137+
138+
void haplotype_segment_double::backward(vector < float > & missing_probabilities) {
139+
curr_segment_index = segment_last;
140+
curr_segment_locus = G->Lengths[segment_last] - 1;
141+
curr_abs_ambiguous = ambiguous_last;
142+
curr_abs_missing = missing_last;
143+
for (curr_abs_locus = locus_last ; curr_abs_locus >= locus_first ; curr_abs_locus--) {
144+
curr_rel_locus = curr_abs_locus - locus_first;
145+
curr_rel_missing = curr_abs_missing - missing_first;
146+
//cout << curr_abs_missing << " " << missing_first << endl;
147+
//assert(curr_rel_missing >= 0);
148+
bool paired = (curr_rel_locus % 2 == 0);
149+
bool amb = VAR_GET_AMB(MOD2(curr_abs_locus), G->Variants[DIV2(curr_abs_locus)]);
150+
bool mis = VAR_GET_MIS(MOD2(curr_abs_locus), G->Variants[DIV2(curr_abs_locus)]);
151+
152+
if (mis) MIS(paired);
153+
else if (amb) AMB(paired);
154+
else HOM(paired);
155+
156+
if (curr_abs_locus != locus_last) {
157+
if (curr_segment_locus == G->Lengths[curr_segment_index] - 1) COLLAPSE(false, paired);
158+
else RUN(false, paired);
159+
}
160+
SUM(paired);
161+
if (curr_segment_locus == 0) SUMK(paired);
162+
if (curr_segment_locus == 0 && curr_abs_locus != locus_first) {
163+
Beta[curr_segment_index - segment_first] = (paired?prob2:prob1); //does not compile with aligned vectors (needs to define operator = from std vector)
164+
//if (paired) copy(prob2.begin(), prob2.end(), Beta[curr_segment_index - segment_first].begin());
165+
//else copy(prob1.begin(), prob1.end(), Beta[curr_segment_index - segment_first].begin());
166+
}
167+
if (mis) {
168+
//Impute missing for 8 possible haplotypes
169+
fill(ProbM1sums.begin(), ProbM1sums.end(), 0.0);
170+
fill(ProbM0sums.begin(), ProbM0sums.end(), 0.0);
171+
for(int k = 0, i = 0 ; k != n_cond_haps ; ++k, i += HAP_NUMBER) {
172+
for (int h = 0 ; h < HAP_NUMBER ; h ++) ProbM[h] = (AlphaMissing[curr_rel_missing][i + h] / AlphaSumMissing[curr_rel_missing][h]) * (paired?prob2[i + h]:prob1[i + h]);
173+
if (H.get(idxH[k], curr_abs_locus)) for (int h = 0 ; h < HAP_NUMBER ; h ++) ProbM1sums[h] += ProbM[h];
174+
else for (int h = 0 ; h < HAP_NUMBER ; h ++) ProbM0sums[h] += ProbM[h];
175+
}
176+
//cout << curr_abs_missing;
177+
for (int h = 0 ; h < HAP_NUMBER ; h ++) {
178+
missing_probabilities[curr_abs_missing * HAP_NUMBER + h] = ProbM1sums[h] / (ProbM0sums[h]+ProbM1sums[h]);
179+
//cout << " " << stb.str(missing_probabilities[curr_abs_missing * HAP_NUMBER + h], 3);
180+
}
181+
//cout << endl;
182+
curr_abs_missing--;
183+
}
184+
185+
if (curr_abs_locus == 0) BetaSum=(paired?probSumH2:probSumH1);
186+
curr_segment_locus--;
187+
curr_abs_ambiguous -= amb;
188+
if (curr_segment_locus < 0 && curr_segment_index > 0) {
189+
curr_segment_index--;
190+
curr_segment_locus = G->Lengths[curr_segment_index] - 1;
191+
}
192+
}
193+
}
194+
195+
int haplotype_segment_double::expectation(vector < double > & transition_probabilities, vector < float > & missing_probabilities) {
196+
//cout << "ok1 " << n_cond_haps << endl;
197+
forward();
198+
//cout << "ok2" << endl;
199+
backward(missing_probabilities);
200+
//cout << "ok3" << endl;
201+
202+
unsigned int n_transitions = 0;
203+
if (!segment_first) {
204+
double sumHap = 0.0, sumDip = 0.0;
205+
n_transitions = G->countDiplotypes(G->Diplotypes[0]);
206+
for (int h = 0 ; h < HAP_NUMBER ; h ++) sumHap += BetaSum[h];
207+
vector < double > cprobs = vector < double > (n_transitions, 0.0);
208+
for (unsigned int d = 0, t = 0 ; d < 64 ; ++d) {
209+
if (DIP_GET(G->Diplotypes[0], d)) {
210+
cprobs[t] = (double)(BetaSum[DIP_HAP0(d)]/sumHap) * (double)(BetaSum[DIP_HAP1(d)]/sumHap);
211+
sumDip += cprobs[t];
212+
t++;
213+
}
214+
}
215+
for (unsigned int t = 0 ; t < n_transitions ; t ++) transition_probabilities[t] = (cprobs[t] / sumDip);
216+
curr_abs_transition += n_transitions;
217+
}
218+
219+
unsigned int curr_abs_transition = transition_first;
220+
unsigned int curr_dipcount = 0, prev_dipcount = G->countDiplotypes(G->Diplotypes[segment_first]);
221+
222+
curr_segment_index = segment_first;
223+
curr_segment_locus = 0;
224+
int n_underflow_recovered = 0;
225+
for (curr_abs_locus = locus_first ; curr_abs_locus <= locus_last ; curr_abs_locus ++) {
226+
curr_rel_locus = curr_abs_locus - locus_first;
227+
curr_rel_segment_index = curr_segment_index - segment_first;
228+
229+
if (curr_rel_locus != 0 && curr_segment_locus == 0) {
230+
if (TRANSH()) return -1;
231+
if (TRANSD(n_underflow_recovered)) return -2;
232+
curr_dipcount = G->countDiplotypes(G->Diplotypes[curr_segment_index]);
233+
n_transitions = curr_dipcount * prev_dipcount;
234+
double scaling = 1.0 / sumDProbs;
235+
236+
//Unrolling of: // for (int t = 0 ; t < n_transitions ; t ++) transition_probabilities[curr_abs_transition + t] = DProbs[t] * scaling;
237+
int t = 0, repeat = (n_transitions / 4), left = (n_transitions % 4);
238+
while (repeat --) {
239+
transition_probabilities[curr_abs_transition + t + 0] = DProbs[t+0] * scaling;
240+
transition_probabilities[curr_abs_transition + t + 1] = DProbs[t+1] * scaling;
241+
transition_probabilities[curr_abs_transition + t + 2] = DProbs[t+2] * scaling;
242+
transition_probabilities[curr_abs_transition + t + 3] = DProbs[t+3] * scaling;
243+
t += 4;
244+
}
245+
switch (left) {
246+
case 3: transition_probabilities[curr_abs_transition + t + 2] = DProbs[t+2] * scaling;
247+
case 2: transition_probabilities[curr_abs_transition + t + 1] = DProbs[t+1] * scaling;
248+
case 1: transition_probabilities[curr_abs_transition + t + 0] = DProbs[t+0] * scaling;
249+
}
250+
//Fin unrolling
251+
252+
253+
curr_abs_transition += n_transitions;
254+
prev_dipcount = curr_dipcount;
255+
}
256+
257+
curr_segment_locus ++;
258+
if (curr_segment_locus >= G->Lengths[curr_segment_index]) {
259+
curr_segment_index++;
260+
curr_segment_locus = 0;
261+
}
262+
}
263+
//cout << "ok4" << endl;
264+
return n_underflow_recovered;
265+
}
266+
267+
268+

0 commit comments

Comments
 (0)