Skip to content

Commit de6d8d9

Browse files
authored
Merge pull request #10 from DecodeGenetics/checksum_from_cram
Checksum from cram
2 parents 1c98778 + 2fdb5d0 commit de6d8d9

790 files changed

Lines changed: 163163 additions & 148688 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

Makefile

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,24 +1,26 @@
11
# include SeqAn libraries, don't warn about MD5 deprecation
22
CXXFLAGS+=-I. -Wno-deprecated-declarations
33

4+
#include htslib by setting -I<path_to_htslib/include> and link to the htslib library
5+
HTSDIR=<path_to_htsdir>
6+
CXXFLAGS+=-I$(HTSDIR)/include
47

5-
# RELEASE build
6-
CXXFLAGS+= -O3 -DSEQAN_ENABLE_TESTING=0 -DSEQAN_ENABLE_DEBUG=0 -DSEQAN_HAS_ZLIB=1
7-
LDLIBS=-lz -lssl -lcrypto
8-
8+
# RELEASE build
9+
CXX=g++ -std=c++11 -pthread
10+
CXXFLAGS+= -O3 -DSEQAN_ENABLE_TESTING=0 -DSEQAN_ENABLE_DEBUG=0 -DSEQAN_HAS_ZLIB=1
11+
LDFLAGS=-L$(HTSDIR)/lib -lz -lssl -lcrypto -Wl,-rpath,$(HTSDIR)/lib -lhts
912

1013
TARGET = bamhash_checksum_bam bamhash_checksum_fastq bamhash_checksum_fasta
1114
all: $(TARGET)
1215

1316
bamhash_checksum_bam: bamhash_checksum_common.o bamhash_checksum_bam.o
14-
$(CXX) $(LDFLAGS) -o $@ $^ $(LDLIBS)
17+
$(CXX) $(LDFLAGS) -o $@ $^
1518

1619
bamhash_checksum_fastq: bamhash_checksum_common.o bamhash_checksum_fastq.o
17-
$(CXX) $(LDFLAGS) -o $@ $^ $(LDLIBS)
20+
$(CXX) $(LDFLAGS) -o $@ $^
1821

1922
bamhash_checksum_fasta: bamhash_checksum_common.o bamhash_checksum_fasta.o
20-
$(CXX) $(LDFLAGS) -o $@ $^ $(LDLIBS)
21-
23+
$(CXX) $(LDFLAGS) -o $@ $^
2224

2325
clean:
24-
$(RM) *.o *~ $(TARGET)
26+
$(RM) *.o *~ $(TARGET)

README.md

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@ composed of the readname, whether it is first or last in pair, sequence and qual
88
All the hash values are summed up so the result is independent of the ordering within the files.
99
The result can be compared to verify that the pair of FASTQ files contain the same read
1010
information as the aligned BAM file.
11+
The program is written in C++ and uses SeqAnHTS v1.0 for parsing FASTQ, gzip compressed FASTQ and BAM files.
12+
SeqAnHTS is a fork of SeqAn library ( Döring etal. , 2008 ) that uses htslib to read SAM/BAM/CRAM files.
1113

1214
## Manuscript
1315

@@ -35,6 +37,7 @@ Both multiline FASTA and FASTQ are supported and gzipped input for FASTA and FAS
3537

3638
~~~
3739
bamhash_checksum_bam [OPTIONS] <in.bam> <in2.bam> ...
40+
bamhash_checksum_bam [OPTIONS] -r <reference-file> <in.cram>
3841
~~~
3942

4043
processes a number of BAM files. BAM files are assumed to contain paired end reads. If you run with `--no-paired` it treats all reads as single end and displays a warning if any read is marked as "second in pair" in the BAM file.
@@ -57,4 +60,8 @@ processes a number of FASTA files. All FASTA files are assumed to be single end
5760

5861
## Compiling
5962

60-
The only external dependency is on OpenSSL for the MD5 implementation.
63+
External dependencies are on:
64+
OpenSSL for the MD5 implementation
65+
htslib library (version 1.9)
66+
67+

bamhash_checksum_bam.cpp

Lines changed: 119 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
1+
#include "htslib/hts.h"
2+
#include <seqan/bam_io.h>
13
#include <iostream>
24
#include <seqan/stream.h>
3-
#include <seqan/bam_io.h>
45
#include <stdio.h>
56
#include <stdlib.h>
67
#include <string>
@@ -9,26 +10,29 @@
910
#include <stdint.h>
1011
#include <vector>
1112
#include <seqan/arg_parse.h>
13+
#include <seqan/hts_io.h>
1214

1315

14-
#include "bamhash_checksum_common.h"
1516

17+
#include "bamhash_checksum_common.h"
1618

17-
/** only needed for seqan 1.4.1 and lower
18-
inline bool
19-
hasFlagSupplementary(seqan::BamAlignmentRecord const & record)
20-
{
21-
return (record.flag & 0x0800) == 0x0800;
22-
}
23-
*/
2419
struct Baminfo {
2520
std::vector<std::string> bamfiles;
2621
bool debug;
2722
bool noReadNames;
2823
bool noQuality;
2924
bool paired;
25+
seqan::CharString reference;
3026

31-
Baminfo() : debug(false), noReadNames(false), noQuality(false), paired(true) {}
27+
Baminfo() : debug(false), noReadNames(false), noQuality(false), paired(true), reference("") {}
28+
29+
};
30+
31+
struct Counts {
32+
uint64_t sum;
33+
uint64_t count;
34+
35+
Counts() : sum(0), count(0) {}
3236

3337
};
3438

@@ -38,24 +42,27 @@ parseCommandLine(Baminfo& options, int argc, char const **argv) {
3842
seqan::ArgumentParser parser("bamhash_checksum_bam");
3943
//readlink("/proc/self/exe", options.bindir, sizeof(options.bindir)-1);
4044

41-
setShortDescription(parser, "Checksum of a bam file"); //TODO change description
45+
setShortDescription(parser, "Checksum of a sam, bam or cram file"); //TODO change description
4246
setVersion(parser, BAMHASH_VERSION);
43-
setDate(parser, "May 2015");
47+
setDate(parser, "Oct 2018");
4448

4549
addUsageLine(parser, "[\\fIOPTIONS\\fP] \\fI<in.bam> <in2.bam> ...\\fP");
4650
addDescription(parser, "Program for checksum of sequence reads. ");
4751

48-
addArgument(parser, seqan::ArgParseArgument(seqan::ArgParseArgument::INPUTFILE,"bamfile", "False", 1));
49-
50-
setValidValues(parser, 0,"bam sam");
52+
addArgument(parser, seqan::ArgParseArgument(seqan::ArgParseArgument::INPUT_FILE,"bamfile", "False", 1));
53+
setValidValues(parser, 0,"sam bam cram");
5154

5255
addSection(parser, "Options");
5356

5457
//add debug option:
5558
addOption(parser, seqan::ArgParseOption("d", "debug", "Debug mode. Prints full hex for each read to stdout"));
5659
addOption(parser, seqan::ArgParseOption("R", "no-readnames", "Do not use read names as part of checksum"));
5760
addOption(parser, seqan::ArgParseOption("Q", "no-quality", "Do not use read quality as part of checksum"));
58-
addOption(parser, seqan::ArgParseOption("P", "no-paired", "Bam files were not generated with paired-end reads"));
61+
addOption(parser, seqan::ArgParseOption("P", "no-paired", "Cram files were not generated with paired-end reads"));
62+
addOption(parser, seqan::ArgParseOption("r", "reference-file", "Path to reference-file if reference not given in header",
63+
seqan::ArgParseArgument::INPUT_FILE));
64+
65+
setValidValues(parser, "reference-file", "fa");
5966

6067
// Parse command line.
6168
seqan::ArgumentParser::ParseResult res = seqan::parse(parser, argc, argv);
@@ -67,13 +74,76 @@ parseCommandLine(Baminfo& options, int argc, char const **argv) {
6774
options.noReadNames = isSet(parser, "no-readnames");
6875
options.noQuality = isSet(parser, "no-quality");
6976
options.paired = !isSet(parser, "no-paired");
77+
getOptionValue(options.reference, parser, "reference-file");
7078

7179
options.bamfiles = getArgumentValues(parser, 0);
7280

7381
return seqan::ArgumentParser::PARSE_OK;
7482
}
7583

7684

85+
// -----------------------------------------------------------------------------
86+
// FUNCTION getSampleIdAndLaneNames()
87+
// -----------------------------------------------------------------------------
88+
89+
void getLaneNames(std::map<seqan::CharString, unsigned> & laneNames, std::string const & header)
90+
{
91+
for (int i = 0; i < header.size(); /*empty on purpose*/)
92+
{
93+
auto hdr_find_it = std::find(header.begin() + i, header.end(), '\n');
94+
std::string line = header.substr(i, hdr_find_it - header.begin() - i);
95+
96+
if (line.size() > 7 && line[0] == '@' && line[1] == 'R' && line[2] == 'G' && line[3] == '\t')
97+
{
98+
for (int j = 0; j < static_cast<int>(line.size()); /*empty on purpose*/)
99+
{
100+
auto line_find_it = std::find(line.begin() + j, line.end(), '\t');
101+
std::string field = line.substr(j, line_find_it - line.begin() - j);
102+
103+
if (field.size() > 3 && field[0] == 'I' && field[1] == 'D' && field[2] == ':')
104+
{
105+
seqan::CharString read_group_name = field.substr(3);
106+
int read_group_index = laneNames.size();
107+
laneNames[read_group_name] = read_group_index;
108+
}
109+
110+
j = std::distance(line.begin(), line_find_it) + 1;
111+
}
112+
}
113+
114+
i = std::distance(header.begin(), hdr_find_it) + 1;
115+
}
116+
}
117+
118+
// -----------------------------------------------------------------------------
119+
// FUNCTION getLane()
120+
// -----------------------------------------------------------------------------
121+
122+
int getLane(seqan::BamAlignmentRecord & record,
123+
seqan::BamTagsDict & tagsDict,
124+
std::map<seqan::CharString, unsigned> & laneNames)
125+
{
126+
unsigned tagIdx = 0;
127+
128+
if (!seqan::findTagKey(tagIdx, tagsDict, "RG"))
129+
{
130+
std::cerr << "ERROR: Found a read with a missing read group (RG) tag\n";
131+
return -1;
132+
}
133+
134+
seqan::CharString read_group;
135+
136+
if(!seqan::extractTagValue(read_group, tagsDict, tagIdx))
137+
{
138+
std::cerr << "ERROR: Failed to extract read group (RG) tag value\n";
139+
return -1;
140+
}
141+
142+
return laneNames[read_group];
143+
}
144+
145+
146+
77147
int main(int argc, char const **argv) {
78148

79149
Baminfo info; // Define structure variable
@@ -83,58 +153,48 @@ int main(int argc, char const **argv) {
83153
return res == seqan::ArgumentParser::PARSE_ERROR;
84154
}
85155

86-
uint64_t sum = 0;
87-
uint64_t count = 0;
156+
//Moving below to struct
157+
// uint64_t sum = 0;
158+
// uint64_t count = 0;
88159
bool pairedWarning = false;
89160

90-
161+
//adding new stuff
162+
std::map<seqan::CharString, unsigned> laneNames;
163+
// Initialize all counts for each lane.
164+
seqan::String<Counts> counts;
91165

92166
for (int i = 0; i < info.bamfiles.size(); i++) {
93-
// Open BGZF Stream for reading.
94-
seqan::Stream<seqan::Bgzf> inStream;
95167

96168
const char* bamfile = info.bamfiles[i].c_str();
97-
98-
if (!open(inStream, bamfile, "r")) {
99-
std::cerr << "ERROR: Could not open " << bamfile << " for reading.\n";
100-
return 1;
101-
}
169+
const char* reference = toCString(info.reference);
102170

103-
// Setup name store, cache, and BAM I/O context.
104-
typedef seqan::StringSet<seqan::CharString> TNameStore;
105-
typedef seqan::NameStoreCache<TNameStore> TNameStoreCache;
106-
typedef seqan::BamIOContext<TNameStore> TBamIOContext;
107-
TNameStore nameStore;
108-
TNameStoreCache nameStoreCache(nameStore);
109-
TBamIOContext context(nameStore, nameStoreCache);
110-
111-
// Read header.
112-
seqan::BamHeader header;
113-
if (readRecord(header, context, inStream, seqan::Bam()) != 0) {
114-
std::cerr << "ERROR: Could not read header from BAM file " << bamfile << "\n";
115-
return 1;
116-
}
117-
seqan::clear(header);
171+
// Open stream for reading
172+
seqan::HtsFile inStream(bamfile, "r", reference);
173+
174+
// Initialize lane names (read groups).
175+
std::string header(inStream.hdr->text, inStream.hdr->l_text);
176+
getLaneNames(laneNames, header);
177+
unsigned lanecount = laneNames.size();
178+
resize(counts, lanecount);
118179

119180
// Define:
120181
seqan::BamAlignmentRecord record;
121182
seqan::CharString string2hash;
122-
//char hexCstr[33];
123183

124184
// Read record
125-
while (!atEnd(inStream)) {
126-
if (readRecord(record, context, inStream, seqan::Bam()) != 0) {
127-
std::cerr << "ERROR: Could not read record from BAM File " << bamfile << "\n";
128-
return 1;
129-
}
185+
while (seqan::readRecord(record, inStream)){
186+
seqan::BamTagsDict tagsDict(record.tags);
187+
int l = getLane(record, tagsDict, laneNames);
188+
if (l == -1) return 1;
189+
130190
// Check if flag: reverse complement and change record accordingly
131191
if (hasFlagRC(record)) {
132-
reverseComplement(record.seq);
133-
reverse(record.qual);
192+
seqan::reverseComplement(record.seq);
193+
seqan::reverse(record.qual);
134194
}
135195
// Check if flag: supplementary and exclude those
136196
if (!hasFlagSupplementary(record) && !hasFlagSecondary(record)) {
137-
count +=1;
197+
counts[l].count +=1;
138198
// Construct one string from record
139199
if (!info.noReadNames) {
140200
seqan::append(string2hash, record.qName);
@@ -143,7 +203,7 @@ int main(int argc, char const **argv) {
143203
seqan::append(string2hash, "/2");
144204
} else {
145205
if (!pairedWarning) {
146-
std::cerr << "WARNING: BamHash was run with --no-paired mode, but BAM file has reads marked as second pair" << std::endl;
206+
std::cerr << "WARNING: seqread was run with --no-paired mode, but BAM file has reads marked as second pair" << std::endl;
147207
pairedWarning = true;
148208
}
149209
seqan::append(string2hash, "/1");
@@ -162,26 +222,25 @@ int main(int argc, char const **argv) {
162222
// Get MD5 hash
163223
hash_t hex = str2md5(toCString(string2hash), length(string2hash));
164224

165-
166225
if (info.debug) {
167226
std::cout << string2hash << " " << std::hex << hex.p.low << "\n";
168227
} else {
169-
hexSum(hex, sum);
228+
hexSum(hex, counts[l].sum);
170229
}
171230

172231
seqan::clear(string2hash);
173232
}
174233
}
175-
176-
// print result
177-
178234
}
179235

180236
if (!info.debug) {
181-
std::cout << std::hex << sum << "\t";
182-
std::cout << std::dec << count << "\n";
237+
for (std::map<seqan::CharString, unsigned>::iterator it = laneNames.begin(); it != laneNames.end(); ++it) {
238+
std::cout << it->first << "\t";
239+
int lid = it->second;
240+
std::cout << std::hex << counts[lid].sum << "\t";
241+
std::cout << std::dec << counts[lid].count << "\n";
242+
}
183243
}
184-
244+
185245
return 0;
186246
}
187-

bamhash_checksum_common.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
#ifndef BAMHASH_CHECKSUM_COMMON_H
22
#define BAMHASH_CHECKSUM_COMMON_H
33

4-
#define BAMHASH_VERSION "1.1"
4+
#define BAMHASH_VERSION "1.3"
55

66
#include <string>
77
#include <stdint.h>

0 commit comments

Comments
 (0)