Skip to content

Commit fc70630

Browse files
committed
Adds the option of including multiple fastq and bam files and running in non-paired-end mode
1 parent a7ccfe2 commit fc70630

3 files changed

Lines changed: 218 additions & 162 deletions

File tree

bamhash_checksum_bam.cpp

Lines changed: 93 additions & 69 deletions
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,11 @@
33
#include <seqan/bam_io.h>
44
#include <stdio.h>
55
#include <stdlib.h>
6-
#include <string.h>
6+
#include <string>
77
#include <cmath>
88
#include <cstdlib>
99
#include <stdint.h>
10+
#include <vector>
1011
#include <seqan/arg_parse.h>
1112

1213

@@ -21,14 +22,13 @@ hasFlagSupplementary(seqan::BamAlignmentRecord const & record)
2122
}
2223
*/
2324
struct Baminfo {
24-
char version[1024];
25-
char bindir[1024];
26-
seqan::CharString bamFile;
25+
std::vector<std::string> bamfiles;
2726
bool debug;
2827
bool noReadNames;
29-
bool noQuality;
28+
bool noQuality;
29+
bool paired;
3030

31-
Baminfo() : debug(false), noReadNames(false), noQuality(false) {}
31+
Baminfo() : debug(false), noReadNames(false), noQuality(false), paired(true) {}
3232

3333
};
3434

@@ -40,9 +40,9 @@ parseCommandLine(Baminfo& options, int argc, char const **argv) {
4040

4141
setShortDescription(parser, "Checksum of a bam file"); //TODO change description
4242
setVersion(parser, BAMHASH_VERSION);
43-
setDate(parser, "Feb 2015");
43+
setDate(parser, "May 2015");
4444

45-
addUsageLine(parser, "[\\fIOPTIONS\\fP] \\fI<in.bam>\\fP");
45+
addUsageLine(parser, "[\\fIOPTIONS\\fP] \\fI<in.bam> <in2.bam> ...\\fP");
4646
addDescription(parser, "Program for checksum of sequence reads. ");
4747

4848
addArgument(parser, seqan::ArgParseArgument(seqan::ArgParseArgument::INPUTFILE,"bamfile", "False", 1));
@@ -55,6 +55,7 @@ parseCommandLine(Baminfo& options, int argc, char const **argv) {
5555
addOption(parser, seqan::ArgParseOption("d", "debug", "Debug mode. Prints full hex for each read to stdout"));
5656
addOption(parser, seqan::ArgParseOption("R", "no-readnames", "Do not use read names as part of checksum"));
5757
addOption(parser, seqan::ArgParseOption("Q", "no-quality", "Do not use read quality as part of checksum"));
58+
addOption(parser, seqan::ArgParseOption("P", "no-paired", "Bam files were not generated with paired-end reads"));
5859

5960
// Parse command line.
6061
seqan::ArgumentParser::ParseResult res = seqan::parse(parser, argc, argv);
@@ -65,7 +66,9 @@ parseCommandLine(Baminfo& options, int argc, char const **argv) {
6566
options.debug = isSet(parser, "debug");
6667
options.noReadNames = isSet(parser, "no-readnames");
6768
options.noQuality = isSet(parser, "no-quality");
68-
getArgumentValue(options.bamFile, parser, 0);
69+
options.paired = !isSet(parser, "no-paired");
70+
71+
options.bamfiles = getArgumentValues(parser, 0);
6972

7073
return seqan::ArgumentParser::PARSE_OK;
7174
}
@@ -80,84 +83,105 @@ int main(int argc, char const **argv) {
8083
return res == seqan::ArgumentParser::PARSE_ERROR;
8184
}
8285

83-
// Open BGZF Stream for reading.
84-
seqan::Stream<seqan::Bgzf> inStream;
85-
if (!open(inStream, toCString(info.bamFile), "r")) {
86-
std::cerr << "ERROR: Could not open " << info.bamFile << " for reading.\n";
87-
return 1;
88-
}
89-
90-
// Setup name store, cache, and BAM I/O context.
91-
typedef seqan::StringSet<seqan::CharString> TNameStore;
92-
typedef seqan::NameStoreCache<TNameStore> TNameStoreCache;
93-
typedef seqan::BamIOContext<TNameStore> TBamIOContext;
94-
TNameStore nameStore;
95-
TNameStoreCache nameStoreCache(nameStore);
96-
TBamIOContext context(nameStore, nameStoreCache);
97-
98-
// Read header.
99-
seqan::BamHeader header;
100-
if (readRecord(header, context, inStream, seqan::Bam()) != 0) {
101-
std::cerr << "ERROR: Could not read header from BAM file " << info.bamFile << "\n";
102-
return 1;
103-
}
104-
seqan::clear(header);
105-
106-
// Define:
107-
seqan::BamAlignmentRecord record;
10886
uint64_t sum = 0;
10987
uint64_t count = 0;
110-
seqan::CharString string2hash;
111-
//char hexCstr[33];
88+
bool pairedWarning = false;
89+
90+
11291

113-
// Read record
114-
while (!atEnd(inStream)) {
115-
if (readRecord(record, context, inStream, seqan::Bam()) != 0) {
116-
std::cerr << "ERROR: Could not read record from BAM File " << info.bamFile << "\n";
92+
for (int i = 0; i < info.bamfiles.size(); i++) {
93+
// Open BGZF Stream for reading.
94+
seqan::Stream<seqan::Bgzf> inStream;
95+
96+
const char* bamfile = info.bamfiles[i].c_str();
97+
98+
if (!open(inStream, bamfile, "r")) {
99+
std::cerr << "ERROR: Could not open " << bamfile << " for reading.\n";
117100
return 1;
118101
}
119-
// Check if flag: reverse complement and change record accordingly
120-
if (hasFlagRC(record)) {
121-
reverseComplement(record.seq);
122-
reverse(record.qual);
102+
103+
// Setup name store, cache, and BAM I/O context.
104+
typedef seqan::StringSet<seqan::CharString> TNameStore;
105+
typedef seqan::NameStoreCache<TNameStore> TNameStoreCache;
106+
typedef seqan::BamIOContext<TNameStore> TBamIOContext;
107+
TNameStore nameStore;
108+
TNameStoreCache nameStoreCache(nameStore);
109+
TBamIOContext context(nameStore, nameStoreCache);
110+
111+
// Read header.
112+
seqan::BamHeader header;
113+
if (readRecord(header, context, inStream, seqan::Bam()) != 0) {
114+
std::cerr << "ERROR: Could not read header from BAM file " << bamfile << "\n";
115+
return 1;
123116
}
124-
// Check if flag: supplementary and exclude those
125-
if (!hasFlagSupplementary(record) && !hasFlagSecondary(record)) {
126-
count +=1;
127-
// Construct one string from record
128-
if (!info.noReadNames) {
129-
seqan::append(string2hash, record.qName);
130-
if(hasFlagLast(record)) {
131-
seqan::append(string2hash, "/2");
132-
} else {
133-
seqan::append(string2hash, "/1");
134-
}
117+
seqan::clear(header);
118+
119+
// Define:
120+
seqan::BamAlignmentRecord record;
121+
seqan::CharString string2hash;
122+
//char hexCstr[33];
123+
124+
// Read record
125+
while (!atEnd(inStream)) {
126+
if (readRecord(record, context, inStream, seqan::Bam()) != 0) {
127+
std::cerr << "ERROR: Could not read record from BAM File " << bamfile << "\n";
128+
return 1;
135129
}
136-
137-
seqan::append(string2hash, record.seq);
138-
if (!info.noQuality) {
139-
seqan::append(string2hash, record.qual);
130+
// Check if flag: reverse complement and change record accordingly
131+
if (hasFlagRC(record)) {
132+
reverseComplement(record.seq);
133+
reverse(record.qual);
140134
}
141-
seqan::clear(record);
135+
// Check if flag: supplementary and exclude those
136+
if (!hasFlagSupplementary(record) && !hasFlagSecondary(record)) {
137+
count +=1;
138+
// Construct one string from record
139+
if (!info.noReadNames) {
140+
seqan::append(string2hash, record.qName);
141+
if(hasFlagLast(record)) {
142+
if (info.paired) {
143+
seqan::append(string2hash, "/2");
144+
} else {
145+
if (!pairedWarning) {
146+
std::cerr << "WARNING: BamHash was run with --no-paired mode, but BAM file has reads marked as second pair" << std::endl;
147+
pairedWarning = true;
148+
}
149+
seqan::append(string2hash, "/1");
150+
}
151+
} else {
152+
seqan::append(string2hash, "/1");
153+
}
154+
}
155+
156+
seqan::append(string2hash, record.seq);
157+
if (!info.noQuality) {
158+
seqan::append(string2hash, record.qual);
159+
}
160+
seqan::clear(record);
161+
162+
// Get MD5 hash
163+
hash_t hex = str2md5(toCString(string2hash), length(string2hash));
142164

143-
// Get MD5 hash
144-
hash_t hex = str2md5(toCString(string2hash), length(string2hash));
145-
seqan::clear(string2hash);
146165

147-
if (info.debug) {
148-
std::cout << std::hex << hex.p.low << "\n";
149-
} else {
150-
hexSum(hex, sum);
166+
if (info.debug) {
167+
std::cout << string2hash << " " << std::hex << hex.p.low << "\n";
168+
} else {
169+
hexSum(hex, sum);
170+
}
171+
172+
seqan::clear(string2hash);
151173
}
152174
}
175+
176+
// print result
177+
153178
}
154179

155-
// print result
156180
if (!info.debug) {
157181
std::cout << std::hex << sum << "\t";
158182
std::cout << std::dec << count << "\n";
159183
}
160-
184+
161185
return 0;
162186
}
163187

0 commit comments

Comments
 (0)