1+ #include " htslib/hts.h"
2+ #include < seqan/bam_io.h>
13#include < iostream>
24#include < seqan/stream.h>
3- #include < seqan/bam_io.h>
45#include < stdio.h>
56#include < stdlib.h>
67#include < string>
910#include < stdint.h>
1011#include < vector>
1112#include < seqan/arg_parse.h>
13+ #include < seqan/hts_io.h>
1214
1315
14- #include " bamhash_checksum_common.h"
1516
17+ #include " bamhash_checksum_common.h"
1618
17- /* * only needed for seqan 1.4.1 and lower
18- inline bool
19- hasFlagSupplementary(seqan::BamAlignmentRecord const & record)
20- {
21- return (record.flag & 0x0800) == 0x0800;
22- }
23- */
2419struct Baminfo {
2520 std::vector<std::string> bamfiles;
2621 bool debug;
2722 bool noReadNames;
2823 bool noQuality;
2924 bool paired;
25+ seqan::CharString reference;
3026
31- Baminfo () : debug(false ), noReadNames(false ), noQuality(false ), paired(true ) {}
27+ Baminfo () : debug(false ), noReadNames(false ), noQuality(false ), paired(true ), reference(" " ) {}
28+
29+ };
30+
31+ struct Counts {
32+ uint64_t sum;
33+ uint64_t count;
34+
35+ Counts () : sum(0 ), count(0 ) {}
3236
3337};
3438
@@ -38,24 +42,27 @@ parseCommandLine(Baminfo& options, int argc, char const **argv) {
3842 seqan::ArgumentParser parser (" bamhash_checksum_bam" );
3943 // readlink("/proc/self/exe", options.bindir, sizeof(options.bindir)-1);
4044
41- setShortDescription (parser, " Checksum of a bam file" ); // TODO change description
45+ setShortDescription (parser, " Checksum of a sam, bam or cram file" ); // TODO change description
4246 setVersion (parser, BAMHASH_VERSION);
43- setDate (parser, " May 2015 " );
47+ setDate (parser, " Oct 2018 " );
4448
4549 addUsageLine (parser, " [\\ fIOPTIONS\\ fP] \\ fI<in.bam> <in2.bam> ...\\ fP" );
4650 addDescription (parser, " Program for checksum of sequence reads. " );
4751
48- addArgument (parser, seqan::ArgParseArgument (seqan::ArgParseArgument::INPUTFILE," bamfile" , " False" , 1 ));
49-
50- setValidValues (parser, 0 ," bam sam" );
52+ addArgument (parser, seqan::ArgParseArgument (seqan::ArgParseArgument::INPUT_FILE," bamfile" , " False" , 1 ));
53+ setValidValues (parser, 0 ," sam bam cram" );
5154
5255 addSection (parser, " Options" );
5356
5457 // add debug option:
5558 addOption (parser, seqan::ArgParseOption (" d" , " debug" , " Debug mode. Prints full hex for each read to stdout" ));
5659 addOption (parser, seqan::ArgParseOption (" R" , " no-readnames" , " Do not use read names as part of checksum" ));
5760 addOption (parser, seqan::ArgParseOption (" Q" , " no-quality" , " Do not use read quality as part of checksum" ));
58- addOption (parser, seqan::ArgParseOption (" P" , " no-paired" , " Bam files were not generated with paired-end reads" ));
61+ addOption (parser, seqan::ArgParseOption (" P" , " no-paired" , " Cram files were not generated with paired-end reads" ));
62+ addOption (parser, seqan::ArgParseOption (" r" , " reference-file" , " Path to reference-file if reference not given in header" ,
63+ seqan::ArgParseArgument::INPUT_FILE));
64+
65+ setValidValues (parser, " reference-file" , " fa" );
5966
6067 // Parse command line.
6168 seqan::ArgumentParser::ParseResult res = seqan::parse (parser, argc, argv);
@@ -67,13 +74,76 @@ parseCommandLine(Baminfo& options, int argc, char const **argv) {
6774 options.noReadNames = isSet (parser, " no-readnames" );
6875 options.noQuality = isSet (parser, " no-quality" );
6976 options.paired = !isSet (parser, " no-paired" );
77+ getOptionValue (options.reference , parser, " reference-file" );
7078
7179 options.bamfiles = getArgumentValues (parser, 0 );
7280
7381 return seqan::ArgumentParser::PARSE_OK;
7482}
7583
7684
85+ // -----------------------------------------------------------------------------
86+ // FUNCTION getSampleIdAndLaneNames()
87+ // -----------------------------------------------------------------------------
88+
89+ void getLaneNames (std::map<seqan::CharString, unsigned > & laneNames, std::string const & header)
90+ {
91+ for (int i = 0 ; i < header.size (); /* empty on purpose*/ )
92+ {
93+ auto hdr_find_it = std::find (header.begin () + i, header.end (), ' \n ' );
94+ std::string line = header.substr (i, hdr_find_it - header.begin () - i);
95+
96+ if (line.size () > 7 && line[0 ] == ' @' && line[1 ] == ' R' && line[2 ] == ' G' && line[3 ] == ' \t ' )
97+ {
98+ for (int j = 0 ; j < static_cast <int >(line.size ()); /* empty on purpose*/ )
99+ {
100+ auto line_find_it = std::find (line.begin () + j, line.end (), ' \t ' );
101+ std::string field = line.substr (j, line_find_it - line.begin () - j);
102+
103+ if (field.size () > 3 && field[0 ] == ' I' && field[1 ] == ' D' && field[2 ] == ' :' )
104+ {
105+ seqan::CharString read_group_name = field.substr (3 );
106+ int read_group_index = laneNames.size ();
107+ laneNames[read_group_name] = read_group_index;
108+ }
109+
110+ j = std::distance (line.begin (), line_find_it) + 1 ;
111+ }
112+ }
113+
114+ i = std::distance (header.begin (), hdr_find_it) + 1 ;
115+ }
116+ }
117+
118+ // -----------------------------------------------------------------------------
119+ // FUNCTION getLane()
120+ // -----------------------------------------------------------------------------
121+
122+ int getLane (seqan::BamAlignmentRecord & record,
123+ seqan::BamTagsDict & tagsDict,
124+ std::map<seqan::CharString, unsigned > & laneNames)
125+ {
126+ unsigned tagIdx = 0 ;
127+
128+ if (!seqan::findTagKey (tagIdx, tagsDict, " RG" ))
129+ {
130+ std::cerr << " ERROR: Found a read with a missing read group (RG) tag\n " ;
131+ return -1 ;
132+ }
133+
134+ seqan::CharString read_group;
135+
136+ if (!seqan::extractTagValue (read_group, tagsDict, tagIdx))
137+ {
138+ std::cerr << " ERROR: Failed to extract read group (RG) tag value\n " ;
139+ return -1 ;
140+ }
141+
142+ return laneNames[read_group];
143+ }
144+
145+
146+
77147int main (int argc, char const **argv) {
78148
79149 Baminfo info; // Define structure variable
@@ -83,58 +153,48 @@ int main(int argc, char const **argv) {
83153 return res == seqan::ArgumentParser::PARSE_ERROR;
84154 }
85155
86- uint64_t sum = 0 ;
87- uint64_t count = 0 ;
156+ // Moving below to struct
157+ // uint64_t sum = 0;
158+ // uint64_t count = 0;
88159 bool pairedWarning = false ;
89160
90-
161+ // adding new stuff
162+ std::map<seqan::CharString, unsigned > laneNames;
163+ // Initialize all counts for each lane.
164+ seqan::String<Counts> counts;
91165
92166 for (int i = 0 ; i < info.bamfiles .size (); i++) {
93- // Open BGZF Stream for reading.
94- seqan::Stream<seqan::Bgzf> inStream;
95167
96168 const char * bamfile = info.bamfiles [i].c_str ();
97-
98- if (!open (inStream, bamfile, " r" )) {
99- std::cerr << " ERROR: Could not open " << bamfile << " for reading.\n " ;
100- return 1 ;
101- }
169+ const char * reference = toCString (info.reference );
102170
103- // Setup name store, cache, and BAM I/O context.
104- typedef seqan::StringSet<seqan::CharString> TNameStore;
105- typedef seqan::NameStoreCache<TNameStore> TNameStoreCache;
106- typedef seqan::BamIOContext<TNameStore> TBamIOContext;
107- TNameStore nameStore;
108- TNameStoreCache nameStoreCache (nameStore);
109- TBamIOContext context (nameStore, nameStoreCache);
110-
111- // Read header.
112- seqan::BamHeader header;
113- if (readRecord (header, context, inStream, seqan::Bam ()) != 0 ) {
114- std::cerr << " ERROR: Could not read header from BAM file " << bamfile << " \n " ;
115- return 1 ;
116- }
117- seqan::clear (header);
171+ // Open stream for reading
172+ seqan::HtsFile inStream (bamfile, " r" , reference);
173+
174+ // Initialize lane names (read groups).
175+ std::string header (inStream.hdr ->text , inStream.hdr ->l_text );
176+ getLaneNames (laneNames, header);
177+ unsigned lanecount = laneNames.size ();
178+ resize (counts, lanecount);
118179
119180 // Define:
120181 seqan::BamAlignmentRecord record;
121182 seqan::CharString string2hash;
122- // char hexCstr[33];
123183
124184 // Read record
125- while (! atEnd ( inStream)) {
126- if ( readRecord (record, context, inStream, seqan::Bam ()) != 0 ) {
127- std::cerr << " ERROR: Could not read record from BAM File " << bamfile << " \n " ;
128- return 1 ;
129- }
185+ while (seqan::readRecord (record, inStream)){
186+ seqan::BamTagsDict tagsDict (record. tags );
187+ int l = getLane ( record, tagsDict, laneNames) ;
188+ if (l == - 1 ) return 1 ;
189+
130190 // Check if flag: reverse complement and change record accordingly
131191 if (hasFlagRC (record)) {
132- reverseComplement (record.seq );
133- reverse (record.qual );
192+ seqan:: reverseComplement (record.seq );
193+ seqan:: reverse (record.qual );
134194 }
135195 // Check if flag: supplementary and exclude those
136196 if (!hasFlagSupplementary (record) && !hasFlagSecondary (record)) {
137- count +=1 ;
197+ counts[l]. count +=1 ;
138198 // Construct one string from record
139199 if (!info.noReadNames ) {
140200 seqan::append (string2hash, record.qName );
@@ -143,7 +203,7 @@ int main(int argc, char const **argv) {
143203 seqan::append (string2hash, " /2" );
144204 } else {
145205 if (!pairedWarning) {
146- std::cerr << " WARNING: BamHash was run with --no-paired mode, but BAM file has reads marked as second pair" << std::endl;
206+ std::cerr << " WARNING: seqread was run with --no-paired mode, but BAM file has reads marked as second pair" << std::endl;
147207 pairedWarning = true ;
148208 }
149209 seqan::append (string2hash, " /1" );
@@ -162,26 +222,25 @@ int main(int argc, char const **argv) {
162222 // Get MD5 hash
163223 hash_t hex = str2md5 (toCString (string2hash), length (string2hash));
164224
165-
166225 if (info.debug ) {
167226 std::cout << string2hash << " " << std::hex << hex.p .low << " \n " ;
168227 } else {
169- hexSum (hex, sum);
228+ hexSum (hex, counts[l]. sum );
170229 }
171230
172231 seqan::clear (string2hash);
173232 }
174233 }
175-
176- // print result
177-
178234 }
179235
180236 if (!info.debug ) {
181- std::cout << std::hex << sum << " \t " ;
182- std::cout << std::dec << count << " \n " ;
237+ for (std::map<seqan::CharString, unsigned >::iterator it = laneNames.begin (); it != laneNames.end (); ++it) {
238+ std::cout << it->first << " \t " ;
239+ int lid = it->second ;
240+ std::cout << std::hex << counts[lid].sum << " \t " ;
241+ std::cout << std::dec << counts[lid].count << " \n " ;
242+ }
183243 }
184-
244+
185245 return 0 ;
186246}
187-
0 commit comments