Skip to content

Commit bb240f5

Browse files
committed
Added fasta program
1 parent fc70630 commit bb240f5

2 files changed

Lines changed: 153 additions & 1 deletion

File tree

Makefile

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ CXXFLAGS+= -O3 -DSEQAN_ENABLE_TESTING=0 -DSEQAN_ENABLE_DEBUG=0 -DSEQAN_HAS_ZLIB=
77
LDLIBS=-lz -lssl -lcrypto
88

99

10-
TARGET = bamhash_checksum_bam bamhash_checksum_fastq
10+
TARGET = bamhash_checksum_bam bamhash_checksum_fastq bamhash_checksum_fasta
1111
all: $(TARGET)
1212

1313
bamhash_checksum_bam: bamhash_checksum_common.o bamhash_checksum_bam.o
@@ -16,5 +16,9 @@ bamhash_checksum_bam: bamhash_checksum_common.o bamhash_checksum_bam.o
1616
bamhash_checksum_fastq: bamhash_checksum_common.o bamhash_checksum_fastq.o
1717
$(CXX) $(LDFLAGS) -o $@ $^ $(LDLIBS)
1818

19+
bamhash_checksum_fasta: bamhash_checksum_common.o bamhash_checksum_fasta.o
20+
$(CXX) $(LDFLAGS) -o $@ $^ $(LDLIBS)
21+
22+
1923
clean:
2024
$(RM) *.o *~ $(TARGET)

bamhash_checksum_fasta.cpp

Lines changed: 148 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,148 @@
1+
#include <iostream>
2+
#include <seqan/stream.h>
3+
#include <seqan/seq_io.h>
4+
#include <stdio.h>
5+
#include <stdlib.h>
6+
#include <string>
7+
#include <cmath>
8+
#include <cstdlib>
9+
#include <stdint.h>
10+
#include <vector>
11+
#include <seqan/arg_parse.h>
12+
13+
#include "bamhash_checksum_common.h"
14+
15+
struct Fastainfo {
16+
std::vector<std::string> fastafiles;
17+
bool debug;
18+
bool noReadNames;
19+
20+
Fastainfo() : debug(false), noReadNames(false) {}
21+
22+
};
23+
24+
seqan::ArgumentParser::ParseResult
25+
parseCommandLine(Fastainfo& options, int argc, char const **argv) {
26+
// Setup ArgumentParser.
27+
seqan::ArgumentParser parser("bamhash_checksum_fasta");
28+
29+
setShortDescription(parser, "Checksum of a set of fasta files");
30+
setVersion(parser, BAMHASH_VERSION);
31+
setDate(parser, "May 2015");
32+
33+
addUsageLine(parser, "[\\fIOPTIONS\\fP] \\fI<in1.fasta>\\fP [\\fIin2.fasta ... \\fP]");
34+
addDescription(parser, "Program for checksum of sequence reads. ");
35+
36+
addArgument(parser, seqan::ArgParseArgument(seqan::ArgParseArgument::INPUTFILE,"fastafiles", true));
37+
38+
setValidValues(parser, 0,"fa fa.gz fasta fasta.gz");
39+
40+
addSection(parser, "Options");
41+
//add debug option:
42+
addOption(parser, seqan::ArgParseOption("d", "debug", "Debug mode. Prints full hex for each read to stdout"));
43+
addOption(parser, seqan::ArgParseOption("R", "no-readnames", "Do not use read names as part of checksum"));
44+
45+
// Parse command line.
46+
seqan::ArgumentParser::ParseResult res = seqan::parse(parser, argc, argv);
47+
if (res != seqan::ArgumentParser::PARSE_OK) {
48+
return res;
49+
}
50+
51+
options.debug = seqan::isSet(parser, "debug");
52+
options.noReadNames = seqan::isSet(parser, "no-readnames");
53+
54+
55+
options.fastafiles = getArgumentValues(parser, 0);
56+
57+
58+
return seqan::ArgumentParser::PARSE_OK;
59+
}
60+
61+
62+
int main(int argc, char const **argv) {
63+
Fastainfo info; // Define structure variable
64+
seqan::ArgumentParser::ParseResult res = parseCommandLine(info, argc, argv); // Parse the command line.
65+
66+
if (res != seqan::ArgumentParser::PARSE_OK) {
67+
return res == seqan::ArgumentParser::PARSE_ERROR;
68+
}
69+
70+
// Define:
71+
uint64_t sum = 0;
72+
unsigned count = 0;
73+
seqan::StringSet<seqan::CharString> idSub;
74+
seqan::CharString string2hash;
75+
76+
seqan::CharString id;
77+
seqan::CharString seq;
78+
hash_t hex;
79+
80+
81+
// Open GZStream
82+
seqan::Stream<seqan::GZFile> gzStream;
83+
84+
85+
86+
for (int i = 0; i < info.fastafiles.size(); i++) {
87+
const char* fasta = info.fastafiles[i].c_str();
88+
89+
if (!open(gzStream, fasta, "r")) {
90+
std::cerr << "ERROR: Could not open the file: " << fasta << " for reading.\n";
91+
return 1;
92+
}
93+
94+
//Setup RecordReader for reading FASTA file from gzip-compressed file
95+
seqan::RecordReader<seqan::Stream<seqan::GZFile>, seqan::SinglePass<> > reader(gzStream);
96+
97+
98+
99+
// Read record
100+
while (!atEnd(reader)) {
101+
if (readRecord(id, seq, reader, seqan::Fasta()) != 0) {
102+
if (atEnd(reader)) {
103+
std::cerr << "WARNING: Could not continue reading " << fasta << " at line: " << count+1 << ".\n";
104+
return 1;
105+
}
106+
std::cerr << "ERROR: Could not read from " << fasta << "\n";
107+
return 1;
108+
}
109+
110+
count +=1;
111+
112+
113+
// cut away after first space
114+
seqan::strSplit(idSub, id, ' ', false, 1);
115+
116+
117+
if (!info.noReadNames) {
118+
seqan::append(string2hash, idSub[0]);
119+
seqan::append(string2hash, "/1"); // to be consistent with BAM and FASTQ
120+
}
121+
seqan::append(string2hash, seq);
122+
123+
// Get MD5 hash
124+
hex = str2md5(toCString(string2hash), length(string2hash));
125+
126+
if (info.debug) {
127+
std::cout << string2hash << " " << std::hex << hex.p.low << "\n";
128+
} else {
129+
hexSum(hex, sum);
130+
}
131+
132+
seqan::clear(string2hash);
133+
seqan::clear(idSub);
134+
135+
136+
}
137+
138+
}
139+
140+
if (!info.debug) {
141+
std::cout << std::hex << sum << "\t";
142+
std::cout << std::dec << count << "\n";
143+
}
144+
145+
146+
return 0;
147+
}
148+

0 commit comments

Comments
 (0)