42 #include <unordered_map>
59 <<
"BitMagic DNA Search Sample (c) 2018" << std::endl
60 <<
"-fa file-name -- input FASTA file" << std::endl
61 <<
"-s hi|lo -- run substring search benchmark" << std::endl
62 <<
"-diag -- run diagnostics" << std::endl
63 <<
"-timing -- collect timings" << std::endl
82 for (
int i = 1; i < argc; ++i)
84 std::string arg = argv[i];
85 if ((arg ==
"-h") || (arg ==
"--help"))
91 if (arg ==
"-fa" || arg ==
"--fa")
99 std::cerr <<
"Error: -fa requires file name" << std::endl;
105 if (arg ==
"-diag" || arg ==
"--diag" || arg ==
"-d" || arg ==
"--d")
107 if (arg ==
"-timing" || arg ==
"--timing" || arg ==
"-t" || arg ==
"--t")
109 if (arg ==
"-bench" || arg ==
"--bench" || arg ==
"-b" || arg ==
"--b")
111 if (arg ==
"-search" || arg ==
"--search" || arg ==
"-s" || arg ==
"--s")
116 std::string a = argv[i+1];
119 if (a ==
"l" || a ==
"lo")
125 if (a ==
"h" || a ==
"hi")
142 typedef std::vector<std::pair<unsigned, std::string> >
dict_vect;
152 int load_FASTA(
const std::string& fname, std::vector<char>& seq_vect)
157 std::ifstream fin(fname.c_str(), std::ios::in);
162 for (
unsigned i = 0; std::getline(fin, line); ++i)
168 for (std::string::iterator it = line.begin(); it != line.end(); ++it)
169 seq_vect.push_back(*it);
183 enum { eA = 0, eC, eG,
eT, eN, eEnd };
189 void Build(
const vector<char>& sequence)
199 for (
size_t i = 0; i < sequence.size(); ++i)
201 unsigned pos = unsigned(i);
231 return m_FPrintBV[eA];
233 return m_FPrintBV[eC];
235 return m_FPrintBV[eG];
237 return m_FPrintBV[eT];
239 return m_FPrintBV[eN];
243 throw runtime_error(
"Error. Invalid letter!");
250 void Find(
const string& word, vector<unsigned>& res)
257 for (
size_t i = 1; i < word.size(); ++i)
270 unsigned ws = unsigned(word.size()) - 1;
271 TranslateResults(bv, ws, res);
283 for (
size_t i = 0; i < word.size(); ++i)
292 m_Agg.combine_shift_right_and(bv);
295 unsigned ws = unsigned(word.size()) - 1;
296 TranslateResults(bv, ws, res);
303 vector<vector<unsigned>>& hits)
305 vector<unique_ptr<aggregator_type> > agg_pipeline;
308 for (
const auto& w : words)
313 const string& word = get<0>(w);
314 for (
size_t i = 0; i < word.size(); ++i)
317 agg_ptr->add(&bv_mask);
320 agg_pipeline.emplace_back(agg_ptr.release());
321 ws = unsigned(word.size()) - 1;
326 vector<unique_ptr<aggregator_type> >::iterator>(agg_pipeline.begin(), agg_pipeline.end());
329 for (
size_t i = 0; i < agg_pipeline.size(); ++i)
333 vector<unsigned> res;
335 TranslateResults(*bv, ws, res);
336 hits.emplace_back(res);
346 vector<unsigned>& res)
349 for (; en.
valid(); ++en)
352 res.push_back(pos - left_shift);
368 vector<tuple<string,int>>& lo_words,
369 const vector<char>& data,
373 cout <<
"k-mer generation... " << endl;
378 if (data.size() < word_size)
381 size_t end_pos = data.size() - word_size;
383 map<string, int> words;
386 string s(&data[i], word_size);
387 if (s.find(
'N') == string::npos)
392 cout <<
"\r" << i <<
"/" << end_pos << flush;
396 cout << endl <<
"Picking k-mer samples..." << flush;
398 multimap<int,string, greater<int>> dst;
399 for_each(words.begin(), words.end(), [&](
const std::pair<string,int>& p)
401 dst.emplace(p.second, p.first);
404 auto it = dst.begin();
405 for(
size_t count = 0; count < N && it !=dst.end(); ++it,++count)
406 top_words.emplace_back(it->second, it->first);
410 auto it = dst.rbegin();
411 for(
size_t count = 0; count < N && it !=dst.rend(); ++it, ++count)
412 lo_words.emplace_back(it->second, it->first);
415 cout <<
"OK" << endl;
422 const char* word,
unsigned word_size,
425 if (data.size() < word_size)
429 size_t end_pos = data.size() - word_size;
433 for (
size_t j = i, k = 0, l = word_size - 1; l > k; ++j, ++k, --l)
435 if (data[j] != word[k] || data[i + l] != word[l])
442 r.push_back(
unsigned(i));
452 vector<const char*> words,
454 vector<vector<unsigned>>& hits)
456 if (data.size() < word_size)
460 size_t end_pos = data.size() - word_size;
461 size_t words_size = words.size();
464 for (
size_t idx = 0; idx < words_size; ++idx)
466 auto& word = words[idx];
468 for (
size_t j = i, k = 0, l = word_size - 1; l > k; ++j, ++k, --l)
470 if (data[j] != word[k] || data[i + l] != word[l])
478 hits[idx].push_back(
unsigned(i));
492 if (h1.size() != h2.size())
494 cerr <<
"size1 = " << h1.size() <<
" size2 = " << h2.size() << endl;
497 for (
size_t i = 0; i < h1.size(); ++i)
508 int main(
int argc,
char *argv[])
516 std::vector<char> seq_vect;
531 std::cout <<
"FASTA sequence size=" << seq_vect.size() << std::endl;
542 vector<tuple<string,int> > h_words;
543 vector<tuple<string,int> > l_words;
545 vector<tuple<string,int>>& words =
h_word_set ? h_words : l_words;
552 vector<THitList> word_hits;
553 vector<THitList> word_hits_agg;
561 vector<const char*> word_list;
562 for (
const auto& w : words)
564 word_list.push_back(get<0>(w).c_str());
566 word_hits.resize(words.size());
567 for_each(word_hits.begin(), word_hits.end(), [](
THitList& ht) {
587 for (
size_t word_idx = 0; word_idx < words.size(); ++ word_idx)
589 auto& word = get<0>(words[word_idx]);
595 word.c_str(),
unsigned(word.size()),
601 idx.
Find(word, hits2);
613 cout <<
"Mismatch ERROR for: " << word << endl;
619 cout <<
"Sigle pass mismatch ERROR for: " << word << endl;
623 cout << word_idx <<
": " << word <<
": " << hits1.size() <<
" hits " << endl;
631 std::cout << std::endl <<
"Performance:" << std::endl;
635 catch (std::exception& ex)
637 std::cerr <<
"Error:" << ex.what() << std::endl;