Raptor 3.0.0-rc.1
A fast and space-efficient pre-filter for querying very large collections of nucleotide sequences
 
file_reader.hpp
Go to the documentation of this file.
1// --------------------------------------------------------------------------------------------------
2// Copyright (c) 2006-2023, Knut Reinert & Freie Universität Berlin
3// Copyright (c) 2016-2023, Knut Reinert & MPI für molekulare Genetik
4// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
5// shipped with this file and also available at: https://github.com/seqan/raptor/blob/main/LICENSE.md
6// --------------------------------------------------------------------------------------------------
7
13#pragma once
14
15#include <seqan3/io/sequence_file/input.hpp>
16#include <seqan3/search/views/minimiser_hash.hpp>
17
20
21namespace raptor
22{
23
// Tag that selects which file_reader specialisation is used.
enum class file_types
{
    // Handled by file_reader<file_types::sequence>: input is read through seqan3::sequence_file_input.
    sequence = 0,
    // Handled by file_reader<file_types::minimiser>: input is read as raw binary uint64_t values.
    minimiser = 1
};
29
30template <file_types file_type>
32{};
33
34template <>
35class file_reader<file_types::sequence>
36{
37public:
38 file_reader() = default;
39 file_reader(file_reader const &) = default;
40 file_reader(file_reader &&) = default; // GCOVR_EXCL_LINE
41 file_reader & operator=(file_reader const &) = default;
42 file_reader & operator=(file_reader &&) = default;
43 ~file_reader() = default;
44
45 explicit file_reader(seqan3::shape const shape, uint32_t const window_size) :
46 minimiser_view{seqan3::views::minimiser_hash(shape,
47 seqan3::window_size{window_size},
48 seqan3::seed{adjust_seed(shape.count())})}
49 {}
50
51 template <std::output_iterator<uint64_t> it_t>
52 void hash_into(std::vector<std::string> const & filenames, it_t target) const
53 {
54 for (auto && filename : filenames)
55 hash_into(filename, target);
56 }
57
58 template <std::output_iterator<uint64_t> it_t>
59 void hash_into(std::string const & filename, it_t target) const
60 {
61 sequence_file_t fin{filename};
62 for (auto && record : fin)
63 std::ranges::copy(record.sequence() | minimiser_view, target);
64 }
65
66 template <std::output_iterator<uint64_t> it_t>
67 void hash_into_if(std::vector<std::string> const & filenames, it_t target, auto && pred) const
68 {
69 for (auto && filename : filenames)
70 hash_into_if(filename, target, pred);
71 }
72
73 template <std::output_iterator<uint64_t> it_t>
74 void hash_into_if(std::string const & filename, it_t target, auto && pred) const
75 {
76 sequence_file_t fin{filename};
77 for (auto && record : fin)
78 std::ranges::copy_if(record.sequence() | minimiser_view, target, pred);
79 }
80
81 void on_hash(std::vector<std::string> const & filenames, auto && callback) const
82 {
83 for (auto && filename : filenames)
84 on_hash(filename, callback);
85 }
86
87 void on_hash(std::string const & filename, auto && callback) const
88 {
89 sequence_file_t fin{filename};
90 for (auto && record : fin)
91 callback(record.sequence() | minimiser_view);
92 }
93
94 void for_each_hash(std::vector<std::string> const & filenames, auto && callback) const
95 {
96 for (auto && filename : filenames)
97 for_each_hash(filename, callback);
98 }
99
100 void for_each_hash(std::string const & filename, auto && callback) const
101 {
102 sequence_file_t fin{filename};
103 for (auto && record : fin)
104 std::ranges::for_each(record.sequence() | minimiser_view, callback);
105 }
106
107private:
108 using sequence_file_t = seqan3::sequence_file_input<dna4_traits, seqan3::fields<seqan3::field::seq>>;
109 using view_t = decltype(seqan3::views::minimiser_hash(seqan3::shape{}, seqan3::window_size{}, seqan3::seed{}));
110 view_t minimiser_view = seqan3::views::minimiser_hash(seqan3::shape{}, seqan3::window_size{}, seqan3::seed{});
111};
112
113template <>
114class file_reader<file_types::minimiser>
115{
116public:
117 file_reader() = default;
118 file_reader(file_reader const &) = default;
119 file_reader(file_reader &&) = default;
120 file_reader & operator=(file_reader const &) = default;
121 file_reader & operator=(file_reader &&) = default;
122 ~file_reader() = default;
123
124 explicit file_reader(seqan3::shape const, uint32_t const)
125 {}
126
127 template <std::output_iterator<uint64_t> it_t>
128 void hash_into(std::vector<std::string> const & filenames, it_t target) const
129 {
130 for (auto && filename : filenames)
131 hash_into(filename, target);
132 }
133
134 template <std::output_iterator<uint64_t> it_t>
135 void hash_into(std::string const & filename, it_t target) const
136 {
137 std::ifstream fin{filename, std::ios::binary};
138 uint64_t value;
139 while (fin.read(reinterpret_cast<char *>(&value), sizeof(value)))
140 {
141 *target = value;
142 ++target;
143 }
144 }
145
146 template <std::output_iterator<uint64_t> it_t>
147 void hash_into_if(std::vector<std::string> const & filenames, it_t target, auto && pred) const
148 {
149 for (auto && filename : filenames)
150 hash_into_if(filename, target, pred);
151 }
152
153 template <std::output_iterator<uint64_t> it_t>
154 void hash_into_if(std::string const & filename, it_t target, auto && pred) const
155 {
156 std::ifstream fin{filename, std::ios::binary};
157 uint64_t value;
158 while (fin.read(reinterpret_cast<char *>(&value), sizeof(value)))
159 if (pred(value))
160 {
161 *target = value;
162 ++target;
163 }
164 }
165
166 void for_each_hash(std::vector<std::string> const & filenames, auto && callback) const
167 {
168 for (auto && filename : filenames)
169 for_each_hash(filename, callback);
170 }
171
172 void for_each_hash(std::string const & filename, auto && callback) const
173 {
174 std::ifstream fin{filename, std::ios::binary};
175 uint64_t value;
176 while (fin.read(reinterpret_cast<char *>(&value), sizeof(value)))
177 callback(value);
178 }
179};
180
181} // namespace raptor
Provides raptor::adjust_seed.
Definition: file_reader.hpp:32
T copy(T... args)
Provides raptor::dna4_traits.
T for_each(T... args)