SeqAn3  3.0.2
The Modern C++ library for sequence analysis.
format_genbank.hpp
Go to the documentation of this file.
1 // -----------------------------------------------------------------------------------------------------
2 // Copyright (c) 2006-2020, Knut Reinert & Freie Universität Berlin
3 // Copyright (c) 2016-2020, Knut Reinert & MPI für molekulare Genetik
4 // This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
5 // shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md
6 // -----------------------------------------------------------------------------------------------------
7 
13 #pragma once
14 
15 #include <iterator>
16 #include <string>
17 #include <string_view>
18 #include <vector>
19 
20 #include <range/v3/view/chunk.hpp>
21 
39 #include <seqan3/std/algorithm>
40 #include <seqan3/std/charconv>
41 #include <seqan3/std/ranges>
42 
43 namespace seqan3
44 {
45 
73 {
74 public:
// Constructors, destructor and assignment.
// All six special member functions are explicitly defaulted and declared noexcept.
78  format_genbank() noexcept = default;
79  format_genbank(format_genbank const &) noexcept = default;
80  format_genbank & operator=(format_genbank const &) noexcept = default;
81  format_genbank(format_genbank &&) noexcept = default;
82  format_genbank & operator=(format_genbank &&) noexcept = default;
83  ~format_genbank() noexcept = default;
85 
// The valid file extensions for this format; note that you can modify this value
// (it is a non-const static inline member).
87  static inline std::vector<std::string> file_extensions
88  {
89  { "genbank" },
90  { "gb" },
91  { "gbk" },
92  };
93 
94 protected:
96  template <typename stream_type, // constraints checked by file
97  typename seq_legal_alph_type, bool seq_qual_combined,
98  typename seq_type, // other constraints checked inside function
99  typename id_type,
100  typename qual_type>
101  void read_sequence_record(stream_type & stream,
103  seq_type & sequence,
104  id_type & id,
105  qual_type & SEQAN3_DOXYGEN_ONLY(qualities))
106  {
107  auto stream_view = views::istreambuf(stream);
108  auto stream_it = std::ranges::begin(stream_view);
109 
110  if (!(std::ranges::equal(stream_view | views::take_until_or_throw(is_cntrl || is_blank), std::string{"LOCUS"})))
111  throw parse_error{"An entry has to start with the code word LOCUS."};
112 
113  //ID
114  if constexpr (!detail::decays_to_ignore_v<id_type>)
115  {
116  if (options.embl_genbank_complete_header)
117  {
118  std::ranges::copy(std::string_view{"LOCUS"}, std::cpp20::back_inserter(id));
119 
120  while (!is_char<'O'>(*std::ranges::begin(stream_view)))
121  {
122  std::ranges::copy(stream_view | views::take_line_or_throw
123  | views::char_to<std::ranges::range_value_t<id_type>>,
124  std::cpp20::back_inserter(id));
125  id.push_back('\n');
126  }
127  }
128  else
129  {
130  detail::consume(stream_view | views::take_until(!is_blank));
131 
132  auto read_id_until = [&stream_view, &id] (auto predicate)
133  {
134  std::ranges::copy(stream_view | views::take_until_or_throw(predicate)
135  | views::char_to<std::ranges::range_value_t<id_type>>,
136  std::cpp20::back_inserter(id));
137  };
138 
139  if (options.truncate_ids)
140  read_id_until(is_space);
141  else
142  read_id_until(is_cntrl);
143 
144  detail::consume(stream_view | views::take_line_or_throw);
145  }
146  }
147 
148  // Jump to sequence
149  while (!(is_char<'O'>(*std::ranges::begin(stream_view)) || options.embl_genbank_complete_header))
150  detail::consume(stream_view | views::take_line_or_throw);
151 
152  // Sequence
153  detail::consume(stream_view | views::take_line_or_throw); // consume "ORIGIN"
154  auto constexpr is_end = is_char<'/'> ;
155  if constexpr (!detail::decays_to_ignore_v<seq_type>)
156  {
157  auto constexpr is_legal_alph = is_in_alphabet<seq_legal_alph_type>;
158  std::ranges::copy(stream_view | std::views::filter(!(is_space || is_digit))
159  | views::take_until_or_throw_and_consume(is_end) // consume "//"
160  | std::views::transform([is_legal_alph] (char const c) // enforce legal alphabet
161  {
162  if (!is_legal_alph(c))
163  {
164  throw parse_error{std::string{"Encountered an unexpected letter: "} +
165  is_legal_alph.msg +
166  " evaluated to false on " +
167  detail::make_printable(c)};
168  }
169  return c;
170  })
171  | views::char_to<std::ranges::range_value_t<seq_type>>, // convert to actual target alphabet
172  std::cpp20::back_inserter(sequence));
173  }
174  else
175  {
176  detail::consume(stream_view | views::take_until_or_throw_and_consume(is_end)); // consume until "//"
177  ++stream_it; // consume "/n"
178  }
179  }
180 
182  template <typename stream_type, // constraints checked by file
183  typename seq_type, // other constraints checked inside function
184  typename id_type,
185  typename qual_type>
186  void write_sequence_record(stream_type & stream,
187  sequence_file_output_options const & options,
188  seq_type && sequence,
189  id_type && id,
190  qual_type && SEQAN3_DOXYGEN_ONLY(qualities))
191  {
192  std::cpp20::ostreambuf_iterator stream_it{stream};
193  size_t sequence_size{0};
194  [[maybe_unused]] char buffer[50];
195  if constexpr (!detail::decays_to_ignore_v<seq_type>)
196  sequence_size = ranges::size(sequence);
197 
198  // ID
199  if constexpr (detail::decays_to_ignore_v<id_type>)
200  {
201  throw std::logic_error{"The ID field may not be set to ignore when writing genbank files."};
202  }
203  else if (ranges::empty(id)) //[[unlikely]]
204  {
205  throw std::runtime_error{"The ID field may not be empty when writing genbank files."};
206  }
207  else if (options.embl_genbank_complete_header)
208  {
209  std::ranges::copy(id, stream_it);
210  }
211  else
212  {
213  std::ranges::copy(std::string_view{"LOCUS "}, stream_it);
214  std::ranges::copy(id, stream_it);
215  std::ranges::copy(std::string_view{" "}, stream_it);
216  auto res = std::to_chars(&buffer[0], &buffer[0] + sizeof(buffer), sequence_size);
217  std::copy(&buffer[0], res.ptr, stream_it);
218  std::ranges::copy(std::string_view{" bp\n"}, stream_it);
219  }
220 
221  // Sequence
222  if constexpr (detail::decays_to_ignore_v<seq_type>) // sequence
223  {
224  throw std::logic_error{"The SEQ field may not be set to ignore when writing genbank files."};
225  }
226  else if (std::ranges::empty(sequence)) //[[unlikely]]
227  {
228  throw std::runtime_error{"The SEQ field may not be empty when writing genbank files."};
229  }
230  else
231  {
232  std::ranges::copy(std::string_view{"ORIGIN\n"}, stream_it);
233  auto seq = sequence | ranges::views::chunk(60);
234  size_t i = 0;
235  size_t bp = 1;
236 
237  while (bp < sequence_size)
238  {
239  // Sequence length with more than 9 digits are not possible in one genbank entry, maximal 350 kb are
240  // allowed. See: https://www.ncbi.nlm.nih.gov/Sitemap/samplerecord.html#SequenceLengthA
241  for (size_t j = std::to_string(bp).size(); j < 9; j++)
242  stream_it = ' ';
243  std::ranges::copy(std::to_string(bp), stream_it);
244  stream_it = ' ';
245  std::ranges::copy(seq[i] | views::to_char
246  | views::interleave(10, std::string_view{" "}), stream_it);
247  bp += 60;
248  ++i;
249  detail::write_eol(stream_it,false);
250  }
251  std::ranges::copy(std::string_view{"//"}, stream_it);
252  detail::write_eol(stream_it,false);
253  }
254  }
255 };
256 
257 } // namespace seqan3
Adaptations of algorithms from the Ranges TS.
Provides seqan3::views::char_to.
The GenBank format.
Definition: format_genbank.hpp:73
static std::vector< std::string > file_extensions
The valid file extensions for this format; note that you can modify this value.
Definition: format_genbank.hpp:88
void read_sequence_record(stream_type &stream, sequence_file_input_options< seq_legal_alph_type, seq_qual_combined > const &options, seq_type &sequence, id_type &id, qual_type &qualities)
Read from the specified stream and back-insert into the given field buffers.
Definition: format_genbank.hpp:101
format_genbank() noexcept=default
Defaulted.
void write_sequence_record(stream_type &stream, sequence_file_output_options const &options, seq_type &&sequence, id_type &&id, qual_type &&qualities)
Write the given fields to the specified stream.
Definition: format_genbank.hpp:186
T copy(T... args)
Provides seqan3::dna5, container aliases and string literals.
constexpr auto is_blank
Checks whether c is a blank character.
Definition: predicate.hpp:163
constexpr auto is_digit
Checks whether c is a digital character.
Definition: predicate.hpp:287
constexpr auto is_char
Checks whether a given letter is the same as the template non-type argument.
Definition: predicate.hpp:83
constexpr auto is_space
Checks whether c is a space character.
Definition: predicate.hpp:146
constexpr auto is_cntrl
Checks whether c is a control character.
Definition: predicate.hpp:110
constexpr sequenced_policy seq
Global execution policy object for sequenced execution policy.
Definition: execution.hpp:54
@ id
The identifier, usually a string.
constexpr size_t size
The size of a type pack.
Definition: traits.hpp:116
seqan3::type_list< trait_t< pack_t >... > transform
Apply a transformation trait to every type in the pack and return a seqan3::type_list of the results.
Definition: traits.hpp:307
auto const to_char
A view that calls seqan3::to_char() on each element in the input range.
Definition: to_char.hpp:65
constexpr auto take_until_or_throw
A view adaptor that returns elements from the underlying range until the functor evaluates to true (t...
Definition: take_until.hpp:624
constexpr auto istreambuf
A view factory that returns a view over the stream buffer of an input stream.
Definition: istreambuf.hpp:113
constexpr auto take_until
A view adaptor that returns elements from the underlying range until the functor evaluates to true (o...
Definition: take_until.hpp:610
auto const char_to
A view over an alphabet, given a range of characters.
Definition: char_to.hpp:69
constexpr auto take_until_or_throw_and_consume
A view adaptor that returns elements from the underlying range until the functor evaluates to true (t...
Definition: take_until.hpp:652
constexpr auto take_line_or_throw
A view adaptor that returns a single line from the underlying range (throws if there is no end-of-lin...
Definition: take_line.hpp:90
constexpr auto interleave
A view that interleaves a given range into another range at regular intervals.
Definition: interleave.hpp:384
The generic concept for a sequence.
Provides seqan3::views::interleave.
Provides various utility functions.
Provides seqan3::fast_istreambuf_iterator and seqan3::fast_ostreambuf_iterator, as well as,...
Provides seqan3::views::istreambuf.
The main SeqAn3 namespace.
Definition: aligned_sequence_concept.hpp:29
SeqAn specific customisations in the standard namespace.
Provides character predicates for tokenisation.
Provides various utility functions.
Provides various transformation traits used by the range module.
Adaptations of concepts from the Ranges TS.
Provides seqan3::sequence_file_input_format and auxiliary classes.
Provides seqan3::sequence_file_input_options.
Provides seqan3::sequence_file_output_format and auxiliary classes.
Provides seqan3::sequence_file_output_options.
Provides std::from_chars and std::to_chars if not defined in the stl <charconv> header.
Thrown if there is a parse error, such as reading an unexpected character from an input stream.
Definition: exception.hpp:48
The options type defines various option members that influence the behaviour of all or some formats.
Definition: input_options.hpp:26
bool embl_genbank_complete_header
Read the complete_header into the seqan3::field::id for embl or genbank format.
Definition: input_options.hpp:30
bool truncate_ids
Read the ID string only up until the first whitespace character.
Definition: input_options.hpp:28
The options type defines various option members that influence the behaviour of all or some formats.
Definition: output_options.hpp:22
bool embl_genbank_complete_header
Complete header given for embl or genbank.
Definition: output_options.hpp:42
Provides seqan3::views::take.
Provides seqan3::views::take_line and seqan3::views::take_line_or_throw.
Provides seqan3::views::take_until and seqan3::views::take_until_or_throw.
Provides seqan3::views::to_char.
T to_string(T... args)