RESTinio
utf8_checker.hpp
Go to the documentation of this file.
1 /*
2  * RESTinio
3  */
4 
5 /*!
6  * @file
7  * @brief An implementation of checker for UTF-8 sequences.
8  *
9  * @since v.0.6.5
10  */
11 
12 #pragma once
13 
14 #include <restinio/compiler_features.hpp>
15 
16 #include <cstdint>
17 
18 namespace restinio
19 {
20 
21 namespace utils
22 {
23 
24 //
25 // utf8_checker_t
26 //
27 
28 /*!
29  * @brief Helper class for checking UTF-8 byte sequence during parsing
30  * URI or incoming byte stream.
31  */
33 {
34  //! Enumeration of all possible checker states.
35  enum class state_t
36  {
44  invalid,
45  };
46 
47  //! The current UNICODE symbol.
48  /*!
49  * Contains a valid value only if some bytes were successfully
50  * processed by process_byte() and the current state is
51  * wait_first_byte.
52  */
54 
55  //! The current state of the checker.
57 
58  void
59  on_first_byte( std::uint8_t byte ) noexcept
60  {
61  if( byte <= 0x7Fu )
62  {
64  m_current_symbol = byte;
65  }
66  else if( 0xC0u == (byte & 0xE0u) )
67  {
69  m_current_symbol = (byte & 0x1Fu);
70  }
71  else if( 0xE0u == (byte & 0xF0u) )
72  {
74  m_current_symbol = (byte & 0x0Fu);
75  }
76  else if( 0xF0u == (byte & 0xF8u) )
77  {
79  m_current_symbol = (byte & 0x07u);
80  }
81  else
82  {
83  // Because UTF-8 can represent only ranges from:
84  //
85  // 0000 0000-0000 007F
86  // 0000 0080-0000 07FF
87  // 0000 0800-0000 FFFF
88  // 0001 0000-0010 FFFF
89  //
90  // There is no need to check masks like 0b111110xx and so on.
91  //
92  // See https://datatracker.ietf.org/doc/html/rfc3629
93  //
95  }
96  }
97 
98  void
99  on_second_of_two( std::uint8_t byte ) noexcept
100  {
101  if( 0x80u == (byte & 0xC0u) )
102  {
103  m_current_symbol <<= 6;
104  m_current_symbol |= (byte & 0x3Fu);
105 
106  // Check for overlong sequence.
107  // The valid range for two bytes representation is 0x0080..0x07FF.
108  if( m_current_symbol < 0x0080u )
109  {
110  // The value is too small, it's overlong.
112  }
113  else
114  // There is no need to check the result value against
115  // invalid ranges (0xD800..0xDFFF and 0x110000..)
116  // because two bytes can only represent 0x0080..0x07FF.
118  }
119  else
120  {
122  }
123  }
124 
125  void
126  on_second_of_three( std::uint8_t byte ) noexcept
127  {
128  if( 0x80u == (byte & 0xC0u) )
129  {
130  m_current_symbol <<= 6;
131  m_current_symbol |= (byte & 0x3Fu);
132 
134  }
135  else
136  {
138  }
139  }
140 
141  void
142  on_second_of_four( std::uint8_t byte ) noexcept
143  {
144  if( 0x80u == (byte & 0xC0u) )
145  {
146  m_current_symbol <<= 6;
147  m_current_symbol |= (byte & 0x3Fu);
148 
150  }
151  else
152  {
154  }
155  }
156 
157  void
158  on_third_of_three( std::uint8_t byte ) noexcept
159  {
160  if( 0x80u == (byte & 0xC0u) )
161  {
162  m_current_symbol <<= 6;
163  m_current_symbol |= (byte & 0x3Fu);
164 
165  // Check for overlong sequence.
166  // The valid range for three bytes representation is 0x0800..0xFFFF.
167  if( m_current_symbol < 0x0800u )
168  {
169  // The value is too small, it's overlong.
171  }
172  else
173  {
174  // It's necessary to check for illegal code points 0xD800..0xDFFF.
175  if( m_current_symbol >= 0xD800 && m_current_symbol <= 0xDFFF )
177  else
179  }
180  }
181  else
182  {
184  }
185  }
186 
187  void
188  on_third_of_four( std::uint8_t byte ) noexcept
189  {
190  if( 0x80u == (byte & 0xC0u) )
191  {
192  m_current_symbol <<= 6;
193  m_current_symbol |= (byte & 0x3Fu);
194 
196  }
197  else
198  {
200  }
201  }
202 
203  void
204  on_fourth_of_four( std::uint8_t byte ) noexcept
205  {
206  if( 0x80u == (byte & 0xC0u) )
207  {
208  m_current_symbol <<= 6;
209  m_current_symbol |= (byte & 0x3Fu);
210 
211  // Check for overlong sequence.
212  // The valid range for four bytes representation is 0x10000..0x10FFFF.
213  if( m_current_symbol < 0x10000u )
214  {
215  // The value is too small, it's overlong.
217  }
218  else
219  {
220  // It's necessary to check for values above 0x10FFFF.
221  // There is no need to check 0xD800..0xDFFF range because
222  // it was already handled by overlong check.
223  if( m_current_symbol >= 0x110000 )
225  else
227  }
228  }
229  else
230  {
232  }
233  }
234 
235 public:
236  utf8_checker_t() = default;
237 
238  /*!
239  * Checks another byte by passing it to the handler of the current state.
240  *
241  * @note
242  * The actual value of the current symbol can be obtained only if
243  * process_byte() returns `true` and the subsequent call to
244  * finalized() returns `true`:
245  *
246  * @code
247  * utf8_checker_t checker;
248  * for( const auto ch : some_string )
249  * {
250  * if( checker.process_byte( static_cast<std::uint8_t>(ch) ) )
251  * {
252  * if( checker.finalized() )
253  * process_unicode_symbol( checker.current_symbol() );
254  * }
255  * else
256  * {
257  * ... // Invalid sequence found!
258  * break;
259  * }
260  * }
261  * @endcode
262  *
263  * @retval true if the sequence is still valid and the next byte
264  * can be given to the next call to process_byte().
265  *
266  * @retval false if the sequence is invalid and there is no sense
267  * in continuing to call process_byte().
268  */
269  [[nodiscard]]
270  bool
271  process_byte( std::uint8_t byte ) noexcept
272  {
273  switch( m_state )
274  {
275  case state_t::wait_first_byte:
276  on_first_byte( byte );
277  break;
278 
279  case state_t::wait_second_of_two:
280  on_second_of_two( byte );
281  break;
282 
283  case state_t::wait_second_of_three:
284  on_second_of_three( byte );
285  break;
286 
287  case state_t::wait_second_of_four:
288  on_second_of_four( byte );
289  break;
290 
291  case state_t::wait_third_of_three:
292  on_third_of_three( byte );
293  break;
294 
295  case state_t::wait_third_of_four:
296  on_third_of_four( byte );
297  break;
298 
299  case state_t::wait_fourth_of_four:
300  on_fourth_of_four( byte );
301  break;
302 
303  case state_t::invalid:
304  // Already invalid: the byte is ignored, nothing to do.
305  break;
306  }
307 
308  return (state_t::invalid != m_state);
309  }
310 
311  /*!
312  * @return true if the current sequence finalized.
313  */
314  [[nodiscard]]
315  bool
316  finalized() const noexcept
317  {
319  }
320 
321  /*!
322  * Return the object into the initial state.
323  */
324  void
325  reset() noexcept
326  {
327  m_current_symbol = 0u;
329  }
330 
331  /*!
332  * Get the collected value of the current symbol.
333  *
334  * @note
335  * It returns the actual value only if:
336  *
337  * - some bytes were successfully fed into process_byte();
338  * - finalized() returns `true`.
339  */
340  [[nodiscard]]
341  std::uint32_t
342  current_symbol() const noexcept { return m_current_symbol; }
343 };
344 
345 } /* namespace utils */
346 
347 } /* namespace restinio */
string_view_t from_string< string_view_t >(string_view_t s)
Get a value from string_view.
void on_second_of_three(std::uint8_t byte) noexcept
bool finalized() const noexcept
void on_third_of_three(std::uint8_t byte) noexcept
Helper class for checking UTF-8 byte sequence during parsing URI or incoming byte stream...
state_t
Enumeration of all possible checker states.
std::enable_if< std::is_same< Parameter_Container, query_string_params_t >::value||std::is_same< Parameter_Container, router::route_params_t >::value, std::optional< Value_Type > >::type opt_value(const Parameter_Container &params, string_view_t key)
Gets the value of a parameter specified by key wrapped in std::optional<Value_Type> if parameter exis...
Definition: value_or.hpp:64
void on_second_of_four(std::uint8_t byte) noexcept
void on_second_of_two(std::uint8_t byte) noexcept
void on_fourth_of_four(std::uint8_t byte) noexcept
void on_third_of_four(std::uint8_t byte) noexcept
void on_first_byte(std::uint8_t byte) noexcept
std::uint32_t current_symbol() const noexcept
bool process_byte(std::uint8_t byte) noexcept
state_t m_state
The current state of the checker.
std::uint32_t m_current_symbol
The current UNICODE symbol.