RESTinio
percent_encoding.hpp
Go to the documentation of this file.
1 /*
2  restinio
3 */
4 
5 /*!
6  Percent encoding routine.
7 */
8 
9 #pragma once
10 
11 #include <string>
12 
13 #include <restinio/impl/include_fmtlib.hpp>
14 
15 #include <restinio/string_view.hpp>
16 #include <restinio/exception.hpp>
17 #include <restinio/expected.hpp>
18 
19 #include <restinio/utils/utf8_checker.hpp>
20 
21 namespace restinio
22 {
23 
24 namespace utils
25 {
26 
27 /*!
28  * @brief The default traits for escaping and unescaping symbols in
29  * a query string.
30  *
31  * Unescaped asterisk is not allowed.
32  *
33  * @since v.0.4.9.1
34  */
36 {
37  static constexpr bool
38  ordinary_char( char c ) noexcept
39  {
40  return
41  ( '0' <= c && c <= '9' ) ||
42  ( 'a' <= c && c <= 'z' ) ||
43  ( 'A' <= c && c <= 'Z' ) ||
44  '-' == c ||
45  '.' == c ||
46  '~' == c ||
47  '_' == c;
48  }
49 };
50 
51 /*!
52  * @brief Traits for escaping and unescaping symbols in
53  * a query string in correspondence with application/x-www-form-urlencoded
54  * rules.
55  *
56  * Reference for more details: https://url.spec.whatwg.org/#concept-urlencoded-byte-serializer
57  *
58  * @since v.0.6.5
59  */
61 {
62  static constexpr bool
63  ordinary_char( char c ) noexcept
64  {
65  return
66  ( '0' <= c && c <= '9' ) ||
67  ( 'a' <= c && c <= 'z' ) ||
68  ( 'A' <= c && c <= 'Z' ) ||
69  '*' == c ||
70  '-' == c ||
71  '.' == c ||
72  '_' == c;
73  }
74 };
75 
76 /*!
77  * @brief Traits for escaping and unescaping symbols in
78  * a query string in very relaxed mode.
79  *
80  * In this mode all characters described by the following rule from
81  * [RFC3986](https://tools.ietf.org/html/rfc3986) can be used unescaped:
82 @verbatim
83 query = *( pchar / "/" / "?" )
84 pchar = unreserved / pct-encoded / sub-delims / ":" / "@"
85 unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~"
86 reserved = gen-delims / sub-delims
87 gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@"
88 sub-delims = "!" / "$" / "&" / "'" / "(" / ")"
89  / "*" / "+" / "," / ";" / "="
90 @endverbatim
91  *
92  * Additionally, these traits allow the space character to be used unescaped.
93  *
94  * @since v.0.6.5
95  */
97 {
98  static bool
99  ordinary_char( char c ) noexcept
100  {
101  return nullptr != std::strchr(
102  " " // Space
103  "ABCDEFGHIJKLMNOPQRSTUVWXYZ" // ALPHA
104  "abcdefghijklmnopqrstuvwxyz"
105  "0123456789" // DIGIT
106  "-._~" // unreserved
107  ":/?#[]@" // gen-delims
108  "!$&'()*+,;=", c );
109  }
110 };
111 
112 /*!
113  * @brief The traits for escaping and unescaping symbols in
114  * JavaScript-compatible mode.
115  *
116  * The following symbols are allowed to be unescaped:
117  * `-`, `.`, `~`, `_`, `*`, `!`, `'`, `(`, `)`
118  *
119  * @note
120  * The list of allowed symbols was extended in v.0.6.5.
121  *
122  * @since v.0.4.9.1, v.0.6.5
123  */
125 {
126  static constexpr bool
127  ordinary_char( char c ) noexcept
128  {
129  return
130  ( '0' <= c && c <= '9' ) ||
131  ( 'a' <= c && c <= 'z' ) ||
132  ( 'A' <= c && c <= 'Z' ) ||
133  '-' == c ||
134  '.' == c ||
135  '~' == c ||
136  '_' == c ||
137  '*' == c ||
138  '!' == c ||
139  '\'' == c ||
140  '(' == c ||
141  ')' == c;
142  }
143 };
144 
145 /*!
146  * @brief Type that indicates that unescaping of percent-encoded symbols
147  * completed successfully.
148  *
149  * @since v.0.6.5
150  */
152 
153 /*!
154  * @brief Type that indicates a failure of unescaping of percent-encoded
155  * symbols.
156  *
157  * @since v.0.6.5
158  */
160 {
161  //! Description of a failure.
163 
164 public:
166  std::string description )
168  {}
169 
170  //! Get a reference to the description of the failure.
171  [[nodiscard]]
172  const std::string &
173  description() const noexcept { return m_description; }
174 
175  //! Get out the value of the description of the failure.
176  /*!
177  * This method is intended for cases when this description should be move
178  * elsewhere (to another object like unescape_percent_encoding_failure_t or
179  * to some exception-like object).
180  */
181  [[nodiscard]]
182  std::string
183  giveout_description() noexcept { return std::move(m_description); }
184 };
185 
186 namespace impl
187 {
188 
//! Checks whether @a c is a hexadecimal digit (`0`-`9`, `a`-`f`, `A`-`F`).
inline bool
is_hexdigit( char c )
{
	if( !( c < '0' ) && !( '9' < c ) )
		return true;

	const bool lower_hex = !( c < 'a' ) && !( 'f' < c );
	const bool upper_hex = !( c < 'A' ) && !( 'F' < c );
	return lower_hex || upper_hex;
}
197 
//! Builds one byte from the two hex digits of a percent-encoded triplet.
/*!
 * @param c1 Digit that gives the high four bits of the result.
 * @param c2 Digit that gives the low four bits of the result.
 *
 * @note
 * No validation is done here: anything that is not a decimal digit is
 * handled as a letter digit (`a`-`f` / `A`-`F`).
 */
inline char
extract_escaped_char( char c1, char c2 )
{
	const auto nibble_of = []( char digit ) noexcept -> int {
		if( '0' <= digit && digit <= '9' )
			return digit - '0';

		// Setting bit 0x20 maps 'A'-'F' onto 'a'-'f', so one
		// subtraction handles both cases.
		const char lowered = static_cast<char>( digit | 0x20 );
		return 10 + lowered - 'a';
	};

	const int high_part = nibble_of( c1 );
	const int low_part = nibble_of( c2 );

	return static_cast<char>( high_part * 16 + low_part );
}
223 
224 //
225 // do_unescape_percent_encoding
226 //
227 /*!
228  * @brief The actual implementation of unescape-percent-encoding procedure.
229  *
230  * @since v.0.6.5
231  */
232 template<
233  typename Traits,
234  typename Chars_Collector >
235 [[nodiscard]]
236 expected_t<
240  const string_view_t data,
242 {
244  const char * d = data.data();
245 
247  bool expect_next_utf8_byte = false;
248 
249  const auto current_pos = [&d, &data]() noexcept { return d - data.data(); };
250 
251  while( 0 < chars_to_handle )
252  {
253  char c = *d;
254  if( expect_next_utf8_byte && '%' != c )
256  fmt::format(
258  "next byte from UTF-8 sequence expected at {}" ),
259  current_pos() )
260  } );
261 
262  if( '%' == c )
263  {
264  if( chars_to_handle >= 3 &&
265  is_hexdigit( d[ 1 ] ) &&
266  is_hexdigit( d[ 2 ] ) )
267  {
268  const auto ch = extract_escaped_char( d[ 1 ], d[ 2 ] );
269  if( !utf8_checker.process_byte( static_cast<std::uint8_t>(ch) ) )
271  fmt::format(
273  "invalid UTF-8 sequence detected at {}" ),
274  current_pos() )
275  } );
276 
277  collector( ch );
278  chars_to_handle -= 3;
279  d += 3;
280 
282  if( !expect_next_utf8_byte )
284  }
285  else
286  {
288  fmt::format(
290  "invalid escape sequence at pos {}" ),
291  current_pos() )
292  } );
293  }
294  }
295  else if( '+' == c )
296  {
297  collector( ' ' );
298  --chars_to_handle;
299  ++d;
300  }
301  else if( Traits::ordinary_char( c ) )
302  {
303  collector( c );
304  --chars_to_handle;
305  ++d;
306  }
307  else
308  {
310  fmt::format(
312  "invalid non-escaped char with code {:#02X} at pos: {}" ),
313  c,
314  current_pos() )
315  } );
316  }
317  }
318 
321  fmt::format(
322  RESTINIO_FMT_FORMAT_STRING( "unfinished UTF-8 sequence" ) )
323  } );
324 
326 }
327 
328 } /* namespace impl */
329 
330 //! Percent encoding.
331 //! \{
332 template< typename Traits = restinio_default_unescape_traits >
333 [[nodiscard]]
334 std::string
336 {
337  std::string result;
338  const auto escaped_chars_count = static_cast<std::size_t>(
339  std::count_if(
340  data.begin(),
341  data.end(),
342  []( auto c ){ return !Traits::ordinary_char(c); } ));
343 
344  if( 0 == escaped_chars_count )
345  {
346  // No escaped chars.
347  result.assign( data.data(), data.size() );
348  }
349  else
350  {
351  // Having escaped chars.
353  for( auto c : data )
354  {
355  if( Traits::ordinary_char( c ) )
356  result += c;
357  else
358  {
359  result += fmt::format( RESTINIO_FMT_FORMAT_STRING( "%{:02X}" ), c );
360  }
361  }
362  }
363 
364  return result;
365 }
366 
367 template< typename Traits = restinio_default_unescape_traits >
368 [[nodiscard]]
369 std::string
371 {
372  std::string result;
373  result.reserve( data.size() );
374 
376  data,
377  [&result]( char ch ) { result += ch; } );
378  if( !r )
379  throw exception_t{ r.error().giveout_description() };
380 
381  return result;
382 }
383 
384 /*!
385  * @brief Helper function for unescaping percent-encoded string.
386  *
387  * This function doesn't throw if some character can't be unescaped or
388  * some ill-formed sequence is found.
389  *
390  * @note
391  * This function is not noexcept and can throw on other types of
392  * failures (like an inability to allocate memory).
393  *
394  * @since v.0.6.5
395  */
396 template< typename Traits = restinio_default_unescape_traits >
397 [[nodiscard]]
400 {
401  std::string result;
402  result.reserve( data.size() );
403 
405  data,
406  [&result]( char ch ) { result += ch; } );
407  if( !r )
408  return make_unexpected( std::move(r.error()) );
409 
410  return std::move(result);
411 }
412 
413 template< typename Traits = restinio_default_unescape_traits >
414 [[nodiscard]]
415 std::size_t
417 {
418  std::size_t result_size = 0u;
419  char * dest = data;
420 
422  string_view_t{ data, size },
423  [&result_size, &dest]( char ch ) {
424  *dest++ = ch;
425  ++result_size;
426  } );
427  if( !r )
428  throw exception_t{ r.error().giveout_description() };
429 
430  return result_size;
431 }
432 
433 /*!
434  * @brief Helper function for unescaping percent-encoded string inplace.
435  *
436  * This function doesn't throw if some character can't be unescaped or
437  * some ill-formed sequence is found.
438  *
439  * @note
440  * This function is not noexcept and can throw on other types of
441  * failures.
442  *
443  * @since v.0.6.5
444  */
445 template< typename Traits = restinio_default_unescape_traits >
446 [[nodiscard]]
449 {
450  std::size_t result_size = 0u;
451  char * dest = data;
452 
454  string_view_t{ data, size },
455  [&result_size, &dest]( char ch ) {
456  *dest++ = ch;
457  ++result_size;
458  } );
459  if( !r )
460  return make_unexpected( std::move(r.error()) );
461 
462  return result_size;
463 }
464 
465 //! \}
466 
468 {
469 
471 {
472 
473 namespace impl
474 {
475 
476 /*!
477  * @brief Is this symbol a part of unreserved set?
478  *
479  * See https://tools.ietf.org/html/rfc3986#section-2.3 for more details.
480  *
481  * @since v.0.6.2
482  */
483 [[nodiscard]]
484 constexpr inline bool
485 is_unreserved_char( const char ch ) noexcept
486 {
487  // In this version of RESTinio class restinio_default_unescape_traits
488  // already implements necessary check.
490 }
491 
492 /*!
493  * @brief Internal helper to perform the main logic of enumeration
494  * of symbols in URI.
495  *
496  * Inspects the content of \a what and calls \a one_byte_handler if
497  * a single character should be used as output, otherwise calls
498  * \a three_byte_handler (if percent-encoding sequence from three chars
499  * should be passed to the output as is).
500  *
501  * @attention
502  * Throws if invalid UTF-8 sequence is found.
503  *
504  * @since v.0.6.5
505  */
506 template<
507  typename One_Byte_Handler,
508  typename Three_Byte_Handler >
509 void
511  string_view_t what,
512  One_Byte_Handler && one_byte_handler,
513  Three_Byte_Handler && three_byte_handler )
514 {
515  using namespace restinio::utils::impl;
516 
518  const char * d = what.data();
519 
521  bool expect_next_utf8_byte = false;
522 
523  const auto current_pos = [&d, &what]() noexcept { return d - what.data(); };
524 
525  while( 0 < chars_to_handle )
526  {
527  if( expect_next_utf8_byte && '%' != *d )
528  throw exception_t{
529  fmt::format(
531  "next byte from UTF-8 sequence expected at {}" ),
532  current_pos() )
533  };
534 
535  if( '%' != *d )
536  {
537  // Just one symbol to the output.
538  one_byte_handler( *d );
539  ++d;
540  --chars_to_handle;
541  }
542  else if( chars_to_handle >= 3 &&
543  is_hexdigit( d[ 1 ] ) && is_hexdigit( d[ 2 ] ) )
544  {
545  const char ch = extract_escaped_char( d[ 1 ], d[ 2 ] );
546  if( !utf8_checker.process_byte( static_cast<std::uint8_t>(ch) ) )
547  throw exception_t{
548  fmt::format(
550  "invalid UTF-8 sequence detected at {}" ),
551  current_pos() )
552  };
553 
554  bool keep_three_bytes = true;
555 
556  if( utf8_checker.finalized() )
557  {
558  expect_next_utf8_byte = false;
559 
560  const auto symbol = utf8_checker.current_symbol();
562 
563  if( symbol < 0x80u )
564  {
565  const char ascii_char = static_cast<char>(symbol);
567  {
568  // percent encoded char will be replaced by one char.
570  keep_three_bytes = false;
571  }
572  }
573  }
574  else
575  {
576  expect_next_utf8_byte = true;
577  }
578 
579  if( keep_three_bytes )
580  {
581  // this part of multi-byte char will go to the output as is.
582  three_byte_handler( d[ 0 ], d[ 1 ], d[ 2 ] );
583  }
584 
585  chars_to_handle -= 3;
586  d += 3u;
587  }
588  else
589  {
590  throw exception_t{
591  fmt::format(
592  RESTINIO_FMT_FORMAT_STRING( "invalid escape sequence at pos {}" ),
593  current_pos() )
594  };
595  }
596  }
597 
599  throw exception_t{
600  fmt::format( RESTINIO_FMT_FORMAT_STRING( "unfinished UTF-8 sequence" ) ) };
601 }
602 
603 } /* namespace impl */
604 
605 /*!
606  * @brief Calculate the size of a buffer to hold normalized value of a URI.
607  *
608  * If @a what has some chars from unreserved set in percent-encoded form
609  * then this function returns the size of a buffer to hold normalized value
610  * of @a what. Otherwise the original size of @a what is returned.
611  *
612  * @note
613  * This function throws if @a what has an invalid value.
614  *
615  * @since v.0.6.2
616  */
617 [[nodiscard]]
618 inline std::size_t
621 {
622  std::size_t calculated_capacity = 0u;
623 
624  impl::run_normalization_algo( what,
625  [&calculated_capacity]( char ) noexcept {
626  ++calculated_capacity;
627  },
628  [&calculated_capacity]( char, char, char ) noexcept {
629  calculated_capacity += 3u;
630  } );
631 
632  return calculated_capacity;
633 }
634 
635 /*!
636  * @brief Perform normalization of URI value.
637  *
638  * Copies the content of @a what into @a dest and replaces the
639  * percent-encoded representation of chars from unreserved set into
640  * their normal values.
641  *
642  * @attention
643  * The capacity of @a dest should be enough to hold the result value.
644  * It's assumed that estimate_required_capacity() is called before that
645  * function and the result of estimate_required_capacity() is used for
646  * allocation of a buffer for @a dest.
647  *
648  * @note
649  * This function throws if @a what has an invalid value.
650  *
651  * @since v.0.6.2
652  */
653 inline void
655  string_view_t what,
656  char * dest )
657 {
658  impl::run_normalization_algo( what,
659  [&dest]( char ch ) noexcept {
660  *dest++ = ch;
661  },
662  [&dest]( char ch1, char ch2, char ch3 ) noexcept {
663  dest[ 0 ] = ch1;
664  dest[ 1 ] = ch2;
665  dest[ 2 ] = ch3;
666  dest += 3;
667  } );
668 }
669 
670 } /* namespace unreserved_chars */
671 
672 } /* namespace uri_normalization */
673 
674 } /* namespace utils */
675 
676 } /* namespace restinio */
expected_t< std::size_t, unescape_percent_encoding_failure_t > try_inplace_unescape_percent_encoding(char *data, std::size_t size)
Helper function for unescaping percent-encoded string inplace.
static constexpr bool ordinary_char(char c) noexcept
string_view_t from_string< string_view_t >(string_view_t s)
Get a value from string_view.
std::size_t inplace_unescape_percent_encoding(char *data, std::size_t size)
static bool ordinary_char(char c) noexcept
The traits for escaping and unexcaping symbols in JavaScript-compatible mode.
std::string m_description
Description of a failure.
The default traits for escaping and unexcaping symbols in a query string.
std::enable_if< std::is_same< Parameter_Container, query_string_params_t >::value||std::is_same< Parameter_Container, router::route_params_t >::value, std::optional< Value_Type > >::type opt_value(const Parameter_Container &params, string_view_t key)
Gets the value of a parameter specified by key wrapped in std::optional<Value_Type> if parameter exis...
Definition: value_or.hpp:64
std::string giveout_description() noexcept
Get out the value of the description of the failure.
std::string unescape_percent_encoding(const string_view_t data)
void run_normalization_algo(string_view_t what, One_Byte_Handler &&one_byte_handler, Three_Byte_Handler &&three_byte_handler)
Internal helper to perform the main logic of enumeration of symbols in URI.
expected_t< std::string, unescape_percent_encoding_failure_t > try_unescape_percent_encoding(const string_view_t data)
Helper function for unescaping percent-encoded string.
Traits for escaping and unescaping symbols in a query string in very relaxed mode.
void normalize_to(string_view_t what, char *dest)
Perform normalization of URI value.
char extract_escaped_char(char c1, char c2)
expected_t< unescape_percent_encoding_success_t, unescape_percent_encoding_failure_t > do_unescape_percent_encoding(const string_view_t data, Chars_Collector &&collector)
The actual implementation of unescape-percent-encoding procedure.
std::string escape_percent_encoding(const string_view_t data)
Percent encoding.
constexpr bool is_unreserved_char(const char ch) noexcept
Is this symbol a part of unreserved set?
Traits for escaping and unexcaping symbols in a query string in correspondence with application/x-www...
const std::string & description() const noexcept
Get a reference to the description of the failure.
std::size_t estimate_required_capacity(string_view_t what)
Calculate the size of a buffer to hold normalized value of a URI.
static constexpr bool ordinary_char(char c) noexcept
static constexpr bool ordinary_char(char c) noexcept
Type that indicates a failure of unescaping of percent-encoded symbols.
Type that indicates that unescaping of percent-encoded symbols completed successfully.