RESTinio
Loading...
Searching...
No Matches
utf8_checker.hpp
Go to the documentation of this file.
1/*
2 * RESTinio
3 */
4
12#pragma once
13
15
16#include <cstdint>
17
18namespace restinio
19{
20
21namespace utils
22{
23
24//
25// utf8_checker_t
26//
27
/*!
 * @brief Incremental validator/decoder for a UTF-8 byte stream.
 *
 * Bytes are fed one at a time via process_byte(). The checker tracks
 * which byte of a 1..4 byte sequence is expected next and accumulates
 * the code point of the sequence being decoded.
 *
 * A sequence is rejected when:
 *  - the leading byte is not a valid UTF-8 leading byte;
 *  - a continuation byte does not match the 0b10xxxxxx pattern;
 *  - the decoded value is overlong (could be encoded in fewer bytes);
 *  - the decoded value is a surrogate (0xD800..0xDFFF);
 *  - the decoded value is above 0x10FFFF.
 *
 * Once the checker becomes invalid it stays invalid until reset().
 *
 * See https://datatracker.ietf.org/doc/html/rfc3629
 */
class utf8_checker_t
{
	//! Enumeration of all possible checker states.
	enum class state_t
	{
		//! The first byte of a new symbol is expected.
		wait_first_byte,
		//! The second (last) byte of a two-byte sequence is expected.
		wait_second_of_two,
		//! The second byte of a three-byte sequence is expected.
		wait_second_of_three,
		//! The second byte of a four-byte sequence is expected.
		wait_second_of_four,
		//! The third (last) byte of a three-byte sequence is expected.
		wait_third_of_three,
		//! The third byte of a four-byte sequence is expected.
		wait_third_of_four,
		//! The fourth (last) byte of a four-byte sequence is expected.
		wait_fourth_of_four,
		//! An invalid byte was met; only reset() leaves this state.
		invalid,
	};

	//! The current UNICODE symbol.
	/*!
	 * Holds a complete code point only when the last process_byte()
	 * call returned `true` and finalized() returns `true`.
	 */
	std::uint32_t m_current_symbol = 0u;

	//! The current state of the checker.
	state_t m_state{ state_t::wait_first_byte };

	//! Handles the leading byte of a sequence.
	void
	on_first_byte( std::uint8_t byte ) noexcept
	{
		if( byte <= 0x7Fu )
		{
			// Single-byte (ASCII) symbol: it's complete right now.
			m_state = state_t::wait_first_byte;
			m_current_symbol = byte;
		}
		else if( 0xC0u == (byte & 0xE0u) )
		{
			// 0b110xxxxx: the start of a two-byte sequence.
			m_state = state_t::wait_second_of_two;
			m_current_symbol = (byte & 0x1Fu);
		}
		else if( 0xE0u == (byte & 0xF0u) )
		{
			// 0b1110xxxx: the start of a three-byte sequence.
			m_state = state_t::wait_second_of_three;
			m_current_symbol = (byte & 0x0Fu);
		}
		else if( 0xF0u == (byte & 0xF8u) )
		{
			// 0b11110xxx: the start of a four-byte sequence.
			m_state = state_t::wait_second_of_four;
			m_current_symbol = (byte & 0x07u);
		}
		else
		{
			// Because UTF-8 can represent only ranges from:
			//
			// 0000 0000-0000 007F
			// 0000 0080-0000 07FF
			// 0000 0800-0000 FFFF
			// 0001 0000-0010 FFFF
			//
			// There is no need to check masks like 0b111110xx and so on.
			//
			// See https://datatracker.ietf.org/doc/html/rfc3629
			//
			m_state = state_t::invalid;
		}
	}

	//! Handles the last byte of a two-byte sequence.
	void
	on_second_of_two( std::uint8_t byte ) noexcept
	{
		if( 0x80u == (byte & 0xC0u) )
		{
			m_current_symbol <<= 6;
			m_current_symbol |= (byte & 0x3Fu);

			// Check for overlong sequence.
			// The valid range for two bytes representation is 0x0080..0x07FF.
			if( m_current_symbol < 0x0080u )
			{
				// The value is too small, it's overlong.
				m_state = state_t::invalid;
			}
			else
				// There is no need to check the result value against
				// invalid ranges (0xD800..0xDFFF and 0x110000..)
				// because two bytes only represents 0x0080..0x07FF.
				m_state = state_t::wait_first_byte;
		}
		else
		{
			m_state = state_t::invalid;
		}
	}

	//! Handles the second byte of a three-byte sequence.
	void
	on_second_of_three( std::uint8_t byte ) noexcept
	{
		if( 0x80u == (byte & 0xC0u) )
		{
			m_current_symbol <<= 6;
			m_current_symbol |= (byte & 0x3Fu);

			m_state = state_t::wait_third_of_three;
		}
		else
		{
			m_state = state_t::invalid;
		}
	}

	//! Handles the second byte of a four-byte sequence.
	void
	on_second_of_four( std::uint8_t byte ) noexcept
	{
		if( 0x80u == (byte & 0xC0u) )
		{
			m_current_symbol <<= 6;
			m_current_symbol |= (byte & 0x3Fu);

			m_state = state_t::wait_third_of_four;
		}
		else
		{
			m_state = state_t::invalid;
		}
	}

	//! Handles the last byte of a three-byte sequence.
	void
	on_third_of_three( std::uint8_t byte ) noexcept
	{
		if( 0x80u == (byte & 0xC0u) )
		{
			m_current_symbol <<= 6;
			m_current_symbol |= (byte & 0x3Fu);

			// Check for overlong sequence.
			// The valid range for three bytes representation is 0x0800..0xFFFF.
			if( m_current_symbol < 0x0800u )
			{
				// The value is too small, it's overlong.
				m_state = state_t::invalid;
			}
			else
			{
				// It's necessary to check illegal points 0xD800..0xDFFF.
				if( m_current_symbol >= 0xD800u && m_current_symbol <= 0xDFFFu )
					m_state = state_t::invalid;
				else
					m_state = state_t::wait_first_byte;
			}
		}
		else
		{
			m_state = state_t::invalid;
		}
	}

	//! Handles the third byte of a four-byte sequence.
	void
	on_third_of_four( std::uint8_t byte ) noexcept
	{
		if( 0x80u == (byte & 0xC0u) )
		{
			m_current_symbol <<= 6;
			m_current_symbol |= (byte & 0x3Fu);

			m_state = state_t::wait_fourth_of_four;
		}
		else
		{
			m_state = state_t::invalid;
		}
	}

	//! Handles the last byte of a four-byte sequence.
	void
	on_fourth_of_four( std::uint8_t byte ) noexcept
	{
		if( 0x80u == (byte & 0xC0u) )
		{
			m_current_symbol <<= 6;
			m_current_symbol |= (byte & 0x3Fu);

			// Check for overlong sequence.
			// The valid range for four bytes representation is 0x10000..0x10FFFF.
			if( m_current_symbol < 0x10000u )
			{
				// The value is too small, it's overlong.
				m_state = state_t::invalid;
			}
			else
			{
				// It's necessary to check for values above 0x10FFFF.
				// There is no need to check 0xD800..0xDFFF range because
				// it was already handled by overlong check.
				if( m_current_symbol >= 0x110000u )
					m_state = state_t::invalid;
				else
					m_state = state_t::wait_first_byte;
			}
		}
		else
		{
			m_state = state_t::invalid;
		}
	}

public:
	utf8_checker_t() = default;

	/*!
	 * @brief Checks the next byte of the stream.
	 *
	 * @note
	 * The value returned by current_symbol() is a complete code point
	 * only if this method returned `true` and a following call to
	 * finalized() returns `true`.
	 *
	 * @param byte The next byte of the input.
	 *
	 * @retval true if the sequence seen so far is still valid
	 * (possibly incomplete).
	 * @retval false if the sequence is invalid. All further calls
	 * return `false` until reset() is called.
	 */
	RESTINIO_NODISCARD
	bool
	process_byte( std::uint8_t byte ) noexcept
	{
		switch( m_state )
		{
			case state_t::wait_first_byte:
				on_first_byte( byte );
			break;

			case state_t::wait_second_of_two:
				on_second_of_two( byte );
			break;

			case state_t::wait_second_of_three:
				on_second_of_three( byte );
			break;

			case state_t::wait_second_of_four:
				on_second_of_four( byte );
			break;

			case state_t::wait_third_of_three:
				on_third_of_three( byte );
			break;

			case state_t::wait_third_of_four:
				on_third_of_four( byte );
			break;

			case state_t::wait_fourth_of_four:
				on_fourth_of_four( byte );
			break;

			case state_t::invalid:
				// Nothing to do.
			break;
		}

		return (state_t::invalid != m_state);
	}

	/*!
	 * @return true if the checker is at a symbol boundary: no bytes
	 * were processed yet, or the last processed byte completed a valid
	 * symbol. Returns false in the middle of a multi-byte sequence and
	 * in the invalid state.
	 */
	RESTINIO_NODISCARD
	bool
	finalized() const noexcept
	{
		return state_t::wait_first_byte == m_state;
	}

	/*!
	 * @brief Returns the checker to its initial state.
	 *
	 * Clears the accumulated symbol and leaves the invalid state (if any).
	 */
	void
	reset() noexcept
	{
		m_current_symbol = 0u;
		m_state = state_t::wait_first_byte;
	}

	/*!
	 * @brief Gets the value accumulated for the current symbol.
	 *
	 * @note
	 * The value is a complete code point only if the last
	 * process_byte() call returned `true` and finalized() returns `true`;
	 * otherwise it is a partially accumulated value.
	 */
	RESTINIO_NODISCARD
	std::uint32_t
	current_symbol() const noexcept { return m_current_symbol; }
};
344
345} /* namespace utils */
346
347} /* namespace restinio */
348
Helper class for checking a UTF-8 byte sequence while parsing a URI or an incoming byte stream.
void on_fourth_of_four(std::uint8_t byte) noexcept
void on_third_of_three(std::uint8_t byte) noexcept
void on_second_of_three(std::uint8_t byte) noexcept
state_t
Enumeration of all possible checker states.
void on_second_of_four(std::uint8_t byte) noexcept
void on_third_of_four(std::uint8_t byte) noexcept
void on_first_byte(std::uint8_t byte) noexcept
RESTINIO_NODISCARD bool finalized() const noexcept
RESTINIO_NODISCARD bool process_byte(std::uint8_t byte) noexcept
std::uint32_t m_current_symbol
The current UNICODE symbol.
RESTINIO_NODISCARD std::uint32_t current_symbol() const noexcept
void on_second_of_two(std::uint8_t byte) noexcept
state_t m_state
The current state of the checker.
Detection of compiler version and absence of various features.
#define RESTINIO_NODISCARD
#define const
Definition: zconf.h:230