31 #ifndef UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
32 #define UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
// Exception carrying a 32-bit value that was rejected as a code point.
// Derives from std::exception, so it can be caught through the standard base.
class invalid_code_point : public std::exception {
    uint32_t cp;   // the rejected value, as supplied to the constructor
public:
    // Stores the offending value for later retrieval via code_point().
    invalid_code_point(uint32_t cp) : cp(cp) {}

    // Returns a fixed description; the string has static storage duration.
    virtual const char* what() const throw() { return "Invalid code point"; }

    // Returns the value this exception was constructed with.
    uint32_t code_point() const { return cp; }
};
// Exception carrying a single octet that was rejected while reading UTF-8.
// Derives from std::exception, so it can be caught through the standard base.
class invalid_utf8 : public std::exception {
    uint8_t u8;   // the offending octet, as supplied to the constructor
public:
    // Stores the offending octet for later retrieval via utf8_octet().
    invalid_utf8 (uint8_t u) : u8(u) {}

    // Returns a fixed description; the string has static storage duration.
    virtual const char* what() const throw() { return "Invalid UTF-8"; }

    // Returns the octet this exception was constructed with.
    uint8_t utf8_octet() const { return u8; }
};
// Exception carrying a single 16-bit unit that was rejected while reading UTF-16.
// Derives from std::exception, so it can be caught through the standard base.
class invalid_utf16 : public std::exception {
    uint16_t u16;   // the offending 16-bit unit, as supplied to the constructor
public:
    // Stores the offending unit for later retrieval via utf16_word().
    invalid_utf16 (uint16_t u) : u16(u) {}

    // Returns a fixed description; the string has static storage duration.
    virtual const char* what() const throw() { return "Invalid UTF-16"; }

    // Returns the 16-bit unit this exception was constructed with.
    uint16_t utf16_word() const { return u16; }
};
// Exception with no payload; thrown by the decoding helpers in this header
// when internal::validate_next reports NOT_ENOUGH_ROOM.
class not_enough_room : public std::exception {
public:
    // Returns a fixed description; the string has static storage duration.
    virtual const char* what() const throw() { return "Not enough space"; }
};
71 template <
typename octet_iterator,
typename output_iterator>
72 output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, uint32_t replacement)
74 while (start != end) {
75 octet_iterator sequence_start = start;
76 internal::utf_error err_code = internal::validate_next(start, end);
79 for (octet_iterator it = sequence_start; it != start; ++it)
82 case internal::NOT_ENOUGH_ROOM:
83 throw not_enough_room();
84 case internal::INVALID_LEAD:
85 append (replacement, out);
88 case internal::INCOMPLETE_SEQUENCE:
89 case internal::OVERLONG_SEQUENCE:
90 case internal::INVALID_CODE_POINT:
91 append (replacement, out);
94 while (internal::is_trail(*start) && start != end)
102 template <
typename octet_iterator,
typename output_iterator>
103 inline output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out)
105 static const uint32_t replacement_marker = internal::mask16(0xfffd);
106 return replace_invalid(start, end, out, replacement_marker);
109 template <
typename octet_iterator>
110 octet_iterator append(uint32_t cp, octet_iterator result)
112 if (!internal::is_code_point_valid(cp))
113 throw invalid_code_point(cp);
116 *(result++) = static_cast<uint8_t>(cp);
117 else if (cp < 0x800) {
118 *(result++) = static_cast<uint8_t>((cp >> 6) | 0xc0);
119 *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
121 else if (cp < 0x10000) {
122 *(result++) = static_cast<uint8_t>((cp >> 12) | 0xe0);
123 *(result++) = static_cast<uint8_t>((cp >> 6) & 0x3f | 0x80);
124 *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
126 else if (cp <= internal::CODE_POINT_MAX) {
127 *(result++) = static_cast<uint8_t>((cp >> 18) | 0xf0);
128 *(result++) = static_cast<uint8_t>((cp >> 12)& 0x3f | 0x80);
129 *(result++) = static_cast<uint8_t>((cp >> 6) & 0x3f | 0x80);
130 *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
133 throw invalid_code_point(cp);
138 template <
typename octet_iterator>
139 uint32_t next(octet_iterator& it, octet_iterator end)
142 internal::utf_error err_code = internal::validate_next(it, end, &cp);
146 case internal::NOT_ENOUGH_ROOM :
147 throw not_enough_room();
148 case internal::INVALID_LEAD :
149 case internal::INCOMPLETE_SEQUENCE :
150 case internal::OVERLONG_SEQUENCE :
151 throw invalid_utf8(*it);
152 case internal::INVALID_CODE_POINT :
153 throw invalid_code_point(cp);
// Same as next(), but `it` is taken by value, so the caller's iterator is
// left where it was. Returns and throws exactly what next() does.
template <typename octet_iterator>
uint32_t peek_next(octet_iterator it, octet_iterator end)
{
    return next(it, end);
}
164 template <
typename octet_iterator>
165 uint32_t prior(octet_iterator& it, octet_iterator start)
167 octet_iterator end = it;
168 while (internal::is_trail(*(--it)))
170 throw invalid_utf8(*it);
171 octet_iterator temp = it;
172 return next(temp, end);
176 template <
typename octet_iterator>
177 uint32_t previous(octet_iterator& it, octet_iterator pass_start)
179 octet_iterator end = it;
180 while (internal::is_trail(*(--it)))
181 if (it == pass_start)
182 throw invalid_utf8(*it);
183 octet_iterator temp = it;
184 return next(temp, end);
// Moves `it` forward by `n` code points by calling next() `n` times, with
// `end` as the bound of the readable range.
// Propagates anything next() throws; `it` then stays wherever next() left it.
template <typename octet_iterator, typename distance_type>
void advance (octet_iterator& it, distance_type n, octet_iterator end)
{
    for (distance_type i = 0; i < n; ++i)
        next(it, end);
}
// Counts the code points in [first, last) by decoding them one after another
// with next(). The loop condition uses operator<, so the iterator type must
// support it.
// Returns the count; propagates anything next() throws.
template <typename octet_iterator>
typename std::iterator_traits<octet_iterator>::difference_type
distance (octet_iterator first, octet_iterator last)
{
    typename std::iterator_traits<octet_iterator>::difference_type dist;
    for (dist = 0; first < last; ++dist)
        next(first, last);
    return dist;
}
204 template <
typename u16bit_iterator,
typename octet_iterator>
205 octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result)
207 while (start != end) {
208 uint32_t cp = internal::mask16(*start++);
210 if (internal::is_surrogate(cp)) {
212 uint32_t trail_surrogate = internal::mask16(*start++);
213 if (trail_surrogate >= internal::TRAIL_SURROGATE_MIN && trail_surrogate <= internal::TRAIL_SURROGATE_MAX)
214 cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET;
216 throw invalid_utf16(static_cast<uint16_t>(trail_surrogate));
219 throw invalid_utf16(static_cast<uint16_t>(*start));
222 result = append(cp, result);
227 template <
typename u16bit_iterator,
typename octet_iterator>
228 u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result)
230 while (start != end) {
231 uint32_t cp = next(start, end);
233 *result++ =
static_cast<uint16_t
>((cp >> 10) + internal::LEAD_OFFSET);
234 *result++ =
static_cast<uint16_t
>((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN);
237 *result++ =
static_cast<uint16_t
>(cp);
// Converts the code points in [start, end) to UTF-8 by passing each one to
// append(), writing through `result`.
//
// Returns `result` advanced past the last octet written.
// Propagates anything append() throws.
template <typename octet_iterator, typename u32bit_iterator>
octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result)
{
    while (start != end)
        result = append(*(start++), result);
    return result;
}
// Converts the UTF-8 octets in [start, end) to code points by calling next()
// repeatedly, writing each decoded value through `result`.
//
// Returns `result` advanced past the last value written.
// Propagates anything next() throws.
template <typename octet_iterator, typename u32bit_iterator>
u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result)
{
    // Same termination test as the sibling converters (utf8to16, utf32to8).
    while (start != end)
        (*result++) = next(start, end);
    return result;
}
// Bidirectional iterator adaptor that walks an octet range one code point at
// a time. Dereferencing decodes with next(); stepping uses next() / prior().
// The adaptor remembers the range it was created for and refuses to compare
// with an adaptor created for a different range.
template <typename octet_iterator>
class iterator : public std::iterator <std::bidirectional_iterator_tag, uint32_t> {
    octet_iterator it;            // current position in the octet range
    octet_iterator range_start;   // lower bound handed to prior()
    octet_iterator range_end;     // upper bound handed to next()
public:
    iterator () {}

    // Wraps `octet_it`, which must lie within [range_start, range_end];
    // throws std::out_of_range otherwise. The check uses operator< and
    // operator>, so octet_iterator must provide them.
    explicit iterator (const octet_iterator& octet_it,
                       const octet_iterator& range_start,
                       const octet_iterator& range_end) :
        it(octet_it), range_start(range_start), range_end(range_end)
    {
        if (it < range_start || it > range_end)
            throw std::out_of_range("Invalid utf-8 iterator position");
    }

    // Returns the wrapped octet iterator at its current position.
    octet_iterator base () const { return it; }

    // Decodes the code point at the current position without moving.
    uint32_t operator * () const
    {
        octet_iterator temp = it;
        return next(temp, range_end);
    }

    // Equal when both adaptors sit at the same octet position.
    // Throws std::logic_error if the two were created for different ranges.
    bool operator == (const iterator& rhs) const
    {
        if (range_start != rhs.range_start || range_end != rhs.range_end)
            throw std::logic_error("Comparing utf-8 iterators defined with different ranges");
        return (it == rhs.it);
    }

    bool operator != (const iterator& rhs) const
    {
        return !(operator == (rhs));
    }

    // Pre-increment: step forward one code point.
    iterator& operator ++ ()
    {
        next(it, range_end);
        return *this;
    }

    // Post-increment: step forward, return the previous state.
    iterator operator ++ (int)
    {
        iterator temp = *this;
        next(it, range_end);
        return temp;
    }

    // Pre-decrement: step back one code point.
    iterator& operator -- ()
    {
        prior(it, range_start);
        return *this;
    }

    // Post-decrement: step back, return the previous state.
    iterator operator -- (int)
    {
        iterator temp = *this;
        prior(it, range_start);
        return temp;
    }
}; // class iterator
// Bidirectional iterator adaptor over an octet range that yields each decoded
// code point as a wchar_t. Stepping uses next() / prior(). The adaptor
// remembers the range it was created for and refuses to compare with an
// adaptor created for a different range.
template <typename octet_iterator>
class wchar_iterator :
    public std::iterator<std::bidirectional_iterator_tag, wchar_t>
{
    octet_iterator it;            // current position in the octet range
    octet_iterator range_start;   // lower bound handed to prior()
    octet_iterator range_end;     // upper bound handed to next()
public:
    wchar_iterator () {}

    // Wraps `octet_it`, which must lie within [range_start, range_end];
    // throws std::out_of_range otherwise. The check uses operator< and
    // operator>, so octet_iterator must provide them.
    wchar_iterator (const octet_iterator& octet_it,
                    const octet_iterator& range_start,
                    const octet_iterator& range_end) :
        it(octet_it), range_start(range_start), range_end(range_end)
    {
        if (it < range_start || it > range_end)
            throw std::out_of_range("Invalid utf-8 iterator position");
    }

    // Returns the wrapped octet iterator at its current position.
    octet_iterator base () const { return it; }

    // Decodes the code point at the current position without moving and
    // returns it converted to wchar_t. The range check is an assert only, so
    // it disappears when NDEBUG is defined.
    wchar_t operator * () const
    {
        octet_iterator temp = it;
        uint32_t retval = next(temp, range_end);
        assert(retval <= WCHAR_MAX);
        return static_cast<wchar_t>(retval);
    }

    // Equal when both adaptors sit at the same octet position.
    // Throws std::logic_error if the two were created for different ranges.
    bool operator == (const wchar_iterator& rhs) const
    {
        if (range_start != rhs.range_start || range_end != rhs.range_end)
            throw std::logic_error("Comparing utf-8 iterators defined with different ranges");
        return (it == rhs.it);
    }

    bool operator != (const wchar_iterator& rhs) const
    {
        return !(operator == (rhs));
    }

    // Pre-increment: step forward one code point.
    wchar_iterator& operator ++ ()
    {
        next(it, range_end);
        return *this;
    }

    // Post-increment: step forward, return the previous state.
    wchar_iterator operator ++ (int)
    {
        wchar_iterator temp = *this;
        next(it, range_end);
        return temp;
    }

    // Pre-decrement: step back one code point.
    wchar_iterator& operator -- ()
    {
        prior(it, range_start);
        return *this;
    }

    // Post-decrement: step back, return the previous state.
    wchar_iterator operator -- (int)
    {
        wchar_iterator temp = *this;
        prior(it, range_start);
        return temp;
    }
}; // class wchar_iterator
380 #endif //header guard