00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015 #include "CUnicode.h"
00016 #include "CArch.h"
00017 #include <cstring>
00018
00019
00020
00021
00022
00023 inline
00024 static
00025 UInt16
00026 decode16(const UInt8* n, bool byteSwapped)
00027 {
00028 union x16 {
00029 UInt8 n8[2];
00030 UInt16 n16;
00031 } c;
00032 if (byteSwapped) {
00033 c.n8[0] = n[1];
00034 c.n8[1] = n[0];
00035 }
00036 else {
00037 c.n8[0] = n[0];
00038 c.n8[1] = n[1];
00039 }
00040 return c.n16;
00041 }
00042
00043 inline
00044 static
00045 UInt32
00046 decode32(const UInt8* n, bool byteSwapped)
00047 {
00048 union x32 {
00049 UInt8 n8[4];
00050 UInt32 n32;
00051 } c;
00052 if (byteSwapped) {
00053 c.n8[0] = n[3];
00054 c.n8[1] = n[2];
00055 c.n8[2] = n[1];
00056 c.n8[3] = n[0];
00057 }
00058 else {
00059 c.n8[0] = n[0];
00060 c.n8[1] = n[1];
00061 c.n8[2] = n[2];
00062 c.n8[3] = n[3];
00063 }
00064 return c.n32;
00065 }
00066
// Clear the caller's error flag.  A NULL pointer means the caller is not
// interested in errors, in which case nothing happens.
inline
static
void
resetError(bool* errors)
{
    if (errors == NULL) {
        return;
    }
    *errors = false;
}
00076
// Raise the caller's error flag.  A NULL pointer means the caller is not
// interested in errors, in which case nothing happens.
inline
static
void
setError(bool* errors)
{
    if (errors == NULL) {
        return;
    }
    *errors = true;
}
00086
00087
00088
00089
00090
00091
00092 UInt32 CUnicode::s_invalid = 0x0000ffff;
00093 UInt32 CUnicode::s_replacement = 0x0000fffd;
00094
00095 bool
00096 CUnicode::isUTF8(const CString& src)
00097 {
00098
00099 const UInt8* data = reinterpret_cast<const UInt8*>(src.c_str());
00100 for (UInt32 n = src.size(); n > 0; ) {
00101 if (fromUTF8(data, n) == s_invalid) {
00102 return false;
00103 }
00104 }
00105 return true;
00106 }
00107
00108 CString
00109 CUnicode::UTF8ToUCS2(const CString& src, bool* errors)
00110 {
00111
00112 resetError(errors);
00113
00114
00115 UInt32 n = src.size();
00116 CString dst;
00117 dst.reserve(2 * n);
00118
00119
00120 const UInt8* data = reinterpret_cast<const UInt8*>(src.c_str());
00121 while (n > 0) {
00122 UInt32 c = fromUTF8(data, n);
00123 if (c == s_invalid) {
00124 c = s_replacement;
00125 }
00126 else if (c >= 0x00010000) {
00127 setError(errors);
00128 c = s_replacement;
00129 }
00130 UInt16 ucs2 = static_cast<UInt16>(c);
00131 dst.append(reinterpret_cast<const char*>(&ucs2), 2);
00132 }
00133
00134 return dst;
00135 }
00136
00137 CString
00138 CUnicode::UTF8ToUCS4(const CString& src, bool* errors)
00139 {
00140
00141 resetError(errors);
00142
00143
00144 UInt32 n = src.size();
00145 CString dst;
00146 dst.reserve(4 * n);
00147
00148
00149 const UInt8* data = reinterpret_cast<const UInt8*>(src.c_str());
00150 while (n > 0) {
00151 UInt32 c = fromUTF8(data, n);
00152 if (c == s_invalid) {
00153 c = s_replacement;
00154 }
00155 dst.append(reinterpret_cast<const char*>(&c), 4);
00156 }
00157
00158 return dst;
00159 }
00160
00161 CString
00162 CUnicode::UTF8ToUTF16(const CString& src, bool* errors)
00163 {
00164
00165 resetError(errors);
00166
00167
00168 UInt32 n = src.size();
00169 CString dst;
00170 dst.reserve(2 * n);
00171
00172
00173 const UInt8* data = reinterpret_cast<const UInt8*>(src.c_str());
00174 while (n > 0) {
00175 UInt32 c = fromUTF8(data, n);
00176 if (c == s_invalid) {
00177 c = s_replacement;
00178 }
00179 else if (c >= 0x00110000) {
00180 setError(errors);
00181 c = s_replacement;
00182 }
00183 if (c < 0x00010000) {
00184 UInt16 ucs2 = static_cast<UInt16>(c);
00185 dst.append(reinterpret_cast<const char*>(&ucs2), 2);
00186 }
00187 else {
00188 c -= 0x00010000;
00189 UInt16 utf16h = static_cast<UInt16>((c >> 10) + 0xd800);
00190 UInt16 utf16l = static_cast<UInt16>((c & 0x03ff) + 0xdc00);
00191 dst.append(reinterpret_cast<const char*>(&utf16h), 2);
00192 dst.append(reinterpret_cast<const char*>(&utf16l), 2);
00193 }
00194 }
00195
00196 return dst;
00197 }
00198
00199 CString
00200 CUnicode::UTF8ToUTF32(const CString& src, bool* errors)
00201 {
00202
00203 resetError(errors);
00204
00205
00206 UInt32 n = src.size();
00207 CString dst;
00208 dst.reserve(4 * n);
00209
00210
00211 const UInt8* data = reinterpret_cast<const UInt8*>(src.c_str());
00212 while (n > 0) {
00213 UInt32 c = fromUTF8(data, n);
00214 if (c == s_invalid) {
00215 c = s_replacement;
00216 }
00217 else if (c >= 0x00110000) {
00218 setError(errors);
00219 c = s_replacement;
00220 }
00221 dst.append(reinterpret_cast<const char*>(&c), 4);
00222 }
00223
00224 return dst;
00225 }
00226
00227 CString
00228 CUnicode::UTF8ToText(const CString& src, bool* errors)
00229 {
00230
00231 resetError(errors);
00232
00233
00234 UInt32 size;
00235 wchar_t* tmp = UTF8ToWideChar(src, size, errors);
00236
00237
00238 int len = ARCH->convStringWCToMB(NULL, tmp, size, errors);
00239 char* mbs = new char[len + 1];
00240 ARCH->convStringWCToMB(mbs, tmp, size, errors);
00241 CString text(mbs, len);
00242
00243
00244 delete[] mbs;
00245 delete[] tmp;
00246
00247 return text;
00248 }
00249
00250 CString
00251 CUnicode::UCS2ToUTF8(const CString& src, bool* errors)
00252 {
00253
00254 resetError(errors);
00255
00256
00257 UInt32 n = src.size() >> 1;
00258 return doUCS2ToUTF8(reinterpret_cast<const UInt8*>(src.data()), n, errors);
00259 }
00260
00261 CString
00262 CUnicode::UCS4ToUTF8(const CString& src, bool* errors)
00263 {
00264
00265 resetError(errors);
00266
00267
00268 UInt32 n = src.size() >> 2;
00269 return doUCS4ToUTF8(reinterpret_cast<const UInt8*>(src.data()), n, errors);
00270 }
00271
00272 CString
00273 CUnicode::UTF16ToUTF8(const CString& src, bool* errors)
00274 {
00275
00276 resetError(errors);
00277
00278
00279 UInt32 n = src.size() >> 1;
00280 return doUTF16ToUTF8(reinterpret_cast<const UInt8*>(src.data()), n, errors);
00281 }
00282
00283 CString
00284 CUnicode::UTF32ToUTF8(const CString& src, bool* errors)
00285 {
00286
00287 resetError(errors);
00288
00289
00290 UInt32 n = src.size() >> 2;
00291 return doUTF32ToUTF8(reinterpret_cast<const UInt8*>(src.data()), n, errors);
00292 }
00293
00294 CString
00295 CUnicode::textToUTF8(const CString& src, bool* errors)
00296 {
00297
00298 resetError(errors);
00299
00300
00301 UInt32 n = src.size();
00302 int len = ARCH->convStringMBToWC(NULL, src.c_str(), n, errors);
00303 wchar_t* wcs = new wchar_t[len + 1];
00304 ARCH->convStringMBToWC(wcs, src.c_str(), n, errors);
00305
00306
00307 CString utf8 = wideCharToUTF8(wcs, len, errors);
00308
00309
00310 delete[] wcs;
00311
00312 return utf8;
00313 }
00314
00315 wchar_t*
00316 CUnicode::UTF8ToWideChar(const CString& src, UInt32& size, bool* errors)
00317 {
00318
00319 CString tmp;
00320 switch (ARCH->getWideCharEncoding()) {
00321 case IArchString::kUCS2:
00322 tmp = UTF8ToUCS2(src, errors);
00323 size = tmp.size() >> 1;
00324 break;
00325
00326 case IArchString::kUCS4:
00327 tmp = UTF8ToUCS4(src, errors);
00328 size = tmp.size() >> 2;
00329 break;
00330
00331 case IArchString::kUTF16:
00332 tmp = UTF8ToUTF16(src, errors);
00333 size = tmp.size() >> 1;
00334 break;
00335
00336 case IArchString::kUTF32:
00337 tmp = UTF8ToUTF32(src, errors);
00338 size = tmp.size() >> 2;
00339 break;
00340
00341 default:
00342 assert(0 && "unknown wide character encoding");
00343 }
00344
00345
00346 wchar_t* dst = new wchar_t[size];
00347 ::memcpy(dst, tmp.data(), sizeof(wchar_t) * size);
00348 return dst;
00349 }
00350
00351 CString
00352 CUnicode::wideCharToUTF8(const wchar_t* src, UInt32 size, bool* errors)
00353 {
00354
00355
00356
00357 switch (ARCH->getWideCharEncoding()) {
00358 case IArchString::kUCS2:
00359 return doUCS2ToUTF8(reinterpret_cast<const UInt8*>(src), size, errors);
00360
00361 case IArchString::kUCS4:
00362 return doUCS4ToUTF8(reinterpret_cast<const UInt8*>(src), size, errors);
00363
00364 case IArchString::kUTF16:
00365 return doUTF16ToUTF8(reinterpret_cast<const UInt8*>(src), size, errors);
00366
00367 case IArchString::kUTF32:
00368 return doUTF32ToUTF8(reinterpret_cast<const UInt8*>(src), size, errors);
00369
00370 default:
00371 assert(0 && "unknown wide character encoding");
00372 return CString();
00373 }
00374 }
00375
00376 CString
00377 CUnicode::doUCS2ToUTF8(const UInt8* data, UInt32 n, bool* errors)
00378 {
00379
00380 CString dst;
00381 dst.reserve(n);
00382
00383
00384 bool byteSwapped = false;
00385 if (n >= 1) {
00386 switch (decode16(data, false)) {
00387 case 0x0000feff:
00388 data += 2;
00389 --n;
00390 break;
00391
00392 case 0x0000fffe:
00393 byteSwapped = true;
00394 data += 2;
00395 --n;
00396 break;
00397
00398 default:
00399 break;
00400 }
00401 }
00402
00403
00404 for (; n > 0; data += 2, --n) {
00405 UInt32 c = decode16(data, byteSwapped);
00406 toUTF8(dst, c, errors);
00407 }
00408
00409 return dst;
00410 }
00411
00412 CString
00413 CUnicode::doUCS4ToUTF8(const UInt8* data, UInt32 n, bool* errors)
00414 {
00415
00416 CString dst;
00417 dst.reserve(n);
00418
00419
00420 bool byteSwapped = false;
00421 if (n >= 1) {
00422 switch (decode32(data, false)) {
00423 case 0x0000feff:
00424 data += 4;
00425 --n;
00426 break;
00427
00428 case 0x0000fffe:
00429 byteSwapped = true;
00430 data += 4;
00431 --n;
00432 break;
00433
00434 default:
00435 break;
00436 }
00437 }
00438
00439
00440 for (; n > 0; data += 4, --n) {
00441 UInt32 c = decode32(data, byteSwapped);
00442 toUTF8(dst, c, errors);
00443 }
00444
00445 return dst;
00446 }
00447
00448 CString
00449 CUnicode::doUTF16ToUTF8(const UInt8* data, UInt32 n, bool* errors)
00450 {
00451
00452 CString dst;
00453 dst.reserve(n);
00454
00455
00456 bool byteSwapped = false;
00457 if (n >= 1) {
00458 switch (decode16(data, false)) {
00459 case 0x0000feff:
00460 data += 2;
00461 --n;
00462 break;
00463
00464 case 0x0000fffe:
00465 byteSwapped = true;
00466 data += 2;
00467 --n;
00468 break;
00469
00470 default:
00471 break;
00472 }
00473 }
00474
00475
00476 for (; n > 0; data += 2, --n) {
00477 UInt32 c = decode16(data, byteSwapped);
00478 if (c < 0x0000d800 || c > 0x0000dfff) {
00479 toUTF8(dst, c, errors);
00480 }
00481 else if (n == 1) {
00482
00483 setError(errors);
00484 toUTF8(dst, s_replacement, NULL);
00485 }
00486 else if (c >= 0x0000d800 && c <= 0x0000dbff) {
00487 UInt32 c2 = decode16(data, byteSwapped);
00488 data += 2;
00489 --n;
00490 if (c2 < 0x0000dc00 || c2 > 0x0000dfff) {
00491
00492 setError(errors);
00493 toUTF8(dst, s_replacement, NULL);
00494 }
00495 else {
00496 c = (((c - 0x0000d800) << 10) | (c2 - 0x0000dc00)) + 0x00010000;
00497 toUTF8(dst, c, errors);
00498 }
00499 }
00500 else {
00501
00502 setError(errors);
00503 toUTF8(dst, s_replacement, NULL);
00504 }
00505 }
00506
00507 return dst;
00508 }
00509
00510 CString
00511 CUnicode::doUTF32ToUTF8(const UInt8* data, UInt32 n, bool* errors)
00512 {
00513
00514 CString dst;
00515 dst.reserve(n);
00516
00517
00518 bool byteSwapped = false;
00519 if (n >= 1) {
00520 switch (decode32(data, false)) {
00521 case 0x0000feff:
00522 data += 4;
00523 --n;
00524 break;
00525
00526 case 0x0000fffe:
00527 byteSwapped = true;
00528 data += 4;
00529 --n;
00530 break;
00531
00532 default:
00533 break;
00534 }
00535 }
00536
00537
00538 for (; n > 0; data += 4, --n) {
00539 UInt32 c = decode32(data, byteSwapped);
00540 if (c >= 0x00110000) {
00541 setError(errors);
00542 c = s_replacement;
00543 }
00544 toUTF8(dst, c, errors);
00545 }
00546
00547 return dst;
00548 }
00549
00550 UInt32
00551 CUnicode::fromUTF8(const UInt8*& data, UInt32& n)
00552 {
00553 assert(data != NULL);
00554 assert(n != 0);
00555
00556
00557
00558
00559 UInt32 size;
00560 if (data[0] < 0x80) {
00561
00562 size = 1;
00563 }
00564 else if (data[0] < 0xc0) {
00565
00566
00567 --n;
00568 ++data;
00569 return s_invalid;
00570 }
00571 else if (data[0] < 0xe0) {
00572
00573 size = 2;
00574 }
00575 else if (data[0] < 0xf0) {
00576
00577 size = 3;
00578 }
00579 else if (data[0] < 0xf8) {
00580
00581 size = 4;
00582 }
00583 else if (data[0] < 0xfc) {
00584
00585 size = 5;
00586 }
00587 else if (data[0] < 0xfe) {
00588
00589 size = 6;
00590 }
00591 else {
00592
00593 --n;
00594 ++data;
00595 return s_invalid;
00596 }
00597
00598
00599 if (size > n) {
00600 data += n;
00601 n = 0;
00602 return s_invalid;
00603 }
00604
00605
00606 UInt32 c;
00607 switch (size) {
00608 case 1:
00609 c = static_cast<UInt32>(data[0]);
00610 break;
00611
00612 case 2:
00613 c = ((static_cast<UInt32>(data[0]) & 0x1f) << 6) |
00614 ((static_cast<UInt32>(data[1]) & 0x3f) );
00615 break;
00616
00617 case 3:
00618 c = ((static_cast<UInt32>(data[0]) & 0x0f) << 12) |
00619 ((static_cast<UInt32>(data[1]) & 0x3f) << 6) |
00620 ((static_cast<UInt32>(data[2]) & 0x3f) );
00621 break;
00622
00623 case 4:
00624 c = ((static_cast<UInt32>(data[0]) & 0x07) << 18) |
00625 ((static_cast<UInt32>(data[1]) & 0x3f) << 12) |
00626 ((static_cast<UInt32>(data[1]) & 0x3f) << 6) |
00627 ((static_cast<UInt32>(data[1]) & 0x3f) );
00628 break;
00629
00630 case 5:
00631 c = ((static_cast<UInt32>(data[0]) & 0x03) << 24) |
00632 ((static_cast<UInt32>(data[1]) & 0x3f) << 18) |
00633 ((static_cast<UInt32>(data[1]) & 0x3f) << 12) |
00634 ((static_cast<UInt32>(data[1]) & 0x3f) << 6) |
00635 ((static_cast<UInt32>(data[1]) & 0x3f) );
00636 break;
00637
00638 case 6:
00639 c = ((static_cast<UInt32>(data[0]) & 0x01) << 30) |
00640 ((static_cast<UInt32>(data[1]) & 0x3f) << 24) |
00641 ((static_cast<UInt32>(data[1]) & 0x3f) << 18) |
00642 ((static_cast<UInt32>(data[1]) & 0x3f) << 12) |
00643 ((static_cast<UInt32>(data[1]) & 0x3f) << 6) |
00644 ((static_cast<UInt32>(data[1]) & 0x3f) );
00645 break;
00646
00647 default:
00648 assert(0 && "invalid size");
00649 return s_invalid;
00650 }
00651
00652
00653
00654 bool truncated = false;
00655 switch (size) {
00656 case 6:
00657 if ((data[5] & 0xc0) != 0x80) {
00658 truncated = true;
00659 size = 5;
00660 }
00661
00662
00663 case 5:
00664 if ((data[4] & 0xc0) != 0x80) {
00665 truncated = true;
00666 size = 4;
00667 }
00668
00669
00670 case 4:
00671 if ((data[3] & 0xc0) != 0x80) {
00672 truncated = true;
00673 size = 3;
00674 }
00675
00676
00677 case 3:
00678 if ((data[2] & 0xc0) != 0x80) {
00679 truncated = true;
00680 size = 2;
00681 }
00682
00683
00684 case 2:
00685 if ((data[1] & 0xc0) != 0x80) {
00686 truncated = true;
00687 size = 1;
00688 }
00689 }
00690
00691
00692 data += size;
00693 n -= size;
00694
00695
00696 if (truncated) {
00697 return s_invalid;
00698 }
00699
00700
00701 static UInt32 s_minChar[] = {
00702 0,
00703 0x00000000,
00704 0x00000080,
00705 0x00000800,
00706 0x00010000,
00707 0x00200000,
00708 0x04000000
00709 };
00710 if (c < s_minChar[size]) {
00711 return s_invalid;
00712 }
00713
00714
00715 if (c >= 0x0000d800 && c <= 0x0000dfff) {
00716 return s_invalid;
00717 }
00718 if (c >= 0x0000fffe && c <= 0x0000ffff) {
00719 return s_invalid;
00720 }
00721
00722 return c;
00723 }
00724
00725 void
00726 CUnicode::toUTF8(CString& dst, UInt32 c, bool* errors)
00727 {
00728 UInt8 data[6];
00729
00730
00731 if ((c >= 0x0000d800 && c <= 0x0000dfff) || c >= 0x80000000) {
00732 setError(errors);
00733 c = s_replacement;
00734 }
00735
00736
00737 if (c < 0x00000080) {
00738 data[0] = static_cast<UInt8>(c);
00739 dst.append(reinterpret_cast<char*>(data), 1);
00740 }
00741 else if (c < 0x00000800) {
00742 data[0] = static_cast<UInt8>(((c >> 6) & 0x0000001f) + 0xc0);
00743 data[1] = static_cast<UInt8>((c & 0x0000003f) + 0x80);
00744 dst.append(reinterpret_cast<char*>(data), 2);
00745 }
00746 else if (c < 0x00010000) {
00747 data[0] = static_cast<UInt8>(((c >> 12) & 0x0000000f) + 0xe0);
00748 data[1] = static_cast<UInt8>(((c >> 6) & 0x0000003f) + 0x80);
00749 data[2] = static_cast<UInt8>((c & 0x0000003f) + 0x80);
00750 dst.append(reinterpret_cast<char*>(data), 3);
00751 }
00752 else if (c < 0x00200000) {
00753 data[0] = static_cast<UInt8>(((c >> 18) & 0x00000007) + 0xf0);
00754 data[1] = static_cast<UInt8>(((c >> 12) & 0x0000003f) + 0x80);
00755 data[2] = static_cast<UInt8>(((c >> 6) & 0x0000003f) + 0x80);
00756 data[3] = static_cast<UInt8>((c & 0x0000003f) + 0x80);
00757 dst.append(reinterpret_cast<char*>(data), 4);
00758 }
00759 else if (c < 0x04000000) {
00760 data[0] = static_cast<UInt8>(((c >> 24) & 0x00000003) + 0xf8);
00761 data[1] = static_cast<UInt8>(((c >> 18) & 0x0000003f) + 0x80);
00762 data[2] = static_cast<UInt8>(((c >> 12) & 0x0000003f) + 0x80);
00763 data[3] = static_cast<UInt8>(((c >> 6) & 0x0000003f) + 0x80);
00764 data[4] = static_cast<UInt8>((c & 0x0000003f) + 0x80);
00765 dst.append(reinterpret_cast<char*>(data), 5);
00766 }
00767 else if (c < 0x80000000) {
00768 data[0] = static_cast<UInt8>(((c >> 30) & 0x00000001) + 0xfc);
00769 data[1] = static_cast<UInt8>(((c >> 24) & 0x0000003f) + 0x80);
00770 data[2] = static_cast<UInt8>(((c >> 18) & 0x0000003f) + 0x80);
00771 data[3] = static_cast<UInt8>(((c >> 12) & 0x0000003f) + 0x80);
00772 data[4] = static_cast<UInt8>(((c >> 6) & 0x0000003f) + 0x80);
00773 data[5] = static_cast<UInt8>((c & 0x0000003f) + 0x80);
00774 dst.append(reinterpret_cast<char*>(data), 6);
00775 }
00776 else {
00777 assert(0 && "character out of range");
00778 }
00779 }