00001
00032 #include "linden_common.h"
00033
00034 #include "llstring.h"
00035 #include "llerror.h"
00036
// Converts a possibly-null C string into a std::string.
// A null pointer yields an empty string instead of undefined behavior.
std::string ll_safe_string(const char* in)
{
    return in ? std::string(in) : std::string();
}
00042
00043 U8 hex_as_nybble(char hex)
00044 {
00045 if((hex >= '0') && (hex <= '9'))
00046 {
00047 return (U8)(hex - '0');
00048 }
00049 else if((hex >= 'a') && (hex <='f'))
00050 {
00051 return (U8)(10 + hex - 'a');
00052 }
00053 else if((hex >= 'A') && (hex <='F'))
00054 {
00055 return (U8)(10 + hex - 'A');
00056 }
00057 return 0;
00058 }
00059
00060
00061 bool _read_file_into_string(std::string& str, const char* filename)
00062 {
00063 llifstream ifs(filename, llifstream::binary);
00064 if (!ifs.is_open())
00065 {
00066 llinfos << "Unable to open file" << filename << llendl;
00067 return false;
00068 }
00069
00070 std::ostringstream oss;
00071
00072 oss << ifs.rdbuf();
00073 str = oss.str();
00074 ifs.close();
00075 return true;
00076 }
00077
00078
00079
00080
00081
00082
00083
00084
00085
00086 std::ostream& operator<<(std::ostream &s, const LLWString &wstr)
00087 {
00088 std::string utf8_str = wstring_to_utf8str(wstr);
00089 s << utf8_str;
00090 return s;
00091 }
00092
00093 std::string rawstr_to_utf8(const std::string& raw)
00094 {
00095 LLWString wstr(utf8str_to_wstring(raw));
00096 return wstring_to_utf8str(wstr);
00097 }
00098
00099 S32 wchar_to_utf8chars(llwchar in_char, char* outchars)
00100 {
00101 U32 cur_char = (U32)in_char;
00102 char* base = outchars;
00103 if (cur_char < 0x80)
00104 {
00105 *outchars++ = (U8)cur_char;
00106 }
00107 else if (cur_char < 0x800)
00108 {
00109 *outchars++ = 0xC0 | (cur_char >> 6);
00110 *outchars++ = 0x80 | (cur_char & 0x3F);
00111 }
00112 else if (cur_char < 0x10000)
00113 {
00114 *outchars++ = 0xE0 | (cur_char >> 12);
00115 *outchars++ = 0x80 | ((cur_char >> 6) & 0x3F);
00116 *outchars++ = 0x80 | (cur_char & 0x3F);
00117 }
00118 else if (cur_char < 0x200000)
00119 {
00120 *outchars++ = 0xF0 | (cur_char >> 18);
00121 *outchars++ = 0x80 | ((cur_char >> 12) & 0x3F);
00122 *outchars++ = 0x80 | ((cur_char >> 6) & 0x3F);
00123 *outchars++ = 0x80 | cur_char & 0x3F;
00124 }
00125 else if (cur_char < 0x4000000)
00126 {
00127 *outchars++ = 0xF8 | (cur_char >> 24);
00128 *outchars++ = 0x80 | ((cur_char >> 18) & 0x3F);
00129 *outchars++ = 0x80 | ((cur_char >> 12) & 0x3F);
00130 *outchars++ = 0x80 | ((cur_char >> 6) & 0x3F);
00131 *outchars++ = 0x80 | cur_char & 0x3F;
00132 }
00133 else if (cur_char < 0x80000000)
00134 {
00135 *outchars++ = 0xFC | (cur_char >> 30);
00136 *outchars++ = 0x80 | ((cur_char >> 24) & 0x3F);
00137 *outchars++ = 0x80 | ((cur_char >> 18) & 0x3F);
00138 *outchars++ = 0x80 | ((cur_char >> 12) & 0x3F);
00139 *outchars++ = 0x80 | ((cur_char >> 6) & 0x3F);
00140 *outchars++ = 0x80 | cur_char & 0x3F;
00141 }
00142 else
00143 {
00144 llwarns << "Invalid Unicode character " << cur_char << "!" << llendl;
00145 *outchars++ = LL_UNKNOWN_CHAR;
00146 }
00147 return outchars - base;
00148 }
00149
00150 S32 utf16chars_to_wchar(const U16* inchars, llwchar* outchar)
00151 {
00152 const U16* base = inchars;
00153 U16 cur_char = *inchars++;
00154 llwchar char32 = cur_char;
00155 if ((cur_char >= 0xD800) && (cur_char <= 0xDFFF))
00156 {
00157
00158 char32 = ((llwchar)(cur_char - 0xD800)) << 10;
00159 cur_char = *inchars++;
00160 char32 += (llwchar)(cur_char - 0xDC00) + 0x0010000UL;
00161 }
00162 else
00163 {
00164 char32 = (llwchar)cur_char;
00165 }
00166 *outchar = char32;
00167 return inchars - base;
00168 }
00169
00170 S32 utf16chars_to_utf8chars(const U16* inchars, char* outchars, S32* nchars8p)
00171 {
00172
00173 llwchar char32;
00174 S32 nchars16 = utf16chars_to_wchar(inchars, &char32);
00175
00176 S32 nchars8 = wchar_to_utf8chars(char32, outchars);
00177 if (nchars8p)
00178 {
00179 *nchars8p = nchars8;
00180 }
00181 return nchars16;
00182 }
00183
00184 llutf16string wstring_to_utf16str(const LLWString &utf32str, S32 len)
00185 {
00186 llutf16string out;
00187
00188 S32 i = 0;
00189 while (i < len)
00190 {
00191 U32 cur_char = utf32str[i];
00192 if (cur_char > 0xFFFF)
00193 {
00194 out += (0xD7C0 + (cur_char >> 10));
00195 out += (0xDC00 | (cur_char & 0x3FF));
00196 }
00197 else
00198 {
00199 out += cur_char;
00200 }
00201 i++;
00202 }
00203 return out;
00204 }
00205
00206 llutf16string wstring_to_utf16str(const LLWString &utf32str)
00207 {
00208 const S32 len = (S32)utf32str.length();
00209 return wstring_to_utf16str(utf32str, len);
00210 }
00211
00212 llutf16string utf8str_to_utf16str ( const LLString& utf8str )
00213 {
00214 LLWString wstr = utf8str_to_wstring ( utf8str );
00215 return wstring_to_utf16str ( wstr );
00216 }
00217
00218
00219 LLWString utf16str_to_wstring(const llutf16string &utf16str, S32 len)
00220 {
00221 LLWString wout;
00222 if((len <= 0) || utf16str.empty()) return wout;
00223
00224 S32 i = 0;
00225
00226 const U16* chars16 = &(*(utf16str.begin()));
00227 while (i < len)
00228 {
00229 llwchar cur_char;
00230 i += utf16chars_to_wchar(chars16+i, &cur_char);
00231 wout += cur_char;
00232 }
00233 return wout;
00234 }
00235
00236 LLWString utf16str_to_wstring(const llutf16string &utf16str)
00237 {
00238 const S32 len = (S32)utf16str.length();
00239 return utf16str_to_wstring(utf16str, len);
00240 }
00241
00242 S32 wchar_utf8_length(const llwchar wc)
00243 {
00244 if (wc < 0x80)
00245 {
00246
00247
00248 return 1;
00249 }
00250 else if (wc < 0x800)
00251 {
00252 return 2;
00253 }
00254 else if (wc < 0x10000)
00255 {
00256 return 3;
00257 }
00258 else if (wc < 0x200000)
00259 {
00260 return 4;
00261 }
00262 else if (wc < 0x4000000)
00263 {
00264 return 5;
00265 }
00266 else
00267 {
00268 return 6;
00269 }
00270 }
00271
00272
00273 S32 wstring_utf8_length(const LLWString& wstr)
00274 {
00275 S32 len = 0;
00276 for (S32 i = 0; i < (S32)wstr.length(); i++)
00277 {
00278 len += wchar_utf8_length(wstr[i]);
00279 }
00280 return len;
00281 }
00282
00283
00284 LLWString utf8str_to_wstring(const std::string& utf8str, S32 len)
00285 {
00286 LLWString wout;
00287
00288 S32 i = 0;
00289 while (i < len)
00290 {
00291 llwchar unichar;
00292 U8 cur_char = utf8str[i];
00293
00294 if (cur_char < 0x80)
00295 {
00296
00297 unichar = cur_char;
00298 }
00299 else
00300 {
00301 S32 cont_bytes = 0;
00302 if ((cur_char >> 5) == 0x6)
00303 {
00304 unichar = (0x1F&cur_char);
00305 cont_bytes = 1;
00306 }
00307 else if ((cur_char >> 4) == 0xe)
00308 {
00309 unichar = (0x0F&cur_char);
00310 cont_bytes = 2;
00311 }
00312 else if ((cur_char >> 3) == 0x1e)
00313 {
00314 unichar = (0x07&cur_char);
00315 cont_bytes = 3;
00316 }
00317 else if ((cur_char >> 2) == 0x3e)
00318 {
00319 unichar = (0x03&cur_char);
00320 cont_bytes = 4;
00321 }
00322 else if ((cur_char >> 1) == 0x7e)
00323 {
00324 unichar = (0x01&cur_char);
00325 cont_bytes = 5;
00326 }
00327 else
00328 {
00329 wout += LL_UNKNOWN_CHAR;
00330 ++i;
00331 continue;
00332 }
00333
00334
00335 S32 end = (len < (i + cont_bytes)) ? len : (i + cont_bytes);
00336 do
00337 {
00338 ++i;
00339
00340 cur_char = utf8str[i];
00341 if ( (cur_char >> 6) == 0x2 )
00342 {
00343 unichar <<= 6;
00344 unichar += (0x3F&cur_char);
00345 }
00346 else
00347 {
00348
00349 unichar = LL_UNKNOWN_CHAR;
00350 --i;
00351 break;
00352 }
00353 } while(i < end);
00354
00355
00356 if ( ((cont_bytes == 1) && (unichar < 0x80))
00357 || ((cont_bytes == 2) && (unichar < 0x800))
00358 || ((cont_bytes == 3) && (unichar < 0x10000))
00359 || ((cont_bytes == 4) && (unichar < 0x200000))
00360 || ((cont_bytes == 5) && (unichar < 0x4000000)) )
00361 {
00362 unichar = LL_UNKNOWN_CHAR;
00363 }
00364 }
00365
00366 wout += unichar;
00367 ++i;
00368 }
00369 return wout;
00370 }
00371
00372 LLWString utf8str_to_wstring(const std::string& utf8str)
00373 {
00374 const S32 len = (S32)utf8str.length();
00375 return utf8str_to_wstring(utf8str, len);
00376 }
00377
00378 std::string wstring_to_utf8str(const LLWString& utf32str, S32 len)
00379 {
00380 std::string out;
00381
00382 S32 i = 0;
00383 while (i < len)
00384 {
00385 char tchars[8];
00386 S32 n = wchar_to_utf8chars(utf32str[i], tchars);
00387 tchars[n] = 0;
00388 out += tchars;
00389 i++;
00390 }
00391 return out;
00392 }
00393
00394 std::string wstring_to_utf8str(const LLWString& utf32str)
00395 {
00396 const S32 len = (S32)utf32str.length();
00397 return wstring_to_utf8str(utf32str, len);
00398 }
00399
00400 std::string utf16str_to_utf8str(const llutf16string& utf16str)
00401 {
00402 return wstring_to_utf8str(utf16str_to_wstring(utf16str));
00403 }
00404
00405 std::string utf16str_to_utf8str(const llutf16string& utf16str, S32 len)
00406 {
00407 return wstring_to_utf8str(utf16str_to_wstring(utf16str, len), len);
00408 }
00409
00410
00411
00412
00413
00414
00415
00416
00417
00418
00419
00420
00421
00422
00423
00424
00425
00426
00427
00428
00429
00430
00431
00432
00433
00434
00435
00436
00437
00438
00439
00440
00441
00442
00443
00444
00445
00446
00447
00448
00449
00450
00451
00452
00453
00454
00455
00456
00457
00458
00459
00460
00461
00462
00463
00464
00465
00466
00467
00468
00469
00470
00471
00472
00473
00474
00475
00476
00477
00478
00479
00480
00481
00482
00483
00484
00485
00486
00487
00488
00489
00490
00491
00492
00493
00494
00495
00496
00497
00498
00499
00500
00501
00502
00503
00504
00505
00506
00507
00508
00509
00510
00511
00512
00513
00514
00515
00516
00517
00518
00519
00520
00521
00522
00523
00524
00525
00526
00527
00528
00529
00530
00531
00532
00533
00534
00535
00536
00537
00538
00539
00540
00541
00542
00543
00544
00545
00546
00547
00548
00549
00550
00551
00552
00553
00554
00555
00556
00557
00558
00559
00560
00561
00562
00563
00564
00565
00566
00567
00568
00569
00570
00571
00572
00573
00574
00575
00576
00577
00578
00579
00580
00581
00582
00583
00584
00585
00586
00587
00588
00589
00590
00591
00592
00593
00594
00595
00596
00597
00598
00599
00600
00601
00602
00603
00604
00605
00606
00607
00608
00609
00610 std::string utf8str_trim(const std::string& utf8str)
00611 {
00612 LLWString wstr = utf8str_to_wstring(utf8str);
00613 LLWString::trim(wstr);
00614 return wstring_to_utf8str(wstr);
00615 }
00616
00617
00618 std::string utf8str_tolower(const std::string& utf8str)
00619 {
00620 LLWString out_str = utf8str_to_wstring(utf8str);
00621 LLWString::toLower(out_str);
00622 return wstring_to_utf8str(out_str);
00623 }
00624
00625
00626 S32 utf8str_compare_insensitive(const std::string& lhs, const std::string& rhs)
00627 {
00628 LLWString wlhs = utf8str_to_wstring(lhs);
00629 LLWString wrhs = utf8str_to_wstring(rhs);
00630 return LLWString::compareInsensitive(wlhs.c_str(), wrhs.c_str());
00631 }
00632
00633 std::string utf8str_truncate(const std::string& utf8str, const S32 max_len)
00634 {
00635 if (0 == max_len)
00636 {
00637 return std::string();
00638 }
00639 if ((S32)utf8str.length() <= max_len)
00640 {
00641 return utf8str;
00642 }
00643 else
00644 {
00645 S32 cur_char = max_len;
00646
00647
00648 if ((U8)utf8str[cur_char] > 0x7f)
00649 {
00650
00651
00652 while (0x80 == (0xc0 & utf8str[cur_char]))
00653 {
00654 cur_char--;
00655
00656 if (cur_char == 0)
00657 {
00658
00659 break;
00660 }
00661 }
00662 }
00663
00664 return utf8str.substr(0, cur_char);
00665 }
00666 }
00667
00668 std::string utf8str_substChar(
00669 const std::string& utf8str,
00670 const llwchar target_char,
00671 const llwchar replace_char)
00672 {
00673 LLWString wstr = utf8str_to_wstring(utf8str);
00674 LLWString::replaceChar(wstr, target_char, replace_char);
00675
00676 return wstring_to_utf8str(wstr);
00677 }
00678
00679 std::string utf8str_makeASCII(const std::string& utf8str)
00680 {
00681 LLWString wstr = utf8str_to_wstring(utf8str);
00682 LLWString::_makeASCII(wstr);
00683 return wstring_to_utf8str(wstr);
00684 }
00685
00686 std::string mbcsstring_makeASCII(const std::string& wstr)
00687 {
00688
00689 std::string out_str = wstr;
00690 for (S32 i = 0; i < (S32)out_str.length(); i++)
00691 {
00692 if ((U8)out_str[i] > 0x7f)
00693 {
00694 out_str[i] = LL_UNKNOWN_CHAR;
00695 }
00696 }
00697 return out_str;
00698 }
// Returns a copy of `utf8str` with every carriage-return byte (13) removed.
// Despite the name, line-feed bytes are kept, so "\r\n" becomes "\n".
std::string utf8str_removeCRLF(const std::string& utf8str)
{
    const char CR = 13;

    std::string out;
    out.reserve(utf8str.length());

    // Index with size_type: narrowing the length to a signed 32-bit count
    // would skip the loop for very large strings. An empty input simply
    // falls through and returns an empty result.
    const std::string::size_type len = utf8str.length();
    for (std::string::size_type i = 0; i < len; ++i)
    {
        if (utf8str[i] != CR)
        {
            out.push_back(utf8str[i]);
        }
    }
    return out;
}
00719
00720 #if LL_WINDOWS
00721
00722
00723
00724
00725
00726
00727
00728
00729
00730
00731
00732
00733
00734 int safe_snprintf(char *str, size_t size, const char *format, ...)
00735 {
00736 va_list args;
00737 va_start(args, format);
00738
00739 int num_written = _vsnprintf(str, size, format, args);
00740 va_end(args);
00741
00742 str[size-1] = '\0';
00743 return num_written;
00744 }
00745 #endif // LL_WINDOWS
00746
00747 S32 LLStringOps::collate(const llwchar* a, const llwchar* b)
00748 {
00749 #if LL_WINDOWS
00750
00751
00752 return strcmp(wstring_to_utf8str(LLWString(a)).c_str(), wstring_to_utf8str(LLWString(b)).c_str());
00753 #else
00754 return wcscoll(a, b);
00755 #endif
00756 }
00757
00758 namespace LLStringFn
00759 {
00760 void replace_nonprintable(std::basic_string<char>& string, char replacement)
00761 {
00762 const char MIN = 0x20;
00763 std::basic_string<char>::size_type len = string.size();
00764 for(std::basic_string<char>::size_type ii = 0; ii < len; ++ii)
00765 {
00766 if(string[ii] < MIN)
00767 {
00768 string[ii] = replacement;
00769 }
00770 }
00771 }
00772
00773 void replace_nonprintable(
00774 std::basic_string<llwchar>& string,
00775 llwchar replacement)
00776 {
00777 const llwchar MIN = 0x20;
00778 const llwchar MAX = 0x7f;
00779 std::basic_string<llwchar>::size_type len = string.size();
00780 for(std::basic_string<llwchar>::size_type ii = 0; ii < len; ++ii)
00781 {
00782 if((string[ii] < MIN) || (string[ii] > MAX))
00783 {
00784 string[ii] = replacement;
00785 }
00786 }
00787 }
00788
00789 void replace_nonprintable_and_pipe(std::basic_string<char>& str,
00790 char replacement)
00791 {
00792 const char MIN = 0x20;
00793 const char PIPE = 0x7c;
00794 std::basic_string<char>::size_type len = str.size();
00795 for(std::basic_string<char>::size_type ii = 0; ii < len; ++ii)
00796 {
00797 if( (str[ii] < MIN) || (str[ii] == PIPE) )
00798 {
00799 str[ii] = replacement;
00800 }
00801 }
00802 }
00803
00804 void replace_nonprintable_and_pipe(std::basic_string<llwchar>& str,
00805 llwchar replacement)
00806 {
00807 const llwchar MIN = 0x20;
00808 const llwchar MAX = 0x7f;
00809 const llwchar PIPE = 0x7c;
00810 std::basic_string<llwchar>::size_type len = str.size();
00811 for(std::basic_string<llwchar>::size_type ii = 0; ii < len; ++ii)
00812 {
00813 if( (str[ii] < MIN) || (str[ii] > MAX) || (str[ii] == PIPE) )
00814 {
00815 str[ii] = replacement;
00816 }
00817 }
00818 }
00819 }
00820
00821
00823
00824
00825 #ifdef _DEBUG
00826
00827 template<class T>
00828 void LLStringBase<T>::testHarness()
00829 {
00830 LLString s1;
00831
00832 llassert( s1.c_str() == NULL );
00833 llassert( s1.size() == 0 );
00834 llassert( s1.empty() );
00835
00836 LLString s2( "hello");
00837 llassert( !strcmp( s2.c_str(), "hello" ) );
00838 llassert( s2.size() == 5 );
00839 llassert( !s2.empty() );
00840 LLString s3( s2 );
00841
00842 llassert( "hello" == s2 );
00843 llassert( s2 == "hello" );
00844 llassert( s2 > "gello" );
00845 llassert( "gello" < s2 );
00846 llassert( "gello" != s2 );
00847 llassert( s2 != "gello" );
00848
00849 LLString s4 = s2;
00850 llassert( !s4.empty() );
00851 s4.empty();
00852 llassert( s4.empty() );
00853
00854 LLString s5("");
00855 llassert( s5.empty() );
00856
00857 llassert( isValidIndex(s5, 0) );
00858 llassert( !isValidIndex(s5, 1) );
00859
00860 s3 = s2;
00861 s4 = "hello again";
00862
00863 s4 += "!";
00864 s4 += s4;
00865 llassert( s4 == "hello again!hello again!" );
00866
00867
00868 LLString s6 = s2 + " " + s2;
00869 LLString s7 = s6;
00870 llassert( s6 == s7 );
00871 llassert( !( s6 != s7) );
00872 llassert( !(s6 < s7) );
00873 llassert( !(s6 > s7) );
00874
00875 llassert( !(s6 == "hi"));
00876 llassert( s6 == "hello hello");
00877 llassert( s6 < "hi");
00878
00879 llassert( s6[1] == 'e' );
00880 s6[1] = 'f';
00881 llassert( s6[1] == 'f' );
00882
00883 s2.erase( 4, 1 );
00884 llassert( s2 == "hell");
00885 s2.insert( 0, 'y' );
00886 llassert( s2 == "yhell");
00887 s2.erase( 1, 3 );
00888 llassert( s2 == "yl");
00889 s2.insert( 1, "awn, don't yel");
00890 llassert( s2 == "yawn, don't yell");
00891
00892 LLString s8 = s2.substr( 6, 5 );
00893 llassert( s8 == "don't" );
00894
00895 LLString s9 = " \t\ntest \t\t\n ";
00896 trim(s9);
00897 llassert( s9 == "test" );
00898
00899 s8 = "abc123&*(ABC";
00900
00901 s9 = s8;
00902 toUpper(s9);
00903 llassert( s9 == "ABC123&*(ABC" );
00904
00905 s9 = s8;
00906 toLower(s9);
00907 llassert( s9 == "abc123&*(abc" );
00908
00909
00910 LLString s10( 10, 'x' );
00911 llassert( s10 == "xxxxxxxxxx" );
00912
00913 LLString s11( "monkey in the middle", 7, 2 );
00914 llassert( s11 == "in" );
00915
00916 LLString s12;
00917 s12 += "foo";
00918 llassert( s12 == "foo" );
00919
00920 LLString s13;
00921 s13 += 'f';
00922 llassert( s13 == "f" );
00923 }
00924
00925
00926 #endif // _DEBUG