00001 
00032 #include "linden_common.h"
00033 
00034 #include "llstring.h"
00035 #include "llerror.h"
00036 
// Build a std::string from a C string that is allowed to be null.
// A null pointer yields an empty string instead of invoking the
// std::string(const char*) constructor on it.
std::string ll_safe_string(const char* in)
{
	return in ? std::string(in) : std::string();
}
00042 
00043 U8 hex_as_nybble(char hex)
00044 {
00045         if((hex >= '0') && (hex <= '9'))
00046         {
00047                 return (U8)(hex - '0');
00048         }
00049         else if((hex >= 'a') && (hex <='f'))
00050         {
00051                 return (U8)(10 + hex - 'a');
00052         }
00053         else if((hex >= 'A') && (hex <='F'))
00054         {
00055                 return (U8)(10 + hex - 'A');
00056         }
00057         return 0; 
00058 }
00059 
00060 
00061 bool _read_file_into_string(std::string& str, const char* filename)
00062 {
00063         llifstream ifs(filename, llifstream::binary);
00064         if (!ifs.is_open())
00065         {
00066                 llinfos << "Unable to open file" << filename << llendl;
00067                 return false;
00068         }
00069 
00070         std::ostringstream oss;
00071 
00072         oss << ifs.rdbuf();
00073         str = oss.str();
00074         ifs.close();
00075         return true;
00076 }
00077 
00078 
00079 
00080 
00081 
00082 
00083 
00084 
00085 
00086 std::ostream& operator<<(std::ostream &s, const LLWString &wstr)
00087 {
00088         std::string utf8_str = wstring_to_utf8str(wstr);
00089         s << utf8_str;
00090         return s;
00091 }
00092 
00093 std::string rawstr_to_utf8(const std::string& raw)
00094 {
00095         LLWString wstr(utf8str_to_wstring(raw));
00096         return wstring_to_utf8str(wstr);
00097 }
00098 
00099 S32 wchar_to_utf8chars(llwchar in_char, char* outchars)
00100 {
00101         U32 cur_char = (U32)in_char;
00102         char* base = outchars;
00103         if (cur_char < 0x80)
00104         {
00105                 *outchars++ = (U8)cur_char;
00106         }
00107         else if (cur_char < 0x800)
00108         {
00109                 *outchars++ = 0xC0 | (cur_char >> 6);
00110                 *outchars++ = 0x80 | (cur_char & 0x3F);
00111         }
00112         else if (cur_char < 0x10000)
00113         {
00114                 *outchars++ = 0xE0 | (cur_char >> 12);
00115                 *outchars++ = 0x80 | ((cur_char >> 6) & 0x3F);
00116                 *outchars++ = 0x80 | (cur_char & 0x3F);
00117         }
00118         else if (cur_char < 0x200000)
00119         {
00120                 *outchars++ = 0xF0 | (cur_char >> 18);
00121                 *outchars++ = 0x80 | ((cur_char >> 12) & 0x3F);
00122                 *outchars++ = 0x80 | ((cur_char >> 6) & 0x3F);
00123                 *outchars++ = 0x80 | cur_char & 0x3F;
00124         }
00125         else if (cur_char < 0x4000000)
00126         {
00127                 *outchars++ = 0xF8 | (cur_char >> 24);
00128                 *outchars++ = 0x80 | ((cur_char >> 18) & 0x3F);
00129                 *outchars++ = 0x80 | ((cur_char >> 12) & 0x3F);
00130                 *outchars++ = 0x80 | ((cur_char >> 6) & 0x3F);
00131                 *outchars++ = 0x80 | cur_char & 0x3F;
00132         }
00133         else if (cur_char < 0x80000000)
00134         {
00135                 *outchars++ = 0xFC | (cur_char >> 30);
00136                 *outchars++ = 0x80 | ((cur_char >> 24) & 0x3F);
00137                 *outchars++ = 0x80 | ((cur_char >> 18) & 0x3F);
00138                 *outchars++ = 0x80 | ((cur_char >> 12) & 0x3F);
00139                 *outchars++ = 0x80 | ((cur_char >> 6) & 0x3F);
00140                 *outchars++ = 0x80 | cur_char & 0x3F;
00141         }
00142         else
00143         {
00144                 llwarns << "Invalid Unicode character " << cur_char << "!" << llendl;
00145                 *outchars++ = LL_UNKNOWN_CHAR;
00146         }
00147         return outchars - base;
00148 }       
00149 
00150 S32 utf16chars_to_wchar(const U16* inchars, llwchar* outchar)
00151 {
00152         const U16* base = inchars;
00153         U16 cur_char = *inchars++;
00154         llwchar char32 = cur_char;
00155         if ((cur_char >= 0xD800) && (cur_char <= 0xDFFF))
00156         {
00157                 
00158                 char32 = ((llwchar)(cur_char - 0xD800)) << 10;
00159                 cur_char = *inchars++;
00160                 char32 += (llwchar)(cur_char - 0xDC00) + 0x0010000UL;
00161         }
00162         else
00163         {
00164                 char32 = (llwchar)cur_char;
00165         }
00166         *outchar = char32;
00167         return inchars - base;
00168 }
00169 
00170 S32 utf16chars_to_utf8chars(const U16* inchars, char* outchars, S32* nchars8p)
00171 {
00172         
00173         llwchar char32;
00174         S32 nchars16 = utf16chars_to_wchar(inchars, &char32);
00175         
00176         S32 nchars8  = wchar_to_utf8chars(char32, outchars);
00177         if (nchars8p)
00178         {
00179                 *nchars8p = nchars8;
00180         }
00181         return nchars16;
00182 }
00183 
00184 llutf16string wstring_to_utf16str(const LLWString &utf32str, S32 len)
00185 {
00186         llutf16string out;
00187 
00188         S32 i = 0;
00189         while (i < len)
00190         {
00191                 U32 cur_char = utf32str[i];
00192                 if (cur_char > 0xFFFF)
00193                 {
00194                         out += (0xD7C0 + (cur_char >> 10));
00195                         out += (0xDC00 | (cur_char & 0x3FF));
00196                 }
00197                 else
00198                 {
00199                         out += cur_char;
00200                 }
00201                 i++;
00202         }
00203         return out;
00204 }
00205 
00206 llutf16string wstring_to_utf16str(const LLWString &utf32str)
00207 {
00208         const S32 len = (S32)utf32str.length();
00209         return wstring_to_utf16str(utf32str, len);
00210 }
00211 
00212 llutf16string utf8str_to_utf16str ( const LLString& utf8str )
00213 {
00214         LLWString wstr = utf8str_to_wstring ( utf8str );
00215         return wstring_to_utf16str ( wstr );
00216 }
00217 
00218 
00219 LLWString utf16str_to_wstring(const llutf16string &utf16str, S32 len)
00220 {
00221         LLWString wout;
00222         if((len <= 0) || utf16str.empty()) return wout;
00223 
00224         S32 i = 0;
00225         
00226         const U16* chars16 = &(*(utf16str.begin()));
00227         while (i < len)
00228         {
00229                 llwchar cur_char;
00230                 i += utf16chars_to_wchar(chars16+i, &cur_char);
00231                 wout += cur_char;
00232         }
00233         return wout;
00234 }
00235 
00236 LLWString utf16str_to_wstring(const llutf16string &utf16str)
00237 {
00238         const S32 len = (S32)utf16str.length();
00239         return utf16str_to_wstring(utf16str, len);
00240 }
00241 
00242 S32 wchar_utf8_length(const llwchar wc)
00243 {
00244         if (wc < 0x80)
00245         {
00246                 
00247                 
00248                 return 1;
00249         }
00250         else if (wc < 0x800)
00251         {
00252                 return 2;
00253         }
00254         else if (wc < 0x10000)
00255         {
00256                 return 3;
00257         }
00258         else if (wc < 0x200000)
00259         {
00260                 return 4;
00261         }
00262         else if (wc < 0x4000000)
00263         {
00264                 return 5;
00265         }
00266         else
00267         {
00268                 return 6;
00269         }
00270 }
00271 
00272 
00273 S32 wstring_utf8_length(const LLWString& wstr)
00274 {
00275         S32 len = 0;
00276         for (S32 i = 0; i < (S32)wstr.length(); i++)
00277         {
00278                 len += wchar_utf8_length(wstr[i]);
00279         }
00280         return len;
00281 }
00282 
00283 
00284 LLWString utf8str_to_wstring(const std::string& utf8str, S32 len)
00285 {
00286         LLWString wout;
00287 
00288         S32 i = 0;
00289         while (i < len)
00290         {
00291                 llwchar unichar;
00292                 U8 cur_char = utf8str[i];
00293 
00294                 if (cur_char < 0x80)
00295                 {
00296                         
00297                         unichar = cur_char;
00298                 }
00299                 else
00300                 {
00301                         S32 cont_bytes = 0;
00302                         if ((cur_char >> 5) == 0x6)                     
00303                         {
00304                                 unichar = (0x1F&cur_char);
00305                                 cont_bytes = 1;
00306                         }
00307                         else if ((cur_char >> 4) == 0xe)        
00308                         {
00309                                 unichar = (0x0F&cur_char);
00310                                 cont_bytes = 2;
00311                         }
00312                         else if ((cur_char >> 3) == 0x1e)       
00313                         {
00314                                 unichar = (0x07&cur_char);
00315                                 cont_bytes = 3;
00316                         }
00317                         else if ((cur_char >> 2) == 0x3e)       
00318                         {
00319                                 unichar = (0x03&cur_char);
00320                                 cont_bytes = 4;
00321                         }
00322                         else if ((cur_char >> 1) == 0x7e)       
00323                         {
00324                                 unichar = (0x01&cur_char);
00325                                 cont_bytes = 5;
00326                         }
00327                         else
00328                         {
00329                                 wout += LL_UNKNOWN_CHAR;
00330                                 ++i;
00331                                 continue;
00332                         }
00333 
00334                         
00335                         S32 end = (len < (i + cont_bytes)) ? len : (i + cont_bytes);
00336                         do
00337                         {
00338                                 ++i;
00339 
00340                                 cur_char = utf8str[i];
00341                                 if ( (cur_char >> 6) == 0x2 )
00342                                 {
00343                                         unichar <<= 6;
00344                                         unichar += (0x3F&cur_char);
00345                                 }
00346                                 else
00347                                 {
00348                                         
00349                                         unichar = LL_UNKNOWN_CHAR;
00350                                         --i;
00351                                         break;
00352                                 }
00353                         } while(i < end);
00354 
00355                         
00356                         if ( ((cont_bytes == 1) && (unichar < 0x80))
00357                                 || ((cont_bytes == 2) && (unichar < 0x800))
00358                                 || ((cont_bytes == 3) && (unichar < 0x10000))
00359                                 || ((cont_bytes == 4) && (unichar < 0x200000))
00360                                 || ((cont_bytes == 5) && (unichar < 0x4000000)) )
00361                         {
00362                                 unichar = LL_UNKNOWN_CHAR;
00363                         }
00364                 }
00365 
00366                 wout += unichar;
00367                 ++i;
00368         }
00369         return wout;
00370 }
00371 
00372 LLWString utf8str_to_wstring(const std::string& utf8str)
00373 {
00374         const S32 len = (S32)utf8str.length();
00375         return utf8str_to_wstring(utf8str, len);
00376 }
00377 
00378 std::string wstring_to_utf8str(const LLWString& utf32str, S32 len)
00379 {
00380         std::string out;
00381 
00382         S32 i = 0;
00383         while (i < len)
00384         {
00385                 char tchars[8];         
00386                 S32 n = wchar_to_utf8chars(utf32str[i], tchars);
00387                 tchars[n] = 0;
00388                 out += tchars;
00389                 i++;
00390         }
00391         return out;
00392 }
00393 
00394 std::string wstring_to_utf8str(const LLWString& utf32str)
00395 {
00396         const S32 len = (S32)utf32str.length();
00397         return wstring_to_utf8str(utf32str, len);
00398 }
00399 
00400 std::string utf16str_to_utf8str(const llutf16string& utf16str)
00401 {
00402         return wstring_to_utf8str(utf16str_to_wstring(utf16str));
00403 }
00404 
00405 std::string utf16str_to_utf8str(const llutf16string& utf16str, S32 len)
00406 {
00407         return wstring_to_utf8str(utf16str_to_wstring(utf16str, len), len);
00408 }
00409 
00410 
00411 
00412 
00413 
00414 
00415 
00416 
00417 
00418 
00419 
00420 
00421 
00422 
00423 
00424 
00425 
00426 
00427 
00428 
00429 
00430 
00431 
00432 
00433 
00434 
00435 
00436 
00437 
00438 
00439 
00440 
00441 
00442 
00443 
00444 
00445 
00446 
00447 
00448 
00449 
00450 
00451 
00452 
00453 
00454 
00455 
00456 
00457 
00458 
00459 
00460 
00461 
00462 
00463 
00464 
00465 
00466 
00467 
00468 
00469 
00470 
00471 
00472 
00473 
00474 
00475 
00476 
00477 
00478 
00479 
00480 
00481 
00482 
00483 
00484 
00485 
00486 
00487 
00488 
00489 
00490 
00491 
00492 
00493 
00494 
00495 
00496 
00497 
00498 
00499 
00500 
00501 
00502 
00503 
00504 
00505 
00506 
00507 
00508 
00509 
00510 
00511 
00512 
00513 
00514 
00515 
00516 
00517 
00518 
00519 
00520 
00521 
00522 
00523 
00524 
00525 
00526 
00527 
00528 
00529 
00530 
00531 
00532 
00533 
00534 
00535 
00536 
00537 
00538 
00539 
00540 
00541 
00542 
00543 
00544 
00545 
00546 
00547 
00548 
00549 
00550 
00551 
00552 
00553 
00554 
00555 
00556 
00557 
00558 
00559 
00560 
00561 
00562 
00563 
00564 
00565 
00566 
00567 
00568 
00569 
00570 
00571 
00572 
00573 
00574 
00575 
00576 
00577 
00578 
00579 
00580 
00581 
00582 
00583 
00584 
00585 
00586 
00587 
00588 
00589 
00590 
00591 
00592 
00593 
00594 
00595 
00596 
00597 
00598 
00599 
00600 
00601 
00602 
00603 
00604 
00605 
00606 
00607 
00608 
00609 
00610 std::string utf8str_trim(const std::string& utf8str)
00611 {
00612         LLWString wstr = utf8str_to_wstring(utf8str);
00613         LLWString::trim(wstr);
00614         return wstring_to_utf8str(wstr);
00615 }
00616 
00617 
00618 std::string utf8str_tolower(const std::string& utf8str)
00619 {
00620         LLWString out_str = utf8str_to_wstring(utf8str);
00621         LLWString::toLower(out_str);
00622         return wstring_to_utf8str(out_str);
00623 }
00624 
00625 
00626 S32 utf8str_compare_insensitive(const std::string& lhs, const std::string& rhs)
00627 {
00628         LLWString wlhs = utf8str_to_wstring(lhs);
00629         LLWString wrhs = utf8str_to_wstring(rhs);
00630         return LLWString::compareInsensitive(wlhs.c_str(), wrhs.c_str());
00631 }
00632 
00633 std::string utf8str_truncate(const std::string& utf8str, const S32 max_len)
00634 {
00635         if (0 == max_len)
00636         {
00637                 return std::string();
00638         }
00639         if ((S32)utf8str.length() <= max_len)
00640         {
00641                 return utf8str;
00642         }
00643         else
00644         {
00645                 S32 cur_char = max_len;
00646 
00647                 
00648                 if ((U8)utf8str[cur_char] > 0x7f)
00649                 {
00650                         
00651                         
00652                         while (0x80 == (0xc0 & utf8str[cur_char]))
00653                         {
00654                                 cur_char--;
00655                                 
00656                                 if (cur_char == 0)
00657                                 {
00658                                         
00659                                         break;
00660                                 }
00661                         }
00662                 }
00663                 
00664                 return utf8str.substr(0, cur_char);
00665         }
00666 }
00667 
00668 std::string utf8str_substChar(
00669         const std::string& utf8str,
00670         const llwchar target_char,
00671         const llwchar replace_char)
00672 {
00673         LLWString wstr = utf8str_to_wstring(utf8str);
00674         LLWString::replaceChar(wstr, target_char, replace_char);
00675         
00676         return wstring_to_utf8str(wstr);
00677 }
00678 
00679 std::string utf8str_makeASCII(const std::string& utf8str)
00680 {
00681         LLWString wstr = utf8str_to_wstring(utf8str);
00682         LLWString::_makeASCII(wstr);
00683         return wstring_to_utf8str(wstr);
00684 }
00685 
00686 std::string mbcsstring_makeASCII(const std::string& wstr)
00687 {
00688         
00689         std::string out_str = wstr;
00690         for (S32 i = 0; i < (S32)out_str.length(); i++)
00691         {
00692                 if ((U8)out_str[i] > 0x7f)
00693                 {
00694                         out_str[i] = LL_UNKNOWN_CHAR;
00695                 }
00696         }
00697         return out_str;
00698 }
// Return a copy of a string with every carriage-return byte (13) removed.
// Despite the name, line feeds are left in place.
std::string utf8str_removeCRLF(const std::string& utf8str)
{
	const char CR = 13;

	std::string stripped;
	if (utf8str.empty())
	{
		return stripped;
	}

	// The result can never be longer than the input.
	stripped.reserve(utf8str.length());
	for (std::string::const_iterator it = utf8str.begin(); it != utf8str.end(); ++it)
	{
		if (*it == CR)
		{
			continue;
		}
		stripped.push_back(*it);
	}
	return stripped;
}
00719 
00720 #if LL_WINDOWS
00721 
00722 
00723 
00724 
00725 
00726 
00727 
00728 
00729 
00730 
00731 
00732 
00733 
00734 int safe_snprintf(char *str, size_t size, const char *format, ...)
00735 {
00736         va_list args;
00737         va_start(args, format);
00738 
00739         int num_written = _vsnprintf(str, size, format, args); 
00740         va_end(args);
00741         
00742         str[size-1] = '\0'; 
00743         return num_written;
00744 }
00745 #endif // LL_WINDOWS
00746 
00747 S32     LLStringOps::collate(const llwchar* a, const llwchar* b)
00748 { 
00749         #if LL_WINDOWS
00750                 
00751                 
00752                 return strcmp(wstring_to_utf8str(LLWString(a)).c_str(), wstring_to_utf8str(LLWString(b)).c_str());
00753         #else
00754                 return wcscoll(a, b);
00755         #endif
00756 }
00757 
00758 namespace LLStringFn
00759 {
00760         void replace_nonprintable(std::basic_string<char>& string, char replacement)
00761         {
00762                 const char MIN = 0x20;
00763                 std::basic_string<char>::size_type len = string.size();
00764                 for(std::basic_string<char>::size_type ii = 0; ii < len; ++ii)
00765                 {
00766                         if(string[ii] < MIN)
00767                         {
00768                                 string[ii] = replacement;
00769                         }
00770                 }
00771         }
00772 
00773         void replace_nonprintable(
00774                 std::basic_string<llwchar>& string,
00775                 llwchar replacement)
00776         {
00777                 const llwchar MIN = 0x20;
00778                 const llwchar MAX = 0x7f;
00779                 std::basic_string<llwchar>::size_type len = string.size();
00780                 for(std::basic_string<llwchar>::size_type ii = 0; ii < len; ++ii)
00781                 {
00782                         if((string[ii] < MIN) || (string[ii] > MAX))
00783                         {
00784                                 string[ii] = replacement;
00785                         }
00786                 }
00787         }
00788 
00789         void replace_nonprintable_and_pipe(std::basic_string<char>& str,
00790                                                                            char replacement)
00791         {
00792                 const char MIN  = 0x20;
00793                 const char PIPE = 0x7c;
00794                 std::basic_string<char>::size_type len = str.size();
00795                 for(std::basic_string<char>::size_type ii = 0; ii < len; ++ii)
00796                 {
00797                         if( (str[ii] < MIN) || (str[ii] == PIPE) )
00798                         {
00799                                 str[ii] = replacement;
00800                         }
00801                 }
00802         }
00803 
00804         void replace_nonprintable_and_pipe(std::basic_string<llwchar>& str,
00805                                                                            llwchar replacement)
00806         {
00807                 const llwchar MIN  = 0x20;
00808                 const llwchar MAX  = 0x7f;
00809                 const llwchar PIPE = 0x7c;
00810                 std::basic_string<llwchar>::size_type len = str.size();
00811                 for(std::basic_string<llwchar>::size_type ii = 0; ii < len; ++ii)
00812                 {
00813                         if( (str[ii] < MIN) || (str[ii] > MAX) || (str[ii] == PIPE) )
00814                         {
00815                                 str[ii] = replacement;
00816                         }
00817                 }
00818         }
00819 }
00820 
00821 
00823 
00824 
00825 #ifdef _DEBUG
00826 
// Debug-only self-test for LLString (compiled only when _DEBUG is defined).
// Runs a fixed sequence of llassert() checks covering construction,
// comparison operators, assignment and concatenation, indexing,
// erase/insert/substr, trim/toUpper/toLower and the fill and substring
// constructors. It takes no input and returns nothing; failures surface
// only through llassert.
// NOTE(review): several checks below assume an LLString API that differs
// from std::basic_string (a NULL c_str() for a default-constructed string,
// empty() clearing the string, insert(pos, char)) - confirm against the
// current LLStringBase declaration before relying on this harness.
00827 template<class T> 
00828 void LLStringBase<T>::testHarness()
00829 {
        // --- default construction ---
00830         LLString s1;
00831         
        // NOTE(review): expects c_str() of a default-constructed string to be NULL - confirm.
00832         llassert( s1.c_str() == NULL );
00833         llassert( s1.size() == 0 );
00834         llassert( s1.empty() );
00835         
        // --- construction from a C string and copy construction ---
00836         LLString s2( "hello");
00837         llassert( !strcmp( s2.c_str(), "hello" ) );
00838         llassert( s2.size() == 5 ); 
00839         llassert( !s2.empty() );
00840         LLString s3( s2 );
00841 
        // --- comparisons against string literals, in both operand orders ---
00842         llassert( "hello" == s2 );
00843         llassert( s2 == "hello" );
00844         llassert( s2 > "gello" );
00845         llassert( "gello" < s2 );
00846         llassert( "gello" != s2 );
00847         llassert( s2 != "gello" );
00848 
00849         LLString s4 = s2;
00850         llassert( !s4.empty() );
        // NOTE(review): the result of empty() is discarded here, yet the next
        // assert expects s4 to be empty - presumably empty() was once a
        // mutator; verify.
00851         s4.empty();
00852         llassert( s4.empty() );
00853         
00854         LLString s5("");
00855         llassert( s5.empty() );
00856         
        // On an empty string, index 0 is reported as valid and index 1 is not.
00857         llassert( isValidIndex(s5, 0) );
00858         llassert( !isValidIndex(s5, 1) );
00859         
        // --- assignment and append, including appending a string to itself ---
00860         s3 = s2;
00861         s4 = "hello again";
00862         
00863         s4 += "!";
00864         s4 += s4;
00865         llassert( s4 == "hello again!hello again!" );
00866         
00867         
        // --- concatenation and comparison between two LLStrings ---
00868         LLString s6 = s2 + " " + s2;
00869         LLString s7 = s6;
00870         llassert( s6 == s7 );
00871         llassert( !( s6 != s7) );
00872         llassert( !(s6 < s7) );
00873         llassert( !(s6 > s7) );
00874         
00875         llassert( !(s6 == "hi"));
00876         llassert( s6 == "hello hello");
00877         llassert( s6 < "hi");
00878         
        // --- element read and write through operator[] ---
00879         llassert( s6[1] == 'e' );
00880         s6[1] = 'f';
00881         llassert( s6[1] == 'f' );
00882         
        // --- erase / insert / substr; each step depends on the previous value of s2 ---
00883         s2.erase( 4, 1 );
00884         llassert( s2 == "hell");
00885         s2.insert( 0, 'y' );
00886         llassert( s2 == "yhell");
00887         s2.erase( 1, 3 );
00888         llassert( s2 == "yl");
00889         s2.insert( 1, "awn, don't yel");
00890         llassert( s2 == "yawn, don't yell");
00891         
00892         LLString s8 = s2.substr( 6, 5 );
00893         llassert( s8 == "don't"  );
00894         
        // --- trim removes leading and trailing spaces, tabs and newlines ---
00895         LLString s9 = "   \t\ntest  \t\t\n  ";
00896         trim(s9);
00897         llassert( s9 == "test"  );
00898 
        // --- case conversion leaves digits and punctuation untouched ---
00899         s8 = "abc123&*(ABC";
00900 
00901         s9 = s8;
00902         toUpper(s9);
00903         llassert( s9 == "ABC123&*(ABC"  );
00904 
00905         s9 = s8;
00906         toLower(s9);
00907         llassert( s9 == "abc123&*(abc"  );
00908 
00909 
        // --- fill constructor and (source, offset, count) constructor ---
00910         LLString s10( 10, 'x' );
00911         llassert( s10 == "xxxxxxxxxx" );
00912 
00913         LLString s11( "monkey in the middle", 7, 2 );
00914         llassert( s11 == "in" );
00915 
        // --- appending to a default-constructed string ---
00916         LLString s12;  
00917         s12 += "foo";
00918         llassert( s12 == "foo" );
00919 
00920         LLString s13;  
00921         s13 += 'f';
00922         llassert( s13 == "f" );
00923 }
00924 
00925 
00926 #endif  // _DEBUG