SecondLife: llcommon/llstring.cpp Source File

00001 
00032 #include "linden_common.h"
00033 
00034 #include "llstring.h"
00035 #include "llerror.h"
00036 
00037 #if LL_WINDOWS
00038 #define WIN32_LEAN_AND_MEAN
00039 #include <winsock2.h>
00040 #include <windows.h>
00041 #include <winnls.h> // for WideCharToMultiByte
00042 #endif
00043 
// Build a std::string from a C string that may be NULL.
// A NULL pointer produces an empty string rather than being dereferenced.
std::string ll_safe_string(const char* in)
{
    return in ? std::string(in) : std::string();
}
00049 
00050 U8 hex_as_nybble(char hex)
00051 {
00052         if((hex >= '0') && (hex <= '9'))
00053         {
00054                 return (U8)(hex - '0');
00055         }
00056         else if((hex >= 'a') && (hex <='f'))
00057         {
00058                 return (U8)(10 + hex - 'a');
00059         }
00060         else if((hex >= 'A') && (hex <='F'))
00061         {
00062                 return (U8)(10 + hex - 'A');
00063         }
00064         return 0; // uh - oh, not hex any more...
00065 }
00066 
00067 
00068 bool _read_file_into_string(std::string& str, const char* filename)
00069 {
00070         llifstream ifs(filename, llifstream::binary);
00071         if (!ifs.is_open())
00072         {
00073                 llinfos << "Unable to open file" << filename << llendl;
00074                 return false;
00075         }
00076 
00077         std::ostringstream oss;
00078 
00079         oss << ifs.rdbuf();
00080         str = oss.str();
00081         ifs.close();
00082         return true;
00083 }
00084 
00085 
00086 
00087 
00088 // See http://www.unicode.org/Public/BETA/CVTUTF-1-2/ConvertUTF.c
00089 // for the Unicode implementation - this doesn't match because it was written before finding
00090 // it.
00091 
00092 
00093 std::ostream& operator<<(std::ostream &s, const LLWString &wstr)
00094 {
00095         std::string utf8_str = wstring_to_utf8str(wstr);
00096         s << utf8_str;
00097         return s;
00098 }
00099 
00100 std::string rawstr_to_utf8(const std::string& raw)
00101 {
00102         LLWString wstr(utf8str_to_wstring(raw));
00103         return wstring_to_utf8str(wstr);
00104 }
00105 
00106 S32 wchar_to_utf8chars(llwchar in_char, char* outchars)
00107 {
00108         U32 cur_char = (U32)in_char;
00109         char* base = outchars;
00110         if (cur_char < 0x80)
00111         {
00112                 *outchars++ = (U8)cur_char;
00113         }
00114         else if (cur_char < 0x800)
00115         {
00116                 *outchars++ = 0xC0 | (cur_char >> 6);
00117                 *outchars++ = 0x80 | (cur_char & 0x3F);
00118         }
00119         else if (cur_char < 0x10000)
00120         {
00121                 *outchars++ = 0xE0 | (cur_char >> 12);
00122                 *outchars++ = 0x80 | ((cur_char >> 6) & 0x3F);
00123                 *outchars++ = 0x80 | (cur_char & 0x3F);
00124         }
00125         else if (cur_char < 0x200000)
00126         {
00127                 *outchars++ = 0xF0 | (cur_char >> 18);
00128                 *outchars++ = 0x80 | ((cur_char >> 12) & 0x3F);
00129                 *outchars++ = 0x80 | ((cur_char >> 6) & 0x3F);
00130                 *outchars++ = 0x80 | cur_char & 0x3F;
00131         }
00132         else if (cur_char < 0x4000000)
00133         {
00134                 *outchars++ = 0xF8 | (cur_char >> 24);
00135                 *outchars++ = 0x80 | ((cur_char >> 18) & 0x3F);
00136                 *outchars++ = 0x80 | ((cur_char >> 12) & 0x3F);
00137                 *outchars++ = 0x80 | ((cur_char >> 6) & 0x3F);
00138                 *outchars++ = 0x80 | cur_char & 0x3F;
00139         }
00140         else if (cur_char < 0x80000000)
00141         {
00142                 *outchars++ = 0xFC | (cur_char >> 30);
00143                 *outchars++ = 0x80 | ((cur_char >> 24) & 0x3F);
00144                 *outchars++ = 0x80 | ((cur_char >> 18) & 0x3F);
00145                 *outchars++ = 0x80 | ((cur_char >> 12) & 0x3F);
00146                 *outchars++ = 0x80 | ((cur_char >> 6) & 0x3F);
00147                 *outchars++ = 0x80 | cur_char & 0x3F;
00148         }
00149         else
00150         {
00151                 llwarns << "Invalid Unicode character " << cur_char << "!" << llendl;
00152                 *outchars++ = LL_UNKNOWN_CHAR;
00153         }
00154         return outchars - base;
00155 }       
00156 
00157 S32 utf16chars_to_wchar(const U16* inchars, llwchar* outchar)
00158 {
00159         const U16* base = inchars;
00160         U16 cur_char = *inchars++;
00161         llwchar char32 = cur_char;
00162         if ((cur_char >= 0xD800) && (cur_char <= 0xDFFF))
00163         {
00164                 // Surrogates
00165                 char32 = ((llwchar)(cur_char - 0xD800)) << 10;
00166                 cur_char = *inchars++;
00167                 char32 += (llwchar)(cur_char - 0xDC00) + 0x0010000UL;
00168         }
00169         else
00170         {
00171                 char32 = (llwchar)cur_char;
00172         }
00173         *outchar = char32;
00174         return inchars - base;
00175 }
00176 
00177 S32 utf16chars_to_utf8chars(const U16* inchars, char* outchars, S32* nchars8p)
00178 {
00179         // Get 32 bit char32
00180         llwchar char32;
00181         S32 nchars16 = utf16chars_to_wchar(inchars, &char32);
00182         // Convert to utf8
00183         S32 nchars8  = wchar_to_utf8chars(char32, outchars);
00184         if (nchars8p)
00185         {
00186                 *nchars8p = nchars8;
00187         }
00188         return nchars16;
00189 }
00190 
00191 llutf16string wstring_to_utf16str(const LLWString &utf32str, S32 len)
00192 {
00193         llutf16string out;
00194 
00195         S32 i = 0;
00196         while (i < len)
00197         {
00198                 U32 cur_char = utf32str[i];
00199                 if (cur_char > 0xFFFF)
00200                 {
00201                         out += (0xD7C0 + (cur_char >> 10));
00202                         out += (0xDC00 | (cur_char & 0x3FF));
00203                 }
00204                 else
00205                 {
00206                         out += cur_char;
00207                 }
00208                 i++;
00209         }
00210         return out;
00211 }
00212 
00213 llutf16string wstring_to_utf16str(const LLWString &utf32str)
00214 {
00215         const S32 len = (S32)utf32str.length();
00216         return wstring_to_utf16str(utf32str, len);
00217 }
00218 
00219 llutf16string utf8str_to_utf16str ( const LLString& utf8str )
00220 {
00221         LLWString wstr = utf8str_to_wstring ( utf8str );
00222         return wstring_to_utf16str ( wstr );
00223 }
00224 
00225 
00226 LLWString utf16str_to_wstring(const llutf16string &utf16str, S32 len)
00227 {
00228         LLWString wout;
00229         if((len <= 0) || utf16str.empty()) return wout;
00230 
00231         S32 i = 0;
00232         // craziness to make gcc happy (llutf16string.c_str() is tweaked on linux):
00233         const U16* chars16 = &(*(utf16str.begin()));
00234         while (i < len)
00235         {
00236                 llwchar cur_char;
00237                 i += utf16chars_to_wchar(chars16+i, &cur_char);
00238                 wout += cur_char;
00239         }
00240         return wout;
00241 }
00242 
00243 LLWString utf16str_to_wstring(const llutf16string &utf16str)
00244 {
00245         const S32 len = (S32)utf16str.length();
00246         return utf16str_to_wstring(utf16str, len);
00247 }
00248 
00249 // Length in llwchar (UTF-32) of the first len units (16 bits) of the given UTF-16 string.
00250 S32 utf16str_wstring_length(const llutf16string &utf16str, const S32 utf16_len)
00251 {
00252         S32 surrogate_pairs = 0;
00253         // ... craziness to make gcc happy (llutf16string.c_str() is tweaked on linux):
00254         const U16 *const utf16_chars = &(*(utf16str.begin()));
00255         S32 i = 0;
00256         while (i < utf16_len)
00257         {
00258                 const U16 c = utf16_chars[i++];
00259                 if (c >= 0xD800 && c <= 0xDBFF)         // See http://en.wikipedia.org/wiki/UTF-16
00260                 {   // Have first byte of a surrogate pair
00261                         if (i >= utf16_len)
00262                         {
00263                                 break;
00264                         }
00265                         const U16 d = utf16_chars[i];
00266                         if (d >= 0xDC00 && d <= 0xDFFF)
00267                         {   // Have valid second byte of a surrogate pair
00268                                 surrogate_pairs++;
00269                                 i++;
00270                         }
00271                 }
00272         }
00273         return utf16_len - surrogate_pairs;
00274 }
00275 
00276 // Length in utf16string (UTF-16) of wlen wchars beginning at woffset.
00277 S32 wstring_utf16_length(const LLWString &wstr, const S32 woffset, const S32 wlen)
00278 {
00279         const S32 end = llmin((S32)wstr.length(), woffset + wlen);
00280         if (end < woffset)
00281         {
00282                 return 0;
00283         }
00284         else
00285         {
00286                 S32 length = end - woffset;
00287                 for (S32 i = woffset; i < end; i++)
00288                 {
00289                         if (wstr[i] >= 0x10000)
00290                         {
00291                                 length++;
00292                         }
00293                 }
00294                 return length;
00295         }
00296 }
00297 
00298 // Given a wstring and an offset in it, returns the length as wstring (i.e.,
00299 // number of llwchars) of the longest substring that starts at the offset
00300 // and whose equivalent utf-16 string does not exceeds the given utf16_length.
00301 S32 wstring_wstring_length_from_utf16_length(const LLWString & wstr, const S32 woffset, const S32 utf16_length, BOOL *unaligned)
00302 {
00303         const S32 end = wstr.length();
00304         BOOL u = FALSE;
00305         S32 n = woffset + utf16_length;
00306         S32 i = woffset;
00307         while (i < end)
00308         {
00309                 if (wstr[i] >= 0x10000)
00310                 {
00311                         --n;
00312                 }
00313                 if (i >= n)
00314                 {
00315                         u = (i > n);
00316                         break;
00317                 }
00318                 i++;
00319         }
00320         if (unaligned)
00321         {
00322                 *unaligned = u;
00323         }
00324         return i - woffset;
00325 }
00326 
00327 S32 wchar_utf8_length(const llwchar wc)
00328 {
00329         if (wc < 0x80)
00330         {
00331                 // This case will also catch negative values which are
00332                 // technically invalid.
00333                 return 1;
00334         }
00335         else if (wc < 0x800)
00336         {
00337                 return 2;
00338         }
00339         else if (wc < 0x10000)
00340         {
00341                 return 3;
00342         }
00343         else if (wc < 0x200000)
00344         {
00345                 return 4;
00346         }
00347         else if (wc < 0x4000000)
00348         {
00349                 return 5;
00350         }
00351         else
00352         {
00353                 return 6;
00354         }
00355 }
00356 
00357 
00358 S32 wstring_utf8_length(const LLWString& wstr)
00359 {
00360         S32 len = 0;
00361         for (S32 i = 0; i < (S32)wstr.length(); i++)
00362         {
00363                 len += wchar_utf8_length(wstr[i]);
00364         }
00365         return len;
00366 }
00367 
00368 
00369 LLWString utf8str_to_wstring(const std::string& utf8str, S32 len)
00370 {
00371         LLWString wout;
00372 
00373         S32 i = 0;
00374         while (i < len)
00375         {
00376                 llwchar unichar;
00377                 U8 cur_char = utf8str[i];
00378 
00379                 if (cur_char < 0x80)
00380                 {
00381                         // Ascii character, just add it
00382                         unichar = cur_char;
00383                 }
00384                 else
00385                 {
00386                         S32 cont_bytes = 0;
00387                         if ((cur_char >> 5) == 0x6)                     // Two byte UTF8 -> 1 UTF32
00388                         {
00389                                 unichar = (0x1F&cur_char);
00390                                 cont_bytes = 1;
00391                         }
00392                         else if ((cur_char >> 4) == 0xe)        // Three byte UTF8 -> 1 UTF32
00393                         {
00394                                 unichar = (0x0F&cur_char);
00395                                 cont_bytes = 2;
00396                         }
00397                         else if ((cur_char >> 3) == 0x1e)       // Four byte UTF8 -> 1 UTF32
00398                         {
00399                                 unichar = (0x07&cur_char);
00400                                 cont_bytes = 3;
00401                         }
00402                         else if ((cur_char >> 2) == 0x3e)       // Five byte UTF8 -> 1 UTF32
00403                         {
00404                                 unichar = (0x03&cur_char);
00405                                 cont_bytes = 4;
00406                         }
00407                         else if ((cur_char >> 1) == 0x7e)       // Six byte UTF8 -> 1 UTF32
00408                         {
00409                                 unichar = (0x01&cur_char);
00410                                 cont_bytes = 5;
00411                         }
00412                         else
00413                         {
00414                                 wout += LL_UNKNOWN_CHAR;
00415                                 ++i;
00416                                 continue;
00417                         }
00418 
00419                         // Check that this character doesn't go past the end of the string
00420                         S32 end = (len < (i + cont_bytes)) ? len : (i + cont_bytes);
00421                         do
00422                         {
00423                                 ++i;
00424 
00425                                 cur_char = utf8str[i];
00426                                 if ( (cur_char >> 6) == 0x2 )
00427                                 {
00428                                         unichar <<= 6;
00429                                         unichar += (0x3F&cur_char);
00430                                 }
00431                                 else
00432                                 {
00433                                         // Malformed sequence - roll back to look at this as a new char
00434                                         unichar = LL_UNKNOWN_CHAR;
00435                                         --i;
00436                                         break;
00437                                 }
00438                         } while(i < end);
00439 
00440                         // Handle overlong characters and NULL characters
00441                         if ( ((cont_bytes == 1) && (unichar < 0x80))
00442                                 || ((cont_bytes == 2) && (unichar < 0x800))
00443                                 || ((cont_bytes == 3) && (unichar < 0x10000))
00444                                 || ((cont_bytes == 4) && (unichar < 0x200000))
00445                                 || ((cont_bytes == 5) && (unichar < 0x4000000)) )
00446                         {
00447                                 unichar = LL_UNKNOWN_CHAR;
00448                         }
00449                 }
00450 
00451                 wout += unichar;
00452                 ++i;
00453         }
00454         return wout;
00455 }
00456 
00457 LLWString utf8str_to_wstring(const std::string& utf8str)
00458 {
00459         const S32 len = (S32)utf8str.length();
00460         return utf8str_to_wstring(utf8str, len);
00461 }
00462 
00463 std::string wstring_to_utf8str(const LLWString& utf32str, S32 len)
00464 {
00465         std::string out;
00466 
00467         S32 i = 0;
00468         while (i < len)
00469         {
00470                 char tchars[8];         /* Flawfinder: ignore */
00471                 S32 n = wchar_to_utf8chars(utf32str[i], tchars);
00472                 tchars[n] = 0;
00473                 out += tchars;
00474                 i++;
00475         }
00476         return out;
00477 }
00478 
00479 std::string wstring_to_utf8str(const LLWString& utf32str)
00480 {
00481         const S32 len = (S32)utf32str.length();
00482         return wstring_to_utf8str(utf32str, len);
00483 }
00484 
00485 std::string utf16str_to_utf8str(const llutf16string& utf16str)
00486 {
00487         return wstring_to_utf8str(utf16str_to_wstring(utf16str));
00488 }
00489 
00490 std::string utf16str_to_utf8str(const llutf16string& utf16str, S32 len)
00491 {
00492         return wstring_to_utf8str(utf16str_to_wstring(utf16str, len), len);
00493 }
00494 
00495 
00496 //LLWString wstring_truncate(const LLWString &wstr, const S32 max_len)
00497 //{
00498 //      return wstr.substr(0, llmin((S32)wstr.length(), max_len));
00499 //}
00500 //
00501 //
00502 //LLWString wstring_trim(const LLWString &wstr)
00503 //{
00504 //      LLWString outstr;
00505 //      outstr = wstring_trimhead(wstr);
00506 //      outstr = wstring_trimtail(outstr);
00507 //      return outstr;
00508 //}
00509 //
00510 //
00511 //LLWString wstring_trimhead(const LLWString &wstr)
00512 //{
00513 //      if(wstr.empty())
00514 //      {
00515 //              return wstr;
00516 //      }
00517 //
00518 //    S32 i = 0;
00519 //      while((i < (S32)wstr.length()) && iswspace(wstr[i]))
00520 //      {
00521 //              i++;
00522 //      }
00523 //      return wstr.substr(i, wstr.length() - i);
00524 //}
00525 //
00526 //
00527 //LLWString wstring_trimtail(const LLWString &wstr)
00528 //{                     
00529 //      if(wstr.empty())
00530 //      {
00531 //              return wstr;
00532 //      }
00533 //
00534 //      S32 len = (S32)wstr.length();
00535 //
00536 //      S32 i = len - 1;
00537 //      while (i >= 0 && iswspace(wstr[i]))
00538 //      {
00539 //              i--;
00540 //      }
00541 //
00542 //      if (i >= 0)
00543 //      {
00544 //              return wstr.substr(0, i + 1);
00545 //      }
00546 //      return wstr;
00547 //}
00548 //
00549 //
00550 //LLWString wstring_copyinto(const LLWString &dest, const LLWString &src, const S32 insert_offset)
00551 //{
00552 //      llassert( insert_offset <= (S32)dest.length() );
00553 //
00554 //      LLWString out_str = dest.substr(0, insert_offset);
00555 //      out_str += src;
00556 //      LLWString tail = dest.substr(insert_offset);
00557 //      out_str += tail;
00558 //
00559 //      return out_str;
00560 //}
00561 
00562 
00563 //LLWString wstring_detabify(const LLWString &wstr, const S32 num_spaces)
00564 //{
00565 //      LLWString out_str;
00566 //      // Replace tabs with spaces
00567 //      for (S32 i = 0; i < (S32)wstr.length(); i++)
00568 //      {
00569 //              if (wstr[i] == '\t')
00570 //              {
00571 //                      for (S32 j = 0; j < num_spaces; j++)
00572 //                              out_str += ' ';
00573 //              }
00574 //              else
00575 //              {
00576 //                      out_str += wstr[i];
00577 //              }
00578 //      }
00579 //      return out_str;
00580 //}
00581 
00582 
00583 //LLWString wstring_makeASCII(const LLWString &wstr)
00584 //{
00585 //      // Replace non-ASCII chars with replace_char
00586 //      LLWString out_str = wstr;
00587 //      for (S32 i = 0; i < (S32)out_str.length(); i++)
00588 //      {
00589 //              if (out_str[i] > 0x7f)
00590 //              {
00591 //                      out_str[i] = LL_UNKNOWN_CHAR;
00592 //              }
00593 //      }
00594 //      return out_str;
00595 //}
00596 
00597 
00598 //LLWString wstring_substChar(const LLWString &wstr, const llwchar target_char, const llwchar replace_char)
00599 //{
00600 //      // Replace all occurrences of target_char with replace_char
00601 //      LLWString out_str = wstr;
00602 //      for (S32 i = 0; i < (S32)out_str.length(); i++)
00603 //      {
00604 //              if (out_str[i] == target_char)
00605 //              {
00606 //                      out_str[i] = replace_char;
00607 //              }
00608 //      }
00609 //      return out_str;
00610 //}
00611 //
00612 //
00613 //LLWString wstring_tolower(const LLWString &wstr)
00614 //{
00615 //      LLWString out_str = wstr;
00616 //      for (S32 i = 0; i < (S32)out_str.length(); i++)
00617 //      {
00618 //              out_str[i] = towlower(out_str[i]);
00619 //      }
00620 //      return out_str;
00621 //}
00622 //
00623 //
00624 //LLWString wstring_convert_to_lf(const LLWString &wstr)
00625 //{
00626 //      const llwchar CR = 13;
00627 //      // Remove carriage returns from string with CRLF
00628 //      LLWString out_str;
00629 //
00630 //      for (S32 i = 0; i < (S32)wstr.length(); i++)
00631 //      {
00632 //              if (wstr[i] != CR)
00633 //              {
00634 //                      out_str += wstr[i];
00635 //              }
00636 //      }
00637 //      return out_str;
00638 //}
00639 //
00640 //
00641 //LLWString wstring_convert_to_crlf(const LLWString &wstr)
00642 //{
00643 //      const llwchar LF = 10;
00644 //      const llwchar CR = 13;
00645 //      // Remove carriage returns from string with CRLF
00646 //      LLWString out_str;
00647 //
00648 //      for (S32 i = 0; i < (S32)wstr.length(); i++)
00649 //      {
00650 //              if (wstr[i] == LF)
00651 //              {
00652 //                      out_str += CR;
00653 //              }
00654 //              out_str += wstr[i];
00655 //      }
00656 //      return out_str;
00657 //}
00658 
00659 
00660 //S32   wstring_compare_insensitive(const LLWString &lhs, const LLWString &rhs)
00661 //{
00662 //
00663 //      if (lhs == rhs)
00664 //      {
00665 //              return 0;
00666 //      }
00667 //
00668 //      if (lhs.empty())
00669 //      {
00670 //              return rhs.empty() ? 0 : 1;
00671 //      }
00672 //
00673 //      if (rhs.empty())
00674 //      {
00675 //              return -1;
00676 //      }
00677 //
00678 //#ifdef LL_LINUX
00679 //      // doesn't work because gcc 2.95 doesn't correctly implement c_str().  Sigh...
00680 //      llerrs << "wstring_compare_insensitive doesn't work on Linux!" << llendl;
00681 //      return 0;
00682 //#else
00683 //      LLWString lhs_lower = lhs;
00684 //      LLWString::toLower(lhs_lower);
00685 //      std::string lhs_lower = wstring_to_utf8str(lhs_lower);
00686 //      LLWString rhs_lower = lhs;
00687 //      LLWString::toLower(rhs_lower);
00688 //      std::string rhs_lower = wstring_to_utf8str(rhs_lower);
00689 //
00690 //      return strcmp(lhs_lower.c_str(), rhs_lower.c_str());
00691 //#endif
00692 //}
00693 
00694 
00695 std::string utf8str_trim(const std::string& utf8str)
00696 {
00697         LLWString wstr = utf8str_to_wstring(utf8str);
00698         LLWString::trim(wstr);
00699         return wstring_to_utf8str(wstr);
00700 }
00701 
00702 
00703 std::string utf8str_tolower(const std::string& utf8str)
00704 {
00705         LLWString out_str = utf8str_to_wstring(utf8str);
00706         LLWString::toLower(out_str);
00707         return wstring_to_utf8str(out_str);
00708 }
00709 
00710 
00711 S32 utf8str_compare_insensitive(const std::string& lhs, const std::string& rhs)
00712 {
00713         LLWString wlhs = utf8str_to_wstring(lhs);
00714         LLWString wrhs = utf8str_to_wstring(rhs);
00715         return LLWString::compareInsensitive(wlhs.c_str(), wrhs.c_str());
00716 }
00717 
00718 std::string utf8str_truncate(const std::string& utf8str, const S32 max_len)
00719 {
00720         if (0 == max_len)
00721         {
00722                 return std::string();
00723         }
00724         if ((S32)utf8str.length() <= max_len)
00725         {
00726                 return utf8str;
00727         }
00728         else
00729         {
00730                 S32 cur_char = max_len;
00731 
00732                 // If we're ASCII, we don't need to do anything
00733                 if ((U8)utf8str[cur_char] > 0x7f)
00734                 {
00735                         // If first two bits are (10), it's the tail end of a multibyte char.  We need to shift back
00736                         // to the first character
00737                         while (0x80 == (0xc0 & utf8str[cur_char]))
00738                         {
00739                                 cur_char--;
00740                                 // Keep moving forward until we hit the first char;
00741                                 if (cur_char == 0)
00742                                 {
00743                                         // Make sure we don't trash memory if we've got a bogus string.
00744                                         break;
00745                                 }
00746                         }
00747                 }
00748                 // The byte index we're on is one we want to get rid of, so we only want to copy up to (cur_char-1) chars
00749                 return utf8str.substr(0, cur_char);
00750         }
00751 }
00752 
00753 std::string utf8str_substChar(
00754         const std::string& utf8str,
00755         const llwchar target_char,
00756         const llwchar replace_char)
00757 {
00758         LLWString wstr = utf8str_to_wstring(utf8str);
00759         LLWString::replaceChar(wstr, target_char, replace_char);
00760         //wstr = wstring_substChar(wstr, target_char, replace_char);
00761         return wstring_to_utf8str(wstr);
00762 }
00763 
00764 std::string utf8str_makeASCII(const std::string& utf8str)
00765 {
00766         LLWString wstr = utf8str_to_wstring(utf8str);
00767         LLWString::_makeASCII(wstr);
00768         return wstring_to_utf8str(wstr);
00769 }
00770 
00771 std::string mbcsstring_makeASCII(const std::string& wstr)
00772 {
00773         // Replace non-ASCII chars with replace_char
00774         std::string out_str = wstr;
00775         for (S32 i = 0; i < (S32)out_str.length(); i++)
00776         {
00777                 if ((U8)out_str[i] > 0x7f)
00778                 {
00779                         out_str[i] = LL_UNKNOWN_CHAR;
00780                 }
00781         }
00782         return out_str;
00783 }
// Return a copy of the string with every carriage return (byte 13)
// removed.  Despite the name, line feeds are left in place, so CRLF line
// endings become plain LF.
std::string utf8str_removeCRLF(const std::string& utf8str)
{
    const char CR = 13;

    std::string out;
    if (utf8str.empty())
    {
        return out;
    }

    out.reserve(utf8str.length());
    for (std::string::const_iterator it = utf8str.begin(); it != utf8str.end(); ++it)
    {
        if (*it != CR)
        {
            out.push_back(*it);
        }
    }
    return out;
}
00804 
00805 #if LL_WINDOWS
00806 // documentation moved to header. Phoenix 2007-11-27
00807 namespace snprintf_hack
00808 {
00809         int snprintf(char *str, size_t size, const char *format, ...)
00810         {
00811                 va_list args;
00812                 va_start(args, format);
00813 
00814                 int num_written = _vsnprintf(str, size, format, args); /* Flawfinder: ignore */
00815                 va_end(args);
00816                 
00817                 str[size-1] = '\0'; // always null terminate
00818                 return num_written;
00819         }
00820 }
00821 
00822 std::string ll_convert_wide_to_string(const wchar_t* in)
00823 {
00824         std::string out;
00825         if(in)
00826         {
00827                 int len_in = wcslen(in);
00828                 int len_out = WideCharToMultiByte(
00829                         CP_ACP,
00830                         0,
00831                         in,
00832                         len_in,
00833                         NULL,
00834                         0,
00835                         0,
00836                         0);
00837                 // We will need two more bytes for the double NULL ending
00838                 // created in WideCharToMultiByte().
00839                 char* pout = new char [len_out + 2];
00840                 memset(pout, 0, len_out + 2);
00841                 if(pout)
00842                 {
00843                         WideCharToMultiByte(
00844                                 CP_ACP,
00845                                 0,
00846                                 in,
00847                                 len_in,
00848                                 pout,
00849                                 len_out,
00850                                 0,
00851                                 0);
00852                         out.assign(pout);
00853                         delete[] pout;
00854                 }
00855         }
00856         return out;
00857 }
00858 #endif // LL_WINDOWS
00859 
00860 S32     LLStringOps::collate(const llwchar* a, const llwchar* b)
00861 { 
00862         #if LL_WINDOWS
00863                 // in Windows, wide string functions operator on 16-bit strings, 
00864                 // not the proper 32 bit wide string
00865                 return strcmp(wstring_to_utf8str(LLWString(a)).c_str(), wstring_to_utf8str(LLWString(b)).c_str());
00866         #else
00867                 return wcscoll(a, b);
00868         #endif
00869 }
00870 
00871 namespace LLStringFn
00872 {
00873         void replace_nonprintable(std::basic_string<char>& string, char replacement)
00874         {
00875                 const char MIN = 0x20;
00876                 std::basic_string<char>::size_type len = string.size();
00877                 for(std::basic_string<char>::size_type ii = 0; ii < len; ++ii)
00878                 {
00879                         if(string[ii] < MIN)
00880                         {
00881                                 string[ii] = replacement;
00882                         }
00883                 }
00884         }
00885 
00886         void replace_nonprintable(
00887                 std::basic_string<llwchar>& string,
00888                 llwchar replacement)
00889         {
00890                 const llwchar MIN = 0x20;
00891                 const llwchar MAX = 0x7f;
00892                 std::basic_string<llwchar>::size_type len = string.size();
00893                 for(std::basic_string<llwchar>::size_type ii = 0; ii < len; ++ii)
00894                 {
00895                         if((string[ii] < MIN) || (string[ii] > MAX))
00896                         {
00897                                 string[ii] = replacement;
00898                         }
00899                 }
00900         }
00901 
00902         void replace_nonprintable_and_pipe(std::basic_string<char>& str,
00903                                                                            char replacement)
00904         {
00905                 const char MIN  = 0x20;
00906                 const char PIPE = 0x7c;
00907                 std::basic_string<char>::size_type len = str.size();
00908                 for(std::basic_string<char>::size_type ii = 0; ii < len; ++ii)
00909                 {
00910                         if( (str[ii] < MIN) || (str[ii] == PIPE) )
00911                         {
00912                                 str[ii] = replacement;
00913                         }
00914                 }
00915         }
00916 
00917         void replace_nonprintable_and_pipe(std::basic_string<llwchar>& str,
00918                                                                            llwchar replacement)
00919         {
00920                 const llwchar MIN  = 0x20;
00921                 const llwchar MAX  = 0x7f;
00922                 const llwchar PIPE = 0x7c;
00923                 std::basic_string<llwchar>::size_type len = str.size();
00924                 for(std::basic_string<llwchar>::size_type ii = 0; ii < len; ++ii)
00925                 {
00926                         if( (str[ii] < MIN) || (str[ii] > MAX) || (str[ii] == PIPE) )
00927                         {
00928                                 str[ii] = replacement;
00929                         }
00930                 }
00931         }
00932 }
00933 
00934 
00936 // Testing
00937 
00938 #ifdef _DEBUG
00939 
00940 template<class T> 
00941 void LLStringBase<T>::testHarness()
00942 {
00943         LLString s1;
00944         
00945         llassert( s1.c_str() == NULL );
00946         llassert( s1.size() == 0 );
00947         llassert( s1.empty() );
00948         
00949         LLString s2( "hello");
00950         llassert( !strcmp( s2.c_str(), "hello" ) );
00951         llassert( s2.size() == 5 ); 
00952         llassert( !s2.empty() );
00953         LLString s3( s2 );
00954 
00955         llassert( "hello" == s2 );
00956         llassert( s2 == "hello" );
00957         llassert( s2 > "gello" );
00958         llassert( "gello" < s2 );
00959         llassert( "gello" != s2 );
00960         llassert( s2 != "gello" );
00961 
00962         LLString s4 = s2;
00963         llassert( !s4.empty() );
00964         s4.empty();
00965         llassert( s4.empty() );
00966         
00967         LLString s5("");
00968         llassert( s5.empty() );
00969         
00970         llassert( isValidIndex(s5, 0) );
00971         llassert( !isValidIndex(s5, 1) );
00972         
00973         s3 = s2;
00974         s4 = "hello again";
00975         
00976         s4 += "!";
00977         s4 += s4;
00978         llassert( s4 == "hello again!hello again!" );
00979         
00980         
00981         LLString s6 = s2 + " " + s2;
00982         LLString s7 = s6;
00983         llassert( s6 == s7 );
00984         llassert( !( s6 != s7) );
00985         llassert( !(s6 < s7) );
00986         llassert( !(s6 > s7) );
00987         
00988         llassert( !(s6 == "hi"));
00989         llassert( s6 == "hello hello");
00990         llassert( s6 < "hi");
00991         
00992         llassert( s6[1] == 'e' );
00993         s6[1] = 'f';
00994         llassert( s6[1] == 'f' );
00995         
00996         s2.erase( 4, 1 );
00997         llassert( s2 == "hell");
00998         s2.insert( 0, 'y' );
00999         llassert( s2 == "yhell");
01000         s2.erase( 1, 3 );
01001         llassert( s2 == "yl");
01002         s2.insert( 1, "awn, don't yel");
01003         llassert( s2 == "yawn, don't yell");
01004         
01005         LLString s8 = s2.substr( 6, 5 );
01006         llassert( s8 == "don't"  );
01007         
01008         LLString s9 = "   \t\ntest  \t\t\n  ";
01009         trim(s9);
01010         llassert( s9 == "test"  );
01011 
01012         s8 = "abc123&*(ABC";
01013 
01014         s9 = s8;
01015         toUpper(s9);
01016         llassert( s9 == "ABC123&*(ABC"  );
01017 
01018         s9 = s8;
01019         toLower(s9);
01020         llassert( s9 == "abc123&*(abc"  );
01021 
01022 
01023         LLString s10( 10, 'x' );
01024         llassert( s10 == "xxxxxxxxxx" );
01025 
01026         LLString s11( "monkey in the middle", 7, 2 );
01027         llassert( s11 == "in" );
01028 
01029         LLString s12;  //empty
01030         s12 += "foo";
01031         llassert( s12 == "foo" );
01032 
01033         LLString s13;  //empty
01034         s13 += 'f';
01035         llassert( s13 == "f" );
01036 }
01037 
01038 
01039 #endif  // _DEBUG