Ruby 2.0.0p247 (2013-06-27 revision 41674)
string.c
Go to the documentation of this file.
00001 /**********************************************************************
00002 
00003   string.c -
00004 
00005   $Author: nagachika $
00006   created at: Mon Aug  9 17:12:58 JST 1993
00007 
00008   Copyright (C) 1993-2007 Yukihiro Matsumoto
00009   Copyright (C) 2000  Network Applied Communication Laboratory, Inc.
00010   Copyright (C) 2000  Information-technology Promotion Agency, Japan
00011 
00012 **********************************************************************/
00013 
00014 #include "ruby/ruby.h"
00015 #include "ruby/re.h"
00016 #include "ruby/encoding.h"
00017 #include "vm_core.h"
00018 #include "internal.h"
00019 #include "probes.h"
00020 #include <assert.h>
00021 
/* Shorthands for the begin/end entry of capture group `no`.  Both expand to
 * an access through a variable named `regs`, so one must be in scope
 * wherever they are used. */
00022 #define BEG(no) (regs->beg[(no)])
00023 #define END(no) (regs->end[(no)])
00024 
00025 #include <math.h>
00026 #include <ctype.h>
00027 
00028 #ifdef HAVE_UNISTD_H
00029 #include <unistd.h>
00030 #endif
00031 
/* Element count of `array`, converted to int.  Only meaningful for a real
 * array object: for a pointer, sizeof(array) is the pointer size. */
00032 #define numberof(array) (int)(sizeof(array) / sizeof((array)[0]))
00033 
/* Drop any macro definitions of these names (presumably supplied by the
 * headers included above -- confirm) so that the identifiers can be used as
 * ordinary names in this file: several of them are defined as real
 * functions, or re-#defined as aliases, further down. */
00034 #undef rb_str_new_cstr
00035 #undef rb_tainted_str_new_cstr
00036 #undef rb_usascii_str_new_cstr
00037 #undef rb_external_str_new_cstr
00038 #undef rb_locale_str_new_cstr
00039 #undef rb_str_new2
00040 #undef rb_str_new3
00041 #undef rb_str_new4
00042 #undef rb_str_new5
00043 #undef rb_tainted_str_new2
00044 #undef rb_usascii_str_new2
00045 #undef rb_str_dup_frozen
00046 #undef rb_str_buf_new_cstr
00047 #undef rb_str_buf_new2
00048 #undef rb_str_buf_cat2
00049 #undef rb_str_cat2
00050 
/* Forward declaration: rb_str_clear() is called (by rb_free_tmp_buffer)
 * before the point in the file where it is defined. */
00051 static VALUE rb_str_clear(VALUE str);
00052 
/* Class objects.  rb_cString is the class given to strings built by
 * rb_str_new() and rb_str_buf_new().
 * NOTE(review): rb_cSymbol is not referenced in the visible part of the
 * file; presumably it is the Symbol class -- confirm. */
00053 VALUE rb_cString;
00054 VALUE rb_cSymbol;
00055 
/* NOTE(review): presumably an upper bound on the bytes of one character;
 * not referenced in the visible part of the file -- confirm. */
00056 #define RUBY_MAX_CHAR_LEN 16
/* Per-string flag bits.
 * STR_NOEMBED: the string uses the as.heap representation (ptr/len/aux);
 *              when it is clear the string is "embedded" (see STR_EMBED_P).
 * STR_SHARED / STR_ASSOC: as.heap.aux holds a `shared` VALUE instead of a
 *              capacity.
 * NOTE(review): STR_TMPLOCK is not used in the visible part of the file. */
00057 #define STR_TMPLOCK FL_USER7
00058 #define STR_NOEMBED FL_USER1
00059 #define STR_SHARED  FL_USER2 /* = ELTS_SHARED */
00060 #define STR_ASSOC   FL_USER3
/* True when both STR_NOEMBED and the shared (resp. assoc) bit are set. */
00061 #define STR_SHARED_P(s) FL_ALL((s), STR_NOEMBED|ELTS_SHARED)
00062 #define STR_ASSOC_P(s)  FL_ALL((s), STR_NOEMBED|STR_ASSOC)
/* "No capacity": a non-embedded string that is shared or has an assoc, so
 * as.heap.aux.capa must not be read or written for it. */
00063 #define STR_NOCAPA  (STR_NOEMBED|ELTS_SHARED|STR_ASSOC)
00064 #define STR_NOCAPA_P(s) (FL_TEST((s),STR_NOEMBED) && FL_ANY((s),ELTS_SHARED|STR_ASSOC))
/* Clears the shared and assoc bits, but only on a non-embedded string. */
00065 #define STR_UNSET_NOCAPA(s) do {\
00066     if (FL_TEST((s),STR_NOEMBED)) FL_UNSET((s),(ELTS_SHARED|STR_ASSOC));\
00067 } while (0)
00068 
00069 
/* Switches `str` to the heap representation: sets STR_NOEMBED and zeroes the
 * embedded-length bits kept in the flags word.  Does not touch as.heap. */
00070 #define STR_SET_NOEMBED(str) do {\
00071     FL_SET((str), STR_NOEMBED);\
00072     STR_SET_EMBED_LEN((str), 0);\
00073 } while (0)
/* Switches `str` to the embedded representation (clears STR_NOEMBED only). */
00074 #define STR_SET_EMBED(str) FL_UNSET((str), STR_NOEMBED)
/* True when `str` is embedded, i.e. STR_NOEMBED is not set. */
00075 #define STR_EMBED_P(str) (!FL_TEST((str), STR_NOEMBED))
/* Stores `n` as the embedded length inside the flags word.  `n` is evaluated
 * once (into tmp_n); `str` is evaluated twice. */
00076 #define STR_SET_EMBED_LEN(str, n) do { \
00077     long tmp_n = (n);\
00078     RBASIC(str)->flags &= ~RSTRING_EMBED_LEN_MASK;\
00079     RBASIC(str)->flags |= (tmp_n) << RSTRING_EMBED_LEN_SHIFT;\
00080 } while (0)
00081 
/* Records `n` as the length of `str` in whichever place the current
 * representation uses: the flags word when embedded, as.heap.len otherwise.
 * No bytes are moved and no terminator is written. */
00082 #define STR_SET_LEN(str, n) do { \
00083     if (STR_EMBED_P(str)) {\
00084         STR_SET_EMBED_LEN((str), (n));\
00085     }\
00086     else {\
00087         RSTRING(str)->as.heap.len = (n);\
00088     }\
00089 } while (0)
00090 
/* Decrements the recorded length of `str` by one, for either representation.
 * There is no check against a length of zero. */
00091 #define STR_DEC_LEN(str) do {\
00092     if (STR_EMBED_P(str)) {\
00093         long n = RSTRING_LEN(str);\
00094         n--;\
00095         STR_SET_EMBED_LEN((str), n);\
00096     }\
00097     else {\
00098         RSTRING(str)->as.heap.len--;\
00099     }\
00100 } while (0)
00101 
/* Makes room for `capacity` bytes plus one extra byte in `str`.
 *  - Embedded string: nothing happens unless `capacity` exceeds
 *    RSTRING_EMBED_LEN_MAX; in that case the current bytes are copied into a
 *    new heap buffer.  The length is read while the string is still flagged
 *    as embedded, and only then is it switched over with STR_SET_NOEMBED.
 *  - Heap string: the buffer is reallocated; aux.capa is updated only when
 *    aux actually holds a capacity (!STR_NOCAPA_P).
 * Both arguments are evaluated several times, so they must be free of side
 * effects. */
00102 #define RESIZE_CAPA(str,capacity) do {\
00103     if (STR_EMBED_P(str)) {\
00104         if ((capacity) > RSTRING_EMBED_LEN_MAX) {\
00105             char *tmp = ALLOC_N(char, (capacity)+1);\
00106             memcpy(tmp, RSTRING_PTR(str), RSTRING_LEN(str));\
00107             RSTRING(str)->as.heap.ptr = tmp;\
00108             RSTRING(str)->as.heap.len = RSTRING_LEN(str);\
00109             STR_SET_NOEMBED(str);\
00110             RSTRING(str)->as.heap.aux.capa = (capacity);\
00111         }\
00112     }\
00113     else {\
00114         REALLOC_N(RSTRING(str)->as.heap.ptr, char, (capacity)+1);\
00115         if (!STR_NOCAPA_P(str))\
00116             RSTRING(str)->as.heap.aux.capa = (capacity);\
00117     }\
00118 } while (0)
00119 
/* Code-range predicates.  Both go through rb_enc_str_coderange(), which
 * scans the string and caches the result when the range is still unknown. */
00120 #define is_ascii_string(str) (rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT)
00121 #define is_broken_string(str) (rb_enc_str_coderange(str) == ENC_CODERANGE_BROKEN)
00122 
/* The rb_encoding* of `str`, looked up from the encoding index that
 * ENCODING_GET() reads from the object. */
00123 #define STR_ENC_GET(str) rb_enc_from_index(ENCODING_GET(str))
00124 
00125 static inline int
00126 single_byte_optimizable(VALUE str)
00127 {
00128     rb_encoding *enc;
00129 
00130     /* Conservative.  It may be ENC_CODERANGE_UNKNOWN. */
00131     if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT)
00132         return 1;
00133 
00134     enc = STR_ENC_GET(str);
00135     if (rb_enc_mbmaxlen(enc) == 1)
00136         return 1;
00137 
00138     /* Conservative.  Possibly single byte.
00139      * "\xa1" in Shift_JIS for example. */
00140     return 0;
00141 }
00142 
/* NOTE(review): not referenced in the visible part of the file; presumably
 * the default field separator used when splitting strings -- confirm. */
00143 VALUE rb_fs;
00144 
/*
 * Returns a pointer to the first byte in [p, e) for which ISASCII() is
 * false, or NULL when every byte passes.
 *
 * When a VALUE is 4 or 8 bytes wide, ranges longer than two VALUEs are
 * scanned a whole VALUE at a time: bytes are checked individually up to the
 * first VALUE-aligned address, aligned words are then tested against
 * NONASCII_MASK, and the byte loop at the bottom handles the rest (including
 * the word in which a high bit was detected).
 *
 * NONASCII_MASK stays defined after this function; code further down the
 * file tests for it.
 */
static inline const char *
search_nonascii(const char *p, const char *e)
{
#if SIZEOF_VALUE == 8
# define NONASCII_MASK 0x8080808080808080ULL
#elif SIZEOF_VALUE == 4
# define NONASCII_MASK 0x80808080UL
#endif
#ifdef NONASCII_MASK
    if ((int)sizeof(VALUE) * 2 < e - p) {
        const VALUE align_mask = sizeof(VALUE) - 1;
        /* p rounded up, and e rounded down, to a VALUE boundary */
        const VALUE *word = (const VALUE*)(~align_mask & ((VALUE)p + align_mask));
        const VALUE *word_end = (const VALUE*)(~align_mask & (VALUE)e);

        for (; p < (const char *)word; p++) {
            if (!ISASCII(*p))
                return p;
        }
        /* stop at the first word that has any high bit set */
        while (word < word_end && !(*word & NONASCII_MASK)) {
            word++;
        }
        p = (const char *)word;
    }
#endif
    for (; p < e; p++) {
        if (!ISASCII(*p))
            return p;
    }
    return NULL;
}
00181 
00182 static int
00183 coderange_scan(const char *p, long len, rb_encoding *enc)
00184 {
00185     const char *e = p + len;
00186 
00187     if (rb_enc_to_index(enc) == 0) {
00188         /* enc is ASCII-8BIT.  ASCII-8BIT string never be broken. */
00189         p = search_nonascii(p, e);
00190         return p ? ENC_CODERANGE_VALID : ENC_CODERANGE_7BIT;
00191     }
00192 
00193     if (rb_enc_asciicompat(enc)) {
00194         p = search_nonascii(p, e);
00195         if (!p) {
00196             return ENC_CODERANGE_7BIT;
00197         }
00198         while (p < e) {
00199             int ret = rb_enc_precise_mbclen(p, e, enc);
00200             if (!MBCLEN_CHARFOUND_P(ret)) {
00201                 return ENC_CODERANGE_BROKEN;
00202             }
00203             p += MBCLEN_CHARFOUND_LEN(ret);
00204             if (p < e) {
00205                 p = search_nonascii(p, e);
00206                 if (!p) {
00207                     return ENC_CODERANGE_VALID;
00208                 }
00209             }
00210         }
00211         if (e < p) {
00212             return ENC_CODERANGE_BROKEN;
00213         }
00214         return ENC_CODERANGE_VALID;
00215     }
00216 
00217     while (p < e) {
00218         int ret = rb_enc_precise_mbclen(p, e, enc);
00219 
00220         if (!MBCLEN_CHARFOUND_P(ret)) {
00221             return ENC_CODERANGE_BROKEN;
00222         }
00223         p += MBCLEN_CHARFOUND_LEN(ret);
00224     }
00225     if (e < p) {
00226         return ENC_CODERANGE_BROKEN;
00227     }
00228     return ENC_CODERANGE_VALID;
00229 }
00230 
00231 long
00232 rb_str_coderange_scan_restartable(const char *s, const char *e, rb_encoding *enc, int *cr)
00233 {
00234     const char *p = s;
00235 
00236     if (*cr == ENC_CODERANGE_BROKEN)
00237         return e - s;
00238 
00239     if (rb_enc_to_index(enc) == 0) {
00240         /* enc is ASCII-8BIT.  ASCII-8BIT string never be broken. */
00241         p = search_nonascii(p, e);
00242         *cr = (!p && *cr != ENC_CODERANGE_VALID) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
00243         return e - s;
00244     }
00245     else if (rb_enc_asciicompat(enc)) {
00246         p = search_nonascii(p, e);
00247         if (!p) {
00248             if (*cr != ENC_CODERANGE_VALID) *cr = ENC_CODERANGE_7BIT;
00249             return e - s;
00250         }
00251         while (p < e) {
00252             int ret = rb_enc_precise_mbclen(p, e, enc);
00253             if (!MBCLEN_CHARFOUND_P(ret)) {
00254                 *cr = MBCLEN_INVALID_P(ret) ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_UNKNOWN;
00255                 return p - s;
00256             }
00257             p += MBCLEN_CHARFOUND_LEN(ret);
00258             if (p < e) {
00259                 p = search_nonascii(p, e);
00260                 if (!p) {
00261                     *cr = ENC_CODERANGE_VALID;
00262                     return e - s;
00263                 }
00264             }
00265         }
00266         *cr = e < p ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_VALID;
00267         return p - s;
00268     }
00269     else {
00270         while (p < e) {
00271             int ret = rb_enc_precise_mbclen(p, e, enc);
00272             if (!MBCLEN_CHARFOUND_P(ret)) {
00273                 *cr = MBCLEN_INVALID_P(ret) ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_UNKNOWN;
00274                 return p - s;
00275             }
00276             p += MBCLEN_CHARFOUND_LEN(ret);
00277         }
00278         *cr = e < p ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_VALID;
00279         return p - s;
00280     }
00281 }
00282 
00283 static inline void
00284 str_enc_copy(VALUE str1, VALUE str2)
00285 {
00286     rb_enc_set_index(str1, ENCODING_GET(str2));
00287 }
00288 
00289 static void
00290 rb_enc_cr_str_copy_for_substr(VALUE dest, VALUE src)
00291 {
00292     /* this function is designed for copying encoding and coderange
00293      * from src to new string "dest" which is made from the part of src.
00294      */
00295     str_enc_copy(dest, src);
00296     if (RSTRING_LEN(dest) == 0) {
00297         if (!rb_enc_asciicompat(STR_ENC_GET(src)))
00298             ENC_CODERANGE_SET(dest, ENC_CODERANGE_VALID);
00299         else
00300             ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT);
00301         return;
00302     }
00303     switch (ENC_CODERANGE(src)) {
00304       case ENC_CODERANGE_7BIT:
00305         ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT);
00306         break;
00307       case ENC_CODERANGE_VALID:
00308         if (!rb_enc_asciicompat(STR_ENC_GET(src)) ||
00309             search_nonascii(RSTRING_PTR(dest), RSTRING_END(dest)))
00310             ENC_CODERANGE_SET(dest, ENC_CODERANGE_VALID);
00311         else
00312             ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT);
00313         break;
00314       default:
00315         break;
00316     }
00317 }
00318 
00319 static void
00320 rb_enc_cr_str_exact_copy(VALUE dest, VALUE src)
00321 {
00322     str_enc_copy(dest, src);
00323     ENC_CODERANGE_SET(dest, ENC_CODERANGE(src));
00324 }
00325 
00326 int
00327 rb_enc_str_coderange(VALUE str)
00328 {
00329     int cr = ENC_CODERANGE(str);
00330 
00331     if (cr == ENC_CODERANGE_UNKNOWN) {
00332         rb_encoding *enc = STR_ENC_GET(str);
00333         cr = coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str), enc);
00334         ENC_CODERANGE_SET(str, cr);
00335     }
00336     return cr;
00337 }
00338 
00339 int
00340 rb_enc_str_asciionly_p(VALUE str)
00341 {
00342     rb_encoding *enc = STR_ENC_GET(str);
00343 
00344     if (!rb_enc_asciicompat(enc))
00345         return FALSE;
00346     else if (rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT)
00347         return TRUE;
00348     return FALSE;
00349 }
00350 
00351 static inline void
00352 str_mod_check(VALUE s, const char *p, long len)
00353 {
00354     if (RSTRING_PTR(s) != p || RSTRING_LEN(s) != len){
00355         rb_raise(rb_eRuntimeError, "string modified");
00356     }
00357 }
00358 
00359 size_t
00360 rb_str_capacity(VALUE str)
00361 {
00362     if (STR_EMBED_P(str)) {
00363         return RSTRING_EMBED_LEN_MAX;
00364     }
00365     else if (STR_NOCAPA_P(str)) {
00366         return RSTRING(str)->as.heap.len;
00367     }
00368     else {
00369         return RSTRING(str)->as.heap.aux.capa;
00370     }
00371 }
00372 
00373 static inline VALUE
00374 str_alloc(VALUE klass)
00375 {
00376     NEWOBJ_OF(str, struct RString, klass, T_STRING);
00377 
00378     str->as.heap.ptr = 0;
00379     str->as.heap.len = 0;
00380     str->as.heap.aux.capa = 0;
00381 
00382     return (VALUE)str;
00383 }
00384 
00385 static inline VALUE
00386 empty_str_alloc(VALUE klass)
00387 {
00388     if (RUBY_DTRACE_STRING_CREATE_ENABLED()) {
00389         RUBY_DTRACE_STRING_CREATE(0, rb_sourcefile(), rb_sourceline());
00390     }
00391     return str_alloc(klass);
00392 }
00393 
00394 static VALUE
00395 str_new(VALUE klass, const char *ptr, long len)
00396 {
00397     VALUE str;
00398 
00399     if (len < 0) {
00400         rb_raise(rb_eArgError, "negative string size (or size too big)");
00401     }
00402 
00403     if (RUBY_DTRACE_STRING_CREATE_ENABLED()) {
00404         RUBY_DTRACE_STRING_CREATE(len, rb_sourcefile(), rb_sourceline());
00405     }
00406 
00407     str = str_alloc(klass);
00408     if (len > RSTRING_EMBED_LEN_MAX) {
00409         RSTRING(str)->as.heap.aux.capa = len;
00410         RSTRING(str)->as.heap.ptr = ALLOC_N(char,len+1);
00411         STR_SET_NOEMBED(str);
00412     }
00413     else if (len == 0) {
00414         ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT);
00415     }
00416     if (ptr) {
00417         memcpy(RSTRING_PTR(str), ptr, len);
00418     }
00419     STR_SET_LEN(str, len);
00420     RSTRING_PTR(str)[len] = '\0';
00421     return str;
00422 }
00423 
00424 VALUE
00425 rb_str_new(const char *ptr, long len)
00426 {
00427     return str_new(rb_cString, ptr, len);
00428 }
00429 
00430 VALUE
00431 rb_usascii_str_new(const char *ptr, long len)
00432 {
00433     VALUE str = rb_str_new(ptr, len);
00434     ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
00435     return str;
00436 }
00437 
00438 VALUE
00439 rb_enc_str_new(const char *ptr, long len, rb_encoding *enc)
00440 {
00441     VALUE str = rb_str_new(ptr, len);
00442     rb_enc_associate(str, enc);
00443     return str;
00444 }
00445 
00446 VALUE
00447 rb_str_new_cstr(const char *ptr)
00448 {
00449     if (!ptr) {
00450         rb_raise(rb_eArgError, "NULL pointer given");
00451     }
00452     return rb_str_new(ptr, strlen(ptr));
00453 }
00454 
00455 RUBY_ALIAS_FUNCTION(rb_str_new2(const char *ptr), rb_str_new_cstr, (ptr))
00456 #define rb_str_new2 rb_str_new_cstr
00457 
00458 VALUE
00459 rb_usascii_str_new_cstr(const char *ptr)
00460 {
00461     VALUE str = rb_str_new2(ptr);
00462     ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
00463     return str;
00464 }
00465 
00466 RUBY_ALIAS_FUNCTION(rb_usascii_str_new2(const char *ptr), rb_usascii_str_new_cstr, (ptr))
00467 #define rb_usascii_str_new2 rb_usascii_str_new_cstr
00468 
00469 VALUE
00470 rb_tainted_str_new(const char *ptr, long len)
00471 {
00472     VALUE str = rb_str_new(ptr, len);
00473 
00474     OBJ_TAINT(str);
00475     return str;
00476 }
00477 
00478 VALUE
00479 rb_tainted_str_new_cstr(const char *ptr)
00480 {
00481     VALUE str = rb_str_new2(ptr);
00482 
00483     OBJ_TAINT(str);
00484     return str;
00485 }
00486 
00487 RUBY_ALIAS_FUNCTION(rb_tainted_str_new2(const char *ptr), rb_tainted_str_new_cstr, (ptr))
00488 #define rb_tainted_str_new2 rb_tainted_str_new_cstr
00489 
00490 VALUE
00491 rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
00492 {
00493     extern VALUE rb_cEncodingConverter;
00494     rb_econv_t *ec;
00495     rb_econv_result_t ret;
00496     long len, olen;
00497     VALUE econv_wrapper;
00498     VALUE newstr;
00499     const unsigned char *start, *sp;
00500     unsigned char *dest, *dp;
00501     size_t converted_output = 0;
00502 
00503     if (!to) return str;
00504     if (!from) from = rb_enc_get(str);
00505     if (from == to) return str;
00506     if ((rb_enc_asciicompat(to) && ENC_CODERANGE(str) == ENC_CODERANGE_7BIT) ||
00507         to == rb_ascii8bit_encoding()) {
00508         if (STR_ENC_GET(str) != to) {
00509             str = rb_str_dup(str);
00510             rb_enc_associate(str, to);
00511         }
00512         return str;
00513     }
00514 
00515     len = RSTRING_LEN(str);
00516     newstr = rb_str_new(0, len);
00517     olen = len;
00518 
00519     econv_wrapper = rb_obj_alloc(rb_cEncodingConverter);
00520     RBASIC(econv_wrapper)->klass = 0;
00521     ec = rb_econv_open_opts(from->name, to->name, ecflags, ecopts);
00522     if (!ec) return str;
00523     DATA_PTR(econv_wrapper) = ec;
00524 
00525     sp = (unsigned char*)RSTRING_PTR(str);
00526     start = sp;
00527     while ((dest = (unsigned char*)RSTRING_PTR(newstr)),
00528            (dp = dest + converted_output),
00529            (ret = rb_econv_convert(ec, &sp, start + len, &dp, dest + olen, 0)),
00530            ret == econv_destination_buffer_full) {
00531         /* destination buffer short */
00532         size_t converted_input = sp - start;
00533         size_t rest = len - converted_input;
00534         converted_output = dp - dest;
00535         rb_str_set_len(newstr, converted_output);
00536         if (converted_input && converted_output &&
00537             rest < (LONG_MAX / converted_output)) {
00538             rest = (rest * converted_output) / converted_input;
00539         }
00540         else {
00541             rest = olen;
00542         }
00543         olen += rest < 2 ? 2 : rest;
00544         rb_str_resize(newstr, olen);
00545     }
00546     DATA_PTR(econv_wrapper) = 0;
00547     rb_econv_close(ec);
00548     rb_gc_force_recycle(econv_wrapper);
00549     switch (ret) {
00550       case econv_finished:
00551         len = dp - (unsigned char*)RSTRING_PTR(newstr);
00552         rb_str_set_len(newstr, len);
00553         rb_enc_associate(newstr, to);
00554         return newstr;
00555 
00556       default:
00557         /* some error, return original */
00558         return str;
00559     }
00560 }
00561 
00562 VALUE
00563 rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to)
00564 {
00565     return rb_str_conv_enc_opts(str, from, to, 0, Qnil);
00566 }
00567 
00568 VALUE
00569 rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *eenc)
00570 {
00571     VALUE str;
00572 
00573     str = rb_tainted_str_new(ptr, len);
00574     if (eenc == rb_usascii_encoding() &&
00575         rb_enc_str_coderange(str) != ENC_CODERANGE_7BIT) {
00576         rb_enc_associate(str, rb_ascii8bit_encoding());
00577         return str;
00578     }
00579     rb_enc_associate(str, eenc);
00580     return rb_str_conv_enc(str, eenc, rb_default_internal_encoding());
00581 }
00582 
00583 VALUE
00584 rb_external_str_new(const char *ptr, long len)
00585 {
00586     return rb_external_str_new_with_enc(ptr, len, rb_default_external_encoding());
00587 }
00588 
00589 VALUE
00590 rb_external_str_new_cstr(const char *ptr)
00591 {
00592     return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_default_external_encoding());
00593 }
00594 
00595 VALUE
00596 rb_locale_str_new(const char *ptr, long len)
00597 {
00598     return rb_external_str_new_with_enc(ptr, len, rb_locale_encoding());
00599 }
00600 
00601 VALUE
00602 rb_locale_str_new_cstr(const char *ptr)
00603 {
00604     return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_locale_encoding());
00605 }
00606 
00607 VALUE
00608 rb_filesystem_str_new(const char *ptr, long len)
00609 {
00610     return rb_external_str_new_with_enc(ptr, len, rb_filesystem_encoding());
00611 }
00612 
00613 VALUE
00614 rb_filesystem_str_new_cstr(const char *ptr)
00615 {
00616     return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_filesystem_encoding());
00617 }
00618 
00619 VALUE
00620 rb_str_export(VALUE str)
00621 {
00622     return rb_str_conv_enc(str, STR_ENC_GET(str), rb_default_external_encoding());
00623 }
00624 
00625 VALUE
00626 rb_str_export_locale(VALUE str)
00627 {
00628     return rb_str_conv_enc(str, STR_ENC_GET(str), rb_locale_encoding());
00629 }
00630 
00631 VALUE
00632 rb_str_export_to_enc(VALUE str, rb_encoding *enc)
00633 {
00634     return rb_str_conv_enc(str, STR_ENC_GET(str), enc);
00635 }
00636 
00637 static VALUE
00638 str_replace_shared_without_enc(VALUE str2, VALUE str)
00639 {
00640     if (RSTRING_LEN(str) <= RSTRING_EMBED_LEN_MAX) {
00641         STR_SET_EMBED(str2);
00642         memcpy(RSTRING_PTR(str2), RSTRING_PTR(str), RSTRING_LEN(str)+1);
00643         STR_SET_EMBED_LEN(str2, RSTRING_LEN(str));
00644     }
00645     else {
00646         str = rb_str_new_frozen(str);
00647         FL_SET(str2, STR_NOEMBED);
00648         RSTRING(str2)->as.heap.len = RSTRING_LEN(str);
00649         RSTRING(str2)->as.heap.ptr = RSTRING_PTR(str);
00650         RSTRING(str2)->as.heap.aux.shared = str;
00651         FL_SET(str2, ELTS_SHARED);
00652     }
00653     return str2;
00654 }
00655 
00656 static VALUE
00657 str_replace_shared(VALUE str2, VALUE str)
00658 {
00659     str_replace_shared_without_enc(str2, str);
00660     rb_enc_cr_str_exact_copy(str2, str);
00661     return str2;
00662 }
00663 
00664 static VALUE
00665 str_new_shared(VALUE klass, VALUE str)
00666 {
00667     return str_replace_shared(str_alloc(klass), str);
00668 }
00669 
00670 static VALUE
00671 str_new3(VALUE klass, VALUE str)
00672 {
00673     return str_new_shared(klass, str);
00674 }
00675 
00676 VALUE
00677 rb_str_new_shared(VALUE str)
00678 {
00679     VALUE str2 = str_new3(rb_obj_class(str), str);
00680 
00681     OBJ_INFECT(str2, str);
00682     return str2;
00683 }
00684 
00685 RUBY_ALIAS_FUNCTION(rb_str_new3(VALUE str), rb_str_new_shared, (str))
00686 #define rb_str_new3 rb_str_new_shared
00687 
00688 static VALUE
00689 str_new4(VALUE klass, VALUE str)
00690 {
00691     VALUE str2;
00692 
00693     str2 = str_alloc(klass);
00694     STR_SET_NOEMBED(str2);
00695     RSTRING(str2)->as.heap.len = RSTRING_LEN(str);
00696     RSTRING(str2)->as.heap.ptr = RSTRING_PTR(str);
00697     if (STR_SHARED_P(str)) {
00698         VALUE shared = RSTRING(str)->as.heap.aux.shared;
00699         assert(OBJ_FROZEN(shared));
00700         FL_SET(str2, ELTS_SHARED);
00701         RSTRING(str2)->as.heap.aux.shared = shared;
00702     }
00703     else {
00704         FL_SET(str, ELTS_SHARED);
00705         RSTRING(str)->as.heap.aux.shared = str2;
00706     }
00707     rb_enc_cr_str_exact_copy(str2, str);
00708     OBJ_INFECT(str2, str);
00709     return str2;
00710 }
00711 
00712 VALUE
00713 rb_str_new_frozen(VALUE orig)
00714 {
00715     VALUE klass, str;
00716 
00717     if (OBJ_FROZEN(orig)) return orig;
00718     klass = rb_obj_class(orig);
00719     if (STR_SHARED_P(orig) && (str = RSTRING(orig)->as.heap.aux.shared)) {
00720         long ofs;
00721         assert(OBJ_FROZEN(str));
00722         ofs = RSTRING_LEN(str) - RSTRING_LEN(orig);
00723         if ((ofs > 0) || (klass != RBASIC(str)->klass) ||
00724             ((RBASIC(str)->flags ^ RBASIC(orig)->flags) & (FL_TAINT|FL_UNTRUSTED)) ||
00725             ENCODING_GET(str) != ENCODING_GET(orig)) {
00726             str = str_new3(klass, str);
00727             RSTRING(str)->as.heap.ptr += ofs;
00728             RSTRING(str)->as.heap.len -= ofs;
00729             rb_enc_cr_str_exact_copy(str, orig);
00730             OBJ_INFECT(str, orig);
00731         }
00732     }
00733     else if (STR_EMBED_P(orig)) {
00734         str = str_new(klass, RSTRING_PTR(orig), RSTRING_LEN(orig));
00735         rb_enc_cr_str_exact_copy(str, orig);
00736         OBJ_INFECT(str, orig);
00737     }
00738     else if (STR_ASSOC_P(orig)) {
00739         VALUE assoc = RSTRING(orig)->as.heap.aux.shared;
00740         FL_UNSET(orig, STR_ASSOC);
00741         str = str_new4(klass, orig);
00742         FL_SET(str, STR_ASSOC);
00743         RSTRING(str)->as.heap.aux.shared = assoc;
00744     }
00745     else {
00746         str = str_new4(klass, orig);
00747     }
00748     OBJ_FREEZE(str);
00749     return str;
00750 }
00751 
00752 RUBY_ALIAS_FUNCTION(rb_str_new4(VALUE orig), rb_str_new_frozen, (orig))
00753 #define rb_str_new4 rb_str_new_frozen
00754 
00755 VALUE
00756 rb_str_new_with_class(VALUE obj, const char *ptr, long len)
00757 {
00758     return str_new(rb_obj_class(obj), ptr, len);
00759 }
00760 
00761 RUBY_ALIAS_FUNCTION(rb_str_new5(VALUE obj, const char *ptr, long len),
00762            rb_str_new_with_class, (obj, ptr, len))
00763 #define rb_str_new5 rb_str_new_with_class
00764 
00765 static VALUE
00766 str_new_empty(VALUE str)
00767 {
00768     VALUE v = rb_str_new5(str, 0, 0);
00769     rb_enc_copy(v, str);
00770     OBJ_INFECT(v, str);
00771     return v;
00772 }
00773 
00774 #define STR_BUF_MIN_SIZE 128
00775 
00776 VALUE
00777 rb_str_buf_new(long capa)
00778 {
00779     VALUE str = str_alloc(rb_cString);
00780 
00781     if (capa < STR_BUF_MIN_SIZE) {
00782         capa = STR_BUF_MIN_SIZE;
00783     }
00784     FL_SET(str, STR_NOEMBED);
00785     RSTRING(str)->as.heap.aux.capa = capa;
00786     RSTRING(str)->as.heap.ptr = ALLOC_N(char, capa+1);
00787     RSTRING(str)->as.heap.ptr[0] = '\0';
00788 
00789     return str;
00790 }
00791 
00792 VALUE
00793 rb_str_buf_new_cstr(const char *ptr)
00794 {
00795     VALUE str;
00796     long len = strlen(ptr);
00797 
00798     str = rb_str_buf_new(len);
00799     rb_str_buf_cat(str, ptr, len);
00800 
00801     return str;
00802 }
00803 
00804 RUBY_ALIAS_FUNCTION(rb_str_buf_new2(const char *ptr), rb_str_buf_new_cstr, (ptr))
00805 #define rb_str_buf_new2 rb_str_buf_new_cstr
00806 
00807 VALUE
00808 rb_str_tmp_new(long len)
00809 {
00810     return str_new(0, 0, len);
00811 }
00812 
00813 void *
00814 rb_alloc_tmp_buffer(volatile VALUE *store, long len)
00815 {
00816     VALUE s = rb_str_tmp_new(len);
00817     *store = s;
00818     return RSTRING_PTR(s);
00819 }
00820 
00821 void
00822 rb_free_tmp_buffer(volatile VALUE *store)
00823 {
00824     VALUE s = *store;
00825     *store = 0;
00826     if (s) rb_str_clear(s);
00827 }
00828 
00829 void
00830 rb_str_free(VALUE str)
00831 {
00832     if (!STR_EMBED_P(str) && !STR_SHARED_P(str)) {
00833         xfree(RSTRING(str)->as.heap.ptr);
00834     }
00835 }
00836 
00837 RUBY_FUNC_EXPORTED size_t
00838 rb_str_memsize(VALUE str)
00839 {
00840     if (!STR_EMBED_P(str) && !STR_SHARED_P(str)) {
00841         return RSTRING(str)->as.heap.aux.capa;
00842     }
00843     else {
00844         return 0;
00845     }
00846 }
00847 
00848 VALUE
00849 rb_str_to_str(VALUE str)
00850 {
00851     return rb_convert_type(str, T_STRING, "String", "to_str");
00852 }
00853 
00854 static inline void str_discard(VALUE str);
00855 
00856 void
00857 rb_str_shared_replace(VALUE str, VALUE str2)
00858 {
00859     rb_encoding *enc;
00860     int cr;
00861     if (str == str2) return;
00862     enc = STR_ENC_GET(str2);
00863     cr = ENC_CODERANGE(str2);
00864     str_discard(str);
00865     OBJ_INFECT(str, str2);
00866     if (RSTRING_LEN(str2) <= RSTRING_EMBED_LEN_MAX) {
00867         STR_SET_EMBED(str);
00868         memcpy(RSTRING_PTR(str), RSTRING_PTR(str2), RSTRING_LEN(str2)+1);
00869         STR_SET_EMBED_LEN(str, RSTRING_LEN(str2));
00870         rb_enc_associate(str, enc);
00871         ENC_CODERANGE_SET(str, cr);
00872         return;
00873     }
00874     STR_SET_NOEMBED(str);
00875     STR_UNSET_NOCAPA(str);
00876     RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
00877     RSTRING(str)->as.heap.len = RSTRING_LEN(str2);
00878     if (STR_NOCAPA_P(str2)) {
00879         FL_SET(str, RBASIC(str2)->flags & STR_NOCAPA);
00880         RSTRING(str)->as.heap.aux.shared = RSTRING(str2)->as.heap.aux.shared;
00881     }
00882     else {
00883         RSTRING(str)->as.heap.aux.capa = RSTRING(str2)->as.heap.aux.capa;
00884     }
00885     STR_SET_EMBED(str2);        /* abandon str2 */
00886     RSTRING_PTR(str2)[0] = 0;
00887     STR_SET_EMBED_LEN(str2, 0);
00888     rb_enc_associate(str, enc);
00889     ENC_CODERANGE_SET(str, cr);
00890 }
00891 
00892 static ID id_to_s;
00893 
00894 VALUE
00895 rb_obj_as_string(VALUE obj)
00896 {
00897     VALUE str;
00898 
00899     if (RB_TYPE_P(obj, T_STRING)) {
00900         return obj;
00901     }
00902     str = rb_funcall(obj, id_to_s, 0);
00903     if (!RB_TYPE_P(str, T_STRING))
00904         return rb_any_to_s(obj);
00905     if (OBJ_TAINTED(obj)) OBJ_TAINT(str);
00906     return str;
00907 }
00908 
00909 static VALUE
00910 str_replace(VALUE str, VALUE str2)
00911 {
00912     long len;
00913 
00914     len = RSTRING_LEN(str2);
00915     if (STR_ASSOC_P(str2)) {
00916         str2 = rb_str_new4(str2);
00917     }
00918     if (STR_SHARED_P(str2)) {
00919         VALUE shared = RSTRING(str2)->as.heap.aux.shared;
00920         assert(OBJ_FROZEN(shared));
00921         STR_SET_NOEMBED(str);
00922         RSTRING(str)->as.heap.len = len;
00923         RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
00924         FL_SET(str, ELTS_SHARED);
00925         FL_UNSET(str, STR_ASSOC);
00926         RSTRING(str)->as.heap.aux.shared = shared;
00927     }
00928     else {
00929         str_replace_shared(str, str2);
00930     }
00931 
00932     OBJ_INFECT(str, str2);
00933     rb_enc_cr_str_exact_copy(str, str2);
00934     return str;
00935 }
00936 
00937 static VALUE
00938 str_duplicate(VALUE klass, VALUE str)
00939 {
00940     VALUE dup = str_alloc(klass);
00941     str_replace(dup, str);
00942     return dup;
00943 }
00944 
00945 VALUE
00946 rb_str_dup(VALUE str)
00947 {
00948     return str_duplicate(rb_obj_class(str), str);
00949 }
00950 
00951 VALUE
00952 rb_str_resurrect(VALUE str)
00953 {
00954     if (RUBY_DTRACE_STRING_CREATE_ENABLED()) {
00955         RUBY_DTRACE_STRING_CREATE(RSTRING_LEN(str),
00956                                   rb_sourcefile(), rb_sourceline());
00957     }
00958     return str_replace(str_alloc(rb_cString), str);
00959 }
00960 
00961 /*
00962  *  call-seq:
00963  *     String.new(str="")   -> new_str
00964  *
00965  *  Returns a new string object containing a copy of <i>str</i>.
00966  */
00967 
00968 static VALUE
00969 rb_str_init(int argc, VALUE *argv, VALUE str)
00970 {
00971     VALUE orig;
00972 
00973     if (argc > 0 && rb_scan_args(argc, argv, "01", &orig) == 1)
00974         rb_str_replace(str, orig);
00975     return str;
00976 }
00977 
00978 static inline long
00979 enc_strlen(const char *p, const char *e, rb_encoding *enc, int cr)
00980 {
00981     long c;
00982     const char *q;
00983 
00984     if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
00985         return (e - p + rb_enc_mbminlen(enc) - 1) / rb_enc_mbminlen(enc);
00986     }
00987     else if (rb_enc_asciicompat(enc)) {
00988         c = 0;
00989         if (cr == ENC_CODERANGE_7BIT || cr == ENC_CODERANGE_VALID) {
00990             while (p < e) {
00991                 if (ISASCII(*p)) {
00992                     q = search_nonascii(p, e);
00993                     if (!q)
00994                         return c + (e - p);
00995                     c += q - p;
00996                     p = q;
00997                 }
00998                 p += rb_enc_fast_mbclen(p, e, enc);
00999                 c++;
01000             }
01001         }
01002         else {
01003             while (p < e) {
01004                 if (ISASCII(*p)) {
01005                     q = search_nonascii(p, e);
01006                     if (!q)
01007                         return c + (e - p);
01008                     c += q - p;
01009                     p = q;
01010                 }
01011                 p += rb_enc_mbclen(p, e, enc);
01012                 c++;
01013             }
01014         }
01015         return c;
01016     }
01017 
01018     for (c=0; p<e; c++) {
01019         p += rb_enc_mbclen(p, e, enc);
01020     }
01021     return c;
01022 }
01023 
01024 long
01025 rb_enc_strlen(const char *p, const char *e, rb_encoding *enc)
01026 {
01027     return enc_strlen(p, e, enc, ENC_CODERANGE_UNKNOWN);
01028 }
01029 
01030 long
01031 rb_enc_strlen_cr(const char *p, const char *e, rb_encoding *enc, int *cr)
01032 {
01033     long c;
01034     const char *q;
01035     int ret;
01036 
01037     *cr = 0;
01038     if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
01039         return (e - p + rb_enc_mbminlen(enc) - 1) / rb_enc_mbminlen(enc);
01040     }
01041     else if (rb_enc_asciicompat(enc)) {
01042         c = 0;
01043         while (p < e) {
01044             if (ISASCII(*p)) {
01045                 q = search_nonascii(p, e);
01046                 if (!q) {
01047                     if (!*cr) *cr = ENC_CODERANGE_7BIT;
01048                     return c + (e - p);
01049                 }
01050                 c += q - p;
01051                 p = q;
01052             }
01053             ret = rb_enc_precise_mbclen(p, e, enc);
01054             if (MBCLEN_CHARFOUND_P(ret)) {
01055                 *cr |= ENC_CODERANGE_VALID;
01056                 p += MBCLEN_CHARFOUND_LEN(ret);
01057             }
01058             else {
01059                 *cr = ENC_CODERANGE_BROKEN;
01060                 p++;
01061             }
01062             c++;
01063         }
01064         if (!*cr) *cr = ENC_CODERANGE_7BIT;
01065         return c;
01066     }
01067 
01068     for (c=0; p<e; c++) {
01069         ret = rb_enc_precise_mbclen(p, e, enc);
01070         if (MBCLEN_CHARFOUND_P(ret)) {
01071             *cr |= ENC_CODERANGE_VALID;
01072             p += MBCLEN_CHARFOUND_LEN(ret);
01073         }
01074         else {
01075             *cr = ENC_CODERANGE_BROKEN;
01076             if (p + rb_enc_mbminlen(enc) <= e)
01077                 p += rb_enc_mbminlen(enc);
01078             else
01079                 p = e;
01080         }
01081     }
01082     if (!*cr) *cr = ENC_CODERANGE_7BIT;
01083     return c;
01084 }
01085 
01086 #ifdef NONASCII_MASK
01087 #define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80)
01088 
01089 /*
01090  * UTF-8 leading bytes have either 0xxxxxxx or 11xxxxxx
01091  * bit represention. (see http://en.wikipedia.org/wiki/UTF-8)
01092  * Therefore, following pseudo code can detect UTF-8 leading byte.
01093  *
01094  * if (!(byte & 0x80))
01095  *   byte |= 0x40;          // turn on bit6
01096  * return ((byte>>6) & 1);  // bit6 represent it's leading byte or not.
01097  *
01098  * This function calculate every bytes in the argument word `s'
01099  * using the above logic concurrently. and gather every bytes result.
01100  */
01101 static inline VALUE
01102 count_utf8_lead_bytes_with_word(const VALUE *s)
01103 {
01104     VALUE d = *s;
01105 
01106     /* Transform into bit0 represent UTF-8 leading or not. */
01107     d |= ~(d>>1);
01108     d >>= 6;
01109     d &= NONASCII_MASK >> 7;
01110 
01111     /* Gather every bytes. */
01112     d += (d>>8);
01113     d += (d>>16);
01114 #if SIZEOF_VALUE == 8
01115     d += (d>>32);
01116 #endif
01117     return (d&0xF);
01118 }
01119 #endif
01120 
01121 static long
01122 str_strlen(VALUE str, rb_encoding *enc)
01123 {
01124     const char *p, *e;
01125     long n;
01126     int cr;
01127 
01128     if (single_byte_optimizable(str)) return RSTRING_LEN(str);
01129     if (!enc) enc = STR_ENC_GET(str);
01130     p = RSTRING_PTR(str);
01131     e = RSTRING_END(str);
01132     cr = ENC_CODERANGE(str);
01133 #ifdef NONASCII_MASK
01134     if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
01135         enc == rb_utf8_encoding()) {
01136 
01137         VALUE len = 0;
01138         if ((int)sizeof(VALUE) * 2 < e - p) {
01139             const VALUE *s, *t;
01140             const VALUE lowbits = sizeof(VALUE) - 1;
01141             s = (const VALUE*)(~lowbits & ((VALUE)p + lowbits));
01142             t = (const VALUE*)(~lowbits & (VALUE)e);
01143             while (p < (const char *)s) {
01144                 if (is_utf8_lead_byte(*p)) len++;
01145                 p++;
01146             }
01147             while (s < t) {
01148                 len += count_utf8_lead_bytes_with_word(s);
01149                 s++;
01150             }
01151             p = (const char *)s;
01152         }
01153         while (p < e) {
01154             if (is_utf8_lead_byte(*p)) len++;
01155             p++;
01156         }
01157         return (long)len;
01158     }
01159 #endif
01160     n = rb_enc_strlen_cr(p, e, enc, &cr);
01161     if (cr) {
01162         ENC_CODERANGE_SET(str, cr);
01163     }
01164     return n;
01165 }
01166 
01167 long
01168 rb_str_strlen(VALUE str)
01169 {
01170     return str_strlen(str, STR_ENC_GET(str));
01171 }
01172 
01173 /*
01174  *  call-seq:
01175  *     str.length   -> integer
01176  *     str.size     -> integer
01177  *
01178  *  Returns the character length of <i>str</i>.
01179  */
01180 
01181 VALUE
01182 rb_str_length(VALUE str)
01183 {
01184     long len;
01185 
01186     len = str_strlen(str, STR_ENC_GET(str));
01187     return LONG2NUM(len);
01188 }
01189 
01190 /*
01191  *  call-seq:
01192  *     str.bytesize  -> integer
01193  *
01194  *  Returns the length of +str+ in bytes.
01195  *
01196  *    "\x80\u3042".bytesize  #=> 4
01197  *    "hello".bytesize       #=> 5
01198  */
01199 
01200 static VALUE
01201 rb_str_bytesize(VALUE str)
01202 {
01203     return LONG2NUM(RSTRING_LEN(str));
01204 }
01205 
01206 /*
01207  *  call-seq:
01208  *     str.empty?   -> true or false
01209  *
01210  *  Returns <code>true</code> if <i>str</i> has a length of zero.
01211  *
01212  *     "hello".empty?   #=> false
01213  *     " ".empty?       #=> false
01214  *     "".empty?        #=> true
01215  */
01216 
01217 static VALUE
01218 rb_str_empty(VALUE str)
01219 {
01220     if (RSTRING_LEN(str) == 0)
01221         return Qtrue;
01222     return Qfalse;
01223 }
01224 
01225 /*
01226  *  call-seq:
01227  *     str + other_str   -> new_str
01228  *
01229  *  Concatenation---Returns a new <code>String</code> containing
01230  *  <i>other_str</i> concatenated to <i>str</i>.
01231  *
01232  *     "Hello from " + self.to_s   #=> "Hello from main"
01233  */
01234 
01235 VALUE
01236 rb_str_plus(VALUE str1, VALUE str2)
01237 {
01238     VALUE str3;
01239     rb_encoding *enc;
01240 
01241     StringValue(str2);
01242     enc = rb_enc_check(str1, str2);
01243     str3 = rb_str_new(0, RSTRING_LEN(str1)+RSTRING_LEN(str2));
01244     memcpy(RSTRING_PTR(str3), RSTRING_PTR(str1), RSTRING_LEN(str1));
01245     memcpy(RSTRING_PTR(str3) + RSTRING_LEN(str1),
01246            RSTRING_PTR(str2), RSTRING_LEN(str2));
01247     RSTRING_PTR(str3)[RSTRING_LEN(str3)] = '\0';
01248 
01249     if (OBJ_TAINTED(str1) || OBJ_TAINTED(str2))
01250         OBJ_TAINT(str3);
01251     ENCODING_CODERANGE_SET(str3, rb_enc_to_index(enc),
01252                            ENC_CODERANGE_AND(ENC_CODERANGE(str1), ENC_CODERANGE(str2)));
01253     return str3;
01254 }
01255 
01256 /*
01257  *  call-seq:
01258  *     str * integer   -> new_str
01259  *
01260  *  Copy --- Returns a new String containing +integer+ copies of the receiver.
01261  *  +integer+ must be greater than or equal to 0.
01262  *
01263  *     "Ho! " * 3   #=> "Ho! Ho! Ho! "
01264  *     "Ho! " * 0   #=> ""
01265  */
01266 
01267 VALUE
01268 rb_str_times(VALUE str, VALUE times)
01269 {
01270     VALUE str2;
01271     long n, len;
01272     char *ptr2;
01273 
01274     len = NUM2LONG(times);
01275     if (len < 0) {
01276         rb_raise(rb_eArgError, "negative argument");
01277     }
01278     if (len && LONG_MAX/len <  RSTRING_LEN(str)) {
01279         rb_raise(rb_eArgError, "argument too big");
01280     }
01281 
01282     str2 = rb_str_new5(str, 0, len *= RSTRING_LEN(str));
01283     ptr2 = RSTRING_PTR(str2);
01284     if (len) {
01285         n = RSTRING_LEN(str);
01286         memcpy(ptr2, RSTRING_PTR(str), n);
01287         while (n <= len/2) {
01288             memcpy(ptr2 + n, ptr2, n);
01289             n *= 2;
01290         }
01291         memcpy(ptr2 + n, ptr2, len-n);
01292     }
01293     ptr2[RSTRING_LEN(str2)] = '\0';
01294     OBJ_INFECT(str2, str);
01295     rb_enc_cr_str_copy_for_substr(str2, str);
01296 
01297     return str2;
01298 }
01299 
01300 /*
01301  *  call-seq:
01302  *     str % arg   -> new_str
01303  *
01304  *  Format---Uses <i>str</i> as a format specification, and returns the result
01305  *  of applying it to <i>arg</i>. If the format specification contains more than
01306  *  one substitution, then <i>arg</i> must be an <code>Array</code> or <code>Hash</code>
01307  *  containing the values to be substituted. See <code>Kernel::sprintf</code> for
01308  *  details of the format string.
01309  *
01310  *     "%05d" % 123                              #=> "00123"
01311  *     "%-5s: %08x" % [ "ID", self.object_id ]   #=> "ID   : 200e14d6"
01312  *     "foo = %{foo}" % { :foo => 'bar' }        #=> "foo = bar"
01313  */
01314 
01315 static VALUE
01316 rb_str_format_m(VALUE str, VALUE arg)
01317 {
01318     volatile VALUE tmp = rb_check_array_type(arg);
01319 
01320     if (!NIL_P(tmp)) {
01321         return rb_str_format(RARRAY_LENINT(tmp), RARRAY_PTR(tmp), str);
01322     }
01323     return rb_str_format(1, &arg, str);
01324 }
01325 
01326 static inline void
01327 str_modifiable(VALUE str)
01328 {
01329     if (FL_TEST(str, STR_TMPLOCK)) {
01330         rb_raise(rb_eRuntimeError, "can't modify string; temporarily locked");
01331     }
01332     rb_check_frozen(str);
01333     if (!OBJ_UNTRUSTED(str) && rb_safe_level() >= 4)
01334         rb_raise(rb_eSecurityError, "Insecure: can't modify string");
01335 }
01336 
01337 static inline int
01338 str_independent(VALUE str)
01339 {
01340     str_modifiable(str);
01341     if (!STR_SHARED_P(str)) return 1;
01342     if (STR_EMBED_P(str)) return 1;
01343     return 0;
01344 }
01345 
01346 static void
01347 str_make_independent_expand(VALUE str, long expand)
01348 {
01349     char *ptr;
01350     long len = RSTRING_LEN(str);
01351     long capa = len + expand;
01352 
01353     if (len > capa) len = capa;
01354     ptr = ALLOC_N(char, capa + 1);
01355     if (RSTRING_PTR(str)) {
01356         memcpy(ptr, RSTRING_PTR(str), len);
01357     }
01358     STR_SET_NOEMBED(str);
01359     STR_UNSET_NOCAPA(str);
01360     ptr[len] = 0;
01361     RSTRING(str)->as.heap.ptr = ptr;
01362     RSTRING(str)->as.heap.len = len;
01363     RSTRING(str)->as.heap.aux.capa = capa;
01364 }
01365 
01366 #define str_make_independent(str) str_make_independent_expand((str), 0L)
01367 
01368 void
01369 rb_str_modify(VALUE str)
01370 {
01371     if (!str_independent(str))
01372         str_make_independent(str);
01373     ENC_CODERANGE_CLEAR(str);
01374 }
01375 
01376 void
01377 rb_str_modify_expand(VALUE str, long expand)
01378 {
01379     if (expand < 0) {
01380         rb_raise(rb_eArgError, "negative expanding string size");
01381     }
01382     if (!str_independent(str)) {
01383         str_make_independent_expand(str, expand);
01384     }
01385     else if (expand > 0) {
01386         long len = RSTRING_LEN(str);
01387         long capa = len + expand;
01388         if (!STR_EMBED_P(str)) {
01389             REALLOC_N(RSTRING(str)->as.heap.ptr, char, capa+1);
01390             RSTRING(str)->as.heap.aux.capa = capa;
01391         }
01392         else if (capa > RSTRING_EMBED_LEN_MAX) {
01393             str_make_independent_expand(str, expand);
01394         }
01395     }
01396     ENC_CODERANGE_CLEAR(str);
01397 }
01398 
01399 /* As rb_str_modify(), but don't clear coderange */
01400 static void
01401 str_modify_keep_cr(VALUE str)
01402 {
01403     if (!str_independent(str))
01404         str_make_independent(str);
01405     if (ENC_CODERANGE(str) == ENC_CODERANGE_BROKEN)
01406         /* Force re-scan later */
01407         ENC_CODERANGE_CLEAR(str);
01408 }
01409 
01410 static inline void
01411 str_discard(VALUE str)
01412 {
01413     str_modifiable(str);
01414     if (!STR_SHARED_P(str) && !STR_EMBED_P(str)) {
01415         xfree(RSTRING_PTR(str));
01416         RSTRING(str)->as.heap.ptr = 0;
01417         RSTRING(str)->as.heap.len = 0;
01418     }
01419 }
01420 
01421 void
01422 rb_str_associate(VALUE str, VALUE add)
01423 {
01424     /* sanity check */
01425     rb_check_frozen(str);
01426     if (STR_ASSOC_P(str)) {
01427         /* already associated */
01428         rb_ary_concat(RSTRING(str)->as.heap.aux.shared, add);
01429     }
01430     else {
01431         if (STR_SHARED_P(str)) {
01432             VALUE assoc = RSTRING(str)->as.heap.aux.shared;
01433             str_make_independent(str);
01434             if (STR_ASSOC_P(assoc)) {
01435                 assoc = RSTRING(assoc)->as.heap.aux.shared;
01436                 rb_ary_concat(assoc, add);
01437                 add = assoc;
01438             }
01439         }
01440         else if (STR_EMBED_P(str)) {
01441             str_make_independent(str);
01442         }
01443         else if (RSTRING(str)->as.heap.aux.capa != RSTRING_LEN(str)) {
01444             RESIZE_CAPA(str, RSTRING_LEN(str));
01445         }
01446         FL_SET(str, STR_ASSOC);
01447         RBASIC(add)->klass = 0;
01448         RSTRING(str)->as.heap.aux.shared = add;
01449     }
01450 }
01451 
01452 VALUE
01453 rb_str_associated(VALUE str)
01454 {
01455     if (STR_SHARED_P(str)) str = RSTRING(str)->as.heap.aux.shared;
01456     if (STR_ASSOC_P(str)) {
01457         return RSTRING(str)->as.heap.aux.shared;
01458     }
01459     return Qfalse;
01460 }
01461 
01462 void
01463 rb_must_asciicompat(VALUE str)
01464 {
01465     rb_encoding *enc = rb_enc_get(str);
01466     if (!rb_enc_asciicompat(enc)) {
01467         rb_raise(rb_eEncCompatError, "ASCII incompatible encoding: %s", rb_enc_name(enc));
01468     }
01469 }
01470 
01471 VALUE
01472 rb_string_value(volatile VALUE *ptr)
01473 {
01474     VALUE s = *ptr;
01475     if (!RB_TYPE_P(s, T_STRING)) {
01476         s = rb_str_to_str(s);
01477         *ptr = s;
01478     }
01479     return s;
01480 }
01481 
01482 char *
01483 rb_string_value_ptr(volatile VALUE *ptr)
01484 {
01485     VALUE str = rb_string_value(ptr);
01486     return RSTRING_PTR(str);
01487 }
01488 
01489 char *
01490 rb_string_value_cstr(volatile VALUE *ptr)
01491 {
01492     VALUE str = rb_string_value(ptr);
01493     char *s = RSTRING_PTR(str);
01494     long len = RSTRING_LEN(str);
01495 
01496     if (!s || memchr(s, 0, len)) {
01497         rb_raise(rb_eArgError, "string contains null byte");
01498     }
01499     if (s[len]) {
01500         rb_str_modify(str);
01501         s = RSTRING_PTR(str);
01502         s[RSTRING_LEN(str)] = 0;
01503     }
01504     return s;
01505 }
01506 
01507 VALUE
01508 rb_check_string_type(VALUE str)
01509 {
01510     str = rb_check_convert_type(str, T_STRING, "String", "to_str");
01511     return str;
01512 }
01513 
01514 /*
01515  *  call-seq:
01516  *     String.try_convert(obj) -> string or nil
01517  *
01518  *  Try to convert <i>obj</i> into a String, using to_str method.
01519  *  Returns converted string or nil if <i>obj</i> cannot be converted
01520  *  for any reason.
01521  *
01522  *     String.try_convert("str")     #=> "str"
01523  *     String.try_convert(/re/)      #=> nil
01524  */
01525 static VALUE
01526 rb_str_s_try_convert(VALUE dummy, VALUE str)
01527 {
01528     return rb_check_string_type(str);
01529 }
01530 
/*
 * Walks forward from p by up to *nthp characters of encoding enc and
 * returns the resulting position, never past e.
 *
 * On return *nthp holds the counter as this function left it:
 *  - fixed-width branches do not change it;
 *  - the ASCII-compatible branch stores the number of characters that
 *    could not be skipped (0 on success);
 *  - the generic branch uses a post-decrement, so it stores -1 when the
 *    requested count is reached before e.
 */
01531 static char*
01532 str_nth_len(const char *p, const char *e, long *nthp, rb_encoding *enc)
01533 {
01534     long nth = *nthp;
          /* One byte per character. */
01535     if (rb_enc_mbmaxlen(enc) == 1) {
01536         p += nth;
01537     }
          /* Fixed width: jump by nth whole characters. */
01538     else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
01539         p += nth * rb_enc_mbmaxlen(enc);
01540     }
01541     else if (rb_enc_asciicompat(enc)) {
01542         const char *p2, *e2;
01543         int n;
01544 
01545         while (p < e && 0 < nth) {
                  /* e2: where we would land if every remaining char were one byte. */
01546             e2 = p + nth;
01547             if (e < e2) {
                      /* Fewer bytes left than characters requested. */
01548                 *nthp = nth;
01549                 return (char *)e;
01550             }
01551             if (ISASCII(*p)) {
01552                 p2 = search_nonascii(p, e2);
01553                 if (!p2) {
                          /* Pure ASCII up to e2: all remaining characters consumed. */
01554                     nth -= e2 - p;
01555                     *nthp = nth;
01556                     return (char *)e2;
01557                 }
01558                 nth -= p2 - p;
01559                 p = p2;
01560             }
                  /* Step over one (non-ASCII) character. */
01561             n = rb_enc_mbclen(p, e, enc);
01562             p += n;
01563             nth--;
01564         }
01565         *nthp = nth;
01566         if (nth != 0) {
01567             return (char *)e;
01568         }
01569         return (char *)p;
01570     }
01571     else {
              /* Generic variable-width encoding: one character per iteration. */
01572         while (p < e && nth--) {
01573             p += rb_enc_mbclen(p, e, enc);
01574         }
01575     }
          /* The arithmetic above may overshoot; clamp to the end. */
01576     if (p > e) p = e;
01577     *nthp = nth;
01578     return (char*)p;
01579 }
01580 
01581 char*
01582 rb_enc_nth(const char *p, const char *e, long nth, rb_encoding *enc)
01583 {
01584     return str_nth_len(p, e, &nth, enc);
01585 }
01586 
01587 static char*
01588 str_nth(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
01589 {
01590     if (singlebyte)
01591         p += nth;
01592     else {
01593         p = str_nth_len(p, e, &nth, enc);
01594     }
01595     if (!p) return 0;
01596     if (p > e) p = e;
01597     return (char *)p;
01598 }
01599 
01600 /* char offset to byte offset */
01601 static long
01602 str_offset(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
01603 {
01604     const char *pp = str_nth(p, e, nth, enc, singlebyte);
01605     if (!pp) return e - p;
01606     return pp - p;
01607 }
01608 
01609 long
01610 rb_str_offset(VALUE str, long pos)
01611 {
01612     return str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
01613                       STR_ENC_GET(str), single_byte_optimizable(str));
01614 }
01615 
01616 #ifdef NONASCII_MASK
01617 static char *
01618 str_utf8_nth(const char *p, const char *e, long *nthp)
01619 {
01620     long nth = *nthp;
01621     if ((int)SIZEOF_VALUE * 2 < e - p && (int)SIZEOF_VALUE * 2 < nth) {
01622         const VALUE *s, *t;
01623         const VALUE lowbits = sizeof(VALUE) - 1;
01624         s = (const VALUE*)(~lowbits & ((VALUE)p + lowbits));
01625         t = (const VALUE*)(~lowbits & (VALUE)e);
01626         while (p < (const char *)s) {
01627             if (is_utf8_lead_byte(*p)) nth--;
01628             p++;
01629         }
01630         do {
01631             nth -= count_utf8_lead_bytes_with_word(s);
01632             s++;
01633         } while (s < t && (int)sizeof(VALUE) <= nth);
01634         p = (char *)s;
01635     }
01636     while (p < e) {
01637         if (is_utf8_lead_byte(*p)) {
01638             if (nth == 0) break;
01639             nth--;
01640         }
01641         p++;
01642     }
01643     *nthp = nth;
01644     return (char *)p;
01645 }
01646 
/* Byte distance from p to the position nth UTF-8 characters further on. */
static long
str_utf8_offset(const char *p, const char *e, long nth)
{
    const char *target = str_utf8_nth(p, e, &nth);
    const long nbytes = target - p;

    return nbytes;
}
01653 #endif
01654 
01655 /* byte offset to char offset */
01656 long
01657 rb_str_sublen(VALUE str, long pos)
01658 {
01659     if (single_byte_optimizable(str) || pos < 0)
01660         return pos;
01661     else {
01662         char *p = RSTRING_PTR(str);
01663         return enc_strlen(p, p + pos, STR_ENC_GET(str), ENC_CODERANGE(str));
01664     }
01665 }
01666 
01667 VALUE
01668 rb_str_subseq(VALUE str, long beg, long len)
01669 {
01670     VALUE str2;
01671 
01672     if (RSTRING_LEN(str) == beg + len &&
01673         RSTRING_EMBED_LEN_MAX < len) {
01674         str2 = rb_str_new_shared(rb_str_new_frozen(str));
01675         rb_str_drop_bytes(str2, beg);
01676     }
01677     else {
01678         str2 = rb_str_new5(str, RSTRING_PTR(str)+beg, len);
01679         RB_GC_GUARD(str);
01680     }
01681 
01682     rb_enc_cr_str_copy_for_substr(str2, str);
01683     OBJ_INFECT(str2, str);
01684 
01685     return str2;
01686 }
01687 
01688 static char *
01689 rb_str_subpos(VALUE str, long beg, long *lenp)
01690 {
01691     long len = *lenp;
01692     long slen = -1L;
01693     long blen = RSTRING_LEN(str);
01694     rb_encoding *enc = STR_ENC_GET(str);
01695     char *p, *s = RSTRING_PTR(str), *e = s + blen;
01696 
01697     if (len < 0) return 0;
01698     if (!blen) {
01699         len = 0;
01700     }
01701     if (single_byte_optimizable(str)) {
01702         if (beg > blen) return 0;
01703         if (beg < 0) {
01704             beg += blen;
01705             if (beg < 0) return 0;
01706         }
01707         if (beg + len > blen)
01708             len = blen - beg;
01709         if (len < 0) return 0;
01710         p = s + beg;
01711         goto end;
01712     }
01713     if (beg < 0) {
01714         if (len > -beg) len = -beg;
01715         if (-beg * rb_enc_mbmaxlen(enc) < RSTRING_LEN(str) / 8) {
01716             beg = -beg;
01717             while (beg-- > len && (e = rb_enc_prev_char(s, e, e, enc)) != 0);
01718             p = e;
01719             if (!p) return 0;
01720             while (len-- > 0 && (p = rb_enc_prev_char(s, p, e, enc)) != 0);
01721             if (!p) return 0;
01722             len = e - p;
01723             goto end;
01724         }
01725         else {
01726             slen = str_strlen(str, enc);
01727             beg += slen;
01728             if (beg < 0) return 0;
01729             p = s + beg;
01730             if (len == 0) goto end;
01731         }
01732     }
01733     else if (beg > 0 && beg > RSTRING_LEN(str)) {
01734         return 0;
01735     }
01736     if (len == 0) {
01737         if (beg > str_strlen(str, enc)) return 0;
01738         p = s + beg;
01739     }
01740 #ifdef NONASCII_MASK
01741     else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
01742         enc == rb_utf8_encoding()) {
01743         p = str_utf8_nth(s, e, &beg);
01744         if (beg > 0) return 0;
01745         len = str_utf8_offset(p, e, len);
01746     }
01747 #endif
01748     else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
01749         int char_sz = rb_enc_mbmaxlen(enc);
01750 
01751         p = s + beg * char_sz;
01752         if (p > e) {
01753             return 0;
01754         }
01755         else if (len * char_sz > e - p)
01756             len = e - p;
01757         else
01758             len *= char_sz;
01759     }
01760     else if ((p = str_nth_len(s, e, &beg, enc)) == e) {
01761         if (beg > 0) return 0;
01762         len = 0;
01763     }
01764     else {
01765         len = str_offset(p, e, len, enc, 0);
01766     }
01767   end:
01768     *lenp = len;
01769     RB_GC_GUARD(str);
01770     return p;
01771 }
01772 
01773 VALUE
01774 rb_str_substr(VALUE str, long beg, long len)
01775 {
01776     VALUE str2;
01777     char *p = rb_str_subpos(str, beg, &len);
01778 
01779     if (!p) return Qnil;
01780     if (len > RSTRING_EMBED_LEN_MAX && p + len == RSTRING_END(str)) {
01781         str2 = rb_str_new4(str);
01782         str2 = str_new3(rb_obj_class(str2), str2);
01783         RSTRING(str2)->as.heap.ptr += RSTRING(str2)->as.heap.len - len;
01784         RSTRING(str2)->as.heap.len = len;
01785     }
01786     else {
01787         str2 = rb_str_new5(str, p, len);
01788         rb_enc_cr_str_copy_for_substr(str2, str);
01789         OBJ_INFECT(str2, str);
01790         RB_GC_GUARD(str);
01791     }
01792 
01793     return str2;
01794 }
01795 
01796 VALUE
01797 rb_str_freeze(VALUE str)
01798 {
01799     if (STR_ASSOC_P(str)) {
01800         VALUE ary = RSTRING(str)->as.heap.aux.shared;
01801         OBJ_FREEZE(ary);
01802     }
01803     return rb_obj_freeze(str);
01804 }
01805 
/*
 * rb_str_dup_frozen is kept as a second name for rb_str_new_frozen.
 * NOTE(review): RUBY_ALIAS_FUNCTION is defined outside this section; it
 * presumably emits the alias symbol -- confirm.  The #define below maps
 * later uses in this file straight to rb_str_new_frozen.
 */
01806 RUBY_ALIAS_FUNCTION(rb_str_dup_frozen(VALUE str), rb_str_new_frozen, (str))
01807 #define rb_str_dup_frozen rb_str_new_frozen
01808 
01809 VALUE
01810 rb_str_locktmp(VALUE str)
01811 {
01812     if (FL_TEST(str, STR_TMPLOCK)) {
01813         rb_raise(rb_eRuntimeError, "temporal locking already locked string");
01814     }
01815     FL_SET(str, STR_TMPLOCK);
01816     return str;
01817 }
01818 
01819 VALUE
01820 rb_str_unlocktmp(VALUE str)
01821 {
01822     if (!FL_TEST(str, STR_TMPLOCK)) {
01823         rb_raise(rb_eRuntimeError, "temporal unlocking already unlocked string");
01824     }
01825     FL_UNSET(str, STR_TMPLOCK);
01826     return str;
01827 }
01828 
01829 void
01830 rb_str_set_len(VALUE str, long len)
01831 {
01832     long capa;
01833 
01834     str_modifiable(str);
01835     if (STR_SHARED_P(str)) {
01836         rb_raise(rb_eRuntimeError, "can't set length of shared string");
01837     }
01838     if (len > (capa = (long)rb_str_capacity(str))) {
01839         rb_bug("probable buffer overflow: %ld for %ld", len, capa);
01840     }
01841     STR_SET_LEN(str, len);
01842     RSTRING_PTR(str)[len] = '\0';
01843 }
01844 
/*
 * Changes the byte length of str to len, switching between embedded and
 * heap storage as needed, and returns str.  The cached code range is
 * always cleared.  Raises ArgumentError for a negative len; the
 * str_independent() call raises if str is not modifiable.
 */
01845 VALUE
01846 rb_str_resize(VALUE str, long len)
01847 {
01848     long slen;
01849     int independent;
01850 
01851     if (len < 0) {
01852         rb_raise(rb_eArgError, "negative string size (or size too big)");
01853     }
01854 
01855     independent = str_independent(str);
01856     ENC_CODERANGE_CLEAR(str);
01857     slen = RSTRING_LEN(str);
01858     if (len != slen) {
01859         if (STR_EMBED_P(str)) {
                  /* Embedded and still fits: only the length changes. */
01860             if (len <= RSTRING_EMBED_LEN_MAX) {
01861                 STR_SET_EMBED_LEN(str, len);
01862                 RSTRING(str)->as.ary[len] = '\0';
01863                 return str;
01864             }
                  /* Embedded but too long now: move to a heap buffer. */
01865             str_make_independent_expand(str, len - slen);
01866             STR_SET_NOEMBED(str);
01867         }
01868         else if (len <= RSTRING_EMBED_LEN_MAX) {
                  /* Heap string that now fits in the embedded buffer. */
01869             char *ptr = RSTRING(str)->as.heap.ptr;
01870             STR_SET_EMBED(str);
01871             if (slen > len) slen = len;
01872             if (slen > 0) MEMCPY(RSTRING(str)->as.ary, ptr, char, slen);
01873             RSTRING(str)->as.ary[len] = '\0';
01874             STR_SET_EMBED_LEN(str, len);
                  /* The old buffer is freed only if this string owned it. */
01875             if (independent) xfree(ptr);
01876             return str;
01877         }
01878         else if (!independent) {
                  /* Shared buffer: take a private copy of the new size. */
01879             str_make_independent_expand(str, len - slen);
01880         }
              /* Own heap buffer: realloc when growing or shrinking by more than 1024 bytes. */
01881         else if (slen < len || slen - len > 1024) {
01882             REALLOC_N(RSTRING(str)->as.heap.ptr, char, len+1);
01883         }
01884         if (!STR_NOCAPA_P(str)) {
01885             RSTRING(str)->as.heap.aux.capa = len;
01886         }
01887         RSTRING(str)->as.heap.len = len;
01888         RSTRING(str)->as.heap.ptr[len] = '\0';  /* sentinel */
01889     }
01890     return str;
01891 }
01892 
01893 static VALUE
01894 str_buf_cat(VALUE str, const char *ptr, long len)
01895 {
01896     long capa, total, off = -1;
01897 
01898     if (ptr >= RSTRING_PTR(str) && ptr <= RSTRING_END(str)) {
01899         off = ptr - RSTRING_PTR(str);
01900     }
01901     rb_str_modify(str);
01902     if (len == 0) return 0;
01903     if (STR_ASSOC_P(str)) {
01904         FL_UNSET(str, STR_ASSOC);
01905         capa = RSTRING(str)->as.heap.aux.capa = RSTRING_LEN(str);
01906     }
01907     else if (STR_EMBED_P(str)) {
01908         capa = RSTRING_EMBED_LEN_MAX;
01909     }
01910     else {
01911         capa = RSTRING(str)->as.heap.aux.capa;
01912     }
01913     if (RSTRING_LEN(str) >= LONG_MAX - len) {
01914         rb_raise(rb_eArgError, "string sizes too big");
01915     }
01916     total = RSTRING_LEN(str)+len;
01917     if (capa <= total) {
01918         while (total > capa) {
01919             if (capa + 1 >= LONG_MAX / 2) {
01920                 capa = (total + 4095) / 4096;
01921                 break;
01922             }
01923             capa = (capa + 1) * 2;
01924         }
01925         RESIZE_CAPA(str, capa);
01926     }
01927     if (off != -1) {
01928         ptr = RSTRING_PTR(str) + off;
01929     }
01930     memcpy(RSTRING_PTR(str) + RSTRING_LEN(str), ptr, len);
01931     STR_SET_LEN(str, total);
01932     RSTRING_PTR(str)[total] = '\0'; /* sentinel */
01933 
01934     return str;
01935 }
01936 
01937 #define str_buf_cat2(str, ptr) str_buf_cat((str), (ptr), strlen(ptr))
01938 
01939 VALUE
01940 rb_str_buf_cat(VALUE str, const char *ptr, long len)
01941 {
01942     if (len == 0) return str;
01943     if (len < 0) {
01944         rb_raise(rb_eArgError, "negative string size (or size too big)");
01945     }
01946     return str_buf_cat(str, ptr, len);
01947 }
01948 
01949 VALUE
01950 rb_str_buf_cat2(VALUE str, const char *ptr)
01951 {
01952     return rb_str_buf_cat(str, ptr, strlen(ptr));
01953 }
01954 
01955 VALUE
01956 rb_str_cat(VALUE str, const char *ptr, long len)
01957 {
01958     if (len < 0) {
01959         rb_raise(rb_eArgError, "negative string size (or size too big)");
01960     }
01961     if (STR_ASSOC_P(str)) {
01962         char *p;
01963         rb_str_modify_expand(str, len);
01964         p = RSTRING(str)->as.heap.ptr;
01965         memcpy(p + RSTRING(str)->as.heap.len, ptr, len);
01966         len = RSTRING(str)->as.heap.len += len;
01967         p[len] = '\0'; /* sentinel */
01968         return str;
01969     }
01970 
01971     return rb_str_buf_cat(str, ptr, len);
01972 }
01973 
01974 VALUE
01975 rb_str_cat2(VALUE str, const char *ptr)
01976 {
01977     return rb_str_cat(str, ptr, strlen(ptr));
01978 }
01979 
/*
 * Appends len bytes at ptr to str, taking encodings and code ranges into
 * account, and returns str.
 *
 * ptr_encindex / ptr_cr describe the appended bytes; ptr_cr may be
 * ENC_CODERANGE_UNKNOWN, in which case the bytes are scanned when the
 * result depends on it.  If ptr_cr_ret is non-NULL it receives the code
 * range finally used for ptr.  On success str's encoding index and code
 * range are set to the combined result.
 *
 * Raises EncodingCompatibilityError when the encodings differ and
 * neither side is 7bit, or when an ASCII-incompatible encoding meets a
 * non-empty str.  Raises ArgumentError for a negative len.
 */
01980 static VALUE
01981 rb_enc_cr_str_buf_cat(VALUE str, const char *ptr, long len,
01982     int ptr_encindex, int ptr_cr, int *ptr_cr_ret)
01983 {
01984     int str_encindex = ENCODING_GET(str);
01985     int res_encindex;
01986     int str_cr, res_cr;
01987 
01988     str_cr = ENC_CODERANGE(str);
01989 
01990     if (str_encindex == ptr_encindex) {
              /* Same encoding: scan ptr only if str's own range is known. */
01991         if (str_cr == ENC_CODERANGE_UNKNOWN)
01992             ptr_cr = ENC_CODERANGE_UNKNOWN;
01993         else if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
01994             ptr_cr = coderange_scan(ptr, len, rb_enc_from_index(ptr_encindex));
01995         }
01996     }
01997     else {
01998         rb_encoding *str_enc = rb_enc_from_index(str_encindex);
01999         rb_encoding *ptr_enc = rb_enc_from_index(ptr_encindex);
02000         if (!rb_enc_asciicompat(str_enc) || !rb_enc_asciicompat(ptr_enc)) {
                  /* ASCII-incompatible side: only trivial cases are allowed. */
02001             if (len == 0)
02002                 return str;
02003             if (RSTRING_LEN(str) == 0) {
                      /* Empty receiver simply adopts ptr's encoding. */
02004                 rb_str_buf_cat(str, ptr, len);
02005                 ENCODING_CODERANGE_SET(str, ptr_encindex, ptr_cr);
02006                 return str;
02007             }
02008             goto incompatible;
02009         }
02010         if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
02011             ptr_cr = coderange_scan(ptr, len, ptr_enc);
02012         }
02013         if (str_cr == ENC_CODERANGE_UNKNOWN) {
02014             if (ENCODING_IS_ASCII8BIT(str) || ptr_cr != ENC_CODERANGE_7BIT) {
02015                 str_cr = rb_enc_str_coderange(str);
02016             }
02017         }
02018     }
02019     if (ptr_cr_ret)
02020         *ptr_cr_ret = ptr_cr;
02021 
          /* Different encodings can only be mixed if one side is pure 7bit. */
02022     if (str_encindex != ptr_encindex &&
02023         str_cr != ENC_CODERANGE_7BIT &&
02024         ptr_cr != ENC_CODERANGE_7BIT) {
02025       incompatible:
02026         rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
02027             rb_enc_name(rb_enc_from_index(str_encindex)),
02028             rb_enc_name(rb_enc_from_index(ptr_encindex)));
02029     }
02030 
          /* Work out the encoding index and code range of the result. */
02031     if (str_cr == ENC_CODERANGE_UNKNOWN) {
02032         res_encindex = str_encindex;
02033         res_cr = ENC_CODERANGE_UNKNOWN;
02034     }
02035     else if (str_cr == ENC_CODERANGE_7BIT) {
02036         if (ptr_cr == ENC_CODERANGE_7BIT) {
02037             res_encindex = str_encindex;
02038             res_cr = ENC_CODERANGE_7BIT;
02039         }
02040         else {
                  /* 7bit receiver takes over the encoding of the appended data. */
02041             res_encindex = ptr_encindex;
02042             res_cr = ptr_cr;
02043         }
02044     }
02045     else if (str_cr == ENC_CODERANGE_VALID) {
02046         res_encindex = str_encindex;
02047         if (ptr_cr == ENC_CODERANGE_7BIT || ptr_cr == ENC_CODERANGE_VALID)
02048             res_cr = str_cr;
02049         else
02050             res_cr = ptr_cr;
02051     }
02052     else { /* str_cr == ENC_CODERANGE_BROKEN */
02053         res_encindex = str_encindex;
02054         res_cr = str_cr;
              /* Any appended bytes reset the range to unknown. */
02055         if (0 < len) res_cr = ENC_CODERANGE_UNKNOWN;
02056     }
02057 
          /* NOTE(review): this check runs after coderange_scan() may already
           * have been called with the same len -- confirm that is intended. */
02058     if (len < 0) {
02059         rb_raise(rb_eArgError, "negative string size (or size too big)");
02060     }
02061     str_buf_cat(str, ptr, len);
02062     ENCODING_CODERANGE_SET(str, res_encindex, res_cr);
02063     return str;
02064 }
02065 
02066 VALUE
02067 rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *ptr_enc)
02068 {
02069     return rb_enc_cr_str_buf_cat(str, ptr, len,
02070         rb_enc_to_index(ptr_enc), ENC_CODERANGE_UNKNOWN, NULL);
02071 }
02072 
02073 VALUE
02074 rb_str_buf_cat_ascii(VALUE str, const char *ptr)
02075 {
02076     /* ptr must reference NUL terminated ASCII string. */
02077     int encindex = ENCODING_GET(str);
02078     rb_encoding *enc = rb_enc_from_index(encindex);
02079     if (rb_enc_asciicompat(enc)) {
02080         return rb_enc_cr_str_buf_cat(str, ptr, strlen(ptr),
02081             encindex, ENC_CODERANGE_7BIT, 0);
02082     }
02083     else {
02084         char *buf = ALLOCA_N(char, rb_enc_mbmaxlen(enc));
02085         while (*ptr) {
02086             unsigned int c = (unsigned char)*ptr;
02087             int len = rb_enc_codelen(c, enc);
02088             rb_enc_mbcput(c, buf, enc);
02089             rb_enc_cr_str_buf_cat(str, buf, len,
02090                 encindex, ENC_CODERANGE_VALID, 0);
02091             ptr++;
02092         }
02093         return str;
02094     }
02095 }
02096 
02097 VALUE
02098 rb_str_buf_append(VALUE str, VALUE str2)
02099 {
02100     int str2_cr;
02101 
02102     str2_cr = ENC_CODERANGE(str2);
02103 
02104     rb_enc_cr_str_buf_cat(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
02105         ENCODING_GET(str2), str2_cr, &str2_cr);
02106 
02107     OBJ_INFECT(str, str2);
02108     ENC_CODERANGE_SET(str2, str2_cr);
02109 
02110     return str;
02111 }
02112 
02113 VALUE
02114 rb_str_append(VALUE str, VALUE str2)
02115 {
02116     rb_encoding *enc;
02117     int cr, cr2;
02118     long len2;
02119 
02120     StringValue(str2);
02121     if ((len2 = RSTRING_LEN(str2)) > 0 && STR_ASSOC_P(str)) {
02122         long len = RSTRING_LEN(str) + len2;
02123         enc = rb_enc_check(str, str2);
02124         cr = ENC_CODERANGE(str);
02125         if ((cr2 = ENC_CODERANGE(str2)) > cr) cr = cr2;
02126         rb_str_modify_expand(str, len2);
02127         memcpy(RSTRING(str)->as.heap.ptr + RSTRING(str)->as.heap.len,
02128                RSTRING_PTR(str2), len2+1);
02129         RSTRING(str)->as.heap.len = len;
02130         rb_enc_associate(str, enc);
02131         ENC_CODERANGE_SET(str, cr);
02132         OBJ_INFECT(str, str2);
02133         return str;
02134     }
02135     return rb_str_buf_append(str, str2);
02136 }
02137 
02138 /*
02139  *  call-seq:
02140  *     str << integer       -> str
02141  *     str.concat(integer)  -> str
02142  *     str << obj           -> str
02143  *     str.concat(obj)      -> str
02144  *
02145  *  Append---Concatenates the given object to <i>str</i>. If the object is a
02146  *  <code>Integer</code>, it is considered as a codepoint, and is converted
02147  *  to a character before concatenation.
02148  *
02149  *     a = "hello "
02150  *     a << "world"   #=> "hello world"
02151  *     a.concat(33)   #=> "hello world!"
02152  */
02153 
02154 VALUE
02155 rb_str_concat(VALUE str1, VALUE str2)
02156 {
02157     unsigned int code;
02158     rb_encoding *enc = STR_ENC_GET(str1);
02159 
02160     if (FIXNUM_P(str2) || RB_TYPE_P(str2, T_BIGNUM)) {
02161         if (rb_num_to_uint(str2, &code) == 0) {
02162         }
02163         else if (FIXNUM_P(str2)) {
02164             rb_raise(rb_eRangeError, "%ld out of char range", FIX2LONG(str2));
02165         }
02166         else {
02167             rb_raise(rb_eRangeError, "bignum out of char range");
02168         }
02169     }
02170     else {
02171         return rb_str_append(str1, str2);
02172     }
02173 
02174     if (enc == rb_usascii_encoding()) {
02175         /* US-ASCII automatically extended to ASCII-8BIT */
02176         char buf[1];
02177         buf[0] = (char)code;
02178         if (code > 0xFF) {
02179             rb_raise(rb_eRangeError, "%u out of char range", code);
02180         }
02181         rb_str_cat(str1, buf, 1);
02182         if (code > 127) {
02183             rb_enc_associate(str1, rb_ascii8bit_encoding());
02184             ENC_CODERANGE_SET(str1, ENC_CODERANGE_VALID);
02185         }
02186     }
02187     else {
02188         long pos = RSTRING_LEN(str1);
02189         int cr = ENC_CODERANGE(str1);
02190         int len;
02191         char *buf;
02192 
02193         switch (len = rb_enc_codelen(code, enc)) {
02194           case ONIGERR_INVALID_CODE_POINT_VALUE:
02195             rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
02196             break;
02197           case ONIGERR_TOO_BIG_WIDE_CHAR_VALUE:
02198           case 0:
02199             rb_raise(rb_eRangeError, "%u out of char range", code);
02200             break;
02201         }
02202         buf = ALLOCA_N(char, len + 1);
02203         rb_enc_mbcput(code, buf, enc);
02204         if (rb_enc_precise_mbclen(buf, buf + len + 1, enc) != len) {
02205             rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
02206         }
02207         rb_str_resize(str1, pos+len);
02208         memcpy(RSTRING_PTR(str1) + pos, buf, len);
02209         if (cr == ENC_CODERANGE_7BIT && code > 127)
02210             cr = ENC_CODERANGE_VALID;
02211         ENC_CODERANGE_SET(str1, cr);
02212     }
02213     return str1;
02214 }
02215 
02216 /*
02217  *  call-seq:
02218  *     str.prepend(other_str)  -> str
02219  *
02220  *  Prepend---Prepend the given string to <i>str</i>.
02221  *
02222  *     a = "world"
02223  *     a.prepend("hello ") #=> "hello world"
02224  *     a                   #=> "hello world"
02225  */
02226 
02227 static VALUE
02228 rb_str_prepend(VALUE str, VALUE str2)
02229 {
02230     StringValue(str2);
02231     StringValue(str);
02232     rb_str_update(str, 0L, 0L, str2);
02233     return str;
02234 }
02235 
02236 st_index_t
02237 rb_str_hash(VALUE str)
02238 {
02239     int e = ENCODING_GET(str);
02240     if (e && rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT) {
02241         e = 0;
02242     }
02243     return rb_memhash((const void *)RSTRING_PTR(str), RSTRING_LEN(str)) ^ e;
02244 }
02245 
02246 int
02247 rb_str_hash_cmp(VALUE str1, VALUE str2)
02248 {
02249     long len;
02250 
02251     if (!rb_str_comparable(str1, str2)) return 1;
02252     if (RSTRING_LEN(str1) == (len = RSTRING_LEN(str2)) &&
02253         memcmp(RSTRING_PTR(str1), RSTRING_PTR(str2), len) == 0) {
02254         return 0;
02255     }
02256     return 1;
02257 }
02258 
02259 /*
02260  * call-seq:
02261  *    str.hash   -> fixnum
02262  *
02263  * Return a hash based on the string's length and content.
02264  */
02265 
02266 static VALUE
02267 rb_str_hash_m(VALUE str)
02268 {
02269     st_index_t hval = rb_str_hash(str);
02270     return INT2FIX(hval);
02271 }
02272 
/* Yields the smaller of a and b; one of the arguments is evaluated twice. */
#define lesser(a,b) (((b)<(a))?(b):(a))
02274 
02275 int
02276 rb_str_comparable(VALUE str1, VALUE str2)
02277 {
02278     int idx1, idx2;
02279     int rc1, rc2;
02280 
02281     if (RSTRING_LEN(str1) == 0) return TRUE;
02282     if (RSTRING_LEN(str2) == 0) return TRUE;
02283     idx1 = ENCODING_GET(str1);
02284     idx2 = ENCODING_GET(str2);
02285     if (idx1 == idx2) return TRUE;
02286     rc1 = rb_enc_str_coderange(str1);
02287     rc2 = rb_enc_str_coderange(str2);
02288     if (rc1 == ENC_CODERANGE_7BIT) {
02289         if (rc2 == ENC_CODERANGE_7BIT) return TRUE;
02290         if (rb_enc_asciicompat(rb_enc_from_index(idx2)))
02291             return TRUE;
02292     }
02293     if (rc2 == ENC_CODERANGE_7BIT) {
02294         if (rb_enc_asciicompat(rb_enc_from_index(idx1)))
02295             return TRUE;
02296     }
02297     return FALSE;
02298 }
02299 
02300 int
02301 rb_str_cmp(VALUE str1, VALUE str2)
02302 {
02303     long len1, len2;
02304     const char *ptr1, *ptr2;
02305     int retval;
02306 
02307     if (str1 == str2) return 0;
02308     RSTRING_GETMEM(str1, ptr1, len1);
02309     RSTRING_GETMEM(str2, ptr2, len2);
02310     if (ptr1 == ptr2 || (retval = memcmp(ptr1, ptr2, lesser(len1, len2))) == 0) {
02311         if (len1 == len2) {
02312             if (!rb_str_comparable(str1, str2)) {
02313                 if (ENCODING_GET(str1) > ENCODING_GET(str2))
02314                     return 1;
02315                 return -1;
02316             }
02317             return 0;
02318         }
02319         if (len1 > len2) return 1;
02320         return -1;
02321     }
02322     if (retval > 0) return 1;
02323     return -1;
02324 }
02325 
02326 /* expect tail call optimization */
02327 static VALUE
02328 str_eql(const VALUE str1, const VALUE str2)
02329 {
02330     const long len = RSTRING_LEN(str1);
02331     const char *ptr1, *ptr2;
02332 
02333     if (len != RSTRING_LEN(str2)) return Qfalse;
02334     if (!rb_str_comparable(str1, str2)) return Qfalse;
02335     if ((ptr1 = RSTRING_PTR(str1)) == (ptr2 = RSTRING_PTR(str2)))
02336         return Qtrue;
02337     if (memcmp(ptr1, ptr2, len) == 0)
02338         return Qtrue;
02339     return Qfalse;
02340 }
02341 /*
02342  *  call-seq:
02343  *     str == obj   -> true or false
02344  *
02345  *  Equality---If <i>obj</i> is not a <code>String</code>, returns
02346  *  <code>false</code>. Otherwise, returns <code>true</code> if <i>str</i>
02347  *  <code><=></code> <i>obj</i> returns zero.
02348  */
02349 
02350 VALUE
02351 rb_str_equal(VALUE str1, VALUE str2)
02352 {
02353     if (str1 == str2) return Qtrue;
02354     if (!RB_TYPE_P(str2, T_STRING)) {
02355         if (!rb_respond_to(str2, rb_intern("to_str"))) {
02356             return Qfalse;
02357         }
02358         return rb_equal(str2, str1);
02359     }
02360     return str_eql(str1, str2);
02361 }
02362 
02363 /*
02364  * call-seq:
02365  *   str.eql?(other)   -> true or false
02366  *
02367  * Two strings are equal if they have the same length and content.
02368  */
02369 
02370 static VALUE
02371 rb_str_eql(VALUE str1, VALUE str2)
02372 {
02373     if (str1 == str2) return Qtrue;
02374     if (!RB_TYPE_P(str2, T_STRING)) return Qfalse;
02375     return str_eql(str1, str2);
02376 }
02377 
02378 /*
02379  *  call-seq:
02380  *     string <=> other_string   -> -1, 0, +1 or nil
02381  *
02382  *
02383  *  Comparison---Returns -1, 0, +1 or nil depending on whether +string+ is less
02384  *  than, equal to, or greater than +other_string+.
02385  *
02386  *  +nil+ is returned if the two values are incomparable.
02387  *
02388  *  If the strings are of different lengths, and the strings are equal when
02389  *  compared up to the shortest length, then the longer string is considered
02390  *  greater than the shorter one.
02391  *
02392  *  <code><=></code> is the basis for the methods <code><</code>,
02393  *  <code><=</code>, <code>></code>, <code>>=</code>, and
02394  *  <code>between?</code>, included from module Comparable. The method
02395  *  String#== does not use Comparable#==.
02396  *
02397  *     "abcdef" <=> "abcde"     #=> 1
02398  *     "abcdef" <=> "abcdef"    #=> 0
02399  *     "abcdef" <=> "abcdefg"   #=> -1
02400  *     "abcdef" <=> "ABCDEF"    #=> 1
02401  */
02402 
02403 static VALUE
02404 rb_str_cmp_m(VALUE str1, VALUE str2)
02405 {
02406     int result;
02407 
02408     if (!RB_TYPE_P(str2, T_STRING)) {
02409         VALUE tmp = rb_check_funcall(str2, rb_intern("to_str"), 0, 0);
02410         if (RB_TYPE_P(tmp, T_STRING)) {
02411             result = rb_str_cmp(str1, tmp);
02412         }
02413         else {
02414             return rb_invcmp(str1, str2);
02415         }
02416     }
02417     else {
02418         result = rb_str_cmp(str1, str2);
02419     }
02420     return INT2FIX(result);
02421 }
02422 
02423 /*
02424  *  call-seq:
02425  *     str.casecmp(other_str)   -> -1, 0, +1 or nil
02426  *
02427  *  Case-insensitive version of <code>String#<=></code>.
02428  *
02429  *     "abcdef".casecmp("abcde")     #=> 1
02430  *     "aBcDeF".casecmp("abcdef")    #=> 0
02431  *     "abcdef".casecmp("abcdefg")   #=> -1
02432  *     "abcdef".casecmp("ABCDEF")    #=> 0
02433  */
02434 
02435 static VALUE
02436 rb_str_casecmp(VALUE str1, VALUE str2)
02437 {
02438     long len;
02439     rb_encoding *enc;
02440     char *p1, *p1end, *p2, *p2end;
02441 
02442     StringValue(str2);
02443     enc = rb_enc_compatible(str1, str2);
02444     if (!enc) {
02445         return Qnil;
02446     }
02447 
02448     p1 = RSTRING_PTR(str1); p1end = RSTRING_END(str1);
02449     p2 = RSTRING_PTR(str2); p2end = RSTRING_END(str2);
02450     if (single_byte_optimizable(str1) && single_byte_optimizable(str2)) {
02451         while (p1 < p1end && p2 < p2end) {
02452             if (*p1 != *p2) {
02453                 unsigned int c1 = TOUPPER(*p1 & 0xff);
02454                 unsigned int c2 = TOUPPER(*p2 & 0xff);
02455                 if (c1 != c2)
02456                     return INT2FIX(c1 < c2 ? -1 : 1);
02457             }
02458             p1++;
02459             p2++;
02460         }
02461     }
02462     else {
02463         while (p1 < p1end && p2 < p2end) {
02464             int l1, c1 = rb_enc_ascget(p1, p1end, &l1, enc);
02465             int l2, c2 = rb_enc_ascget(p2, p2end, &l2, enc);
02466 
02467             if (0 <= c1 && 0 <= c2) {
02468                 c1 = TOUPPER(c1);
02469                 c2 = TOUPPER(c2);
02470                 if (c1 != c2)
02471                     return INT2FIX(c1 < c2 ? -1 : 1);
02472             }
02473             else {
02474                 int r;
02475                 l1 = rb_enc_mbclen(p1, p1end, enc);
02476                 l2 = rb_enc_mbclen(p2, p2end, enc);
02477                 len = l1 < l2 ? l1 : l2;
02478                 r = memcmp(p1, p2, len);
02479                 if (r != 0)
02480                     return INT2FIX(r < 0 ? -1 : 1);
02481                 if (l1 != l2)
02482                     return INT2FIX(l1 < l2 ? -1 : 1);
02483             }
02484             p1 += l1;
02485             p2 += l2;
02486         }
02487     }
02488     if (RSTRING_LEN(str1) == RSTRING_LEN(str2)) return INT2FIX(0);
02489     if (RSTRING_LEN(str1) > RSTRING_LEN(str2)) return INT2FIX(1);
02490     return INT2FIX(-1);
02491 }
02492 
02493 static long
02494 rb_str_index(VALUE str, VALUE sub, long offset)
02495 {
02496     long pos;
02497     char *s, *sptr, *e;
02498     long len, slen;
02499     rb_encoding *enc;
02500 
02501     enc = rb_enc_check(str, sub);
02502     if (is_broken_string(sub)) {
02503         return -1;
02504     }
02505     len = str_strlen(str, enc);
02506     slen = str_strlen(sub, enc);
02507     if (offset < 0) {
02508         offset += len;
02509         if (offset < 0) return -1;
02510     }
02511     if (len - offset < slen) return -1;
02512     s = RSTRING_PTR(str);
02513     e = s + RSTRING_LEN(str);
02514     if (offset) {
02515         offset = str_offset(s, RSTRING_END(str), offset, enc, single_byte_optimizable(str));
02516         s += offset;
02517     }
02518     if (slen == 0) return offset;
02519     /* need proceed one character at a time */
02520     sptr = RSTRING_PTR(sub);
02521     slen = RSTRING_LEN(sub);
02522     len = RSTRING_LEN(str) - offset;
02523     for (;;) {
02524         char *t;
02525         pos = rb_memsearch(sptr, slen, s, len, enc);
02526         if (pos < 0) return pos;
02527         t = rb_enc_right_char_head(s, s+pos, e, enc);
02528         if (t == s + pos) break;
02529         if ((len -= t - s) <= 0) return -1;
02530         offset += t - s;
02531         s = t;
02532     }
02533     return pos + offset;
02534 }
02535 
02536 
02537 /*
02538  *  call-seq:
02539  *     str.index(substring [, offset])   -> fixnum or nil
02540  *     str.index(regexp [, offset])      -> fixnum or nil
02541  *
02542  *  Returns the index of the first occurrence of the given <i>substring</i> or
02543  *  pattern (<i>regexp</i>) in <i>str</i>. Returns <code>nil</code> if not
02544  *  found. If the second parameter is present, it specifies the position in the
02545  *  string to begin the search.
02546  *
02547  *     "hello".index('e')             #=> 1
02548  *     "hello".index('lo')            #=> 3
02549  *     "hello".index('a')             #=> nil
02550  *     "hello".index(?e)              #=> 1
02551  *     "hello".index(/[aeiou]/, -3)   #=> 4
02552  */
02553 
02554 static VALUE
02555 rb_str_index_m(int argc, VALUE *argv, VALUE str)
02556 {
02557     VALUE sub;
02558     VALUE initpos;
02559     long pos;
02560 
02561     if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
02562         pos = NUM2LONG(initpos);
02563     }
02564     else {
02565         pos = 0;
02566     }
02567     if (pos < 0) {
02568         pos += str_strlen(str, STR_ENC_GET(str));
02569         if (pos < 0) {
02570             if (RB_TYPE_P(sub, T_REGEXP)) {
02571                 rb_backref_set(Qnil);
02572             }
02573             return Qnil;
02574         }
02575     }
02576 
02577     if (SPECIAL_CONST_P(sub)) goto generic;
02578     switch (BUILTIN_TYPE(sub)) {
02579       case T_REGEXP:
02580         if (pos > str_strlen(str, STR_ENC_GET(str)))
02581             return Qnil;
02582         pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
02583                          rb_enc_check(str, sub), single_byte_optimizable(str));
02584 
02585         pos = rb_reg_search(sub, str, pos, 0);
02586         pos = rb_str_sublen(str, pos);
02587         break;
02588 
02589       generic:
02590       default: {
02591         VALUE tmp;
02592 
02593         tmp = rb_check_string_type(sub);
02594         if (NIL_P(tmp)) {
02595             rb_raise(rb_eTypeError, "type mismatch: %s given",
02596                      rb_obj_classname(sub));
02597         }
02598         sub = tmp;
02599       }
02600         /* fall through */
02601       case T_STRING:
02602         pos = rb_str_index(str, sub, pos);
02603         pos = rb_str_sublen(str, pos);
02604         break;
02605     }
02606 
02607     if (pos == -1) return Qnil;
02608     return LONG2NUM(pos);
02609 }
02610 
02611 static long
02612 rb_str_rindex(VALUE str, VALUE sub, long pos)
02613 {
02614     long len, slen;
02615     char *s, *sbeg, *e, *t;
02616     rb_encoding *enc;
02617     int singlebyte = single_byte_optimizable(str);
02618 
02619     enc = rb_enc_check(str, sub);
02620     if (is_broken_string(sub)) {
02621         return -1;
02622     }
02623     len = str_strlen(str, enc);
02624     slen = str_strlen(sub, enc);
02625     /* substring longer than string */
02626     if (len < slen) return -1;
02627     if (len - pos < slen) {
02628         pos = len - slen;
02629     }
02630     if (len == 0) {
02631         return pos;
02632     }
02633     sbeg = RSTRING_PTR(str);
02634     e = RSTRING_END(str);
02635     t = RSTRING_PTR(sub);
02636     slen = RSTRING_LEN(sub);
02637     s = str_nth(sbeg, e, pos, enc, singlebyte);
02638     while (s) {
02639         if (memcmp(s, t, slen) == 0) {
02640             return pos;
02641         }
02642         if (pos == 0) break;
02643         pos--;
02644         s = rb_enc_prev_char(sbeg, s, e, enc);
02645     }
02646     return -1;
02647 }
02648 
02649 
02650 /*
02651  *  call-seq:
02652  *     str.rindex(substring [, fixnum])   -> fixnum or nil
02653  *     str.rindex(regexp [, fixnum])   -> fixnum or nil
02654  *
02655  *  Returns the index of the last occurrence of the given <i>substring</i> or
02656  *  pattern (<i>regexp</i>) in <i>str</i>. Returns <code>nil</code> if not
02657  *  found. If the second parameter is present, it specifies the position in the
02658  *  string to end the search---characters beyond this point will not be
02659  *  considered.
02660  *
02661  *     "hello".rindex('e')             #=> 1
02662  *     "hello".rindex('l')             #=> 3
02663  *     "hello".rindex('a')             #=> nil
02664  *     "hello".rindex(?e)              #=> 1
02665  *     "hello".rindex(/[aeiou]/, -2)   #=> 1
02666  */
02667 
02668 static VALUE
02669 rb_str_rindex_m(int argc, VALUE *argv, VALUE str)
02670 {
02671     VALUE sub;
02672     VALUE vpos;
02673     rb_encoding *enc = STR_ENC_GET(str);
02674     long pos, len = str_strlen(str, enc);
02675 
02676     if (rb_scan_args(argc, argv, "11", &sub, &vpos) == 2) {
02677         pos = NUM2LONG(vpos);
02678         if (pos < 0) {
02679             pos += len;
02680             if (pos < 0) {
02681                 if (RB_TYPE_P(sub, T_REGEXP)) {
02682                     rb_backref_set(Qnil);
02683                 }
02684                 return Qnil;
02685             }
02686         }
02687         if (pos > len) pos = len;
02688     }
02689     else {
02690         pos = len;
02691     }
02692 
02693     if (SPECIAL_CONST_P(sub)) goto generic;
02694     switch (BUILTIN_TYPE(sub)) {
02695       case T_REGEXP:
02696         /* enc = rb_get_check(str, sub); */
02697         pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
02698                          STR_ENC_GET(str), single_byte_optimizable(str));
02699 
02700         if (!RREGEXP(sub)->ptr || RREGEXP_SRC_LEN(sub)) {
02701             pos = rb_reg_search(sub, str, pos, 1);
02702             pos = rb_str_sublen(str, pos);
02703         }
02704         if (pos >= 0) return LONG2NUM(pos);
02705         break;
02706 
02707       generic:
02708       default: {
02709         VALUE tmp;
02710 
02711         tmp = rb_check_string_type(sub);
02712         if (NIL_P(tmp)) {
02713             rb_raise(rb_eTypeError, "type mismatch: %s given",
02714                      rb_obj_classname(sub));
02715         }
02716         sub = tmp;
02717       }
02718         /* fall through */
02719       case T_STRING:
02720         pos = rb_str_rindex(str, sub, pos);
02721         if (pos >= 0) return LONG2NUM(pos);
02722         break;
02723     }
02724     return Qnil;
02725 }
02726 
02727 /*
02728  *  call-seq:
02729  *     str =~ obj   -> fixnum or nil
02730  *
02731  *  Match---If <i>obj</i> is a <code>Regexp</code>, uses it as a pattern to match
02732  *  against <i>str</i>, and returns the position where the match starts, or
02733  *  <code>nil</code> if there is no match. Otherwise, invokes
02734  *  <i>obj.=~</i>, passing <i>str</i> as an argument. The default
02735  *  <code>=~</code> in <code>Object</code> returns <code>nil</code>.
02736  *
02737  *  Note: <code>str =~ regexp</code> is not the same as
02738  *  <code>regexp =~ str</code>. Strings captured from named capture groups
02739  *  are assigned to local variables only in the second case.
02740  *
02741  *     "cat o' 9 tails" =~ /\d/   #=> 7
02742  *     "cat o' 9 tails" =~ 9      #=> nil
02743  */
02744 
02745 static VALUE
02746 rb_str_match(VALUE x, VALUE y)
02747 {
02748     if (SPECIAL_CONST_P(y)) goto generic;
02749     switch (BUILTIN_TYPE(y)) {
02750       case T_STRING:
02751         rb_raise(rb_eTypeError, "type mismatch: String given");
02752 
02753       case T_REGEXP:
02754         return rb_reg_match(y, x);
02755 
02756       generic:
02757       default:
02758         return rb_funcall(y, rb_intern("=~"), 1, x);
02759     }
02760 }
02761 
02762 
02763 static VALUE get_pat(VALUE, int);
02764 
02765 
02766 /*
02767  *  call-seq:
02768  *     str.match(pattern)        -> matchdata or nil
02769  *     str.match(pattern, pos)   -> matchdata or nil
02770  *
02771  *  Converts <i>pattern</i> to a <code>Regexp</code> (if it isn't already one),
02772  *  then invokes its <code>match</code> method on <i>str</i>.  If the second
02773  *  parameter is present, it specifies the position in the string to begin the
02774  *  search.
02775  *
02776  *     'hello'.match('(.)\1')      #=> #<MatchData "ll" 1:"l">
02777  *     'hello'.match('(.)\1')[0]   #=> "ll"
02778  *     'hello'.match(/(.)\1/)[0]   #=> "ll"
02779  *     'hello'.match('xx')         #=> nil
02780  *
02781  *  If a block is given, invokes the block with the MatchData if the match succeeds, so
02782  *  that you can write
02783  *
02784  *     str.match(pat) {|m| ...}
02785  *
02786  *  instead of
02787  *
02788  *     if m = str.match(pat)
02789  *       ...
02790  *     end
02791  *
02792  *  The return value is a value from block execution in this case.
02793  */
02794 
02795 static VALUE
02796 rb_str_match_m(int argc, VALUE *argv, VALUE str)
02797 {
02798     VALUE re, result;
02799     if (argc < 1)
02800         rb_check_arity(argc, 1, 2);
02801     re = argv[0];
02802     argv[0] = str;
02803     result = rb_funcall2(get_pat(re, 0), rb_intern("match"), argc, argv);
02804     if (!NIL_P(result) && rb_block_given_p()) {
02805         return rb_yield(result);
02806     }
02807     return result;
02808 }
02809 
/* Result of stepping a character to its successor or predecessor. */
enum neighbor_char {
    NEIGHBOR_NOT_CHAR = 0,  /* no usable neighbor for this character */
    NEIGHBOR_FOUND    = 1,  /* a neighbor of the same byte length was stored */
    NEIGHBOR_WRAPPED  = 2   /* stepping wrapped around */
};
02815 
02816 static enum neighbor_char
02817 enc_succ_char(char *p, long len, rb_encoding *enc)
02818 {
02819     long i;
02820     int l;
02821     while (1) {
02822         for (i = len-1; 0 <= i && (unsigned char)p[i] == 0xff; i--)
02823             p[i] = '\0';
02824         if (i < 0)
02825             return NEIGHBOR_WRAPPED;
02826         ++((unsigned char*)p)[i];
02827         l = rb_enc_precise_mbclen(p, p+len, enc);
02828         if (MBCLEN_CHARFOUND_P(l)) {
02829             l = MBCLEN_CHARFOUND_LEN(l);
02830             if (l == len) {
02831                 return NEIGHBOR_FOUND;
02832             }
02833             else {
02834                 memset(p+l, 0xff, len-l);
02835             }
02836         }
02837         if (MBCLEN_INVALID_P(l) && i < len-1) {
02838             long len2;
02839             int l2;
02840             for (len2 = len-1; 0 < len2; len2--) {
02841                 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
02842                 if (!MBCLEN_INVALID_P(l2))
02843                     break;
02844             }
02845             memset(p+len2+1, 0xff, len-(len2+1));
02846         }
02847     }
02848 }
02849 
02850 static enum neighbor_char
02851 enc_pred_char(char *p, long len, rb_encoding *enc)
02852 {
02853     long i;
02854     int l;
02855     while (1) {
02856         for (i = len-1; 0 <= i && (unsigned char)p[i] == 0; i--)
02857             p[i] = '\xff';
02858         if (i < 0)
02859             return NEIGHBOR_WRAPPED;
02860         --((unsigned char*)p)[i];
02861         l = rb_enc_precise_mbclen(p, p+len, enc);
02862         if (MBCLEN_CHARFOUND_P(l)) {
02863             l = MBCLEN_CHARFOUND_LEN(l);
02864             if (l == len) {
02865                 return NEIGHBOR_FOUND;
02866             }
02867             else {
02868                 memset(p+l, 0, len-l);
02869             }
02870         }
02871         if (MBCLEN_INVALID_P(l) && i < len-1) {
02872             long len2;
02873             int l2;
02874             for (len2 = len-1; 0 < len2; len2--) {
02875                 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
02876                 if (!MBCLEN_INVALID_P(l2))
02877                     break;
02878             }
02879             memset(p+len2+1, 0, len-(len2+1));
02880         }
02881     }
02882 }
02883 
02884 /*
02885   overwrite +p+ by succeeding letter in +enc+ and returns
02886   NEIGHBOR_FOUND or NEIGHBOR_WRAPPED.
02887   When NEIGHBOR_WRAPPED, carried-out letter is stored into carry.
02888   Assumes that each range is contiguous and that mbclen
02889   never changes within a range.
02890   NEIGHBOR_NOT_CHAR is returned if invalid character or the range has only one
02891   character.
02892  */
02893 static enum neighbor_char
02894 enc_succ_alnum_char(char *p, long len, rb_encoding *enc, char *carry)
02895 {
02896     enum neighbor_char ret;
02897     unsigned int c;
02898     int ctype;
02899     int range;
02900     char save[ONIGENC_CODE_TO_MBC_MAXLEN];
02901 
02902     c = rb_enc_mbc_to_codepoint(p, p+len, enc);
02903     if (rb_enc_isctype(c, ONIGENC_CTYPE_DIGIT, enc))
02904         ctype = ONIGENC_CTYPE_DIGIT;
02905     else if (rb_enc_isctype(c, ONIGENC_CTYPE_ALPHA, enc))
02906         ctype = ONIGENC_CTYPE_ALPHA;
02907     else
02908         return NEIGHBOR_NOT_CHAR;
02909 
02910     MEMCPY(save, p, char, len);
02911     ret = enc_succ_char(p, len, enc);
02912     if (ret == NEIGHBOR_FOUND) {
02913         c = rb_enc_mbc_to_codepoint(p, p+len, enc);
02914         if (rb_enc_isctype(c, ctype, enc))
02915             return NEIGHBOR_FOUND;
02916     }
02917     MEMCPY(p, save, char, len);
02918     range = 1;
02919     while (1) {
02920         MEMCPY(save, p, char, len);
02921         ret = enc_pred_char(p, len, enc);
02922         if (ret == NEIGHBOR_FOUND) {
02923             c = rb_enc_mbc_to_codepoint(p, p+len, enc);
02924             if (!rb_enc_isctype(c, ctype, enc)) {
02925                 MEMCPY(p, save, char, len);
02926                 break;
02927             }
02928         }
02929         else {
02930             MEMCPY(p, save, char, len);
02931             break;
02932         }
02933         range++;
02934     }
02935     if (range == 1) {
02936         return NEIGHBOR_NOT_CHAR;
02937     }
02938 
02939     if (ctype != ONIGENC_CTYPE_DIGIT) {
02940         MEMCPY(carry, p, char, len);
02941         return NEIGHBOR_WRAPPED;
02942     }
02943 
02944     MEMCPY(carry, p, char, len);
02945     enc_succ_char(carry, len, enc);
02946     return NEIGHBOR_WRAPPED;
02947 }
02948 
02949 
02950 /*
02951  *  call-seq:
02952  *     str.succ   -> new_str
02953  *     str.next   -> new_str
02954  *
02955  *  Returns the successor to <i>str</i>. The successor is calculated by
02956  *  incrementing characters starting from the rightmost alphanumeric (or
02957  *  the rightmost character if there are no alphanumerics) in the
02958  *  string. Incrementing a digit always results in another digit, and
02959  *  incrementing a letter results in another letter of the same case.
02960  *  Incrementing nonalphanumerics uses the underlying character set's
02961  *  collating sequence.
02962  *
02963  *  If the increment generates a ``carry,'' the character to the left of
02964  *  it is incremented. This process repeats until there is no carry,
02965  *  adding an additional character if necessary.
02966  *
02967  *     "abcd".succ        #=> "abce"
02968  *     "THX1138".succ     #=> "THX1139"
02969  *     "<<koala>>".succ   #=> "<<koalb>>"
02970  *     "1999zzz".succ     #=> "2000aaa"
02971  *     "ZZZ9999".succ     #=> "AAAA0000"
02972  *     "***".succ         #=> "**+"
02973  */
02974 
02975 VALUE
02976 rb_str_succ(VALUE orig)
02977 {
02978     rb_encoding *enc;
02979     VALUE str;
02980     char *sbeg, *s, *e, *last_alnum = 0;
02981     int c = -1;
02982     long l;
02983     char carry[ONIGENC_CODE_TO_MBC_MAXLEN] = "\1";
02984     long carry_pos = 0, carry_len = 1;
02985     enum neighbor_char neighbor = NEIGHBOR_FOUND;
02986 
02987     str = rb_str_new5(orig, RSTRING_PTR(orig), RSTRING_LEN(orig));
02988     rb_enc_cr_str_copy_for_substr(str, orig);
02989     OBJ_INFECT(str, orig);
02990     if (RSTRING_LEN(str) == 0) return str;
02991 
02992     enc = STR_ENC_GET(orig);
02993     sbeg = RSTRING_PTR(str);
02994     s = e = sbeg + RSTRING_LEN(str);
02995 
02996     while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
02997         if (neighbor == NEIGHBOR_NOT_CHAR && last_alnum) {
02998             if (ISALPHA(*last_alnum) ? ISDIGIT(*s) :
02999                 ISDIGIT(*last_alnum) ? ISALPHA(*s) : 0) {
03000                 s = last_alnum;
03001                 break;
03002             }
03003         }
03004         if ((l = rb_enc_precise_mbclen(s, e, enc)) <= 0) continue;
03005         neighbor = enc_succ_alnum_char(s, l, enc, carry);
03006         switch (neighbor) {
03007           case NEIGHBOR_NOT_CHAR:
03008             continue;
03009           case NEIGHBOR_FOUND:
03010             return str;
03011           case NEIGHBOR_WRAPPED:
03012             last_alnum = s;
03013             break;
03014         }
03015         c = 1;
03016         carry_pos = s - sbeg;
03017         carry_len = l;
03018     }
03019     if (c == -1) {              /* str contains no alnum */
03020         s = e;
03021         while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
03022             enum neighbor_char neighbor;
03023             if ((l = rb_enc_precise_mbclen(s, e, enc)) <= 0) continue;
03024             neighbor = enc_succ_char(s, l, enc);
03025             if (neighbor == NEIGHBOR_FOUND)
03026                 return str;
03027             if (rb_enc_precise_mbclen(s, s+l, enc) != l) {
03028                 /* wrapped to \0...\0.  search next valid char. */
03029                 enc_succ_char(s, l, enc);
03030             }
03031             if (!rb_enc_asciicompat(enc)) {
03032                 MEMCPY(carry, s, char, l);
03033                 carry_len = l;
03034             }
03035             carry_pos = s - sbeg;
03036         }
03037     }
03038     RESIZE_CAPA(str, RSTRING_LEN(str) + carry_len);
03039     s = RSTRING_PTR(str) + carry_pos;
03040     memmove(s + carry_len, s, RSTRING_LEN(str) - carry_pos);
03041     memmove(s, carry, carry_len);
03042     STR_SET_LEN(str, RSTRING_LEN(str) + carry_len);
03043     RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
03044     rb_enc_str_coderange(str);
03045     return str;
03046 }
03047 
03048 
03049 /*
03050  *  call-seq:
03051  *     str.succ!   -> str
03052  *     str.next!   -> str
03053  *
03054  *  Equivalent to <code>String#succ</code>, but modifies the receiver in
03055  *  place.
03056  */
03057 
03058 static VALUE
03059 rb_str_succ_bang(VALUE str)
03060 {
03061     rb_str_shared_replace(str, rb_str_succ(str));
03062 
03063     return str;
03064 }
03065 
03066 
03067 /*
03068  *  call-seq:
03069  *     str.upto(other_str, exclusive=false) {|s| block }   -> str
03070  *     str.upto(other_str, exclusive=false)                -> an_enumerator
03071  *
03072  *  Iterates through successive values, starting at <i>str</i> and
03073  *  ending at <i>other_str</i> inclusive, passing each value in turn to
03074  *  the block. The <code>String#succ</code> method is used to generate
03075  *  each value.  If optional second argument exclusive is omitted or is false,
03076  *  the last value will be included; otherwise it will be excluded.
03077  *
03078  *  If no block is given, an enumerator is returned instead.
03079  *
03080  *     "a8".upto("b6") {|s| print s, ' ' }
03081  *     for s in "a8".."b6"
03082  *       print s, ' '
03083  *     end
03084  *
03085  *  <em>produces:</em>
03086  *
03087  *     a8 a9 b0 b1 b2 b3 b4 b5 b6
03088  *     a8 a9 b0 b1 b2 b3 b4 b5 b6
03089  *
03090  *  If <i>str</i> and <i>other_str</i> contain only ASCII numeric characters,
03091  *  both are recognized as decimal numbers. In addition, the width of
03092  *  string (e.g. leading zeros) is handled appropriately.
03093  *
03094  *     "9".upto("11").to_a   #=> ["9", "10", "11"]
03095  *     "25".upto("5").to_a   #=> []
03096  *     "07".upto("11").to_a  #=> ["07", "08", "09", "10", "11"]
03097  */
03098 
03099 static VALUE
03100 rb_str_upto(int argc, VALUE *argv, VALUE beg)
03101 {
03102     VALUE end, exclusive;
03103     VALUE current, after_end;
03104     ID succ;
03105     int n, excl, ascii;
03106     rb_encoding *enc;
03107 
03108     rb_scan_args(argc, argv, "11", &end, &exclusive);
03109     RETURN_ENUMERATOR(beg, argc, argv);
03110     excl = RTEST(exclusive);
03111     CONST_ID(succ, "succ");
03112     StringValue(end);
03113     enc = rb_enc_check(beg, end);
03114     ascii = (is_ascii_string(beg) && is_ascii_string(end));
03115     /* single character */
03116     if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1 && ascii) {
03117         char c = RSTRING_PTR(beg)[0];
03118         char e = RSTRING_PTR(end)[0];
03119 
03120         if (c > e || (excl && c == e)) return beg;
03121         for (;;) {
03122             rb_yield(rb_enc_str_new(&c, 1, enc));
03123             if (!excl && c == e) break;
03124             c++;
03125             if (excl && c == e) break;
03126         }
03127         return beg;
03128     }
03129     /* both edges are all digits */
03130     if (ascii && ISDIGIT(RSTRING_PTR(beg)[0]) && ISDIGIT(RSTRING_PTR(end)[0])) {
03131         char *s, *send;
03132         VALUE b, e;
03133         int width;
03134 
03135         s = RSTRING_PTR(beg); send = RSTRING_END(beg);
03136         width = rb_long2int(send - s);
03137         while (s < send) {
03138             if (!ISDIGIT(*s)) goto no_digits;
03139             s++;
03140         }
03141         s = RSTRING_PTR(end); send = RSTRING_END(end);
03142         while (s < send) {
03143             if (!ISDIGIT(*s)) goto no_digits;
03144             s++;
03145         }
03146         b = rb_str_to_inum(beg, 10, FALSE);
03147         e = rb_str_to_inum(end, 10, FALSE);
03148         if (FIXNUM_P(b) && FIXNUM_P(e)) {
03149             long bi = FIX2LONG(b);
03150             long ei = FIX2LONG(e);
03151             rb_encoding *usascii = rb_usascii_encoding();
03152 
03153             while (bi <= ei) {
03154                 if (excl && bi == ei) break;
03155                 rb_yield(rb_enc_sprintf(usascii, "%.*ld", width, bi));
03156                 bi++;
03157             }
03158         }
03159         else {
03160             ID op = excl ? '<' : rb_intern("<=");
03161             VALUE args[2], fmt = rb_obj_freeze(rb_usascii_str_new_cstr("%.*d"));
03162 
03163             args[0] = INT2FIX(width);
03164             while (rb_funcall(b, op, 1, e)) {
03165                 args[1] = b;
03166                 rb_yield(rb_str_format(numberof(args), args, fmt));
03167                 b = rb_funcall(b, succ, 0, 0);
03168             }
03169         }
03170         return beg;
03171     }
03172     /* normal case */
03173   no_digits:
03174     n = rb_str_cmp(beg, end);
03175     if (n > 0 || (excl && n == 0)) return beg;
03176 
03177     after_end = rb_funcall(end, succ, 0, 0);
03178     current = rb_str_dup(beg);
03179     while (!rb_str_equal(current, after_end)) {
03180         VALUE next = Qnil;
03181         if (excl || !rb_str_equal(current, end))
03182             next = rb_funcall(current, succ, 0, 0);
03183         rb_yield(current);
03184         if (NIL_P(next)) break;
03185         current = next;
03186         StringValue(current);
03187         if (excl && rb_str_equal(current, end)) break;
03188         if (RSTRING_LEN(current) > RSTRING_LEN(end) || RSTRING_LEN(current) == 0)
03189             break;
03190     }
03191 
03192     return beg;
03193 }
03194 
03195 static VALUE
03196 rb_str_subpat(VALUE str, VALUE re, VALUE backref)
03197 {
03198     if (rb_reg_search(re, str, 0, 0) >= 0) {
03199         VALUE match = rb_backref_get();
03200         int nth = rb_reg_backref_number(match, backref);
03201         return rb_reg_nth_match(nth, match);
03202     }
03203     return Qnil;
03204 }
03205 
03206 static VALUE
03207 rb_str_aref(VALUE str, VALUE indx)
03208 {
03209     long idx;
03210 
03211     if (FIXNUM_P(indx)) {
03212         idx = FIX2LONG(indx);
03213 
03214       num_index:
03215         str = rb_str_substr(str, idx, 1);
03216         if (!NIL_P(str) && RSTRING_LEN(str) == 0) return Qnil;
03217         return str;
03218     }
03219 
03220     if (SPECIAL_CONST_P(indx)) goto generic;
03221     switch (BUILTIN_TYPE(indx)) {
03222       case T_REGEXP:
03223         return rb_str_subpat(str, indx, INT2FIX(0));
03224 
03225       case T_STRING:
03226         if (rb_str_index(str, indx, 0) != -1)
03227             return rb_str_dup(indx);
03228         return Qnil;
03229 
03230       generic:
03231       default:
03232         /* check if indx is Range */
03233         {
03234             long beg, len;
03235             VALUE tmp;
03236 
03237             len = str_strlen(str, STR_ENC_GET(str));
03238             switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
03239               case Qfalse:
03240                 break;
03241               case Qnil:
03242                 return Qnil;
03243               default:
03244                 tmp = rb_str_substr(str, beg, len);
03245                 return tmp;
03246             }
03247         }
03248         idx = NUM2LONG(indx);
03249         goto num_index;
03250     }
03251 
03252     UNREACHABLE;
03253 }
03254 
03255 
03256 /*
03257  *  call-seq:
03258  *     str[index]                 -> new_str or nil
03259  *     str[start, length]         -> new_str or nil
03260  *     str[range]                 -> new_str or nil
03261  *     str[regexp]                -> new_str or nil
03262  *     str[regexp, capture]       -> new_str or nil
03263  *     str[match_str]             -> new_str or nil
03264  *     str.slice(index)           -> new_str or nil
03265  *     str.slice(start, length)   -> new_str or nil
03266  *     str.slice(range)           -> new_str or nil
03267  *     str.slice(regexp)          -> new_str or nil
03268  *     str.slice(regexp, capture) -> new_str or nil
03269  *     str.slice(match_str)       -> new_str or nil
03270  *
03271  *  Element Reference --- If passed a single +index+, returns a substring of
03272  *  one character at that index. If passed a +start+ index and a +length+,
03273  *  returns a substring containing +length+ characters starting at the
03274  *  +index+. If passed a +range+, its beginning and end are interpreted as
03275  *  offsets delimiting the substring to be returned.
03276  *
03277  *  In these three cases, if an index is negative, it is counted from the end
03278  *  of the string.  For the +start+ and +range+ cases the starting index
03279  *  is just before a character and an index matching the string's size.
03280  *  Additionally, an empty string is returned when the starting index for a
03281  *  character range is at the end of the string.
03282  *
03283  *  Returns +nil+ if the initial index falls outside the string or the length
03284  *  is negative.
03285  *
03286  *  If a +Regexp+ is supplied, the matching portion of the string is
03287  *  returned.  If a +capture+, which may be a capture group index or name,
03288  *  follows the regular expression, that component of the MatchData is
03289  *  returned instead.
03290  *
03291  *  If a +match_str+ is given, that string is returned if it occurs in
03292  *  the string.
03293  *
03294  *  Returns +nil+ if the regular expression does not match or the match string
03295  *  cannot be found.
03296  *
03297  *     a = "hello there"
03298  *
03299  *     a[1]                   #=> "e"
03300  *     a[2, 3]                #=> "llo"
03301  *     a[2..3]                #=> "ll"
03302  *
03303  *     a[-3, 2]               #=> "er"
03304  *     a[7..-2]               #=> "her"
03305  *     a[-4..-2]              #=> "her"
03306  *     a[-2..-4]              #=> ""
03307  *
03308  *     a[11, 0]               #=> ""
03309  *     a[11]                  #=> nil
03310  *     a[12, 0]               #=> nil
03311  *     a[12..-1]              #=> nil
03312  *
03313  *     a[/[aeiou](.)\1/]      #=> "ell"
03314  *     a[/[aeiou](.)\1/, 0]   #=> "ell"
03315  *     a[/[aeiou](.)\1/, 1]   #=> "l"
03316  *     a[/[aeiou](.)\1/, 2]   #=> nil
03317  *
03318  *     a[/(?<vowel>[aeiou])(?<non_vowel>[^aeiou])/, "non_vowel"] #=> "l"
03319  *     a[/(?<vowel>[aeiou])(?<non_vowel>[^aeiou])/, "vowel"]     #=> "e"
03320  *
03321  *     a["lo"]                #=> "lo"
03322  *     a["bye"]               #=> nil
03323  */
03324 
03325 static VALUE
03326 rb_str_aref_m(int argc, VALUE *argv, VALUE str)
03327 {
03328     if (argc == 2) {
03329         if (RB_TYPE_P(argv[0], T_REGEXP)) {
03330             return rb_str_subpat(str, argv[0], argv[1]);
03331         }
03332         return rb_str_substr(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]));
03333     }
03334     rb_check_arity(argc, 1, 2);
03335     return rb_str_aref(str, argv[0]);
03336 }
03337 
03338 VALUE
03339 rb_str_drop_bytes(VALUE str, long len)
03340 {
03341     char *ptr = RSTRING_PTR(str);
03342     long olen = RSTRING_LEN(str), nlen;
03343 
03344     str_modifiable(str);
03345     if (len > olen) len = olen;
03346     nlen = olen - len;
03347     if (nlen <= RSTRING_EMBED_LEN_MAX) {
03348         char *oldptr = ptr;
03349         int fl = (int)(RBASIC(str)->flags & (STR_NOEMBED|ELTS_SHARED));
03350         STR_SET_EMBED(str);
03351         STR_SET_EMBED_LEN(str, nlen);
03352         ptr = RSTRING(str)->as.ary;
03353         memmove(ptr, oldptr + len, nlen);
03354         if (fl == STR_NOEMBED) xfree(oldptr);
03355     }
03356     else {
03357         if (!STR_SHARED_P(str)) rb_str_new4(str);
03358         ptr = RSTRING(str)->as.heap.ptr += len;
03359         RSTRING(str)->as.heap.len = nlen;
03360     }
03361     ptr[nlen] = 0;
03362     ENC_CODERANGE_CLEAR(str);
03363     return str;
03364 }
03365 
03366 static void
03367 rb_str_splice_0(VALUE str, long beg, long len, VALUE val)
03368 {
03369     if (beg == 0 && RSTRING_LEN(val) == 0) {
03370         rb_str_drop_bytes(str, len);
03371         OBJ_INFECT(str, val);
03372         return;
03373     }
03374 
03375     rb_str_modify(str);
03376     if (len < RSTRING_LEN(val)) {
03377         /* expand string */
03378         RESIZE_CAPA(str, RSTRING_LEN(str) + RSTRING_LEN(val) - len + 1);
03379     }
03380 
03381     if (RSTRING_LEN(val) != len) {
03382         memmove(RSTRING_PTR(str) + beg + RSTRING_LEN(val),
03383                 RSTRING_PTR(str) + beg + len,
03384                 RSTRING_LEN(str) - (beg + len));
03385     }
03386     if (RSTRING_LEN(val) < beg && len < 0) {
03387         MEMZERO(RSTRING_PTR(str) + RSTRING_LEN(str), char, -len);
03388     }
03389     if (RSTRING_LEN(val) > 0) {
03390         memmove(RSTRING_PTR(str)+beg, RSTRING_PTR(val), RSTRING_LEN(val));
03391     }
03392     STR_SET_LEN(str, RSTRING_LEN(str) + RSTRING_LEN(val) - len);
03393     if (RSTRING_PTR(str)) {
03394         RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
03395     }
03396     OBJ_INFECT(str, val);
03397 }
03398 
03399 static void
03400 rb_str_splice(VALUE str, long beg, long len, VALUE val)
03401 {
03402     long slen;
03403     char *p, *e;
03404     rb_encoding *enc;
03405     int singlebyte = single_byte_optimizable(str);
03406     int cr;
03407 
03408     if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
03409 
03410     StringValue(val);
03411     enc = rb_enc_check(str, val);
03412     slen = str_strlen(str, enc);
03413 
03414     if (slen < beg) {
03415       out_of_range:
03416         rb_raise(rb_eIndexError, "index %ld out of string", beg);
03417     }
03418     if (beg < 0) {
03419         if (-beg > slen) {
03420             goto out_of_range;
03421         }
03422         beg += slen;
03423     }
03424     if (slen < len || slen < beg + len) {
03425         len = slen - beg;
03426     }
03427     str_modify_keep_cr(str);
03428     p = str_nth(RSTRING_PTR(str), RSTRING_END(str), beg, enc, singlebyte);
03429     if (!p) p = RSTRING_END(str);
03430     e = str_nth(p, RSTRING_END(str), len, enc, singlebyte);
03431     if (!e) e = RSTRING_END(str);
03432     /* error check */
03433     beg = p - RSTRING_PTR(str); /* physical position */
03434     len = e - p;                /* physical length */
03435     rb_str_splice_0(str, beg, len, val);
03436     rb_enc_associate(str, enc);
03437     cr = ENC_CODERANGE_AND(ENC_CODERANGE(str), ENC_CODERANGE(val));
03438     if (cr != ENC_CODERANGE_BROKEN)
03439         ENC_CODERANGE_SET(str, cr);
03440 }
03441 
03442 void
03443 rb_str_update(VALUE str, long beg, long len, VALUE val)
03444 {
03445     rb_str_splice(str, beg, len, val);
03446 }
03447 
03448 static void
03449 rb_str_subpat_set(VALUE str, VALUE re, VALUE backref, VALUE val)
03450 {
03451     int nth;
03452     VALUE match;
03453     long start, end, len;
03454     rb_encoding *enc;
03455     struct re_registers *regs;
03456 
03457     if (rb_reg_search(re, str, 0, 0) < 0) {
03458         rb_raise(rb_eIndexError, "regexp not matched");
03459     }
03460     match = rb_backref_get();
03461     nth = rb_reg_backref_number(match, backref);
03462     regs = RMATCH_REGS(match);
03463     if (nth >= regs->num_regs) {
03464       out_of_range:
03465         rb_raise(rb_eIndexError, "index %d out of regexp", nth);
03466     }
03467     if (nth < 0) {
03468         if (-nth >= regs->num_regs) {
03469             goto out_of_range;
03470         }
03471         nth += regs->num_regs;
03472     }
03473 
03474     start = BEG(nth);
03475     if (start == -1) {
03476         rb_raise(rb_eIndexError, "regexp group %d not matched", nth);
03477     }
03478     end = END(nth);
03479     len = end - start;
03480     StringValue(val);
03481     enc = rb_enc_check(str, val);
03482     rb_str_splice_0(str, start, len, val);
03483     rb_enc_associate(str, enc);
03484 }
03485 
03486 static VALUE
03487 rb_str_aset(VALUE str, VALUE indx, VALUE val)
03488 {
03489     long idx, beg;
03490 
03491     if (FIXNUM_P(indx)) {
03492         idx = FIX2LONG(indx);
03493       num_index:
03494         rb_str_splice(str, idx, 1, val);
03495         return val;
03496     }
03497 
03498     if (SPECIAL_CONST_P(indx)) goto generic;
03499     switch (TYPE(indx)) {
03500       case T_REGEXP:
03501         rb_str_subpat_set(str, indx, INT2FIX(0), val);
03502         return val;
03503 
03504       case T_STRING:
03505         beg = rb_str_index(str, indx, 0);
03506         if (beg < 0) {
03507             rb_raise(rb_eIndexError, "string not matched");
03508         }
03509         beg = rb_str_sublen(str, beg);
03510         rb_str_splice(str, beg, str_strlen(indx, 0), val);
03511         return val;
03512 
03513       generic:
03514       default:
03515         /* check if indx is Range */
03516         {
03517             long beg, len;
03518             if (rb_range_beg_len(indx, &beg, &len, str_strlen(str, 0), 2)) {
03519                 rb_str_splice(str, beg, len, val);
03520                 return val;
03521             }
03522         }
03523         idx = NUM2LONG(indx);
03524         goto num_index;
03525     }
03526 }
03527 
03528 /*
03529  *  call-seq:
03530  *     str[fixnum] = new_str
03531  *     str[fixnum, fixnum] = new_str
03532  *     str[range] = aString
03533  *     str[regexp] = new_str
03534  *     str[regexp, fixnum] = new_str
03535  *     str[regexp, name] = new_str
03536  *     str[other_str] = new_str
03537  *
03538  *  Element Assignment---Replaces some or all of the content of <i>str</i>. The
03539  *  portion of the string affected is determined using the same criteria as
03540  *  <code>String#[]</code>. If the replacement string is not the same length as
03541  *  the text it is replacing, the string will be adjusted accordingly. If the
03542  *  regular expression or string used as the index doesn't match a position
03543  *  in the string, <code>IndexError</code> is raised. If the regular expression
03544  *  form is used, the optional second <code>Fixnum</code> allows you to specify
03545  *  which portion of the match to replace (effectively using the
03546  *  <code>MatchData</code> indexing rules). The forms that take a
03547  *  <code>Fixnum</code> will raise an <code>IndexError</code> if the value is
03548  *  out of range; the <code>Range</code> form will raise a
03549  *  <code>RangeError</code>, and the <code>Regexp</code> and <code>String</code>
03550  *  will raise an <code>IndexError</code> on negative match.
03551  */
03552 
03553 static VALUE
03554 rb_str_aset_m(int argc, VALUE *argv, VALUE str)
03555 {
03556     if (argc == 3) {
03557         if (RB_TYPE_P(argv[0], T_REGEXP)) {
03558             rb_str_subpat_set(str, argv[0], argv[1], argv[2]);
03559         }
03560         else {
03561             rb_str_splice(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]), argv[2]);
03562         }
03563         return argv[2];
03564     }
03565     rb_check_arity(argc, 2, 3);
03566     return rb_str_aset(str, argv[0], argv[1]);
03567 }
03568 
03569 /*
03570  *  call-seq:
03571  *     str.insert(index, other_str)   -> str
03572  *
03573  *  Inserts <i>other_str</i> before the character at the given
03574  *  <i>index</i>, modifying <i>str</i>. Negative indices count from the
03575  *  end of the string, and insert <em>after</em> the given character.
03576  *  The intent is to insert <i>other_str</i> so that it starts at the given
03577  *  <i>index</i>.
03578  *
03579  *     "abcd".insert(0, 'X')    #=> "Xabcd"
03580  *     "abcd".insert(3, 'X')    #=> "abcXd"
03581  *     "abcd".insert(4, 'X')    #=> "abcdX"
03582  *     "abcd".insert(-3, 'X')   #=> "abXcd"
03583  *     "abcd".insert(-1, 'X')   #=> "abcdX"
03584  */
03585 
03586 static VALUE
03587 rb_str_insert(VALUE str, VALUE idx, VALUE str2)
03588 {
03589     long pos = NUM2LONG(idx);
03590 
03591     if (pos == -1) {
03592         return rb_str_append(str, str2);
03593     }
03594     else if (pos < 0) {
03595         pos++;
03596     }
03597     rb_str_splice(str, pos, 0, str2);
03598     return str;
03599 }
03600 
03601 
03602 /*
03603  *  call-seq:
03604  *     str.slice!(fixnum)           -> new_str or nil
03605  *     str.slice!(fixnum, fixnum)   -> new_str or nil
03606  *     str.slice!(range)            -> new_str or nil
03607  *     str.slice!(regexp)           -> new_str or nil
03608  *     str.slice!(other_str)        -> new_str or nil
03609  *
03610  *  Deletes the specified portion from <i>str</i>, and returns the portion
03611  *  deleted.
03612  *
03613  *     string = "this is a string"
03614  *     string.slice!(2)        #=> "i"
03615  *     string.slice!(3..6)     #=> " is "
03616  *     string.slice!(/s.*t/)   #=> "sa st"
03617  *     string.slice!("r")      #=> "r"
03618  *     string                  #=> "thing"
03619  */
03620 
03621 static VALUE
03622 rb_str_slice_bang(int argc, VALUE *argv, VALUE str)
03623 {
03624     VALUE result;
03625     VALUE buf[3];
03626     int i;
03627 
03628     rb_check_arity(argc, 1, 2);
03629     for (i=0; i<argc; i++) {
03630         buf[i] = argv[i];
03631     }
03632     str_modify_keep_cr(str);
03633     result = rb_str_aref_m(argc, buf, str);
03634     if (!NIL_P(result)) {
03635         buf[i] = rb_str_new(0,0);
03636         rb_str_aset_m(argc+1, buf, str);
03637     }
03638     return result;
03639 }
03640 
03641 static VALUE
03642 get_pat(VALUE pat, int quote)
03643 {
03644     VALUE val;
03645 
03646     switch (TYPE(pat)) {
03647       case T_REGEXP:
03648         return pat;
03649 
03650       case T_STRING:
03651         break;
03652 
03653       default:
03654         val = rb_check_string_type(pat);
03655         if (NIL_P(val)) {
03656             Check_Type(pat, T_REGEXP);
03657         }
03658         pat = val;
03659     }
03660 
03661     if (quote) {
03662         pat = rb_reg_quote(pat);
03663     }
03664 
03665     return rb_reg_regcomp(pat);
03666 }
03667 
03668 
03669 /*
03670  *  call-seq:
03671  *     str.sub!(pattern, replacement)          -> str or nil
03672  *     str.sub!(pattern) {|match| block }      -> str or nil
03673  *
03674  *  Performs the same substitution as String#sub in-place.
03675  *
03676  *  Returns +str+ if a substitution was performed or +nil+ if no substitution
03677  *  was performed.
03678  */
03679 
03680 static VALUE
03681 rb_str_sub_bang(int argc, VALUE *argv, VALUE str)
03682 {
03683     VALUE pat, repl, hash = Qnil;
03684     int iter = 0;
03685     int tainted = 0;
03686     int untrusted = 0;
03687     long plen;
03688     int min_arity = rb_block_given_p() ? 1 : 2;
03689 
03690     rb_check_arity(argc, min_arity, 2);
03691     if (argc == 1) {
03692         iter = 1;
03693     }
03694     else {
03695         repl = argv[1];
03696         hash = rb_check_hash_type(argv[1]);
03697         if (NIL_P(hash)) {
03698             StringValue(repl);
03699         }
03700         if (OBJ_TAINTED(repl)) tainted = 1;
03701         if (OBJ_UNTRUSTED(repl)) untrusted = 1;
03702     }
03703 
03704     pat = get_pat(argv[0], 1);
03705     str_modifiable(str);
03706     if (rb_reg_search(pat, str, 0, 0) >= 0) {
03707         rb_encoding *enc;
03708         int cr = ENC_CODERANGE(str);
03709         VALUE match = rb_backref_get();
03710         struct re_registers *regs = RMATCH_REGS(match);
03711         long beg0 = BEG(0);
03712         long end0 = END(0);
03713         char *p, *rp;
03714         long len, rlen;
03715 
03716         if (iter || !NIL_P(hash)) {
03717             p = RSTRING_PTR(str); len = RSTRING_LEN(str);
03718 
03719             if (iter) {
03720                 repl = rb_obj_as_string(rb_yield(rb_reg_nth_match(0, match)));
03721             }
03722             else {
03723                 repl = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
03724                 repl = rb_obj_as_string(repl);
03725             }
03726             str_mod_check(str, p, len);
03727             rb_check_frozen(str);
03728         }
03729         else {
03730             repl = rb_reg_regsub(repl, str, regs, pat);
03731         }
03732         enc = rb_enc_compatible(str, repl);
03733         if (!enc) {
03734             rb_encoding *str_enc = STR_ENC_GET(str);
03735             p = RSTRING_PTR(str); len = RSTRING_LEN(str);
03736             if (coderange_scan(p, beg0, str_enc) != ENC_CODERANGE_7BIT ||
03737                 coderange_scan(p+end0, len-end0, str_enc) != ENC_CODERANGE_7BIT) {
03738                 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
03739                          rb_enc_name(str_enc),
03740                          rb_enc_name(STR_ENC_GET(repl)));
03741             }
03742             enc = STR_ENC_GET(repl);
03743         }
03744         rb_str_modify(str);
03745         rb_enc_associate(str, enc);
03746         if (OBJ_TAINTED(repl)) tainted = 1;
03747         if (OBJ_UNTRUSTED(repl)) untrusted = 1;
03748         if (ENC_CODERANGE_UNKNOWN < cr && cr < ENC_CODERANGE_BROKEN) {
03749             int cr2 = ENC_CODERANGE(repl);
03750             if (cr2 == ENC_CODERANGE_BROKEN ||
03751                 (cr == ENC_CODERANGE_VALID && cr2 == ENC_CODERANGE_7BIT))
03752                 cr = ENC_CODERANGE_UNKNOWN;
03753             else
03754                 cr = cr2;
03755         }
03756         plen = end0 - beg0;
03757         rp = RSTRING_PTR(repl); rlen = RSTRING_LEN(repl);
03758         len = RSTRING_LEN(str);
03759         if (rlen > plen) {
03760             RESIZE_CAPA(str, len + rlen - plen);
03761         }
03762         p = RSTRING_PTR(str);
03763         if (rlen != plen) {
03764             memmove(p + beg0 + rlen, p + beg0 + plen, len - beg0 - plen);
03765         }
03766         memcpy(p + beg0, rp, rlen);
03767         len += rlen - plen;
03768         STR_SET_LEN(str, len);
03769         RSTRING_PTR(str)[len] = '\0';
03770         ENC_CODERANGE_SET(str, cr);
03771         if (tainted) OBJ_TAINT(str);
03772         if (untrusted) OBJ_UNTRUST(str);
03773 
03774         return str;
03775     }
03776     return Qnil;
03777 }
03778 
03779 
03780 /*
03781  *  call-seq:
03782  *     str.sub(pattern, replacement)         -> new_str
03783  *     str.sub(pattern, hash)                -> new_str
03784  *     str.sub(pattern) {|match| block }     -> new_str
03785  *
03786  *  Returns a copy of +str+ with the _first_ occurrence of +pattern+
03787  *  replaced by the second argument. The +pattern+ is typically a Regexp; if
03788  *  given as a String, any regular expression metacharacters it contains will
03789  *  be interpreted literally, e.g. <code>'\\\d'</code> will match a backslash
03790  *  followed by 'd', instead of a digit.
03791  *
03792  *  If +replacement+ is a String it will be substituted for the matched text.
03793  *  It may contain back-references to the pattern's capture groups of the form
03794  *  <code>"\\d"</code>, where <i>d</i> is a group number, or
03795  *  <code>"\\k<n>"</code>, where <i>n</i> is a group name. If it is a
03796  *  double-quoted string, both back-references must be preceded by an
03797  *  additional backslash. However, within +replacement+ the special match
03798  *  variables, such as <code>$&</code>, will not refer to the current match.
03799  *
03800  *  If the second argument is a Hash, and the matched text is one of its keys,
03801  *  the corresponding value is the replacement string.
03802  *
03803  *  In the block form, the current match string is passed in as a parameter,
03804  *  and variables such as <code>$1</code>, <code>$2</code>, <code>$`</code>,
03805  *  <code>$&</code>, and <code>$'</code> will be set appropriately. The value
03806  *  returned by the block will be substituted for the match on each call.
03807  *
03808  *  The result inherits any tainting in the original string or any supplied
03809  *  replacement string.
03810  *
03811  *     "hello".sub(/[aeiou]/, '*')                  #=> "h*llo"
03812  *     "hello".sub(/([aeiou])/, '<\1>')             #=> "h<e>llo"
03813  *     "hello".sub(/./) {|s| s.ord.to_s + ' ' }     #=> "104 ello"
03814  *     "hello".sub(/(?<foo>[aeiou])/, '*\k<foo>*')  #=> "h*e*llo"
03815  *     'Is SHELL your preferred shell?'.sub(/[[:upper:]]{2,}/, ENV)
03816  *      #=> "Is /bin/bash your preferred shell?"
03817  */
03818 
03819 static VALUE
03820 rb_str_sub(int argc, VALUE *argv, VALUE str)
03821 {
03822     str = rb_str_dup(str);
03823     rb_str_sub_bang(argc, argv, str);
03824     return str;
03825 }
03826 
03827 static VALUE
03828 str_gsub(int argc, VALUE *argv, VALUE str, int bang)
03829 {
03830     VALUE pat, val, repl, match, dest, hash = Qnil;
03831     struct re_registers *regs;
03832     long beg, n;
03833     long beg0, end0;
03834     long offset, blen, slen, len, last;
03835     int iter = 0;
03836     char *sp, *cp;
03837     int tainted = 0;
03838     rb_encoding *str_enc;
03839 
03840     switch (argc) {
03841       case 1:
03842         RETURN_ENUMERATOR(str, argc, argv);
03843         iter = 1;
03844         break;
03845       case 2:
03846         repl = argv[1];
03847         hash = rb_check_hash_type(argv[1]);
03848         if (NIL_P(hash)) {
03849             StringValue(repl);
03850         }
03851         if (OBJ_TAINTED(repl)) tainted = 1;
03852         break;
03853       default:
03854         rb_check_arity(argc, 1, 2);
03855     }
03856 
03857     pat = get_pat(argv[0], 1);
03858     beg = rb_reg_search(pat, str, 0, 0);
03859     if (beg < 0) {
03860         if (bang) return Qnil;  /* no match, no substitution */
03861         return rb_str_dup(str);
03862     }
03863 
03864     offset = 0;
03865     n = 0;
03866     blen = RSTRING_LEN(str) + 30; /* len + margin */
03867     dest = rb_str_buf_new(blen);
03868     sp = RSTRING_PTR(str);
03869     slen = RSTRING_LEN(str);
03870     cp = sp;
03871     str_enc = STR_ENC_GET(str);
03872     rb_enc_associate(dest, str_enc);
03873     ENC_CODERANGE_SET(dest, rb_enc_asciicompat(str_enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);
03874 
03875     do {
03876         n++;
03877         match = rb_backref_get();
03878         regs = RMATCH_REGS(match);
03879         beg0 = BEG(0);
03880         end0 = END(0);
03881         if (iter || !NIL_P(hash)) {
03882             if (iter) {
03883                 val = rb_obj_as_string(rb_yield(rb_reg_nth_match(0, match)));
03884             }
03885             else {
03886                 val = rb_hash_aref(hash, rb_str_subseq(str, BEG(0), END(0) - BEG(0)));
03887                 val = rb_obj_as_string(val);
03888             }
03889             str_mod_check(str, sp, slen);
03890             if (val == dest) {  /* paranoid check [ruby-dev:24827] */
03891                 rb_raise(rb_eRuntimeError, "block should not cheat");
03892             }
03893         }
03894         else {
03895             val = rb_reg_regsub(repl, str, regs, pat);
03896         }
03897 
03898         if (OBJ_TAINTED(val)) tainted = 1;
03899 
03900         len = beg - offset;     /* copy pre-match substr */
03901         if (len) {
03902             rb_enc_str_buf_cat(dest, cp, len, str_enc);
03903         }
03904 
03905         rb_str_buf_append(dest, val);
03906 
03907         last = offset;
03908         offset = end0;
03909         if (beg0 == end0) {
03910             /*
03911              * Always consume at least one character of the input string
03912              * in order to prevent infinite loops.
03913              */
03914             if (RSTRING_LEN(str) <= end0) break;
03915             len = rb_enc_fast_mbclen(RSTRING_PTR(str)+end0, RSTRING_END(str), str_enc);
03916             rb_enc_str_buf_cat(dest, RSTRING_PTR(str)+end0, len, str_enc);
03917             offset = end0 + len;
03918         }
03919         cp = RSTRING_PTR(str) + offset;
03920         if (offset > RSTRING_LEN(str)) break;
03921         beg = rb_reg_search(pat, str, offset, 0);
03922     } while (beg >= 0);
03923     if (RSTRING_LEN(str) > offset) {
03924         rb_enc_str_buf_cat(dest, cp, RSTRING_LEN(str) - offset, str_enc);
03925     }
03926     rb_reg_search(pat, str, last, 0);
03927     if (bang) {
03928         rb_str_shared_replace(str, dest);
03929     }
03930     else {
03931         RBASIC(dest)->klass = rb_obj_class(str);
03932         OBJ_INFECT(dest, str);
03933         str = dest;
03934     }
03935 
03936     if (tainted) OBJ_TAINT(str);
03937     return str;
03938 }
03939 
03940 
03941 /*
03942  *  call-seq:
03943  *     str.gsub!(pattern, replacement)        -> str or nil
03944  *     str.gsub!(pattern) {|match| block }    -> str or nil
03945  *     str.gsub!(pattern)                     -> an_enumerator
03946  *
03947  *  Performs the substitutions of <code>String#gsub</code> in place, returning
03948  *  <i>str</i>, or <code>nil</code> if no substitutions were performed.
03949  *  If no block and no <i>replacement</i> is given, an enumerator is returned instead.
03950  */
03951 
03952 static VALUE
03953 rb_str_gsub_bang(int argc, VALUE *argv, VALUE str)
03954 {
03955     str_modify_keep_cr(str);
03956     return str_gsub(argc, argv, str, 1);
03957 }
03958 
03959 
03960 /*
03961  *  call-seq:
03962  *     str.gsub(pattern, replacement)       -> new_str
03963  *     str.gsub(pattern, hash)              -> new_str
03964  *     str.gsub(pattern) {|match| block }   -> new_str
03965  *     str.gsub(pattern)                    -> enumerator
03966  *
03967  *  Returns a copy of <i>str</i> with <em>all</em> occurrences of
03968  *  <i>pattern</i> substituted for the second argument. The <i>pattern</i> is
03969  *  typically a <code>Regexp</code>; if given as a <code>String</code>, any
03970  *  regular expression metacharacters it contains will be interpreted
03971  *  literally, e.g. <code>'\\\d'</code> will match a backslash followed by 'd',
03972  *  instead of a digit.
03973  *
03974  *  If <i>replacement</i> is a <code>String</code> it will be substituted for
03975  *  the matched text. It may contain back-references to the pattern's capture
03976  *  groups of the form <code>\\\d</code>, where <i>d</i> is a group number, or
03977  *  <code>\\\k<n></code>, where <i>n</i> is a group name. If it is a
03978  *  double-quoted string, both back-references must be preceded by an
03979  *  additional backslash. However, within <i>replacement</i> the special match
03980  *  variables, such as <code>$&</code>, will not refer to the current match.
03981  *
03982  *  If the second argument is a <code>Hash</code>, and the matched text is one
03983  *  of its keys, the corresponding value is the replacement string.
03984  *
03985  *  In the block form, the current match string is passed in as a parameter,
03986  *  and variables such as <code>$1</code>, <code>$2</code>, <code>$`</code>,
03987  *  <code>$&</code>, and <code>$'</code> will be set appropriately. The value
03988  *  returned by the block will be substituted for the match on each call.
03989  *
03990  *  The result inherits any tainting in the original string or any supplied
03991  *  replacement string.
03992  *
03993  *  When neither a block nor a second argument is supplied, an
03994  *  <code>Enumerator</code> is returned.
03995  *
03996  *     "hello".gsub(/[aeiou]/, '*')                  #=> "h*ll*"
03997  *     "hello".gsub(/([aeiou])/, '<\1>')             #=> "h<e>ll<o>"
03998  *     "hello".gsub(/./) {|s| s.ord.to_s + ' '}      #=> "104 101 108 108 111 "
03999  *     "hello".gsub(/(?<foo>[aeiou])/, '{\k<foo>}')  #=> "h{e}ll{o}"
04000  *     'hello'.gsub(/[eo]/, 'e' => 3, 'o' => '*')    #=> "h3ll*"
04001  */
04002 
04003 static VALUE
04004 rb_str_gsub(int argc, VALUE *argv, VALUE str)
04005 {
04006     return str_gsub(argc, argv, str, 0);
04007 }
04008 
04009 
04010 /*
04011  *  call-seq:
04012  *     str.replace(other_str)   -> str
04013  *
04014  *  Replaces the contents and taintedness of <i>str</i> with the corresponding
04015  *  values in <i>other_str</i>.
04016  *
04017  *     s = "hello"         #=> "hello"
04018  *     s.replace "world"   #=> "world"
04019  */
04020 
04021 VALUE
04022 rb_str_replace(VALUE str, VALUE str2)
04023 {
04024     str_modifiable(str);
04025     if (str == str2) return str;
04026 
04027     StringValue(str2);
04028     str_discard(str);
04029     return str_replace(str, str2);
04030 }
04031 
04032 /*
04033  *  call-seq:
04034  *     string.clear    ->  string
04035  *
04036  *  Makes string empty.
04037  *
04038  *     a = "abcde"
04039  *     a.clear    #=> ""
04040  */
04041 
04042 static VALUE
04043 rb_str_clear(VALUE str)
04044 {
04045     str_discard(str);
04046     STR_SET_EMBED(str);
04047     STR_SET_EMBED_LEN(str, 0);
04048     RSTRING_PTR(str)[0] = 0;
04049     if (rb_enc_asciicompat(STR_ENC_GET(str)))
04050         ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT);
04051     else
04052         ENC_CODERANGE_SET(str, ENC_CODERANGE_VALID);
04053     return str;
04054 }
04055 
04056 /*
04057  *  call-seq:
04058  *     string.chr    ->  string
04059  *
04060  *  Returns a one-character string at the beginning of the string.
04061  *
04062  *     a = "abcde"
04063  *     a.chr    #=> "a"
04064  */
04065 
04066 static VALUE
04067 rb_str_chr(VALUE str)
04068 {
04069     return rb_str_substr(str, 0, 1);
04070 }
04071 
04072 /*
04073  *  call-seq:
04074  *     str.getbyte(index)          -> 0 .. 255
04075  *
04076  *  returns the <i>index</i>th byte as an integer.
04077  */
04078 static VALUE
04079 rb_str_getbyte(VALUE str, VALUE index)
04080 {
04081     long pos = NUM2LONG(index);
04082 
04083     if (pos < 0)
04084         pos += RSTRING_LEN(str);
04085     if (pos < 0 ||  RSTRING_LEN(str) <= pos)
04086         return Qnil;
04087 
04088     return INT2FIX((unsigned char)RSTRING_PTR(str)[pos]);
04089 }
04090 
04091 /*
04092  *  call-seq:
04093  *     str.setbyte(index, integer) -> integer
04094  *
04095  *  modifies the <i>index</i>th byte as <i>integer</i>.
04096  */
04097 static VALUE
04098 rb_str_setbyte(VALUE str, VALUE index, VALUE value)
04099 {
04100     long pos = NUM2LONG(index);
04101     int byte = NUM2INT(value);
04102 
04103     rb_str_modify(str);
04104 
04105     if (pos < -RSTRING_LEN(str) || RSTRING_LEN(str) <= pos)
04106         rb_raise(rb_eIndexError, "index %ld out of string", pos);
04107     if (pos < 0)
04108         pos += RSTRING_LEN(str);
04109 
04110     RSTRING_PTR(str)[pos] = byte;
04111 
04112     return value;
04113 }
04114 
04115 static VALUE
04116 str_byte_substr(VALUE str, long beg, long len)
04117 {
04118     char *p, *s = RSTRING_PTR(str);
04119     long n = RSTRING_LEN(str);
04120     VALUE str2;
04121 
04122     if (beg > n || len < 0) return Qnil;
04123     if (beg < 0) {
04124         beg += n;
04125         if (beg < 0) return Qnil;
04126     }
04127     if (beg + len > n)
04128         len = n - beg;
04129     if (len <= 0) {
04130         len = 0;
04131         p = 0;
04132     }
04133     else
04134         p = s + beg;
04135 
04136     if (len > RSTRING_EMBED_LEN_MAX && beg + len == n) {
04137         str2 = rb_str_new4(str);
04138         str2 = str_new3(rb_obj_class(str2), str2);
04139         RSTRING(str2)->as.heap.ptr += RSTRING(str2)->as.heap.len - len;
04140         RSTRING(str2)->as.heap.len = len;
04141     }
04142     else {
04143         str2 = rb_str_new5(str, p, len);
04144     }
04145 
04146     str_enc_copy(str2, str);
04147 
04148     if (RSTRING_LEN(str2) == 0) {
04149         if (!rb_enc_asciicompat(STR_ENC_GET(str)))
04150             ENC_CODERANGE_SET(str2, ENC_CODERANGE_VALID);
04151         else
04152             ENC_CODERANGE_SET(str2, ENC_CODERANGE_7BIT);
04153     }
04154     else {
04155         switch (ENC_CODERANGE(str)) {
04156           case ENC_CODERANGE_7BIT:
04157             ENC_CODERANGE_SET(str2, ENC_CODERANGE_7BIT);
04158             break;
04159           default:
04160             ENC_CODERANGE_SET(str2, ENC_CODERANGE_UNKNOWN);
04161             break;
04162         }
04163     }
04164 
04165     OBJ_INFECT(str2, str);
04166 
04167     return str2;
04168 }
04169 
04170 static VALUE
04171 str_byte_aref(VALUE str, VALUE indx)
04172 {
04173     long idx;
04174     switch (TYPE(indx)) {
04175       case T_FIXNUM:
04176         idx = FIX2LONG(indx);
04177 
04178       num_index:
04179         str = str_byte_substr(str, idx, 1);
04180         if (NIL_P(str) || RSTRING_LEN(str) == 0) return Qnil;
04181         return str;
04182 
04183       default:
04184         /* check if indx is Range */
04185         {
04186             long beg, len = RSTRING_LEN(str);
04187 
04188             switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
04189               case Qfalse:
04190                 break;
04191               case Qnil:
04192                 return Qnil;
04193               default:
04194                 return str_byte_substr(str, beg, len);
04195             }
04196         }
04197         idx = NUM2LONG(indx);
04198         goto num_index;
04199     }
04200 
04201     UNREACHABLE;
04202 }
04203 
04204 /*
04205  *  call-seq:
04206  *     str.byteslice(fixnum)           -> new_str or nil
04207  *     str.byteslice(fixnum, fixnum)   -> new_str or nil
04208  *     str.byteslice(range)            -> new_str or nil
04209  *
04210  *  Byte Reference---If passed a single <code>Fixnum</code>, returns a
04211  *  substring of one byte at that position. If passed two <code>Fixnum</code>
04212  *  objects, returns a substring starting at the offset given by the first, and
04213  *  a length given by the second. If given a <code>Range</code>, a substring containing
04214  *  bytes at offsets given by the range is returned. In all three cases, if
04215  *  an offset is negative, it is counted from the end of <i>str</i>. Returns
04216  *  <code>nil</code> if the initial offset falls outside the string, the length
04217  *  is negative, or the beginning of the range is greater than the end.
04218  *  The resulting string keeps the encoding of the original string.
04219  *
04220  *     "hello".byteslice(1)     #=> "e"
04221  *     "hello".byteslice(-1)    #=> "o"
04222  *     "hello".byteslice(1, 2)  #=> "el"
04223  *     "\x80\u3042".byteslice(1, 3) #=> "\u3042"
04224  *     "\x03\u3042\xff".byteslice(1..3) #=> "\u3042"
04225  */
04226 
04227 static VALUE
04228 rb_str_byteslice(int argc, VALUE *argv, VALUE str)
04229 {
04230     if (argc == 2) {
04231         return str_byte_substr(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]));
04232     }
04233     rb_check_arity(argc, 1, 2);
04234     return str_byte_aref(str, argv[0]);
04235 }
04236 
04237 /*
04238  *  call-seq:
04239  *     str.reverse   -> new_str
04240  *
04241  *  Returns a new string with the characters from <i>str</i> in reverse order.
04242  *
04243  *     "stressed".reverse   #=> "desserts"
04244  */
04245 
04246 static VALUE
04247 rb_str_reverse(VALUE str)
04248 {
04249     rb_encoding *enc;
04250     VALUE rev;
04251     char *s, *e, *p;
04252     int single = 1;
04253 
04254     if (RSTRING_LEN(str) <= 1) return rb_str_dup(str);
04255     enc = STR_ENC_GET(str);
04256     rev = rb_str_new5(str, 0, RSTRING_LEN(str));
04257     s = RSTRING_PTR(str); e = RSTRING_END(str);
04258     p = RSTRING_END(rev);
04259 
04260     if (RSTRING_LEN(str) > 1) {
04261         if (single_byte_optimizable(str)) {
04262             while (s < e) {
04263                 *--p = *s++;
04264             }
04265         }
04266         else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID) {
04267             while (s < e) {
04268                 int clen = rb_enc_fast_mbclen(s, e, enc);
04269 
04270                 if (clen > 1 || (*s & 0x80)) single = 0;
04271                 p -= clen;
04272                 memcpy(p, s, clen);
04273                 s += clen;
04274             }
04275         }
04276         else {
04277             while (s < e) {
04278                 int clen = rb_enc_mbclen(s, e, enc);
04279 
04280                 if (clen > 1 || (*s & 0x80)) single = 0;
04281                 p -= clen;
04282                 memcpy(p, s, clen);
04283                 s += clen;
04284             }
04285         }
04286     }
04287     STR_SET_LEN(rev, RSTRING_LEN(str));
04288     OBJ_INFECT(rev, str);
04289     if (ENC_CODERANGE(str) == ENC_CODERANGE_UNKNOWN) {
04290         if (single) {
04291             ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT);
04292         }
04293         else {
04294             ENC_CODERANGE_SET(str, ENC_CODERANGE_VALID);
04295         }
04296     }
04297     rb_enc_cr_str_copy_for_substr(rev, str);
04298 
04299     return rev;
04300 }
04301 
04302 
04303 /*
04304  *  call-seq:
04305  *     str.reverse!   -> str
04306  *
04307  *  Reverses <i>str</i> in place.
04308  */
04309 
04310 static VALUE
04311 rb_str_reverse_bang(VALUE str)
04312 {
04313     if (RSTRING_LEN(str) > 1) {
04314         if (single_byte_optimizable(str)) {
04315             char *s, *e, c;
04316 
04317             str_modify_keep_cr(str);
04318             s = RSTRING_PTR(str);
04319             e = RSTRING_END(str) - 1;
04320             while (s < e) {
04321                 c = *s;
04322                 *s++ = *e;
04323                 *e-- = c;
04324             }
04325         }
04326         else {
04327             rb_str_shared_replace(str, rb_str_reverse(str));
04328         }
04329     }
04330     else {
04331         str_modify_keep_cr(str);
04332     }
04333     return str;
04334 }
04335 
04336 
04337 /*
04338  *  call-seq:
04339  *     str.include? other_str   -> true or false
04340  *
04341  *  Returns <code>true</code> if <i>str</i> contains the given string or
04342  *  character.
04343  *
04344  *     "hello".include? "lo"   #=> true
04345  *     "hello".include? "ol"   #=> false
04346  *     "hello".include? ?h     #=> true
04347  */
04348 
04349 static VALUE
04350 rb_str_include(VALUE str, VALUE arg)
04351 {
04352     long i;
04353 
04354     StringValue(arg);
04355     i = rb_str_index(str, arg, 0);
04356 
04357     if (i == -1) return Qfalse;
04358     return Qtrue;
04359 }
04360 
04361 
04362 /*
04363  *  call-seq:
04364  *     str.to_i(base=10)   -> integer
04365  *
04366  *  Returns the result of interpreting leading characters in <i>str</i> as an
04367  *  integer base <i>base</i> (between 2 and 36). Extraneous characters past the
04368  *  end of a valid number are ignored. If there is not a valid number at the
04369  *  start of <i>str</i>, <code>0</code> is returned. This method never raises an
04370  *  exception when <i>base</i> is valid.
04371  *
04372  *     "12345".to_i             #=> 12345
04373  *     "99 red balloons".to_i   #=> 99
04374  *     "0a".to_i                #=> 0
04375  *     "0a".to_i(16)            #=> 10
04376  *     "hello".to_i             #=> 0
04377  *     "1100101".to_i(2)        #=> 101
04378  *     "1100101".to_i(8)        #=> 294977
04379  *     "1100101".to_i(10)       #=> 1100101
04380  *     "1100101".to_i(16)       #=> 17826049
04381  */
04382 
04383 static VALUE
04384 rb_str_to_i(int argc, VALUE *argv, VALUE str)
04385 {
04386     int base;
04387 
04388     if (argc == 0) base = 10;
04389     else {
04390         VALUE b;
04391 
04392         rb_scan_args(argc, argv, "01", &b);
04393         base = NUM2INT(b);
04394     }
04395     if (base < 0) {
04396         rb_raise(rb_eArgError, "invalid radix %d", base);
04397     }
04398     return rb_str_to_inum(str, base, FALSE);
04399 }
04400 
04401 
04402 /*
04403  *  call-seq:
04404  *     str.to_f   -> float
04405  *
04406  *  Returns the result of interpreting leading characters in <i>str</i> as a
04407  *  floating point number. Extraneous characters past the end of a valid number
04408  *  are ignored. If there is not a valid number at the start of <i>str</i>,
04409  *  <code>0.0</code> is returned. This method never raises an exception.
04410  *
04411  *     "123.45e1".to_f        #=> 1234.5
04412  *     "45.67 degrees".to_f   #=> 45.67
04413  *     "thx1138".to_f         #=> 0.0
04414  */
04415 
04416 static VALUE
04417 rb_str_to_f(VALUE str)
04418 {
04419     return DBL2NUM(rb_str_to_dbl(str, FALSE));
04420 }
04421 
04422 
04423 /*
04424  *  call-seq:
04425  *     str.to_s     -> str
04426  *     str.to_str   -> str
04427  *
04428  *  Returns the receiver.
04429  */
04430 
04431 static VALUE
04432 rb_str_to_s(VALUE str)
04433 {
04434     if (rb_obj_class(str) != rb_cString) {
04435         return str_duplicate(rb_cString, str);
04436     }
04437     return str;
04438 }
04439 
#if 0
/* Disabled: encodes code point c in enc and appends the bytes to str. */
static void
str_cat_char(VALUE str, unsigned int c, rb_encoding *enc)
{
    char encoded[RUBY_MAX_CHAR_LEN];
    int nbytes = rb_enc_codelen(c, enc);

    rb_enc_mbcput(c, encoded, enc);
    rb_enc_str_buf_cat(str, encoded, nbytes, enc);
}
#endif
04451 
04452 #define CHAR_ESC_LEN 13 /* sizeof(\x{ hex of 32bit unsigned int } \0) */
04453 
04454 int
04455 rb_str_buf_cat_escaped_char(VALUE result, unsigned int c, int unicode_p)
04456 {
04457     char buf[CHAR_ESC_LEN + 1];
04458     int l;
04459 
04460 #if SIZEOF_INT > 4
04461     c &= 0xffffffff;
04462 #endif
04463     if (unicode_p) {
04464         if (c < 0x7F && ISPRINT(c)) {
04465             snprintf(buf, CHAR_ESC_LEN, "%c", c);
04466         }
04467         else if (c < 0x10000) {
04468             snprintf(buf, CHAR_ESC_LEN, "\\u%04X", c);
04469         }
04470         else {
04471             snprintf(buf, CHAR_ESC_LEN, "\\u{%X}", c);
04472         }
04473     }
04474     else {
04475         if (c < 0x100) {
04476             snprintf(buf, CHAR_ESC_LEN, "\\x%02X", c);
04477         }
04478         else {
04479             snprintf(buf, CHAR_ESC_LEN, "\\x{%X}", c);
04480         }
04481     }
04482     l = (int)strlen(buf);       /* CHAR_ESC_LEN cannot exceed INT_MAX */
04483     rb_str_buf_cat(result, buf, l);
04484     return l;
04485 }
04486 
04487 /*
04488  * call-seq:
04489  *   str.inspect   -> string
04490  *
04491  * Returns a printable version of _str_, surrounded by quote marks,
04492  * with special characters escaped.
04493  *
04494  *    str = "hello"
04495  *    str[3] = "\b"
04496  *    str.inspect       #=> "\"hel\\bo\""
04497  */
04498 
04499 VALUE
04500 rb_str_inspect(VALUE str)
04501 {
04502     rb_encoding *enc = STR_ENC_GET(str);
04503     const char *p, *pend, *prev;
04504     char buf[CHAR_ESC_LEN + 1];
04505     VALUE result = rb_str_buf_new(0);
04506     rb_encoding *resenc = rb_default_internal_encoding();
04507     int unicode_p = rb_enc_unicode_p(enc);
04508     int asciicompat = rb_enc_asciicompat(enc);
04509     static rb_encoding *utf16, *utf32;
04510 
04511     if (!utf16) utf16 = rb_enc_find("UTF-16");
04512     if (!utf32) utf32 = rb_enc_find("UTF-32");
04513     if (resenc == NULL) resenc = rb_default_external_encoding();
04514     if (!rb_enc_asciicompat(resenc)) resenc = rb_usascii_encoding();
04515     rb_enc_associate(result, resenc);
04516     str_buf_cat2(result, "\"");
04517 
04518     p = RSTRING_PTR(str); pend = RSTRING_END(str);
04519     prev = p;
04520     if (enc == utf16) {
04521         const unsigned char *q = (const unsigned char *)p;
04522         if (q[0] == 0xFE && q[1] == 0xFF)
04523             enc = rb_enc_find("UTF-16BE");
04524         else if (q[0] == 0xFF && q[1] == 0xFE)
04525             enc = rb_enc_find("UTF-16LE");
04526         else
04527             unicode_p = 0;
04528     }
04529     else if (enc == utf32) {
04530         const unsigned char *q = (const unsigned char *)p;
04531         if (q[0] == 0 && q[1] == 0 && q[2] == 0xFE && q[3] == 0xFF)
04532             enc = rb_enc_find("UTF-32BE");
04533         else if (q[3] == 0 && q[2] == 0 && q[1] == 0xFE && q[0] == 0xFF)
04534             enc = rb_enc_find("UTF-32LE");
04535         else
04536             unicode_p = 0;
04537     }
04538     while (p < pend) {
04539         unsigned int c, cc;
04540         int n;
04541 
04542         n = rb_enc_precise_mbclen(p, pend, enc);
04543         if (!MBCLEN_CHARFOUND_P(n)) {
04544             if (p > prev) str_buf_cat(result, prev, p - prev);
04545             n = rb_enc_mbminlen(enc);
04546             if (pend < p + n)
04547                 n = (int)(pend - p);
04548             while (n--) {
04549                 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
04550                 str_buf_cat(result, buf, strlen(buf));
04551                 prev = ++p;
04552             }
04553             continue;
04554         }
04555         n = MBCLEN_CHARFOUND_LEN(n);
04556         c = rb_enc_mbc_to_codepoint(p, pend, enc);
04557         p += n;
04558         if ((asciicompat || unicode_p) &&
04559           (c == '"'|| c == '\\' ||
04560             (c == '#' &&
04561              p < pend &&
04562              MBCLEN_CHARFOUND_P(rb_enc_precise_mbclen(p,pend,enc)) &&
04563              (cc = rb_enc_codepoint(p,pend,enc),
04564               (cc == '$' || cc == '@' || cc == '{'))))) {
04565             if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
04566             str_buf_cat2(result, "\\");
04567             if (asciicompat || enc == resenc) {
04568                 prev = p - n;
04569                 continue;
04570             }
04571         }
04572         switch (c) {
04573           case '\n': cc = 'n'; break;
04574           case '\r': cc = 'r'; break;
04575           case '\t': cc = 't'; break;
04576           case '\f': cc = 'f'; break;
04577           case '\013': cc = 'v'; break;
04578           case '\010': cc = 'b'; break;
04579           case '\007': cc = 'a'; break;
04580           case 033: cc = 'e'; break;
04581           default: cc = 0; break;
04582         }
04583         if (cc) {
04584             if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
04585             buf[0] = '\\';
04586             buf[1] = (char)cc;
04587             str_buf_cat(result, buf, 2);
04588             prev = p;
04589             continue;
04590         }
04591         if ((enc == resenc && rb_enc_isprint(c, enc)) ||
04592             (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c))) {
04593             continue;
04594         }
04595         else {
04596             if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
04597             rb_str_buf_cat_escaped_char(result, c, unicode_p);
04598             prev = p;
04599             continue;
04600         }
04601     }
04602     if (p > prev) str_buf_cat(result, prev, p - prev);
04603     str_buf_cat2(result, "\"");
04604 
04605     OBJ_INFECT(result, str);
04606     return result;
04607 }
04608 
04609 #define IS_EVSTR(p,e) ((p) < (e) && (*(p) == '$' || *(p) == '@' || *(p) == '{'))
04610 
04611 /*
04612  *  call-seq:
04613  *     str.dump   -> new_str
04614  *
04615  *  Produces a version of +str+ with all non-printing characters replaced by
04616  *  <code>\nnn</code> notation and all special characters escaped.
04617  *
04618  *    "hello \n ''".dump  #=> "\"hello \\n ''\"
04619  */
04620 
04621 VALUE
04622 rb_str_dump(VALUE str)
04623 {
04624     rb_encoding *enc = rb_enc_get(str);
04625     long len;
04626     const char *p, *pend;
04627     char *q, *qend;
04628     VALUE result;
04629     int u8 = (enc == rb_utf8_encoding());
04630 
04631     len = 2;                    /* "" */
04632     p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
04633     while (p < pend) {
04634         unsigned char c = *p++;
04635         switch (c) {
04636           case '"':  case '\\':
04637           case '\n': case '\r':
04638           case '\t': case '\f':
04639           case '\013': case '\010': case '\007': case '\033':
04640             len += 2;
04641             break;
04642 
04643           case '#':
04644             len += IS_EVSTR(p, pend) ? 2 : 1;
04645             break;
04646 
04647           default:
04648             if (ISPRINT(c)) {
04649                 len++;
04650             }
04651             else {
04652                 if (u8) {       /* \u{NN} */
04653                     int n = rb_enc_precise_mbclen(p-1, pend, enc);
04654                     if (MBCLEN_CHARFOUND_P(n-1)) {
04655                         unsigned int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
04656                         while (cc >>= 4) len++;
04657                         len += 5;
04658                         p += MBCLEN_CHARFOUND_LEN(n)-1;
04659                         break;
04660                     }
04661                 }
04662                 len += 4;       /* \xNN */
04663             }
04664             break;
04665         }
04666     }
04667     if (!rb_enc_asciicompat(enc)) {
04668         len += 19;              /* ".force_encoding('')" */
04669         len += strlen(enc->name);
04670     }
04671 
04672     result = rb_str_new5(str, 0, len);
04673     p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
04674     q = RSTRING_PTR(result); qend = q + len + 1;
04675 
04676     *q++ = '"';
04677     while (p < pend) {
04678         unsigned char c = *p++;
04679 
04680         if (c == '"' || c == '\\') {
04681             *q++ = '\\';
04682             *q++ = c;
04683         }
04684         else if (c == '#') {
04685             if (IS_EVSTR(p, pend)) *q++ = '\\';
04686             *q++ = '#';
04687         }
04688         else if (c == '\n') {
04689             *q++ = '\\';
04690             *q++ = 'n';
04691         }
04692         else if (c == '\r') {
04693             *q++ = '\\';
04694             *q++ = 'r';
04695         }
04696         else if (c == '\t') {
04697             *q++ = '\\';
04698             *q++ = 't';
04699         }
04700         else if (c == '\f') {
04701             *q++ = '\\';
04702             *q++ = 'f';
04703         }
04704         else if (c == '\013') {
04705             *q++ = '\\';
04706             *q++ = 'v';
04707         }
04708         else if (c == '\010') {
04709             *q++ = '\\';
04710             *q++ = 'b';
04711         }
04712         else if (c == '\007') {
04713             *q++ = '\\';
04714             *q++ = 'a';
04715         }
04716         else if (c == '\033') {
04717             *q++ = '\\';
04718             *q++ = 'e';
04719         }
04720         else if (ISPRINT(c)) {
04721             *q++ = c;
04722         }
04723         else {
04724             *q++ = '\\';
04725             if (u8) {
04726                 int n = rb_enc_precise_mbclen(p-1, pend, enc) - 1;
04727                 if (MBCLEN_CHARFOUND_P(n)) {
04728                     int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
04729                     p += n;
04730                     snprintf(q, qend-q, "u{%x}", cc);
04731                     q += strlen(q);
04732                     continue;
04733                 }
04734             }
04735             snprintf(q, qend-q, "x%02X", c);
04736             q += 3;
04737         }
04738     }
04739     *q++ = '"';
04740     *q = '\0';
04741     if (!rb_enc_asciicompat(enc)) {
04742         snprintf(q, qend-q, ".force_encoding(\"%s\")", enc->name);
04743         enc = rb_ascii8bit_encoding();
04744     }
04745     OBJ_INFECT(result, str);
04746     /* result from dump is ASCII */
04747     rb_enc_associate(result, enc);
04748     ENC_CODERANGE_SET(result, ENC_CODERANGE_7BIT);
04749     return result;
04750 }
04751 
04752 
04753 static void
04754 rb_str_check_dummy_enc(rb_encoding *enc)
04755 {
04756     if (rb_enc_dummy_p(enc)) {
04757         rb_raise(rb_eEncCompatError, "incompatible encoding with this operation: %s",
04758                  rb_enc_name(enc));
04759     }
04760 }
04761 
04762 /*
04763  *  call-seq:
04764  *     str.upcase!   -> str or nil
04765  *
04766  *  Upcases the contents of <i>str</i>, returning <code>nil</code> if no changes
04767  *  were made.
04768  *  Note: case replacement is effective only in ASCII region.
04769  */
04770 
04771 static VALUE
04772 rb_str_upcase_bang(VALUE str)
04773 {
04774     rb_encoding *enc;
04775     char *s, *send;
04776     int modify = 0;
04777     int n;
04778 
04779     str_modify_keep_cr(str);
04780     enc = STR_ENC_GET(str);
04781     rb_str_check_dummy_enc(enc);
04782     s = RSTRING_PTR(str); send = RSTRING_END(str);
04783     if (single_byte_optimizable(str)) {
04784         while (s < send) {
04785             unsigned int c = *(unsigned char*)s;
04786 
04787             if (rb_enc_isascii(c, enc) && 'a' <= c && c <= 'z') {
04788                 *s = 'A' + (c - 'a');
04789                 modify = 1;
04790             }
04791             s++;
04792         }
04793     }
04794     else {
04795         int ascompat = rb_enc_asciicompat(enc);
04796 
04797         while (s < send) {
04798             unsigned int c;
04799 
04800             if (ascompat && (c = *(unsigned char*)s) < 0x80) {
04801                 if (rb_enc_isascii(c, enc) && 'a' <= c && c <= 'z') {
04802                     *s = 'A' + (c - 'a');
04803                     modify = 1;
04804                 }
04805                 s++;
04806             }
04807             else {
04808                 c = rb_enc_codepoint_len(s, send, &n, enc);
04809                 if (rb_enc_islower(c, enc)) {
04810                     /* assuming toupper returns codepoint with same size */
04811                     rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc);
04812                     modify = 1;
04813                 }
04814                 s += n;
04815             }
04816         }
04817     }
04818 
04819     if (modify) return str;
04820     return Qnil;
04821 }
04822 
04823 
04824 /*
04825  *  call-seq:
04826  *     str.upcase   -> new_str
04827  *
04828  *  Returns a copy of <i>str</i> with all lowercase letters replaced with their
04829  *  uppercase counterparts. The operation is locale insensitive---only
04830  *  characters ``a'' to ``z'' are affected.
04831  *  Note: case replacement is effective only in ASCII region.
04832  *
04833  *     "hEllO".upcase   #=> "HELLO"
04834  */
04835 
04836 static VALUE
04837 rb_str_upcase(VALUE str)
04838 {
04839     str = rb_str_dup(str);
04840     rb_str_upcase_bang(str);
04841     return str;
04842 }
04843 
04844 
04845 /*
04846  *  call-seq:
04847  *     str.downcase!   -> str or nil
04848  *
04849  *  Downcases the contents of <i>str</i>, returning <code>nil</code> if no
04850  *  changes were made.
04851  *  Note: case replacement is effective only in ASCII region.
04852  */
04853 
04854 static VALUE
04855 rb_str_downcase_bang(VALUE str)
04856 {
04857     rb_encoding *enc;
04858     char *s, *send;
04859     int modify = 0;
04860 
04861     str_modify_keep_cr(str);
04862     enc = STR_ENC_GET(str);
04863     rb_str_check_dummy_enc(enc);
04864     s = RSTRING_PTR(str); send = RSTRING_END(str);
04865     if (single_byte_optimizable(str)) {
04866         while (s < send) {
04867             unsigned int c = *(unsigned char*)s;
04868 
04869             if (rb_enc_isascii(c, enc) && 'A' <= c && c <= 'Z') {
04870                 *s = 'a' + (c - 'A');
04871                 modify = 1;
04872             }
04873             s++;
04874         }
04875     }
04876     else {
04877         int ascompat = rb_enc_asciicompat(enc);
04878 
04879         while (s < send) {
04880             unsigned int c;
04881             int n;
04882 
04883             if (ascompat && (c = *(unsigned char*)s) < 0x80) {
04884                 if (rb_enc_isascii(c, enc) && 'A' <= c && c <= 'Z') {
04885                     *s = 'a' + (c - 'A');
04886                     modify = 1;
04887                 }
04888                 s++;
04889             }
04890             else {
04891                 c = rb_enc_codepoint_len(s, send, &n, enc);
04892                 if (rb_enc_isupper(c, enc)) {
04893                     /* assuming toupper returns codepoint with same size */
04894                     rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc);
04895                     modify = 1;
04896                 }
04897                 s += n;
04898             }
04899         }
04900     }
04901 
04902     if (modify) return str;
04903     return Qnil;
04904 }
04905 
04906 
04907 /*
04908  *  call-seq:
04909  *     str.downcase   -> new_str
04910  *
04911  *  Returns a copy of <i>str</i> with all uppercase letters replaced with their
04912  *  lowercase counterparts. The operation is locale insensitive---only
04913  *  characters ``A'' to ``Z'' are affected.
04914  *  Note: case replacement is effective only in ASCII region.
04915  *
04916  *     "hEllO".downcase   #=> "hello"
04917  */
04918 
04919 static VALUE
04920 rb_str_downcase(VALUE str)
04921 {
04922     str = rb_str_dup(str);
04923     rb_str_downcase_bang(str);
04924     return str;
04925 }
04926 
04927 
04928 /*
04929  *  call-seq:
04930  *     str.capitalize!   -> str or nil
04931  *
04932  *  Modifies <i>str</i> by converting the first character to uppercase and the
04933  *  remainder to lowercase. Returns <code>nil</code> if no changes are made.
04934  *  Note: case conversion is effective only in ASCII region.
04935  *
04936  *     a = "hello"
04937  *     a.capitalize!   #=> "Hello"
04938  *     a               #=> "Hello"
04939  *     a.capitalize!   #=> nil
04940  */
04941 
04942 static VALUE
04943 rb_str_capitalize_bang(VALUE str)
04944 {
04945     rb_encoding *enc;
04946     char *s, *send;
04947     int modify = 0;
04948     unsigned int c;
04949     int n;
04950 
04951     str_modify_keep_cr(str);
04952     enc = STR_ENC_GET(str);
04953     rb_str_check_dummy_enc(enc);
04954     if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
04955     s = RSTRING_PTR(str); send = RSTRING_END(str);
04956 
04957     c = rb_enc_codepoint_len(s, send, &n, enc);
04958     if (rb_enc_islower(c, enc)) {
04959         rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc);
04960         modify = 1;
04961     }
04962     s += n;
04963     while (s < send) {
04964         c = rb_enc_codepoint_len(s, send, &n, enc);
04965         if (rb_enc_isupper(c, enc)) {
04966             rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc);
04967             modify = 1;
04968         }
04969         s += n;
04970     }
04971 
04972     if (modify) return str;
04973     return Qnil;
04974 }
04975 
04976 
04977 /*
04978  *  call-seq:
04979  *     str.capitalize   -> new_str
04980  *
04981  *  Returns a copy of <i>str</i> with the first character converted to uppercase
04982  *  and the remainder to lowercase.
04983  *  Note: case conversion is effective only in ASCII region.
04984  *
04985  *     "hello".capitalize    #=> "Hello"
04986  *     "HELLO".capitalize    #=> "Hello"
04987  *     "123ABC".capitalize   #=> "123abc"
04988  */
04989 
04990 static VALUE
04991 rb_str_capitalize(VALUE str)
04992 {
04993     str = rb_str_dup(str);
04994     rb_str_capitalize_bang(str);
04995     return str;
04996 }
04997 
04998 
04999 /*
05000  *  call-seq:
05001  *     str.swapcase!   -> str or nil
05002  *
05003  *  Equivalent to <code>String#swapcase</code>, but modifies the receiver in
05004  *  place, returning <i>str</i>, or <code>nil</code> if no changes were made.
05005  *  Note: case conversion is effective only in ASCII region.
05006  */
05007 
05008 static VALUE
05009 rb_str_swapcase_bang(VALUE str)
05010 {
05011     rb_encoding *enc;
05012     char *s, *send;
05013     int modify = 0;
05014     int n;
05015 
05016     str_modify_keep_cr(str);
05017     enc = STR_ENC_GET(str);
05018     rb_str_check_dummy_enc(enc);
05019     s = RSTRING_PTR(str); send = RSTRING_END(str);
05020     while (s < send) {
05021         unsigned int c = rb_enc_codepoint_len(s, send, &n, enc);
05022 
05023         if (rb_enc_isupper(c, enc)) {
05024             /* assuming toupper returns codepoint with same size */
05025             rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc);
05026             modify = 1;
05027         }
05028         else if (rb_enc_islower(c, enc)) {
05029             /* assuming tolower returns codepoint with same size */
05030             rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc);
05031             modify = 1;
05032         }
05033         s += n;
05034     }
05035 
05036     if (modify) return str;
05037     return Qnil;
05038 }
05039 
05040 
05041 /*
05042  *  call-seq:
05043  *     str.swapcase   -> new_str
05044  *
05045  *  Returns a copy of <i>str</i> with uppercase alphabetic characters converted
05046  *  to lowercase and lowercase characters converted to uppercase.
05047  *  Note: case conversion is effective only in ASCII region.
05048  *
05049  *     "Hello".swapcase          #=> "hELLO"
05050  *     "cYbEr_PuNk11".swapcase   #=> "CyBeR_pUnK11"
05051  */
05052 
05053 static VALUE
05054 rb_str_swapcase(VALUE str)
05055 {
05056     str = rb_str_dup(str);
05057     rb_str_swapcase_bang(str);
05058     return str;
05059 }
05060 
/* Pointer to unsigned bytes. */
typedef unsigned char *USTR;

/*
 * Cursor over a tr-style character specification (as consumed by trnext()).
 */
struct tr {
    int gen;            /* nonzero while a "c1-c2" range is being expanded */
    unsigned int now;   /* code point most recently produced */
    unsigned int max;   /* final code point of the range being expanded */
    char *p;            /* next unread byte of the specification */
    char *pend;         /* one past the last byte of the specification */
};
05068 
05069 static unsigned int
05070 trnext(struct tr *t, rb_encoding *enc)
05071 {
05072     int n;
05073 
05074     for (;;) {
05075         if (!t->gen) {
05076 nextpart:
05077             if (t->p == t->pend) return -1;
05078             if (rb_enc_ascget(t->p, t->pend, &n, enc) == '\\' && t->p + n < t->pend) {
05079                 t->p += n;
05080             }
05081             t->now = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
05082             t->p += n;
05083             if (rb_enc_ascget(t->p, t->pend, &n, enc) == '-' && t->p + n < t->pend) {
05084                 t->p += n;
05085                 if (t->p < t->pend) {
05086                     unsigned int c = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
05087                     t->p += n;
05088                     if (t->now > c) {
05089                         if (t->now < 0x80 && c < 0x80) {
05090                             rb_raise(rb_eArgError,
05091                                      "invalid range \"%c-%c\" in string transliteration",
05092                                      t->now, c);
05093                         }
05094                         else {
05095                             rb_raise(rb_eArgError, "invalid range in string transliteration");
05096                         }
05097                         continue; /* not reached */
05098                     }
05099                     t->gen = 1;
05100                     t->max = c;
05101                 }
05102             }
05103             return t->now;
05104         }
05105         else {
05106             while (ONIGENC_CODE_TO_MBCLEN(enc, ++t->now) <= 0) {
05107                 if (t->now == t->max) {
05108                     t->gen = 0;
05109                     goto nextpart;
05110                 }
05111             }
05112             if (t->now < t->max) {
05113                 return t->now;
05114             }
05115             else {
05116                 t->gen = 0;
05117                 return t->max;
05118             }
05119         }
05120     }
05121 }
05122 
05123 static VALUE rb_str_delete_bang(int,VALUE*,VALUE);
05124 
05125 static VALUE
05126 tr_trans(VALUE str, VALUE src, VALUE repl, int sflag)
05127 {
05128     const unsigned int errc = -1;
05129     unsigned int trans[256];
05130     rb_encoding *enc, *e1, *e2;
05131     struct tr trsrc, trrepl;
05132     int cflag = 0;
05133     unsigned int c, c0, last = 0;
05134     int modify = 0, i, l;
05135     char *s, *send;
05136     VALUE hash = 0;
05137     int singlebyte = single_byte_optimizable(str);
05138     int cr;
05139 
05140 #define CHECK_IF_ASCII(c) \
05141     (void)((cr == ENC_CODERANGE_7BIT && !rb_isascii(c)) ? \
05142            (cr = ENC_CODERANGE_VALID) : 0)
05143 
05144     StringValue(src);
05145     StringValue(repl);
05146     if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
05147     if (RSTRING_LEN(repl) == 0) {
05148         return rb_str_delete_bang(1, &src, str);
05149     }
05150 
05151     cr = ENC_CODERANGE(str);
05152     e1 = rb_enc_check(str, src);
05153     e2 = rb_enc_check(str, repl);
05154     if (e1 == e2) {
05155         enc = e1;
05156     }
05157     else {
05158         enc = rb_enc_check(src, repl);
05159     }
05160     trsrc.p = RSTRING_PTR(src); trsrc.pend = trsrc.p + RSTRING_LEN(src);
05161     if (RSTRING_LEN(src) > 1 &&
05162         rb_enc_ascget(trsrc.p, trsrc.pend, &l, enc) == '^' &&
05163         trsrc.p + l < trsrc.pend) {
05164         cflag = 1;
05165         trsrc.p += l;
05166     }
05167     trrepl.p = RSTRING_PTR(repl);
05168     trrepl.pend = trrepl.p + RSTRING_LEN(repl);
05169     trsrc.gen = trrepl.gen = 0;
05170     trsrc.now = trrepl.now = 0;
05171     trsrc.max = trrepl.max = 0;
05172 
05173     if (cflag) {
05174         for (i=0; i<256; i++) {
05175             trans[i] = 1;
05176         }
05177         while ((c = trnext(&trsrc, enc)) != errc) {
05178             if (c < 256) {
05179                 trans[c] = errc;
05180             }
05181             else {
05182                 if (!hash) hash = rb_hash_new();
05183                 rb_hash_aset(hash, UINT2NUM(c), Qtrue);
05184             }
05185         }
05186         while ((c = trnext(&trrepl, enc)) != errc)
05187             /* retrieve last replacer */;
05188         last = trrepl.now;
05189         for (i=0; i<256; i++) {
05190             if (trans[i] != errc) {
05191                 trans[i] = last;
05192             }
05193         }
05194     }
05195     else {
05196         unsigned int r;
05197 
05198         for (i=0; i<256; i++) {
05199             trans[i] = errc;
05200         }
05201         while ((c = trnext(&trsrc, enc)) != errc) {
05202             r = trnext(&trrepl, enc);
05203             if (r == errc) r = trrepl.now;
05204             if (c < 256) {
05205                 trans[c] = r;
05206                 if (rb_enc_codelen(r, enc) != 1) singlebyte = 0;
05207             }
05208             else {
05209                 if (!hash) hash = rb_hash_new();
05210                 rb_hash_aset(hash, UINT2NUM(c), UINT2NUM(r));
05211             }
05212         }
05213     }
05214 
05215     if (cr == ENC_CODERANGE_VALID)
05216         cr = ENC_CODERANGE_7BIT;
05217     str_modify_keep_cr(str);
05218     s = RSTRING_PTR(str); send = RSTRING_END(str);
05219     if (sflag) {
05220         int clen, tlen;
05221         long offset, max = RSTRING_LEN(str);
05222         unsigned int save = -1;
05223         char *buf = ALLOC_N(char, max), *t = buf;
05224 
05225         while (s < send) {
05226             int may_modify = 0;
05227 
05228             c0 = c = rb_enc_codepoint_len(s, send, &clen, e1);
05229             tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
05230 
05231             s += clen;
05232             if (c < 256) {
05233                 c = trans[c];
05234             }
05235             else if (hash) {
05236                 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
05237                 if (NIL_P(tmp)) {
05238                     if (cflag) c = last;
05239                     else c = errc;
05240                 }
05241                 else if (cflag) c = errc;
05242                 else c = NUM2INT(tmp);
05243             }
05244             else {
05245                 c = errc;
05246             }
05247             if (c != (unsigned int)-1) {
05248                 if (save == c) {
05249                     CHECK_IF_ASCII(c);
05250                     continue;
05251                 }
05252                 save = c;
05253                 tlen = rb_enc_codelen(c, enc);
05254                 modify = 1;
05255             }
05256             else {
05257                 save = -1;
05258                 c = c0;
05259                 if (enc != e1) may_modify = 1;
05260             }
05261             while (t - buf + tlen >= max) {
05262                 offset = t - buf;
05263                 max *= 2;
05264                 REALLOC_N(buf, char, max);
05265                 t = buf + offset;
05266             }
05267             rb_enc_mbcput(c, t, enc);
05268             if (may_modify && memcmp(s, t, tlen) != 0) {
05269                 modify = 1;
05270             }
05271             CHECK_IF_ASCII(c);
05272             t += tlen;
05273         }
05274         if (!STR_EMBED_P(str)) {
05275             xfree(RSTRING(str)->as.heap.ptr);
05276         }
05277         *t = '\0';
05278         RSTRING(str)->as.heap.ptr = buf;
05279         RSTRING(str)->as.heap.len = t - buf;
05280         STR_SET_NOEMBED(str);
05281         RSTRING(str)->as.heap.aux.capa = max;
05282     }
05283     else if (rb_enc_mbmaxlen(enc) == 1 || (singlebyte && !hash)) {
05284         while (s < send) {
05285             c = (unsigned char)*s;
05286             if (trans[c] != errc) {
05287                 if (!cflag) {
05288                     c = trans[c];
05289                     *s = c;
05290                     modify = 1;
05291                 }
05292                 else {
05293                     *s = last;
05294                     modify = 1;
05295                 }
05296             }
05297             CHECK_IF_ASCII(c);
05298             s++;
05299         }
05300     }
05301     else {
05302         int clen, tlen, max = (int)(RSTRING_LEN(str) * 1.2);
05303         long offset;
05304         char *buf = ALLOC_N(char, max), *t = buf;
05305 
05306         while (s < send) {
05307             int may_modify = 0;
05308             c0 = c = rb_enc_codepoint_len(s, send, &clen, e1);
05309             tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
05310 
05311             if (c < 256) {
05312                 c = trans[c];
05313             }
05314             else if (hash) {
05315                 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
05316                 if (NIL_P(tmp)) {
05317                     if (cflag) c = last;
05318                     else c = errc;
05319                 }
05320                 else if (cflag) c = errc;
05321                 else c = NUM2INT(tmp);
05322             }
05323             else {
05324                 c = cflag ? last : errc;
05325             }
05326             if (c != errc) {
05327                 tlen = rb_enc_codelen(c, enc);
05328                 modify = 1;
05329             }
05330             else {
05331                 c = c0;
05332                 if (enc != e1) may_modify = 1;
05333             }
05334             while (t - buf + tlen >= max) {
05335                 offset = t - buf;
05336                 max *= 2;
05337                 REALLOC_N(buf, char, max);
05338                 t = buf + offset;
05339             }
05340             if (s != t) {
05341                 rb_enc_mbcput(c, t, enc);
05342                 if (may_modify && memcmp(s, t, tlen) != 0) {
05343                     modify = 1;
05344                 }
05345             }
05346             CHECK_IF_ASCII(c);
05347             s += clen;
05348             t += tlen;
05349         }
05350         if (!STR_EMBED_P(str)) {
05351             xfree(RSTRING(str)->as.heap.ptr);
05352         }
05353         *t = '\0';
05354         RSTRING(str)->as.heap.ptr = buf;
05355         RSTRING(str)->as.heap.len = t - buf;
05356         STR_SET_NOEMBED(str);
05357         RSTRING(str)->as.heap.aux.capa = max;
05358     }
05359 
05360     if (modify) {
05361         if (cr != ENC_CODERANGE_BROKEN)
05362             ENC_CODERANGE_SET(str, cr);
05363         rb_enc_associate(str, enc);
05364         return str;
05365     }
05366     return Qnil;
05367 }
05368 
05369 
05370 /*
05371  *  call-seq:
05372  *     str.tr!(from_str, to_str)   -> str or nil
05373  *
05374  *  Translates <i>str</i> in place, using the same rules as
05375  *  <code>String#tr</code>. Returns <i>str</i>, or <code>nil</code> if no
05376  *  changes were made.
05377  */
05378 
05379 static VALUE
05380 rb_str_tr_bang(VALUE str, VALUE src, VALUE repl)
05381 {
05382     return tr_trans(str, src, repl, 0);
05383 }
05384 
05385 
05386 /*
05387  *  call-seq:
05388  *     str.tr(from_str, to_str)   => new_str
05389  *
05390  *  Returns a copy of +str+ with the characters in +from_str+ replaced by the
05391  *  corresponding characters in +to_str+.  If +to_str+ is shorter than
05392  *  +from_str+, it is padded with its last character in order to maintain the
05393  *  correspondence.
05394  *
05395  *     "hello".tr('el', 'ip')      #=> "hippo"
05396  *     "hello".tr('aeiou', '*')    #=> "h*ll*"
05397  *     "hello".tr('aeiou', 'AA*')  #=> "hAll*"
05398  *
05399  *  Both strings may use the <code>c1-c2</code> notation to denote ranges of
05400  *  characters, and +from_str+ may start with a <code>^</code>, which denotes
05401  *  all characters except those listed.
05402  *
05403  *     "hello".tr('a-y', 'b-z')    #=> "ifmmp"
05404  *     "hello".tr('^aeiou', '*')   #=> "*e**o"
05405  *
05406  *  The backslash character <code></code> can be used to escape
05407  *  <code>^</code> or <code>-</code> and is otherwise ignored unless it
05408  *  appears at the end of a range or the end of the +from_str+ or +to_str+:
05409  *
05410  *     "hello^world".tr("\\^aeiou", "*") #=> "h*ll**w*rld"
05411  *     "hello-world".tr("a\\-eo", "*")   #=> "h*ll**w*rld"
05412  *
05413  *     "hello\r\nworld".tr("\r", "")   #=> "hello\nworld"
05414  *     "hello\r\nworld".tr("\\r", "")  #=> "hello\r\nwold"
05415  *     "hello\r\nworld".tr("\\\r", "") #=> "hello\nworld"
05416  *
05417  *     "X['\\b']".tr("X\\", "")   #=> "['b']"
05418  *     "X['\\b']".tr("X-\\]", "") #=> "'b'"
05419  */
05420 
05421 static VALUE
05422 rb_str_tr(VALUE str, VALUE src, VALUE repl)
05423 {
05424     str = rb_str_dup(str);
05425     tr_trans(str, src, repl, 0);
05426     return str;
05427 }
05428 
05429 #define TR_TABLE_SIZE 257
05430 static void
05431 tr_setup_table(VALUE str, char stable[TR_TABLE_SIZE], int first,
05432                VALUE *tablep, VALUE *ctablep, rb_encoding *enc)
05433 {
05434     const unsigned int errc = -1;
05435     char buf[256];
05436     struct tr tr;
05437     unsigned int c;
05438     VALUE table = 0, ptable = 0;
05439     int i, l, cflag = 0;
05440 
05441     tr.p = RSTRING_PTR(str); tr.pend = tr.p + RSTRING_LEN(str);
05442     tr.gen = tr.now = tr.max = 0;
05443 
05444     if (RSTRING_LEN(str) > 1 && rb_enc_ascget(tr.p, tr.pend, &l, enc) == '^') {
05445         cflag = 1;
05446         tr.p += l;
05447     }
05448     if (first) {
05449         for (i=0; i<256; i++) {
05450             stable[i] = 1;
05451         }
05452         stable[256] = cflag;
05453     }
05454     else if (stable[256] && !cflag) {
05455         stable[256] = 0;
05456     }
05457     for (i=0; i<256; i++) {
05458         buf[i] = cflag;
05459     }
05460 
05461     while ((c = trnext(&tr, enc)) != errc) {
05462         if (c < 256) {
05463             buf[c & 0xff] = !cflag;
05464         }
05465         else {
05466             VALUE key = UINT2NUM(c);
05467 
05468             if (!table && (first || *tablep || stable[256])) {
05469                 if (cflag) {
05470                     ptable = *ctablep;
05471                     table = ptable ? ptable : rb_hash_new();
05472                     *ctablep = table;
05473                 }
05474                 else {
05475                     table = rb_hash_new();
05476                     ptable = *tablep;
05477                     *tablep = table;
05478                 }
05479             }
05480             if (table && (!ptable || (cflag ^ !NIL_P(rb_hash_aref(ptable, key))))) {
05481                 rb_hash_aset(table, key, Qtrue);
05482             }
05483         }
05484     }
05485     for (i=0; i<256; i++) {
05486         stable[i] = stable[i] && buf[i];
05487     }
05488     if (!table && !cflag) {
05489         *tablep = 0;
05490     }
05491 }
05492 
05493 
05494 static int
05495 tr_find(unsigned int c, char table[TR_TABLE_SIZE], VALUE del, VALUE nodel)
05496 {
05497     if (c < 256) {
05498         return table[c] != 0;
05499     }
05500     else {
05501         VALUE v = UINT2NUM(c);
05502 
05503         if (del) {
05504             if (!NIL_P(rb_hash_lookup(del, v)) &&
05505                     (!nodel || NIL_P(rb_hash_lookup(nodel, v)))) {
05506                 return TRUE;
05507             }
05508         }
05509         else if (nodel && !NIL_P(rb_hash_lookup(nodel, v))) {
05510             return FALSE;
05511         }
05512         return table[256] ? TRUE : FALSE;
05513     }
05514 }
05515 
05516 /*
05517  *  call-seq:
05518  *     str.delete!([other_str]+)   -> str or nil
05519  *
05520  *  Performs a <code>delete</code> operation in place, returning <i>str</i>, or
05521  *  <code>nil</code> if <i>str</i> was not modified.
05522  */
05523 
05524 static VALUE
05525 rb_str_delete_bang(int argc, VALUE *argv, VALUE str)
05526 {
05527     char squeez[TR_TABLE_SIZE];
05528     rb_encoding *enc = 0;
05529     char *s, *send, *t;
05530     VALUE del = 0, nodel = 0;
05531     int modify = 0;
05532     int i, ascompat, cr;
05533 
05534     if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
05535     rb_check_arity(argc, 1, UNLIMITED_ARGUMENTS);
05536     for (i=0; i<argc; i++) {
05537         VALUE s = argv[i];
05538 
05539         StringValue(s);
05540         enc = rb_enc_check(str, s);
05541         tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
05542     }
05543 
05544     str_modify_keep_cr(str);
05545     ascompat = rb_enc_asciicompat(enc);
05546     s = t = RSTRING_PTR(str);
05547     send = RSTRING_END(str);
05548     cr = ascompat ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
05549     while (s < send) {
05550         unsigned int c;
05551         int clen;
05552 
05553         if (ascompat && (c = *(unsigned char*)s) < 0x80) {
05554             if (squeez[c]) {
05555                 modify = 1;
05556             }
05557             else {
05558                 if (t != s) *t = c;
05559                 t++;
05560             }
05561             s++;
05562         }
05563         else {
05564             c = rb_enc_codepoint_len(s, send, &clen, enc);
05565 
05566             if (tr_find(c, squeez, del, nodel)) {
05567                 modify = 1;
05568             }
05569             else {
05570                 if (t != s) rb_enc_mbcput(c, t, enc);
05571                 t += clen;
05572                 if (cr == ENC_CODERANGE_7BIT) cr = ENC_CODERANGE_VALID;
05573             }
05574             s += clen;
05575         }
05576     }
05577     *t = '\0';
05578     STR_SET_LEN(str, t - RSTRING_PTR(str));
05579     ENC_CODERANGE_SET(str, cr);
05580 
05581     if (modify) return str;
05582     return Qnil;
05583 }
05584 
05585 
05586 /*
05587  *  call-seq:
05588  *     str.delete([other_str]+)   -> new_str
05589  *
05590  *  Returns a copy of <i>str</i> with all characters in the intersection of its
05591  *  arguments deleted. Uses the same rules for building the set of characters as
05592  *  <code>String#count</code>.
05593  *
05594  *     "hello".delete "l","lo"        #=> "heo"
05595  *     "hello".delete "lo"            #=> "he"
05596  *     "hello".delete "aeiou", "^e"   #=> "hell"
05597  *     "hello".delete "ej-m"          #=> "ho"
05598  */
05599 
05600 static VALUE
05601 rb_str_delete(int argc, VALUE *argv, VALUE str)
05602 {
05603     str = rb_str_dup(str);
05604     rb_str_delete_bang(argc, argv, str);
05605     return str;
05606 }
05607 
05608 
05609 /*
05610  *  call-seq:
05611  *     str.squeeze!([other_str]*)   -> str or nil
05612  *
05613  *  Squeezes <i>str</i> in place, returning either <i>str</i>, or
05614  *  <code>nil</code> if no changes were made.
05615  */
05616 
05617 static VALUE
05618 rb_str_squeeze_bang(int argc, VALUE *argv, VALUE str)
05619 {
05620     char squeez[TR_TABLE_SIZE];
05621     rb_encoding *enc = 0;
05622     VALUE del = 0, nodel = 0;
05623     char *s, *send, *t;
05624     int i, modify = 0;
05625     int ascompat, singlebyte = single_byte_optimizable(str);
05626     unsigned int save;
05627 
05628     if (argc == 0) {
05629         enc = STR_ENC_GET(str);
05630     }
05631     else {
05632         for (i=0; i<argc; i++) {
05633             VALUE s = argv[i];
05634 
05635             StringValue(s);
05636             enc = rb_enc_check(str, s);
05637             if (singlebyte && !single_byte_optimizable(s))
05638                 singlebyte = 0;
05639             tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
05640         }
05641     }
05642 
05643     str_modify_keep_cr(str);
05644     s = t = RSTRING_PTR(str);
05645     if (!s || RSTRING_LEN(str) == 0) return Qnil;
05646     send = RSTRING_END(str);
05647     save = -1;
05648     ascompat = rb_enc_asciicompat(enc);
05649 
05650     if (singlebyte) {
05651         while (s < send) {
05652             unsigned int c = *(unsigned char*)s++;
05653             if (c != save || (argc > 0 && !squeez[c])) {
05654                 *t++ = save = c;
05655             }
05656         }
05657     } else {
05658         while (s < send) {
05659             unsigned int c;
05660             int clen;
05661 
05662             if (ascompat && (c = *(unsigned char*)s) < 0x80) {
05663                 if (c != save || (argc > 0 && !squeez[c])) {
05664                     *t++ = save = c;
05665                 }
05666                 s++;
05667             }
05668             else {
05669                 c = rb_enc_codepoint_len(s, send, &clen, enc);
05670 
05671                 if (c != save || (argc > 0 && !tr_find(c, squeez, del, nodel))) {
05672                     if (t != s) rb_enc_mbcput(c, t, enc);
05673                     save = c;
05674                     t += clen;
05675                 }
05676                 s += clen;
05677             }
05678         }
05679     }
05680 
05681     *t = '\0';
05682     if (t - RSTRING_PTR(str) != RSTRING_LEN(str)) {
05683         STR_SET_LEN(str, t - RSTRING_PTR(str));
05684         modify = 1;
05685     }
05686 
05687     if (modify) return str;
05688     return Qnil;
05689 }
05690 
05691 
05692 /*
05693  *  call-seq:
05694  *     str.squeeze([other_str]*)    -> new_str
05695  *
05696  *  Builds a set of characters from the <i>other_str</i> parameter(s) using the
05697  *  procedure described for <code>String#count</code>. Returns a new string
05698  *  where runs of the same character that occur in this set are replaced by a
05699  *  single character. If no arguments are given, all runs of identical
05700  *  characters are replaced by a single character.
05701  *
05702  *     "yellow moon".squeeze                  #=> "yelow mon"
05703  *     "  now   is  the".squeeze(" ")         #=> " now is the"
05704  *     "putters shoot balls".squeeze("m-z")   #=> "puters shot balls"
05705  */
05706 
05707 static VALUE
05708 rb_str_squeeze(int argc, VALUE *argv, VALUE str)
05709 {
05710     str = rb_str_dup(str);
05711     rb_str_squeeze_bang(argc, argv, str);
05712     return str;
05713 }
05714 
05715 
05716 /*
05717  *  call-seq:
05718  *     str.tr_s!(from_str, to_str)   -> str or nil
05719  *
05720  *  Performs <code>String#tr_s</code> processing on <i>str</i> in place,
05721  *  returning <i>str</i>, or <code>nil</code> if no changes were made.
05722  */
05723 
05724 static VALUE
05725 rb_str_tr_s_bang(VALUE str, VALUE src, VALUE repl)
05726 {
05727     return tr_trans(str, src, repl, 1);
05728 }
05729 
05730 
05731 /*
05732  *  call-seq:
05733  *     str.tr_s(from_str, to_str)   -> new_str
05734  *
05735  *  Processes a copy of <i>str</i> as described under <code>String#tr</code>,
05736  *  then removes duplicate characters in regions that were affected by the
05737  *  translation.
05738  *
05739  *     "hello".tr_s('l', 'r')     #=> "hero"
05740  *     "hello".tr_s('el', '*')    #=> "h*o"
05741  *     "hello".tr_s('el', 'hx')   #=> "hhxo"
05742  */
05743 
05744 static VALUE
05745 rb_str_tr_s(VALUE str, VALUE src, VALUE repl)
05746 {
05747     str = rb_str_dup(str);
05748     tr_trans(str, src, repl, 1);
05749     return str;
05750 }
05751 
05752 
05753 /*
05754  *  call-seq:
05755  *     str.count([other_str]+)   -> fixnum
05756  *
05757  *  Each +other_str+ parameter defines a set of characters to count.  The
05758  *  intersection of these sets defines the characters to count in +str+.  Any
05759  *  +other_str+ that starts with a caret <code>^</code> is negated.  The
05760  *  sequence <code>c1-c2</code> means all characters between c1 and c2.  The
05761  *  backslash character <code></code> can be used to escape <code>^</code> or
05762  *  <code>-</code> and is otherwise ignored unless it appears at the end of a
05763  *  sequence or the end of a +other_str+.
05764  *
05765  *     a = "hello world"
05766  *     a.count "lo"                   #=> 5
05767  *     a.count "lo", "o"              #=> 2
05768  *     a.count "hello", "^l"          #=> 4
05769  *     a.count "ej-m"                 #=> 4
05770  *
05771  *     "hello^world".count "\\^aeiou" #=> 4
05772  *     "hello-world".count "a\\-eo"   #=> 4
05773  *
05774  *     c = "hello world\\r\\n"
05775  *     c.count "\\"                   #=> 2
05776  *     c.count "\\A"                  #=> 0
05777  *     c.count "X-\\w"                #=> 3
05778  */
05779 
05780 static VALUE
05781 rb_str_count(int argc, VALUE *argv, VALUE str)
05782 {
05783     char table[TR_TABLE_SIZE];
05784     rb_encoding *enc = 0;
05785     VALUE del = 0, nodel = 0;
05786     char *s, *send;
05787     int i;
05788     int ascompat;
05789 
05790     rb_check_arity(argc, 1, UNLIMITED_ARGUMENTS);
05791     for (i=0; i<argc; i++) {
05792         VALUE tstr = argv[i];
05793         unsigned char c;
05794 
05795         StringValue(tstr);
05796         enc = rb_enc_check(str, tstr);
05797         if (argc == 1 && RSTRING_LEN(tstr) == 1 && rb_enc_asciicompat(enc) &&
05798             (c = RSTRING_PTR(tstr)[0]) < 0x80 && !is_broken_string(str)) {
05799             int n = 0;
05800 
05801             s = RSTRING_PTR(str);
05802             if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
05803             send = RSTRING_END(str);
05804             while (s < send) {
05805                 if (*(unsigned char*)s++ == c) n++;
05806             }
05807             return INT2NUM(n);
05808         }
05809         tr_setup_table(tstr, table, i==0, &del, &nodel, enc);
05810     }
05811 
05812     s = RSTRING_PTR(str);
05813     if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
05814     send = RSTRING_END(str);
05815     ascompat = rb_enc_asciicompat(enc);
05816     i = 0;
05817     while (s < send) {
05818         unsigned int c;
05819 
05820         if (ascompat && (c = *(unsigned char*)s) < 0x80) {
05821             if (table[c]) {
05822                 i++;
05823             }
05824             s++;
05825         }
05826         else {
05827             int clen;
05828             c = rb_enc_codepoint_len(s, send, &clen, enc);
05829             if (tr_find(c, table, del, nodel)) {
05830                 i++;
05831             }
05832             s += clen;
05833         }
05834     }
05835 
05836     return INT2NUM(i);
05837 }
05838 
/*
 * Lookup table indexed by byte value: 1 for the ASCII whitespace bytes
 * (tab, newline, vertical tab, form feed, carriage return, space) and 0
 * for every other byte.
 */
static const char isspacetable[256] = {
    /* 0x00-0x08 */ 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* \t \n \v \f \r */ 1, 1, 1, 1, 1,
    /* 0x0e-0x1f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* space */ 1
    /* 0x21-0xff: the remaining elements are implicitly zero */
};

/* Nonzero when the byte value of c is one of the whitespace bytes above. */
#define ascii_isspace(c) (isspacetable[(unsigned char)(c)])
05859 
05860 /*
05861  *  call-seq:
05862  *     str.split(pattern=$;, [limit])   -> anArray
05863  *
05864  *  Divides <i>str</i> into substrings based on a delimiter, returning an array
05865  *  of these substrings.
05866  *
05867  *  If <i>pattern</i> is a <code>String</code>, then its contents are used as
05868  *  the delimiter when splitting <i>str</i>. If <i>pattern</i> is a single
05869  *  space, <i>str</i> is split on whitespace, with leading whitespace and runs
05870  *  of contiguous whitespace characters ignored.
05871  *
05872  *  If <i>pattern</i> is a <code>Regexp</code>, <i>str</i> is divided where the
05873  *  pattern matches. Whenever the pattern matches a zero-length string,
05874  *  <i>str</i> is split into individual characters. If <i>pattern</i> contains
05875  *  groups, the respective matches will be returned in the array as well.
05876  *
05877  *  If <i>pattern</i> is omitted, the value of <code>$;</code> is used.  If
05878  *  <code>$;</code> is <code>nil</code> (which is the default), <i>str</i> is
05879  *  split on whitespace as if ` ' were specified.
05880  *
05881  *  If the <i>limit</i> parameter is omitted, trailing null fields are
05882  *  suppressed. If <i>limit</i> is a positive number, at most that number of
05883  *  fields will be returned (if <i>limit</i> is <code>1</code>, the entire
05884  *  string is returned as the only entry in an array). If negative, there is no
05885  *  limit to the number of fields returned, and trailing null fields are not
05886  *  suppressed.
05887  *
05888  *  When the input +str+ is empty an empty Array is returned as the string is
05889  *  considered to have no fields to split.
05890  *
05891  *     " now's  the time".split        #=> ["now's", "the", "time"]
05892  *     " now's  the time".split(' ')   #=> ["now's", "the", "time"]
05893  *     " now's  the time".split(/ /)   #=> ["", "now's", "", "the", "time"]
05894  *     "1, 2.34,56, 7".split(%r{,\s*}) #=> ["1", "2.34", "56", "7"]
05895  *     "hello".split(//)               #=> ["h", "e", "l", "l", "o"]
05896  *     "hello".split(//, 3)            #=> ["h", "e", "llo"]
05897  *     "hi mom".split(%r{\s*})         #=> ["h", "i", "m", "o", "m"]
05898  *
05899  *     "mellow yellow".split("ello")   #=> ["m", "w y", "w"]
05900  *     "1,2,,3,4,,".split(',')         #=> ["1", "2", "", "3", "4"]
05901  *     "1,2,,3,4,,".split(',', 4)      #=> ["1", "2", "", "3,4,,"]
05902  *     "1,2,,3,4,,".split(',', -4)     #=> ["1", "2", "", "3", "4", "", ""]
05903  *
05904  *     "".split(',', -1)               #=> []
05905  */
05906 
05907 static VALUE
05908 rb_str_split_m(int argc, VALUE *argv, VALUE str)
05909 {
          /*
           * Body of String#split: argv holds the optional pattern and limit.
           * Returns a new Array of substrings of str.  One of three strategies
           * is selected below and recorded in split_type:
           *   awk    - split on runs of whitespace, ignoring leading whitespace
           *   string - split on a literal separator located with rb_memsearch()
           *   regexp - split on matches found with rb_reg_search()
           * beg and end are byte offsets into str.  i tracks how many fields
           * have been accounted for and is compared with lim only while limit
           * is non-nil (i.e. a positive limit was given).
           */
05910     rb_encoding *enc;
05911     VALUE spat;
05912     VALUE limit;
05913     enum {awk, string, regexp} split_type;
05914     long beg, end, i = 0;
05915     int lim = 0;
05916     VALUE result, tmp;
05917 
05918     if (rb_scan_args(argc, argv, "02", &spat, &limit) == 2) {
05919         lim = NUM2INT(limit);
05920         if (lim <= 0) limit = Qnil; /* zero or negative: no cap on the number of fields */
05921         else if (lim == 1) {
              /* A limit of 1 means "do not split": return [str], or [] for an empty str. */
05922             if (RSTRING_LEN(str) == 0)
05923                 return rb_ary_new2(0);
05924             return rb_ary_new3(1, str);
05925         }
05926         i = 1;
05927     }
05928 
05929     enc = STR_ENC_GET(str);
05930     if (NIL_P(spat)) {
          /* No pattern given: use rb_fs when it is set, otherwise split awk-style. */
05931         if (!NIL_P(rb_fs)) {
05932             spat = rb_fs;
05933             goto fs_set;
05934         }
05935         split_type = awk;
05936     }
05937     else {
05938       fs_set:
05939         if (RB_TYPE_P(spat, T_STRING)) {
05940             rb_encoding *enc2 = STR_ENC_GET(spat);
05941 
05942             split_type = string;
05943             if (RSTRING_LEN(spat) == 0) {
05944                 /* Special case - split into chars */
05945                 spat = rb_reg_regcomp(spat);
05946                 split_type = regexp;
05947             }
05948             else if (rb_enc_asciicompat(enc2) == 1) {
                  /* A pattern that is exactly one space byte selects awk mode. */
05949                 if (RSTRING_LEN(spat) == 1 && RSTRING_PTR(spat)[0] == ' '){
05950                     split_type = awk;
05951                 }
05952             }
05953             else {
                  /* Non-ASCII-compatible encoding: awk mode when the pattern is a single ' ' character of l bytes. */
05954                 int l;
05955                 if (rb_enc_ascget(RSTRING_PTR(spat), RSTRING_END(spat), &l, enc2) == ' ' &&
05956                     RSTRING_LEN(spat) == l) {
05957                     split_type = awk;
05958                 }
05959             }
05960         }
05961         else {
05962             spat = get_pat(spat, 1);
05963             split_type = regexp;
05964         }
05965     }
05966 
05967     result = rb_ary_new();
05968     beg = 0;
05969     if (split_type == awk) {
05970         char *ptr = RSTRING_PTR(str);
05971         char *eptr = RSTRING_END(str);
05972         char *bptr = ptr;
05973         int skip = 1; /* nonzero while consuming whitespace that precedes a field */
05974         unsigned int c;
05975 
05976         end = beg;
05977         if (is_ascii_string(str)) {
              /* ASCII-only receiver: classify each byte with the ascii_isspace() table. */
05978             while (ptr < eptr) {
05979                 c = (unsigned char)*ptr++;
05980                 if (skip) {
05981                     if (ascii_isspace(c)) {
05982                         beg = ptr - bptr;
05983                     }
05984                     else {
05985                         end = ptr - bptr;
05986                         skip = 0;
                          /* Limit reached at the start of a field: stop; the remainder from beg is pushed after the loops. */
05987                         if (!NIL_P(limit) && lim <= i) break;
05988                     }
05989                 }
05990                 else if (ascii_isspace(c)) {
                      /* Whitespace ends the current field [beg, end). */
05991                     rb_ary_push(result, rb_str_subseq(str, beg, end-beg));
05992                     skip = 1;
05993                     beg = ptr - bptr;
05994                     if (!NIL_P(limit)) ++i;
05995                 }
05996                 else {
05997                     end = ptr - bptr;
05998                 }
05999             }
06000         }
06001         else {
              /* Same state machine as above, but each character is decoded with rb_enc_codepoint_len() and tested with rb_isspace(). */
06002             while (ptr < eptr) {
06003                 int n;
06004 
06005                 c = rb_enc_codepoint_len(ptr, eptr, &n, enc);
06006                 ptr += n;
06007                 if (skip) {
06008                     if (rb_isspace(c)) {
06009                         beg = ptr - bptr;
06010                     }
06011                     else {
06012                         end = ptr - bptr;
06013                         skip = 0;
06014                         if (!NIL_P(limit) && lim <= i) break;
06015                     }
06016                 }
06017                 else if (rb_isspace(c)) {
06018                     rb_ary_push(result, rb_str_subseq(str, beg, end-beg));
06019                     skip = 1;
06020                     beg = ptr - bptr;
06021                     if (!NIL_P(limit)) ++i;
06022                 }
06023                 else {
06024                     end = ptr - bptr;
06025                 }
06026             }
06027         }
06028     }
06029     else if (split_type == string) {
06030         char *ptr = RSTRING_PTR(str);
06031         char *temp = ptr; /* start of the buffer, used to turn ptr back into a byte offset */
06032         char *eptr = RSTRING_END(str);
06033         char *sptr = RSTRING_PTR(spat);
06034         long slen = RSTRING_LEN(spat);
06035 
          /* Both the receiver and the separator must be validly encoded; otherwise ArgumentError is raised. */
06036         if (is_broken_string(str)) {
06037             rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(str)));
06038         }
06039         if (is_broken_string(spat)) {
06040             rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(spat)));
06041         }
06042         enc = rb_enc_check(str, spat);
          /* Here end is the offset of the next separator relative to ptr (negative ends the loop). */
06043         while (ptr < eptr &&
06044                (end = rb_memsearch(sptr, slen, ptr, eptr - ptr, enc)) >= 0) {
06045             /* Check we are at the start of a char */
06046             char *t = rb_enc_right_char_head(ptr, ptr + end, eptr, enc);
06047             if (t != ptr + end) {
                  /* The hit is not at a character head: resume the search from t. */
06048                 ptr = t;
06049                 continue;
06050             }
06051             rb_ary_push(result, rb_str_subseq(str, ptr - temp, end));
06052             ptr += end + slen;
06053             if (!NIL_P(limit) && lim <= ++i) break;
06054         }
06055         beg = ptr - temp;
06056     }
06057     else {
06058         char *ptr = RSTRING_PTR(str);
06059         long len = RSTRING_LEN(str);
06060         long start = beg; /* offset at which the next regexp search begins */
06061         long idx;
06062         int last_null = 0; /* set after an empty match forced start forward by one character */
06063         struct re_registers *regs;
06064 
06065         while ((end = rb_reg_search(spat, str, start, 0)) >= 0) {
06066             regs = RMATCH_REGS(rb_backref_get());
06067             if (start == end && BEG(0) == END(0)) {
                  /* Empty match exactly at the search position. */
06068                 if (!ptr) {
06069                     rb_ary_push(result, str_new_empty(str));
06070                     break;
06071                 }
06072                 else if (last_null == 1) {
                      /* Second empty match in a row: emit the single character at beg as a field. */
06073                     rb_ary_push(result, rb_str_subseq(str, beg,
06074                                                       rb_enc_fast_mbclen(ptr+beg,
06075                                                                          ptr+len,
06076                                                                          enc)));
06077                     beg = start;
06078                 }
06079                 else {
                      /* First empty match: step start past one character (or one byte at the end of str) and search again. */
06080                     if (ptr+start == ptr+len)
06081                         start++;
06082                     else
06083                         start += rb_enc_fast_mbclen(ptr+start,ptr+len,enc);
06084                     last_null = 1;
06085                     continue;
06086                 }
06087             }
06088             else {
                  /* Ordinary match: the field is [beg, end); continue after the match. */
06089                 rb_ary_push(result, rb_str_subseq(str, beg, end-beg));
06090                 beg = start = END(0);
06091             }
06092             last_null = 0;
06093 
              /* Append the text of every capture group; groups whose BEG is -1 are skipped. */
06094             for (idx=1; idx < regs->num_regs; idx++) {
06095                 if (BEG(idx) == -1) continue;
06096                 if (BEG(idx) == END(idx))
06097                     tmp = str_new_empty(str);
06098                 else
06099                     tmp = rb_str_subseq(str, BEG(idx), END(idx)-BEG(idx));
06100                 rb_ary_push(result, tmp);
06101             }
06102             if (!NIL_P(limit) && lim <= ++i) break;
06103         }
06104     }
      /* Push whatever follows the last separator (possibly an empty string when a limit or negative lim applies). */
06105     if (RSTRING_LEN(str) > 0 && (!NIL_P(limit) || RSTRING_LEN(str) > beg || lim < 0)) {
06106         if (RSTRING_LEN(str) == beg)
06107             tmp = str_new_empty(str);
06108         else
06109             tmp = rb_str_subseq(str, beg, RSTRING_LEN(str)-beg);
06110         rb_ary_push(result, tmp);
06111     }
      /* With limit nil and lim == 0 (limit omitted or given as 0), drop trailing empty fields. */
06112     if (NIL_P(limit) && lim == 0) {
06113         long len;
06114         while ((len = RARRAY_LEN(result)) > 0 &&
06115                (tmp = RARRAY_PTR(result)[len-1], RSTRING_LEN(tmp) == 0))
06116             rb_ary_pop(result);
06117     }
06118 
06119     return result;
06120 }
06121 
06122 VALUE
06123 rb_str_split(VALUE str, const char *sep0)
06124 {
06125     VALUE sep;
06126 
06127     StringValue(str);
06128     sep = rb_str_new2(sep0);
06129     return rb_str_split_m(1, &sep, str);
06130 }
06131 
06132 
06133 static VALUE
06134 rb_str_enumerate_lines(int argc, VALUE *argv, VALUE str, int wantarray)
06135 {
          /*
           * Shared body of rb_str_each_line() (wantarray == 0) and
           * rb_str_lines() (wantarray != 0).  Splits str on the record
           * separator (argv[0], or rb_rs when no argument is given) and either
           * pushes each line onto a new array or yields it to the block.
           * Returns the array when collecting, otherwise the original receiver.
           */
06136     rb_encoding *enc;
06137     VALUE rs;
06138     unsigned int newline;
06139     const char *p, *pend, *s, *ptr; /* p: scan position, s: start of the current line, ptr: start of the buffer */
06140     long len, rslen;
06141     VALUE line;
06142     int n;
06143     VALUE orig = str; /* kept because str is replaced by a copy further down */
06144     VALUE UNINITIALIZED_VAR(ary);
06145 
06146     if (argc == 0) {
06147         rs = rb_rs;
06148     }
06149     else {
06150         rb_scan_args(argc, argv, "01", &rs);
06151     }
06152 
06153     if (rb_block_given_p()) {
06154         if (wantarray) {
              /* Array form called with a block: warn and fall back to yielding. */
06155 #if 0 /* next major */
06156             rb_warn("given block not used");
06157             ary = rb_ary_new();
06158 #else
06159             rb_warning("passing a block to String#lines is deprecated");
06160             wantarray = 0;
06161 #endif
06162         }
06163     }
06164     else {
06165         if (wantarray)
06166             ary = rb_ary_new();
06167         else
06168             RETURN_ENUMERATOR(str, argc, argv);
06169     }
06170 
06171     if (NIL_P(rs)) {
          /* A nil separator means the whole string is a single line. */
06172         if (wantarray) {
06173             rb_ary_push(ary, str);
06174             return ary;
06175         }
06176         else {
06177             rb_yield(str);
06178             return orig;
06179         }
06180     }
06181     str = rb_str_new4(str); /* NOTE(review): presumably a frozen snapshot so the block cannot disturb the scan - confirm */
06182     ptr = p = s = RSTRING_PTR(str);
06183     pend = p + RSTRING_LEN(str);
06184     len = RSTRING_LEN(str);
06185     StringValue(rs);
06186     if (rs == rb_default_rs) {
          /* Default separator: find '\n' bytes with memchr(), then confirm each one is a real newline character in enc. */
06187         enc = rb_enc_get(str);
06188         while (p < pend) {
06189             char *p0;
06190 
06191             p = memchr(p, '\n', pend - p);
06192             if (!p) break;
06193             p0 = rb_enc_left_char_head(s, p, pend, enc);
06194             if (!rb_enc_is_newline(p0, pend, enc)) {
                  /* The byte belongs to some other character: keep scanning. */
06195                 p++;
06196                 continue;
06197             }
06198             p = p0 + rb_enc_mbclen(p0, pend, enc);
06199             line = rb_str_subseq(str, s - ptr, p - s);
06200             if (wantarray)
06201                 rb_ary_push(ary, line);
06202             else
06203                 rb_yield(line);
06204             str_mod_check(str, ptr, len); /* NOTE(review): presumably raises if the buffer changed during the yield - confirm */
06205             s = p;
06206         }
06207         goto finish;
06208     }
06209 
06210     enc = rb_enc_check(str, rs);
06211     rslen = RSTRING_LEN(rs);
06212     if (rslen == 0) {
          /* Empty separator: split on runs of '\n' (paragraph mode, per the each_line documentation). */
06213         newline = '\n';
06214     }
06215     else {
          /* Otherwise the first codepoint of rs is used as a cheap pre-check before the memcmp() below. */
06216         newline = rb_enc_codepoint(RSTRING_PTR(rs), RSTRING_END(rs), enc);
06217     }
06218 
06219     while (p < pend) {
06220         unsigned int c = rb_enc_codepoint_len(p, pend, &n, enc);
06221 
06222       again:
06223         if (rslen == 0 && c == newline) {
              /*
               * Paragraph mode: look at the character after this newline.  If
               * it is not a newline, resume with that character (a single
               * newline does not end a paragraph).  Otherwise swallow the whole
               * run of newlines and step back by n so that p + n is the end
               * of the run.
               */
06224             p += n;
06225             if (p < pend && (c = rb_enc_codepoint_len(p, pend, &n, enc)) != newline) {
06226                 goto again;
06227             }
06228             while (p < pend && rb_enc_codepoint(p, pend, enc) == newline) {
06229                 p += n;
06230             }
06231             p -= n;
06232         }
          /* A line ends here when c matches and, for separators longer than one byte, the bytes at p equal rs. */
06233         if (c == newline &&
06234             (rslen <= 1 ||
06235              (pend - p >= rslen && memcmp(RSTRING_PTR(rs), p, rslen) == 0))) {
06236             const char *pp = p + (rslen ? rslen : n); /* first byte after the separator */
06237             line = rb_str_subseq(str, s - ptr, pp - s);
06238             if (wantarray)
06239                 rb_ary_push(ary, line);
06240             else
06241                 rb_yield(line);
06242             str_mod_check(str, ptr, len);
06243             s = pp;
06244         }
06245         p += n;
06246     }
06247 
06248   finish:
      /* Emit the trailing piece that is not terminated by a separator, if any. */
06249     if (s != pend) {
06250         line = rb_str_subseq(str, s - ptr, pend - s);
06251         if (wantarray)
06252             rb_ary_push(ary, line);
06253         else
06254             rb_yield(line);
06255         RB_GC_GUARD(str);
06256     }
06257 
06258     if (wantarray)
06259         return ary;
06260     else
06261         return orig;
06262 }
06263 
06264 /*
06265  *  call-seq:
06266  *     str.each_line(separator=$/) {|substr| block }   -> str
06267  *     str.each_line(separator=$/)                     -> an_enumerator
06268  *
06269  *  Splits <i>str</i> using the supplied parameter as the record
06270  *  separator (<code>$/</code> by default), passing each substring in
06271  *  turn to the supplied block.  If a zero-length record separator is
06272  *  supplied, the string is split into paragraphs delimited by
06273  *  multiple successive newlines.
06274  *
06275  *  If no block is given, an enumerator is returned instead.
06276  *
06277  *     print "Example one\n"
06278  *     "hello\nworld".each_line {|s| p s}
06279  *     print "Example two\n"
06280  *     "hello\nworld".each_line('l') {|s| p s}
06281  *     print "Example three\n"
06282  *     "hello\n\n\nworld".each_line('') {|s| p s}
06283  *
06284  *  <em>produces:</em>
06285  *
06286  *     Example one
06287  *     "hello\n"
06288  *     "world"
06289  *     Example two
06290  *     "hel"
06291  *     "l"
06292  *     "o\nworl"
06293  *     "d"
06294  *     Example three
06295  *     "hello\n\n\n"
06296  *     "world"
06297  */
06298 
06299 static VALUE
06300 rb_str_each_line(int argc, VALUE *argv, VALUE str)
06301 {
06302     return rb_str_enumerate_lines(argc, argv, str, 0);
06303 }
06304 
06305 /*
06306  *  call-seq:
06307  *     str.lines(separator=$/)  -> an_array
06308  *
06309  *  Returns an array of lines in <i>str</i> split using the supplied
06310  *  record separator (<code>$/</code> by default).  This is a
06311  *  shorthand for <code>str.each_line(separator).to_a</code>.
06312  *
06313  *  If a block is given, which is a deprecated form, works the same as
06314  *  <code>each_line</code>.
06315  */
06316 
06317 static VALUE
06318 rb_str_lines(int argc, VALUE *argv, VALUE str)
06319 {
06320     return rb_str_enumerate_lines(argc, argv, str, 1);
06321 }
06322 
06323 static VALUE
06324 rb_str_each_byte_size(VALUE str, VALUE args)
06325 {
06326     return LONG2FIX(RSTRING_LEN(str));
06327 }
06328 
06329 static VALUE
06330 rb_str_enumerate_bytes(VALUE str, int wantarray)
06331 {
06332     long i;
06333     VALUE UNINITIALIZED_VAR(ary);
06334 
06335     if (rb_block_given_p()) {
06336         if (wantarray) {
06337 #if 0 /* next major */
06338             rb_warn("given block not used");
06339             ary = rb_ary_new();
06340 #else
06341             rb_warning("passing a block to String#bytes is deprecated");
06342             wantarray = 0;
06343 #endif
06344         }
06345     }
06346     else {
06347         if (wantarray)
06348             ary = rb_ary_new2(RSTRING_LEN(str));
06349         else
06350             RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_byte_size);
06351     }
06352 
06353     for (i=0; i<RSTRING_LEN(str); i++) {
06354         if (wantarray)
06355             rb_ary_push(ary, INT2FIX(RSTRING_PTR(str)[i] & 0xff));
06356         else
06357             rb_yield(INT2FIX(RSTRING_PTR(str)[i] & 0xff));
06358     }
06359     if (wantarray)
06360         return ary;
06361     else
06362         return str;
06363 }
06364 
06365 /*
06366  *  call-seq:
06367  *     str.each_byte {|fixnum| block }    -> str
06368  *     str.each_byte                      -> an_enumerator
06369  *
06370  *  Passes each byte in <i>str</i> to the given block, or returns an
06371  *  enumerator if no block is given.
06372  *
06373  *     "hello".each_byte {|c| print c, ' ' }
06374  *
06375  *  <em>produces:</em>
06376  *
06377  *     104 101 108 108 111
06378  */
06379 
06380 static VALUE
06381 rb_str_each_byte(VALUE str)
06382 {
06383     return rb_str_enumerate_bytes(str, 0);
06384 }
06385 
06386 /*
06387  *  call-seq:
06388  *     str.bytes    -> an_array
06389  *
06390  *  Returns an array of bytes in <i>str</i>.  This is a shorthand for
06391  *  <code>str.each_byte.to_a</code>.
06392  *
06393  *  If a block is given, which is a deprecated form, works the same as
06394  *  <code>each_byte</code>.
06395  */
06396 
06397 static VALUE
06398 rb_str_bytes(VALUE str)
06399 {
06400     return rb_str_enumerate_bytes(str, 1);
06401 }
06402 
06403 static VALUE
06404 rb_str_each_char_size(VALUE str)
06405 {
06406     long len = RSTRING_LEN(str);
06407     if (!single_byte_optimizable(str)) {
06408         const char *ptr = RSTRING_PTR(str);
06409         rb_encoding *enc = rb_enc_get(str);
06410         const char *end_ptr = ptr + len;
06411         for (len = 0; ptr < end_ptr; ++len) {
06412             ptr += rb_enc_mbclen(ptr, end_ptr, enc);
06413         }
06414     }
06415     return LONG2FIX(len);
06416 }
06417 
06418 static VALUE
06419 rb_str_enumerate_chars(VALUE str, int wantarray)
06420 {
06421     VALUE orig = str;
06422     VALUE substr;
06423     long i, len, n;
06424     const char *ptr;
06425     rb_encoding *enc;
06426     VALUE UNINITIALIZED_VAR(ary);
06427 
06428     if (rb_block_given_p()) {
06429         if (wantarray) {
06430 #if 0 /* next major */
06431             rb_warn("given block not used");
06432             ary = rb_ary_new();
06433 #else
06434             rb_warning("passing a block to String#chars is deprecated");
06435             wantarray = 0;
06436 #endif
06437         }
06438     }
06439     else {
06440         if (wantarray)
06441             ary = rb_ary_new();
06442         else
06443             RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
06444     }
06445 
06446     str = rb_str_new4(str);
06447     ptr = RSTRING_PTR(str);
06448     len = RSTRING_LEN(str);
06449     enc = rb_enc_get(str);
06450     switch (ENC_CODERANGE(str)) {
06451       case ENC_CODERANGE_VALID:
06452       case ENC_CODERANGE_7BIT:
06453         for (i = 0; i < len; i += n) {
06454             n = rb_enc_fast_mbclen(ptr + i, ptr + len, enc);
06455             substr = rb_str_subseq(str, i, n);
06456             if (wantarray)
06457                 rb_ary_push(ary, substr);
06458             else
06459                 rb_yield(substr);
06460         }
06461         break;
06462       default:
06463         for (i = 0; i < len; i += n) {
06464             n = rb_enc_mbclen(ptr + i, ptr + len, enc);
06465             substr = rb_str_subseq(str, i, n);
06466             if (wantarray)
06467                 rb_ary_push(ary, substr);
06468             else
06469                 rb_yield(substr);
06470         }
06471     }
06472     RB_GC_GUARD(str);
06473     if (wantarray)
06474         return ary;
06475     else
06476         return orig;
06477 }
06478 
06479 /*
06480  *  call-seq:
06481  *     str.each_char {|cstr| block }    -> str
06482  *     str.each_char                    -> an_enumerator
06483  *
06484  *  Passes each character in <i>str</i> to the given block, or returns
06485  *  an enumerator if no block is given.
06486  *
06487  *     "hello".each_char {|c| print c, ' ' }
06488  *
06489  *  <em>produces:</em>
06490  *
06491  *     h e l l o
06492  */
06493 
06494 static VALUE
06495 rb_str_each_char(VALUE str)
06496 {
06497     return rb_str_enumerate_chars(str, 0);
06498 }
06499 
06500 /*
06501  *  call-seq:
06502  *     str.chars    -> an_array
06503  *
06504  *  Returns an array of characters in <i>str</i>.  This is a shorthand
06505  *  for <code>str.each_char.to_a</code>.
06506  *
06507  *  If a block is given, which is a deprecated form, works the same as
06508  *  <code>each_char</code>.
06509  */
06510 
06511 static VALUE
06512 rb_str_chars(VALUE str)
06513 {
06514     return rb_str_enumerate_chars(str, 1);
06515 }
06516 
06517 
06518 static VALUE
06519 rb_str_enumerate_codepoints(VALUE str, int wantarray)
06520 {
06521     VALUE orig = str;
06522     int n;
06523     unsigned int c;
06524     const char *ptr, *end;
06525     rb_encoding *enc;
06526     VALUE UNINITIALIZED_VAR(ary);
06527 
06528     if (single_byte_optimizable(str))
06529         return rb_str_enumerate_bytes(str, wantarray);
06530 
06531     if (rb_block_given_p()) {
06532         if (wantarray) {
06533 #if 0 /* next major */
06534             rb_warn("given block not used");
06535             ary = rb_ary_new();
06536 #else
06537             rb_warning("passing a block to String#codepoints is deprecated");
06538             wantarray = 0;
06539 #endif
06540         }
06541     }
06542     else {
06543         if (wantarray)
06544             ary = rb_ary_new();
06545         else
06546             RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
06547     }
06548 
06549     str = rb_str_new4(str);
06550     ptr = RSTRING_PTR(str);
06551     end = RSTRING_END(str);
06552     enc = STR_ENC_GET(str);
06553     while (ptr < end) {
06554         c = rb_enc_codepoint_len(ptr, end, &n, enc);
06555         if (wantarray)
06556             rb_ary_push(ary, UINT2NUM(c));
06557         else
06558             rb_yield(UINT2NUM(c));
06559         ptr += n;
06560     }
06561     RB_GC_GUARD(str);
06562     if (wantarray)
06563         return ary;
06564     else
06565         return orig;
06566 }
06567 
06568 /*
06569  *  call-seq:
06570  *     str.each_codepoint {|integer| block }    -> str
06571  *     str.each_codepoint                       -> an_enumerator
06572  *
06573  *  Passes the <code>Integer</code> ordinal of each character in <i>str</i>
06574  *  (also known as a <i>codepoint</i> when applied to Unicode strings) to the
06575  *  given block.
06576  *
06577  *  If no block is given, an enumerator is returned instead.
06578  *
06579  *     "hello\u0639".each_codepoint {|c| print c, ' ' }
06580  *
06581  *  <em>produces:</em>
06582  *
06583  *     104 101 108 108 111 1593
06584  */
06585 
06586 static VALUE
06587 rb_str_each_codepoint(VALUE str)
06588 {
06589     return rb_str_enumerate_codepoints(str, 0);
06590 }
06591 
06592 /*
06593  *  call-seq:
06594  *     str.codepoints   -> an_array
06595  *
06596  *  Returns an array of the <code>Integer</code> ordinals of the
06597  *  characters in <i>str</i>.  This is a shorthand for
06598  *  <code>str.each_codepoint.to_a</code>.
06599  *
06600  *  If a block is given, which is a deprecated form, works the same as
06601  *  <code>each_codepoint</code>.
06602  */
06603 
06604 static VALUE
06605 rb_str_codepoints(VALUE str)
06606 {
06607     return rb_str_enumerate_codepoints(str, 1);
06608 }
06609 
06610 
06611 static long
06612 chopped_length(VALUE str)
06613 {
06614     rb_encoding *enc = STR_ENC_GET(str);
06615     const char *p, *p2, *beg, *end;
06616 
06617     beg = RSTRING_PTR(str);
06618     end = beg + RSTRING_LEN(str);
06619     if (beg > end) return 0;
06620     p = rb_enc_prev_char(beg, end, end, enc);
06621     if (!p) return 0;
06622     if (p > beg && rb_enc_ascget(p, end, 0, enc) == '\n') {
06623         p2 = rb_enc_prev_char(beg, p, end, enc);
06624         if (p2 && rb_enc_ascget(p2, end, 0, enc) == '\r') p = p2;
06625     }
06626     return p - beg;
06627 }
06628 
06629 /*
06630  *  call-seq:
06631  *     str.chop!   -> str or nil
06632  *
06633  *  Processes <i>str</i> as for <code>String#chop</code>, returning <i>str</i>,
06634  *  or <code>nil</code> if <i>str</i> is the empty string.  See also
06635  *  <code>String#chomp!</code>.
06636  */
06637 
06638 static VALUE
06639 rb_str_chop_bang(VALUE str)
06640 {
06641     str_modify_keep_cr(str);
06642     if (RSTRING_LEN(str) > 0) {
06643         long len;
06644         len = chopped_length(str);
06645         STR_SET_LEN(str, len);
06646         RSTRING_PTR(str)[len] = '\0';
06647         if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
06648             ENC_CODERANGE_CLEAR(str);
06649         }
06650         return str;
06651     }
06652     return Qnil;
06653 }
06654 
06655 
06656 /*
06657  *  call-seq:
06658  *     str.chop   -> new_str
06659  *
06660  *  Returns a new <code>String</code> with the last character removed.  If the
06661  *  string ends with <code>\r\n</code>, both characters are removed. Applying
06662  *  <code>chop</code> to an empty string returns an empty
06663  *  string. <code>String#chomp</code> is often a safer alternative, as it leaves
06664  *  the string unchanged if it doesn't end in a record separator.
06665  *
06666  *     "string\r\n".chop   #=> "string"
06667  *     "string\n\r".chop   #=> "string\n"
06668  *     "string\n".chop     #=> "string"
06669  *     "string".chop       #=> "strin"
06670  *     "x".chop.chop       #=> ""
06671  */
06672 
06673 static VALUE
06674 rb_str_chop(VALUE str)
06675 {
06676     return rb_str_subseq(str, 0, chopped_length(str));
06677 }
06678 
06679 
06680 /*
06681  *  call-seq:
06682  *     str.chomp!(separator=$/)   -> str or nil
06683  *
06684  *  Modifies <i>str</i> in place as described for <code>String#chomp</code>,
06685  *  returning <i>str</i>, or <code>nil</code> if no modifications were made.
06686  */
06687 
06688 static VALUE
06689 rb_str_chomp_bang(int argc, VALUE *argv, VALUE str)
06690 {
          /*
           * Body of String#chomp!: removes a trailing record separator from str
           * in place.  The separator is argv[0], or rb_rs when no argument is
           * given.  Returns str when something was removed and nil otherwise.
           */
06691     rb_encoding *enc;
06692     VALUE rs;
06693     int newline;
06694     char *p, *pp, *e; /* p: start of the buffer, e: end of the kept part, pp: scratch position */
06695     long len, rslen;
06696 
06697     str_modify_keep_cr(str);
06698     len = RSTRING_LEN(str);
06699     if (len == 0) return Qnil;
06700     p = RSTRING_PTR(str);
06701     e = p + len;
06702     if (argc == 0) {
06703         rs = rb_rs;
06704         if (rs == rb_default_rs) {
06705           smart_chomp:
              /* Default separator (also reached for an explicit one-byte "\n"): strip a trailing "\n", "\r\n" or "\r". */
06706             enc = rb_enc_get(str);
06707             if (rb_enc_mbminlen(enc) > 1) {
                  /* Encodings whose smallest character is wider than one byte: work on whole characters. */
06708                 pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
06709                 if (rb_enc_is_newline(pp, e, enc)) {
06710                     e = pp;
06711                 }
06712                 pp = e - rb_enc_mbminlen(enc);
06713                 if (pp >= p) {
06714                     pp = rb_enc_left_char_head(p, pp, e, enc);
06715                     if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
06716                         e = pp;
06717                     }
06718                 }
06719                 if (e == RSTRING_END(str)) {
                      /* Neither check moved e: nothing to strip. */
06720                     return Qnil;
06721                 }
06722                 len = e - RSTRING_PTR(str);
06723                 STR_SET_LEN(str, len);
06724             }
06725             else {
                  /* Otherwise the last one or two bytes are inspected directly. */
06726                 if (RSTRING_PTR(str)[len-1] == '\n') {
06727                     STR_DEC_LEN(str);
06728                     if (RSTRING_LEN(str) > 0 &&
06729                         RSTRING_PTR(str)[RSTRING_LEN(str)-1] == '\r') {
06730                         STR_DEC_LEN(str);
06731                     }
06732                 }
06733                 else if (RSTRING_PTR(str)[len-1] == '\r') {
06734                     STR_DEC_LEN(str);
06735                 }
06736                 else {
06737                     return Qnil;
06738                 }
06739             }
06740             RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
06741             return str;
06742         }
06743     }
06744     else {
06745         rb_scan_args(argc, argv, "01", &rs);
06746     }
06747     if (NIL_P(rs)) return Qnil; /* a nil separator removes nothing */
06748     StringValue(rs);
06749     rslen = RSTRING_LEN(rs);
06750     if (rslen == 0) {
          /* Empty separator: strip every trailing "\n", each optionally preceded by "\r". */
06751         while (len>0 && p[len-1] == '\n') {
06752             len--;
06753             if (len>0 && p[len-1] == '\r')
06754                 len--;
06755         }
06756         if (len < RSTRING_LEN(str)) {
06757             STR_SET_LEN(str, len);
06758             RSTRING_PTR(str)[len] = '\0';
06759             return str;
06760         }
06761         return Qnil;
06762     }
06763     if (rslen > len) return Qnil; /* separator longer than the string cannot match */
06764     newline = RSTRING_PTR(rs)[rslen-1]; /* last byte of the separator, used as a cheap pre-check */
06765     if (rslen == 1 && newline == '\n')
06766         goto smart_chomp;
06767 
06768     enc = rb_enc_check(str, rs);
06769     if (is_broken_string(rs)) {
06770         return Qnil;
06771     }
06772     pp = e - rslen; /* where the separator would have to start */
06773     if (p[len-1] == newline &&
06774         (rslen <= 1 ||
06775          memcmp(RSTRING_PTR(rs), pp, rslen) == 0)) {
          /* The tail bytes match; reject the match unless it begins at a character head. */
06776         if (rb_enc_left_char_head(p, pp, e, enc) != pp)
06777             return Qnil;
06778         if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
06779             ENC_CODERANGE_CLEAR(str);
06780         }
06781         STR_SET_LEN(str, RSTRING_LEN(str) - rslen);
06782         RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
06783         return str;
06784     }
06785     return Qnil;
06786 }
06787 
06788 
06789 /*
06790  *  call-seq:
06791  *     str.chomp(separator=$/)   -> new_str
06792  *
06793  *  Returns a new <code>String</code> with the given record separator removed
06794  *  from the end of <i>str</i> (if present). If <code>$/</code> has not been
06795  *  changed from the default Ruby record separator, then <code>chomp</code> also
06796  *  removes carriage return characters (that is it will remove <code>\n</code>,
06797  *  <code>\r</code>, and <code>\r\n</code>).
06798  *
06799  *     "hello".chomp            #=> "hello"
06800  *     "hello\n".chomp          #=> "hello"
06801  *     "hello\r\n".chomp        #=> "hello"
06802  *     "hello\n\r".chomp        #=> "hello\n"
06803  *     "hello\r".chomp          #=> "hello"
06804  *     "hello \n there".chomp   #=> "hello \n there"
06805  *     "hello".chomp("llo")     #=> "he"
06806  */
06807 
06808 static VALUE
06809 rb_str_chomp(int argc, VALUE *argv, VALUE str)
06810 {
06811     str = rb_str_dup(str);
06812     rb_str_chomp_bang(argc, argv, str);
06813     return str;
06814 }
06815 
06816 /*
06817  *  call-seq:
06818  *     str.lstrip!   -> self or nil
06819  *
06820  *  Removes leading whitespace from <i>str</i>, returning <code>nil</code> if no
06821  *  change was made. See also <code>String#rstrip!</code> and
06822  *  <code>String#strip!</code>.
06823  *
06824  *     "  hello  ".lstrip!  #=> "hello  "
06825  *     "hello".lstrip!      #=> nil
06826  */
06827 
06828 static VALUE
06829 rb_str_lstrip_bang(VALUE str)
06830 {
06831     rb_encoding *enc;
06832     char *s, *t, *e;
06833 
06834     str_modify_keep_cr(str);
06835     enc = STR_ENC_GET(str);
06836     s = RSTRING_PTR(str);
06837     if (!s || RSTRING_LEN(str) == 0) return Qnil;
06838     e = t = RSTRING_END(str);
06839     /* remove spaces at head */
06840     while (s < e) {
06841         int n;
06842         unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc);
06843 
06844         if (!rb_isspace(cc)) break;
06845         s += n;
06846     }
06847 
06848     if (s > RSTRING_PTR(str)) {
06849         STR_SET_LEN(str, t-s);
06850         memmove(RSTRING_PTR(str), s, RSTRING_LEN(str));
06851         RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
06852         return str;
06853     }
06854     return Qnil;
06855 }
06856 
06857 
06858 /*
06859  *  call-seq:
06860  *     str.lstrip   -> new_str
06861  *
06862  *  Returns a copy of <i>str</i> with leading whitespace removed. See also
06863  *  <code>String#rstrip</code> and <code>String#strip</code>.
06864  *
06865  *     "  hello  ".lstrip   #=> "hello  "
06866  *     "hello".lstrip       #=> "hello"
06867  */
06868 
06869 static VALUE
06870 rb_str_lstrip(VALUE str)
06871 {
06872     str = rb_str_dup(str);
06873     rb_str_lstrip_bang(str);
06874     return str;
06875 }
06876 
06877 
06878 /*
06879  *  call-seq:
06880  *     str.rstrip!   -> self or nil
06881  *
06882  *  Removes trailing whitespace from <i>str</i>, returning <code>nil</code> if
06883  *  no change was made. See also <code>String#lstrip!</code> and
06884  *  <code>String#strip!</code>.
06885  *
06886  *     "  hello  ".rstrip!  #=> "  hello"
06887  *     "hello".rstrip!      #=> nil
06888  */
06889 
06890 static VALUE
06891 rb_str_rstrip_bang(VALUE str)
06892 {
06893     rb_encoding *enc;
06894     char *s, *t, *e;
06895 
06896     str_modify_keep_cr(str);
06897     enc = STR_ENC_GET(str);
06898     rb_str_check_dummy_enc(enc);
06899     s = RSTRING_PTR(str);
06900     if (!s || RSTRING_LEN(str) == 0) return Qnil;
06901     t = e = RSTRING_END(str);
06902 
06903     /* remove trailing spaces or '\0's */
06904     if (single_byte_optimizable(str)) {
06905         unsigned char c;
06906         while (s < t && ((c = *(t-1)) == '\0' || ascii_isspace(c))) t--;
06907     }
06908     else {
06909         char *tp;
06910 
06911         while ((tp = rb_enc_prev_char(s, t, e, enc)) != NULL) {
06912             unsigned int c = rb_enc_codepoint(tp, e, enc);
06913             if (c && !rb_isspace(c)) break;
06914             t = tp;
06915         }
06916     }
06917     if (t < e) {
06918         long len = t-RSTRING_PTR(str);
06919 
06920         STR_SET_LEN(str, len);
06921         RSTRING_PTR(str)[len] = '\0';
06922         return str;
06923     }
06924     return Qnil;
06925 }
06926 
06927 
06928 /*
06929  *  call-seq:
06930  *     str.rstrip   -> new_str
06931  *
06932  *  Returns a copy of <i>str</i> with trailing whitespace removed. See also
06933  *  <code>String#lstrip</code> and <code>String#strip</code>.
06934  *
06935  *     "  hello  ".rstrip   #=> "  hello"
06936  *     "hello".rstrip       #=> "hello"
06937  */
06938 
06939 static VALUE
06940 rb_str_rstrip(VALUE str)
06941 {
06942     str = rb_str_dup(str);
06943     rb_str_rstrip_bang(str);
06944     return str;
06945 }
06946 
06947 
06948 /*
06949  *  call-seq:
06950  *     str.strip!   -> str or nil
06951  *
06952  *  Removes leading and trailing whitespace from <i>str</i>. Returns
06953  *  <code>nil</code> if <i>str</i> was not altered.
06954  */
06955 
06956 static VALUE
06957 rb_str_strip_bang(VALUE str)
06958 {
06959     VALUE l = rb_str_lstrip_bang(str);
06960     VALUE r = rb_str_rstrip_bang(str);
06961 
06962     if (NIL_P(l) && NIL_P(r)) return Qnil;
06963     return str;
06964 }
06965 
06966 
06967 /*
06968  *  call-seq:
06969  *     str.strip   -> new_str
06970  *
06971  *  Returns a copy of <i>str</i> with leading and trailing whitespace removed.
06972  *
06973  *     "    hello    ".strip   #=> "hello"
06974  *     "\tgoodbye\r\n".strip   #=> "goodbye"
06975  */
06976 
06977 static VALUE
06978 rb_str_strip(VALUE str)
06979 {
06980     str = rb_str_dup(str);
06981     rb_str_strip_bang(str);
06982     return str;
06983 }
06984 
06985 static VALUE
06986 scan_once(VALUE str, VALUE pat, long *start)
06987 {
06988     VALUE result, match;
06989     struct re_registers *regs;
06990     int i;
06991 
06992     if (rb_reg_search(pat, str, *start, 0) >= 0) {
06993         match = rb_backref_get();
06994         regs = RMATCH_REGS(match);
06995         if (BEG(0) == END(0)) {
06996             rb_encoding *enc = STR_ENC_GET(str);
06997             /*
06998              * Always consume at least one character of the input string
06999              */
07000             if (RSTRING_LEN(str) > END(0))
07001                 *start = END(0)+rb_enc_fast_mbclen(RSTRING_PTR(str)+END(0),
07002                                                    RSTRING_END(str), enc);
07003             else
07004                 *start = END(0)+1;
07005         }
07006         else {
07007             *start = END(0);
07008         }
07009         if (regs->num_regs == 1) {
07010             return rb_reg_nth_match(0, match);
07011         }
07012         result = rb_ary_new2(regs->num_regs);
07013         for (i=1; i < regs->num_regs; i++) {
07014             rb_ary_push(result, rb_reg_nth_match(i, match));
07015         }
07016 
07017         return result;
07018     }
07019     return Qnil;
07020 }
07021 
07022 
07023 /*
07024  *  call-seq:
07025  *     str.scan(pattern)                         -> array
07026  *     str.scan(pattern) {|match, ...| block }   -> str
07027  *
07028  *  Both forms iterate through <i>str</i>, matching the pattern (which may be a
07029  *  <code>Regexp</code> or a <code>String</code>). For each match, a result is
07030  *  generated and either added to the result array or passed to the block. If
07031  *  the pattern contains no groups, each individual result consists of the
07032  *  matched string, <code>$&</code>.  If the pattern contains groups, each
07033  *  individual result is itself an array containing one entry per group.
07034  *
07035  *     a = "cruel world"
07036  *     a.scan(/\w+/)        #=> ["cruel", "world"]
07037  *     a.scan(/.../)        #=> ["cru", "el ", "wor"]
07038  *     a.scan(/(...)/)      #=> [["cru"], ["el "], ["wor"]]
07039  *     a.scan(/(..)(..)/)   #=> [["cr", "ue"], ["l ", "wo"]]
07040  *
07041  *  And the block form:
07042  *
07043  *     a.scan(/\w+/) {|w| print "<<#{w}>> " }
07044  *     print "\n"
07045  *     a.scan(/(.)(.)/) {|x,y| print y, x }
07046  *     print "\n"
07047  *
07048  *  <em>produces:</em>
07049  *
07050  *     <<cruel>> <<world>>
07051  *     rceu lowlr
07052  */
07053 
07054 static VALUE
07055 rb_str_scan(VALUE str, VALUE pat)
07056 {
07057     VALUE result;
07058     long start = 0;
07059     long last = -1, prev = 0;
07060     char *p = RSTRING_PTR(str); long len = RSTRING_LEN(str);
07061 
07062     pat = get_pat(pat, 1);
07063     if (!rb_block_given_p()) {
07064         VALUE ary = rb_ary_new();
07065 
07066         while (!NIL_P(result = scan_once(str, pat, &start))) {
07067             last = prev;
07068             prev = start;
07069             rb_ary_push(ary, result);
07070         }
07071         if (last >= 0) rb_reg_search(pat, str, last, 0);
07072         return ary;
07073     }
07074 
07075     while (!NIL_P(result = scan_once(str, pat, &start))) {
07076         last = prev;
07077         prev = start;
07078         rb_yield(result);
07079         str_mod_check(str, p, len);
07080     }
07081     if (last >= 0) rb_reg_search(pat, str, last, 0);
07082     return str;
07083 }
07084 
07085 
07086 /*
07087  *  call-seq:
07088  *     str.hex   -> integer
07089  *
07090  *  Treats leading characters from <i>str</i> as a string of hexadecimal digits
07091  *  (with an optional sign and an optional <code>0x</code>) and returns the
07092  *  corresponding number. Zero is returned on error.
07093  *
07094  *     "0x0a".hex     #=> 10
07095  *     "-1234".hex    #=> -4660
07096  *     "0".hex        #=> 0
07097  *     "wombat".hex   #=> 0
07098  */
07099 
07100 static VALUE
07101 rb_str_hex(VALUE str)
07102 {
07103     return rb_str_to_inum(str, 16, FALSE);
07104 }
07105 
07106 
07107 /*
07108  *  call-seq:
07109  *     str.oct   -> integer
07110  *
07111  *  Treats leading characters of <i>str</i> as a string of octal digits (with an
07112  *  optional sign) and returns the corresponding number.  Returns 0 if the
07113  *  conversion fails.
07114  *
07115  *     "123".oct       #=> 83
07116  *     "-377".oct      #=> -255
07117  *     "bad".oct       #=> 0
07118  *     "0377bad".oct   #=> 255
07119  */
07120 
07121 static VALUE
07122 rb_str_oct(VALUE str)
07123 {
07124     return rb_str_to_inum(str, -8, FALSE);
07125 }
07126 
07127 
07128 /*
07129  *  call-seq:
07130  *     str.crypt(salt_str)   -> new_str
07131  *
07132  *  Applies a one-way cryptographic hash to <i>str</i> by invoking the
07133  *  standard library function <code>crypt(3)</code> with the given
07134  *  salt string.  While the format and the result are system and
07135  *  implementation dependent, using a salt matching the regular
07136  *  expression <code>\A[a-zA-Z0-9./]{2}</code> should be valid and
07137  *  safe on any platform, in which only the first two characters are
07138  *  significant.
07139  *
07140  *  This method is for use in system specific scripts, so if you want
07141  *  a cross-platform hash function consider using Digest or OpenSSL
07142  *  instead.
07143  */
07144 
07145 static VALUE
07146 rb_str_crypt(VALUE str, VALUE salt)
07147 {
07148     extern char *crypt(const char *, const char *);
07149     VALUE result;
07150     const char *s, *saltp;
07151     char *res;
07152 #ifdef BROKEN_CRYPT
07153     char salt_8bit_clean[3];
07154 #endif
07155 
07156     StringValue(salt);
07157     if (RSTRING_LEN(salt) < 2)
07158         rb_raise(rb_eArgError, "salt too short (need >=2 bytes)");
07159 
07160     s = RSTRING_PTR(str);
07161     if (!s) s = "";
07162     saltp = RSTRING_PTR(salt);
07163 #ifdef BROKEN_CRYPT
07164     if (!ISASCII((unsigned char)saltp[0]) || !ISASCII((unsigned char)saltp[1])) {
07165         salt_8bit_clean[0] = saltp[0] & 0x7f;
07166         salt_8bit_clean[1] = saltp[1] & 0x7f;
07167         salt_8bit_clean[2] = '\0';
07168         saltp = salt_8bit_clean;
07169     }
07170 #endif
07171     res = crypt(s, saltp);
07172     if (!res) {
07173         rb_sys_fail("crypt");
07174     }
07175     result = rb_str_new2(res);
07176     OBJ_INFECT(result, str);
07177     OBJ_INFECT(result, salt);
07178     return result;
07179 }
07180 
07181 
07182 /*
07183  *  call-seq:
07184  *     str.intern   -> symbol
07185  *     str.to_sym   -> symbol
07186  *
07187  *  Returns the <code>Symbol</code> corresponding to <i>str</i>, creating the
07188  *  symbol if it did not previously exist. See <code>Symbol#id2name</code>.
07189  *
07190  *     "Koala".intern         #=> :Koala
07191  *     s = 'cat'.to_sym       #=> :cat
07192  *     s == :cat              #=> true
07193  *     s = '@cat'.to_sym      #=> :@cat
07194  *     s == :@cat             #=> true
07195  *
07196  *  This can also be used to create symbols that cannot be represented using the
07197  *  <code>:xxx</code> notation.
07198  *
07199  *     'cat and dog'.to_sym   #=> :"cat and dog"
07200  */
07201 
07202 VALUE
07203 rb_str_intern(VALUE s)
07204 {
07205     VALUE str = RB_GC_GUARD(s);
07206     ID id;
07207 
07208     id = rb_intern_str(str);
07209     return ID2SYM(id);
07210 }
07211 
07212 
07213 /*
07214  *  call-seq:
07215  *     str.ord   -> integer
07216  *
07217  *  Returns the <code>Integer</code> ordinal of a one-character string.
07218  *
07219  *     "a".ord         #=> 97
07220  */
07221 
07222 VALUE
07223 rb_str_ord(VALUE s)
07224 {
07225     unsigned int c;
07226 
07227     c = rb_enc_codepoint(RSTRING_PTR(s), RSTRING_END(s), STR_ENC_GET(s));
07228     return UINT2NUM(c);
07229 }
07230 /*
07231  *  call-seq:
07232  *     str.sum(n=16)   -> integer
07233  *
07234  *  Returns a basic <em>n</em>-bit checksum of the bytes in <i>str</i>,
07235  *  where <em>n</em> is the optional <code>Fixnum</code> parameter, defaulting
07236  *  to 16. The result is simply the sum of the binary value of each byte in
07237  *  <i>str</i> modulo <code>2**n</code> (that is, masked with
07238  *  <code>2**n - 1</code>). This is not a particularly good checksum.
07239  */
07240 
07241 static VALUE
07242 rb_str_sum(int argc, VALUE *argv, VALUE str)
07243 {
07244     VALUE vbits;
07245     int bits;
07246     char *ptr, *p, *pend;
07247     long len;
07248     VALUE sum = INT2FIX(0);
07249     unsigned long sum0 = 0;
07250 
07251     if (argc == 0) {
07252         bits = 16;
07253     }
07254     else {
07255         rb_scan_args(argc, argv, "01", &vbits);
07256         bits = NUM2INT(vbits);
07257     }
07258     ptr = p = RSTRING_PTR(str);
07259     len = RSTRING_LEN(str);
07260     pend = p + len;
07261 
07262     while (p < pend) {
07263         if (FIXNUM_MAX - UCHAR_MAX < sum0) {
07264             sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
07265             str_mod_check(str, ptr, len);
07266             sum0 = 0;
07267         }
07268         sum0 += (unsigned char)*p;
07269         p++;
07270     }
07271 
07272     if (bits == 0) {
07273         if (sum0) {
07274             sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
07275         }
07276     }
07277     else {
07278         if (sum == INT2FIX(0)) {
07279             if (bits < (int)sizeof(long)*CHAR_BIT) {
07280                 sum0 &= (((unsigned long)1)<<bits)-1;
07281             }
07282             sum = LONG2FIX(sum0);
07283         }
07284         else {
07285             VALUE mod;
07286 
07287             if (sum0) {
07288                 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
07289             }
07290 
07291             mod = rb_funcall(INT2FIX(1), rb_intern("<<"), 1, INT2FIX(bits));
07292             mod = rb_funcall(mod, '-', 1, INT2FIX(1));
07293             sum = rb_funcall(sum, '&', 1, mod);
07294         }
07295     }
07296     return sum;
07297 }
07298 
07299 static VALUE
07300 rb_str_justify(int argc, VALUE *argv, VALUE str, char jflag)
07301 {
07302     rb_encoding *enc;
07303     VALUE w;
07304     long width, len, flen = 1, fclen = 1;
07305     VALUE res;
07306     char *p;
07307     const char *f = " ";
07308     long n, size, llen, rlen, llen2 = 0, rlen2 = 0;
07309     volatile VALUE pad;
07310     int singlebyte = 1, cr;
07311 
07312     rb_scan_args(argc, argv, "11", &w, &pad);
07313     enc = STR_ENC_GET(str);
07314     width = NUM2LONG(w);
07315     if (argc == 2) {
07316         StringValue(pad);
07317         enc = rb_enc_check(str, pad);
07318         f = RSTRING_PTR(pad);
07319         flen = RSTRING_LEN(pad);
07320         fclen = str_strlen(pad, enc);
07321         singlebyte = single_byte_optimizable(pad);
07322         if (flen == 0 || fclen == 0) {
07323             rb_raise(rb_eArgError, "zero width padding");
07324         }
07325     }
07326     len = str_strlen(str, enc);
07327     if (width < 0 || len >= width) return rb_str_dup(str);
07328     n = width - len;
07329     llen = (jflag == 'l') ? 0 : ((jflag == 'r') ? n : n/2);
07330     rlen = n - llen;
07331     cr = ENC_CODERANGE(str);
07332     if (flen > 1) {
07333        llen2 = str_offset(f, f + flen, llen % fclen, enc, singlebyte);
07334        rlen2 = str_offset(f, f + flen, rlen % fclen, enc, singlebyte);
07335     }
07336     size = RSTRING_LEN(str);
07337     if ((len = llen / fclen + rlen / fclen) >= LONG_MAX / flen ||
07338        (len *= flen) >= LONG_MAX - llen2 - rlen2 ||
07339        (len += llen2 + rlen2) >= LONG_MAX - size) {
07340        rb_raise(rb_eArgError, "argument too big");
07341     }
07342     len += size;
07343     res = rb_str_new5(str, 0, len);
07344     p = RSTRING_PTR(res);
07345     if (flen <= 1) {
07346        memset(p, *f, llen);
07347        p += llen;
07348     }
07349     else {
07350        while (llen >= fclen) {
07351             memcpy(p,f,flen);
07352             p += flen;
07353             llen -= fclen;
07354         }
07355        if (llen > 0) {
07356            memcpy(p, f, llen2);
07357            p += llen2;
07358         }
07359     }
07360     memcpy(p, RSTRING_PTR(str), size);
07361     p += size;
07362     if (flen <= 1) {
07363        memset(p, *f, rlen);
07364        p += rlen;
07365     }
07366     else {
07367        while (rlen >= fclen) {
07368             memcpy(p,f,flen);
07369             p += flen;
07370             rlen -= fclen;
07371         }
07372        if (rlen > 0) {
07373            memcpy(p, f, rlen2);
07374            p += rlen2;
07375         }
07376     }
07377     *p = '\0';
07378     STR_SET_LEN(res, p-RSTRING_PTR(res));
07379     OBJ_INFECT(res, str);
07380     if (!NIL_P(pad)) OBJ_INFECT(res, pad);
07381     rb_enc_associate(res, enc);
07382     if (argc == 2)
07383         cr = ENC_CODERANGE_AND(cr, ENC_CODERANGE(pad));
07384     if (cr != ENC_CODERANGE_BROKEN)
07385         ENC_CODERANGE_SET(res, cr);
07386     return res;
07387 }
07388 
07389 
07390 /*
07391  *  call-seq:
07392  *     str.ljust(integer, padstr=' ')   -> new_str
07393  *
07394  *  If <i>integer</i> is greater than the length of <i>str</i>, returns a new
07395  *  <code>String</code> of length <i>integer</i> with <i>str</i> left justified
07396  *  and padded with <i>padstr</i>; otherwise, returns <i>str</i>.
07397  *
07398  *     "hello".ljust(4)            #=> "hello"
07399  *     "hello".ljust(20)           #=> "hello               "
07400  *     "hello".ljust(20, '1234')   #=> "hello123412341234123"
07401  */
07402 
07403 static VALUE
07404 rb_str_ljust(int argc, VALUE *argv, VALUE str)
07405 {
07406     return rb_str_justify(argc, argv, str, 'l');
07407 }
07408 
07409 
07410 /*
07411  *  call-seq:
07412  *     str.rjust(integer, padstr=' ')   -> new_str
07413  *
07414  *  If <i>integer</i> is greater than the length of <i>str</i>, returns a new
07415  *  <code>String</code> of length <i>integer</i> with <i>str</i> right justified
07416  *  and padded with <i>padstr</i>; otherwise, returns <i>str</i>.
07417  *
07418  *     "hello".rjust(4)            #=> "hello"
07419  *     "hello".rjust(20)           #=> "               hello"
07420  *     "hello".rjust(20, '1234')   #=> "123412341234123hello"
07421  */
07422 
07423 static VALUE
07424 rb_str_rjust(int argc, VALUE *argv, VALUE str)
07425 {
07426     return rb_str_justify(argc, argv, str, 'r');
07427 }
07428 
07429 
07430 /*
07431  *  call-seq:
07432  *     str.center(width, padstr=' ')   -> new_str
07433  *
07434  *  Centers +str+ in +width+.  If +width+ is greater than the length of +str+,
07435  *  returns a new String of length +width+ with +str+ centered and padded with
07436  *  +padstr+; otherwise, returns +str+.
07437  *
07438  *     "hello".center(4)         #=> "hello"
07439  *     "hello".center(20)        #=> "       hello        "
07440  *     "hello".center(20, '123') #=> "1231231hello12312312"
07441  */
07442 
07443 static VALUE
07444 rb_str_center(int argc, VALUE *argv, VALUE str)
07445 {
07446     return rb_str_justify(argc, argv, str, 'c');
07447 }
07448 
07449 /*
07450  *  call-seq:
07451  *     str.partition(sep)              -> [head, sep, tail]
07452  *     str.partition(regexp)           -> [head, match, tail]
07453  *
07454  *  Searches <i>sep</i> or pattern (<i>regexp</i>) in the string
07455  *  and returns the part before it, the match, and the part
07456  *  after it.
07457  *  If it is not found, returns <i>str</i> and two empty strings.
07458  *
07459  *     "hello".partition("l")         #=> ["he", "l", "lo"]
07460  *     "hello".partition("x")         #=> ["hello", "", ""]
07461  *     "hello".partition(/.l/)        #=> ["h", "el", "lo"]
07462  */
07463 
07464 static VALUE
07465 rb_str_partition(VALUE str, VALUE sep)
07466 {
07467     long pos;
07468     int regex = FALSE;
07469 
07470     if (RB_TYPE_P(sep, T_REGEXP)) {
07471         pos = rb_reg_search(sep, str, 0, 0);
07472         regex = TRUE;
07473     }
07474     else {
07475         VALUE tmp;
07476 
07477         tmp = rb_check_string_type(sep);
07478         if (NIL_P(tmp)) {
07479             rb_raise(rb_eTypeError, "type mismatch: %s given",
07480                      rb_obj_classname(sep));
07481         }
07482         sep = tmp;
07483         pos = rb_str_index(str, sep, 0);
07484     }
07485     if (pos < 0) {
07486       failed:
07487         return rb_ary_new3(3, str, str_new_empty(str), str_new_empty(str));
07488     }
07489     if (regex) {
07490         sep = rb_str_subpat(str, sep, INT2FIX(0));
07491         if (pos == 0 && RSTRING_LEN(sep) == 0) goto failed;
07492     }
07493     return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
07494                           sep,
07495                           rb_str_subseq(str, pos+RSTRING_LEN(sep),
07496                                              RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
07497 }
07498 
07499 /*
07500  *  call-seq:
07501  *     str.rpartition(sep)             -> [head, sep, tail]
07502  *     str.rpartition(regexp)          -> [head, match, tail]
07503  *
07504  *  Searches <i>sep</i> or pattern (<i>regexp</i>) in the string from the end
07505  *  of the string, and returns the part before it, the match, and the part
07506  *  after it.
07507  *  If it is not found, returns two empty strings and <i>str</i>.
07508  *
07509  *     "hello".rpartition("l")         #=> ["hel", "l", "o"]
07510  *     "hello".rpartition("x")         #=> ["", "", "hello"]
07511  *     "hello".rpartition(/.l/)        #=> ["he", "ll", "o"]
07512  */
07513 
07514 static VALUE
07515 rb_str_rpartition(VALUE str, VALUE sep)
07516 {
07517     long pos = RSTRING_LEN(str);
07518     int regex = FALSE;
07519 
07520     if (RB_TYPE_P(sep, T_REGEXP)) {
07521         pos = rb_reg_search(sep, str, pos, 1);
07522         regex = TRUE;
07523     }
07524     else {
07525         VALUE tmp;
07526 
07527         tmp = rb_check_string_type(sep);
07528         if (NIL_P(tmp)) {
07529             rb_raise(rb_eTypeError, "type mismatch: %s given",
07530                      rb_obj_classname(sep));
07531         }
07532         sep = tmp;
07533         pos = rb_str_sublen(str, pos);
07534         pos = rb_str_rindex(str, sep, pos);
07535     }
07536     if (pos < 0) {
07537         return rb_ary_new3(3, str_new_empty(str), str_new_empty(str), str);
07538     }
07539     if (regex) {
07540         sep = rb_reg_nth_match(0, rb_backref_get());
07541     }
07542     return rb_ary_new3(3, rb_str_substr(str, 0, pos),
07543                           sep,
07544                           rb_str_substr(str,pos+str_strlen(sep,STR_ENC_GET(sep)),RSTRING_LEN(str)));
07545 }
07546 
07547 /*
07548  *  call-seq:
07549  *     str.start_with?([prefixes]+)   -> true or false
07550  *
07551  *  Returns true if +str+ starts with one of the +prefixes+ given.
07552  *
07553  *    "hello".start_with?("hell")               #=> true
07554  *
07555  *    # returns true if one of the prefixes matches.
07556  *    "hello".start_with?("heaven", "hell")     #=> true
07557  *    "hello".start_with?("heaven", "paradise") #=> false
07558  */
07559 
07560 static VALUE
07561 rb_str_start_with(int argc, VALUE *argv, VALUE str)
07562 {
07563     int i;
07564 
07565     for (i=0; i<argc; i++) {
07566         VALUE tmp = argv[i];
07567         StringValue(tmp);
07568         rb_enc_check(str, tmp);
07569         if (RSTRING_LEN(str) < RSTRING_LEN(tmp)) continue;
07570         if (memcmp(RSTRING_PTR(str), RSTRING_PTR(tmp), RSTRING_LEN(tmp)) == 0)
07571             return Qtrue;
07572     }
07573     return Qfalse;
07574 }
07575 
07576 /*
07577  *  call-seq:
07578  *     str.end_with?([suffixes]+)   -> true or false
07579  *
07580  *  Returns true if +str+ ends with one of the +suffixes+ given.
07581  */
07582 
07583 static VALUE
07584 rb_str_end_with(int argc, VALUE *argv, VALUE str)
07585 {
07586     int i;
07587     char *p, *s, *e;
07588     rb_encoding *enc;
07589 
07590     for (i=0; i<argc; i++) {
07591         VALUE tmp = argv[i];
07592         StringValue(tmp);
07593         enc = rb_enc_check(str, tmp);
07594         if (RSTRING_LEN(str) < RSTRING_LEN(tmp)) continue;
07595         p = RSTRING_PTR(str);
07596         e = p + RSTRING_LEN(str);
07597         s = e - RSTRING_LEN(tmp);
07598         if (rb_enc_left_char_head(p, s, e, enc) != s)
07599             continue;
07600         if (memcmp(s, RSTRING_PTR(tmp), RSTRING_LEN(tmp)) == 0)
07601             return Qtrue;
07602     }
07603     return Qfalse;
07604 }
07605 
07606 void
07607 rb_str_setter(VALUE val, ID id, VALUE *var)
07608 {
07609     if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING)) {
07610         rb_raise(rb_eTypeError, "value of %s must be String", rb_id2name(id));
07611     }
07612     *var = val;
07613 }
07614 
07615 
07616 /*
07617  *  call-seq:
07618  *     str.force_encoding(encoding)   -> str
07619  *
07620  *  Changes the encoding to +encoding+ and returns self.
07621  */
07622 
07623 static VALUE
07624 rb_str_force_encoding(VALUE str, VALUE enc)
07625 {
07626     str_modifiable(str);
07627     rb_enc_associate(str, rb_to_encoding(enc));
07628     ENC_CODERANGE_CLEAR(str);
07629     return str;
07630 }
07631 
07632 /*
07633  *  call-seq:
07634  *     str.b   -> str
07635  *
07636  *  Returns a copied string whose encoding is ASCII-8BIT.
07637  */
07638 
07639 static VALUE
07640 rb_str_b(VALUE str)
07641 {
07642     VALUE str2 = str_alloc(rb_cString);
07643     str_replace_shared_without_enc(str2, str);
07644     OBJ_INFECT(str2, str);
07645     ENC_CODERANGE_SET(str2, ENC_CODERANGE_VALID);
07646     return str2;
07647 }
07648 
07649 /*
07650  *  call-seq:
07651  *     str.valid_encoding?  -> true or false
07652  *
07653  *  Returns true for a string which is encoded correctly.
07654  *
07655  *    "\xc2\xa1".force_encoding("UTF-8").valid_encoding?  #=> true
07656  *    "\xc2".force_encoding("UTF-8").valid_encoding?      #=> false
07657  *    "\x80".force_encoding("UTF-8").valid_encoding?      #=> false
07658  */
07659 
07660 static VALUE
07661 rb_str_valid_encoding_p(VALUE str)
07662 {
07663     int cr = rb_enc_str_coderange(str);
07664 
07665     return cr == ENC_CODERANGE_BROKEN ? Qfalse : Qtrue;
07666 }
07667 
07668 /*
07669  *  call-seq:
07670  *     str.ascii_only?  -> true or false
07671  *
07672  *  Returns true for a string which has only ASCII characters.
07673  *
07674  *    "abc".force_encoding("UTF-8").ascii_only?          #=> true
07675  *    "abc\u{6666}".force_encoding("UTF-8").ascii_only?  #=> false
07676  */
07677 
07678 static VALUE
07679 rb_str_is_ascii_only_p(VALUE str)
07680 {
07681     int cr = rb_enc_str_coderange(str);
07682 
07683     return cr == ENC_CODERANGE_7BIT ? Qtrue : Qfalse;
07684 }
07685 
07700 VALUE
07701 rb_str_ellipsize(VALUE str, long len)
07702 {
07703     static const char ellipsis[] = "...";
07704     const long ellipsislen = sizeof(ellipsis) - 1;
07705     rb_encoding *const enc = rb_enc_get(str);
07706     const long blen = RSTRING_LEN(str);
07707     const char *const p = RSTRING_PTR(str), *e = p + blen;
07708     VALUE estr, ret = 0;
07709 
07710     if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
07711     if (len * rb_enc_mbminlen(enc) >= blen ||
07712         (e = rb_enc_nth(p, e, len, enc)) - p == blen) {
07713         ret = str;
07714     }
07715     else if (len <= ellipsislen ||
07716              !(e = rb_enc_step_back(p, e, e, len = ellipsislen, enc))) {
07717         if (rb_enc_asciicompat(enc)) {
07718             ret = rb_str_new_with_class(str, ellipsis, len);
07719             rb_enc_associate(ret, enc);
07720         }
07721         else {
07722             estr = rb_usascii_str_new(ellipsis, len);
07723             ret = rb_str_encode(estr, rb_enc_from_encoding(enc), 0, Qnil);
07724         }
07725     }
07726     else if (ret = rb_str_subseq(str, 0, e - p), rb_enc_asciicompat(enc)) {
07727         rb_str_cat(ret, ellipsis, ellipsislen);
07728     }
07729     else {
07730         estr = rb_str_encode(rb_usascii_str_new(ellipsis, ellipsislen),
07731                              rb_enc_from_encoding(enc), 0, Qnil);
07732         rb_str_append(ret, estr);
07733     }
07734     return ret;
07735 }
07736 
07737 /**********************************************************************
07738  * Document-class: Symbol
07739  *
07740  *  <code>Symbol</code> objects represent names and some strings
07741  *  inside the Ruby
07742  *  interpreter. They are generated using the <code>:name</code> and
07743  *  <code>:"string"</code> literals
07744  *  syntax, and by the various <code>to_sym</code> methods. The same
07745  *  <code>Symbol</code> object will be created for a given name or string
07746  *  for the duration of a program's execution, regardless of the context
07747  *  or meaning of that name. Thus if <code>Fred</code> is a constant in
07748  *  one context, a method in another, and a class in a third, the
07749  *  <code>Symbol</code> <code>:Fred</code> will be the same object in
07750  *  all three contexts.
07751  *
07752  *     module One
07753  *       class Fred
07754  *       end
07755  *       $f1 = :Fred
07756  *     end
07757  *     module Two
07758  *       Fred = 1
07759  *       $f2 = :Fred
07760  *     end
07761  *     def Fred()
07762  *     end
07763  *     $f3 = :Fred
07764  *     $f1.object_id   #=> 2514190
07765  *     $f2.object_id   #=> 2514190
07766  *     $f3.object_id   #=> 2514190
07767  *
07768  */
07769 
07770 
07771 /*
07772  *  call-seq:
07773  *     sym == obj   -> true or false
07774  *
07775  *  Equality---If <i>sym</i> and <i>obj</i> are exactly the same
07776  *  symbol, returns <code>true</code>.
07777  */
07778 
07779 static VALUE
07780 sym_equal(VALUE sym1, VALUE sym2)
07781 {
07782     if (sym1 == sym2) return Qtrue;
07783     return Qfalse;
07784 }
07785 
07786 
07787 static int
07788 sym_printable(const char *s, const char *send, rb_encoding *enc)
07789 {
07790     while (s < send) {
07791         int n;
07792         int c = rb_enc_codepoint_len(s, send, &n, enc);
07793 
07794         if (!rb_enc_isprint(c, enc)) return FALSE;
07795         s += n;
07796     }
07797     return TRUE;
07798 }
07799 
07800 int
07801 rb_str_symname_p(VALUE sym)
07802 {
07803     rb_encoding *enc;
07804     const char *ptr;
07805     long len;
07806     rb_encoding *resenc = rb_default_internal_encoding();
07807 
07808     if (resenc == NULL) resenc = rb_default_external_encoding();
07809     enc = STR_ENC_GET(sym);
07810     ptr = RSTRING_PTR(sym);
07811     len = RSTRING_LEN(sym);
07812     if ((resenc != enc && !rb_str_is_ascii_only_p(sym)) || len != (long)strlen(ptr) ||
07813         !rb_enc_symname_p(ptr, enc) || !sym_printable(ptr, ptr + len, enc)) {
07814         return FALSE;
07815     }
07816     return TRUE;
07817 }
07818 
07819 VALUE
07820 rb_str_quote_unprintable(VALUE str)
07821 {
07822     rb_encoding *enc;
07823     const char *ptr;
07824     long len;
07825     rb_encoding *resenc;
07826 
07827     Check_Type(str, T_STRING);
07828     resenc = rb_default_internal_encoding();
07829     if (resenc == NULL) resenc = rb_default_external_encoding();
07830     enc = STR_ENC_GET(str);
07831     ptr = RSTRING_PTR(str);
07832     len = RSTRING_LEN(str);
07833     if ((resenc != enc && !rb_str_is_ascii_only_p(str)) ||
07834         !sym_printable(ptr, ptr + len, enc)) {
07835         return rb_str_inspect(str);
07836     }
07837     return str;
07838 }
07839 
07840 VALUE
07841 rb_id_quote_unprintable(ID id)
07842 {
07843     return rb_str_quote_unprintable(rb_id2str(id));
07844 }
07845 
07846 /*
07847  *  call-seq:
07848  *     sym.inspect    -> string
07849  *
07850  *  Returns the representation of <i>sym</i> as a symbol literal.
07851  *
07852  *     :fred.inspect   #=> ":fred"
07853  */
07854 
07855 static VALUE
07856 sym_inspect(VALUE sym)
07857 {
07858     VALUE str;
07859     const char *ptr;
07860     long len;
07861     ID id = SYM2ID(sym);
07862     char *dest;
07863 
07864     sym = rb_id2str(id);
07865     if (!rb_str_symname_p(sym)) {
07866         str = rb_str_inspect(sym);
07867         len = RSTRING_LEN(str);
07868         rb_str_resize(str, len + 1);
07869         dest = RSTRING_PTR(str);
07870         memmove(dest + 1, dest, len);
07871         dest[0] = ':';
07872     }
07873     else {
07874         rb_encoding *enc = STR_ENC_GET(sym);
07875         ptr = RSTRING_PTR(sym);
07876         len = RSTRING_LEN(sym);
07877         str = rb_enc_str_new(0, len + 1, enc);
07878         dest = RSTRING_PTR(str);
07879         dest[0] = ':';
07880         memcpy(dest + 1, ptr, len);
07881     }
07882     return str;
07883 }
07884 
07885 
07886 /*
07887  *  call-seq:
07888  *     sym.id2name   -> string
07889  *     sym.to_s      -> string
07890  *
07891  *  Returns the name or string corresponding to <i>sym</i>.
07892  *
07893  *     :fred.id2name   #=> "fred"
07894  */
07895 
07896 
07897 VALUE
07898 rb_sym_to_s(VALUE sym)
07899 {
07900     ID id = SYM2ID(sym);
07901 
07902     return str_new3(rb_cString, rb_id2str(id));
07903 }
07904 
07905 
07906 /*
07907  * call-seq:
07908  *   sym.to_sym   -> sym
07909  *   sym.intern   -> sym
07910  *
07911  * In general, <code>to_sym</code> returns the <code>Symbol</code> corresponding
07912  * to an object. As <i>sym</i> is already a symbol, <code>self</code> is returned
07913  * in this case.
07914  */
07915 
07916 static VALUE
07917 sym_to_sym(VALUE sym)
07918 {
07919     return sym;
07920 }
07921 
07922 static VALUE
07923 sym_call(VALUE args, VALUE sym, int argc, VALUE *argv, VALUE passed_proc)
07924 {
07925     VALUE obj;
07926 
07927     if (argc < 1) {
07928         rb_raise(rb_eArgError, "no receiver given");
07929     }
07930     obj = argv[0];
07931     return rb_funcall_with_block(obj, (ID)sym, argc - 1, argv + 1, passed_proc);
07932 }
07933 
07934 /*
07935  * call-seq:
07936  *   sym.to_proc
07937  *
07938  * Returns a _Proc_ object which responds to the method named by _sym_.
07939  *
07940  *   (1..3).collect(&:to_s)  #=> ["1", "2", "3"]
07941  */
07942 
07943 static VALUE
07944 sym_to_proc(VALUE sym)
07945 {
07946     static VALUE sym_proc_cache = Qfalse;
07947     enum {SYM_PROC_CACHE_SIZE = 67};
07948     VALUE proc;
07949     long id, index;
07950     VALUE *aryp;
07951 
07952     if (!sym_proc_cache) {
07953         sym_proc_cache = rb_ary_tmp_new(SYM_PROC_CACHE_SIZE * 2);
07954         rb_gc_register_mark_object(sym_proc_cache);
07955         rb_ary_store(sym_proc_cache, SYM_PROC_CACHE_SIZE*2 - 1, Qnil);
07956     }
07957 
07958     id = SYM2ID(sym);
07959     index = (id % SYM_PROC_CACHE_SIZE) << 1;
07960 
07961     aryp = RARRAY_PTR(sym_proc_cache);
07962     if (aryp[index] == sym) {
07963         return aryp[index + 1];
07964     }
07965     else {
07966         proc = rb_proc_new(sym_call, (VALUE)id);
07967         aryp[index] = sym;
07968         aryp[index + 1] = proc;
07969         return proc;
07970     }
07971 }
07972 
07973 /*
07974  * call-seq:
07975  *
07976  *   sym.succ
07977  *
07978  * Same as <code>sym.to_s.succ.intern</code>.
07979  */
07980 
07981 static VALUE
07982 sym_succ(VALUE sym)
07983 {
07984     return rb_str_intern(rb_str_succ(rb_sym_to_s(sym)));
07985 }
07986 
07987 /*
07988  * call-seq:
07989  *
07990  *   symbol <=> other_symbol       -> -1, 0, +1 or nil
07991  *
07992  * Compares +symbol+ with +other_symbol+ after calling #to_s on each of the
07993  * symbols. Returns -1, 0, +1 or nil depending on whether +symbol+ is less
07994  * than, equal to, or greater than +other_symbol+.
07995  *
07996  *  +nil+ is returned if the two values are incomparable.
07997  *
07998  * See String#<=> for more information.
07999  */
08000 
08001 static VALUE
08002 sym_cmp(VALUE sym, VALUE other)
08003 {
08004     if (!SYMBOL_P(other)) {
08005         return Qnil;
08006     }
08007     return rb_str_cmp_m(rb_sym_to_s(sym), rb_sym_to_s(other));
08008 }
08009 
08010 /*
08011  * call-seq:
08012  *
08013  *   sym.casecmp(other)  -> -1, 0, +1 or nil
08014  *
08015  * Case-insensitive version of <code>Symbol#<=></code>.
08016  */
08017 
08018 static VALUE
08019 sym_casecmp(VALUE sym, VALUE other)
08020 {
08021     if (!SYMBOL_P(other)) {
08022         return Qnil;
08023     }
08024     return rb_str_casecmp(rb_sym_to_s(sym), rb_sym_to_s(other));
08025 }
08026 
08027 /*
08028  * call-seq:
08029  *   sym =~ obj   -> fixnum or nil
08030  *
08031  * Returns <code>sym.to_s =~ obj</code>.
08032  */
08033 
08034 static VALUE
08035 sym_match(VALUE sym, VALUE other)
08036 {
08037     return rb_str_match(rb_sym_to_s(sym), other);
08038 }
08039 
08040 /*
08041  * call-seq:
08042  *   sym[idx]      -> char
08043  *   sym[b, n]     -> string
08044  *
08045  * Returns <code>sym.to_s[]</code>.
08046  */
08047 
08048 static VALUE
08049 sym_aref(int argc, VALUE *argv, VALUE sym)
08050 {
08051     return rb_str_aref_m(argc, argv, rb_sym_to_s(sym));
08052 }
08053 
08054 /*
08055  * call-seq:
08056  *   sym.length    -> integer
08057  *
08058  * Same as <code>sym.to_s.length</code>.
08059  */
08060 
08061 static VALUE
08062 sym_length(VALUE sym)
08063 {
08064     return rb_str_length(rb_id2str(SYM2ID(sym)));
08065 }
08066 
08067 /*
08068  * call-seq:
08069  *   sym.empty?   -> true or false
08070  *
08071  * Returns whether _sym_ is :"" or not.
08072  */
08073 
08074 static VALUE
08075 sym_empty(VALUE sym)
08076 {
08077     return rb_str_empty(rb_id2str(SYM2ID(sym)));
08078 }
08079 
08080 /*
08081  * call-seq:
08082  *   sym.upcase    -> symbol
08083  *
08084  * Same as <code>sym.to_s.upcase.intern</code>.
08085  */
08086 
08087 static VALUE
08088 sym_upcase(VALUE sym)
08089 {
08090     return rb_str_intern(rb_str_upcase(rb_id2str(SYM2ID(sym))));
08091 }
08092 
08093 /*
08094  * call-seq:
08095  *   sym.downcase  -> symbol
08096  *
08097  * Same as <code>sym.to_s.downcase.intern</code>.
08098  */
08099 
08100 static VALUE
08101 sym_downcase(VALUE sym)
08102 {
08103     return rb_str_intern(rb_str_downcase(rb_id2str(SYM2ID(sym))));
08104 }
08105 
08106 /*
08107  * call-seq:
08108  *   sym.capitalize  -> symbol
08109  *
08110  * Same as <code>sym.to_s.capitalize.intern</code>.
08111  */
08112 
08113 static VALUE
08114 sym_capitalize(VALUE sym)
08115 {
08116     return rb_str_intern(rb_str_capitalize(rb_id2str(SYM2ID(sym))));
08117 }
08118 
08119 /*
08120  * call-seq:
08121  *   sym.swapcase  -> symbol
08122  *
08123  * Same as <code>sym.to_s.swapcase.intern</code>.
08124  */
08125 
08126 static VALUE
08127 sym_swapcase(VALUE sym)
08128 {
08129     return rb_str_intern(rb_str_swapcase(rb_id2str(SYM2ID(sym))));
08130 }
08131 
08132 /*
08133  * call-seq:
08134  *   sym.encoding   -> encoding
08135  *
08136  * Returns the Encoding object that represents the encoding of _sym_.
08137  */
08138 
08139 static VALUE
08140 sym_encoding(VALUE sym)
08141 {
08142     return rb_obj_encoding(rb_id2str(SYM2ID(sym)));
08143 }
08144 
08145 ID
08146 rb_to_id(VALUE name)
08147 {
08148     VALUE tmp;
08149 
08150     switch (TYPE(name)) {
08151       default:
08152         tmp = rb_check_string_type(name);
08153         if (NIL_P(tmp)) {
08154             tmp = rb_inspect(name);
08155             rb_raise(rb_eTypeError, "%s is not a symbol",
08156                      RSTRING_PTR(tmp));
08157         }
08158         name = tmp;
08159         /* fall through */
08160       case T_STRING:
08161         name = rb_str_intern(name);
08162         /* fall through */
08163       case T_SYMBOL:
08164         return SYM2ID(name);
08165     }
08166 
08167     UNREACHABLE;
08168 }
08169 
08170 /*
08171  *  A <code>String</code> object holds and manipulates an arbitrary sequence of
08172  *  bytes, typically representing characters. String objects may be created
08173  *  using <code>String::new</code> or as literals.
08174  *
08175  *  Because of aliasing issues, users of strings should be aware of the methods
08176  *  that modify the contents of a <code>String</code> object.  Typically,
08177  *  methods with names ending in ``!'' modify their receiver, while those
08178  *  without a ``!'' return a new <code>String</code>.  However, there are
08179  *  exceptions, such as <code>String#[]=</code>.
08180  *
08181  */
08182 
08183 void
08184 Init_String(void)
08185 {
          /* Defines the String and Symbol classes, registers their methods and
           * sets up the rb_fs variable.  The calls below are kept in their
           * literal form and order; each names a Ruby method, the C function
           * implementing it, and the argc value handed to the define call. */

          /* From here on rb_intern(str) expands to rb_intern_const(str). */
08186 #undef rb_intern
08187 #define rb_intern(str) rb_intern_const(str)
08188 
          /* String: subclass of Object that includes Comparable. */
08189     rb_cString  = rb_define_class("String", rb_cObject);
08190     rb_include_module(rb_cString, rb_mComparable);
08191     rb_define_alloc_func(rb_cString, empty_str_alloc);
08192     rb_define_singleton_method(rb_cString, "try_convert", rb_str_s_try_convert, 1);
08193     rb_define_method(rb_cString, "initialize", rb_str_init, -1);
          /* "initialize_copy" shares rb_str_replace with "replace". */
08194     rb_define_method(rb_cString, "initialize_copy", rb_str_replace, 1);
08195     rb_define_method(rb_cString, "<=>", rb_str_cmp_m, 1);
          /* "==" and "===" share rb_str_equal. */
08196     rb_define_method(rb_cString, "==", rb_str_equal, 1);
08197     rb_define_method(rb_cString, "===", rb_str_equal, 1);
08198     rb_define_method(rb_cString, "eql?", rb_str_eql, 1);
08199     rb_define_method(rb_cString, "hash", rb_str_hash_m, 0);
08200     rb_define_method(rb_cString, "casecmp", rb_str_casecmp, 1);
08201     rb_define_method(rb_cString, "+", rb_str_plus, 1);
08202     rb_define_method(rb_cString, "*", rb_str_times, 1);
08203     rb_define_method(rb_cString, "%", rb_str_format_m, 1);
08204     rb_define_method(rb_cString, "[]", rb_str_aref_m, -1);
08205     rb_define_method(rb_cString, "[]=", rb_str_aset_m, -1);
08206     rb_define_method(rb_cString, "insert", rb_str_insert, 2);
          /* "length" and "size" share rb_str_length. */
08207     rb_define_method(rb_cString, "length", rb_str_length, 0);
08208     rb_define_method(rb_cString, "size", rb_str_length, 0);
08209     rb_define_method(rb_cString, "bytesize", rb_str_bytesize, 0);
08210     rb_define_method(rb_cString, "empty?", rb_str_empty, 0);
08211     rb_define_method(rb_cString, "=~", rb_str_match, 1);
08212     rb_define_method(rb_cString, "match", rb_str_match_m, -1);
          /* "next"/"next!" share the implementations of "succ"/"succ!". */
08213     rb_define_method(rb_cString, "succ", rb_str_succ, 0);
08214     rb_define_method(rb_cString, "succ!", rb_str_succ_bang, 0);
08215     rb_define_method(rb_cString, "next", rb_str_succ, 0);
08216     rb_define_method(rb_cString, "next!", rb_str_succ_bang, 0);
08217     rb_define_method(rb_cString, "upto", rb_str_upto, -1);
08218     rb_define_method(rb_cString, "index", rb_str_index_m, -1);
08219     rb_define_method(rb_cString, "rindex", rb_str_rindex_m, -1);
08220     rb_define_method(rb_cString, "replace", rb_str_replace, 1);
08221     rb_define_method(rb_cString, "clear", rb_str_clear, 0);
08222     rb_define_method(rb_cString, "chr", rb_str_chr, 0);
08223     rb_define_method(rb_cString, "getbyte", rb_str_getbyte, 1);
08224     rb_define_method(rb_cString, "setbyte", rb_str_setbyte, 2);
08225     rb_define_method(rb_cString, "byteslice", rb_str_byteslice, -1);
08226 
          /* Conversions; "to_s" and "to_str" share rb_str_to_s. */
08227     rb_define_method(rb_cString, "to_i", rb_str_to_i, -1);
08228     rb_define_method(rb_cString, "to_f", rb_str_to_f, 0);
08229     rb_define_method(rb_cString, "to_s", rb_str_to_s, 0);
08230     rb_define_method(rb_cString, "to_str", rb_str_to_s, 0);
08231     rb_define_method(rb_cString, "inspect", rb_str_inspect, 0);
08232     rb_define_method(rb_cString, "dump", rb_str_dump, 0);
08233 
          /* Case conversion, followed by the "!" variants. */
08234     rb_define_method(rb_cString, "upcase", rb_str_upcase, 0);
08235     rb_define_method(rb_cString, "downcase", rb_str_downcase, 0);
08236     rb_define_method(rb_cString, "capitalize", rb_str_capitalize, 0);
08237     rb_define_method(rb_cString, "swapcase", rb_str_swapcase, 0);
08238 
08239     rb_define_method(rb_cString, "upcase!", rb_str_upcase_bang, 0);
08240     rb_define_method(rb_cString, "downcase!", rb_str_downcase_bang, 0);
08241     rb_define_method(rb_cString, "capitalize!", rb_str_capitalize_bang, 0);
08242     rb_define_method(rb_cString, "swapcase!", rb_str_swapcase_bang, 0);
08243 
08244     rb_define_method(rb_cString, "hex", rb_str_hex, 0);
08245     rb_define_method(rb_cString, "oct", rb_str_oct, 0);
08246     rb_define_method(rb_cString, "split", rb_str_split_m, -1);
08247     rb_define_method(rb_cString, "lines", rb_str_lines, -1);
08248     rb_define_method(rb_cString, "bytes", rb_str_bytes, 0);
08249     rb_define_method(rb_cString, "chars", rb_str_chars, 0);
08250     rb_define_method(rb_cString, "codepoints", rb_str_codepoints, 0);
08251     rb_define_method(rb_cString, "reverse", rb_str_reverse, 0);
08252     rb_define_method(rb_cString, "reverse!", rb_str_reverse_bang, 0);
          /* "concat" and "<<" share rb_str_concat. */
08253     rb_define_method(rb_cString, "concat", rb_str_concat, 1);
08254     rb_define_method(rb_cString, "<<", rb_str_concat, 1);
08255     rb_define_method(rb_cString, "prepend", rb_str_prepend, 1);
08256     rb_define_method(rb_cString, "crypt", rb_str_crypt, 1);
          /* "intern" and "to_sym" share rb_str_intern. */
08257     rb_define_method(rb_cString, "intern", rb_str_intern, 0);
08258     rb_define_method(rb_cString, "to_sym", rb_str_intern, 0);
08259     rb_define_method(rb_cString, "ord", rb_str_ord, 0);
08260 
08261     rb_define_method(rb_cString, "include?", rb_str_include, 1);
08262     rb_define_method(rb_cString, "start_with?", rb_str_start_with, -1);
08263     rb_define_method(rb_cString, "end_with?", rb_str_end_with, -1);
08264 
08265     rb_define_method(rb_cString, "scan", rb_str_scan, 1);
08266 
08267     rb_define_method(rb_cString, "ljust", rb_str_ljust, -1);
08268     rb_define_method(rb_cString, "rjust", rb_str_rjust, -1);
08269     rb_define_method(rb_cString, "center", rb_str_center, -1);
08270 
          /* Substitution and trimming, followed by the "!" variants. */
08271     rb_define_method(rb_cString, "sub", rb_str_sub, -1);
08272     rb_define_method(rb_cString, "gsub", rb_str_gsub, -1);
08273     rb_define_method(rb_cString, "chop", rb_str_chop, 0);
08274     rb_define_method(rb_cString, "chomp", rb_str_chomp, -1);
08275     rb_define_method(rb_cString, "strip", rb_str_strip, 0);
08276     rb_define_method(rb_cString, "lstrip", rb_str_lstrip, 0);
08277     rb_define_method(rb_cString, "rstrip", rb_str_rstrip, 0);
08278 
08279     rb_define_method(rb_cString, "sub!", rb_str_sub_bang, -1);
08280     rb_define_method(rb_cString, "gsub!", rb_str_gsub_bang, -1);
08281     rb_define_method(rb_cString, "chop!", rb_str_chop_bang, 0);
08282     rb_define_method(rb_cString, "chomp!", rb_str_chomp_bang, -1);
08283     rb_define_method(rb_cString, "strip!", rb_str_strip_bang, 0);
08284     rb_define_method(rb_cString, "lstrip!", rb_str_lstrip_bang, 0);
08285     rb_define_method(rb_cString, "rstrip!", rb_str_rstrip_bang, 0);
08286 
08287     rb_define_method(rb_cString, "tr", rb_str_tr, 2);
08288     rb_define_method(rb_cString, "tr_s", rb_str_tr_s, 2);
08289     rb_define_method(rb_cString, "delete", rb_str_delete, -1);
08290     rb_define_method(rb_cString, "squeeze", rb_str_squeeze, -1);
08291     rb_define_method(rb_cString, "count", rb_str_count, -1);
08292 
08293     rb_define_method(rb_cString, "tr!", rb_str_tr_bang, 2);
08294     rb_define_method(rb_cString, "tr_s!", rb_str_tr_s_bang, 2);
08295     rb_define_method(rb_cString, "delete!", rb_str_delete_bang, -1);
08296     rb_define_method(rb_cString, "squeeze!", rb_str_squeeze_bang, -1);
08297 
08298     rb_define_method(rb_cString, "each_line", rb_str_each_line, -1);
08299     rb_define_method(rb_cString, "each_byte", rb_str_each_byte, 0);
08300     rb_define_method(rb_cString, "each_char", rb_str_each_char, 0);
08301     rb_define_method(rb_cString, "each_codepoint", rb_str_each_codepoint, 0);
08302 
08303     rb_define_method(rb_cString, "sum", rb_str_sum, -1);
08304 
          /* "slice" shares rb_str_aref_m with "[]". */
08305     rb_define_method(rb_cString, "slice", rb_str_aref_m, -1);
08306     rb_define_method(rb_cString, "slice!", rb_str_slice_bang, -1);
08307 
08308     rb_define_method(rb_cString, "partition", rb_str_partition, 1);
08309     rb_define_method(rb_cString, "rpartition", rb_str_rpartition, 1);
08310 
08311     rb_define_method(rb_cString, "encoding", rb_obj_encoding, 0); /* in encoding.c */
08312     rb_define_method(rb_cString, "force_encoding", rb_str_force_encoding, 1);
08313     rb_define_method(rb_cString, "b", rb_str_b, 0);
08314     rb_define_method(rb_cString, "valid_encoding?", rb_str_valid_encoding_p, 0);
08315     rb_define_method(rb_cString, "ascii_only?", rb_str_is_ascii_only_p, 0);
08316 
          /* Store the ID of "to_s" in id_to_s. */
08317     id_to_s = rb_intern("to_s");
08318 
          /* rb_fs starts out as nil; "$;" and "$-F" are both bound to it. */
08319     rb_fs = Qnil;
08320     rb_define_variable("$;", &rb_fs);
08321     rb_define_variable("$-F", &rb_fs);
08322 
          /* Symbol: subclass of Object that includes Comparable.  Its
           * allocator and its singleton "new" are both undefined. */
08323     rb_cSymbol = rb_define_class("Symbol", rb_cObject);
08324     rb_include_module(rb_cSymbol, rb_mComparable);
08325     rb_undef_alloc_func(rb_cSymbol);
08326     rb_undef_method(CLASS_OF(rb_cSymbol), "new");
08327     rb_define_singleton_method(rb_cSymbol, "all_symbols", rb_sym_all_symbols, 0); /* in parse.y */
08328 
          /* "==" and "===" share sym_equal. */
08329     rb_define_method(rb_cSymbol, "==", sym_equal, 1);
08330     rb_define_method(rb_cSymbol, "===", sym_equal, 1);
08331     rb_define_method(rb_cSymbol, "inspect", sym_inspect, 0);
          /* "to_s" and "id2name" share rb_sym_to_s. */
08332     rb_define_method(rb_cSymbol, "to_s", rb_sym_to_s, 0);
08333     rb_define_method(rb_cSymbol, "id2name", rb_sym_to_s, 0);
          /* "intern" and "to_sym" share sym_to_sym. */
08334     rb_define_method(rb_cSymbol, "intern", sym_to_sym, 0);
08335     rb_define_method(rb_cSymbol, "to_sym", sym_to_sym, 0);
08336     rb_define_method(rb_cSymbol, "to_proc", sym_to_proc, 0);
          /* "succ" and "next" share sym_succ. */
08337     rb_define_method(rb_cSymbol, "succ", sym_succ, 0);
08338     rb_define_method(rb_cSymbol, "next", sym_succ, 0);
08339 
08340     rb_define_method(rb_cSymbol, "<=>", sym_cmp, 1);
08341     rb_define_method(rb_cSymbol, "casecmp", sym_casecmp, 1);
08342     rb_define_method(rb_cSymbol, "=~", sym_match, 1);
08343 
          /* "[]"/"slice" share sym_aref; "length"/"size" share sym_length. */
08344     rb_define_method(rb_cSymbol, "[]", sym_aref, -1);
08345     rb_define_method(rb_cSymbol, "slice", sym_aref, -1);
08346     rb_define_method(rb_cSymbol, "length", sym_length, 0);
08347     rb_define_method(rb_cSymbol, "size", sym_length, 0);
08348     rb_define_method(rb_cSymbol, "empty?", sym_empty, 0);
          /* Symbol "match" is registered with sym_match, the same function as "=~". */
08349     rb_define_method(rb_cSymbol, "match", sym_match, 1);
08350 
08351     rb_define_method(rb_cSymbol, "upcase", sym_upcase, 0);
08352     rb_define_method(rb_cSymbol, "downcase", sym_downcase, 0);
08353     rb_define_method(rb_cSymbol, "capitalize", sym_capitalize, 0);
08354     rb_define_method(rb_cSymbol, "swapcase", sym_swapcase, 0);
08355 
08356     rb_define_method(rb_cSymbol, "encoding", sym_encoding, 0);
08357 }
08358