Ruby
2.0.0p247 (2013-06-27 revision 41674)
|
00001 /********************************************************************** 00002 00003 string.c - 00004 00005 $Author: nagachika $ 00006 created at: Mon Aug 9 17:12:58 JST 1993 00007 00008 Copyright (C) 1993-2007 Yukihiro Matsumoto 00009 Copyright (C) 2000 Network Applied Communication Laboratory, Inc. 00010 Copyright (C) 2000 Information-technology Promotion Agency, Japan 00011 00012 **********************************************************************/ 00013 00014 #include "ruby/ruby.h" 00015 #include "ruby/re.h" 00016 #include "ruby/encoding.h" 00017 #include "vm_core.h" 00018 #include "internal.h" 00019 #include "probes.h" 00020 #include <assert.h> 00021 00022 #define BEG(no) (regs->beg[(no)]) 00023 #define END(no) (regs->end[(no)]) 00024 00025 #include <math.h> 00026 #include <ctype.h> 00027 00028 #ifdef HAVE_UNISTD_H 00029 #include <unistd.h> 00030 #endif 00031 00032 #define numberof(array) (int)(sizeof(array) / sizeof((array)[0])) 00033 00034 #undef rb_str_new_cstr 00035 #undef rb_tainted_str_new_cstr 00036 #undef rb_usascii_str_new_cstr 00037 #undef rb_external_str_new_cstr 00038 #undef rb_locale_str_new_cstr 00039 #undef rb_str_new2 00040 #undef rb_str_new3 00041 #undef rb_str_new4 00042 #undef rb_str_new5 00043 #undef rb_tainted_str_new2 00044 #undef rb_usascii_str_new2 00045 #undef rb_str_dup_frozen 00046 #undef rb_str_buf_new_cstr 00047 #undef rb_str_buf_new2 00048 #undef rb_str_buf_cat2 00049 #undef rb_str_cat2 00050 00051 static VALUE rb_str_clear(VALUE str); 00052 00053 VALUE rb_cString; 00054 VALUE rb_cSymbol; 00055 00056 #define RUBY_MAX_CHAR_LEN 16 00057 #define STR_TMPLOCK FL_USER7 00058 #define STR_NOEMBED FL_USER1 00059 #define STR_SHARED FL_USER2 /* = ELTS_SHARED */ 00060 #define STR_ASSOC FL_USER3 00061 #define STR_SHARED_P(s) FL_ALL((s), STR_NOEMBED|ELTS_SHARED) 00062 #define STR_ASSOC_P(s) FL_ALL((s), STR_NOEMBED|STR_ASSOC) 00063 #define STR_NOCAPA (STR_NOEMBED|ELTS_SHARED|STR_ASSOC) 00064 #define STR_NOCAPA_P(s) 
(FL_TEST((s),STR_NOEMBED) && FL_ANY((s),ELTS_SHARED|STR_ASSOC)) 00065 #define STR_UNSET_NOCAPA(s) do {\ 00066 if (FL_TEST((s),STR_NOEMBED)) FL_UNSET((s),(ELTS_SHARED|STR_ASSOC));\ 00067 } while (0) 00068 00069 00070 #define STR_SET_NOEMBED(str) do {\ 00071 FL_SET((str), STR_NOEMBED);\ 00072 STR_SET_EMBED_LEN((str), 0);\ 00073 } while (0) 00074 #define STR_SET_EMBED(str) FL_UNSET((str), STR_NOEMBED) 00075 #define STR_EMBED_P(str) (!FL_TEST((str), STR_NOEMBED)) 00076 #define STR_SET_EMBED_LEN(str, n) do { \ 00077 long tmp_n = (n);\ 00078 RBASIC(str)->flags &= ~RSTRING_EMBED_LEN_MASK;\ 00079 RBASIC(str)->flags |= (tmp_n) << RSTRING_EMBED_LEN_SHIFT;\ 00080 } while (0) 00081 00082 #define STR_SET_LEN(str, n) do { \ 00083 if (STR_EMBED_P(str)) {\ 00084 STR_SET_EMBED_LEN((str), (n));\ 00085 }\ 00086 else {\ 00087 RSTRING(str)->as.heap.len = (n);\ 00088 }\ 00089 } while (0) 00090 00091 #define STR_DEC_LEN(str) do {\ 00092 if (STR_EMBED_P(str)) {\ 00093 long n = RSTRING_LEN(str);\ 00094 n--;\ 00095 STR_SET_EMBED_LEN((str), n);\ 00096 }\ 00097 else {\ 00098 RSTRING(str)->as.heap.len--;\ 00099 }\ 00100 } while (0) 00101 00102 #define RESIZE_CAPA(str,capacity) do {\ 00103 if (STR_EMBED_P(str)) {\ 00104 if ((capacity) > RSTRING_EMBED_LEN_MAX) {\ 00105 char *tmp = ALLOC_N(char, (capacity)+1);\ 00106 memcpy(tmp, RSTRING_PTR(str), RSTRING_LEN(str));\ 00107 RSTRING(str)->as.heap.ptr = tmp;\ 00108 RSTRING(str)->as.heap.len = RSTRING_LEN(str);\ 00109 STR_SET_NOEMBED(str);\ 00110 RSTRING(str)->as.heap.aux.capa = (capacity);\ 00111 }\ 00112 }\ 00113 else {\ 00114 REALLOC_N(RSTRING(str)->as.heap.ptr, char, (capacity)+1);\ 00115 if (!STR_NOCAPA_P(str))\ 00116 RSTRING(str)->as.heap.aux.capa = (capacity);\ 00117 }\ 00118 } while (0) 00119 00120 #define is_ascii_string(str) (rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT) 00121 #define is_broken_string(str) (rb_enc_str_coderange(str) == ENC_CODERANGE_BROKEN) 00122 00123 #define STR_ENC_GET(str) rb_enc_from_index(ENCODING_GET(str)) 00124 
00125 static inline int 00126 single_byte_optimizable(VALUE str) 00127 { 00128 rb_encoding *enc; 00129 00130 /* Conservative. It may be ENC_CODERANGE_UNKNOWN. */ 00131 if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT) 00132 return 1; 00133 00134 enc = STR_ENC_GET(str); 00135 if (rb_enc_mbmaxlen(enc) == 1) 00136 return 1; 00137 00138 /* Conservative. Possibly single byte. 00139 * "\xa1" in Shift_JIS for example. */ 00140 return 0; 00141 } 00142 00143 VALUE rb_fs; 00144 00145 static inline const char * 00146 search_nonascii(const char *p, const char *e) 00147 { 00148 #if SIZEOF_VALUE == 8 00149 # define NONASCII_MASK 0x8080808080808080ULL 00150 #elif SIZEOF_VALUE == 4 00151 # define NONASCII_MASK 0x80808080UL 00152 #endif 00153 #ifdef NONASCII_MASK 00154 if ((int)sizeof(VALUE) * 2 < e - p) { 00155 const VALUE *s, *t; 00156 const VALUE lowbits = sizeof(VALUE) - 1; 00157 s = (const VALUE*)(~lowbits & ((VALUE)p + lowbits)); 00158 while (p < (const char *)s) { 00159 if (!ISASCII(*p)) 00160 return p; 00161 p++; 00162 } 00163 t = (const VALUE*)(~lowbits & (VALUE)e); 00164 while (s < t) { 00165 if (*s & NONASCII_MASK) { 00166 t = s; 00167 break; 00168 } 00169 s++; 00170 } 00171 p = (const char *)t; 00172 } 00173 #endif 00174 while (p < e) { 00175 if (!ISASCII(*p)) 00176 return p; 00177 p++; 00178 } 00179 return NULL; 00180 } 00181 00182 static int 00183 coderange_scan(const char *p, long len, rb_encoding *enc) 00184 { 00185 const char *e = p + len; 00186 00187 if (rb_enc_to_index(enc) == 0) { 00188 /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */ 00189 p = search_nonascii(p, e); 00190 return p ? 
ENC_CODERANGE_VALID : ENC_CODERANGE_7BIT; 00191 } 00192 00193 if (rb_enc_asciicompat(enc)) { 00194 p = search_nonascii(p, e); 00195 if (!p) { 00196 return ENC_CODERANGE_7BIT; 00197 } 00198 while (p < e) { 00199 int ret = rb_enc_precise_mbclen(p, e, enc); 00200 if (!MBCLEN_CHARFOUND_P(ret)) { 00201 return ENC_CODERANGE_BROKEN; 00202 } 00203 p += MBCLEN_CHARFOUND_LEN(ret); 00204 if (p < e) { 00205 p = search_nonascii(p, e); 00206 if (!p) { 00207 return ENC_CODERANGE_VALID; 00208 } 00209 } 00210 } 00211 if (e < p) { 00212 return ENC_CODERANGE_BROKEN; 00213 } 00214 return ENC_CODERANGE_VALID; 00215 } 00216 00217 while (p < e) { 00218 int ret = rb_enc_precise_mbclen(p, e, enc); 00219 00220 if (!MBCLEN_CHARFOUND_P(ret)) { 00221 return ENC_CODERANGE_BROKEN; 00222 } 00223 p += MBCLEN_CHARFOUND_LEN(ret); 00224 } 00225 if (e < p) { 00226 return ENC_CODERANGE_BROKEN; 00227 } 00228 return ENC_CODERANGE_VALID; 00229 } 00230 00231 long 00232 rb_str_coderange_scan_restartable(const char *s, const char *e, rb_encoding *enc, int *cr) 00233 { 00234 const char *p = s; 00235 00236 if (*cr == ENC_CODERANGE_BROKEN) 00237 return e - s; 00238 00239 if (rb_enc_to_index(enc) == 0) { 00240 /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */ 00241 p = search_nonascii(p, e); 00242 *cr = (!p && *cr != ENC_CODERANGE_VALID) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID; 00243 return e - s; 00244 } 00245 else if (rb_enc_asciicompat(enc)) { 00246 p = search_nonascii(p, e); 00247 if (!p) { 00248 if (*cr != ENC_CODERANGE_VALID) *cr = ENC_CODERANGE_7BIT; 00249 return e - s; 00250 } 00251 while (p < e) { 00252 int ret = rb_enc_precise_mbclen(p, e, enc); 00253 if (!MBCLEN_CHARFOUND_P(ret)) { 00254 *cr = MBCLEN_INVALID_P(ret) ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_UNKNOWN; 00255 return p - s; 00256 } 00257 p += MBCLEN_CHARFOUND_LEN(ret); 00258 if (p < e) { 00259 p = search_nonascii(p, e); 00260 if (!p) { 00261 *cr = ENC_CODERANGE_VALID; 00262 return e - s; 00263 } 00264 } 00265 } 00266 *cr = e < p ? 
ENC_CODERANGE_BROKEN: ENC_CODERANGE_VALID; 00267 return p - s; 00268 } 00269 else { 00270 while (p < e) { 00271 int ret = rb_enc_precise_mbclen(p, e, enc); 00272 if (!MBCLEN_CHARFOUND_P(ret)) { 00273 *cr = MBCLEN_INVALID_P(ret) ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_UNKNOWN; 00274 return p - s; 00275 } 00276 p += MBCLEN_CHARFOUND_LEN(ret); 00277 } 00278 *cr = e < p ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_VALID; 00279 return p - s; 00280 } 00281 } 00282 00283 static inline void 00284 str_enc_copy(VALUE str1, VALUE str2) 00285 { 00286 rb_enc_set_index(str1, ENCODING_GET(str2)); 00287 } 00288 00289 static void 00290 rb_enc_cr_str_copy_for_substr(VALUE dest, VALUE src) 00291 { 00292 /* this function is designed for copying encoding and coderange 00293 * from src to new string "dest" which is made from the part of src. 00294 */ 00295 str_enc_copy(dest, src); 00296 if (RSTRING_LEN(dest) == 0) { 00297 if (!rb_enc_asciicompat(STR_ENC_GET(src))) 00298 ENC_CODERANGE_SET(dest, ENC_CODERANGE_VALID); 00299 else 00300 ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT); 00301 return; 00302 } 00303 switch (ENC_CODERANGE(src)) { 00304 case ENC_CODERANGE_7BIT: 00305 ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT); 00306 break; 00307 case ENC_CODERANGE_VALID: 00308 if (!rb_enc_asciicompat(STR_ENC_GET(src)) || 00309 search_nonascii(RSTRING_PTR(dest), RSTRING_END(dest))) 00310 ENC_CODERANGE_SET(dest, ENC_CODERANGE_VALID); 00311 else 00312 ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT); 00313 break; 00314 default: 00315 break; 00316 } 00317 } 00318 00319 static void 00320 rb_enc_cr_str_exact_copy(VALUE dest, VALUE src) 00321 { 00322 str_enc_copy(dest, src); 00323 ENC_CODERANGE_SET(dest, ENC_CODERANGE(src)); 00324 } 00325 00326 int 00327 rb_enc_str_coderange(VALUE str) 00328 { 00329 int cr = ENC_CODERANGE(str); 00330 00331 if (cr == ENC_CODERANGE_UNKNOWN) { 00332 rb_encoding *enc = STR_ENC_GET(str); 00333 cr = coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str), enc); 00334 ENC_CODERANGE_SET(str, cr); 
00335 } 00336 return cr; 00337 } 00338 00339 int 00340 rb_enc_str_asciionly_p(VALUE str) 00341 { 00342 rb_encoding *enc = STR_ENC_GET(str); 00343 00344 if (!rb_enc_asciicompat(enc)) 00345 return FALSE; 00346 else if (rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT) 00347 return TRUE; 00348 return FALSE; 00349 } 00350 00351 static inline void 00352 str_mod_check(VALUE s, const char *p, long len) 00353 { 00354 if (RSTRING_PTR(s) != p || RSTRING_LEN(s) != len){ 00355 rb_raise(rb_eRuntimeError, "string modified"); 00356 } 00357 } 00358 00359 size_t 00360 rb_str_capacity(VALUE str) 00361 { 00362 if (STR_EMBED_P(str)) { 00363 return RSTRING_EMBED_LEN_MAX; 00364 } 00365 else if (STR_NOCAPA_P(str)) { 00366 return RSTRING(str)->as.heap.len; 00367 } 00368 else { 00369 return RSTRING(str)->as.heap.aux.capa; 00370 } 00371 } 00372 00373 static inline VALUE 00374 str_alloc(VALUE klass) 00375 { 00376 NEWOBJ_OF(str, struct RString, klass, T_STRING); 00377 00378 str->as.heap.ptr = 0; 00379 str->as.heap.len = 0; 00380 str->as.heap.aux.capa = 0; 00381 00382 return (VALUE)str; 00383 } 00384 00385 static inline VALUE 00386 empty_str_alloc(VALUE klass) 00387 { 00388 if (RUBY_DTRACE_STRING_CREATE_ENABLED()) { 00389 RUBY_DTRACE_STRING_CREATE(0, rb_sourcefile(), rb_sourceline()); 00390 } 00391 return str_alloc(klass); 00392 } 00393 00394 static VALUE 00395 str_new(VALUE klass, const char *ptr, long len) 00396 { 00397 VALUE str; 00398 00399 if (len < 0) { 00400 rb_raise(rb_eArgError, "negative string size (or size too big)"); 00401 } 00402 00403 if (RUBY_DTRACE_STRING_CREATE_ENABLED()) { 00404 RUBY_DTRACE_STRING_CREATE(len, rb_sourcefile(), rb_sourceline()); 00405 } 00406 00407 str = str_alloc(klass); 00408 if (len > RSTRING_EMBED_LEN_MAX) { 00409 RSTRING(str)->as.heap.aux.capa = len; 00410 RSTRING(str)->as.heap.ptr = ALLOC_N(char,len+1); 00411 STR_SET_NOEMBED(str); 00412 } 00413 else if (len == 0) { 00414 ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT); 00415 } 00416 if (ptr) { 00417 
memcpy(RSTRING_PTR(str), ptr, len); 00418 } 00419 STR_SET_LEN(str, len); 00420 RSTRING_PTR(str)[len] = '\0'; 00421 return str; 00422 } 00423 00424 VALUE 00425 rb_str_new(const char *ptr, long len) 00426 { 00427 return str_new(rb_cString, ptr, len); 00428 } 00429 00430 VALUE 00431 rb_usascii_str_new(const char *ptr, long len) 00432 { 00433 VALUE str = rb_str_new(ptr, len); 00434 ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT); 00435 return str; 00436 } 00437 00438 VALUE 00439 rb_enc_str_new(const char *ptr, long len, rb_encoding *enc) 00440 { 00441 VALUE str = rb_str_new(ptr, len); 00442 rb_enc_associate(str, enc); 00443 return str; 00444 } 00445 00446 VALUE 00447 rb_str_new_cstr(const char *ptr) 00448 { 00449 if (!ptr) { 00450 rb_raise(rb_eArgError, "NULL pointer given"); 00451 } 00452 return rb_str_new(ptr, strlen(ptr)); 00453 } 00454 00455 RUBY_ALIAS_FUNCTION(rb_str_new2(const char *ptr), rb_str_new_cstr, (ptr)) 00456 #define rb_str_new2 rb_str_new_cstr 00457 00458 VALUE 00459 rb_usascii_str_new_cstr(const char *ptr) 00460 { 00461 VALUE str = rb_str_new2(ptr); 00462 ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT); 00463 return str; 00464 } 00465 00466 RUBY_ALIAS_FUNCTION(rb_usascii_str_new2(const char *ptr), rb_usascii_str_new_cstr, (ptr)) 00467 #define rb_usascii_str_new2 rb_usascii_str_new_cstr 00468 00469 VALUE 00470 rb_tainted_str_new(const char *ptr, long len) 00471 { 00472 VALUE str = rb_str_new(ptr, len); 00473 00474 OBJ_TAINT(str); 00475 return str; 00476 } 00477 00478 VALUE 00479 rb_tainted_str_new_cstr(const char *ptr) 00480 { 00481 VALUE str = rb_str_new2(ptr); 00482 00483 OBJ_TAINT(str); 00484 return str; 00485 } 00486 00487 RUBY_ALIAS_FUNCTION(rb_tainted_str_new2(const char *ptr), rb_tainted_str_new_cstr, (ptr)) 00488 #define rb_tainted_str_new2 rb_tainted_str_new_cstr 00489 00490 VALUE 00491 rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts) 00492 { 00493 
extern VALUE rb_cEncodingConverter; 00494 rb_econv_t *ec; 00495 rb_econv_result_t ret; 00496 long len, olen; 00497 VALUE econv_wrapper; 00498 VALUE newstr; 00499 const unsigned char *start, *sp; 00500 unsigned char *dest, *dp; 00501 size_t converted_output = 0; 00502 00503 if (!to) return str; 00504 if (!from) from = rb_enc_get(str); 00505 if (from == to) return str; 00506 if ((rb_enc_asciicompat(to) && ENC_CODERANGE(str) == ENC_CODERANGE_7BIT) || 00507 to == rb_ascii8bit_encoding()) { 00508 if (STR_ENC_GET(str) != to) { 00509 str = rb_str_dup(str); 00510 rb_enc_associate(str, to); 00511 } 00512 return str; 00513 } 00514 00515 len = RSTRING_LEN(str); 00516 newstr = rb_str_new(0, len); 00517 olen = len; 00518 00519 econv_wrapper = rb_obj_alloc(rb_cEncodingConverter); 00520 RBASIC(econv_wrapper)->klass = 0; 00521 ec = rb_econv_open_opts(from->name, to->name, ecflags, ecopts); 00522 if (!ec) return str; 00523 DATA_PTR(econv_wrapper) = ec; 00524 00525 sp = (unsigned char*)RSTRING_PTR(str); 00526 start = sp; 00527 while ((dest = (unsigned char*)RSTRING_PTR(newstr)), 00528 (dp = dest + converted_output), 00529 (ret = rb_econv_convert(ec, &sp, start + len, &dp, dest + olen, 0)), 00530 ret == econv_destination_buffer_full) { 00531 /* destination buffer short */ 00532 size_t converted_input = sp - start; 00533 size_t rest = len - converted_input; 00534 converted_output = dp - dest; 00535 rb_str_set_len(newstr, converted_output); 00536 if (converted_input && converted_output && 00537 rest < (LONG_MAX / converted_output)) { 00538 rest = (rest * converted_output) / converted_input; 00539 } 00540 else { 00541 rest = olen; 00542 } 00543 olen += rest < 2 ? 
2 : rest; 00544 rb_str_resize(newstr, olen); 00545 } 00546 DATA_PTR(econv_wrapper) = 0; 00547 rb_econv_close(ec); 00548 rb_gc_force_recycle(econv_wrapper); 00549 switch (ret) { 00550 case econv_finished: 00551 len = dp - (unsigned char*)RSTRING_PTR(newstr); 00552 rb_str_set_len(newstr, len); 00553 rb_enc_associate(newstr, to); 00554 return newstr; 00555 00556 default: 00557 /* some error, return original */ 00558 return str; 00559 } 00560 } 00561 00562 VALUE 00563 rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to) 00564 { 00565 return rb_str_conv_enc_opts(str, from, to, 0, Qnil); 00566 } 00567 00568 VALUE 00569 rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *eenc) 00570 { 00571 VALUE str; 00572 00573 str = rb_tainted_str_new(ptr, len); 00574 if (eenc == rb_usascii_encoding() && 00575 rb_enc_str_coderange(str) != ENC_CODERANGE_7BIT) { 00576 rb_enc_associate(str, rb_ascii8bit_encoding()); 00577 return str; 00578 } 00579 rb_enc_associate(str, eenc); 00580 return rb_str_conv_enc(str, eenc, rb_default_internal_encoding()); 00581 } 00582 00583 VALUE 00584 rb_external_str_new(const char *ptr, long len) 00585 { 00586 return rb_external_str_new_with_enc(ptr, len, rb_default_external_encoding()); 00587 } 00588 00589 VALUE 00590 rb_external_str_new_cstr(const char *ptr) 00591 { 00592 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_default_external_encoding()); 00593 } 00594 00595 VALUE 00596 rb_locale_str_new(const char *ptr, long len) 00597 { 00598 return rb_external_str_new_with_enc(ptr, len, rb_locale_encoding()); 00599 } 00600 00601 VALUE 00602 rb_locale_str_new_cstr(const char *ptr) 00603 { 00604 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_locale_encoding()); 00605 } 00606 00607 VALUE 00608 rb_filesystem_str_new(const char *ptr, long len) 00609 { 00610 return rb_external_str_new_with_enc(ptr, len, rb_filesystem_encoding()); 00611 } 00612 00613 VALUE 00614 rb_filesystem_str_new_cstr(const char *ptr) 00615 { 00616 
return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_filesystem_encoding()); 00617 } 00618 00619 VALUE 00620 rb_str_export(VALUE str) 00621 { 00622 return rb_str_conv_enc(str, STR_ENC_GET(str), rb_default_external_encoding()); 00623 } 00624 00625 VALUE 00626 rb_str_export_locale(VALUE str) 00627 { 00628 return rb_str_conv_enc(str, STR_ENC_GET(str), rb_locale_encoding()); 00629 } 00630 00631 VALUE 00632 rb_str_export_to_enc(VALUE str, rb_encoding *enc) 00633 { 00634 return rb_str_conv_enc(str, STR_ENC_GET(str), enc); 00635 } 00636 00637 static VALUE 00638 str_replace_shared_without_enc(VALUE str2, VALUE str) 00639 { 00640 if (RSTRING_LEN(str) <= RSTRING_EMBED_LEN_MAX) { 00641 STR_SET_EMBED(str2); 00642 memcpy(RSTRING_PTR(str2), RSTRING_PTR(str), RSTRING_LEN(str)+1); 00643 STR_SET_EMBED_LEN(str2, RSTRING_LEN(str)); 00644 } 00645 else { 00646 str = rb_str_new_frozen(str); 00647 FL_SET(str2, STR_NOEMBED); 00648 RSTRING(str2)->as.heap.len = RSTRING_LEN(str); 00649 RSTRING(str2)->as.heap.ptr = RSTRING_PTR(str); 00650 RSTRING(str2)->as.heap.aux.shared = str; 00651 FL_SET(str2, ELTS_SHARED); 00652 } 00653 return str2; 00654 } 00655 00656 static VALUE 00657 str_replace_shared(VALUE str2, VALUE str) 00658 { 00659 str_replace_shared_without_enc(str2, str); 00660 rb_enc_cr_str_exact_copy(str2, str); 00661 return str2; 00662 } 00663 00664 static VALUE 00665 str_new_shared(VALUE klass, VALUE str) 00666 { 00667 return str_replace_shared(str_alloc(klass), str); 00668 } 00669 00670 static VALUE 00671 str_new3(VALUE klass, VALUE str) 00672 { 00673 return str_new_shared(klass, str); 00674 } 00675 00676 VALUE 00677 rb_str_new_shared(VALUE str) 00678 { 00679 VALUE str2 = str_new3(rb_obj_class(str), str); 00680 00681 OBJ_INFECT(str2, str); 00682 return str2; 00683 } 00684 00685 RUBY_ALIAS_FUNCTION(rb_str_new3(VALUE str), rb_str_new_shared, (str)) 00686 #define rb_str_new3 rb_str_new_shared 00687 00688 static VALUE 00689 str_new4(VALUE klass, VALUE str) 00690 { 00691 VALUE str2; 
00692 00693 str2 = str_alloc(klass); 00694 STR_SET_NOEMBED(str2); 00695 RSTRING(str2)->as.heap.len = RSTRING_LEN(str); 00696 RSTRING(str2)->as.heap.ptr = RSTRING_PTR(str); 00697 if (STR_SHARED_P(str)) { 00698 VALUE shared = RSTRING(str)->as.heap.aux.shared; 00699 assert(OBJ_FROZEN(shared)); 00700 FL_SET(str2, ELTS_SHARED); 00701 RSTRING(str2)->as.heap.aux.shared = shared; 00702 } 00703 else { 00704 FL_SET(str, ELTS_SHARED); 00705 RSTRING(str)->as.heap.aux.shared = str2; 00706 } 00707 rb_enc_cr_str_exact_copy(str2, str); 00708 OBJ_INFECT(str2, str); 00709 return str2; 00710 } 00711 00712 VALUE 00713 rb_str_new_frozen(VALUE orig) 00714 { 00715 VALUE klass, str; 00716 00717 if (OBJ_FROZEN(orig)) return orig; 00718 klass = rb_obj_class(orig); 00719 if (STR_SHARED_P(orig) && (str = RSTRING(orig)->as.heap.aux.shared)) { 00720 long ofs; 00721 assert(OBJ_FROZEN(str)); 00722 ofs = RSTRING_LEN(str) - RSTRING_LEN(orig); 00723 if ((ofs > 0) || (klass != RBASIC(str)->klass) || 00724 ((RBASIC(str)->flags ^ RBASIC(orig)->flags) & (FL_TAINT|FL_UNTRUSTED)) || 00725 ENCODING_GET(str) != ENCODING_GET(orig)) { 00726 str = str_new3(klass, str); 00727 RSTRING(str)->as.heap.ptr += ofs; 00728 RSTRING(str)->as.heap.len -= ofs; 00729 rb_enc_cr_str_exact_copy(str, orig); 00730 OBJ_INFECT(str, orig); 00731 } 00732 } 00733 else if (STR_EMBED_P(orig)) { 00734 str = str_new(klass, RSTRING_PTR(orig), RSTRING_LEN(orig)); 00735 rb_enc_cr_str_exact_copy(str, orig); 00736 OBJ_INFECT(str, orig); 00737 } 00738 else if (STR_ASSOC_P(orig)) { 00739 VALUE assoc = RSTRING(orig)->as.heap.aux.shared; 00740 FL_UNSET(orig, STR_ASSOC); 00741 str = str_new4(klass, orig); 00742 FL_SET(str, STR_ASSOC); 00743 RSTRING(str)->as.heap.aux.shared = assoc; 00744 } 00745 else { 00746 str = str_new4(klass, orig); 00747 } 00748 OBJ_FREEZE(str); 00749 return str; 00750 } 00751 00752 RUBY_ALIAS_FUNCTION(rb_str_new4(VALUE orig), rb_str_new_frozen, (orig)) 00753 #define rb_str_new4 rb_str_new_frozen 00754 00755 VALUE 00756 
rb_str_new_with_class(VALUE obj, const char *ptr, long len) 00757 { 00758 return str_new(rb_obj_class(obj), ptr, len); 00759 } 00760 00761 RUBY_ALIAS_FUNCTION(rb_str_new5(VALUE obj, const char *ptr, long len), 00762 rb_str_new_with_class, (obj, ptr, len)) 00763 #define rb_str_new5 rb_str_new_with_class 00764 00765 static VALUE 00766 str_new_empty(VALUE str) 00767 { 00768 VALUE v = rb_str_new5(str, 0, 0); 00769 rb_enc_copy(v, str); 00770 OBJ_INFECT(v, str); 00771 return v; 00772 } 00773 00774 #define STR_BUF_MIN_SIZE 128 00775 00776 VALUE 00777 rb_str_buf_new(long capa) 00778 { 00779 VALUE str = str_alloc(rb_cString); 00780 00781 if (capa < STR_BUF_MIN_SIZE) { 00782 capa = STR_BUF_MIN_SIZE; 00783 } 00784 FL_SET(str, STR_NOEMBED); 00785 RSTRING(str)->as.heap.aux.capa = capa; 00786 RSTRING(str)->as.heap.ptr = ALLOC_N(char, capa+1); 00787 RSTRING(str)->as.heap.ptr[0] = '\0'; 00788 00789 return str; 00790 } 00791 00792 VALUE 00793 rb_str_buf_new_cstr(const char *ptr) 00794 { 00795 VALUE str; 00796 long len = strlen(ptr); 00797 00798 str = rb_str_buf_new(len); 00799 rb_str_buf_cat(str, ptr, len); 00800 00801 return str; 00802 } 00803 00804 RUBY_ALIAS_FUNCTION(rb_str_buf_new2(const char *ptr), rb_str_buf_new_cstr, (ptr)) 00805 #define rb_str_buf_new2 rb_str_buf_new_cstr 00806 00807 VALUE 00808 rb_str_tmp_new(long len) 00809 { 00810 return str_new(0, 0, len); 00811 } 00812 00813 void * 00814 rb_alloc_tmp_buffer(volatile VALUE *store, long len) 00815 { 00816 VALUE s = rb_str_tmp_new(len); 00817 *store = s; 00818 return RSTRING_PTR(s); 00819 } 00820 00821 void 00822 rb_free_tmp_buffer(volatile VALUE *store) 00823 { 00824 VALUE s = *store; 00825 *store = 0; 00826 if (s) rb_str_clear(s); 00827 } 00828 00829 void 00830 rb_str_free(VALUE str) 00831 { 00832 if (!STR_EMBED_P(str) && !STR_SHARED_P(str)) { 00833 xfree(RSTRING(str)->as.heap.ptr); 00834 } 00835 } 00836 00837 RUBY_FUNC_EXPORTED size_t 00838 rb_str_memsize(VALUE str) 00839 { 00840 if (!STR_EMBED_P(str) && 
!STR_SHARED_P(str)) { 00841 return RSTRING(str)->as.heap.aux.capa; 00842 } 00843 else { 00844 return 0; 00845 } 00846 } 00847 00848 VALUE 00849 rb_str_to_str(VALUE str) 00850 { 00851 return rb_convert_type(str, T_STRING, "String", "to_str"); 00852 } 00853 00854 static inline void str_discard(VALUE str); 00855 00856 void 00857 rb_str_shared_replace(VALUE str, VALUE str2) 00858 { 00859 rb_encoding *enc; 00860 int cr; 00861 if (str == str2) return; 00862 enc = STR_ENC_GET(str2); 00863 cr = ENC_CODERANGE(str2); 00864 str_discard(str); 00865 OBJ_INFECT(str, str2); 00866 if (RSTRING_LEN(str2) <= RSTRING_EMBED_LEN_MAX) { 00867 STR_SET_EMBED(str); 00868 memcpy(RSTRING_PTR(str), RSTRING_PTR(str2), RSTRING_LEN(str2)+1); 00869 STR_SET_EMBED_LEN(str, RSTRING_LEN(str2)); 00870 rb_enc_associate(str, enc); 00871 ENC_CODERANGE_SET(str, cr); 00872 return; 00873 } 00874 STR_SET_NOEMBED(str); 00875 STR_UNSET_NOCAPA(str); 00876 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2); 00877 RSTRING(str)->as.heap.len = RSTRING_LEN(str2); 00878 if (STR_NOCAPA_P(str2)) { 00879 FL_SET(str, RBASIC(str2)->flags & STR_NOCAPA); 00880 RSTRING(str)->as.heap.aux.shared = RSTRING(str2)->as.heap.aux.shared; 00881 } 00882 else { 00883 RSTRING(str)->as.heap.aux.capa = RSTRING(str2)->as.heap.aux.capa; 00884 } 00885 STR_SET_EMBED(str2); /* abandon str2 */ 00886 RSTRING_PTR(str2)[0] = 0; 00887 STR_SET_EMBED_LEN(str2, 0); 00888 rb_enc_associate(str, enc); 00889 ENC_CODERANGE_SET(str, cr); 00890 } 00891 00892 static ID id_to_s; 00893 00894 VALUE 00895 rb_obj_as_string(VALUE obj) 00896 { 00897 VALUE str; 00898 00899 if (RB_TYPE_P(obj, T_STRING)) { 00900 return obj; 00901 } 00902 str = rb_funcall(obj, id_to_s, 0); 00903 if (!RB_TYPE_P(str, T_STRING)) 00904 return rb_any_to_s(obj); 00905 if (OBJ_TAINTED(obj)) OBJ_TAINT(str); 00906 return str; 00907 } 00908 00909 static VALUE 00910 str_replace(VALUE str, VALUE str2) 00911 { 00912 long len; 00913 00914 len = RSTRING_LEN(str2); 00915 if (STR_ASSOC_P(str2)) { 00916 str2 = 
rb_str_new4(str2); 00917 } 00918 if (STR_SHARED_P(str2)) { 00919 VALUE shared = RSTRING(str2)->as.heap.aux.shared; 00920 assert(OBJ_FROZEN(shared)); 00921 STR_SET_NOEMBED(str); 00922 RSTRING(str)->as.heap.len = len; 00923 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2); 00924 FL_SET(str, ELTS_SHARED); 00925 FL_UNSET(str, STR_ASSOC); 00926 RSTRING(str)->as.heap.aux.shared = shared; 00927 } 00928 else { 00929 str_replace_shared(str, str2); 00930 } 00931 00932 OBJ_INFECT(str, str2); 00933 rb_enc_cr_str_exact_copy(str, str2); 00934 return str; 00935 } 00936 00937 static VALUE 00938 str_duplicate(VALUE klass, VALUE str) 00939 { 00940 VALUE dup = str_alloc(klass); 00941 str_replace(dup, str); 00942 return dup; 00943 } 00944 00945 VALUE 00946 rb_str_dup(VALUE str) 00947 { 00948 return str_duplicate(rb_obj_class(str), str); 00949 } 00950 00951 VALUE 00952 rb_str_resurrect(VALUE str) 00953 { 00954 if (RUBY_DTRACE_STRING_CREATE_ENABLED()) { 00955 RUBY_DTRACE_STRING_CREATE(RSTRING_LEN(str), 00956 rb_sourcefile(), rb_sourceline()); 00957 } 00958 return str_replace(str_alloc(rb_cString), str); 00959 } 00960 00961 /* 00962 * call-seq: 00963 * String.new(str="") -> new_str 00964 * 00965 * Returns a new string object containing a copy of <i>str</i>. 
00966 */ 00967 00968 static VALUE 00969 rb_str_init(int argc, VALUE *argv, VALUE str) 00970 { 00971 VALUE orig; 00972 00973 if (argc > 0 && rb_scan_args(argc, argv, "01", &orig) == 1) 00974 rb_str_replace(str, orig); 00975 return str; 00976 } 00977 00978 static inline long 00979 enc_strlen(const char *p, const char *e, rb_encoding *enc, int cr) 00980 { 00981 long c; 00982 const char *q; 00983 00984 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) { 00985 return (e - p + rb_enc_mbminlen(enc) - 1) / rb_enc_mbminlen(enc); 00986 } 00987 else if (rb_enc_asciicompat(enc)) { 00988 c = 0; 00989 if (cr == ENC_CODERANGE_7BIT || cr == ENC_CODERANGE_VALID) { 00990 while (p < e) { 00991 if (ISASCII(*p)) { 00992 q = search_nonascii(p, e); 00993 if (!q) 00994 return c + (e - p); 00995 c += q - p; 00996 p = q; 00997 } 00998 p += rb_enc_fast_mbclen(p, e, enc); 00999 c++; 01000 } 01001 } 01002 else { 01003 while (p < e) { 01004 if (ISASCII(*p)) { 01005 q = search_nonascii(p, e); 01006 if (!q) 01007 return c + (e - p); 01008 c += q - p; 01009 p = q; 01010 } 01011 p += rb_enc_mbclen(p, e, enc); 01012 c++; 01013 } 01014 } 01015 return c; 01016 } 01017 01018 for (c=0; p<e; c++) { 01019 p += rb_enc_mbclen(p, e, enc); 01020 } 01021 return c; 01022 } 01023 01024 long 01025 rb_enc_strlen(const char *p, const char *e, rb_encoding *enc) 01026 { 01027 return enc_strlen(p, e, enc, ENC_CODERANGE_UNKNOWN); 01028 } 01029 01030 long 01031 rb_enc_strlen_cr(const char *p, const char *e, rb_encoding *enc, int *cr) 01032 { 01033 long c; 01034 const char *q; 01035 int ret; 01036 01037 *cr = 0; 01038 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) { 01039 return (e - p + rb_enc_mbminlen(enc) - 1) / rb_enc_mbminlen(enc); 01040 } 01041 else if (rb_enc_asciicompat(enc)) { 01042 c = 0; 01043 while (p < e) { 01044 if (ISASCII(*p)) { 01045 q = search_nonascii(p, e); 01046 if (!q) { 01047 if (!*cr) *cr = ENC_CODERANGE_7BIT; 01048 return c + (e - p); 01049 } 01050 c += q - p; 01051 p = q; 01052 } 01053 ret = 
rb_enc_precise_mbclen(p, e, enc); 01054 if (MBCLEN_CHARFOUND_P(ret)) { 01055 *cr |= ENC_CODERANGE_VALID; 01056 p += MBCLEN_CHARFOUND_LEN(ret); 01057 } 01058 else { 01059 *cr = ENC_CODERANGE_BROKEN; 01060 p++; 01061 } 01062 c++; 01063 } 01064 if (!*cr) *cr = ENC_CODERANGE_7BIT; 01065 return c; 01066 } 01067 01068 for (c=0; p<e; c++) { 01069 ret = rb_enc_precise_mbclen(p, e, enc); 01070 if (MBCLEN_CHARFOUND_P(ret)) { 01071 *cr |= ENC_CODERANGE_VALID; 01072 p += MBCLEN_CHARFOUND_LEN(ret); 01073 } 01074 else { 01075 *cr = ENC_CODERANGE_BROKEN; 01076 if (p + rb_enc_mbminlen(enc) <= e) 01077 p += rb_enc_mbminlen(enc); 01078 else 01079 p = e; 01080 } 01081 } 01082 if (!*cr) *cr = ENC_CODERANGE_7BIT; 01083 return c; 01084 } 01085 01086 #ifdef NONASCII_MASK 01087 #define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80) 01088 01089 /* 01090 * UTF-8 leading bytes have either 0xxxxxxx or 11xxxxxx 01091 * bit represention. (see http://en.wikipedia.org/wiki/UTF-8) 01092 * Therefore, following pseudo code can detect UTF-8 leading byte. 01093 * 01094 * if (!(byte & 0x80)) 01095 * byte |= 0x40; // turn on bit6 01096 * return ((byte>>6) & 1); // bit6 represent it's leading byte or not. 01097 * 01098 * This function calculate every bytes in the argument word `s' 01099 * using the above logic concurrently. and gather every bytes result. 01100 */ 01101 static inline VALUE 01102 count_utf8_lead_bytes_with_word(const VALUE *s) 01103 { 01104 VALUE d = *s; 01105 01106 /* Transform into bit0 represent UTF-8 leading or not. */ 01107 d |= ~(d>>1); 01108 d >>= 6; 01109 d &= NONASCII_MASK >> 7; 01110 01111 /* Gather every bytes. 
*/ 01112 d += (d>>8); 01113 d += (d>>16); 01114 #if SIZEOF_VALUE == 8 01115 d += (d>>32); 01116 #endif 01117 return (d&0xF); 01118 } 01119 #endif 01120 01121 static long 01122 str_strlen(VALUE str, rb_encoding *enc) 01123 { 01124 const char *p, *e; 01125 long n; 01126 int cr; 01127 01128 if (single_byte_optimizable(str)) return RSTRING_LEN(str); 01129 if (!enc) enc = STR_ENC_GET(str); 01130 p = RSTRING_PTR(str); 01131 e = RSTRING_END(str); 01132 cr = ENC_CODERANGE(str); 01133 #ifdef NONASCII_MASK 01134 if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID && 01135 enc == rb_utf8_encoding()) { 01136 01137 VALUE len = 0; 01138 if ((int)sizeof(VALUE) * 2 < e - p) { 01139 const VALUE *s, *t; 01140 const VALUE lowbits = sizeof(VALUE) - 1; 01141 s = (const VALUE*)(~lowbits & ((VALUE)p + lowbits)); 01142 t = (const VALUE*)(~lowbits & (VALUE)e); 01143 while (p < (const char *)s) { 01144 if (is_utf8_lead_byte(*p)) len++; 01145 p++; 01146 } 01147 while (s < t) { 01148 len += count_utf8_lead_bytes_with_word(s); 01149 s++; 01150 } 01151 p = (const char *)s; 01152 } 01153 while (p < e) { 01154 if (is_utf8_lead_byte(*p)) len++; 01155 p++; 01156 } 01157 return (long)len; 01158 } 01159 #endif 01160 n = rb_enc_strlen_cr(p, e, enc, &cr); 01161 if (cr) { 01162 ENC_CODERANGE_SET(str, cr); 01163 } 01164 return n; 01165 } 01166 01167 long 01168 rb_str_strlen(VALUE str) 01169 { 01170 return str_strlen(str, STR_ENC_GET(str)); 01171 } 01172 01173 /* 01174 * call-seq: 01175 * str.length -> integer 01176 * str.size -> integer 01177 * 01178 * Returns the character length of <i>str</i>. 01179 */ 01180 01181 VALUE 01182 rb_str_length(VALUE str) 01183 { 01184 long len; 01185 01186 len = str_strlen(str, STR_ENC_GET(str)); 01187 return LONG2NUM(len); 01188 } 01189 01190 /* 01191 * call-seq: 01192 * str.bytesize -> integer 01193 * 01194 * Returns the length of +str+ in bytes. 
01195 * 01196 * "\x80\u3042".bytesize #=> 4 01197 * "hello".bytesize #=> 5 01198 */ 01199 01200 static VALUE 01201 rb_str_bytesize(VALUE str) 01202 { 01203 return LONG2NUM(RSTRING_LEN(str)); 01204 } 01205 01206 /* 01207 * call-seq: 01208 * str.empty? -> true or false 01209 * 01210 * Returns <code>true</code> if <i>str</i> has a length of zero. 01211 * 01212 * "hello".empty? #=> false 01213 * " ".empty? #=> false 01214 * "".empty? #=> true 01215 */ 01216 01217 static VALUE 01218 rb_str_empty(VALUE str) 01219 { 01220 if (RSTRING_LEN(str) == 0) 01221 return Qtrue; 01222 return Qfalse; 01223 } 01224 01225 /* 01226 * call-seq: 01227 * str + other_str -> new_str 01228 * 01229 * Concatenation---Returns a new <code>String</code> containing 01230 * <i>other_str</i> concatenated to <i>str</i>. 01231 * 01232 * "Hello from " + self.to_s #=> "Hello from main" 01233 */ 01234 01235 VALUE 01236 rb_str_plus(VALUE str1, VALUE str2) 01237 { 01238 VALUE str3; 01239 rb_encoding *enc; 01240 01241 StringValue(str2); 01242 enc = rb_enc_check(str1, str2); 01243 str3 = rb_str_new(0, RSTRING_LEN(str1)+RSTRING_LEN(str2)); 01244 memcpy(RSTRING_PTR(str3), RSTRING_PTR(str1), RSTRING_LEN(str1)); 01245 memcpy(RSTRING_PTR(str3) + RSTRING_LEN(str1), 01246 RSTRING_PTR(str2), RSTRING_LEN(str2)); 01247 RSTRING_PTR(str3)[RSTRING_LEN(str3)] = '\0'; 01248 01249 if (OBJ_TAINTED(str1) || OBJ_TAINTED(str2)) 01250 OBJ_TAINT(str3); 01251 ENCODING_CODERANGE_SET(str3, rb_enc_to_index(enc), 01252 ENC_CODERANGE_AND(ENC_CODERANGE(str1), ENC_CODERANGE(str2))); 01253 return str3; 01254 } 01255 01256 /* 01257 * call-seq: 01258 * str * integer -> new_str 01259 * 01260 * Copy --- Returns a new String containing +integer+ copies of the receiver. 01261 * +integer+ must be greater than or equal to 0. 01262 * 01263 * "Ho! " * 3 #=> "Ho! Ho! Ho! " 01264 * "Ho! 
" * 0 #=> "" 01265 */ 01266 01267 VALUE 01268 rb_str_times(VALUE str, VALUE times) 01269 { 01270 VALUE str2; 01271 long n, len; 01272 char *ptr2; 01273 01274 len = NUM2LONG(times); 01275 if (len < 0) { 01276 rb_raise(rb_eArgError, "negative argument"); 01277 } 01278 if (len && LONG_MAX/len < RSTRING_LEN(str)) { 01279 rb_raise(rb_eArgError, "argument too big"); 01280 } 01281 01282 str2 = rb_str_new5(str, 0, len *= RSTRING_LEN(str)); 01283 ptr2 = RSTRING_PTR(str2); 01284 if (len) { 01285 n = RSTRING_LEN(str); 01286 memcpy(ptr2, RSTRING_PTR(str), n); 01287 while (n <= len/2) { 01288 memcpy(ptr2 + n, ptr2, n); 01289 n *= 2; 01290 } 01291 memcpy(ptr2 + n, ptr2, len-n); 01292 } 01293 ptr2[RSTRING_LEN(str2)] = '\0'; 01294 OBJ_INFECT(str2, str); 01295 rb_enc_cr_str_copy_for_substr(str2, str); 01296 01297 return str2; 01298 } 01299 01300 /* 01301 * call-seq: 01302 * str % arg -> new_str 01303 * 01304 * Format---Uses <i>str</i> as a format specification, and returns the result 01305 * of applying it to <i>arg</i>. If the format specification contains more than 01306 * one substitution, then <i>arg</i> must be an <code>Array</code> or <code>Hash</code> 01307 * containing the values to be substituted. See <code>Kernel::sprintf</code> for 01308 * details of the format string. 
01309 * 01310 * "%05d" % 123 #=> "00123" 01311 * "%-5s: %08x" % [ "ID", self.object_id ] #=> "ID : 200e14d6" 01312 * "foo = %{foo}" % { :foo => 'bar' } #=> "foo = bar" 01313 */ 01314 01315 static VALUE 01316 rb_str_format_m(VALUE str, VALUE arg) 01317 { 01318 volatile VALUE tmp = rb_check_array_type(arg); 01319 01320 if (!NIL_P(tmp)) { 01321 return rb_str_format(RARRAY_LENINT(tmp), RARRAY_PTR(tmp), str); 01322 } 01323 return rb_str_format(1, &arg, str); 01324 } 01325 01326 static inline void 01327 str_modifiable(VALUE str) 01328 { 01329 if (FL_TEST(str, STR_TMPLOCK)) { 01330 rb_raise(rb_eRuntimeError, "can't modify string; temporarily locked"); 01331 } 01332 rb_check_frozen(str); 01333 if (!OBJ_UNTRUSTED(str) && rb_safe_level() >= 4) 01334 rb_raise(rb_eSecurityError, "Insecure: can't modify string"); 01335 } 01336 01337 static inline int 01338 str_independent(VALUE str) 01339 { 01340 str_modifiable(str); 01341 if (!STR_SHARED_P(str)) return 1; 01342 if (STR_EMBED_P(str)) return 1; 01343 return 0; 01344 } 01345 01346 static void 01347 str_make_independent_expand(VALUE str, long expand) 01348 { 01349 char *ptr; 01350 long len = RSTRING_LEN(str); 01351 long capa = len + expand; 01352 01353 if (len > capa) len = capa; 01354 ptr = ALLOC_N(char, capa + 1); 01355 if (RSTRING_PTR(str)) { 01356 memcpy(ptr, RSTRING_PTR(str), len); 01357 } 01358 STR_SET_NOEMBED(str); 01359 STR_UNSET_NOCAPA(str); 01360 ptr[len] = 0; 01361 RSTRING(str)->as.heap.ptr = ptr; 01362 RSTRING(str)->as.heap.len = len; 01363 RSTRING(str)->as.heap.aux.capa = capa; 01364 } 01365 01366 #define str_make_independent(str) str_make_independent_expand((str), 0L) 01367 01368 void 01369 rb_str_modify(VALUE str) 01370 { 01371 if (!str_independent(str)) 01372 str_make_independent(str); 01373 ENC_CODERANGE_CLEAR(str); 01374 } 01375 01376 void 01377 rb_str_modify_expand(VALUE str, long expand) 01378 { 01379 if (expand < 0) { 01380 rb_raise(rb_eArgError, "negative expanding string size"); 01381 } 01382 if 
(!str_independent(str)) { 01383 str_make_independent_expand(str, expand); 01384 } 01385 else if (expand > 0) { 01386 long len = RSTRING_LEN(str); 01387 long capa = len + expand; 01388 if (!STR_EMBED_P(str)) { 01389 REALLOC_N(RSTRING(str)->as.heap.ptr, char, capa+1); 01390 RSTRING(str)->as.heap.aux.capa = capa; 01391 } 01392 else if (capa > RSTRING_EMBED_LEN_MAX) { 01393 str_make_independent_expand(str, expand); 01394 } 01395 } 01396 ENC_CODERANGE_CLEAR(str); 01397 } 01398 01399 /* As rb_str_modify(), but don't clear coderange */ 01400 static void 01401 str_modify_keep_cr(VALUE str) 01402 { 01403 if (!str_independent(str)) 01404 str_make_independent(str); 01405 if (ENC_CODERANGE(str) == ENC_CODERANGE_BROKEN) 01406 /* Force re-scan later */ 01407 ENC_CODERANGE_CLEAR(str); 01408 } 01409 01410 static inline void 01411 str_discard(VALUE str) 01412 { 01413 str_modifiable(str); 01414 if (!STR_SHARED_P(str) && !STR_EMBED_P(str)) { 01415 xfree(RSTRING_PTR(str)); 01416 RSTRING(str)->as.heap.ptr = 0; 01417 RSTRING(str)->as.heap.len = 0; 01418 } 01419 } 01420 01421 void 01422 rb_str_associate(VALUE str, VALUE add) 01423 { 01424 /* sanity check */ 01425 rb_check_frozen(str); 01426 if (STR_ASSOC_P(str)) { 01427 /* already associated */ 01428 rb_ary_concat(RSTRING(str)->as.heap.aux.shared, add); 01429 } 01430 else { 01431 if (STR_SHARED_P(str)) { 01432 VALUE assoc = RSTRING(str)->as.heap.aux.shared; 01433 str_make_independent(str); 01434 if (STR_ASSOC_P(assoc)) { 01435 assoc = RSTRING(assoc)->as.heap.aux.shared; 01436 rb_ary_concat(assoc, add); 01437 add = assoc; 01438 } 01439 } 01440 else if (STR_EMBED_P(str)) { 01441 str_make_independent(str); 01442 } 01443 else if (RSTRING(str)->as.heap.aux.capa != RSTRING_LEN(str)) { 01444 RESIZE_CAPA(str, RSTRING_LEN(str)); 01445 } 01446 FL_SET(str, STR_ASSOC); 01447 RBASIC(add)->klass = 0; 01448 RSTRING(str)->as.heap.aux.shared = add; 01449 } 01450 } 01451 01452 VALUE 01453 rb_str_associated(VALUE str) 01454 { 01455 if (STR_SHARED_P(str)) 
str = RSTRING(str)->as.heap.aux.shared; 01456 if (STR_ASSOC_P(str)) { 01457 return RSTRING(str)->as.heap.aux.shared; 01458 } 01459 return Qfalse; 01460 } 01461 01462 void 01463 rb_must_asciicompat(VALUE str) 01464 { 01465 rb_encoding *enc = rb_enc_get(str); 01466 if (!rb_enc_asciicompat(enc)) { 01467 rb_raise(rb_eEncCompatError, "ASCII incompatible encoding: %s", rb_enc_name(enc)); 01468 } 01469 } 01470 01471 VALUE 01472 rb_string_value(volatile VALUE *ptr) 01473 { 01474 VALUE s = *ptr; 01475 if (!RB_TYPE_P(s, T_STRING)) { 01476 s = rb_str_to_str(s); 01477 *ptr = s; 01478 } 01479 return s; 01480 } 01481 01482 char * 01483 rb_string_value_ptr(volatile VALUE *ptr) 01484 { 01485 VALUE str = rb_string_value(ptr); 01486 return RSTRING_PTR(str); 01487 } 01488 01489 char * 01490 rb_string_value_cstr(volatile VALUE *ptr) 01491 { 01492 VALUE str = rb_string_value(ptr); 01493 char *s = RSTRING_PTR(str); 01494 long len = RSTRING_LEN(str); 01495 01496 if (!s || memchr(s, 0, len)) { 01497 rb_raise(rb_eArgError, "string contains null byte"); 01498 } 01499 if (s[len]) { 01500 rb_str_modify(str); 01501 s = RSTRING_PTR(str); 01502 s[RSTRING_LEN(str)] = 0; 01503 } 01504 return s; 01505 } 01506 01507 VALUE 01508 rb_check_string_type(VALUE str) 01509 { 01510 str = rb_check_convert_type(str, T_STRING, "String", "to_str"); 01511 return str; 01512 } 01513 01514 /* 01515 * call-seq: 01516 * String.try_convert(obj) -> string or nil 01517 * 01518 * Try to convert <i>obj</i> into a String, using to_str method. 01519 * Returns converted string or nil if <i>obj</i> cannot be converted 01520 * for any reason. 
01521 * 01522 * String.try_convert("str") #=> "str" 01523 * String.try_convert(/re/) #=> nil 01524 */ 01525 static VALUE 01526 rb_str_s_try_convert(VALUE dummy, VALUE str) 01527 { 01528 return rb_check_string_type(str); 01529 } 01530 01531 static char* 01532 str_nth_len(const char *p, const char *e, long *nthp, rb_encoding *enc) 01533 { 01534 long nth = *nthp; 01535 if (rb_enc_mbmaxlen(enc) == 1) { 01536 p += nth; 01537 } 01538 else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) { 01539 p += nth * rb_enc_mbmaxlen(enc); 01540 } 01541 else if (rb_enc_asciicompat(enc)) { 01542 const char *p2, *e2; 01543 int n; 01544 01545 while (p < e && 0 < nth) { 01546 e2 = p + nth; 01547 if (e < e2) { 01548 *nthp = nth; 01549 return (char *)e; 01550 } 01551 if (ISASCII(*p)) { 01552 p2 = search_nonascii(p, e2); 01553 if (!p2) { 01554 nth -= e2 - p; 01555 *nthp = nth; 01556 return (char *)e2; 01557 } 01558 nth -= p2 - p; 01559 p = p2; 01560 } 01561 n = rb_enc_mbclen(p, e, enc); 01562 p += n; 01563 nth--; 01564 } 01565 *nthp = nth; 01566 if (nth != 0) { 01567 return (char *)e; 01568 } 01569 return (char *)p; 01570 } 01571 else { 01572 while (p < e && nth--) { 01573 p += rb_enc_mbclen(p, e, enc); 01574 } 01575 } 01576 if (p > e) p = e; 01577 *nthp = nth; 01578 return (char*)p; 01579 } 01580 01581 char* 01582 rb_enc_nth(const char *p, const char *e, long nth, rb_encoding *enc) 01583 { 01584 return str_nth_len(p, e, &nth, enc); 01585 } 01586 01587 static char* 01588 str_nth(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte) 01589 { 01590 if (singlebyte) 01591 p += nth; 01592 else { 01593 p = str_nth_len(p, e, &nth, enc); 01594 } 01595 if (!p) return 0; 01596 if (p > e) p = e; 01597 return (char *)p; 01598 } 01599 01600 /* char offset to byte offset */ 01601 static long 01602 str_offset(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte) 01603 { 01604 const char *pp = str_nth(p, e, nth, enc, singlebyte); 01605 if (!pp) return e - p; 01606 
return pp - p; 01607 } 01608 01609 long 01610 rb_str_offset(VALUE str, long pos) 01611 { 01612 return str_offset(RSTRING_PTR(str), RSTRING_END(str), pos, 01613 STR_ENC_GET(str), single_byte_optimizable(str)); 01614 } 01615 01616 #ifdef NONASCII_MASK 01617 static char * 01618 str_utf8_nth(const char *p, const char *e, long *nthp) 01619 { 01620 long nth = *nthp; 01621 if ((int)SIZEOF_VALUE * 2 < e - p && (int)SIZEOF_VALUE * 2 < nth) { 01622 const VALUE *s, *t; 01623 const VALUE lowbits = sizeof(VALUE) - 1; 01624 s = (const VALUE*)(~lowbits & ((VALUE)p + lowbits)); 01625 t = (const VALUE*)(~lowbits & (VALUE)e); 01626 while (p < (const char *)s) { 01627 if (is_utf8_lead_byte(*p)) nth--; 01628 p++; 01629 } 01630 do { 01631 nth -= count_utf8_lead_bytes_with_word(s); 01632 s++; 01633 } while (s < t && (int)sizeof(VALUE) <= nth); 01634 p = (char *)s; 01635 } 01636 while (p < e) { 01637 if (is_utf8_lead_byte(*p)) { 01638 if (nth == 0) break; 01639 nth--; 01640 } 01641 p++; 01642 } 01643 *nthp = nth; 01644 return (char *)p; 01645 } 01646 01647 static long 01648 str_utf8_offset(const char *p, const char *e, long nth) 01649 { 01650 const char *pp = str_utf8_nth(p, e, &nth); 01651 return pp - p; 01652 } 01653 #endif 01654 01655 /* byte offset to char offset */ 01656 long 01657 rb_str_sublen(VALUE str, long pos) 01658 { 01659 if (single_byte_optimizable(str) || pos < 0) 01660 return pos; 01661 else { 01662 char *p = RSTRING_PTR(str); 01663 return enc_strlen(p, p + pos, STR_ENC_GET(str), ENC_CODERANGE(str)); 01664 } 01665 } 01666 01667 VALUE 01668 rb_str_subseq(VALUE str, long beg, long len) 01669 { 01670 VALUE str2; 01671 01672 if (RSTRING_LEN(str) == beg + len && 01673 RSTRING_EMBED_LEN_MAX < len) { 01674 str2 = rb_str_new_shared(rb_str_new_frozen(str)); 01675 rb_str_drop_bytes(str2, beg); 01676 } 01677 else { 01678 str2 = rb_str_new5(str, RSTRING_PTR(str)+beg, len); 01679 RB_GC_GUARD(str); 01680 } 01681 01682 rb_enc_cr_str_copy_for_substr(str2, str); 01683 OBJ_INFECT(str2, 
str); 01684 01685 return str2; 01686 } 01687 01688 static char * 01689 rb_str_subpos(VALUE str, long beg, long *lenp) 01690 { 01691 long len = *lenp; 01692 long slen = -1L; 01693 long blen = RSTRING_LEN(str); 01694 rb_encoding *enc = STR_ENC_GET(str); 01695 char *p, *s = RSTRING_PTR(str), *e = s + blen; 01696 01697 if (len < 0) return 0; 01698 if (!blen) { 01699 len = 0; 01700 } 01701 if (single_byte_optimizable(str)) { 01702 if (beg > blen) return 0; 01703 if (beg < 0) { 01704 beg += blen; 01705 if (beg < 0) return 0; 01706 } 01707 if (beg + len > blen) 01708 len = blen - beg; 01709 if (len < 0) return 0; 01710 p = s + beg; 01711 goto end; 01712 } 01713 if (beg < 0) { 01714 if (len > -beg) len = -beg; 01715 if (-beg * rb_enc_mbmaxlen(enc) < RSTRING_LEN(str) / 8) { 01716 beg = -beg; 01717 while (beg-- > len && (e = rb_enc_prev_char(s, e, e, enc)) != 0); 01718 p = e; 01719 if (!p) return 0; 01720 while (len-- > 0 && (p = rb_enc_prev_char(s, p, e, enc)) != 0); 01721 if (!p) return 0; 01722 len = e - p; 01723 goto end; 01724 } 01725 else { 01726 slen = str_strlen(str, enc); 01727 beg += slen; 01728 if (beg < 0) return 0; 01729 p = s + beg; 01730 if (len == 0) goto end; 01731 } 01732 } 01733 else if (beg > 0 && beg > RSTRING_LEN(str)) { 01734 return 0; 01735 } 01736 if (len == 0) { 01737 if (beg > str_strlen(str, enc)) return 0; 01738 p = s + beg; 01739 } 01740 #ifdef NONASCII_MASK 01741 else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID && 01742 enc == rb_utf8_encoding()) { 01743 p = str_utf8_nth(s, e, &beg); 01744 if (beg > 0) return 0; 01745 len = str_utf8_offset(p, e, len); 01746 } 01747 #endif 01748 else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) { 01749 int char_sz = rb_enc_mbmaxlen(enc); 01750 01751 p = s + beg * char_sz; 01752 if (p > e) { 01753 return 0; 01754 } 01755 else if (len * char_sz > e - p) 01756 len = e - p; 01757 else 01758 len *= char_sz; 01759 } 01760 else if ((p = str_nth_len(s, e, &beg, enc)) == e) { 01761 if (beg > 0) return 0; 01762 len 
= 0; 01763 } 01764 else { 01765 len = str_offset(p, e, len, enc, 0); 01766 } 01767 end: 01768 *lenp = len; 01769 RB_GC_GUARD(str); 01770 return p; 01771 } 01772 01773 VALUE 01774 rb_str_substr(VALUE str, long beg, long len) 01775 { 01776 VALUE str2; 01777 char *p = rb_str_subpos(str, beg, &len); 01778 01779 if (!p) return Qnil; 01780 if (len > RSTRING_EMBED_LEN_MAX && p + len == RSTRING_END(str)) { 01781 str2 = rb_str_new4(str); 01782 str2 = str_new3(rb_obj_class(str2), str2); 01783 RSTRING(str2)->as.heap.ptr += RSTRING(str2)->as.heap.len - len; 01784 RSTRING(str2)->as.heap.len = len; 01785 } 01786 else { 01787 str2 = rb_str_new5(str, p, len); 01788 rb_enc_cr_str_copy_for_substr(str2, str); 01789 OBJ_INFECT(str2, str); 01790 RB_GC_GUARD(str); 01791 } 01792 01793 return str2; 01794 } 01795 01796 VALUE 01797 rb_str_freeze(VALUE str) 01798 { 01799 if (STR_ASSOC_P(str)) { 01800 VALUE ary = RSTRING(str)->as.heap.aux.shared; 01801 OBJ_FREEZE(ary); 01802 } 01803 return rb_obj_freeze(str); 01804 } 01805 01806 RUBY_ALIAS_FUNCTION(rb_str_dup_frozen(VALUE str), rb_str_new_frozen, (str)) 01807 #define rb_str_dup_frozen rb_str_new_frozen 01808 01809 VALUE 01810 rb_str_locktmp(VALUE str) 01811 { 01812 if (FL_TEST(str, STR_TMPLOCK)) { 01813 rb_raise(rb_eRuntimeError, "temporal locking already locked string"); 01814 } 01815 FL_SET(str, STR_TMPLOCK); 01816 return str; 01817 } 01818 01819 VALUE 01820 rb_str_unlocktmp(VALUE str) 01821 { 01822 if (!FL_TEST(str, STR_TMPLOCK)) { 01823 rb_raise(rb_eRuntimeError, "temporal unlocking already unlocked string"); 01824 } 01825 FL_UNSET(str, STR_TMPLOCK); 01826 return str; 01827 } 01828 01829 void 01830 rb_str_set_len(VALUE str, long len) 01831 { 01832 long capa; 01833 01834 str_modifiable(str); 01835 if (STR_SHARED_P(str)) { 01836 rb_raise(rb_eRuntimeError, "can't set length of shared string"); 01837 } 01838 if (len > (capa = (long)rb_str_capacity(str))) { 01839 rb_bug("probable buffer overflow: %ld for %ld", len, capa); 01840 } 01841 
STR_SET_LEN(str, len); 01842 RSTRING_PTR(str)[len] = '\0'; 01843 } 01844 01845 VALUE 01846 rb_str_resize(VALUE str, long len) 01847 { 01848 long slen; 01849 int independent; 01850 01851 if (len < 0) { 01852 rb_raise(rb_eArgError, "negative string size (or size too big)"); 01853 } 01854 01855 independent = str_independent(str); 01856 ENC_CODERANGE_CLEAR(str); 01857 slen = RSTRING_LEN(str); 01858 if (len != slen) { 01859 if (STR_EMBED_P(str)) { 01860 if (len <= RSTRING_EMBED_LEN_MAX) { 01861 STR_SET_EMBED_LEN(str, len); 01862 RSTRING(str)->as.ary[len] = '\0'; 01863 return str; 01864 } 01865 str_make_independent_expand(str, len - slen); 01866 STR_SET_NOEMBED(str); 01867 } 01868 else if (len <= RSTRING_EMBED_LEN_MAX) { 01869 char *ptr = RSTRING(str)->as.heap.ptr; 01870 STR_SET_EMBED(str); 01871 if (slen > len) slen = len; 01872 if (slen > 0) MEMCPY(RSTRING(str)->as.ary, ptr, char, slen); 01873 RSTRING(str)->as.ary[len] = '\0'; 01874 STR_SET_EMBED_LEN(str, len); 01875 if (independent) xfree(ptr); 01876 return str; 01877 } 01878 else if (!independent) { 01879 str_make_independent_expand(str, len - slen); 01880 } 01881 else if (slen < len || slen - len > 1024) { 01882 REALLOC_N(RSTRING(str)->as.heap.ptr, char, len+1); 01883 } 01884 if (!STR_NOCAPA_P(str)) { 01885 RSTRING(str)->as.heap.aux.capa = len; 01886 } 01887 RSTRING(str)->as.heap.len = len; 01888 RSTRING(str)->as.heap.ptr[len] = '\0'; /* sentinel */ 01889 } 01890 return str; 01891 } 01892 01893 static VALUE 01894 str_buf_cat(VALUE str, const char *ptr, long len) 01895 { 01896 long capa, total, off = -1; 01897 01898 if (ptr >= RSTRING_PTR(str) && ptr <= RSTRING_END(str)) { 01899 off = ptr - RSTRING_PTR(str); 01900 } 01901 rb_str_modify(str); 01902 if (len == 0) return 0; 01903 if (STR_ASSOC_P(str)) { 01904 FL_UNSET(str, STR_ASSOC); 01905 capa = RSTRING(str)->as.heap.aux.capa = RSTRING_LEN(str); 01906 } 01907 else if (STR_EMBED_P(str)) { 01908 capa = RSTRING_EMBED_LEN_MAX; 01909 } 01910 else { 01911 capa = 
RSTRING(str)->as.heap.aux.capa; 01912 } 01913 if (RSTRING_LEN(str) >= LONG_MAX - len) { 01914 rb_raise(rb_eArgError, "string sizes too big"); 01915 } 01916 total = RSTRING_LEN(str)+len; 01917 if (capa <= total) { 01918 while (total > capa) { 01919 if (capa + 1 >= LONG_MAX / 2) { 01920 capa = (total + 4095) / 4096; 01921 break; 01922 } 01923 capa = (capa + 1) * 2; 01924 } 01925 RESIZE_CAPA(str, capa); 01926 } 01927 if (off != -1) { 01928 ptr = RSTRING_PTR(str) + off; 01929 } 01930 memcpy(RSTRING_PTR(str) + RSTRING_LEN(str), ptr, len); 01931 STR_SET_LEN(str, total); 01932 RSTRING_PTR(str)[total] = '\0'; /* sentinel */ 01933 01934 return str; 01935 } 01936 01937 #define str_buf_cat2(str, ptr) str_buf_cat((str), (ptr), strlen(ptr)) 01938 01939 VALUE 01940 rb_str_buf_cat(VALUE str, const char *ptr, long len) 01941 { 01942 if (len == 0) return str; 01943 if (len < 0) { 01944 rb_raise(rb_eArgError, "negative string size (or size too big)"); 01945 } 01946 return str_buf_cat(str, ptr, len); 01947 } 01948 01949 VALUE 01950 rb_str_buf_cat2(VALUE str, const char *ptr) 01951 { 01952 return rb_str_buf_cat(str, ptr, strlen(ptr)); 01953 } 01954 01955 VALUE 01956 rb_str_cat(VALUE str, const char *ptr, long len) 01957 { 01958 if (len < 0) { 01959 rb_raise(rb_eArgError, "negative string size (or size too big)"); 01960 } 01961 if (STR_ASSOC_P(str)) { 01962 char *p; 01963 rb_str_modify_expand(str, len); 01964 p = RSTRING(str)->as.heap.ptr; 01965 memcpy(p + RSTRING(str)->as.heap.len, ptr, len); 01966 len = RSTRING(str)->as.heap.len += len; 01967 p[len] = '\0'; /* sentinel */ 01968 return str; 01969 } 01970 01971 return rb_str_buf_cat(str, ptr, len); 01972 } 01973 01974 VALUE 01975 rb_str_cat2(VALUE str, const char *ptr) 01976 { 01977 return rb_str_cat(str, ptr, strlen(ptr)); 01978 } 01979 01980 static VALUE 01981 rb_enc_cr_str_buf_cat(VALUE str, const char *ptr, long len, 01982 int ptr_encindex, int ptr_cr, int *ptr_cr_ret) 01983 { 01984 int str_encindex = ENCODING_GET(str); 01985 int 
res_encindex; 01986 int str_cr, res_cr; 01987 01988 str_cr = ENC_CODERANGE(str); 01989 01990 if (str_encindex == ptr_encindex) { 01991 if (str_cr == ENC_CODERANGE_UNKNOWN) 01992 ptr_cr = ENC_CODERANGE_UNKNOWN; 01993 else if (ptr_cr == ENC_CODERANGE_UNKNOWN) { 01994 ptr_cr = coderange_scan(ptr, len, rb_enc_from_index(ptr_encindex)); 01995 } 01996 } 01997 else { 01998 rb_encoding *str_enc = rb_enc_from_index(str_encindex); 01999 rb_encoding *ptr_enc = rb_enc_from_index(ptr_encindex); 02000 if (!rb_enc_asciicompat(str_enc) || !rb_enc_asciicompat(ptr_enc)) { 02001 if (len == 0) 02002 return str; 02003 if (RSTRING_LEN(str) == 0) { 02004 rb_str_buf_cat(str, ptr, len); 02005 ENCODING_CODERANGE_SET(str, ptr_encindex, ptr_cr); 02006 return str; 02007 } 02008 goto incompatible; 02009 } 02010 if (ptr_cr == ENC_CODERANGE_UNKNOWN) { 02011 ptr_cr = coderange_scan(ptr, len, ptr_enc); 02012 } 02013 if (str_cr == ENC_CODERANGE_UNKNOWN) { 02014 if (ENCODING_IS_ASCII8BIT(str) || ptr_cr != ENC_CODERANGE_7BIT) { 02015 str_cr = rb_enc_str_coderange(str); 02016 } 02017 } 02018 } 02019 if (ptr_cr_ret) 02020 *ptr_cr_ret = ptr_cr; 02021 02022 if (str_encindex != ptr_encindex && 02023 str_cr != ENC_CODERANGE_7BIT && 02024 ptr_cr != ENC_CODERANGE_7BIT) { 02025 incompatible: 02026 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s", 02027 rb_enc_name(rb_enc_from_index(str_encindex)), 02028 rb_enc_name(rb_enc_from_index(ptr_encindex))); 02029 } 02030 02031 if (str_cr == ENC_CODERANGE_UNKNOWN) { 02032 res_encindex = str_encindex; 02033 res_cr = ENC_CODERANGE_UNKNOWN; 02034 } 02035 else if (str_cr == ENC_CODERANGE_7BIT) { 02036 if (ptr_cr == ENC_CODERANGE_7BIT) { 02037 res_encindex = str_encindex; 02038 res_cr = ENC_CODERANGE_7BIT; 02039 } 02040 else { 02041 res_encindex = ptr_encindex; 02042 res_cr = ptr_cr; 02043 } 02044 } 02045 else if (str_cr == ENC_CODERANGE_VALID) { 02046 res_encindex = str_encindex; 02047 if (ptr_cr == ENC_CODERANGE_7BIT || ptr_cr == 
ENC_CODERANGE_VALID) 02048 res_cr = str_cr; 02049 else 02050 res_cr = ptr_cr; 02051 } 02052 else { /* str_cr == ENC_CODERANGE_BROKEN */ 02053 res_encindex = str_encindex; 02054 res_cr = str_cr; 02055 if (0 < len) res_cr = ENC_CODERANGE_UNKNOWN; 02056 } 02057 02058 if (len < 0) { 02059 rb_raise(rb_eArgError, "negative string size (or size too big)"); 02060 } 02061 str_buf_cat(str, ptr, len); 02062 ENCODING_CODERANGE_SET(str, res_encindex, res_cr); 02063 return str; 02064 } 02065 02066 VALUE 02067 rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *ptr_enc) 02068 { 02069 return rb_enc_cr_str_buf_cat(str, ptr, len, 02070 rb_enc_to_index(ptr_enc), ENC_CODERANGE_UNKNOWN, NULL); 02071 } 02072 02073 VALUE 02074 rb_str_buf_cat_ascii(VALUE str, const char *ptr) 02075 { 02076 /* ptr must reference NUL terminated ASCII string. */ 02077 int encindex = ENCODING_GET(str); 02078 rb_encoding *enc = rb_enc_from_index(encindex); 02079 if (rb_enc_asciicompat(enc)) { 02080 return rb_enc_cr_str_buf_cat(str, ptr, strlen(ptr), 02081 encindex, ENC_CODERANGE_7BIT, 0); 02082 } 02083 else { 02084 char *buf = ALLOCA_N(char, rb_enc_mbmaxlen(enc)); 02085 while (*ptr) { 02086 unsigned int c = (unsigned char)*ptr; 02087 int len = rb_enc_codelen(c, enc); 02088 rb_enc_mbcput(c, buf, enc); 02089 rb_enc_cr_str_buf_cat(str, buf, len, 02090 encindex, ENC_CODERANGE_VALID, 0); 02091 ptr++; 02092 } 02093 return str; 02094 } 02095 } 02096 02097 VALUE 02098 rb_str_buf_append(VALUE str, VALUE str2) 02099 { 02100 int str2_cr; 02101 02102 str2_cr = ENC_CODERANGE(str2); 02103 02104 rb_enc_cr_str_buf_cat(str, RSTRING_PTR(str2), RSTRING_LEN(str2), 02105 ENCODING_GET(str2), str2_cr, &str2_cr); 02106 02107 OBJ_INFECT(str, str2); 02108 ENC_CODERANGE_SET(str2, str2_cr); 02109 02110 return str; 02111 } 02112 02113 VALUE 02114 rb_str_append(VALUE str, VALUE str2) 02115 { 02116 rb_encoding *enc; 02117 int cr, cr2; 02118 long len2; 02119 02120 StringValue(str2); 02121 if ((len2 = RSTRING_LEN(str2)) > 0 
&& STR_ASSOC_P(str)) { 02122 long len = RSTRING_LEN(str) + len2; 02123 enc = rb_enc_check(str, str2); 02124 cr = ENC_CODERANGE(str); 02125 if ((cr2 = ENC_CODERANGE(str2)) > cr) cr = cr2; 02126 rb_str_modify_expand(str, len2); 02127 memcpy(RSTRING(str)->as.heap.ptr + RSTRING(str)->as.heap.len, 02128 RSTRING_PTR(str2), len2+1); 02129 RSTRING(str)->as.heap.len = len; 02130 rb_enc_associate(str, enc); 02131 ENC_CODERANGE_SET(str, cr); 02132 OBJ_INFECT(str, str2); 02133 return str; 02134 } 02135 return rb_str_buf_append(str, str2); 02136 } 02137 02138 /* 02139 * call-seq: 02140 * str << integer -> str 02141 * str.concat(integer) -> str 02142 * str << obj -> str 02143 * str.concat(obj) -> str 02144 * 02145 * Append---Concatenates the given object to <i>str</i>. If the object is a 02146 * <code>Integer</code>, it is considered as a codepoint, and is converted 02147 * to a character before concatenation. 02148 * 02149 * a = "hello " 02150 * a << "world" #=> "hello world" 02151 * a.concat(33) #=> "hello world!" 
02152 */ 02153 02154 VALUE 02155 rb_str_concat(VALUE str1, VALUE str2) 02156 { 02157 unsigned int code; 02158 rb_encoding *enc = STR_ENC_GET(str1); 02159 02160 if (FIXNUM_P(str2) || RB_TYPE_P(str2, T_BIGNUM)) { 02161 if (rb_num_to_uint(str2, &code) == 0) { 02162 } 02163 else if (FIXNUM_P(str2)) { 02164 rb_raise(rb_eRangeError, "%ld out of char range", FIX2LONG(str2)); 02165 } 02166 else { 02167 rb_raise(rb_eRangeError, "bignum out of char range"); 02168 } 02169 } 02170 else { 02171 return rb_str_append(str1, str2); 02172 } 02173 02174 if (enc == rb_usascii_encoding()) { 02175 /* US-ASCII automatically extended to ASCII-8BIT */ 02176 char buf[1]; 02177 buf[0] = (char)code; 02178 if (code > 0xFF) { 02179 rb_raise(rb_eRangeError, "%u out of char range", code); 02180 } 02181 rb_str_cat(str1, buf, 1); 02182 if (code > 127) { 02183 rb_enc_associate(str1, rb_ascii8bit_encoding()); 02184 ENC_CODERANGE_SET(str1, ENC_CODERANGE_VALID); 02185 } 02186 } 02187 else { 02188 long pos = RSTRING_LEN(str1); 02189 int cr = ENC_CODERANGE(str1); 02190 int len; 02191 char *buf; 02192 02193 switch (len = rb_enc_codelen(code, enc)) { 02194 case ONIGERR_INVALID_CODE_POINT_VALUE: 02195 rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc)); 02196 break; 02197 case ONIGERR_TOO_BIG_WIDE_CHAR_VALUE: 02198 case 0: 02199 rb_raise(rb_eRangeError, "%u out of char range", code); 02200 break; 02201 } 02202 buf = ALLOCA_N(char, len + 1); 02203 rb_enc_mbcput(code, buf, enc); 02204 if (rb_enc_precise_mbclen(buf, buf + len + 1, enc) != len) { 02205 rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc)); 02206 } 02207 rb_str_resize(str1, pos+len); 02208 memcpy(RSTRING_PTR(str1) + pos, buf, len); 02209 if (cr == ENC_CODERANGE_7BIT && code > 127) 02210 cr = ENC_CODERANGE_VALID; 02211 ENC_CODERANGE_SET(str1, cr); 02212 } 02213 return str1; 02214 } 02215 02216 /* 02217 * call-seq: 02218 * str.prepend(other_str) -> str 02219 * 02220 * Prepend---Prepend the 
given string to <i>str</i>. 02221 * 02222 * a = "world" 02223 * a.prepend("hello ") #=> "hello world" 02224 * a #=> "hello world" 02225 */ 02226 02227 static VALUE 02228 rb_str_prepend(VALUE str, VALUE str2) 02229 { 02230 StringValue(str2); 02231 StringValue(str); 02232 rb_str_update(str, 0L, 0L, str2); 02233 return str; 02234 } 02235 02236 st_index_t 02237 rb_str_hash(VALUE str) 02238 { 02239 int e = ENCODING_GET(str); 02240 if (e && rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT) { 02241 e = 0; 02242 } 02243 return rb_memhash((const void *)RSTRING_PTR(str), RSTRING_LEN(str)) ^ e; 02244 } 02245 02246 int 02247 rb_str_hash_cmp(VALUE str1, VALUE str2) 02248 { 02249 long len; 02250 02251 if (!rb_str_comparable(str1, str2)) return 1; 02252 if (RSTRING_LEN(str1) == (len = RSTRING_LEN(str2)) && 02253 memcmp(RSTRING_PTR(str1), RSTRING_PTR(str2), len) == 0) { 02254 return 0; 02255 } 02256 return 1; 02257 } 02258 02259 /* 02260 * call-seq: 02261 * str.hash -> fixnum 02262 * 02263 * Return a hash based on the string's length and content. 
02264 */ 02265 02266 static VALUE 02267 rb_str_hash_m(VALUE str) 02268 { 02269 st_index_t hval = rb_str_hash(str); 02270 return INT2FIX(hval); 02271 } 02272 02273 #define lesser(a,b) (((a)>(b))?(b):(a)) 02274 02275 int 02276 rb_str_comparable(VALUE str1, VALUE str2) 02277 { 02278 int idx1, idx2; 02279 int rc1, rc2; 02280 02281 if (RSTRING_LEN(str1) == 0) return TRUE; 02282 if (RSTRING_LEN(str2) == 0) return TRUE; 02283 idx1 = ENCODING_GET(str1); 02284 idx2 = ENCODING_GET(str2); 02285 if (idx1 == idx2) return TRUE; 02286 rc1 = rb_enc_str_coderange(str1); 02287 rc2 = rb_enc_str_coderange(str2); 02288 if (rc1 == ENC_CODERANGE_7BIT) { 02289 if (rc2 == ENC_CODERANGE_7BIT) return TRUE; 02290 if (rb_enc_asciicompat(rb_enc_from_index(idx2))) 02291 return TRUE; 02292 } 02293 if (rc2 == ENC_CODERANGE_7BIT) { 02294 if (rb_enc_asciicompat(rb_enc_from_index(idx1))) 02295 return TRUE; 02296 } 02297 return FALSE; 02298 } 02299 02300 int 02301 rb_str_cmp(VALUE str1, VALUE str2) 02302 { 02303 long len1, len2; 02304 const char *ptr1, *ptr2; 02305 int retval; 02306 02307 if (str1 == str2) return 0; 02308 RSTRING_GETMEM(str1, ptr1, len1); 02309 RSTRING_GETMEM(str2, ptr2, len2); 02310 if (ptr1 == ptr2 || (retval = memcmp(ptr1, ptr2, lesser(len1, len2))) == 0) { 02311 if (len1 == len2) { 02312 if (!rb_str_comparable(str1, str2)) { 02313 if (ENCODING_GET(str1) > ENCODING_GET(str2)) 02314 return 1; 02315 return -1; 02316 } 02317 return 0; 02318 } 02319 if (len1 > len2) return 1; 02320 return -1; 02321 } 02322 if (retval > 0) return 1; 02323 return -1; 02324 } 02325 02326 /* expect tail call optimization */ 02327 static VALUE 02328 str_eql(const VALUE str1, const VALUE str2) 02329 { 02330 const long len = RSTRING_LEN(str1); 02331 const char *ptr1, *ptr2; 02332 02333 if (len != RSTRING_LEN(str2)) return Qfalse; 02334 if (!rb_str_comparable(str1, str2)) return Qfalse; 02335 if ((ptr1 = RSTRING_PTR(str1)) == (ptr2 = RSTRING_PTR(str2))) 02336 return Qtrue; 02337 if (memcmp(ptr1, ptr2, len) == 
0) 02338 return Qtrue; 02339 return Qfalse; 02340 } 02341 /* 02342 * call-seq: 02343 * str == obj -> true or false 02344 * 02345 * Equality---If <i>obj</i> is not a <code>String</code>, returns 02346 * <code>false</code>. Otherwise, returns <code>true</code> if <i>str</i> 02347 * <code><=></code> <i>obj</i> returns zero. 02348 */ 02349 02350 VALUE 02351 rb_str_equal(VALUE str1, VALUE str2) 02352 { 02353 if (str1 == str2) return Qtrue; 02354 if (!RB_TYPE_P(str2, T_STRING)) { 02355 if (!rb_respond_to(str2, rb_intern("to_str"))) { 02356 return Qfalse; 02357 } 02358 return rb_equal(str2, str1); 02359 } 02360 return str_eql(str1, str2); 02361 } 02362 02363 /* 02364 * call-seq: 02365 * str.eql?(other) -> true or false 02366 * 02367 * Two strings are equal if they have the same length and content. 02368 */ 02369 02370 static VALUE 02371 rb_str_eql(VALUE str1, VALUE str2) 02372 { 02373 if (str1 == str2) return Qtrue; 02374 if (!RB_TYPE_P(str2, T_STRING)) return Qfalse; 02375 return str_eql(str1, str2); 02376 } 02377 02378 /* 02379 * call-seq: 02380 * string <=> other_string -> -1, 0, +1 or nil 02381 * 02382 * 02383 * Comparison---Returns -1, 0, +1 or nil depending on whether +string+ is less 02384 * than, equal to, or greater than +other_string+. 02385 * 02386 * +nil+ is returned if the two values are incomparable. 02387 * 02388 * If the strings are of different lengths, and the strings are equal when 02389 * compared up to the shortest length, then the longer string is considered 02390 * greater than the shorter one. 02391 * 02392 * <code><=></code> is the basis for the methods <code><</code>, 02393 * <code><=</code>, <code>></code>, <code>>=</code>, and 02394 * <code>between?</code>, included from module Comparable. The method 02395 * String#== does not use Comparable#==. 
02396 * 02397 * "abcdef" <=> "abcde" #=> 1 02398 * "abcdef" <=> "abcdef" #=> 0 02399 * "abcdef" <=> "abcdefg" #=> -1 02400 * "abcdef" <=> "ABCDEF" #=> 1 02401 */ 02402 02403 static VALUE 02404 rb_str_cmp_m(VALUE str1, VALUE str2) 02405 { 02406 int result; 02407 02408 if (!RB_TYPE_P(str2, T_STRING)) { 02409 VALUE tmp = rb_check_funcall(str2, rb_intern("to_str"), 0, 0); 02410 if (RB_TYPE_P(tmp, T_STRING)) { 02411 result = rb_str_cmp(str1, tmp); 02412 } 02413 else { 02414 return rb_invcmp(str1, str2); 02415 } 02416 } 02417 else { 02418 result = rb_str_cmp(str1, str2); 02419 } 02420 return INT2FIX(result); 02421 } 02422 02423 /* 02424 * call-seq: 02425 * str.casecmp(other_str) -> -1, 0, +1 or nil 02426 * 02427 * Case-insensitive version of <code>String#<=></code>. 02428 * 02429 * "abcdef".casecmp("abcde") #=> 1 02430 * "aBcDeF".casecmp("abcdef") #=> 0 02431 * "abcdef".casecmp("abcdefg") #=> -1 02432 * "abcdef".casecmp("ABCDEF") #=> 0 02433 */ 02434 02435 static VALUE 02436 rb_str_casecmp(VALUE str1, VALUE str2) 02437 { 02438 long len; 02439 rb_encoding *enc; 02440 char *p1, *p1end, *p2, *p2end; 02441 02442 StringValue(str2); 02443 enc = rb_enc_compatible(str1, str2); 02444 if (!enc) { 02445 return Qnil; 02446 } 02447 02448 p1 = RSTRING_PTR(str1); p1end = RSTRING_END(str1); 02449 p2 = RSTRING_PTR(str2); p2end = RSTRING_END(str2); 02450 if (single_byte_optimizable(str1) && single_byte_optimizable(str2)) { 02451 while (p1 < p1end && p2 < p2end) { 02452 if (*p1 != *p2) { 02453 unsigned int c1 = TOUPPER(*p1 & 0xff); 02454 unsigned int c2 = TOUPPER(*p2 & 0xff); 02455 if (c1 != c2) 02456 return INT2FIX(c1 < c2 ? -1 : 1); 02457 } 02458 p1++; 02459 p2++; 02460 } 02461 } 02462 else { 02463 while (p1 < p1end && p2 < p2end) { 02464 int l1, c1 = rb_enc_ascget(p1, p1end, &l1, enc); 02465 int l2, c2 = rb_enc_ascget(p2, p2end, &l2, enc); 02466 02467 if (0 <= c1 && 0 <= c2) { 02468 c1 = TOUPPER(c1); 02469 c2 = TOUPPER(c2); 02470 if (c1 != c2) 02471 return INT2FIX(c1 < c2 ? 
-1 : 1); 02472 } 02473 else { 02474 int r; 02475 l1 = rb_enc_mbclen(p1, p1end, enc); 02476 l2 = rb_enc_mbclen(p2, p2end, enc); 02477 len = l1 < l2 ? l1 : l2; 02478 r = memcmp(p1, p2, len); 02479 if (r != 0) 02480 return INT2FIX(r < 0 ? -1 : 1); 02481 if (l1 != l2) 02482 return INT2FIX(l1 < l2 ? -1 : 1); 02483 } 02484 p1 += l1; 02485 p2 += l2; 02486 } 02487 } 02488 if (RSTRING_LEN(str1) == RSTRING_LEN(str2)) return INT2FIX(0); 02489 if (RSTRING_LEN(str1) > RSTRING_LEN(str2)) return INT2FIX(1); 02490 return INT2FIX(-1); 02491 } 02492 02493 static long 02494 rb_str_index(VALUE str, VALUE sub, long offset) 02495 { 02496 long pos; 02497 char *s, *sptr, *e; 02498 long len, slen; 02499 rb_encoding *enc; 02500 02501 enc = rb_enc_check(str, sub); 02502 if (is_broken_string(sub)) { 02503 return -1; 02504 } 02505 len = str_strlen(str, enc); 02506 slen = str_strlen(sub, enc); 02507 if (offset < 0) { 02508 offset += len; 02509 if (offset < 0) return -1; 02510 } 02511 if (len - offset < slen) return -1; 02512 s = RSTRING_PTR(str); 02513 e = s + RSTRING_LEN(str); 02514 if (offset) { 02515 offset = str_offset(s, RSTRING_END(str), offset, enc, single_byte_optimizable(str)); 02516 s += offset; 02517 } 02518 if (slen == 0) return offset; 02519 /* need proceed one character at a time */ 02520 sptr = RSTRING_PTR(sub); 02521 slen = RSTRING_LEN(sub); 02522 len = RSTRING_LEN(str) - offset; 02523 for (;;) { 02524 char *t; 02525 pos = rb_memsearch(sptr, slen, s, len, enc); 02526 if (pos < 0) return pos; 02527 t = rb_enc_right_char_head(s, s+pos, e, enc); 02528 if (t == s + pos) break; 02529 if ((len -= t - s) <= 0) return -1; 02530 offset += t - s; 02531 s = t; 02532 } 02533 return pos + offset; 02534 } 02535 02536 02537 /* 02538 * call-seq: 02539 * str.index(substring [, offset]) -> fixnum or nil 02540 * str.index(regexp [, offset]) -> fixnum or nil 02541 * 02542 * Returns the index of the first occurrence of the given <i>substring</i> or 02543 * pattern (<i>regexp</i>) in <i>str</i>. 
Returns <code>nil</code> if not 02544 * found. If the second parameter is present, it specifies the position in the 02545 * string to begin the search. 02546 * 02547 * "hello".index('e') #=> 1 02548 * "hello".index('lo') #=> 3 02549 * "hello".index('a') #=> nil 02550 * "hello".index(?e) #=> 1 02551 * "hello".index(/[aeiou]/, -3) #=> 4 02552 */ 02553 02554 static VALUE 02555 rb_str_index_m(int argc, VALUE *argv, VALUE str) 02556 { 02557 VALUE sub; 02558 VALUE initpos; 02559 long pos; 02560 02561 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) { 02562 pos = NUM2LONG(initpos); 02563 } 02564 else { 02565 pos = 0; 02566 } 02567 if (pos < 0) { 02568 pos += str_strlen(str, STR_ENC_GET(str)); 02569 if (pos < 0) { 02570 if (RB_TYPE_P(sub, T_REGEXP)) { 02571 rb_backref_set(Qnil); 02572 } 02573 return Qnil; 02574 } 02575 } 02576 02577 if (SPECIAL_CONST_P(sub)) goto generic; 02578 switch (BUILTIN_TYPE(sub)) { 02579 case T_REGEXP: 02580 if (pos > str_strlen(str, STR_ENC_GET(str))) 02581 return Qnil; 02582 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos, 02583 rb_enc_check(str, sub), single_byte_optimizable(str)); 02584 02585 pos = rb_reg_search(sub, str, pos, 0); 02586 pos = rb_str_sublen(str, pos); 02587 break; 02588 02589 generic: 02590 default: { 02591 VALUE tmp; 02592 02593 tmp = rb_check_string_type(sub); 02594 if (NIL_P(tmp)) { 02595 rb_raise(rb_eTypeError, "type mismatch: %s given", 02596 rb_obj_classname(sub)); 02597 } 02598 sub = tmp; 02599 } 02600 /* fall through */ 02601 case T_STRING: 02602 pos = rb_str_index(str, sub, pos); 02603 pos = rb_str_sublen(str, pos); 02604 break; 02605 } 02606 02607 if (pos == -1) return Qnil; 02608 return LONG2NUM(pos); 02609 } 02610 02611 static long 02612 rb_str_rindex(VALUE str, VALUE sub, long pos) 02613 { 02614 long len, slen; 02615 char *s, *sbeg, *e, *t; 02616 rb_encoding *enc; 02617 int singlebyte = single_byte_optimizable(str); 02618 02619 enc = rb_enc_check(str, sub); 02620 if (is_broken_string(sub)) { 02621 
return -1; 02622 } 02623 len = str_strlen(str, enc); 02624 slen = str_strlen(sub, enc); 02625 /* substring longer than string */ 02626 if (len < slen) return -1; 02627 if (len - pos < slen) { 02628 pos = len - slen; 02629 } 02630 if (len == 0) { 02631 return pos; 02632 } 02633 sbeg = RSTRING_PTR(str); 02634 e = RSTRING_END(str); 02635 t = RSTRING_PTR(sub); 02636 slen = RSTRING_LEN(sub); 02637 s = str_nth(sbeg, e, pos, enc, singlebyte); 02638 while (s) { 02639 if (memcmp(s, t, slen) == 0) { 02640 return pos; 02641 } 02642 if (pos == 0) break; 02643 pos--; 02644 s = rb_enc_prev_char(sbeg, s, e, enc); 02645 } 02646 return -1; 02647 } 02648 02649 02650 /* 02651 * call-seq: 02652 * str.rindex(substring [, fixnum]) -> fixnum or nil 02653 * str.rindex(regexp [, fixnum]) -> fixnum or nil 02654 * 02655 * Returns the index of the last occurrence of the given <i>substring</i> or 02656 * pattern (<i>regexp</i>) in <i>str</i>. Returns <code>nil</code> if not 02657 * found. If the second parameter is present, it specifies the position in the 02658 * string to end the search---characters beyond this point will not be 02659 * considered. 
02660 * 02661 * "hello".rindex('e') #=> 1 02662 * "hello".rindex('l') #=> 3 02663 * "hello".rindex('a') #=> nil 02664 * "hello".rindex(?e) #=> 1 02665 * "hello".rindex(/[aeiou]/, -2) #=> 1 02666 */ 02667 02668 static VALUE 02669 rb_str_rindex_m(int argc, VALUE *argv, VALUE str) 02670 { 02671 VALUE sub; 02672 VALUE vpos; 02673 rb_encoding *enc = STR_ENC_GET(str); 02674 long pos, len = str_strlen(str, enc); 02675 02676 if (rb_scan_args(argc, argv, "11", &sub, &vpos) == 2) { 02677 pos = NUM2LONG(vpos); 02678 if (pos < 0) { 02679 pos += len; 02680 if (pos < 0) { 02681 if (RB_TYPE_P(sub, T_REGEXP)) { 02682 rb_backref_set(Qnil); 02683 } 02684 return Qnil; 02685 } 02686 } 02687 if (pos > len) pos = len; 02688 } 02689 else { 02690 pos = len; 02691 } 02692 02693 if (SPECIAL_CONST_P(sub)) goto generic; 02694 switch (BUILTIN_TYPE(sub)) { 02695 case T_REGEXP: 02696 /* enc = rb_get_check(str, sub); */ 02697 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos, 02698 STR_ENC_GET(str), single_byte_optimizable(str)); 02699 02700 if (!RREGEXP(sub)->ptr || RREGEXP_SRC_LEN(sub)) { 02701 pos = rb_reg_search(sub, str, pos, 1); 02702 pos = rb_str_sublen(str, pos); 02703 } 02704 if (pos >= 0) return LONG2NUM(pos); 02705 break; 02706 02707 generic: 02708 default: { 02709 VALUE tmp; 02710 02711 tmp = rb_check_string_type(sub); 02712 if (NIL_P(tmp)) { 02713 rb_raise(rb_eTypeError, "type mismatch: %s given", 02714 rb_obj_classname(sub)); 02715 } 02716 sub = tmp; 02717 } 02718 /* fall through */ 02719 case T_STRING: 02720 pos = rb_str_rindex(str, sub, pos); 02721 if (pos >= 0) return LONG2NUM(pos); 02722 break; 02723 } 02724 return Qnil; 02725 } 02726 02727 /* 02728 * call-seq: 02729 * str =~ obj -> fixnum or nil 02730 * 02731 * Match---If <i>obj</i> is a <code>Regexp</code>, use it as a pattern to match 02732 * against <i>str</i>,and returns the position the match starts, or 02733 * <code>nil</code> if there is no match. 
Otherwise, invokes 02734 * <i>obj.=~</i>, passing <i>str</i> as an argument. The default 02735 * <code>=~</code> in <code>Object</code> returns <code>nil</code>. 02736 * 02737 * Note: <code>str =~ regexp</code> is not the same as 02738 * <code>regexp =~ str</code>. Strings captured from named capture groups 02739 * are assigned to local variables only in the second case. 02740 * 02741 * "cat o' 9 tails" =~ /\d/ #=> 7 02742 * "cat o' 9 tails" =~ 9 #=> nil 02743 */ 02744 02745 static VALUE 02746 rb_str_match(VALUE x, VALUE y) 02747 { 02748 if (SPECIAL_CONST_P(y)) goto generic; 02749 switch (BUILTIN_TYPE(y)) { 02750 case T_STRING: 02751 rb_raise(rb_eTypeError, "type mismatch: String given"); 02752 02753 case T_REGEXP: 02754 return rb_reg_match(y, x); 02755 02756 generic: 02757 default: 02758 return rb_funcall(y, rb_intern("=~"), 1, x); 02759 } 02760 } 02761 02762 02763 static VALUE get_pat(VALUE, int); 02764 02765 02766 /* 02767 * call-seq: 02768 * str.match(pattern) -> matchdata or nil 02769 * str.match(pattern, pos) -> matchdata or nil 02770 * 02771 * Converts <i>pattern</i> to a <code>Regexp</code> (if it isn't already one), 02772 * then invokes its <code>match</code> method on <i>str</i>. If the second 02773 * parameter is present, it specifies the position in the string to begin the 02774 * search. 02775 * 02776 * 'hello'.match('(.)\1') #=> #<MatchData "ll" 1:"l"> 02777 * 'hello'.match('(.)\1')[0] #=> "ll" 02778 * 'hello'.match(/(.)\1/)[0] #=> "ll" 02779 * 'hello'.match('xx') #=> nil 02780 * 02781 * If a block is given, invoke the block with MatchData if match succeed, so 02782 * that you can write 02783 * 02784 * str.match(pat) {|m| ...} 02785 * 02786 * instead of 02787 * 02788 * if m = str.match(pat) 02789 * ... 02790 * end 02791 * 02792 * The return value is a value from block execution in this case. 
02793 */ 02794 02795 static VALUE 02796 rb_str_match_m(int argc, VALUE *argv, VALUE str) 02797 { 02798 VALUE re, result; 02799 if (argc < 1) 02800 rb_check_arity(argc, 1, 2); 02801 re = argv[0]; 02802 argv[0] = str; 02803 result = rb_funcall2(get_pat(re, 0), rb_intern("match"), argc, argv); 02804 if (!NIL_P(result) && rb_block_given_p()) { 02805 return rb_yield(result); 02806 } 02807 return result; 02808 } 02809 02810 enum neighbor_char { 02811 NEIGHBOR_NOT_CHAR, 02812 NEIGHBOR_FOUND, 02813 NEIGHBOR_WRAPPED 02814 }; 02815 02816 static enum neighbor_char 02817 enc_succ_char(char *p, long len, rb_encoding *enc) 02818 { 02819 long i; 02820 int l; 02821 while (1) { 02822 for (i = len-1; 0 <= i && (unsigned char)p[i] == 0xff; i--) 02823 p[i] = '\0'; 02824 if (i < 0) 02825 return NEIGHBOR_WRAPPED; 02826 ++((unsigned char*)p)[i]; 02827 l = rb_enc_precise_mbclen(p, p+len, enc); 02828 if (MBCLEN_CHARFOUND_P(l)) { 02829 l = MBCLEN_CHARFOUND_LEN(l); 02830 if (l == len) { 02831 return NEIGHBOR_FOUND; 02832 } 02833 else { 02834 memset(p+l, 0xff, len-l); 02835 } 02836 } 02837 if (MBCLEN_INVALID_P(l) && i < len-1) { 02838 long len2; 02839 int l2; 02840 for (len2 = len-1; 0 < len2; len2--) { 02841 l2 = rb_enc_precise_mbclen(p, p+len2, enc); 02842 if (!MBCLEN_INVALID_P(l2)) 02843 break; 02844 } 02845 memset(p+len2+1, 0xff, len-(len2+1)); 02846 } 02847 } 02848 } 02849 02850 static enum neighbor_char 02851 enc_pred_char(char *p, long len, rb_encoding *enc) 02852 { 02853 long i; 02854 int l; 02855 while (1) { 02856 for (i = len-1; 0 <= i && (unsigned char)p[i] == 0; i--) 02857 p[i] = '\xff'; 02858 if (i < 0) 02859 return NEIGHBOR_WRAPPED; 02860 --((unsigned char*)p)[i]; 02861 l = rb_enc_precise_mbclen(p, p+len, enc); 02862 if (MBCLEN_CHARFOUND_P(l)) { 02863 l = MBCLEN_CHARFOUND_LEN(l); 02864 if (l == len) { 02865 return NEIGHBOR_FOUND; 02866 } 02867 else { 02868 memset(p+l, 0, len-l); 02869 } 02870 } 02871 if (MBCLEN_INVALID_P(l) && i < len-1) { 02872 long len2; 02873 int l2; 02874 
for (len2 = len-1; 0 < len2; len2--) { 02875 l2 = rb_enc_precise_mbclen(p, p+len2, enc); 02876 if (!MBCLEN_INVALID_P(l2)) 02877 break; 02878 } 02879 memset(p+len2+1, 0, len-(len2+1)); 02880 } 02881 } 02882 } 02883 02884 /* 02885 overwrite +p+ by succeeding letter in +enc+ and returns 02886 NEIGHBOR_FOUND or NEIGHBOR_WRAPPED. 02887 When NEIGHBOR_WRAPPED, carried-out letter is stored into carry. 02888 assuming each ranges are successive, and mbclen 02889 never change in each ranges. 02890 NEIGHBOR_NOT_CHAR is returned if invalid character or the range has only one 02891 character. 02892 */ 02893 static enum neighbor_char 02894 enc_succ_alnum_char(char *p, long len, rb_encoding *enc, char *carry) 02895 { 02896 enum neighbor_char ret; 02897 unsigned int c; 02898 int ctype; 02899 int range; 02900 char save[ONIGENC_CODE_TO_MBC_MAXLEN]; 02901 02902 c = rb_enc_mbc_to_codepoint(p, p+len, enc); 02903 if (rb_enc_isctype(c, ONIGENC_CTYPE_DIGIT, enc)) 02904 ctype = ONIGENC_CTYPE_DIGIT; 02905 else if (rb_enc_isctype(c, ONIGENC_CTYPE_ALPHA, enc)) 02906 ctype = ONIGENC_CTYPE_ALPHA; 02907 else 02908 return NEIGHBOR_NOT_CHAR; 02909 02910 MEMCPY(save, p, char, len); 02911 ret = enc_succ_char(p, len, enc); 02912 if (ret == NEIGHBOR_FOUND) { 02913 c = rb_enc_mbc_to_codepoint(p, p+len, enc); 02914 if (rb_enc_isctype(c, ctype, enc)) 02915 return NEIGHBOR_FOUND; 02916 } 02917 MEMCPY(p, save, char, len); 02918 range = 1; 02919 while (1) { 02920 MEMCPY(save, p, char, len); 02921 ret = enc_pred_char(p, len, enc); 02922 if (ret == NEIGHBOR_FOUND) { 02923 c = rb_enc_mbc_to_codepoint(p, p+len, enc); 02924 if (!rb_enc_isctype(c, ctype, enc)) { 02925 MEMCPY(p, save, char, len); 02926 break; 02927 } 02928 } 02929 else { 02930 MEMCPY(p, save, char, len); 02931 break; 02932 } 02933 range++; 02934 } 02935 if (range == 1) { 02936 return NEIGHBOR_NOT_CHAR; 02937 } 02938 02939 if (ctype != ONIGENC_CTYPE_DIGIT) { 02940 MEMCPY(carry, p, char, len); 02941 return NEIGHBOR_WRAPPED; 02942 } 02943 02944 
MEMCPY(carry, p, char, len); 02945 enc_succ_char(carry, len, enc); 02946 return NEIGHBOR_WRAPPED; 02947 } 02948 02949 02950 /* 02951 * call-seq: 02952 * str.succ -> new_str 02953 * str.next -> new_str 02954 * 02955 * Returns the successor to <i>str</i>. The successor is calculated by 02956 * incrementing characters starting from the rightmost alphanumeric (or 02957 * the rightmost character if there are no alphanumerics) in the 02958 * string. Incrementing a digit always results in another digit, and 02959 * incrementing a letter results in another letter of the same case. 02960 * Incrementing nonalphanumerics uses the underlying character set's 02961 * collating sequence. 02962 * 02963 * If the increment generates a ``carry,'' the character to the left of 02964 * it is incremented. This process repeats until there is no carry, 02965 * adding an additional character if necessary. 02966 * 02967 * "abcd".succ #=> "abce" 02968 * "THX1138".succ #=> "THX1139" 02969 * "<<koala>>".succ #=> "<<koalb>>" 02970 * "1999zzz".succ #=> "2000aaa" 02971 * "ZZZ9999".succ #=> "AAAA0000" 02972 * "***".succ #=> "**+" 02973 */ 02974 02975 VALUE 02976 rb_str_succ(VALUE orig) 02977 { 02978 rb_encoding *enc; 02979 VALUE str; 02980 char *sbeg, *s, *e, *last_alnum = 0; 02981 int c = -1; 02982 long l; 02983 char carry[ONIGENC_CODE_TO_MBC_MAXLEN] = "\1"; 02984 long carry_pos = 0, carry_len = 1; 02985 enum neighbor_char neighbor = NEIGHBOR_FOUND; 02986 02987 str = rb_str_new5(orig, RSTRING_PTR(orig), RSTRING_LEN(orig)); 02988 rb_enc_cr_str_copy_for_substr(str, orig); 02989 OBJ_INFECT(str, orig); 02990 if (RSTRING_LEN(str) == 0) return str; 02991 02992 enc = STR_ENC_GET(orig); 02993 sbeg = RSTRING_PTR(str); 02994 s = e = sbeg + RSTRING_LEN(str); 02995 02996 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) { 02997 if (neighbor == NEIGHBOR_NOT_CHAR && last_alnum) { 02998 if (ISALPHA(*last_alnum) ? ISDIGIT(*s) : 02999 ISDIGIT(*last_alnum) ? 
ISALPHA(*s) : 0) { 03000 s = last_alnum; 03001 break; 03002 } 03003 } 03004 if ((l = rb_enc_precise_mbclen(s, e, enc)) <= 0) continue; 03005 neighbor = enc_succ_alnum_char(s, l, enc, carry); 03006 switch (neighbor) { 03007 case NEIGHBOR_NOT_CHAR: 03008 continue; 03009 case NEIGHBOR_FOUND: 03010 return str; 03011 case NEIGHBOR_WRAPPED: 03012 last_alnum = s; 03013 break; 03014 } 03015 c = 1; 03016 carry_pos = s - sbeg; 03017 carry_len = l; 03018 } 03019 if (c == -1) { /* str contains no alnum */ 03020 s = e; 03021 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) { 03022 enum neighbor_char neighbor; 03023 if ((l = rb_enc_precise_mbclen(s, e, enc)) <= 0) continue; 03024 neighbor = enc_succ_char(s, l, enc); 03025 if (neighbor == NEIGHBOR_FOUND) 03026 return str; 03027 if (rb_enc_precise_mbclen(s, s+l, enc) != l) { 03028 /* wrapped to \0...\0. search next valid char. */ 03029 enc_succ_char(s, l, enc); 03030 } 03031 if (!rb_enc_asciicompat(enc)) { 03032 MEMCPY(carry, s, char, l); 03033 carry_len = l; 03034 } 03035 carry_pos = s - sbeg; 03036 } 03037 } 03038 RESIZE_CAPA(str, RSTRING_LEN(str) + carry_len); 03039 s = RSTRING_PTR(str) + carry_pos; 03040 memmove(s + carry_len, s, RSTRING_LEN(str) - carry_pos); 03041 memmove(s, carry, carry_len); 03042 STR_SET_LEN(str, RSTRING_LEN(str) + carry_len); 03043 RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0'; 03044 rb_enc_str_coderange(str); 03045 return str; 03046 } 03047 03048 03049 /* 03050 * call-seq: 03051 * str.succ! -> str 03052 * str.next! -> str 03053 * 03054 * Equivalent to <code>String#succ</code>, but modifies the receiver in 03055 * place. 
03056 */ 03057 03058 static VALUE 03059 rb_str_succ_bang(VALUE str) 03060 { 03061 rb_str_shared_replace(str, rb_str_succ(str)); 03062 03063 return str; 03064 } 03065 03066 03067 /* 03068 * call-seq: 03069 * str.upto(other_str, exclusive=false) {|s| block } -> str 03070 * str.upto(other_str, exclusive=false) -> an_enumerator 03071 * 03072 * Iterates through successive values, starting at <i>str</i> and 03073 * ending at <i>other_str</i> inclusive, passing each value in turn to 03074 * the block. The <code>String#succ</code> method is used to generate 03075 * each value. If optional second argument exclusive is omitted or is false, 03076 * the last value will be included; otherwise it will be excluded. 03077 * 03078 * If no block is given, an enumerator is returned instead. 03079 * 03080 * "a8".upto("b6") {|s| print s, ' ' } 03081 * for s in "a8".."b6" 03082 * print s, ' ' 03083 * end 03084 * 03085 * <em>produces:</em> 03086 * 03087 * a8 a9 b0 b1 b2 b3 b4 b5 b6 03088 * a8 a9 b0 b1 b2 b3 b4 b5 b6 03089 * 03090 * If <i>str</i> and <i>other_str</i> contains only ascii numeric characters, 03091 * both are recognized as decimal numbers. In addition, the width of 03092 * string (e.g. leading zeros) is handled appropriately. 
03093 * 03094 * "9".upto("11").to_a #=> ["9", "10", "11"] 03095 * "25".upto("5").to_a #=> [] 03096 * "07".upto("11").to_a #=> ["07", "08", "09", "10", "11"] 03097 */ 03098 03099 static VALUE 03100 rb_str_upto(int argc, VALUE *argv, VALUE beg) 03101 { 03102 VALUE end, exclusive; 03103 VALUE current, after_end; 03104 ID succ; 03105 int n, excl, ascii; 03106 rb_encoding *enc; 03107 03108 rb_scan_args(argc, argv, "11", &end, &exclusive); 03109 RETURN_ENUMERATOR(beg, argc, argv); 03110 excl = RTEST(exclusive); 03111 CONST_ID(succ, "succ"); 03112 StringValue(end); 03113 enc = rb_enc_check(beg, end); 03114 ascii = (is_ascii_string(beg) && is_ascii_string(end)); 03115 /* single character */ 03116 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1 && ascii) { 03117 char c = RSTRING_PTR(beg)[0]; 03118 char e = RSTRING_PTR(end)[0]; 03119 03120 if (c > e || (excl && c == e)) return beg; 03121 for (;;) { 03122 rb_yield(rb_enc_str_new(&c, 1, enc)); 03123 if (!excl && c == e) break; 03124 c++; 03125 if (excl && c == e) break; 03126 } 03127 return beg; 03128 } 03129 /* both edges are all digits */ 03130 if (ascii && ISDIGIT(RSTRING_PTR(beg)[0]) && ISDIGIT(RSTRING_PTR(end)[0])) { 03131 char *s, *send; 03132 VALUE b, e; 03133 int width; 03134 03135 s = RSTRING_PTR(beg); send = RSTRING_END(beg); 03136 width = rb_long2int(send - s); 03137 while (s < send) { 03138 if (!ISDIGIT(*s)) goto no_digits; 03139 s++; 03140 } 03141 s = RSTRING_PTR(end); send = RSTRING_END(end); 03142 while (s < send) { 03143 if (!ISDIGIT(*s)) goto no_digits; 03144 s++; 03145 } 03146 b = rb_str_to_inum(beg, 10, FALSE); 03147 e = rb_str_to_inum(end, 10, FALSE); 03148 if (FIXNUM_P(b) && FIXNUM_P(e)) { 03149 long bi = FIX2LONG(b); 03150 long ei = FIX2LONG(e); 03151 rb_encoding *usascii = rb_usascii_encoding(); 03152 03153 while (bi <= ei) { 03154 if (excl && bi == ei) break; 03155 rb_yield(rb_enc_sprintf(usascii, "%.*ld", width, bi)); 03156 bi++; 03157 } 03158 } 03159 else { 03160 ID op = excl ? 
'<' : rb_intern("<="); 03161 VALUE args[2], fmt = rb_obj_freeze(rb_usascii_str_new_cstr("%.*d")); 03162 03163 args[0] = INT2FIX(width); 03164 while (rb_funcall(b, op, 1, e)) { 03165 args[1] = b; 03166 rb_yield(rb_str_format(numberof(args), args, fmt)); 03167 b = rb_funcall(b, succ, 0, 0); 03168 } 03169 } 03170 return beg; 03171 } 03172 /* normal case */ 03173 no_digits: 03174 n = rb_str_cmp(beg, end); 03175 if (n > 0 || (excl && n == 0)) return beg; 03176 03177 after_end = rb_funcall(end, succ, 0, 0); 03178 current = rb_str_dup(beg); 03179 while (!rb_str_equal(current, after_end)) { 03180 VALUE next = Qnil; 03181 if (excl || !rb_str_equal(current, end)) 03182 next = rb_funcall(current, succ, 0, 0); 03183 rb_yield(current); 03184 if (NIL_P(next)) break; 03185 current = next; 03186 StringValue(current); 03187 if (excl && rb_str_equal(current, end)) break; 03188 if (RSTRING_LEN(current) > RSTRING_LEN(end) || RSTRING_LEN(current) == 0) 03189 break; 03190 } 03191 03192 return beg; 03193 } 03194 03195 static VALUE 03196 rb_str_subpat(VALUE str, VALUE re, VALUE backref) 03197 { 03198 if (rb_reg_search(re, str, 0, 0) >= 0) { 03199 VALUE match = rb_backref_get(); 03200 int nth = rb_reg_backref_number(match, backref); 03201 return rb_reg_nth_match(nth, match); 03202 } 03203 return Qnil; 03204 } 03205 03206 static VALUE 03207 rb_str_aref(VALUE str, VALUE indx) 03208 { 03209 long idx; 03210 03211 if (FIXNUM_P(indx)) { 03212 idx = FIX2LONG(indx); 03213 03214 num_index: 03215 str = rb_str_substr(str, idx, 1); 03216 if (!NIL_P(str) && RSTRING_LEN(str) == 0) return Qnil; 03217 return str; 03218 } 03219 03220 if (SPECIAL_CONST_P(indx)) goto generic; 03221 switch (BUILTIN_TYPE(indx)) { 03222 case T_REGEXP: 03223 return rb_str_subpat(str, indx, INT2FIX(0)); 03224 03225 case T_STRING: 03226 if (rb_str_index(str, indx, 0) != -1) 03227 return rb_str_dup(indx); 03228 return Qnil; 03229 03230 generic: 03231 default: 03232 /* check if indx is Range */ 03233 { 03234 long beg, len; 03235 
VALUE tmp; 03236 03237 len = str_strlen(str, STR_ENC_GET(str)); 03238 switch (rb_range_beg_len(indx, &beg, &len, len, 0)) { 03239 case Qfalse: 03240 break; 03241 case Qnil: 03242 return Qnil; 03243 default: 03244 tmp = rb_str_substr(str, beg, len); 03245 return tmp; 03246 } 03247 } 03248 idx = NUM2LONG(indx); 03249 goto num_index; 03250 } 03251 03252 UNREACHABLE; 03253 } 03254 03255 03256 /* 03257 * call-seq: 03258 * str[index] -> new_str or nil 03259 * str[start, length] -> new_str or nil 03260 * str[range] -> new_str or nil 03261 * str[regexp] -> new_str or nil 03262 * str[regexp, capture] -> new_str or nil 03263 * str[match_str] -> new_str or nil 03264 * str.slice(index) -> new_str or nil 03265 * str.slice(start, length) -> new_str or nil 03266 * str.slice(range) -> new_str or nil 03267 * str.slice(regexp) -> new_str or nil 03268 * str.slice(regexp, capture) -> new_str or nil 03269 * str.slice(match_str) -> new_str or nil 03270 * 03271 * Element Reference --- If passed a single +index+, returns a substring of 03272 * one character at that index. If passed a +start+ index and a +length+, 03273 * returns a substring containing +length+ characters starting at the 03274 * +index+. If passed a +range+, its beginning and end are interpreted as 03275 * offsets delimiting the substring to be returned. 03276 * 03277 * In these three cases, if an index is negative, it is counted from the end 03278 * of the string. For the +start+ and +range+ cases the starting index 03279 * is just before a character and an index matching the string's size. 03280 * Additionally, an empty string is returned when the starting index for a 03281 * character range is at the end of the string. 03282 * 03283 * Returns +nil+ if the initial index falls outside the string or the length 03284 * is negative. 03285 * 03286 * If a +Regexp+ is supplied, the matching portion of the string is 03287 * returned. 
If a +capture+ follows the regular expression, which may be a 03288 * capture group index or name, follows the regular expression that component 03289 * of the MatchData is returned instead. 03290 * 03291 * If a +match_str+ is given, that string is returned if it occurs in 03292 * the string. 03293 * 03294 * Returns +nil+ if the regular expression does not match or the match string 03295 * cannot be found. 03296 * 03297 * a = "hello there" 03298 * 03299 * a[1] #=> "e" 03300 * a[2, 3] #=> "llo" 03301 * a[2..3] #=> "ll" 03302 * 03303 * a[-3, 2] #=> "er" 03304 * a[7..-2] #=> "her" 03305 * a[-4..-2] #=> "her" 03306 * a[-2..-4] #=> "" 03307 * 03308 * a[11, 0] #=> "" 03309 * a[11] #=> nil 03310 * a[12, 0] #=> nil 03311 * a[12..-1] #=> nil 03312 * 03313 * a[/[aeiou](.)\1/] #=> "ell" 03314 * a[/[aeiou](.)\1/, 0] #=> "ell" 03315 * a[/[aeiou](.)\1/, 1] #=> "l" 03316 * a[/[aeiou](.)\1/, 2] #=> nil 03317 * 03318 * a[/(?<vowel>[aeiou])(?<non_vowel>[^aeiou])/, "non_vowel"] #=> "l" 03319 * a[/(?<vowel>[aeiou])(?<non_vowel>[^aeiou])/, "vowel"] #=> "e" 03320 * 03321 * a["lo"] #=> "lo" 03322 * a["bye"] #=> nil 03323 */ 03324 03325 static VALUE 03326 rb_str_aref_m(int argc, VALUE *argv, VALUE str) 03327 { 03328 if (argc == 2) { 03329 if (RB_TYPE_P(argv[0], T_REGEXP)) { 03330 return rb_str_subpat(str, argv[0], argv[1]); 03331 } 03332 return rb_str_substr(str, NUM2LONG(argv[0]), NUM2LONG(argv[1])); 03333 } 03334 rb_check_arity(argc, 1, 2); 03335 return rb_str_aref(str, argv[0]); 03336 } 03337 03338 VALUE 03339 rb_str_drop_bytes(VALUE str, long len) 03340 { 03341 char *ptr = RSTRING_PTR(str); 03342 long olen = RSTRING_LEN(str), nlen; 03343 03344 str_modifiable(str); 03345 if (len > olen) len = olen; 03346 nlen = olen - len; 03347 if (nlen <= RSTRING_EMBED_LEN_MAX) { 03348 char *oldptr = ptr; 03349 int fl = (int)(RBASIC(str)->flags & (STR_NOEMBED|ELTS_SHARED)); 03350 STR_SET_EMBED(str); 03351 STR_SET_EMBED_LEN(str, nlen); 03352 ptr = RSTRING(str)->as.ary; 03353 memmove(ptr, oldptr + len, 
nlen); 03354 if (fl == STR_NOEMBED) xfree(oldptr); 03355 } 03356 else { 03357 if (!STR_SHARED_P(str)) rb_str_new4(str); 03358 ptr = RSTRING(str)->as.heap.ptr += len; 03359 RSTRING(str)->as.heap.len = nlen; 03360 } 03361 ptr[nlen] = 0; 03362 ENC_CODERANGE_CLEAR(str); 03363 return str; 03364 } 03365 03366 static void 03367 rb_str_splice_0(VALUE str, long beg, long len, VALUE val) 03368 { 03369 if (beg == 0 && RSTRING_LEN(val) == 0) { 03370 rb_str_drop_bytes(str, len); 03371 OBJ_INFECT(str, val); 03372 return; 03373 } 03374 03375 rb_str_modify(str); 03376 if (len < RSTRING_LEN(val)) { 03377 /* expand string */ 03378 RESIZE_CAPA(str, RSTRING_LEN(str) + RSTRING_LEN(val) - len + 1); 03379 } 03380 03381 if (RSTRING_LEN(val) != len) { 03382 memmove(RSTRING_PTR(str) + beg + RSTRING_LEN(val), 03383 RSTRING_PTR(str) + beg + len, 03384 RSTRING_LEN(str) - (beg + len)); 03385 } 03386 if (RSTRING_LEN(val) < beg && len < 0) { 03387 MEMZERO(RSTRING_PTR(str) + RSTRING_LEN(str), char, -len); 03388 } 03389 if (RSTRING_LEN(val) > 0) { 03390 memmove(RSTRING_PTR(str)+beg, RSTRING_PTR(val), RSTRING_LEN(val)); 03391 } 03392 STR_SET_LEN(str, RSTRING_LEN(str) + RSTRING_LEN(val) - len); 03393 if (RSTRING_PTR(str)) { 03394 RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0'; 03395 } 03396 OBJ_INFECT(str, val); 03397 } 03398 03399 static void 03400 rb_str_splice(VALUE str, long beg, long len, VALUE val) 03401 { 03402 long slen; 03403 char *p, *e; 03404 rb_encoding *enc; 03405 int singlebyte = single_byte_optimizable(str); 03406 int cr; 03407 03408 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len); 03409 03410 StringValue(val); 03411 enc = rb_enc_check(str, val); 03412 slen = str_strlen(str, enc); 03413 03414 if (slen < beg) { 03415 out_of_range: 03416 rb_raise(rb_eIndexError, "index %ld out of string", beg); 03417 } 03418 if (beg < 0) { 03419 if (-beg > slen) { 03420 goto out_of_range; 03421 } 03422 beg += slen; 03423 } 03424 if (slen < len || slen < beg + len) { 03425 len = slen - beg; 
03426 } 03427 str_modify_keep_cr(str); 03428 p = str_nth(RSTRING_PTR(str), RSTRING_END(str), beg, enc, singlebyte); 03429 if (!p) p = RSTRING_END(str); 03430 e = str_nth(p, RSTRING_END(str), len, enc, singlebyte); 03431 if (!e) e = RSTRING_END(str); 03432 /* error check */ 03433 beg = p - RSTRING_PTR(str); /* physical position */ 03434 len = e - p; /* physical length */ 03435 rb_str_splice_0(str, beg, len, val); 03436 rb_enc_associate(str, enc); 03437 cr = ENC_CODERANGE_AND(ENC_CODERANGE(str), ENC_CODERANGE(val)); 03438 if (cr != ENC_CODERANGE_BROKEN) 03439 ENC_CODERANGE_SET(str, cr); 03440 } 03441 03442 void 03443 rb_str_update(VALUE str, long beg, long len, VALUE val) 03444 { 03445 rb_str_splice(str, beg, len, val); 03446 } 03447 03448 static void 03449 rb_str_subpat_set(VALUE str, VALUE re, VALUE backref, VALUE val) 03450 { 03451 int nth; 03452 VALUE match; 03453 long start, end, len; 03454 rb_encoding *enc; 03455 struct re_registers *regs; 03456 03457 if (rb_reg_search(re, str, 0, 0) < 0) { 03458 rb_raise(rb_eIndexError, "regexp not matched"); 03459 } 03460 match = rb_backref_get(); 03461 nth = rb_reg_backref_number(match, backref); 03462 regs = RMATCH_REGS(match); 03463 if (nth >= regs->num_regs) { 03464 out_of_range: 03465 rb_raise(rb_eIndexError, "index %d out of regexp", nth); 03466 } 03467 if (nth < 0) { 03468 if (-nth >= regs->num_regs) { 03469 goto out_of_range; 03470 } 03471 nth += regs->num_regs; 03472 } 03473 03474 start = BEG(nth); 03475 if (start == -1) { 03476 rb_raise(rb_eIndexError, "regexp group %d not matched", nth); 03477 } 03478 end = END(nth); 03479 len = end - start; 03480 StringValue(val); 03481 enc = rb_enc_check(str, val); 03482 rb_str_splice_0(str, start, len, val); 03483 rb_enc_associate(str, enc); 03484 } 03485 03486 static VALUE 03487 rb_str_aset(VALUE str, VALUE indx, VALUE val) 03488 { 03489 long idx, beg; 03490 03491 if (FIXNUM_P(indx)) { 03492 idx = FIX2LONG(indx); 03493 num_index: 03494 rb_str_splice(str, idx, 1, val); 03495 
return val; 03496 } 03497 03498 if (SPECIAL_CONST_P(indx)) goto generic; 03499 switch (TYPE(indx)) { 03500 case T_REGEXP: 03501 rb_str_subpat_set(str, indx, INT2FIX(0), val); 03502 return val; 03503 03504 case T_STRING: 03505 beg = rb_str_index(str, indx, 0); 03506 if (beg < 0) { 03507 rb_raise(rb_eIndexError, "string not matched"); 03508 } 03509 beg = rb_str_sublen(str, beg); 03510 rb_str_splice(str, beg, str_strlen(indx, 0), val); 03511 return val; 03512 03513 generic: 03514 default: 03515 /* check if indx is Range */ 03516 { 03517 long beg, len; 03518 if (rb_range_beg_len(indx, &beg, &len, str_strlen(str, 0), 2)) { 03519 rb_str_splice(str, beg, len, val); 03520 return val; 03521 } 03522 } 03523 idx = NUM2LONG(indx); 03524 goto num_index; 03525 } 03526 } 03527 03528 /* 03529 * call-seq: 03530 * str[fixnum] = new_str 03531 * str[fixnum, fixnum] = new_str 03532 * str[range] = aString 03533 * str[regexp] = new_str 03534 * str[regexp, fixnum] = new_str 03535 * str[regexp, name] = new_str 03536 * str[other_str] = new_str 03537 * 03538 * Element Assignment---Replaces some or all of the content of <i>str</i>. The 03539 * portion of the string affected is determined using the same criteria as 03540 * <code>String#[]</code>. If the replacement string is not the same length as 03541 * the text it is replacing, the string will be adjusted accordingly. If the 03542 * regular expression or string is used as the index doesn't match a position 03543 * in the string, <code>IndexError</code> is raised. If the regular expression 03544 * form is used, the optional second <code>Fixnum</code> allows you to specify 03545 * which portion of the match to replace (effectively using the 03546 * <code>MatchData</code> indexing rules. 
The forms that take a 03547 * <code>Fixnum</code> will raise an <code>IndexError</code> if the value is 03548 * out of range; the <code>Range</code> form will raise a 03549 * <code>RangeError</code>, and the <code>Regexp</code> and <code>String</code> 03550 * will raise an <code>IndexError</code> on negative match. 03551 */ 03552 03553 static VALUE 03554 rb_str_aset_m(int argc, VALUE *argv, VALUE str) 03555 { 03556 if (argc == 3) { 03557 if (RB_TYPE_P(argv[0], T_REGEXP)) { 03558 rb_str_subpat_set(str, argv[0], argv[1], argv[2]); 03559 } 03560 else { 03561 rb_str_splice(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]), argv[2]); 03562 } 03563 return argv[2]; 03564 } 03565 rb_check_arity(argc, 2, 3); 03566 return rb_str_aset(str, argv[0], argv[1]); 03567 } 03568 03569 /* 03570 * call-seq: 03571 * str.insert(index, other_str) -> str 03572 * 03573 * Inserts <i>other_str</i> before the character at the given 03574 * <i>index</i>, modifying <i>str</i>. Negative indices count from the 03575 * end of the string, and insert <em>after</em> the given character. 03576 * The intent is insert <i>aString</i> so that it starts at the given 03577 * <i>index</i>. 
03578 * 03579 * "abcd".insert(0, 'X') #=> "Xabcd" 03580 * "abcd".insert(3, 'X') #=> "abcXd" 03581 * "abcd".insert(4, 'X') #=> "abcdX" 03582 * "abcd".insert(-3, 'X') #=> "abXcd" 03583 * "abcd".insert(-1, 'X') #=> "abcdX" 03584 */ 03585 03586 static VALUE 03587 rb_str_insert(VALUE str, VALUE idx, VALUE str2) 03588 { 03589 long pos = NUM2LONG(idx); 03590 03591 if (pos == -1) { 03592 return rb_str_append(str, str2); 03593 } 03594 else if (pos < 0) { 03595 pos++; 03596 } 03597 rb_str_splice(str, pos, 0, str2); 03598 return str; 03599 } 03600 03601 03602 /* 03603 * call-seq: 03604 * str.slice!(fixnum) -> fixnum or nil 03605 * str.slice!(fixnum, fixnum) -> new_str or nil 03606 * str.slice!(range) -> new_str or nil 03607 * str.slice!(regexp) -> new_str or nil 03608 * str.slice!(other_str) -> new_str or nil 03609 * 03610 * Deletes the specified portion from <i>str</i>, and returns the portion 03611 * deleted. 03612 * 03613 * string = "this is a string" 03614 * string.slice!(2) #=> "i" 03615 * string.slice!(3..6) #=> " is " 03616 * string.slice!(/s.*t/) #=> "sa st" 03617 * string.slice!("r") #=> "r" 03618 * string #=> "thing" 03619 */ 03620 03621 static VALUE 03622 rb_str_slice_bang(int argc, VALUE *argv, VALUE str) 03623 { 03624 VALUE result; 03625 VALUE buf[3]; 03626 int i; 03627 03628 rb_check_arity(argc, 1, 2); 03629 for (i=0; i<argc; i++) { 03630 buf[i] = argv[i]; 03631 } 03632 str_modify_keep_cr(str); 03633 result = rb_str_aref_m(argc, buf, str); 03634 if (!NIL_P(result)) { 03635 buf[i] = rb_str_new(0,0); 03636 rb_str_aset_m(argc+1, buf, str); 03637 } 03638 return result; 03639 } 03640 03641 static VALUE 03642 get_pat(VALUE pat, int quote) 03643 { 03644 VALUE val; 03645 03646 switch (TYPE(pat)) { 03647 case T_REGEXP: 03648 return pat; 03649 03650 case T_STRING: 03651 break; 03652 03653 default: 03654 val = rb_check_string_type(pat); 03655 if (NIL_P(val)) { 03656 Check_Type(pat, T_REGEXP); 03657 } 03658 pat = val; 03659 } 03660 03661 if (quote) { 03662 pat = 
rb_reg_quote(pat); 03663 } 03664 03665 return rb_reg_regcomp(pat); 03666 } 03667 03668 03669 /* 03670 * call-seq: 03671 * str.sub!(pattern, replacement) -> str or nil 03672 * str.sub!(pattern) {|match| block } -> str or nil 03673 * 03674 * Performs the same substitution as String#sub in-place. 03675 * 03676 * Returns +str+ if a substitution was performed or +nil+ if no substitution 03677 * was performed. 03678 */ 03679 03680 static VALUE 03681 rb_str_sub_bang(int argc, VALUE *argv, VALUE str) 03682 { 03683 VALUE pat, repl, hash = Qnil; 03684 int iter = 0; 03685 int tainted = 0; 03686 int untrusted = 0; 03687 long plen; 03688 int min_arity = rb_block_given_p() ? 1 : 2; 03689 03690 rb_check_arity(argc, min_arity, 2); 03691 if (argc == 1) { 03692 iter = 1; 03693 } 03694 else { 03695 repl = argv[1]; 03696 hash = rb_check_hash_type(argv[1]); 03697 if (NIL_P(hash)) { 03698 StringValue(repl); 03699 } 03700 if (OBJ_TAINTED(repl)) tainted = 1; 03701 if (OBJ_UNTRUSTED(repl)) untrusted = 1; 03702 } 03703 03704 pat = get_pat(argv[0], 1); 03705 str_modifiable(str); 03706 if (rb_reg_search(pat, str, 0, 0) >= 0) { 03707 rb_encoding *enc; 03708 int cr = ENC_CODERANGE(str); 03709 VALUE match = rb_backref_get(); 03710 struct re_registers *regs = RMATCH_REGS(match); 03711 long beg0 = BEG(0); 03712 long end0 = END(0); 03713 char *p, *rp; 03714 long len, rlen; 03715 03716 if (iter || !NIL_P(hash)) { 03717 p = RSTRING_PTR(str); len = RSTRING_LEN(str); 03718 03719 if (iter) { 03720 repl = rb_obj_as_string(rb_yield(rb_reg_nth_match(0, match))); 03721 } 03722 else { 03723 repl = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0)); 03724 repl = rb_obj_as_string(repl); 03725 } 03726 str_mod_check(str, p, len); 03727 rb_check_frozen(str); 03728 } 03729 else { 03730 repl = rb_reg_regsub(repl, str, regs, pat); 03731 } 03732 enc = rb_enc_compatible(str, repl); 03733 if (!enc) { 03734 rb_encoding *str_enc = STR_ENC_GET(str); 03735 p = RSTRING_PTR(str); len = RSTRING_LEN(str); 03736 if 
(coderange_scan(p, beg0, str_enc) != ENC_CODERANGE_7BIT || 03737 coderange_scan(p+end0, len-end0, str_enc) != ENC_CODERANGE_7BIT) { 03738 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s", 03739 rb_enc_name(str_enc), 03740 rb_enc_name(STR_ENC_GET(repl))); 03741 } 03742 enc = STR_ENC_GET(repl); 03743 } 03744 rb_str_modify(str); 03745 rb_enc_associate(str, enc); 03746 if (OBJ_TAINTED(repl)) tainted = 1; 03747 if (OBJ_UNTRUSTED(repl)) untrusted = 1; 03748 if (ENC_CODERANGE_UNKNOWN < cr && cr < ENC_CODERANGE_BROKEN) { 03749 int cr2 = ENC_CODERANGE(repl); 03750 if (cr2 == ENC_CODERANGE_BROKEN || 03751 (cr == ENC_CODERANGE_VALID && cr2 == ENC_CODERANGE_7BIT)) 03752 cr = ENC_CODERANGE_UNKNOWN; 03753 else 03754 cr = cr2; 03755 } 03756 plen = end0 - beg0; 03757 rp = RSTRING_PTR(repl); rlen = RSTRING_LEN(repl); 03758 len = RSTRING_LEN(str); 03759 if (rlen > plen) { 03760 RESIZE_CAPA(str, len + rlen - plen); 03761 } 03762 p = RSTRING_PTR(str); 03763 if (rlen != plen) { 03764 memmove(p + beg0 + rlen, p + beg0 + plen, len - beg0 - plen); 03765 } 03766 memcpy(p + beg0, rp, rlen); 03767 len += rlen - plen; 03768 STR_SET_LEN(str, len); 03769 RSTRING_PTR(str)[len] = '\0'; 03770 ENC_CODERANGE_SET(str, cr); 03771 if (tainted) OBJ_TAINT(str); 03772 if (untrusted) OBJ_UNTRUST(str); 03773 03774 return str; 03775 } 03776 return Qnil; 03777 } 03778 03779 03780 /* 03781 * call-seq: 03782 * str.sub(pattern, replacement) -> new_str 03783 * str.sub(pattern, hash) -> new_str 03784 * str.sub(pattern) {|match| block } -> new_str 03785 * 03786 * Returns a copy of +str+ with the _first_ occurrence of +pattern+ 03787 * replaced by the second argument. The +pattern+ is typically a Regexp; if 03788 * given as a String, any regular expression metacharacters it contains will 03789 * be interpreted literally, e.g. <code>'\\\d'</code> will match a backlash 03790 * followed by 'd', instead of a digit. 
*
 *  If +replacement+ is a String it will be substituted for the matched text.
 *  It may contain back-references to the pattern's capture groups of the form
 *  <code>"\\d"</code>, where <i>d</i> is a group number, or
 *  <code>"\\k<n>"</code>, where <i>n</i> is a group name. If it is a
 *  double-quoted string, both back-references must be preceded by an
 *  additional backslash. However, within +replacement+ the special match
 *  variables, such as <code>$&</code>, will not refer to the current match.
 *
 *  If the second argument is a Hash, and the matched text is one of its keys,
 *  the corresponding value is the replacement string.
 *
 *  In the block form, the current match string is passed in as a parameter,
 *  and variables such as <code>$1</code>, <code>$2</code>, <code>$`</code>,
 *  <code>$&</code>, and <code>$'</code> will be set appropriately. The value
 *  returned by the block will be substituted for the match on each call.
 *
 *  The result inherits any tainting in the original string or any supplied
 *  replacement string.
 *
 *     "hello".sub(/[aeiou]/, '*')                  #=> "h*llo"
 *     "hello".sub(/([aeiou])/, '<\1>')             #=> "h<e>llo"
 *     "hello".sub(/./) {|s| s.ord.to_s + ' ' }     #=> "104 ello"
 *     "hello".sub(/(?<foo>[aeiou])/, '*\k<foo>*')  #=> "h*e*llo"
 *     'Is SHELL your preferred shell?'.sub(/[[:upper:]]{2,}/, ENV)
 *      #=> "Is /bin/bash your preferred shell?"
03817 */ 03818 03819 static VALUE 03820 rb_str_sub(int argc, VALUE *argv, VALUE str) 03821 { 03822 str = rb_str_dup(str); 03823 rb_str_sub_bang(argc, argv, str); 03824 return str; 03825 } 03826 03827 static VALUE 03828 str_gsub(int argc, VALUE *argv, VALUE str, int bang) 03829 { 03830 VALUE pat, val, repl, match, dest, hash = Qnil; 03831 struct re_registers *regs; 03832 long beg, n; 03833 long beg0, end0; 03834 long offset, blen, slen, len, last; 03835 int iter = 0; 03836 char *sp, *cp; 03837 int tainted = 0; 03838 rb_encoding *str_enc; 03839 03840 switch (argc) { 03841 case 1: 03842 RETURN_ENUMERATOR(str, argc, argv); 03843 iter = 1; 03844 break; 03845 case 2: 03846 repl = argv[1]; 03847 hash = rb_check_hash_type(argv[1]); 03848 if (NIL_P(hash)) { 03849 StringValue(repl); 03850 } 03851 if (OBJ_TAINTED(repl)) tainted = 1; 03852 break; 03853 default: 03854 rb_check_arity(argc, 1, 2); 03855 } 03856 03857 pat = get_pat(argv[0], 1); 03858 beg = rb_reg_search(pat, str, 0, 0); 03859 if (beg < 0) { 03860 if (bang) return Qnil; /* no match, no substitution */ 03861 return rb_str_dup(str); 03862 } 03863 03864 offset = 0; 03865 n = 0; 03866 blen = RSTRING_LEN(str) + 30; /* len + margin */ 03867 dest = rb_str_buf_new(blen); 03868 sp = RSTRING_PTR(str); 03869 slen = RSTRING_LEN(str); 03870 cp = sp; 03871 str_enc = STR_ENC_GET(str); 03872 rb_enc_associate(dest, str_enc); 03873 ENC_CODERANGE_SET(dest, rb_enc_asciicompat(str_enc) ? 
ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID); 03874 03875 do { 03876 n++; 03877 match = rb_backref_get(); 03878 regs = RMATCH_REGS(match); 03879 beg0 = BEG(0); 03880 end0 = END(0); 03881 if (iter || !NIL_P(hash)) { 03882 if (iter) { 03883 val = rb_obj_as_string(rb_yield(rb_reg_nth_match(0, match))); 03884 } 03885 else { 03886 val = rb_hash_aref(hash, rb_str_subseq(str, BEG(0), END(0) - BEG(0))); 03887 val = rb_obj_as_string(val); 03888 } 03889 str_mod_check(str, sp, slen); 03890 if (val == dest) { /* paranoid check [ruby-dev:24827] */ 03891 rb_raise(rb_eRuntimeError, "block should not cheat"); 03892 } 03893 } 03894 else { 03895 val = rb_reg_regsub(repl, str, regs, pat); 03896 } 03897 03898 if (OBJ_TAINTED(val)) tainted = 1; 03899 03900 len = beg - offset; /* copy pre-match substr */ 03901 if (len) { 03902 rb_enc_str_buf_cat(dest, cp, len, str_enc); 03903 } 03904 03905 rb_str_buf_append(dest, val); 03906 03907 last = offset; 03908 offset = end0; 03909 if (beg0 == end0) { 03910 /* 03911 * Always consume at least one character of the input string 03912 * in order to prevent infinite loops. 
03913 */ 03914 if (RSTRING_LEN(str) <= end0) break; 03915 len = rb_enc_fast_mbclen(RSTRING_PTR(str)+end0, RSTRING_END(str), str_enc); 03916 rb_enc_str_buf_cat(dest, RSTRING_PTR(str)+end0, len, str_enc); 03917 offset = end0 + len; 03918 } 03919 cp = RSTRING_PTR(str) + offset; 03920 if (offset > RSTRING_LEN(str)) break; 03921 beg = rb_reg_search(pat, str, offset, 0); 03922 } while (beg >= 0); 03923 if (RSTRING_LEN(str) > offset) { 03924 rb_enc_str_buf_cat(dest, cp, RSTRING_LEN(str) - offset, str_enc); 03925 } 03926 rb_reg_search(pat, str, last, 0); 03927 if (bang) { 03928 rb_str_shared_replace(str, dest); 03929 } 03930 else { 03931 RBASIC(dest)->klass = rb_obj_class(str); 03932 OBJ_INFECT(dest, str); 03933 str = dest; 03934 } 03935 03936 if (tainted) OBJ_TAINT(str); 03937 return str; 03938 } 03939 03940 03941 /* 03942 * call-seq: 03943 * str.gsub!(pattern, replacement) -> str or nil 03944 * str.gsub!(pattern) {|match| block } -> str or nil 03945 * str.gsub!(pattern) -> an_enumerator 03946 * 03947 * Performs the substitutions of <code>String#gsub</code> in place, returning 03948 * <i>str</i>, or <code>nil</code> if no substitutions were performed. 03949 * If no block and no <i>replacement</i> is given, an enumerator is returned instead. 03950 */ 03951 03952 static VALUE 03953 rb_str_gsub_bang(int argc, VALUE *argv, VALUE str) 03954 { 03955 str_modify_keep_cr(str); 03956 return str_gsub(argc, argv, str, 1); 03957 } 03958 03959 03960 /* 03961 * call-seq: 03962 * str.gsub(pattern, replacement) -> new_str 03963 * str.gsub(pattern, hash) -> new_str 03964 * str.gsub(pattern) {|match| block } -> new_str 03965 * str.gsub(pattern) -> enumerator 03966 * 03967 * Returns a copy of <i>str</i> with the <em>all</em> occurrences of 03968 * <i>pattern</i> substituted for the second argument. 
The <i>pattern</i> is
 *  typically a <code>Regexp</code>; if given as a <code>String</code>, any
 *  regular expression metacharacters it contains will be interpreted
 *  literally, e.g. <code>'\\\d'</code> will match a backslash followed by 'd',
 *  instead of a digit.
 *
 *  If <i>replacement</i> is a <code>String</code> it will be substituted for
 *  the matched text. It may contain back-references to the pattern's capture
 *  groups of the form <code>\\\d</code>, where <i>d</i> is a group number, or
 *  <code>\\\k<n></code>, where <i>n</i> is a group name. If it is a
 *  double-quoted string, both back-references must be preceded by an
 *  additional backslash. However, within <i>replacement</i> the special match
 *  variables, such as <code>$&</code>, will not refer to the current match.
 *
 *  If the second argument is a <code>Hash</code>, and the matched text is one
 *  of its keys, the corresponding value is the replacement string.
 *
 *  In the block form, the current match string is passed in as a parameter,
 *  and variables such as <code>$1</code>, <code>$2</code>, <code>$`</code>,
 *  <code>$&</code>, and <code>$'</code> will be set appropriately. The value
 *  returned by the block will be substituted for the match on each call.
 *
 *  The result inherits any tainting in the original string or any supplied
 *  replacement string.
 *
 *  When neither a block nor a second argument is supplied, an
 *  <code>Enumerator</code> is returned.
03995 * 03996 * "hello".gsub(/[aeiou]/, '*') #=> "h*ll*" 03997 * "hello".gsub(/([aeiou])/, '<\1>') #=> "h<e>ll<o>" 03998 * "hello".gsub(/./) {|s| s.ord.to_s + ' '} #=> "104 101 108 108 111 " 03999 * "hello".gsub(/(?<foo>[aeiou])/, '{\k<foo>}') #=> "h{e}ll{o}" 04000 * 'hello'.gsub(/[eo]/, 'e' => 3, 'o' => '*') #=> "h3ll*" 04001 */ 04002 04003 static VALUE 04004 rb_str_gsub(int argc, VALUE *argv, VALUE str) 04005 { 04006 return str_gsub(argc, argv, str, 0); 04007 } 04008 04009 04010 /* 04011 * call-seq: 04012 * str.replace(other_str) -> str 04013 * 04014 * Replaces the contents and taintedness of <i>str</i> with the corresponding 04015 * values in <i>other_str</i>. 04016 * 04017 * s = "hello" #=> "hello" 04018 * s.replace "world" #=> "world" 04019 */ 04020 04021 VALUE 04022 rb_str_replace(VALUE str, VALUE str2) 04023 { 04024 str_modifiable(str); 04025 if (str == str2) return str; 04026 04027 StringValue(str2); 04028 str_discard(str); 04029 return str_replace(str, str2); 04030 } 04031 04032 /* 04033 * call-seq: 04034 * string.clear -> string 04035 * 04036 * Makes string empty. 04037 * 04038 * a = "abcde" 04039 * a.clear #=> "" 04040 */ 04041 04042 static VALUE 04043 rb_str_clear(VALUE str) 04044 { 04045 str_discard(str); 04046 STR_SET_EMBED(str); 04047 STR_SET_EMBED_LEN(str, 0); 04048 RSTRING_PTR(str)[0] = 0; 04049 if (rb_enc_asciicompat(STR_ENC_GET(str))) 04050 ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT); 04051 else 04052 ENC_CODERANGE_SET(str, ENC_CODERANGE_VALID); 04053 return str; 04054 } 04055 04056 /* 04057 * call-seq: 04058 * string.chr -> string 04059 * 04060 * Returns a one-character string at the beginning of the string. 04061 * 04062 * a = "abcde" 04063 * a.chr #=> "a" 04064 */ 04065 04066 static VALUE 04067 rb_str_chr(VALUE str) 04068 { 04069 return rb_str_substr(str, 0, 1); 04070 } 04071 04072 /* 04073 * call-seq: 04074 * str.getbyte(index) -> 0 .. 255 04075 * 04076 * returns the <i>index</i>th byte as an integer. 
04077 */ 04078 static VALUE 04079 rb_str_getbyte(VALUE str, VALUE index) 04080 { 04081 long pos = NUM2LONG(index); 04082 04083 if (pos < 0) 04084 pos += RSTRING_LEN(str); 04085 if (pos < 0 || RSTRING_LEN(str) <= pos) 04086 return Qnil; 04087 04088 return INT2FIX((unsigned char)RSTRING_PTR(str)[pos]); 04089 } 04090 04091 /* 04092 * call-seq: 04093 * str.setbyte(index, integer) -> integer 04094 * 04095 * modifies the <i>index</i>th byte as <i>integer</i>. 04096 */ 04097 static VALUE 04098 rb_str_setbyte(VALUE str, VALUE index, VALUE value) 04099 { 04100 long pos = NUM2LONG(index); 04101 int byte = NUM2INT(value); 04102 04103 rb_str_modify(str); 04104 04105 if (pos < -RSTRING_LEN(str) || RSTRING_LEN(str) <= pos) 04106 rb_raise(rb_eIndexError, "index %ld out of string", pos); 04107 if (pos < 0) 04108 pos += RSTRING_LEN(str); 04109 04110 RSTRING_PTR(str)[pos] = byte; 04111 04112 return value; 04113 } 04114 04115 static VALUE 04116 str_byte_substr(VALUE str, long beg, long len) 04117 { 04118 char *p, *s = RSTRING_PTR(str); 04119 long n = RSTRING_LEN(str); 04120 VALUE str2; 04121 04122 if (beg > n || len < 0) return Qnil; 04123 if (beg < 0) { 04124 beg += n; 04125 if (beg < 0) return Qnil; 04126 } 04127 if (beg + len > n) 04128 len = n - beg; 04129 if (len <= 0) { 04130 len = 0; 04131 p = 0; 04132 } 04133 else 04134 p = s + beg; 04135 04136 if (len > RSTRING_EMBED_LEN_MAX && beg + len == n) { 04137 str2 = rb_str_new4(str); 04138 str2 = str_new3(rb_obj_class(str2), str2); 04139 RSTRING(str2)->as.heap.ptr += RSTRING(str2)->as.heap.len - len; 04140 RSTRING(str2)->as.heap.len = len; 04141 } 04142 else { 04143 str2 = rb_str_new5(str, p, len); 04144 } 04145 04146 str_enc_copy(str2, str); 04147 04148 if (RSTRING_LEN(str2) == 0) { 04149 if (!rb_enc_asciicompat(STR_ENC_GET(str))) 04150 ENC_CODERANGE_SET(str2, ENC_CODERANGE_VALID); 04151 else 04152 ENC_CODERANGE_SET(str2, ENC_CODERANGE_7BIT); 04153 } 04154 else { 04155 switch (ENC_CODERANGE(str)) { 04156 case ENC_CODERANGE_7BIT: 
04157 ENC_CODERANGE_SET(str2, ENC_CODERANGE_7BIT); 04158 break; 04159 default: 04160 ENC_CODERANGE_SET(str2, ENC_CODERANGE_UNKNOWN); 04161 break; 04162 } 04163 } 04164 04165 OBJ_INFECT(str2, str); 04166 04167 return str2; 04168 } 04169 04170 static VALUE 04171 str_byte_aref(VALUE str, VALUE indx) 04172 { 04173 long idx; 04174 switch (TYPE(indx)) { 04175 case T_FIXNUM: 04176 idx = FIX2LONG(indx); 04177 04178 num_index: 04179 str = str_byte_substr(str, idx, 1); 04180 if (NIL_P(str) || RSTRING_LEN(str) == 0) return Qnil; 04181 return str; 04182 04183 default: 04184 /* check if indx is Range */ 04185 { 04186 long beg, len = RSTRING_LEN(str); 04187 04188 switch (rb_range_beg_len(indx, &beg, &len, len, 0)) { 04189 case Qfalse: 04190 break; 04191 case Qnil: 04192 return Qnil; 04193 default: 04194 return str_byte_substr(str, beg, len); 04195 } 04196 } 04197 idx = NUM2LONG(indx); 04198 goto num_index; 04199 } 04200 04201 UNREACHABLE; 04202 } 04203 04204 /* 04205 * call-seq: 04206 * str.byteslice(fixnum) -> new_str or nil 04207 * str.byteslice(fixnum, fixnum) -> new_str or nil 04208 * str.byteslice(range) -> new_str or nil 04209 * 04210 * Byte Reference---If passed a single <code>Fixnum</code>, returns a 04211 * substring of one byte at that position. If passed two <code>Fixnum</code> 04212 * objects, returns a substring starting at the offset given by the first, and 04213 * a length given by the second. If given a <code>Range</code>, a substring containing 04214 * bytes at offsets given by the range is returned. In all three cases, if 04215 * an offset is negative, it is counted from the end of <i>str</i>. Returns 04216 * <code>nil</code> if the initial offset falls outside the string, the length 04217 * is negative, or the beginning of the range is greater than the end. 04218 * The encoding of the resulted string keeps original encoding. 
04219 * 04220 * "hello".byteslice(1) #=> "e" 04221 * "hello".byteslice(-1) #=> "o" 04222 * "hello".byteslice(1, 2) #=> "el" 04223 * "\x80\u3042".byteslice(1, 3) #=> "\u3042" 04224 * "\x03\u3042\xff".byteslice(1..3) #=> "\u3042" 04225 */ 04226 04227 static VALUE 04228 rb_str_byteslice(int argc, VALUE *argv, VALUE str) 04229 { 04230 if (argc == 2) { 04231 return str_byte_substr(str, NUM2LONG(argv[0]), NUM2LONG(argv[1])); 04232 } 04233 rb_check_arity(argc, 1, 2); 04234 return str_byte_aref(str, argv[0]); 04235 } 04236 04237 /* 04238 * call-seq: 04239 * str.reverse -> new_str 04240 * 04241 * Returns a new string with the characters from <i>str</i> in reverse order. 04242 * 04243 * "stressed".reverse #=> "desserts" 04244 */ 04245 04246 static VALUE 04247 rb_str_reverse(VALUE str) 04248 { 04249 rb_encoding *enc; 04250 VALUE rev; 04251 char *s, *e, *p; 04252 int single = 1; 04253 04254 if (RSTRING_LEN(str) <= 1) return rb_str_dup(str); 04255 enc = STR_ENC_GET(str); 04256 rev = rb_str_new5(str, 0, RSTRING_LEN(str)); 04257 s = RSTRING_PTR(str); e = RSTRING_END(str); 04258 p = RSTRING_END(rev); 04259 04260 if (RSTRING_LEN(str) > 1) { 04261 if (single_byte_optimizable(str)) { 04262 while (s < e) { 04263 *--p = *s++; 04264 } 04265 } 04266 else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID) { 04267 while (s < e) { 04268 int clen = rb_enc_fast_mbclen(s, e, enc); 04269 04270 if (clen > 1 || (*s & 0x80)) single = 0; 04271 p -= clen; 04272 memcpy(p, s, clen); 04273 s += clen; 04274 } 04275 } 04276 else { 04277 while (s < e) { 04278 int clen = rb_enc_mbclen(s, e, enc); 04279 04280 if (clen > 1 || (*s & 0x80)) single = 0; 04281 p -= clen; 04282 memcpy(p, s, clen); 04283 s += clen; 04284 } 04285 } 04286 } 04287 STR_SET_LEN(rev, RSTRING_LEN(str)); 04288 OBJ_INFECT(rev, str); 04289 if (ENC_CODERANGE(str) == ENC_CODERANGE_UNKNOWN) { 04290 if (single) { 04291 ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT); 04292 } 04293 else { 04294 ENC_CODERANGE_SET(str, ENC_CODERANGE_VALID); 04295 } 04296 } 
04297 rb_enc_cr_str_copy_for_substr(rev, str); 04298 04299 return rev; 04300 } 04301 04302 04303 /* 04304 * call-seq: 04305 * str.reverse! -> str 04306 * 04307 * Reverses <i>str</i> in place. 04308 */ 04309 04310 static VALUE 04311 rb_str_reverse_bang(VALUE str) 04312 { 04313 if (RSTRING_LEN(str) > 1) { 04314 if (single_byte_optimizable(str)) { 04315 char *s, *e, c; 04316 04317 str_modify_keep_cr(str); 04318 s = RSTRING_PTR(str); 04319 e = RSTRING_END(str) - 1; 04320 while (s < e) { 04321 c = *s; 04322 *s++ = *e; 04323 *e-- = c; 04324 } 04325 } 04326 else { 04327 rb_str_shared_replace(str, rb_str_reverse(str)); 04328 } 04329 } 04330 else { 04331 str_modify_keep_cr(str); 04332 } 04333 return str; 04334 } 04335 04336 04337 /* 04338 * call-seq: 04339 * str.include? other_str -> true or false 04340 * 04341 * Returns <code>true</code> if <i>str</i> contains the given string or 04342 * character. 04343 * 04344 * "hello".include? "lo" #=> true 04345 * "hello".include? "ol" #=> false 04346 * "hello".include? ?h #=> true 04347 */ 04348 04349 static VALUE 04350 rb_str_include(VALUE str, VALUE arg) 04351 { 04352 long i; 04353 04354 StringValue(arg); 04355 i = rb_str_index(str, arg, 0); 04356 04357 if (i == -1) return Qfalse; 04358 return Qtrue; 04359 } 04360 04361 04362 /* 04363 * call-seq: 04364 * str.to_i(base=10) -> integer 04365 * 04366 * Returns the result of interpreting leading characters in <i>str</i> as an 04367 * integer base <i>base</i> (between 2 and 36). Extraneous characters past the 04368 * end of a valid number are ignored. If there is not a valid number at the 04369 * start of <i>str</i>, <code>0</code> is returned. This method never raises an 04370 * exception when <i>base</i> is valid. 
04371 * 04372 * "12345".to_i #=> 12345 04373 * "99 red balloons".to_i #=> 99 04374 * "0a".to_i #=> 0 04375 * "0a".to_i(16) #=> 10 04376 * "hello".to_i #=> 0 04377 * "1100101".to_i(2) #=> 101 04378 * "1100101".to_i(8) #=> 294977 04379 * "1100101".to_i(10) #=> 1100101 04380 * "1100101".to_i(16) #=> 17826049 04381 */ 04382 04383 static VALUE 04384 rb_str_to_i(int argc, VALUE *argv, VALUE str) 04385 { 04386 int base; 04387 04388 if (argc == 0) base = 10; 04389 else { 04390 VALUE b; 04391 04392 rb_scan_args(argc, argv, "01", &b); 04393 base = NUM2INT(b); 04394 } 04395 if (base < 0) { 04396 rb_raise(rb_eArgError, "invalid radix %d", base); 04397 } 04398 return rb_str_to_inum(str, base, FALSE); 04399 } 04400 04401 04402 /* 04403 * call-seq: 04404 * str.to_f -> float 04405 * 04406 * Returns the result of interpreting leading characters in <i>str</i> as a 04407 * floating point number. Extraneous characters past the end of a valid number 04408 * are ignored. If there is not a valid number at the start of <i>str</i>, 04409 * <code>0.0</code> is returned. This method never raises an exception. 04410 * 04411 * "123.45e1".to_f #=> 1234.5 04412 * "45.67 degrees".to_f #=> 45.67 04413 * "thx1138".to_f #=> 0.0 04414 */ 04415 04416 static VALUE 04417 rb_str_to_f(VALUE str) 04418 { 04419 return DBL2NUM(rb_str_to_dbl(str, FALSE)); 04420 } 04421 04422 04423 /* 04424 * call-seq: 04425 * str.to_s -> str 04426 * str.to_str -> str 04427 * 04428 * Returns the receiver. 
04429 */ 04430 04431 static VALUE 04432 rb_str_to_s(VALUE str) 04433 { 04434 if (rb_obj_class(str) != rb_cString) { 04435 return str_duplicate(rb_cString, str); 04436 } 04437 return str; 04438 } 04439 04440 #if 0 04441 static void 04442 str_cat_char(VALUE str, unsigned int c, rb_encoding *enc) 04443 { 04444 char s[RUBY_MAX_CHAR_LEN]; 04445 int n = rb_enc_codelen(c, enc); 04446 04447 rb_enc_mbcput(c, s, enc); 04448 rb_enc_str_buf_cat(str, s, n, enc); 04449 } 04450 #endif 04451 04452 #define CHAR_ESC_LEN 13 /* sizeof(\x{ hex of 32bit unsigned int } \0) */ 04453 04454 int 04455 rb_str_buf_cat_escaped_char(VALUE result, unsigned int c, int unicode_p) 04456 { 04457 char buf[CHAR_ESC_LEN + 1]; 04458 int l; 04459 04460 #if SIZEOF_INT > 4 04461 c &= 0xffffffff; 04462 #endif 04463 if (unicode_p) { 04464 if (c < 0x7F && ISPRINT(c)) { 04465 snprintf(buf, CHAR_ESC_LEN, "%c", c); 04466 } 04467 else if (c < 0x10000) { 04468 snprintf(buf, CHAR_ESC_LEN, "\\u%04X", c); 04469 } 04470 else { 04471 snprintf(buf, CHAR_ESC_LEN, "\\u{%X}", c); 04472 } 04473 } 04474 else { 04475 if (c < 0x100) { 04476 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", c); 04477 } 04478 else { 04479 snprintf(buf, CHAR_ESC_LEN, "\\x{%X}", c); 04480 } 04481 } 04482 l = (int)strlen(buf); /* CHAR_ESC_LEN cannot exceed INT_MAX */ 04483 rb_str_buf_cat(result, buf, l); 04484 return l; 04485 } 04486 04487 /* 04488 * call-seq: 04489 * str.inspect -> string 04490 * 04491 * Returns a printable version of _str_, surrounded by quote marks, 04492 * with special characters escaped. 
04493 * 04494 * str = "hello" 04495 * str[3] = "\b" 04496 * str.inspect #=> "\"hel\\bo\"" 04497 */ 04498 04499 VALUE 04500 rb_str_inspect(VALUE str) 04501 { 04502 rb_encoding *enc = STR_ENC_GET(str); 04503 const char *p, *pend, *prev; 04504 char buf[CHAR_ESC_LEN + 1]; 04505 VALUE result = rb_str_buf_new(0); 04506 rb_encoding *resenc = rb_default_internal_encoding(); 04507 int unicode_p = rb_enc_unicode_p(enc); 04508 int asciicompat = rb_enc_asciicompat(enc); 04509 static rb_encoding *utf16, *utf32; 04510 04511 if (!utf16) utf16 = rb_enc_find("UTF-16"); 04512 if (!utf32) utf32 = rb_enc_find("UTF-32"); 04513 if (resenc == NULL) resenc = rb_default_external_encoding(); 04514 if (!rb_enc_asciicompat(resenc)) resenc = rb_usascii_encoding(); 04515 rb_enc_associate(result, resenc); 04516 str_buf_cat2(result, "\""); 04517 04518 p = RSTRING_PTR(str); pend = RSTRING_END(str); 04519 prev = p; 04520 if (enc == utf16) { 04521 const unsigned char *q = (const unsigned char *)p; 04522 if (q[0] == 0xFE && q[1] == 0xFF) 04523 enc = rb_enc_find("UTF-16BE"); 04524 else if (q[0] == 0xFF && q[1] == 0xFE) 04525 enc = rb_enc_find("UTF-16LE"); 04526 else 04527 unicode_p = 0; 04528 } 04529 else if (enc == utf32) { 04530 const unsigned char *q = (const unsigned char *)p; 04531 if (q[0] == 0 && q[1] == 0 && q[2] == 0xFE && q[3] == 0xFF) 04532 enc = rb_enc_find("UTF-32BE"); 04533 else if (q[3] == 0 && q[2] == 0 && q[1] == 0xFE && q[0] == 0xFF) 04534 enc = rb_enc_find("UTF-32LE"); 04535 else 04536 unicode_p = 0; 04537 } 04538 while (p < pend) { 04539 unsigned int c, cc; 04540 int n; 04541 04542 n = rb_enc_precise_mbclen(p, pend, enc); 04543 if (!MBCLEN_CHARFOUND_P(n)) { 04544 if (p > prev) str_buf_cat(result, prev, p - prev); 04545 n = rb_enc_mbminlen(enc); 04546 if (pend < p + n) 04547 n = (int)(pend - p); 04548 while (n--) { 04549 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377); 04550 str_buf_cat(result, buf, strlen(buf)); 04551 prev = ++p; 04552 } 04553 continue; 04554 } 04555 n = 
MBCLEN_CHARFOUND_LEN(n); 04556 c = rb_enc_mbc_to_codepoint(p, pend, enc); 04557 p += n; 04558 if ((asciicompat || unicode_p) && 04559 (c == '"'|| c == '\\' || 04560 (c == '#' && 04561 p < pend && 04562 MBCLEN_CHARFOUND_P(rb_enc_precise_mbclen(p,pend,enc)) && 04563 (cc = rb_enc_codepoint(p,pend,enc), 04564 (cc == '$' || cc == '@' || cc == '{'))))) { 04565 if (p - n > prev) str_buf_cat(result, prev, p - n - prev); 04566 str_buf_cat2(result, "\\"); 04567 if (asciicompat || enc == resenc) { 04568 prev = p - n; 04569 continue; 04570 } 04571 } 04572 switch (c) { 04573 case '\n': cc = 'n'; break; 04574 case '\r': cc = 'r'; break; 04575 case '\t': cc = 't'; break; 04576 case '\f': cc = 'f'; break; 04577 case '\013': cc = 'v'; break; 04578 case '\010': cc = 'b'; break; 04579 case '\007': cc = 'a'; break; 04580 case 033: cc = 'e'; break; 04581 default: cc = 0; break; 04582 } 04583 if (cc) { 04584 if (p - n > prev) str_buf_cat(result, prev, p - n - prev); 04585 buf[0] = '\\'; 04586 buf[1] = (char)cc; 04587 str_buf_cat(result, buf, 2); 04588 prev = p; 04589 continue; 04590 } 04591 if ((enc == resenc && rb_enc_isprint(c, enc)) || 04592 (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c))) { 04593 continue; 04594 } 04595 else { 04596 if (p - n > prev) str_buf_cat(result, prev, p - n - prev); 04597 rb_str_buf_cat_escaped_char(result, c, unicode_p); 04598 prev = p; 04599 continue; 04600 } 04601 } 04602 if (p > prev) str_buf_cat(result, prev, p - prev); 04603 str_buf_cat2(result, "\""); 04604 04605 OBJ_INFECT(result, str); 04606 return result; 04607 } 04608 04609 #define IS_EVSTR(p,e) ((p) < (e) && (*(p) == '$' || *(p) == '@' || *(p) == '{')) 04610 04611 /* 04612 * call-seq: 04613 * str.dump -> new_str 04614 * 04615 * Produces a version of +str+ with all non-printing characters replaced by 04616 * <code>\nnn</code> notation and all special characters escaped. 
04617 * 04618 * "hello \n ''".dump #=> "\"hello \\n ''\" 04619 */ 04620 04621 VALUE 04622 rb_str_dump(VALUE str) 04623 { 04624 rb_encoding *enc = rb_enc_get(str); 04625 long len; 04626 const char *p, *pend; 04627 char *q, *qend; 04628 VALUE result; 04629 int u8 = (enc == rb_utf8_encoding()); 04630 04631 len = 2; /* "" */ 04632 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str); 04633 while (p < pend) { 04634 unsigned char c = *p++; 04635 switch (c) { 04636 case '"': case '\\': 04637 case '\n': case '\r': 04638 case '\t': case '\f': 04639 case '\013': case '\010': case '\007': case '\033': 04640 len += 2; 04641 break; 04642 04643 case '#': 04644 len += IS_EVSTR(p, pend) ? 2 : 1; 04645 break; 04646 04647 default: 04648 if (ISPRINT(c)) { 04649 len++; 04650 } 04651 else { 04652 if (u8) { /* \u{NN} */ 04653 int n = rb_enc_precise_mbclen(p-1, pend, enc); 04654 if (MBCLEN_CHARFOUND_P(n-1)) { 04655 unsigned int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc); 04656 while (cc >>= 4) len++; 04657 len += 5; 04658 p += MBCLEN_CHARFOUND_LEN(n)-1; 04659 break; 04660 } 04661 } 04662 len += 4; /* \xNN */ 04663 } 04664 break; 04665 } 04666 } 04667 if (!rb_enc_asciicompat(enc)) { 04668 len += 19; /* ".force_encoding('')" */ 04669 len += strlen(enc->name); 04670 } 04671 04672 result = rb_str_new5(str, 0, len); 04673 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str); 04674 q = RSTRING_PTR(result); qend = q + len + 1; 04675 04676 *q++ = '"'; 04677 while (p < pend) { 04678 unsigned char c = *p++; 04679 04680 if (c == '"' || c == '\\') { 04681 *q++ = '\\'; 04682 *q++ = c; 04683 } 04684 else if (c == '#') { 04685 if (IS_EVSTR(p, pend)) *q++ = '\\'; 04686 *q++ = '#'; 04687 } 04688 else if (c == '\n') { 04689 *q++ = '\\'; 04690 *q++ = 'n'; 04691 } 04692 else if (c == '\r') { 04693 *q++ = '\\'; 04694 *q++ = 'r'; 04695 } 04696 else if (c == '\t') { 04697 *q++ = '\\'; 04698 *q++ = 't'; 04699 } 04700 else if (c == '\f') { 04701 *q++ = '\\'; 04702 *q++ = 'f'; 04703 } 04704 else if (c == '\013') { 
04705 *q++ = '\\'; 04706 *q++ = 'v'; 04707 } 04708 else if (c == '\010') { 04709 *q++ = '\\'; 04710 *q++ = 'b'; 04711 } 04712 else if (c == '\007') { 04713 *q++ = '\\'; 04714 *q++ = 'a'; 04715 } 04716 else if (c == '\033') { 04717 *q++ = '\\'; 04718 *q++ = 'e'; 04719 } 04720 else if (ISPRINT(c)) { 04721 *q++ = c; 04722 } 04723 else { 04724 *q++ = '\\'; 04725 if (u8) { 04726 int n = rb_enc_precise_mbclen(p-1, pend, enc) - 1; 04727 if (MBCLEN_CHARFOUND_P(n)) { 04728 int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc); 04729 p += n; 04730 snprintf(q, qend-q, "u{%x}", cc); 04731 q += strlen(q); 04732 continue; 04733 } 04734 } 04735 snprintf(q, qend-q, "x%02X", c); 04736 q += 3; 04737 } 04738 } 04739 *q++ = '"'; 04740 *q = '\0'; 04741 if (!rb_enc_asciicompat(enc)) { 04742 snprintf(q, qend-q, ".force_encoding(\"%s\")", enc->name); 04743 enc = rb_ascii8bit_encoding(); 04744 } 04745 OBJ_INFECT(result, str); 04746 /* result from dump is ASCII */ 04747 rb_enc_associate(result, enc); 04748 ENC_CODERANGE_SET(result, ENC_CODERANGE_7BIT); 04749 return result; 04750 } 04751 04752 04753 static void 04754 rb_str_check_dummy_enc(rb_encoding *enc) 04755 { 04756 if (rb_enc_dummy_p(enc)) { 04757 rb_raise(rb_eEncCompatError, "incompatible encoding with this operation: %s", 04758 rb_enc_name(enc)); 04759 } 04760 } 04761 04762 /* 04763 * call-seq: 04764 * str.upcase! -> str or nil 04765 * 04766 * Upcases the contents of <i>str</i>, returning <code>nil</code> if no changes 04767 * were made. 04768 * Note: case replacement is effective only in ASCII region. 
04769 */ 04770 04771 static VALUE 04772 rb_str_upcase_bang(VALUE str) 04773 { 04774 rb_encoding *enc; 04775 char *s, *send; 04776 int modify = 0; 04777 int n; 04778 04779 str_modify_keep_cr(str); 04780 enc = STR_ENC_GET(str); 04781 rb_str_check_dummy_enc(enc); 04782 s = RSTRING_PTR(str); send = RSTRING_END(str); 04783 if (single_byte_optimizable(str)) { 04784 while (s < send) { 04785 unsigned int c = *(unsigned char*)s; 04786 04787 if (rb_enc_isascii(c, enc) && 'a' <= c && c <= 'z') { 04788 *s = 'A' + (c - 'a'); 04789 modify = 1; 04790 } 04791 s++; 04792 } 04793 } 04794 else { 04795 int ascompat = rb_enc_asciicompat(enc); 04796 04797 while (s < send) { 04798 unsigned int c; 04799 04800 if (ascompat && (c = *(unsigned char*)s) < 0x80) { 04801 if (rb_enc_isascii(c, enc) && 'a' <= c && c <= 'z') { 04802 *s = 'A' + (c - 'a'); 04803 modify = 1; 04804 } 04805 s++; 04806 } 04807 else { 04808 c = rb_enc_codepoint_len(s, send, &n, enc); 04809 if (rb_enc_islower(c, enc)) { 04810 /* assuming toupper returns codepoint with same size */ 04811 rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc); 04812 modify = 1; 04813 } 04814 s += n; 04815 } 04816 } 04817 } 04818 04819 if (modify) return str; 04820 return Qnil; 04821 } 04822 04823 04824 /* 04825 * call-seq: 04826 * str.upcase -> new_str 04827 * 04828 * Returns a copy of <i>str</i> with all lowercase letters replaced with their 04829 * uppercase counterparts. The operation is locale insensitive---only 04830 * characters ``a'' to ``z'' are affected. 04831 * Note: case replacement is effective only in ASCII region. 04832 * 04833 * "hEllO".upcase #=> "HELLO" 04834 */ 04835 04836 static VALUE 04837 rb_str_upcase(VALUE str) 04838 { 04839 str = rb_str_dup(str); 04840 rb_str_upcase_bang(str); 04841 return str; 04842 } 04843 04844 04845 /* 04846 * call-seq: 04847 * str.downcase! -> str or nil 04848 * 04849 * Downcases the contents of <i>str</i>, returning <code>nil</code> if no 04850 * changes were made. 
04851 * Note: case replacement is effective only in ASCII region. 04852 */ 04853 04854 static VALUE 04855 rb_str_downcase_bang(VALUE str) 04856 { 04857 rb_encoding *enc; 04858 char *s, *send; 04859 int modify = 0; 04860 04861 str_modify_keep_cr(str); 04862 enc = STR_ENC_GET(str); 04863 rb_str_check_dummy_enc(enc); 04864 s = RSTRING_PTR(str); send = RSTRING_END(str); 04865 if (single_byte_optimizable(str)) { 04866 while (s < send) { 04867 unsigned int c = *(unsigned char*)s; 04868 04869 if (rb_enc_isascii(c, enc) && 'A' <= c && c <= 'Z') { 04870 *s = 'a' + (c - 'A'); 04871 modify = 1; 04872 } 04873 s++; 04874 } 04875 } 04876 else { 04877 int ascompat = rb_enc_asciicompat(enc); 04878 04879 while (s < send) { 04880 unsigned int c; 04881 int n; 04882 04883 if (ascompat && (c = *(unsigned char*)s) < 0x80) { 04884 if (rb_enc_isascii(c, enc) && 'A' <= c && c <= 'Z') { 04885 *s = 'a' + (c - 'A'); 04886 modify = 1; 04887 } 04888 s++; 04889 } 04890 else { 04891 c = rb_enc_codepoint_len(s, send, &n, enc); 04892 if (rb_enc_isupper(c, enc)) { 04893 /* assuming toupper returns codepoint with same size */ 04894 rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc); 04895 modify = 1; 04896 } 04897 s += n; 04898 } 04899 } 04900 } 04901 04902 if (modify) return str; 04903 return Qnil; 04904 } 04905 04906 04907 /* 04908 * call-seq: 04909 * str.downcase -> new_str 04910 * 04911 * Returns a copy of <i>str</i> with all uppercase letters replaced with their 04912 * lowercase counterparts. The operation is locale insensitive---only 04913 * characters ``A'' to ``Z'' are affected. 04914 * Note: case replacement is effective only in ASCII region. 04915 * 04916 * "hEllO".downcase #=> "hello" 04917 */ 04918 04919 static VALUE 04920 rb_str_downcase(VALUE str) 04921 { 04922 str = rb_str_dup(str); 04923 rb_str_downcase_bang(str); 04924 return str; 04925 } 04926 04927 04928 /* 04929 * call-seq: 04930 * str.capitalize! 
-> str or nil 04931 * 04932 * Modifies <i>str</i> by converting the first character to uppercase and the 04933 * remainder to lowercase. Returns <code>nil</code> if no changes are made. 04934 * Note: case conversion is effective only in ASCII region. 04935 * 04936 * a = "hello" 04937 * a.capitalize! #=> "Hello" 04938 * a #=> "Hello" 04939 * a.capitalize! #=> nil 04940 */ 04941 04942 static VALUE 04943 rb_str_capitalize_bang(VALUE str) 04944 { 04945 rb_encoding *enc; 04946 char *s, *send; 04947 int modify = 0; 04948 unsigned int c; 04949 int n; 04950 04951 str_modify_keep_cr(str); 04952 enc = STR_ENC_GET(str); 04953 rb_str_check_dummy_enc(enc); 04954 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil; 04955 s = RSTRING_PTR(str); send = RSTRING_END(str); 04956 04957 c = rb_enc_codepoint_len(s, send, &n, enc); 04958 if (rb_enc_islower(c, enc)) { 04959 rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc); 04960 modify = 1; 04961 } 04962 s += n; 04963 while (s < send) { 04964 c = rb_enc_codepoint_len(s, send, &n, enc); 04965 if (rb_enc_isupper(c, enc)) { 04966 rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc); 04967 modify = 1; 04968 } 04969 s += n; 04970 } 04971 04972 if (modify) return str; 04973 return Qnil; 04974 } 04975 04976 04977 /* 04978 * call-seq: 04979 * str.capitalize -> new_str 04980 * 04981 * Returns a copy of <i>str</i> with the first character converted to uppercase 04982 * and the remainder to lowercase. 04983 * Note: case conversion is effective only in ASCII region. 04984 * 04985 * "hello".capitalize #=> "Hello" 04986 * "HELLO".capitalize #=> "Hello" 04987 * "123ABC".capitalize #=> "123abc" 04988 */ 04989 04990 static VALUE 04991 rb_str_capitalize(VALUE str) 04992 { 04993 str = rb_str_dup(str); 04994 rb_str_capitalize_bang(str); 04995 return str; 04996 } 04997 04998 04999 /* 05000 * call-seq: 05001 * str.swapcase! 
-> str or nil 05002 * 05003 * Equivalent to <code>String#swapcase</code>, but modifies the receiver in 05004 * place, returning <i>str</i>, or <code>nil</code> if no changes were made. 05005 * Note: case conversion is effective only in ASCII region. 05006 */ 05007 05008 static VALUE 05009 rb_str_swapcase_bang(VALUE str) 05010 { 05011 rb_encoding *enc; 05012 char *s, *send; 05013 int modify = 0; 05014 int n; 05015 05016 str_modify_keep_cr(str); 05017 enc = STR_ENC_GET(str); 05018 rb_str_check_dummy_enc(enc); 05019 s = RSTRING_PTR(str); send = RSTRING_END(str); 05020 while (s < send) { 05021 unsigned int c = rb_enc_codepoint_len(s, send, &n, enc); 05022 05023 if (rb_enc_isupper(c, enc)) { 05024 /* assuming toupper returns codepoint with same size */ 05025 rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc); 05026 modify = 1; 05027 } 05028 else if (rb_enc_islower(c, enc)) { 05029 /* assuming tolower returns codepoint with same size */ 05030 rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc); 05031 modify = 1; 05032 } 05033 s += n; 05034 } 05035 05036 if (modify) return str; 05037 return Qnil; 05038 } 05039 05040 05041 /* 05042 * call-seq: 05043 * str.swapcase -> new_str 05044 * 05045 * Returns a copy of <i>str</i> with uppercase alphabetic characters converted 05046 * to lowercase and lowercase characters converted to uppercase. 05047 * Note: case conversion is effective only in ASCII region. 
05048 * 05049 * "Hello".swapcase #=> "hELLO" 05050 * "cYbEr_PuNk11".swapcase #=> "CyBeR_pUnK11" 05051 */ 05052 05053 static VALUE 05054 rb_str_swapcase(VALUE str) 05055 { 05056 str = rb_str_dup(str); 05057 rb_str_swapcase_bang(str); 05058 return str; 05059 } 05060 05061 typedef unsigned char *USTR; 05062 05063 struct tr { 05064 int gen; 05065 unsigned int now, max; 05066 char *p, *pend; 05067 }; 05068 05069 static unsigned int 05070 trnext(struct tr *t, rb_encoding *enc) 05071 { 05072 int n; 05073 05074 for (;;) { 05075 if (!t->gen) { 05076 nextpart: 05077 if (t->p == t->pend) return -1; 05078 if (rb_enc_ascget(t->p, t->pend, &n, enc) == '\\' && t->p + n < t->pend) { 05079 t->p += n; 05080 } 05081 t->now = rb_enc_codepoint_len(t->p, t->pend, &n, enc); 05082 t->p += n; 05083 if (rb_enc_ascget(t->p, t->pend, &n, enc) == '-' && t->p + n < t->pend) { 05084 t->p += n; 05085 if (t->p < t->pend) { 05086 unsigned int c = rb_enc_codepoint_len(t->p, t->pend, &n, enc); 05087 t->p += n; 05088 if (t->now > c) { 05089 if (t->now < 0x80 && c < 0x80) { 05090 rb_raise(rb_eArgError, 05091 "invalid range \"%c-%c\" in string transliteration", 05092 t->now, c); 05093 } 05094 else { 05095 rb_raise(rb_eArgError, "invalid range in string transliteration"); 05096 } 05097 continue; /* not reached */ 05098 } 05099 t->gen = 1; 05100 t->max = c; 05101 } 05102 } 05103 return t->now; 05104 } 05105 else { 05106 while (ONIGENC_CODE_TO_MBCLEN(enc, ++t->now) <= 0) { 05107 if (t->now == t->max) { 05108 t->gen = 0; 05109 goto nextpart; 05110 } 05111 } 05112 if (t->now < t->max) { 05113 return t->now; 05114 } 05115 else { 05116 t->gen = 0; 05117 return t->max; 05118 } 05119 } 05120 } 05121 } 05122 05123 static VALUE rb_str_delete_bang(int,VALUE*,VALUE); 05124 05125 static VALUE 05126 tr_trans(VALUE str, VALUE src, VALUE repl, int sflag) 05127 { 05128 const unsigned int errc = -1; 05129 unsigned int trans[256]; 05130 rb_encoding *enc, *e1, *e2; 05131 struct tr trsrc, trrepl; 05132 int cflag = 0; 05133 
unsigned int c, c0, last = 0; 05134 int modify = 0, i, l; 05135 char *s, *send; 05136 VALUE hash = 0; 05137 int singlebyte = single_byte_optimizable(str); 05138 int cr; 05139 05140 #define CHECK_IF_ASCII(c) \ 05141 (void)((cr == ENC_CODERANGE_7BIT && !rb_isascii(c)) ? \ 05142 (cr = ENC_CODERANGE_VALID) : 0) 05143 05144 StringValue(src); 05145 StringValue(repl); 05146 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil; 05147 if (RSTRING_LEN(repl) == 0) { 05148 return rb_str_delete_bang(1, &src, str); 05149 } 05150 05151 cr = ENC_CODERANGE(str); 05152 e1 = rb_enc_check(str, src); 05153 e2 = rb_enc_check(str, repl); 05154 if (e1 == e2) { 05155 enc = e1; 05156 } 05157 else { 05158 enc = rb_enc_check(src, repl); 05159 } 05160 trsrc.p = RSTRING_PTR(src); trsrc.pend = trsrc.p + RSTRING_LEN(src); 05161 if (RSTRING_LEN(src) > 1 && 05162 rb_enc_ascget(trsrc.p, trsrc.pend, &l, enc) == '^' && 05163 trsrc.p + l < trsrc.pend) { 05164 cflag = 1; 05165 trsrc.p += l; 05166 } 05167 trrepl.p = RSTRING_PTR(repl); 05168 trrepl.pend = trrepl.p + RSTRING_LEN(repl); 05169 trsrc.gen = trrepl.gen = 0; 05170 trsrc.now = trrepl.now = 0; 05171 trsrc.max = trrepl.max = 0; 05172 05173 if (cflag) { 05174 for (i=0; i<256; i++) { 05175 trans[i] = 1; 05176 } 05177 while ((c = trnext(&trsrc, enc)) != errc) { 05178 if (c < 256) { 05179 trans[c] = errc; 05180 } 05181 else { 05182 if (!hash) hash = rb_hash_new(); 05183 rb_hash_aset(hash, UINT2NUM(c), Qtrue); 05184 } 05185 } 05186 while ((c = trnext(&trrepl, enc)) != errc) 05187 /* retrieve last replacer */; 05188 last = trrepl.now; 05189 for (i=0; i<256; i++) { 05190 if (trans[i] != errc) { 05191 trans[i] = last; 05192 } 05193 } 05194 } 05195 else { 05196 unsigned int r; 05197 05198 for (i=0; i<256; i++) { 05199 trans[i] = errc; 05200 } 05201 while ((c = trnext(&trsrc, enc)) != errc) { 05202 r = trnext(&trrepl, enc); 05203 if (r == errc) r = trrepl.now; 05204 if (c < 256) { 05205 trans[c] = r; 05206 if (rb_enc_codelen(r, enc) != 1) singlebyte = 
0; 05207 } 05208 else { 05209 if (!hash) hash = rb_hash_new(); 05210 rb_hash_aset(hash, UINT2NUM(c), UINT2NUM(r)); 05211 } 05212 } 05213 } 05214 05215 if (cr == ENC_CODERANGE_VALID) 05216 cr = ENC_CODERANGE_7BIT; 05217 str_modify_keep_cr(str); 05218 s = RSTRING_PTR(str); send = RSTRING_END(str); 05219 if (sflag) { 05220 int clen, tlen; 05221 long offset, max = RSTRING_LEN(str); 05222 unsigned int save = -1; 05223 char *buf = ALLOC_N(char, max), *t = buf; 05224 05225 while (s < send) { 05226 int may_modify = 0; 05227 05228 c0 = c = rb_enc_codepoint_len(s, send, &clen, e1); 05229 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc); 05230 05231 s += clen; 05232 if (c < 256) { 05233 c = trans[c]; 05234 } 05235 else if (hash) { 05236 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c)); 05237 if (NIL_P(tmp)) { 05238 if (cflag) c = last; 05239 else c = errc; 05240 } 05241 else if (cflag) c = errc; 05242 else c = NUM2INT(tmp); 05243 } 05244 else { 05245 c = errc; 05246 } 05247 if (c != (unsigned int)-1) { 05248 if (save == c) { 05249 CHECK_IF_ASCII(c); 05250 continue; 05251 } 05252 save = c; 05253 tlen = rb_enc_codelen(c, enc); 05254 modify = 1; 05255 } 05256 else { 05257 save = -1; 05258 c = c0; 05259 if (enc != e1) may_modify = 1; 05260 } 05261 while (t - buf + tlen >= max) { 05262 offset = t - buf; 05263 max *= 2; 05264 REALLOC_N(buf, char, max); 05265 t = buf + offset; 05266 } 05267 rb_enc_mbcput(c, t, enc); 05268 if (may_modify && memcmp(s, t, tlen) != 0) { 05269 modify = 1; 05270 } 05271 CHECK_IF_ASCII(c); 05272 t += tlen; 05273 } 05274 if (!STR_EMBED_P(str)) { 05275 xfree(RSTRING(str)->as.heap.ptr); 05276 } 05277 *t = '\0'; 05278 RSTRING(str)->as.heap.ptr = buf; 05279 RSTRING(str)->as.heap.len = t - buf; 05280 STR_SET_NOEMBED(str); 05281 RSTRING(str)->as.heap.aux.capa = max; 05282 } 05283 else if (rb_enc_mbmaxlen(enc) == 1 || (singlebyte && !hash)) { 05284 while (s < send) { 05285 c = (unsigned char)*s; 05286 if (trans[c] != errc) { 05287 if (!cflag) { 05288 c = trans[c]; 
05289 *s = c; 05290 modify = 1; 05291 } 05292 else { 05293 *s = last; 05294 modify = 1; 05295 } 05296 } 05297 CHECK_IF_ASCII(c); 05298 s++; 05299 } 05300 } 05301 else { 05302 int clen, tlen, max = (int)(RSTRING_LEN(str) * 1.2); 05303 long offset; 05304 char *buf = ALLOC_N(char, max), *t = buf; 05305 05306 while (s < send) { 05307 int may_modify = 0; 05308 c0 = c = rb_enc_codepoint_len(s, send, &clen, e1); 05309 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc); 05310 05311 if (c < 256) { 05312 c = trans[c]; 05313 } 05314 else if (hash) { 05315 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c)); 05316 if (NIL_P(tmp)) { 05317 if (cflag) c = last; 05318 else c = errc; 05319 } 05320 else if (cflag) c = errc; 05321 else c = NUM2INT(tmp); 05322 } 05323 else { 05324 c = cflag ? last : errc; 05325 } 05326 if (c != errc) { 05327 tlen = rb_enc_codelen(c, enc); 05328 modify = 1; 05329 } 05330 else { 05331 c = c0; 05332 if (enc != e1) may_modify = 1; 05333 } 05334 while (t - buf + tlen >= max) { 05335 offset = t - buf; 05336 max *= 2; 05337 REALLOC_N(buf, char, max); 05338 t = buf + offset; 05339 } 05340 if (s != t) { 05341 rb_enc_mbcput(c, t, enc); 05342 if (may_modify && memcmp(s, t, tlen) != 0) { 05343 modify = 1; 05344 } 05345 } 05346 CHECK_IF_ASCII(c); 05347 s += clen; 05348 t += tlen; 05349 } 05350 if (!STR_EMBED_P(str)) { 05351 xfree(RSTRING(str)->as.heap.ptr); 05352 } 05353 *t = '\0'; 05354 RSTRING(str)->as.heap.ptr = buf; 05355 RSTRING(str)->as.heap.len = t - buf; 05356 STR_SET_NOEMBED(str); 05357 RSTRING(str)->as.heap.aux.capa = max; 05358 } 05359 05360 if (modify) { 05361 if (cr != ENC_CODERANGE_BROKEN) 05362 ENC_CODERANGE_SET(str, cr); 05363 rb_enc_associate(str, enc); 05364 return str; 05365 } 05366 return Qnil; 05367 } 05368 05369 05370 /* 05371 * call-seq: 05372 * str.tr!(from_str, to_str) -> str or nil 05373 * 05374 * Translates <i>str</i> in place, using the same rules as 05375 * <code>String#tr</code>. 
Returns <i>str</i>, or <code>nil</code> if no 05376 * changes were made. 05377 */ 05378 05379 static VALUE 05380 rb_str_tr_bang(VALUE str, VALUE src, VALUE repl) 05381 { 05382 return tr_trans(str, src, repl, 0); 05383 } 05384 05385 05386 /* 05387 * call-seq: 05388 * str.tr(from_str, to_str) => new_str 05389 * 05390 * Returns a copy of +str+ with the characters in +from_str+ replaced by the 05391 * corresponding characters in +to_str+. If +to_str+ is shorter than 05392 * +from_str+, it is padded with its last character in order to maintain the 05393 * correspondence. 05394 * 05395 * "hello".tr('el', 'ip') #=> "hippo" 05396 * "hello".tr('aeiou', '*') #=> "h*ll*" 05397 * "hello".tr('aeiou', 'AA*') #=> "hAll*" 05398 * 05399 * Both strings may use the <code>c1-c2</code> notation to denote ranges of 05400 * characters, and +from_str+ may start with a <code>^</code>, which denotes 05401 * all characters except those listed. 05402 * 05403 * "hello".tr('a-y', 'b-z') #=> "ifmmp" 05404 * "hello".tr('^aeiou', '*') #=> "*e**o" 05405 * 05406 * The backslash character <code></code> can be used to escape 05407 * <code>^</code> or <code>-</code> and is otherwise ignored unless it 05408 * appears at the end of a range or the end of the +from_str+ or +to_str+: 05409 * 05410 * "hello^world".tr("\\^aeiou", "*") #=> "h*ll**w*rld" 05411 * "hello-world".tr("a\\-eo", "*") #=> "h*ll**w*rld" 05412 * 05413 * "hello\r\nworld".tr("\r", "") #=> "hello\nworld" 05414 * "hello\r\nworld".tr("\\r", "") #=> "hello\r\nwold" 05415 * "hello\r\nworld".tr("\\\r", "") #=> "hello\nworld" 05416 * 05417 * "X['\\b']".tr("X\\", "") #=> "['b']" 05418 * "X['\\b']".tr("X-\\]", "") #=> "'b'" 05419 */ 05420 05421 static VALUE 05422 rb_str_tr(VALUE str, VALUE src, VALUE repl) 05423 { 05424 str = rb_str_dup(str); 05425 tr_trans(str, src, repl, 0); 05426 return str; 05427 } 05428 05429 #define TR_TABLE_SIZE 257 05430 static void 05431 tr_setup_table(VALUE str, char stable[TR_TABLE_SIZE], int first, 05432 VALUE *tablep, 
VALUE *ctablep, rb_encoding *enc) 05433 { 05434 const unsigned int errc = -1; 05435 char buf[256]; 05436 struct tr tr; 05437 unsigned int c; 05438 VALUE table = 0, ptable = 0; 05439 int i, l, cflag = 0; 05440 05441 tr.p = RSTRING_PTR(str); tr.pend = tr.p + RSTRING_LEN(str); 05442 tr.gen = tr.now = tr.max = 0; 05443 05444 if (RSTRING_LEN(str) > 1 && rb_enc_ascget(tr.p, tr.pend, &l, enc) == '^') { 05445 cflag = 1; 05446 tr.p += l; 05447 } 05448 if (first) { 05449 for (i=0; i<256; i++) { 05450 stable[i] = 1; 05451 } 05452 stable[256] = cflag; 05453 } 05454 else if (stable[256] && !cflag) { 05455 stable[256] = 0; 05456 } 05457 for (i=0; i<256; i++) { 05458 buf[i] = cflag; 05459 } 05460 05461 while ((c = trnext(&tr, enc)) != errc) { 05462 if (c < 256) { 05463 buf[c & 0xff] = !cflag; 05464 } 05465 else { 05466 VALUE key = UINT2NUM(c); 05467 05468 if (!table && (first || *tablep || stable[256])) { 05469 if (cflag) { 05470 ptable = *ctablep; 05471 table = ptable ? ptable : rb_hash_new(); 05472 *ctablep = table; 05473 } 05474 else { 05475 table = rb_hash_new(); 05476 ptable = *tablep; 05477 *tablep = table; 05478 } 05479 } 05480 if (table && (!ptable || (cflag ^ !NIL_P(rb_hash_aref(ptable, key))))) { 05481 rb_hash_aset(table, key, Qtrue); 05482 } 05483 } 05484 } 05485 for (i=0; i<256; i++) { 05486 stable[i] = stable[i] && buf[i]; 05487 } 05488 if (!table && !cflag) { 05489 *tablep = 0; 05490 } 05491 } 05492 05493 05494 static int 05495 tr_find(unsigned int c, char table[TR_TABLE_SIZE], VALUE del, VALUE nodel) 05496 { 05497 if (c < 256) { 05498 return table[c] != 0; 05499 } 05500 else { 05501 VALUE v = UINT2NUM(c); 05502 05503 if (del) { 05504 if (!NIL_P(rb_hash_lookup(del, v)) && 05505 (!nodel || NIL_P(rb_hash_lookup(nodel, v)))) { 05506 return TRUE; 05507 } 05508 } 05509 else if (nodel && !NIL_P(rb_hash_lookup(nodel, v))) { 05510 return FALSE; 05511 } 05512 return table[256] ? 
TRUE : FALSE; 05513 } 05514 } 05515 05516 /* 05517 * call-seq: 05518 * str.delete!([other_str]+) -> str or nil 05519 * 05520 * Performs a <code>delete</code> operation in place, returning <i>str</i>, or 05521 * <code>nil</code> if <i>str</i> was not modified. 05522 */ 05523 05524 static VALUE 05525 rb_str_delete_bang(int argc, VALUE *argv, VALUE str) 05526 { 05527 char squeez[TR_TABLE_SIZE]; 05528 rb_encoding *enc = 0; 05529 char *s, *send, *t; 05530 VALUE del = 0, nodel = 0; 05531 int modify = 0; 05532 int i, ascompat, cr; 05533 05534 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil; 05535 rb_check_arity(argc, 1, UNLIMITED_ARGUMENTS); 05536 for (i=0; i<argc; i++) { 05537 VALUE s = argv[i]; 05538 05539 StringValue(s); 05540 enc = rb_enc_check(str, s); 05541 tr_setup_table(s, squeez, i==0, &del, &nodel, enc); 05542 } 05543 05544 str_modify_keep_cr(str); 05545 ascompat = rb_enc_asciicompat(enc); 05546 s = t = RSTRING_PTR(str); 05547 send = RSTRING_END(str); 05548 cr = ascompat ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID; 05549 while (s < send) { 05550 unsigned int c; 05551 int clen; 05552 05553 if (ascompat && (c = *(unsigned char*)s) < 0x80) { 05554 if (squeez[c]) { 05555 modify = 1; 05556 } 05557 else { 05558 if (t != s) *t = c; 05559 t++; 05560 } 05561 s++; 05562 } 05563 else { 05564 c = rb_enc_codepoint_len(s, send, &clen, enc); 05565 05566 if (tr_find(c, squeez, del, nodel)) { 05567 modify = 1; 05568 } 05569 else { 05570 if (t != s) rb_enc_mbcput(c, t, enc); 05571 t += clen; 05572 if (cr == ENC_CODERANGE_7BIT) cr = ENC_CODERANGE_VALID; 05573 } 05574 s += clen; 05575 } 05576 } 05577 *t = '\0'; 05578 STR_SET_LEN(str, t - RSTRING_PTR(str)); 05579 ENC_CODERANGE_SET(str, cr); 05580 05581 if (modify) return str; 05582 return Qnil; 05583 } 05584 05585 05586 /* 05587 * call-seq: 05588 * str.delete([other_str]+) -> new_str 05589 * 05590 * Returns a copy of <i>str</i> with all characters in the intersection of its 05591 * arguments deleted. 
Uses the same rules for building the set of characters as 05592 * <code>String#count</code>. 05593 * 05594 * "hello".delete "l","lo" #=> "heo" 05595 * "hello".delete "lo" #=> "he" 05596 * "hello".delete "aeiou", "^e" #=> "hell" 05597 * "hello".delete "ej-m" #=> "ho" 05598 */ 05599 05600 static VALUE 05601 rb_str_delete(int argc, VALUE *argv, VALUE str) 05602 { 05603 str = rb_str_dup(str); 05604 rb_str_delete_bang(argc, argv, str); 05605 return str; 05606 } 05607 05608 05609 /* 05610 * call-seq: 05611 * str.squeeze!([other_str]*) -> str or nil 05612 * 05613 * Squeezes <i>str</i> in place, returning either <i>str</i>, or 05614 * <code>nil</code> if no changes were made. 05615 */ 05616 05617 static VALUE 05618 rb_str_squeeze_bang(int argc, VALUE *argv, VALUE str) 05619 { 05620 char squeez[TR_TABLE_SIZE]; 05621 rb_encoding *enc = 0; 05622 VALUE del = 0, nodel = 0; 05623 char *s, *send, *t; 05624 int i, modify = 0; 05625 int ascompat, singlebyte = single_byte_optimizable(str); 05626 unsigned int save; 05627 05628 if (argc == 0) { 05629 enc = STR_ENC_GET(str); 05630 } 05631 else { 05632 for (i=0; i<argc; i++) { 05633 VALUE s = argv[i]; 05634 05635 StringValue(s); 05636 enc = rb_enc_check(str, s); 05637 if (singlebyte && !single_byte_optimizable(s)) 05638 singlebyte = 0; 05639 tr_setup_table(s, squeez, i==0, &del, &nodel, enc); 05640 } 05641 } 05642 05643 str_modify_keep_cr(str); 05644 s = t = RSTRING_PTR(str); 05645 if (!s || RSTRING_LEN(str) == 0) return Qnil; 05646 send = RSTRING_END(str); 05647 save = -1; 05648 ascompat = rb_enc_asciicompat(enc); 05649 05650 if (singlebyte) { 05651 while (s < send) { 05652 unsigned int c = *(unsigned char*)s++; 05653 if (c != save || (argc > 0 && !squeez[c])) { 05654 *t++ = save = c; 05655 } 05656 } 05657 } else { 05658 while (s < send) { 05659 unsigned int c; 05660 int clen; 05661 05662 if (ascompat && (c = *(unsigned char*)s) < 0x80) { 05663 if (c != save || (argc > 0 && !squeez[c])) { 05664 *t++ = save = c; 05665 } 05666 s++; 05667 
} 05668 else { 05669 c = rb_enc_codepoint_len(s, send, &clen, enc); 05670 05671 if (c != save || (argc > 0 && !tr_find(c, squeez, del, nodel))) { 05672 if (t != s) rb_enc_mbcput(c, t, enc); 05673 save = c; 05674 t += clen; 05675 } 05676 s += clen; 05677 } 05678 } 05679 } 05680 05681 *t = '\0'; 05682 if (t - RSTRING_PTR(str) != RSTRING_LEN(str)) { 05683 STR_SET_LEN(str, t - RSTRING_PTR(str)); 05684 modify = 1; 05685 } 05686 05687 if (modify) return str; 05688 return Qnil; 05689 } 05690 05691 05692 /* 05693 * call-seq: 05694 * str.squeeze([other_str]*) -> new_str 05695 * 05696 * Builds a set of characters from the <i>other_str</i> parameter(s) using the 05697 * procedure described for <code>String#count</code>. Returns a new string 05698 * where runs of the same character that occur in this set are replaced by a 05699 * single character. If no arguments are given, all runs of identical 05700 * characters are replaced by a single character. 05701 * 05702 * "yellow moon".squeeze #=> "yelow mon" 05703 * " now is the".squeeze(" ") #=> " now is the" 05704 * "putters shoot balls".squeeze("m-z") #=> "puters shot balls" 05705 */ 05706 05707 static VALUE 05708 rb_str_squeeze(int argc, VALUE *argv, VALUE str) 05709 { 05710 str = rb_str_dup(str); 05711 rb_str_squeeze_bang(argc, argv, str); 05712 return str; 05713 } 05714 05715 05716 /* 05717 * call-seq: 05718 * str.tr_s!(from_str, to_str) -> str or nil 05719 * 05720 * Performs <code>String#tr_s</code> processing on <i>str</i> in place, 05721 * returning <i>str</i>, or <code>nil</code> if no changes were made. 05722 */ 05723 05724 static VALUE 05725 rb_str_tr_s_bang(VALUE str, VALUE src, VALUE repl) 05726 { 05727 return tr_trans(str, src, repl, 1); 05728 } 05729 05730 05731 /* 05732 * call-seq: 05733 * str.tr_s(from_str, to_str) -> new_str 05734 * 05735 * Processes a copy of <i>str</i> as described under <code>String#tr</code>, 05736 * then removes duplicate characters in regions that were affected by the 05737 * translation. 
05738 * 05739 * "hello".tr_s('l', 'r') #=> "hero" 05740 * "hello".tr_s('el', '*') #=> "h*o" 05741 * "hello".tr_s('el', 'hx') #=> "hhxo" 05742 */ 05743 05744 static VALUE 05745 rb_str_tr_s(VALUE str, VALUE src, VALUE repl) 05746 { 05747 str = rb_str_dup(str); 05748 tr_trans(str, src, repl, 1); 05749 return str; 05750 } 05751 05752 05753 /* 05754 * call-seq: 05755 * str.count([other_str]+) -> fixnum 05756 * 05757 * Each +other_str+ parameter defines a set of characters to count. The 05758 * intersection of these sets defines the characters to count in +str+. Any 05759 * +other_str+ that starts with a caret <code>^</code> is negated. The 05760 * sequence <code>c1-c2</code> means all characters between c1 and c2. The 05761 * backslash character <code></code> can be used to escape <code>^</code> or 05762 * <code>-</code> and is otherwise ignored unless it appears at the end of a 05763 * sequence or the end of a +other_str+. 05764 * 05765 * a = "hello world" 05766 * a.count "lo" #=> 5 05767 * a.count "lo", "o" #=> 2 05768 * a.count "hello", "^l" #=> 4 05769 * a.count "ej-m" #=> 4 05770 * 05771 * "hello^world".count "\\^aeiou" #=> 4 05772 * "hello-world".count "a\\-eo" #=> 4 05773 * 05774 * c = "hello world\\r\\n" 05775 * c.count "\\" #=> 2 05776 * c.count "\\A" #=> 0 05777 * c.count "X-\\w" #=> 3 05778 */ 05779 05780 static VALUE 05781 rb_str_count(int argc, VALUE *argv, VALUE str) 05782 { 05783 char table[TR_TABLE_SIZE]; 05784 rb_encoding *enc = 0; 05785 VALUE del = 0, nodel = 0; 05786 char *s, *send; 05787 int i; 05788 int ascompat; 05789 05790 rb_check_arity(argc, 1, UNLIMITED_ARGUMENTS); 05791 for (i=0; i<argc; i++) { 05792 VALUE tstr = argv[i]; 05793 unsigned char c; 05794 05795 StringValue(tstr); 05796 enc = rb_enc_check(str, tstr); 05797 if (argc == 1 && RSTRING_LEN(tstr) == 1 && rb_enc_asciicompat(enc) && 05798 (c = RSTRING_PTR(tstr)[0]) < 0x80 && !is_broken_string(str)) { 05799 int n = 0; 05800 05801 s = RSTRING_PTR(str); 05802 if (!s || RSTRING_LEN(str) == 0) 
return INT2FIX(0); 05803 send = RSTRING_END(str); 05804 while (s < send) { 05805 if (*(unsigned char*)s++ == c) n++; 05806 } 05807 return INT2NUM(n); 05808 } 05809 tr_setup_table(tstr, table, i==0, &del, &nodel, enc); 05810 } 05811 05812 s = RSTRING_PTR(str); 05813 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0); 05814 send = RSTRING_END(str); 05815 ascompat = rb_enc_asciicompat(enc); 05816 i = 0; 05817 while (s < send) { 05818 unsigned int c; 05819 05820 if (ascompat && (c = *(unsigned char*)s) < 0x80) { 05821 if (table[c]) { 05822 i++; 05823 } 05824 s++; 05825 } 05826 else { 05827 int clen; 05828 c = rb_enc_codepoint_len(s, send, &clen, enc); 05829 if (tr_find(c, table, del, nodel)) { 05830 i++; 05831 } 05832 s += clen; 05833 } 05834 } 05835 05836 return INT2NUM(i); 05837 } 05838 05839 static const char isspacetable[256] = { 05840 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 05841 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 05842 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 05843 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 05844 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 05845 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 05846 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 05847 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 05848 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 05849 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 05850 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 05851 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 05852 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 05853 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 05854 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 05855 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 05856 }; 05857 05858 #define ascii_isspace(c) isspacetable[(unsigned char)(c)] 05859 05860 /* 05861 * call-seq: 05862 * str.split(pattern=$;, [limit]) -> anArray 05863 * 05864 * Divides <i>str</i> into substrings based on a delimiter, returning an array 05865 * of these substrings. 
05866 * 05867 * If <i>pattern</i> is a <code>String</code>, then its contents are used as 05868 * the delimiter when splitting <i>str</i>. If <i>pattern</i> is a single 05869 * space, <i>str</i> is split on whitespace, with leading whitespace and runs 05870 * of contiguous whitespace characters ignored. 05871 * 05872 * If <i>pattern</i> is a <code>Regexp</code>, <i>str</i> is divided where the 05873 * pattern matches. Whenever the pattern matches a zero-length string, 05874 * <i>str</i> is split into individual characters. If <i>pattern</i> contains 05875 * groups, the respective matches will be returned in the array as well. 05876 * 05877 * If <i>pattern</i> is omitted, the value of <code>$;</code> is used. If 05878 * <code>$;</code> is <code>nil</code> (which is the default), <i>str</i> is 05879 * split on whitespace as if ` ' were specified. 05880 * 05881 * If the <i>limit</i> parameter is omitted, trailing null fields are 05882 * suppressed. If <i>limit</i> is a positive number, at most that number of 05883 * fields will be returned (if <i>limit</i> is <code>1</code>, the entire 05884 * string is returned as the only entry in an array). If negative, there is no 05885 * limit to the number of fields returned, and trailing null fields are not 05886 * suppressed. 05887 * 05888 * When the input +str+ is empty an empty Array is returned as the string is 05889 * considered to have no fields to split. 
05890 * 05891 * " now's the time".split #=> ["now's", "the", "time"] 05892 * " now's the time".split(' ') #=> ["now's", "the", "time"] 05893 * " now's the time".split(/ /) #=> ["", "now's", "", "the", "time"] 05894 * "1, 2.34,56, 7".split(%r{,\s*}) #=> ["1", "2.34", "56", "7"] 05895 * "hello".split(//) #=> ["h", "e", "l", "l", "o"] 05896 * "hello".split(//, 3) #=> ["h", "e", "llo"] 05897 * "hi mom".split(%r{\s*}) #=> ["h", "i", "m", "o", "m"] 05898 * 05899 * "mellow yellow".split("ello") #=> ["m", "w y", "w"] 05900 * "1,2,,3,4,,".split(',') #=> ["1", "2", "", "3", "4"] 05901 * "1,2,,3,4,,".split(',', 4) #=> ["1", "2", "", "3,4,,"] 05902 * "1,2,,3,4,,".split(',', -4) #=> ["1", "2", "", "3", "4", "", ""] 05903 * 05904 * "".split(',', -1) #=> [] 05905 */ 05906 /* rb_str_split_m: body of the split method documented above. Chooses awk-style whitespace splitting, literal string splitting or regexp splitting from the pattern, pushes each field onto a new array, then appends the remainder and, when no limit applies, pops trailing empty strings. */ 05907 static VALUE 05908 rb_str_split_m(int argc, VALUE *argv, VALUE str) 05909 { 05910 rb_encoding *enc; 05911 VALUE spat; 05912 VALUE limit; 05913 enum {awk, string, regexp} split_type; 05914 long beg, end, i = 0; 05915 int lim = 0; 05916 VALUE result, tmp; 05917 /* Two arguments means a limit was passed. lim == 1 returns the receiver unsplit (or an empty array for an empty string); lim <= 0 resets limit to nil while lim keeps its value for the checks at the end of the function. In every non-returning case i is set to 1 and later counts fields against lim. */ 05918 if (rb_scan_args(argc, argv, "02", &spat, &limit) == 2) { 05919 lim = NUM2INT(limit); 05920 if (lim <= 0) limit = Qnil; 05921 else if (lim == 1) { 05922 if (RSTRING_LEN(str) == 0) 05923 return rb_ary_new2(0); 05924 return rb_ary_new3(1, str); 05925 } 05926 i = 1; 05927 } 05928 05929 enc = STR_ENC_GET(str); 05930 /* No pattern argument: use rb_fs when it is set, otherwise split awk-style. */ if (NIL_P(spat)) { 05931 if (!NIL_P(rb_fs)) { 05932 spat = rb_fs; 05933 goto fs_set; 05934 } 05935 split_type = awk; 05936 } 05937 else { 05938 fs_set: 05939 if (RB_TYPE_P(spat, T_STRING)) { 05940 rb_encoding *enc2 = STR_ENC_GET(spat); 05941 05942 split_type = string; 05943 if (RSTRING_LEN(spat) == 0) { 05944 /* Special case - split into chars */ 05945 spat = rb_reg_regcomp(spat); 05946 split_type = regexp; 05947 } 05948 /* A pattern that is exactly one space selects awk-style splitting; the else branch performs the same test for encodings that are not ASCII compatible. */ else if (rb_enc_asciicompat(enc2) == 1) { 05949 if (RSTRING_LEN(spat) == 1 && RSTRING_PTR(spat)[0] == ' '){ 05950 split_type = awk; 05951 } 05952 } 05953 else { 05954 int l; 05955 if (rb_enc_ascget(RSTRING_PTR(spat), 
RSTRING_END(spat), &l, enc2) == ' ' && 05956 RSTRING_LEN(spat) == l) { 05957 split_type = awk; 05958 } 05959 } 05960 } 05961 else { 05962 spat = get_pat(spat, 1); 05963 split_type = regexp; 05964 } 05965 } 05966 05967 result = rb_ary_new(); 05968 beg = 0; 05969 /* awk mode: skip is 1 while whitespace is being skipped; beg and end are byte offsets of the field being collected. Once the field count reaches lim the scan stops and the rest of the string becomes the last field further below. The first loop runs when is_ascii_string(str) holds and inspects single bytes; the second decodes each character with the string encoding. */ if (split_type == awk) { 05970 char *ptr = RSTRING_PTR(str); 05971 char *eptr = RSTRING_END(str); 05972 char *bptr = ptr; 05973 int skip = 1; 05974 unsigned int c; 05975 05976 end = beg; 05977 if (is_ascii_string(str)) { 05978 while (ptr < eptr) { 05979 c = (unsigned char)*ptr++; 05980 if (skip) { 05981 if (ascii_isspace(c)) { 05982 beg = ptr - bptr; 05983 } 05984 else { 05985 end = ptr - bptr; 05986 skip = 0; 05987 if (!NIL_P(limit) && lim <= i) break; 05988 } 05989 } 05990 else if (ascii_isspace(c)) { 05991 rb_ary_push(result, rb_str_subseq(str, beg, end-beg)); 05992 skip = 1; 05993 beg = ptr - bptr; 05994 if (!NIL_P(limit)) ++i; 05995 } 05996 else { 05997 end = ptr - bptr; 05998 } 05999 } 06000 } 06001 else { 06002 while (ptr < eptr) { 06003 int n; 06004 06005 c = rb_enc_codepoint_len(ptr, eptr, &n, enc); 06006 ptr += n; 06007 if (skip) { 06008 if (rb_isspace(c)) { 06009 beg = ptr - bptr; 06010 } 06011 else { 06012 end = ptr - bptr; 06013 skip = 0; 06014 if (!NIL_P(limit) && lim <= i) break; 06015 } 06016 } 06017 else if (rb_isspace(c)) { 06018 rb_ary_push(result, rb_str_subseq(str, beg, end-beg)); 06019 skip = 1; 06020 beg = ptr - bptr; 06021 if (!NIL_P(limit)) ++i; 06022 } 06023 else { 06024 end = ptr - bptr; 06025 } 06026 } 06027 } 06028 } 06029 /* Literal string separator. Both strings must be validly encoded, otherwise ArgumentError is raised. rb_memsearch returns an offset relative to ptr; a hit that does not begin on a character boundary is skipped by resuming at the next character head. */ else if (split_type == string) { 06030 char *ptr = RSTRING_PTR(str); 06031 char *temp = ptr; 06032 char *eptr = RSTRING_END(str); 06033 char *sptr = RSTRING_PTR(spat); 06034 long slen = RSTRING_LEN(spat); 06035 06036 if (is_broken_string(str)) { 06037 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(str))); 06038 } 06039 if (is_broken_string(spat)) { 06040 rb_raise(rb_eArgError, "invalid byte sequence in %s", 
rb_enc_name(STR_ENC_GET(spat))); 06041 } 06042 enc = rb_enc_check(str, spat); 06043 while (ptr < eptr && 06044 (end = rb_memsearch(sptr, slen, ptr, eptr - ptr, enc)) >= 0) { 06045 /* Check we are at the start of a char */ 06046 char *t = rb_enc_right_char_head(ptr, ptr + end, eptr, enc); 06047 if (t != ptr + end) { 06048 ptr = t; 06049 continue; 06050 } 06051 rb_ary_push(result, rb_str_subseq(str, ptr - temp, end)); 06052 ptr += end + slen; 06053 if (!NIL_P(limit) && lim <= ++i) break; 06054 } 06055 beg = ptr - temp; 06056 } 06057 else { /* Regexp separator. An empty match at the current position is not accepted immediately: the search is retried one character further with last_null set, and if the retried match is empty as well a single character is emitted as a field. After each field the captured groups are pushed, skipping groups whose start offset is -1. */ 06058 char *ptr = RSTRING_PTR(str); 06059 long len = RSTRING_LEN(str); 06060 long start = beg; 06061 long idx; 06062 int last_null = 0; 06063 struct re_registers *regs; 06064 06065 while ((end = rb_reg_search(spat, str, start, 0)) >= 0) { 06066 regs = RMATCH_REGS(rb_backref_get()); 06067 if (start == end && BEG(0) == END(0)) { 06068 if (!ptr) { 06069 rb_ary_push(result, str_new_empty(str)); 06070 break; 06071 } 06072 else if (last_null == 1) { 06073 rb_ary_push(result, rb_str_subseq(str, beg, 06074 rb_enc_fast_mbclen(ptr+beg, 06075 ptr+len, 06076 enc))); 06077 beg = start; 06078 } 06079 else { 06080 if (ptr+start == ptr+len) 06081 start++; 06082 else 06083 start += rb_enc_fast_mbclen(ptr+start,ptr+len,enc); 06084 last_null = 1; 06085 continue; 06086 } 06087 } 06088 else { 06089 rb_ary_push(result, rb_str_subseq(str, beg, end-beg)); 06090 beg = start = END(0); 06091 } 06092 last_null = 0; 06093 06094 for (idx=1; idx < regs->num_regs; idx++) { 06095 if (BEG(idx) == -1) continue; 06096 if (BEG(idx) == END(idx)) 06097 tmp = str_new_empty(str); 06098 else 06099 tmp = rb_str_subseq(str, BEG(idx), END(idx)-BEG(idx)); 06100 rb_ary_push(result, tmp); 06101 } 06102 if (!NIL_P(limit) && lim <= ++i) break; 06103 } 06104 } 06105 /* Append the text that follows the last separator. */ if (RSTRING_LEN(str) > 0 && (!NIL_P(limit) || RSTRING_LEN(str) > beg || lim < 0)) { 06106 if (RSTRING_LEN(str) == beg) 06107 tmp = str_new_empty(str); 06108 else 06109 tmp = rb_str_subseq(str, beg, 
RSTRING_LEN(str)-beg); 06110 rb_ary_push(result, tmp); 06111 } 06112 /* No limit argument, or a limit of 0: remove trailing empty fields. */ if (NIL_P(limit) && lim == 0) { 06113 long len; 06114 while ((len = RARRAY_LEN(result)) > 0 && 06115 (tmp = RARRAY_PTR(result)[len-1], RSTRING_LEN(tmp) == 0)) 06116 rb_ary_pop(result); 06117 } 06118 06119 return result; 06120 } 06121 /* rb_str_split: splits str on the C string sep0 by calling rb_str_split_m with sep0, wrapped in a new String, as the only argument. */ 06122 VALUE 06123 rb_str_split(VALUE str, const char *sep0) 06124 { 06125 VALUE sep; 06126 06127 StringValue(str); 06128 sep = rb_str_new2(sep0); 06129 return rb_str_split_m(1, &sep, str); 06130 } 06131 06132 /* rb_str_enumerate_lines: shared body of rb_str_each_line (wantarray 0) and rb_str_lines (wantarray 1). Each line is either pushed onto ary or yielded; the return value is ary or the original receiver respectively. */ 06133 static VALUE 06134 rb_str_enumerate_lines(int argc, VALUE *argv, VALUE str, int wantarray) 06135 { 06136 rb_encoding *enc; 06137 VALUE rs; 06138 unsigned int newline; 06139 const char *p, *pend, *s, *ptr; 06140 long len, rslen; 06141 VALUE line; 06142 int n; 06143 VALUE orig = str; 06144 VALUE UNINITIALIZED_VAR(ary); 06145 06146 if (argc == 0) { 06147 rs = rb_rs; 06148 } 06149 else { 06150 rb_scan_args(argc, argv, "01", &rs); 06151 } 06152 06153 if (rb_block_given_p()) { 06154 if (wantarray) { 06155 #if 0 /* next major */ 06156 rb_warn("given block not used"); 06157 ary = rb_ary_new(); 06158 #else 06159 rb_warning("passing a block to String#lines is deprecated"); 06160 wantarray = 0; 06161 #endif 06162 } 06163 } 06164 else { 06165 if (wantarray) 06166 ary = rb_ary_new(); 06167 else 06168 RETURN_ENUMERATOR(str, argc, argv); 06169 } 06170 06171 /* A nil separator makes the whole string the only line. */ if (NIL_P(rs)) { 06172 if (wantarray) { 06173 rb_ary_push(ary, str); 06174 return ary; 06175 } 06176 else { 06177 rb_yield(str); 06178 return orig; 06179 } 06180 } 06181 /* Scan the string returned by rb_str_new4 (NOTE(review): presumably a frozen snapshot of the receiver - confirm); str_mod_check is called after every push or yield. */ str = rb_str_new4(str); 06182 ptr = p = s = RSTRING_PTR(str); 06183 pend = p + RSTRING_LEN(str); 06184 len = RSTRING_LEN(str); 06185 StringValue(rs); 06186 /* Default separator: find each newline byte with memchr, then confirm through the encoding that it really is a newline character before cutting the line after it. */ if (rs == rb_default_rs) { 06187 enc = rb_enc_get(str); 06188 while (p < pend) { 06189 char *p0; 06190 06191 p = memchr(p, '\n', pend - p); 06192 if (!p) break; 06193 p0 = rb_enc_left_char_head(s, p, pend, enc); 06194 if (!rb_enc_is_newline(p0, pend, enc)) { 06195 p++; 06196 continue; 06197 } 06198 
p = p0 + rb_enc_mbclen(p0, pend, enc); 06199 line = rb_str_subseq(str, s - ptr, p - s); 06200 if (wantarray) 06201 rb_ary_push(ary, line); 06202 else 06203 rb_yield(line); 06204 str_mod_check(str, ptr, len); 06205 s = p; 06206 } 06207 goto finish; 06208 } 06209 06210 /* Custom separator. An empty rs selects paragraph mode with a newline as the break character; otherwise the first codepoint of rs is compared against each character and, for separators longer than one byte, the whole separator is confirmed with memcmp. */ enc = rb_enc_check(str, rs); 06211 rslen = RSTRING_LEN(rs); 06212 if (rslen == 0) { 06213 newline = '\n'; 06214 } 06215 else { 06216 newline = rb_enc_codepoint(RSTRING_PTR(rs), RSTRING_END(rs), enc); 06217 } 06218 06219 while (p < pend) { 06220 unsigned int c = rb_enc_codepoint_len(p, pend, &n, enc); 06221 06222 again: 06223 if (rslen == 0 && c == newline) { 06224 p += n; 06225 if (p < pend && (c = rb_enc_codepoint_len(p, pend, &n, enc)) != newline) { 06226 goto again; 06227 } 06228 while (p < pend && rb_enc_codepoint(p, pend, enc) == newline) { 06229 p += n; 06230 } 06231 p -= n; 06232 } 06233 if (c == newline && 06234 (rslen <= 1 || 06235 (pend - p >= rslen && memcmp(RSTRING_PTR(rs), p, rslen) == 0))) { 06236 const char *pp = p + (rslen ? rslen : n); 06237 line = rb_str_subseq(str, s - ptr, pp - s); 06238 if (wantarray) 06239 rb_ary_push(ary, line); 06240 else 06241 rb_yield(line); 06242 str_mod_check(str, ptr, len); 06243 s = pp; 06244 } 06245 p += n; 06246 } 06247 /* Emit whatever follows the last separator. */ 06248 finish: 06249 if (s != pend) { 06250 line = rb_str_subseq(str, s - ptr, pend - s); 06251 if (wantarray) 06252 rb_ary_push(ary, line); 06253 else 06254 rb_yield(line); 06255 RB_GC_GUARD(str); 06256 } 06257 06258 if (wantarray) 06259 return ary; 06260 else 06261 return orig; 06262 } 06263 06264 /* 06265 * call-seq: 06266 * str.each_line(separator=$/) {|substr| block } -> str 06267 * str.each_line(separator=$/) -> an_enumerator 06268 * 06269 * Splits <i>str</i> using the supplied parameter as the record 06270 * separator (<code>$/</code> by default), passing each substring in 06271 * turn to the supplied block. 
If a zero-length record separator is 06272 * supplied, the string is split into paragraphs delimited by 06273 * multiple successive newlines. 06274 * 06275 * If no block is given, an enumerator is returned instead. 06276 * 06277 * print "Example one\n" 06278 * "hello\nworld".each_line {|s| p s} 06279 * print "Example two\n" 06280 * "hello\nworld".each_line('l') {|s| p s} 06281 * print "Example three\n" 06282 * "hello\n\n\nworld".each_line('') {|s| p s} 06283 * 06284 * <em>produces:</em> 06285 * 06286 * Example one 06287 * "hello\n" 06288 * "world" 06289 * Example two 06290 * "hel" 06291 * "l" 06292 * "o\nworl" 06293 * "d" 06294 * Example three 06295 * "hello\n\n\n" 06296 * "world" 06297 */ 06298 06299 static VALUE 06300 rb_str_each_line(int argc, VALUE *argv, VALUE str) 06301 { 06302 return rb_str_enumerate_lines(argc, argv, str, 0); 06303 } 06304 06305 /* 06306 * call-seq: 06307 * str.lines(separator=$/) -> an_array 06308 * 06309 * Returns an array of lines in <i>str</i> split using the supplied 06310 * record separator (<code>$/</code> by default). This is a 06311 * shorthand for <code>str.each_line(separator).to_a</code>. 06312 * 06313 * If a block is given, which is a deprecated form, works the same as 06314 * <code>each_line</code>. 
06315 */ 06316 06317 static VALUE 06318 rb_str_lines(int argc, VALUE *argv, VALUE str) 06319 { 06320 return rb_str_enumerate_lines(argc, argv, str, 1); 06321 } 06322 06323 static VALUE 06324 rb_str_each_byte_size(VALUE str, VALUE args) 06325 { 06326 return LONG2FIX(RSTRING_LEN(str)); 06327 } 06328 06329 static VALUE 06330 rb_str_enumerate_bytes(VALUE str, int wantarray) 06331 { 06332 long i; 06333 VALUE UNINITIALIZED_VAR(ary); 06334 06335 if (rb_block_given_p()) { 06336 if (wantarray) { 06337 #if 0 /* next major */ 06338 rb_warn("given block not used"); 06339 ary = rb_ary_new(); 06340 #else 06341 rb_warning("passing a block to String#bytes is deprecated"); 06342 wantarray = 0; 06343 #endif 06344 } 06345 } 06346 else { 06347 if (wantarray) 06348 ary = rb_ary_new2(RSTRING_LEN(str)); 06349 else 06350 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_byte_size); 06351 } 06352 06353 for (i=0; i<RSTRING_LEN(str); i++) { 06354 if (wantarray) 06355 rb_ary_push(ary, INT2FIX(RSTRING_PTR(str)[i] & 0xff)); 06356 else 06357 rb_yield(INT2FIX(RSTRING_PTR(str)[i] & 0xff)); 06358 } 06359 if (wantarray) 06360 return ary; 06361 else 06362 return str; 06363 } 06364 06365 /* 06366 * call-seq: 06367 * str.each_byte {|fixnum| block } -> str 06368 * str.each_byte -> an_enumerator 06369 * 06370 * Passes each byte in <i>str</i> to the given block, or returns an 06371 * enumerator if no block is given. 06372 * 06373 * "hello".each_byte {|c| print c, ' ' } 06374 * 06375 * <em>produces:</em> 06376 * 06377 * 104 101 108 108 111 06378 */ 06379 06380 static VALUE 06381 rb_str_each_byte(VALUE str) 06382 { 06383 return rb_str_enumerate_bytes(str, 0); 06384 } 06385 06386 /* 06387 * call-seq: 06388 * str.bytes -> an_array 06389 * 06390 * Returns an array of bytes in <i>str</i>. This is a shorthand for 06391 * <code>str.each_byte.to_a</code>. 06392 * 06393 * If a block is given, which is a deprecated form, works the same as 06394 * <code>each_byte</code>. 
06395 */ 06396 06397 static VALUE 06398 rb_str_bytes(VALUE str) 06399 { 06400 return rb_str_enumerate_bytes(str, 1); 06401 } 06402 06403 static VALUE 06404 rb_str_each_char_size(VALUE str) 06405 { 06406 long len = RSTRING_LEN(str); 06407 if (!single_byte_optimizable(str)) { 06408 const char *ptr = RSTRING_PTR(str); 06409 rb_encoding *enc = rb_enc_get(str); 06410 const char *end_ptr = ptr + len; 06411 for (len = 0; ptr < end_ptr; ++len) { 06412 ptr += rb_enc_mbclen(ptr, end_ptr, enc); 06413 } 06414 } 06415 return LONG2FIX(len); 06416 } 06417 06418 static VALUE 06419 rb_str_enumerate_chars(VALUE str, int wantarray) 06420 { 06421 VALUE orig = str; 06422 VALUE substr; 06423 long i, len, n; 06424 const char *ptr; 06425 rb_encoding *enc; 06426 VALUE UNINITIALIZED_VAR(ary); 06427 06428 if (rb_block_given_p()) { 06429 if (wantarray) { 06430 #if 0 /* next major */ 06431 rb_warn("given block not used"); 06432 ary = rb_ary_new(); 06433 #else 06434 rb_warning("passing a block to String#chars is deprecated"); 06435 wantarray = 0; 06436 #endif 06437 } 06438 } 06439 else { 06440 if (wantarray) 06441 ary = rb_ary_new(); 06442 else 06443 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size); 06444 } 06445 06446 str = rb_str_new4(str); 06447 ptr = RSTRING_PTR(str); 06448 len = RSTRING_LEN(str); 06449 enc = rb_enc_get(str); 06450 switch (ENC_CODERANGE(str)) { 06451 case ENC_CODERANGE_VALID: 06452 case ENC_CODERANGE_7BIT: 06453 for (i = 0; i < len; i += n) { 06454 n = rb_enc_fast_mbclen(ptr + i, ptr + len, enc); 06455 substr = rb_str_subseq(str, i, n); 06456 if (wantarray) 06457 rb_ary_push(ary, substr); 06458 else 06459 rb_yield(substr); 06460 } 06461 break; 06462 default: 06463 for (i = 0; i < len; i += n) { 06464 n = rb_enc_mbclen(ptr + i, ptr + len, enc); 06465 substr = rb_str_subseq(str, i, n); 06466 if (wantarray) 06467 rb_ary_push(ary, substr); 06468 else 06469 rb_yield(substr); 06470 } 06471 } 06472 RB_GC_GUARD(str); 06473 if (wantarray) 06474 return ary; 06475 else 06476 
return orig; 06477 } 06478 06479 /* 06480 * call-seq: 06481 * str.each_char {|cstr| block } -> str 06482 * str.each_char -> an_enumerator 06483 * 06484 * Passes each character in <i>str</i> to the given block, or returns 06485 * an enumerator if no block is given. 06486 * 06487 * "hello".each_char {|c| print c, ' ' } 06488 * 06489 * <em>produces:</em> 06490 * 06491 * h e l l o 06492 */ 06493 06494 static VALUE 06495 rb_str_each_char(VALUE str) 06496 { 06497 return rb_str_enumerate_chars(str, 0); 06498 } 06499 06500 /* 06501 * call-seq: 06502 * str.chars -> an_array 06503 * 06504 * Returns an array of characters in <i>str</i>. This is a shorthand 06505 * for <code>str.each_char.to_a</code>. 06506 * 06507 * If a block is given, which is a deprecated form, works the same as 06508 * <code>each_char</code>. 06509 */ 06510 06511 static VALUE 06512 rb_str_chars(VALUE str) 06513 { 06514 return rb_str_enumerate_chars(str, 1); 06515 } 06516 06517 06518 static VALUE 06519 rb_str_enumerate_codepoints(VALUE str, int wantarray) 06520 { 06521 VALUE orig = str; 06522 int n; 06523 unsigned int c; 06524 const char *ptr, *end; 06525 rb_encoding *enc; 06526 VALUE UNINITIALIZED_VAR(ary); 06527 06528 if (single_byte_optimizable(str)) 06529 return rb_str_enumerate_bytes(str, wantarray); 06530 06531 if (rb_block_given_p()) { 06532 if (wantarray) { 06533 #if 0 /* next major */ 06534 rb_warn("given block not used"); 06535 ary = rb_ary_new(); 06536 #else 06537 rb_warning("passing a block to String#codepoints is deprecated"); 06538 wantarray = 0; 06539 #endif 06540 } 06541 } 06542 else { 06543 if (wantarray) 06544 ary = rb_ary_new(); 06545 else 06546 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size); 06547 } 06548 06549 str = rb_str_new4(str); 06550 ptr = RSTRING_PTR(str); 06551 end = RSTRING_END(str); 06552 enc = STR_ENC_GET(str); 06553 while (ptr < end) { 06554 c = rb_enc_codepoint_len(ptr, end, &n, enc); 06555 if (wantarray) 06556 rb_ary_push(ary, UINT2NUM(c)); 06557 else 06558 
rb_yield(UINT2NUM(c)); 06559 ptr += n; 06560 } 06561 RB_GC_GUARD(str); 06562 if (wantarray) 06563 return ary; 06564 else 06565 return orig; 06566 } 06567 06568 /* 06569 * call-seq: 06570 * str.each_codepoint {|integer| block } -> str 06571 * str.each_codepoint -> an_enumerator 06572 * 06573 * Passes the <code>Integer</code> ordinal of each character in <i>str</i>, 06574 * also known as a <i>codepoint</i> when applied to Unicode strings to the 06575 * given block. 06576 * 06577 * If no block is given, an enumerator is returned instead. 06578 * 06579 * "hello\u0639".each_codepoint {|c| print c, ' ' } 06580 * 06581 * <em>produces:</em> 06582 * 06583 * 104 101 108 108 111 1593 06584 */ 06585 06586 static VALUE 06587 rb_str_each_codepoint(VALUE str) 06588 { 06589 return rb_str_enumerate_codepoints(str, 0); 06590 } 06591 06592 /* 06593 * call-seq: 06594 * str.codepoints -> an_array 06595 * 06596 * Returns an array of the <code>Integer</code> ordinals of the 06597 * characters in <i>str</i>. This is a shorthand for 06598 * <code>str.each_codepoint.to_a</code>. 06599 * 06600 * If a block is given, which is a deprecated form, works the same as 06601 * <code>each_codepoint</code>. 06602 */ 06603 06604 static VALUE 06605 rb_str_codepoints(VALUE str) 06606 { 06607 return rb_str_enumerate_codepoints(str, 1); 06608 } 06609 06610 06611 static long 06612 chopped_length(VALUE str) 06613 { 06614 rb_encoding *enc = STR_ENC_GET(str); 06615 const char *p, *p2, *beg, *end; 06616 06617 beg = RSTRING_PTR(str); 06618 end = beg + RSTRING_LEN(str); 06619 if (beg > end) return 0; 06620 p = rb_enc_prev_char(beg, end, end, enc); 06621 if (!p) return 0; 06622 if (p > beg && rb_enc_ascget(p, end, 0, enc) == '\n') { 06623 p2 = rb_enc_prev_char(beg, p, end, enc); 06624 if (p2 && rb_enc_ascget(p2, end, 0, enc) == '\r') p = p2; 06625 } 06626 return p - beg; 06627 } 06628 06629 /* 06630 * call-seq: 06631 * str.chop! 
-> str or nil 06632 * 06633 * Processes <i>str</i> as for <code>String#chop</code>, returning <i>str</i>, 06634 * or <code>nil</code> if <i>str</i> is the empty string. See also 06635 * <code>String#chomp!</code>. 06636 */ 06637 06638 static VALUE 06639 rb_str_chop_bang(VALUE str) 06640 { 06641 str_modify_keep_cr(str); 06642 if (RSTRING_LEN(str) > 0) { 06643 long len; 06644 len = chopped_length(str); 06645 STR_SET_LEN(str, len); 06646 RSTRING_PTR(str)[len] = '\0'; 06647 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) { 06648 ENC_CODERANGE_CLEAR(str); 06649 } 06650 return str; 06651 } 06652 return Qnil; 06653 } 06654 06655 06656 /* 06657 * call-seq: 06658 * str.chop -> new_str 06659 * 06660 * Returns a new <code>String</code> with the last character removed. If the 06661 * string ends with <code>\r\n</code>, both characters are removed. Applying 06662 * <code>chop</code> to an empty string returns an empty 06663 * string. <code>String#chomp</code> is often a safer alternative, as it leaves 06664 * the string unchanged if it doesn't end in a record separator. 06665 * 06666 * "string\r\n".chop #=> "string" 06667 * "string\n\r".chop #=> "string\n" 06668 * "string\n".chop #=> "string" 06669 * "string".chop #=> "strin" 06670 * "x".chop.chop #=> "" 06671 */ 06672 06673 static VALUE 06674 rb_str_chop(VALUE str) 06675 { 06676 return rb_str_subseq(str, 0, chopped_length(str)); 06677 } 06678 06679 06680 /* 06681 * call-seq: 06682 * str.chomp!(separator=$/) -> str or nil 06683 * 06684 * Modifies <i>str</i> in place as described for <code>String#chomp</code>, 06685 * returning <i>str</i>, or <code>nil</code> if no modifications were made. 
06686 */ 06687 /* rb_str_chomp_bang: in-place chomp. Returns str when bytes were removed and nil otherwise. With no argument the separator is rb_rs; a nil separator removes nothing. */ 06688 static VALUE 06689 rb_str_chomp_bang(int argc, VALUE *argv, VALUE str) 06690 { 06691 rb_encoding *enc; 06692 VALUE rs; 06693 int newline; 06694 char *p, *pp, *e; 06695 long len, rslen; 06696 06697 str_modify_keep_cr(str); 06698 len = RSTRING_LEN(str); 06699 if (len == 0) return Qnil; 06700 p = RSTRING_PTR(str); 06701 e = p + len; 06702 if (argc == 0) { 06703 rs = rb_rs; 06704 if (rs == rb_default_rs) { 06705 smart_chomp: /* Default separator, also reached by goto when the separator is a lone newline: drop a trailing newline, a trailing carriage return, or a carriage return followed by a newline. The first branch serves encodings whose minimum character length exceeds one byte and locates characters through the encoding; the second inspects raw bytes. */ 06706 enc = rb_enc_get(str); 06707 if (rb_enc_mbminlen(enc) > 1) { 06708 pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc); 06709 if (rb_enc_is_newline(pp, e, enc)) { 06710 e = pp; 06711 } 06712 pp = e - rb_enc_mbminlen(enc); 06713 if (pp >= p) { 06714 pp = rb_enc_left_char_head(p, pp, e, enc); 06715 if (rb_enc_ascget(pp, e, 0, enc) == '\r') { 06716 e = pp; 06717 } 06718 } 06719 if (e == RSTRING_END(str)) { 06720 return Qnil; 06721 } 06722 len = e - RSTRING_PTR(str); 06723 STR_SET_LEN(str, len); 06724 } 06725 else { 06726 if (RSTRING_PTR(str)[len-1] == '\n') { 06727 STR_DEC_LEN(str); 06728 if (RSTRING_LEN(str) > 0 && 06729 RSTRING_PTR(str)[RSTRING_LEN(str)-1] == '\r') { 06730 STR_DEC_LEN(str); 06731 } 06732 } 06733 else if (RSTRING_PTR(str)[len-1] == '\r') { 06734 STR_DEC_LEN(str); 06735 } 06736 else { 06737 return Qnil; 06738 } 06739 } 06740 RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0'; 06741 return str; 06742 } 06743 } 06744 else { 06745 rb_scan_args(argc, argv, "01", &rs); 06746 } 06747 if (NIL_P(rs)) return Qnil; 06748 StringValue(rs); 06749 rslen = RSTRING_LEN(rs); 06750 /* Empty separator: remove all trailing newlines, each together with a carriage return directly before it. */ if (rslen == 0) { 06751 while (len>0 && p[len-1] == '\n') { 06752 len--; 06753 if (len>0 && p[len-1] == '\r') 06754 len--; 06755 } 06756 if (len < RSTRING_LEN(str)) { 06757 STR_SET_LEN(str, len); 06758 RSTRING_PTR(str)[len] = '\0'; 06759 return str; 06760 } 06761 return Qnil; 06762 } 06763 if (rslen > len) return Qnil; 06764 newline = RSTRING_PTR(rs)[rslen-1]; 06765 if (rslen == 1 && newline == '\n') 06766 goto smart_chomp; 06767 06768 enc = 
rb_enc_check(str, rs); 06769 if (is_broken_string(rs)) { 06770 return Qnil; 06771 } 06772 /* General separator: str must end with the bytes of rs, and that suffix must begin on a character boundary of str. */ pp = e - rslen; 06773 if (p[len-1] == newline && 06774 (rslen <= 1 || 06775 memcmp(RSTRING_PTR(rs), pp, rslen) == 0)) { 06776 if (rb_enc_left_char_head(p, pp, e, enc) != pp) 06777 return Qnil; 06778 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) { 06779 ENC_CODERANGE_CLEAR(str); 06780 } 06781 STR_SET_LEN(str, RSTRING_LEN(str) - rslen); 06782 RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0'; 06783 return str; 06784 } 06785 return Qnil; 06786 } 06787 06788 06789 /* 06790 * call-seq: 06791 * str.chomp(separator=$/) -> new_str 06792 * 06793 * Returns a new <code>String</code> with the given record separator removed 06794 * from the end of <i>str</i> (if present). If <code>$/</code> has not been 06795 * changed from the default Ruby record separator, then <code>chomp</code> also 06796 * removes carriage return characters (that is it will remove <code>\n</code>, 06797 * <code>\r</code>, and <code>\r\n</code>). 06798 * 06799 * "hello".chomp #=> "hello" 06800 * "hello\n".chomp #=> "hello" 06801 * "hello\r\n".chomp #=> "hello" 06802 * "hello\n\r".chomp #=> "hello\n" 06803 * "hello\r".chomp #=> "hello" 06804 * "hello \n there".chomp #=> "hello \n there" 06805 * "hello".chomp("llo") #=> "he" 06806 */ 06807 /* rb_str_chomp: applies rb_str_chomp_bang to a duplicate of str and returns the duplicate, changed or not. */ 06808 static VALUE 06809 rb_str_chomp(int argc, VALUE *argv, VALUE str) 06810 { 06811 str = rb_str_dup(str); 06812 rb_str_chomp_bang(argc, argv, str); 06813 return str; 06814 } 06815 06816 /* 06817 * call-seq: 06818 * str.lstrip! -> self or nil 06819 * 06820 * Removes leading whitespace from <i>str</i>, returning <code>nil</code> if no 06821 * change was made. See also <code>String#rstrip!</code> and 06822 * <code>String#strip!</code>. 06823 * 06824 * " hello ".lstrip #=> "hello " 06825 * "hello".lstrip! 
#=> nil 06826 */ 06827 06828 static VALUE 06829 rb_str_lstrip_bang(VALUE str) 06830 { 06831 rb_encoding *enc; 06832 char *s, *t, *e; 06833 06834 str_modify_keep_cr(str); 06835 enc = STR_ENC_GET(str); 06836 s = RSTRING_PTR(str); 06837 if (!s || RSTRING_LEN(str) == 0) return Qnil; 06838 e = t = RSTRING_END(str); 06839 /* remove spaces at head */ 06840 while (s < e) { 06841 int n; 06842 unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc); 06843 06844 if (!rb_isspace(cc)) break; 06845 s += n; 06846 } 06847 06848 if (s > RSTRING_PTR(str)) { 06849 STR_SET_LEN(str, t-s); 06850 memmove(RSTRING_PTR(str), s, RSTRING_LEN(str)); 06851 RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0'; 06852 return str; 06853 } 06854 return Qnil; 06855 } 06856 06857 06858 /* 06859 * call-seq: 06860 * str.lstrip -> new_str 06861 * 06862 * Returns a copy of <i>str</i> with leading whitespace removed. See also 06863 * <code>String#rstrip</code> and <code>String#strip</code>. 06864 * 06865 * " hello ".lstrip #=> "hello " 06866 * "hello".lstrip #=> "hello" 06867 */ 06868 06869 static VALUE 06870 rb_str_lstrip(VALUE str) 06871 { 06872 str = rb_str_dup(str); 06873 rb_str_lstrip_bang(str); 06874 return str; 06875 } 06876 06877 06878 /* 06879 * call-seq: 06880 * str.rstrip! -> self or nil 06881 * 06882 * Removes trailing whitespace from <i>str</i>, returning <code>nil</code> if 06883 * no change was made. See also <code>String#lstrip!</code> and 06884 * <code>String#strip!</code>. 06885 * 06886 * " hello ".rstrip #=> " hello" 06887 * "hello".rstrip! 
#=> nil 06888 */ 06889 06890 static VALUE 06891 rb_str_rstrip_bang(VALUE str) 06892 { 06893 rb_encoding *enc; 06894 char *s, *t, *e; 06895 06896 str_modify_keep_cr(str); 06897 enc = STR_ENC_GET(str); 06898 rb_str_check_dummy_enc(enc); 06899 s = RSTRING_PTR(str); 06900 if (!s || RSTRING_LEN(str) == 0) return Qnil; 06901 t = e = RSTRING_END(str); 06902 06903 /* remove trailing spaces or '\0's */ 06904 if (single_byte_optimizable(str)) { 06905 unsigned char c; 06906 while (s < t && ((c = *(t-1)) == '\0' || ascii_isspace(c))) t--; 06907 } 06908 else { 06909 char *tp; 06910 06911 while ((tp = rb_enc_prev_char(s, t, e, enc)) != NULL) { 06912 unsigned int c = rb_enc_codepoint(tp, e, enc); 06913 if (c && !rb_isspace(c)) break; 06914 t = tp; 06915 } 06916 } 06917 if (t < e) { 06918 long len = t-RSTRING_PTR(str); 06919 06920 STR_SET_LEN(str, len); 06921 RSTRING_PTR(str)[len] = '\0'; 06922 return str; 06923 } 06924 return Qnil; 06925 } 06926 06927 06928 /* 06929 * call-seq: 06930 * str.rstrip -> new_str 06931 * 06932 * Returns a copy of <i>str</i> with trailing whitespace removed. See also 06933 * <code>String#lstrip</code> and <code>String#strip</code>. 06934 * 06935 * " hello ".rstrip #=> " hello" 06936 * "hello".rstrip #=> "hello" 06937 */ 06938 06939 static VALUE 06940 rb_str_rstrip(VALUE str) 06941 { 06942 str = rb_str_dup(str); 06943 rb_str_rstrip_bang(str); 06944 return str; 06945 } 06946 06947 06948 /* 06949 * call-seq: 06950 * str.strip! -> str or nil 06951 * 06952 * Removes leading and trailing whitespace from <i>str</i>. Returns 06953 * <code>nil</code> if <i>str</i> was not altered. 
06954 */ 06955 06956 static VALUE 06957 rb_str_strip_bang(VALUE str) 06958 { 06959 VALUE l = rb_str_lstrip_bang(str); 06960 VALUE r = rb_str_rstrip_bang(str); 06961 06962 if (NIL_P(l) && NIL_P(r)) return Qnil; 06963 return str; 06964 } 06965 06966 06967 /* 06968 * call-seq: 06969 * str.strip -> new_str 06970 * 06971 * Returns a copy of <i>str</i> with leading and trailing whitespace removed. 06972 * 06973 * " hello ".strip #=> "hello" 06974 * "\tgoodbye\r\n".strip #=> "goodbye" 06975 */ 06976 06977 static VALUE 06978 rb_str_strip(VALUE str) 06979 { 06980 str = rb_str_dup(str); 06981 rb_str_strip_bang(str); 06982 return str; 06983 } 06984 06985 static VALUE 06986 scan_once(VALUE str, VALUE pat, long *start) 06987 { 06988 VALUE result, match; 06989 struct re_registers *regs; 06990 int i; 06991 06992 if (rb_reg_search(pat, str, *start, 0) >= 0) { 06993 match = rb_backref_get(); 06994 regs = RMATCH_REGS(match); 06995 if (BEG(0) == END(0)) { 06996 rb_encoding *enc = STR_ENC_GET(str); 06997 /* 06998 * Always consume at least one character of the input string 06999 */ 07000 if (RSTRING_LEN(str) > END(0)) 07001 *start = END(0)+rb_enc_fast_mbclen(RSTRING_PTR(str)+END(0), 07002 RSTRING_END(str), enc); 07003 else 07004 *start = END(0)+1; 07005 } 07006 else { 07007 *start = END(0); 07008 } 07009 if (regs->num_regs == 1) { 07010 return rb_reg_nth_match(0, match); 07011 } 07012 result = rb_ary_new2(regs->num_regs); 07013 for (i=1; i < regs->num_regs; i++) { 07014 rb_ary_push(result, rb_reg_nth_match(i, match)); 07015 } 07016 07017 return result; 07018 } 07019 return Qnil; 07020 } 07021 07022 07023 /* 07024 * call-seq: 07025 * str.scan(pattern) -> array 07026 * str.scan(pattern) {|match, ...| block } -> str 07027 * 07028 * Both forms iterate through <i>str</i>, matching the pattern (which may be a 07029 * <code>Regexp</code> or a <code>String</code>). For each match, a result is 07030 * generated and either added to the result array or passed to the block. 
If 07031 * the pattern contains no groups, each individual result consists of the 07032 * matched string, <code>$&</code>. If the pattern contains groups, each 07033 * individual result is itself an array containing one entry per group. 07034 * 07035 * a = "cruel world" 07036 * a.scan(/\w+/) #=> ["cruel", "world"] 07037 * a.scan(/.../) #=> ["cru", "el ", "wor"] 07038 * a.scan(/(...)/) #=> [["cru"], ["el "], ["wor"]] 07039 * a.scan(/(..)(..)/) #=> [["cr", "ue"], ["l ", "wo"]] 07040 * 07041 * And the block form: 07042 * 07043 * a.scan(/\w+/) {|w| print "<<#{w}>> " } 07044 * print "\n" 07045 * a.scan(/(.)(.)/) {|x,y| print y, x } 07046 * print "\n" 07047 * 07048 * <em>produces:</em> 07049 * 07050 * <<cruel>> <<world>> 07051 * rceu lowlr 07052 */ 07053 07054 static VALUE 07055 rb_str_scan(VALUE str, VALUE pat) 07056 { 07057 VALUE result; 07058 long start = 0; 07059 long last = -1, prev = 0; 07060 char *p = RSTRING_PTR(str); long len = RSTRING_LEN(str); 07061 07062 pat = get_pat(pat, 1); 07063 if (!rb_block_given_p()) { 07064 VALUE ary = rb_ary_new(); 07065 07066 while (!NIL_P(result = scan_once(str, pat, &start))) { 07067 last = prev; 07068 prev = start; 07069 rb_ary_push(ary, result); 07070 } 07071 if (last >= 0) rb_reg_search(pat, str, last, 0); 07072 return ary; 07073 } 07074 07075 while (!NIL_P(result = scan_once(str, pat, &start))) { 07076 last = prev; 07077 prev = start; 07078 rb_yield(result); 07079 str_mod_check(str, p, len); 07080 } 07081 if (last >= 0) rb_reg_search(pat, str, last, 0); 07082 return str; 07083 } 07084 07085 07086 /* 07087 * call-seq: 07088 * str.hex -> integer 07089 * 07090 * Treats leading characters from <i>str</i> as a string of hexadecimal digits 07091 * (with an optional sign and an optional <code>0x</code>) and returns the 07092 * corresponding number. Zero is returned on error. 
07093 * 07094 * "0x0a".hex #=> 10 07095 * "-1234".hex #=> -4660 07096 * "0".hex #=> 0 07097 * "wombat".hex #=> 0 07098 */ 07099 07100 static VALUE 07101 rb_str_hex(VALUE str) 07102 { 07103 return rb_str_to_inum(str, 16, FALSE); 07104 } 07105 07106 07107 /* 07108 * call-seq: 07109 * str.oct -> integer 07110 * 07111 * Treats leading characters of <i>str</i> as a string of octal digits (with an 07112 * optional sign) and returns the corresponding number. Returns 0 if the 07113 * conversion fails. 07114 * 07115 * "123".oct #=> 83 07116 * "-377".oct #=> -255 07117 * "bad".oct #=> 0 07118 * "0377bad".oct #=> 255 07119 */ 07120 07121 static VALUE 07122 rb_str_oct(VALUE str) 07123 { 07124 return rb_str_to_inum(str, -8, FALSE); 07125 } 07126 07127 07128 /* 07129 * call-seq: 07130 * str.crypt(salt_str) -> new_str 07131 * 07132 * Applies a one-way cryptographic hash to <i>str</i> by invoking the 07133 * standard library function <code>crypt(3)</code> with the given 07134 * salt string. While the format and the result are system and 07135 * implementation dependent, using a salt matching the regular 07136 * expression <code>\A[a-zA-Z0-9./]{2}</code> should be valid and 07137 * safe on any platform, in which only the first two characters are 07138 * significant. 07139 * 07140 * This method is for use in system specific scripts, so if you want 07141 * a cross-platform hash function consider using Digest or OpenSSL 07142 * instead. 
07143 */ 07144 07145 static VALUE 07146 rb_str_crypt(VALUE str, VALUE salt) 07147 { 07148 extern char *crypt(const char *, const char *); 07149 VALUE result; 07150 const char *s, *saltp; 07151 char *res; 07152 #ifdef BROKEN_CRYPT 07153 char salt_8bit_clean[3]; 07154 #endif 07155 07156 StringValue(salt); 07157 if (RSTRING_LEN(salt) < 2) 07158 rb_raise(rb_eArgError, "salt too short (need >=2 bytes)"); 07159 07160 s = RSTRING_PTR(str); 07161 if (!s) s = ""; 07162 saltp = RSTRING_PTR(salt); 07163 #ifdef BROKEN_CRYPT 07164 if (!ISASCII((unsigned char)saltp[0]) || !ISASCII((unsigned char)saltp[1])) { 07165 salt_8bit_clean[0] = saltp[0] & 0x7f; 07166 salt_8bit_clean[1] = saltp[1] & 0x7f; 07167 salt_8bit_clean[2] = '\0'; 07168 saltp = salt_8bit_clean; 07169 } 07170 #endif 07171 res = crypt(s, saltp); 07172 if (!res) { 07173 rb_sys_fail("crypt"); 07174 } 07175 result = rb_str_new2(res); 07176 OBJ_INFECT(result, str); 07177 OBJ_INFECT(result, salt); 07178 return result; 07179 } 07180 07181 07182 /* 07183 * call-seq: 07184 * str.intern -> symbol 07185 * str.to_sym -> symbol 07186 * 07187 * Returns the <code>Symbol</code> corresponding to <i>str</i>, creating the 07188 * symbol if it did not previously exist. See <code>Symbol#id2name</code>. 07189 * 07190 * "Koala".intern #=> :Koala 07191 * s = 'cat'.to_sym #=> :cat 07192 * s == :cat #=> true 07193 * s = '@cat'.to_sym #=> :@cat 07194 * s == :@cat #=> true 07195 * 07196 * This can also be used to create symbols that cannot be represented using the 07197 * <code>:xxx</code> notation. 07198 * 07199 * 'cat and dog'.to_sym #=> :"cat and dog" 07200 */ 07201 07202 VALUE 07203 rb_str_intern(VALUE s) 07204 { 07205 VALUE str = RB_GC_GUARD(s); 07206 ID id; 07207 07208 id = rb_intern_str(str); 07209 return ID2SYM(id); 07210 } 07211 07212 07213 /* 07214 * call-seq: 07215 * str.ord -> integer 07216 * 07217 * Return the <code>Integer</code> ordinal of a one-character string. 
07218 * 07219 * "a".ord #=> 97 07220 */ 07221 07222 VALUE 07223 rb_str_ord(VALUE s) 07224 { 07225 unsigned int c; 07226 07227 c = rb_enc_codepoint(RSTRING_PTR(s), RSTRING_END(s), STR_ENC_GET(s)); 07228 return UINT2NUM(c); 07229 } 07230 /* 07231 * call-seq: 07232 * str.sum(n=16) -> integer 07233 * 07234 * Returns a basic <em>n</em>-bit checksum of the characters in <i>str</i>, 07235 * where <em>n</em> is the optional <code>Fixnum</code> parameter, defaulting 07236 * to 16. The result is simply the sum of the binary value of each character in 07237 * <i>str</i> modulo <code>2**n - 1</code>. This is not a particularly good 07238 * checksum. 07239 */ 07240 07241 static VALUE 07242 rb_str_sum(int argc, VALUE *argv, VALUE str) 07243 { 07244 VALUE vbits; 07245 int bits; 07246 char *ptr, *p, *pend; 07247 long len; 07248 VALUE sum = INT2FIX(0); 07249 unsigned long sum0 = 0; 07250 07251 if (argc == 0) { 07252 bits = 16; 07253 } 07254 else { 07255 rb_scan_args(argc, argv, "01", &vbits); 07256 bits = NUM2INT(vbits); 07257 } 07258 ptr = p = RSTRING_PTR(str); 07259 len = RSTRING_LEN(str); 07260 pend = p + len; 07261 07262 while (p < pend) { 07263 if (FIXNUM_MAX - UCHAR_MAX < sum0) { 07264 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0)); 07265 str_mod_check(str, ptr, len); 07266 sum0 = 0; 07267 } 07268 sum0 += (unsigned char)*p; 07269 p++; 07270 } 07271 07272 if (bits == 0) { 07273 if (sum0) { 07274 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0)); 07275 } 07276 } 07277 else { 07278 if (sum == INT2FIX(0)) { 07279 if (bits < (int)sizeof(long)*CHAR_BIT) { 07280 sum0 &= (((unsigned long)1)<<bits)-1; 07281 } 07282 sum = LONG2FIX(sum0); 07283 } 07284 else { 07285 VALUE mod; 07286 07287 if (sum0) { 07288 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0)); 07289 } 07290 07291 mod = rb_funcall(INT2FIX(1), rb_intern("<<"), 1, INT2FIX(bits)); 07292 mod = rb_funcall(mod, '-', 1, INT2FIX(1)); 07293 sum = rb_funcall(sum, '&', 1, mod); 07294 } 07295 } 07296 return sum; 07297 } 07298 07299 static VALUE 
07300 rb_str_justify(int argc, VALUE *argv, VALUE str, char jflag) 07301 { 07302 rb_encoding *enc; 07303 VALUE w; 07304 long width, len, flen = 1, fclen = 1; 07305 VALUE res; 07306 char *p; 07307 const char *f = " "; 07308 long n, size, llen, rlen, llen2 = 0, rlen2 = 0; 07309 volatile VALUE pad; 07310 int singlebyte = 1, cr; 07311 07312 rb_scan_args(argc, argv, "11", &w, &pad); 07313 enc = STR_ENC_GET(str); 07314 width = NUM2LONG(w); 07315 if (argc == 2) { 07316 StringValue(pad); 07317 enc = rb_enc_check(str, pad); 07318 f = RSTRING_PTR(pad); 07319 flen = RSTRING_LEN(pad); 07320 fclen = str_strlen(pad, enc); 07321 singlebyte = single_byte_optimizable(pad); 07322 if (flen == 0 || fclen == 0) { 07323 rb_raise(rb_eArgError, "zero width padding"); 07324 } 07325 } 07326 len = str_strlen(str, enc); 07327 if (width < 0 || len >= width) return rb_str_dup(str); 07328 n = width - len; 07329 llen = (jflag == 'l') ? 0 : ((jflag == 'r') ? n : n/2); 07330 rlen = n - llen; 07331 cr = ENC_CODERANGE(str); 07332 if (flen > 1) { 07333 llen2 = str_offset(f, f + flen, llen % fclen, enc, singlebyte); 07334 rlen2 = str_offset(f, f + flen, rlen % fclen, enc, singlebyte); 07335 } 07336 size = RSTRING_LEN(str); 07337 if ((len = llen / fclen + rlen / fclen) >= LONG_MAX / flen || 07338 (len *= flen) >= LONG_MAX - llen2 - rlen2 || 07339 (len += llen2 + rlen2) >= LONG_MAX - size) { 07340 rb_raise(rb_eArgError, "argument too big"); 07341 } 07342 len += size; 07343 res = rb_str_new5(str, 0, len); 07344 p = RSTRING_PTR(res); 07345 if (flen <= 1) { 07346 memset(p, *f, llen); 07347 p += llen; 07348 } 07349 else { 07350 while (llen >= fclen) { 07351 memcpy(p,f,flen); 07352 p += flen; 07353 llen -= fclen; 07354 } 07355 if (llen > 0) { 07356 memcpy(p, f, llen2); 07357 p += llen2; 07358 } 07359 } 07360 memcpy(p, RSTRING_PTR(str), size); 07361 p += size; 07362 if (flen <= 1) { 07363 memset(p, *f, rlen); 07364 p += rlen; 07365 } 07366 else { 07367 while (rlen >= fclen) { 07368 memcpy(p,f,flen); 07369 p += 
flen; 07370 rlen -= fclen; 07371 } 07372 if (rlen > 0) { 07373 memcpy(p, f, rlen2); 07374 p += rlen2; 07375 } 07376 } 07377 *p = '\0'; 07378 STR_SET_LEN(res, p-RSTRING_PTR(res)); 07379 OBJ_INFECT(res, str); 07380 if (!NIL_P(pad)) OBJ_INFECT(res, pad); 07381 rb_enc_associate(res, enc); 07382 if (argc == 2) 07383 cr = ENC_CODERANGE_AND(cr, ENC_CODERANGE(pad)); 07384 if (cr != ENC_CODERANGE_BROKEN) 07385 ENC_CODERANGE_SET(res, cr); 07386 return res; 07387 } 07388 07389 07390 /* 07391 * call-seq: 07392 * str.ljust(integer, padstr=' ') -> new_str 07393 * 07394 * If <i>integer</i> is greater than the length of <i>str</i>, returns a new 07395 * <code>String</code> of length <i>integer</i> with <i>str</i> left justified 07396 * and padded with <i>padstr</i>; otherwise, returns <i>str</i>. 07397 * 07398 * "hello".ljust(4) #=> "hello" 07399 * "hello".ljust(20) #=> "hello " 07400 * "hello".ljust(20, '1234') #=> "hello123412341234123" 07401 */ 07402 07403 static VALUE 07404 rb_str_ljust(int argc, VALUE *argv, VALUE str) 07405 { 07406 return rb_str_justify(argc, argv, str, 'l'); 07407 } 07408 07409 07410 /* 07411 * call-seq: 07412 * str.rjust(integer, padstr=' ') -> new_str 07413 * 07414 * If <i>integer</i> is greater than the length of <i>str</i>, returns a new 07415 * <code>String</code> of length <i>integer</i> with <i>str</i> right justified 07416 * and padded with <i>padstr</i>; otherwise, returns <i>str</i>. 07417 * 07418 * "hello".rjust(4) #=> "hello" 07419 * "hello".rjust(20) #=> " hello" 07420 * "hello".rjust(20, '1234') #=> "123412341234123hello" 07421 */ 07422 07423 static VALUE 07424 rb_str_rjust(int argc, VALUE *argv, VALUE str) 07425 { 07426 return rb_str_justify(argc, argv, str, 'r'); 07427 } 07428 07429 07430 /* 07431 * call-seq: 07432 * str.center(width, padstr=' ') -> new_str 07433 * 07434 * Centers +str+ in +width+. 
If +width+ is greater than the length of +str+, 07435 * returns a new String of length +width+ with +str+ centered and padded with 07436 * +padstr+; otherwise, returns +str+. 07437 * 07438 * "hello".center(4) #=> "hello" 07439 * "hello".center(20) #=> " hello " 07440 * "hello".center(20, '123') #=> "1231231hello12312312" 07441 */ 07442 07443 static VALUE 07444 rb_str_center(int argc, VALUE *argv, VALUE str) 07445 { 07446 return rb_str_justify(argc, argv, str, 'c'); 07447 } 07448 07449 /* 07450 * call-seq: 07451 * str.partition(sep) -> [head, sep, tail] 07452 * str.partition(regexp) -> [head, match, tail] 07453 * 07454 * Searches <i>sep</i> or pattern (<i>regexp</i>) in the string 07455 * and returns the part before it, the match, and the part 07456 * after it. 07457 * If it is not found, returns two empty strings and <i>str</i>. 07458 * 07459 * "hello".partition("l") #=> ["he", "l", "lo"] 07460 * "hello".partition("x") #=> ["hello", "", ""] 07461 * "hello".partition(/.l/) #=> ["h", "el", "lo"] 07462 */ 07463 07464 static VALUE 07465 rb_str_partition(VALUE str, VALUE sep) 07466 { 07467 long pos; 07468 int regex = FALSE; 07469 07470 if (RB_TYPE_P(sep, T_REGEXP)) { 07471 pos = rb_reg_search(sep, str, 0, 0); 07472 regex = TRUE; 07473 } 07474 else { 07475 VALUE tmp; 07476 07477 tmp = rb_check_string_type(sep); 07478 if (NIL_P(tmp)) { 07479 rb_raise(rb_eTypeError, "type mismatch: %s given", 07480 rb_obj_classname(sep)); 07481 } 07482 sep = tmp; 07483 pos = rb_str_index(str, sep, 0); 07484 } 07485 if (pos < 0) { 07486 failed: 07487 return rb_ary_new3(3, str, str_new_empty(str), str_new_empty(str)); 07488 } 07489 if (regex) { 07490 sep = rb_str_subpat(str, sep, INT2FIX(0)); 07491 if (pos == 0 && RSTRING_LEN(sep) == 0) goto failed; 07492 } 07493 return rb_ary_new3(3, rb_str_subseq(str, 0, pos), 07494 sep, 07495 rb_str_subseq(str, pos+RSTRING_LEN(sep), 07496 RSTRING_LEN(str)-pos-RSTRING_LEN(sep))); 07497 } 07498 07499 /* 07500 * call-seq: 07501 * str.rpartition(sep) -> 
[head, sep, tail] 07502 * str.rpartition(regexp) -> [head, match, tail] 07503 * 07504 * Searches <i>sep</i> or pattern (<i>regexp</i>) in the string from the end 07505 * of the string, and returns the part before it, the match, and the part 07506 * after it. 07507 * If it is not found, returns two empty strings and <i>str</i>. 07508 * 07509 * "hello".rpartition("l") #=> ["hel", "l", "o"] 07510 * "hello".rpartition("x") #=> ["", "", "hello"] 07511 * "hello".rpartition(/.l/) #=> ["he", "ll", "o"] 07512 */ 07513 07514 static VALUE 07515 rb_str_rpartition(VALUE str, VALUE sep) 07516 { 07517 long pos = RSTRING_LEN(str); 07518 int regex = FALSE; 07519 07520 if (RB_TYPE_P(sep, T_REGEXP)) { 07521 pos = rb_reg_search(sep, str, pos, 1); 07522 regex = TRUE; 07523 } 07524 else { 07525 VALUE tmp; 07526 07527 tmp = rb_check_string_type(sep); 07528 if (NIL_P(tmp)) { 07529 rb_raise(rb_eTypeError, "type mismatch: %s given", 07530 rb_obj_classname(sep)); 07531 } 07532 sep = tmp; 07533 pos = rb_str_sublen(str, pos); 07534 pos = rb_str_rindex(str, sep, pos); 07535 } 07536 if (pos < 0) { 07537 return rb_ary_new3(3, str_new_empty(str), str_new_empty(str), str); 07538 } 07539 if (regex) { 07540 sep = rb_reg_nth_match(0, rb_backref_get()); 07541 } 07542 return rb_ary_new3(3, rb_str_substr(str, 0, pos), 07543 sep, 07544 rb_str_substr(str,pos+str_strlen(sep,STR_ENC_GET(sep)),RSTRING_LEN(str))); 07545 } 07546 07547 /* 07548 * call-seq: 07549 * str.start_with?([prefixes]+) -> true or false 07550 * 07551 * Returns true if +str+ starts with one of the +prefixes+ given. 07552 * 07553 * "hello".start_with?("hell") #=> true 07554 * 07555 * # returns true if one of the prefixes matches. 
07556 * "hello".start_with?("heaven", "hell") #=> true 07557 * "hello".start_with?("heaven", "paradise") #=> false 07558 */ 07559 07560 static VALUE 07561 rb_str_start_with(int argc, VALUE *argv, VALUE str) 07562 { 07563 int i; 07564 07565 for (i=0; i<argc; i++) { 07566 VALUE tmp = argv[i]; 07567 StringValue(tmp); 07568 rb_enc_check(str, tmp); 07569 if (RSTRING_LEN(str) < RSTRING_LEN(tmp)) continue; 07570 if (memcmp(RSTRING_PTR(str), RSTRING_PTR(tmp), RSTRING_LEN(tmp)) == 0) 07571 return Qtrue; 07572 } 07573 return Qfalse; 07574 } 07575 07576 /* 07577 * call-seq: 07578 * str.end_with?([suffixes]+) -> true or false 07579 * 07580 * Returns true if +str+ ends with one of the +suffixes+ given. 07581 */ 07582 07583 static VALUE 07584 rb_str_end_with(int argc, VALUE *argv, VALUE str) 07585 { 07586 int i; 07587 char *p, *s, *e; 07588 rb_encoding *enc; 07589 07590 for (i=0; i<argc; i++) { 07591 VALUE tmp = argv[i]; 07592 StringValue(tmp); 07593 enc = rb_enc_check(str, tmp); 07594 if (RSTRING_LEN(str) < RSTRING_LEN(tmp)) continue; 07595 p = RSTRING_PTR(str); 07596 e = p + RSTRING_LEN(str); 07597 s = e - RSTRING_LEN(tmp); 07598 if (rb_enc_left_char_head(p, s, e, enc) != s) 07599 continue; 07600 if (memcmp(s, RSTRING_PTR(tmp), RSTRING_LEN(tmp)) == 0) 07601 return Qtrue; 07602 } 07603 return Qfalse; 07604 } 07605 07606 void 07607 rb_str_setter(VALUE val, ID id, VALUE *var) 07608 { 07609 if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING)) { 07610 rb_raise(rb_eTypeError, "value of %s must be String", rb_id2name(id)); 07611 } 07612 *var = val; 07613 } 07614 07615 07616 /* 07617 * call-seq: 07618 * str.force_encoding(encoding) -> str 07619 * 07620 * Changes the encoding to +encoding+ and returns self. 
07621 */ 07622 07623 static VALUE 07624 rb_str_force_encoding(VALUE str, VALUE enc) 07625 { 07626 str_modifiable(str); 07627 rb_enc_associate(str, rb_to_encoding(enc)); 07628 ENC_CODERANGE_CLEAR(str); 07629 return str; 07630 } 07631 07632 /* 07633 * call-seq: 07634 * str.b -> str 07635 * 07636 * Returns a copied string whose encoding is ASCII-8BIT. 07637 */ 07638 07639 static VALUE 07640 rb_str_b(VALUE str) 07641 { 07642 VALUE str2 = str_alloc(rb_cString); 07643 str_replace_shared_without_enc(str2, str); 07644 OBJ_INFECT(str2, str); 07645 ENC_CODERANGE_SET(str2, ENC_CODERANGE_VALID); 07646 return str2; 07647 } 07648 07649 /* 07650 * call-seq: 07651 * str.valid_encoding? -> true or false 07652 * 07653 * Returns true for a string which encoded correctly. 07654 * 07655 * "\xc2\xa1".force_encoding("UTF-8").valid_encoding? #=> true 07656 * "\xc2".force_encoding("UTF-8").valid_encoding? #=> false 07657 * "\x80".force_encoding("UTF-8").valid_encoding? #=> false 07658 */ 07659 07660 static VALUE 07661 rb_str_valid_encoding_p(VALUE str) 07662 { 07663 int cr = rb_enc_str_coderange(str); 07664 07665 return cr == ENC_CODERANGE_BROKEN ? Qfalse : Qtrue; 07666 } 07667 07668 /* 07669 * call-seq: 07670 * str.ascii_only? -> true or false 07671 * 07672 * Returns true for a string which has only ASCII characters. 07673 * 07674 * "abc".force_encoding("UTF-8").ascii_only? #=> true 07675 * "abc\u{6666}".force_encoding("UTF-8").ascii_only? #=> false 07676 */ 07677 07678 static VALUE 07679 rb_str_is_ascii_only_p(VALUE str) 07680 { 07681 int cr = rb_enc_str_coderange(str); 07682 07683 return cr == ENC_CODERANGE_7BIT ? 
Qtrue : Qfalse; 07684 } 07685 07700 VALUE 07701 rb_str_ellipsize(VALUE str, long len) 07702 { 07703 static const char ellipsis[] = "..."; 07704 const long ellipsislen = sizeof(ellipsis) - 1; 07705 rb_encoding *const enc = rb_enc_get(str); 07706 const long blen = RSTRING_LEN(str); 07707 const char *const p = RSTRING_PTR(str), *e = p + blen; 07708 VALUE estr, ret = 0; 07709 07710 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len); 07711 if (len * rb_enc_mbminlen(enc) >= blen || 07712 (e = rb_enc_nth(p, e, len, enc)) - p == blen) { 07713 ret = str; 07714 } 07715 else if (len <= ellipsislen || 07716 !(e = rb_enc_step_back(p, e, e, len = ellipsislen, enc))) { 07717 if (rb_enc_asciicompat(enc)) { 07718 ret = rb_str_new_with_class(str, ellipsis, len); 07719 rb_enc_associate(ret, enc); 07720 } 07721 else { 07722 estr = rb_usascii_str_new(ellipsis, len); 07723 ret = rb_str_encode(estr, rb_enc_from_encoding(enc), 0, Qnil); 07724 } 07725 } 07726 else if (ret = rb_str_subseq(str, 0, e - p), rb_enc_asciicompat(enc)) { 07727 rb_str_cat(ret, ellipsis, ellipsislen); 07728 } 07729 else { 07730 estr = rb_str_encode(rb_usascii_str_new(ellipsis, ellipsislen), 07731 rb_enc_from_encoding(enc), 0, Qnil); 07732 rb_str_append(ret, estr); 07733 } 07734 return ret; 07735 } 07736 07737 /********************************************************************** 07738 * Document-class: Symbol 07739 * 07740 * <code>Symbol</code> objects represent names and some strings 07741 * inside the Ruby 07742 * interpreter. They are generated using the <code>:name</code> and 07743 * <code>:"string"</code> literals 07744 * syntax, and by the various <code>to_sym</code> methods. The same 07745 * <code>Symbol</code> object will be created for a given name or string 07746 * for the duration of a program's execution, regardless of the context 07747 * or meaning of that name. 
Thus if <code>Fred</code> is a constant in 07748 * one context, a method in another, and a class in a third, the 07749 * <code>Symbol</code> <code>:Fred</code> will be the same object in 07750 * all three contexts. 07751 * 07752 * module One 07753 * class Fred 07754 * end 07755 * $f1 = :Fred 07756 * end 07757 * module Two 07758 * Fred = 1 07759 * $f2 = :Fred 07760 * end 07761 * def Fred() 07762 * end 07763 * $f3 = :Fred 07764 * $f1.object_id #=> 2514190 07765 * $f2.object_id #=> 2514190 07766 * $f3.object_id #=> 2514190 07767 * 07768 */ 07769 07770 07771 /* 07772 * call-seq: 07773 * sym == obj -> true or false 07774 * 07775 * Equality---If <i>sym</i> and <i>obj</i> are exactly the same 07776 * symbol, returns <code>true</code>. 07777 */ 07778 07779 static VALUE 07780 sym_equal(VALUE sym1, VALUE sym2) 07781 { 07782 if (sym1 == sym2) return Qtrue; 07783 return Qfalse; 07784 } 07785 07786 07787 static int 07788 sym_printable(const char *s, const char *send, rb_encoding *enc) 07789 { 07790 while (s < send) { 07791 int n; 07792 int c = rb_enc_codepoint_len(s, send, &n, enc); 07793 07794 if (!rb_enc_isprint(c, enc)) return FALSE; 07795 s += n; 07796 } 07797 return TRUE; 07798 } 07799 07800 int 07801 rb_str_symname_p(VALUE sym) 07802 { 07803 rb_encoding *enc; 07804 const char *ptr; 07805 long len; 07806 rb_encoding *resenc = rb_default_internal_encoding(); 07807 07808 if (resenc == NULL) resenc = rb_default_external_encoding(); 07809 enc = STR_ENC_GET(sym); 07810 ptr = RSTRING_PTR(sym); 07811 len = RSTRING_LEN(sym); 07812 if ((resenc != enc && !rb_str_is_ascii_only_p(sym)) || len != (long)strlen(ptr) || 07813 !rb_enc_symname_p(ptr, enc) || !sym_printable(ptr, ptr + len, enc)) { 07814 return FALSE; 07815 } 07816 return TRUE; 07817 } 07818 07819 VALUE 07820 rb_str_quote_unprintable(VALUE str) 07821 { 07822 rb_encoding *enc; 07823 const char *ptr; 07824 long len; 07825 rb_encoding *resenc; 07826 07827 Check_Type(str, T_STRING); 07828 resenc = rb_default_internal_encoding(); 
07829 if (resenc == NULL) resenc = rb_default_external_encoding(); 07830 enc = STR_ENC_GET(str); 07831 ptr = RSTRING_PTR(str); 07832 len = RSTRING_LEN(str); 07833 if ((resenc != enc && !rb_str_is_ascii_only_p(str)) || 07834 !sym_printable(ptr, ptr + len, enc)) { 07835 return rb_str_inspect(str); 07836 } 07837 return str; 07838 } 07839 07840 VALUE 07841 rb_id_quote_unprintable(ID id) 07842 { 07843 return rb_str_quote_unprintable(rb_id2str(id)); 07844 } 07845 07846 /* 07847 * call-seq: 07848 * sym.inspect -> string 07849 * 07850 * Returns the representation of <i>sym</i> as a symbol literal. 07851 * 07852 * :fred.inspect #=> ":fred" 07853 */ 07854 07855 static VALUE 07856 sym_inspect(VALUE sym) 07857 { 07858 VALUE str; 07859 const char *ptr; 07860 long len; 07861 ID id = SYM2ID(sym); 07862 char *dest; 07863 07864 sym = rb_id2str(id); 07865 if (!rb_str_symname_p(sym)) { 07866 str = rb_str_inspect(sym); 07867 len = RSTRING_LEN(str); 07868 rb_str_resize(str, len + 1); 07869 dest = RSTRING_PTR(str); 07870 memmove(dest + 1, dest, len); 07871 dest[0] = ':'; 07872 } 07873 else { 07874 rb_encoding *enc = STR_ENC_GET(sym); 07875 ptr = RSTRING_PTR(sym); 07876 len = RSTRING_LEN(sym); 07877 str = rb_enc_str_new(0, len + 1, enc); 07878 dest = RSTRING_PTR(str); 07879 dest[0] = ':'; 07880 memcpy(dest + 1, ptr, len); 07881 } 07882 return str; 07883 } 07884 07885 07886 /* 07887 * call-seq: 07888 * sym.id2name -> string 07889 * sym.to_s -> string 07890 * 07891 * Returns the name or string corresponding to <i>sym</i>. 07892 * 07893 * :fred.id2name #=> "fred" 07894 */ 07895 07896 07897 VALUE 07898 rb_sym_to_s(VALUE sym) 07899 { 07900 ID id = SYM2ID(sym); 07901 07902 return str_new3(rb_cString, rb_id2str(id)); 07903 } 07904 07905 07906 /* 07907 * call-seq: 07908 * sym.to_sym -> sym 07909 * sym.intern -> sym 07910 * 07911 * In general, <code>to_sym</code> returns the <code>Symbol</code> corresponding 07912 * to an object. 
As <i>sym</i> is already a symbol, <code>self</code> is returned 07913 * in this case. 07914 */ 07915 07916 static VALUE 07917 sym_to_sym(VALUE sym) 07918 { 07919 return sym; 07920 } 07921 07922 static VALUE 07923 sym_call(VALUE args, VALUE sym, int argc, VALUE *argv, VALUE passed_proc) 07924 { 07925 VALUE obj; 07926 07927 if (argc < 1) { 07928 rb_raise(rb_eArgError, "no receiver given"); 07929 } 07930 obj = argv[0]; 07931 return rb_funcall_with_block(obj, (ID)sym, argc - 1, argv + 1, passed_proc); 07932 } 07933 07934 /* 07935 * call-seq: 07936 * sym.to_proc 07937 * 07938 * Returns a _Proc_ object which respond to the given method by _sym_. 07939 * 07940 * (1..3).collect(&:to_s) #=> ["1", "2", "3"] 07941 */ 07942 07943 static VALUE 07944 sym_to_proc(VALUE sym) 07945 { 07946 static VALUE sym_proc_cache = Qfalse; 07947 enum {SYM_PROC_CACHE_SIZE = 67}; 07948 VALUE proc; 07949 long id, index; 07950 VALUE *aryp; 07951 07952 if (!sym_proc_cache) { 07953 sym_proc_cache = rb_ary_tmp_new(SYM_PROC_CACHE_SIZE * 2); 07954 rb_gc_register_mark_object(sym_proc_cache); 07955 rb_ary_store(sym_proc_cache, SYM_PROC_CACHE_SIZE*2 - 1, Qnil); 07956 } 07957 07958 id = SYM2ID(sym); 07959 index = (id % SYM_PROC_CACHE_SIZE) << 1; 07960 07961 aryp = RARRAY_PTR(sym_proc_cache); 07962 if (aryp[index] == sym) { 07963 return aryp[index + 1]; 07964 } 07965 else { 07966 proc = rb_proc_new(sym_call, (VALUE)id); 07967 aryp[index] = sym; 07968 aryp[index + 1] = proc; 07969 return proc; 07970 } 07971 } 07972 07973 /* 07974 * call-seq: 07975 * 07976 * sym.succ 07977 * 07978 * Same as <code>sym.to_s.succ.intern</code>. 07979 */ 07980 07981 static VALUE 07982 sym_succ(VALUE sym) 07983 { 07984 return rb_str_intern(rb_str_succ(rb_sym_to_s(sym))); 07985 } 07986 07987 /* 07988 * call-seq: 07989 * 07990 * symbol <=> other_symbol -> -1, 0, +1 or nil 07991 * 07992 * Compares +symbol+ with +other_symbol+ after calling #to_s on each of the 07993 * symbols. 
Returns -1, 0, +1 or nil depending on whether +symbol+ is less 07994 * than, equal to, or greater than +other_symbol+. 07995 * 07996 * +nil+ is returned if the two values are incomparable. 07997 * 07998 * See String#<=> for more information. 07999 */ 08000 08001 static VALUE 08002 sym_cmp(VALUE sym, VALUE other) 08003 { 08004 if (!SYMBOL_P(other)) { 08005 return Qnil; 08006 } 08007 return rb_str_cmp_m(rb_sym_to_s(sym), rb_sym_to_s(other)); 08008 } 08009 08010 /* 08011 * call-seq: 08012 * 08013 * sym.casecmp(other) -> -1, 0, +1 or nil 08014 * 08015 * Case-insensitive version of <code>Symbol#<=></code>. 08016 */ 08017 08018 static VALUE 08019 sym_casecmp(VALUE sym, VALUE other) 08020 { 08021 if (!SYMBOL_P(other)) { 08022 return Qnil; 08023 } 08024 return rb_str_casecmp(rb_sym_to_s(sym), rb_sym_to_s(other)); 08025 } 08026 08027 /* 08028 * call-seq: 08029 * sym =~ obj -> fixnum or nil 08030 * 08031 * Returns <code>sym.to_s =~ obj</code>. 08032 */ 08033 08034 static VALUE 08035 sym_match(VALUE sym, VALUE other) 08036 { 08037 return rb_str_match(rb_sym_to_s(sym), other); 08038 } 08039 08040 /* 08041 * call-seq: 08042 * sym[idx] -> char 08043 * sym[b, n] -> char 08044 * 08045 * Returns <code>sym.to_s[]</code>. 08046 */ 08047 08048 static VALUE 08049 sym_aref(int argc, VALUE *argv, VALUE sym) 08050 { 08051 return rb_str_aref_m(argc, argv, rb_sym_to_s(sym)); 08052 } 08053 08054 /* 08055 * call-seq: 08056 * sym.length -> integer 08057 * 08058 * Same as <code>sym.to_s.length</code>. 08059 */ 08060 08061 static VALUE 08062 sym_length(VALUE sym) 08063 { 08064 return rb_str_length(rb_id2str(SYM2ID(sym))); 08065 } 08066 08067 /* 08068 * call-seq: 08069 * sym.empty? -> true or false 08070 * 08071 * Returns that _sym_ is :"" or not. 08072 */ 08073 08074 static VALUE 08075 sym_empty(VALUE sym) 08076 { 08077 return rb_str_empty(rb_id2str(SYM2ID(sym))); 08078 } 08079 08080 /* 08081 * call-seq: 08082 * sym.upcase -> symbol 08083 * 08084 * Same as <code>sym.to_s.upcase.intern</code>. 
08085 */ 08086 08087 static VALUE 08088 sym_upcase(VALUE sym) 08089 { 08090 return rb_str_intern(rb_str_upcase(rb_id2str(SYM2ID(sym)))); 08091 } 08092 08093 /* 08094 * call-seq: 08095 * sym.downcase -> symbol 08096 * 08097 * Same as <code>sym.to_s.downcase.intern</code>. 08098 */ 08099 08100 static VALUE 08101 sym_downcase(VALUE sym) 08102 { 08103 return rb_str_intern(rb_str_downcase(rb_id2str(SYM2ID(sym)))); 08104 } 08105 08106 /* 08107 * call-seq: 08108 * sym.capitalize -> symbol 08109 * 08110 * Same as <code>sym.to_s.capitalize.intern</code>. 08111 */ 08112 08113 static VALUE 08114 sym_capitalize(VALUE sym) 08115 { 08116 return rb_str_intern(rb_str_capitalize(rb_id2str(SYM2ID(sym)))); 08117 } 08118 08119 /* 08120 * call-seq: 08121 * sym.swapcase -> symbol 08122 * 08123 * Same as <code>sym.to_s.swapcase.intern</code>. 08124 */ 08125 08126 static VALUE 08127 sym_swapcase(VALUE sym) 08128 { 08129 return rb_str_intern(rb_str_swapcase(rb_id2str(SYM2ID(sym)))); 08130 } 08131 08132 /* 08133 * call-seq: 08134 * sym.encoding -> encoding 08135 * 08136 * Returns the Encoding object that represents the encoding of _sym_. 08137 */ 08138 08139 static VALUE 08140 sym_encoding(VALUE sym) 08141 { 08142 return rb_obj_encoding(rb_id2str(SYM2ID(sym))); 08143 } 08144 08145 ID 08146 rb_to_id(VALUE name) 08147 { 08148 VALUE tmp; 08149 08150 switch (TYPE(name)) { 08151 default: 08152 tmp = rb_check_string_type(name); 08153 if (NIL_P(tmp)) { 08154 tmp = rb_inspect(name); 08155 rb_raise(rb_eTypeError, "%s is not a symbol", 08156 RSTRING_PTR(tmp)); 08157 } 08158 name = tmp; 08159 /* fall through */ 08160 case T_STRING: 08161 name = rb_str_intern(name); 08162 /* fall through */ 08163 case T_SYMBOL: 08164 return SYM2ID(name); 08165 } 08166 08167 UNREACHABLE; 08168 } 08169 08170 /* 08171 * A <code>String</code> object holds and manipulates an arbitrary sequence of 08172 * bytes, typically representing characters. 
String objects may be created
 *  using <code>String::new</code> or as literals.
 *
 *  Because of aliasing issues, users of strings should be aware of the methods
 *  that modify the contents of a <code>String</code> object.  Typically,
 *  methods with names ending in ``!'' modify their receiver, while those
 *  without a ``!'' return a new <code>String</code>.  However, there are
 *  exceptions, such as <code>String#[]=</code>.
 *
 */

/*
 * Defines the String and Symbol classes and registers their methods.
 *
 * The last argument of each rb_define_method call is the arity.  Handlers
 * registered with -1 in this file (for example rb_str_sum) have the
 * signature (int argc, VALUE *argv, VALUE self).
 */
void
Init_String(void)
{
    /* Within this function rb_intern expands to rb_intern_const. */
#undef rb_intern
#define rb_intern(str) rb_intern_const(str)

    /* ---- String: class, allocator, construction ---- */
    rb_cString  = rb_define_class("String", rb_cObject);
    rb_include_module(rb_cString, rb_mComparable);
    rb_define_alloc_func(rb_cString, empty_str_alloc);
    rb_define_singleton_method(rb_cString, "try_convert", rb_str_s_try_convert, 1);
    rb_define_method(rb_cString, "initialize", rb_str_init, -1);
    rb_define_method(rb_cString, "initialize_copy", rb_str_replace, 1);
    /* ---- String: comparison, operators, indexing, basic queries ---- */
    rb_define_method(rb_cString, "<=>", rb_str_cmp_m, 1);
    rb_define_method(rb_cString, "==", rb_str_equal, 1);
    rb_define_method(rb_cString, "===", rb_str_equal, 1);
    rb_define_method(rb_cString, "eql?", rb_str_eql, 1);
    rb_define_method(rb_cString, "hash", rb_str_hash_m, 0);
    rb_define_method(rb_cString, "casecmp", rb_str_casecmp, 1);
    rb_define_method(rb_cString, "+", rb_str_plus, 1);
    rb_define_method(rb_cString, "*", rb_str_times, 1);
    rb_define_method(rb_cString, "%", rb_str_format_m, 1);
    rb_define_method(rb_cString, "[]", rb_str_aref_m, -1);
    rb_define_method(rb_cString, "[]=", rb_str_aset_m, -1);
    rb_define_method(rb_cString, "insert", rb_str_insert, 2);
    rb_define_method(rb_cString, "length", rb_str_length, 0);
    rb_define_method(rb_cString, "size", rb_str_length, 0);
    rb_define_method(rb_cString, "bytesize", rb_str_bytesize, 0);
    rb_define_method(rb_cString, "empty?", rb_str_empty, 0);
    rb_define_method(rb_cString, "=~", rb_str_match, 1);
    rb_define_method(rb_cString, "match", rb_str_match_m, -1);
    rb_define_method(rb_cString, "succ", rb_str_succ, 0);
    rb_define_method(rb_cString, "succ!", rb_str_succ_bang, 0);
    rb_define_method(rb_cString, "next", rb_str_succ, 0);
    rb_define_method(rb_cString, "next!", rb_str_succ_bang, 0);
    rb_define_method(rb_cString, "upto", rb_str_upto, -1);
    rb_define_method(rb_cString, "index", rb_str_index_m, -1);
    rb_define_method(rb_cString, "rindex", rb_str_rindex_m, -1);
    rb_define_method(rb_cString, "replace", rb_str_replace, 1);
    rb_define_method(rb_cString, "clear", rb_str_clear, 0);
    rb_define_method(rb_cString, "chr", rb_str_chr, 0);
    rb_define_method(rb_cString, "getbyte", rb_str_getbyte, 1);
    rb_define_method(rb_cString, "setbyte", rb_str_setbyte, 2);
    rb_define_method(rb_cString, "byteslice", rb_str_byteslice, -1);

    /* ---- String: conversions ---- */
    rb_define_method(rb_cString, "to_i", rb_str_to_i, -1);
    rb_define_method(rb_cString, "to_f", rb_str_to_f, 0);
    rb_define_method(rb_cString, "to_s", rb_str_to_s, 0);
    rb_define_method(rb_cString, "to_str", rb_str_to_s, 0);
    rb_define_method(rb_cString, "inspect", rb_str_inspect, 0);
    rb_define_method(rb_cString, "dump", rb_str_dump, 0);

    /* ---- String: case mapping ---- */
    rb_define_method(rb_cString, "upcase", rb_str_upcase, 0);
    rb_define_method(rb_cString, "downcase", rb_str_downcase, 0);
    rb_define_method(rb_cString, "capitalize", rb_str_capitalize, 0);
    rb_define_method(rb_cString, "swapcase", rb_str_swapcase, 0);

    rb_define_method(rb_cString, "upcase!", rb_str_upcase_bang, 0);
    rb_define_method(rb_cString, "downcase!", rb_str_downcase_bang, 0);
    rb_define_method(rb_cString, "capitalize!", rb_str_capitalize_bang, 0);
    rb_define_method(rb_cString, "swapcase!", rb_str_swapcase_bang, 0);

    /* ---- String: parsing, splitting, enumeration helpers, misc ---- */
    rb_define_method(rb_cString, "hex", rb_str_hex, 0);
    rb_define_method(rb_cString, "oct", rb_str_oct, 0);
    rb_define_method(rb_cString, "split", rb_str_split_m, -1);
    rb_define_method(rb_cString, "lines", rb_str_lines, -1);
    rb_define_method(rb_cString, "bytes", rb_str_bytes, 0);
    rb_define_method(rb_cString, "chars", rb_str_chars, 0);
    rb_define_method(rb_cString, "codepoints", rb_str_codepoints, 0);
    rb_define_method(rb_cString, "reverse", rb_str_reverse, 0);
    rb_define_method(rb_cString, "reverse!", rb_str_reverse_bang, 0);
    rb_define_method(rb_cString, "concat", rb_str_concat, 1);
    rb_define_method(rb_cString, "<<", rb_str_concat, 1);
    rb_define_method(rb_cString, "prepend", rb_str_prepend, 1);
    rb_define_method(rb_cString, "crypt", rb_str_crypt, 1);
    rb_define_method(rb_cString, "intern", rb_str_intern, 0);
    rb_define_method(rb_cString, "to_sym", rb_str_intern, 0);
    rb_define_method(rb_cString, "ord", rb_str_ord, 0);

    rb_define_method(rb_cString, "include?", rb_str_include, 1);
    rb_define_method(rb_cString, "start_with?", rb_str_start_with, -1);
    rb_define_method(rb_cString, "end_with?", rb_str_end_with, -1);

    rb_define_method(rb_cString, "scan", rb_str_scan, 1);

    /* ---- String: justification (all three share rb_str_justify) ---- */
    rb_define_method(rb_cString, "ljust", rb_str_ljust, -1);
    rb_define_method(rb_cString, "rjust", rb_str_rjust, -1);
    rb_define_method(rb_cString, "center", rb_str_center, -1);

    /* ---- String: substitution and trimming ---- */
    rb_define_method(rb_cString, "sub", rb_str_sub, -1);
    rb_define_method(rb_cString, "gsub", rb_str_gsub, -1);
    rb_define_method(rb_cString, "chop", rb_str_chop, 0);
    rb_define_method(rb_cString, "chomp", rb_str_chomp, -1);
    rb_define_method(rb_cString, "strip", rb_str_strip, 0);
    rb_define_method(rb_cString, "lstrip", rb_str_lstrip, 0);
    rb_define_method(rb_cString, "rstrip", rb_str_rstrip, 0);

    rb_define_method(rb_cString, "sub!", rb_str_sub_bang, -1);
    rb_define_method(rb_cString, "gsub!", rb_str_gsub_bang, -1);
    rb_define_method(rb_cString, "chop!", rb_str_chop_bang, 0);
    rb_define_method(rb_cString, "chomp!", rb_str_chomp_bang, -1);
    rb_define_method(rb_cString, "strip!", rb_str_strip_bang, 0);
    rb_define_method(rb_cString, "lstrip!", rb_str_lstrip_bang, 0);
    rb_define_method(rb_cString, "rstrip!", rb_str_rstrip_bang, 0);

    /* ---- String: character-set operations ---- */
    rb_define_method(rb_cString, "tr", rb_str_tr, 2);
    rb_define_method(rb_cString, "tr_s", rb_str_tr_s, 2);
    rb_define_method(rb_cString, "delete", rb_str_delete, -1);
    rb_define_method(rb_cString, "squeeze", rb_str_squeeze, -1);
    rb_define_method(rb_cString, "count", rb_str_count, -1);

    rb_define_method(rb_cString, "tr!", rb_str_tr_bang, 2);
    rb_define_method(rb_cString, "tr_s!", rb_str_tr_s_bang, 2);
    rb_define_method(rb_cString, "delete!", rb_str_delete_bang, -1);
    rb_define_method(rb_cString, "squeeze!", rb_str_squeeze_bang, -1);

    /* ---- String: iteration ---- */
    rb_define_method(rb_cString, "each_line", rb_str_each_line, -1);
    rb_define_method(rb_cString, "each_byte", rb_str_each_byte, 0);
    rb_define_method(rb_cString, "each_char", rb_str_each_char, 0);
    rb_define_method(rb_cString, "each_codepoint", rb_str_each_codepoint, 0);

    rb_define_method(rb_cString, "sum", rb_str_sum, -1);

    /* "slice" shares its handler with "[]". */
    rb_define_method(rb_cString, "slice", rb_str_aref_m, -1);
    rb_define_method(rb_cString, "slice!", rb_str_slice_bang, -1);

    rb_define_method(rb_cString, "partition", rb_str_partition, 1);
    rb_define_method(rb_cString, "rpartition", rb_str_rpartition, 1);

    /* ---- String: encoding ---- */
    rb_define_method(rb_cString, "encoding", rb_obj_encoding, 0); /* in encoding.c */
    rb_define_method(rb_cString, "force_encoding", rb_str_force_encoding, 1);
    rb_define_method(rb_cString, "b", rb_str_b, 0);
    rb_define_method(rb_cString, "valid_encoding?", rb_str_valid_encoding_p, 0);
    rb_define_method(rb_cString, "ascii_only?", rb_str_is_ascii_only_p, 0);

    id_to_s = rb_intern("to_s");

    /* rb_fs starts as nil and is exposed under the two global names $;
     * and $-F. */
    rb_fs = Qnil;
    rb_define_variable("$;", &rb_fs);
    rb_define_variable("$-F", &rb_fs);

    /* ---- Symbol: class setup; the allocator and "new" are removed ---- */
    rb_cSymbol = rb_define_class("Symbol", rb_cObject);
    rb_include_module(rb_cSymbol, rb_mComparable);
    rb_undef_alloc_func(rb_cSymbol);
    rb_undef_method(CLASS_OF(rb_cSymbol), "new");
    rb_define_singleton_method(rb_cSymbol, "all_symbols", rb_sym_all_symbols, 0); /* in parse.y */

    /* ---- Symbol: identity and conversions ---- */
    rb_define_method(rb_cSymbol, "==", sym_equal, 1);
    rb_define_method(rb_cSymbol, "===", sym_equal, 1);
    rb_define_method(rb_cSymbol, "inspect", sym_inspect, 0);
    rb_define_method(rb_cSymbol, "to_s", rb_sym_to_s, 0);
    rb_define_method(rb_cSymbol, "id2name", rb_sym_to_s, 0);
    rb_define_method(rb_cSymbol, "intern", sym_to_sym, 0);
    rb_define_method(rb_cSymbol, "to_sym", sym_to_sym, 0);
    rb_define_method(rb_cSymbol, "to_proc", sym_to_proc, 0);
    rb_define_method(rb_cSymbol, "succ", sym_succ, 0);
    rb_define_method(rb_cSymbol, "next", sym_succ, 0);

    /* ---- Symbol: comparison and matching ---- */
    rb_define_method(rb_cSymbol, "<=>", sym_cmp, 1);
    rb_define_method(rb_cSymbol, "casecmp", sym_casecmp, 1);
    rb_define_method(rb_cSymbol, "=~", sym_match, 1);

    /* ---- Symbol: string-like queries; "match" shares the "=~" handler ---- */
    rb_define_method(rb_cSymbol, "[]", sym_aref, -1);
    rb_define_method(rb_cSymbol, "slice", sym_aref, -1);
    rb_define_method(rb_cSymbol, "length", sym_length, 0);
    rb_define_method(rb_cSymbol, "size", sym_length, 0);
    rb_define_method(rb_cSymbol, "empty?", sym_empty, 0);
    rb_define_method(rb_cSymbol, "match", sym_match, 1);

    /* ---- Symbol: case mapping ---- */
    rb_define_method(rb_cSymbol, "upcase", sym_upcase, 0);
    rb_define_method(rb_cSymbol, "downcase", sym_downcase, 0);
    rb_define_method(rb_cSymbol, "capitalize", sym_capitalize, 0);
    rb_define_method(rb_cSymbol, "swapcase", sym_swapcase, 0);

    rb_define_method(rb_cSymbol, "encoding", sym_encoding, 0);
}