Ruby
2.0.0p247 (2013-06-27 revision 41674)
|
00001 /********************************************************************** 00002 00003 re.c - 00004 00005 $Author: marcandre $ 00006 created at: Mon Aug 9 18:24:49 JST 1993 00007 00008 Copyright (C) 1993-2007 Yukihiro Matsumoto 00009 00010 **********************************************************************/ 00011 00012 #include "ruby/ruby.h" 00013 #include "ruby/re.h" 00014 #include "ruby/encoding.h" 00015 #include "ruby/util.h" 00016 #include "internal.h" 00017 #include "regint.h" 00018 #include <ctype.h> 00019 00020 VALUE rb_eRegexpError; 00021 00022 typedef char onig_errmsg_buffer[ONIG_MAX_ERROR_MESSAGE_LEN]; 00023 #define errcpy(err, msg) strlcpy((err), (msg), ONIG_MAX_ERROR_MESSAGE_LEN) 00024 00025 #define BEG(no) (regs->beg[(no)]) 00026 #define END(no) (regs->end[(no)]) 00027 00028 #if 'a' == 97 /* it's ascii */ 00029 static const char casetable[] = { 00030 '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007', 00031 '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017', 00032 '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027', 00033 '\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037', 00034 /* ' ' '!' '"' '#' '$' '%' '&' ''' */ 00035 '\040', '\041', '\042', '\043', '\044', '\045', '\046', '\047', 00036 /* '(' ')' '*' '+' ',' '-' '.' '/' */ 00037 '\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057', 00038 /* '0' '1' '2' '3' '4' '5' '6' '7' */ 00039 '\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067', 00040 /* '8' '9' ':' ';' '<' '=' '>' '?' 
*/ 00041 '\070', '\071', '\072', '\073', '\074', '\075', '\076', '\077', 00042 /* '@' 'A' 'B' 'C' 'D' 'E' 'F' 'G' */ 00043 '\100', '\141', '\142', '\143', '\144', '\145', '\146', '\147', 00044 /* 'H' 'I' 'J' 'K' 'L' 'M' 'N' 'O' */ 00045 '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157', 00046 /* 'P' 'Q' 'R' 'S' 'T' 'U' 'V' 'W' */ 00047 '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167', 00048 /* 'X' 'Y' 'Z' '[' '\' ']' '^' '_' */ 00049 '\170', '\171', '\172', '\133', '\134', '\135', '\136', '\137', 00050 /* '`' 'a' 'b' 'c' 'd' 'e' 'f' 'g' */ 00051 '\140', '\141', '\142', '\143', '\144', '\145', '\146', '\147', 00052 /* 'h' 'i' 'j' 'k' 'l' 'm' 'n' 'o' */ 00053 '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157', 00054 /* 'p' 'q' 'r' 's' 't' 'u' 'v' 'w' */ 00055 '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167', 00056 /* 'x' 'y' 'z' '{' '|' '}' '~' */ 00057 '\170', '\171', '\172', '\173', '\174', '\175', '\176', '\177', 00058 '\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207', 00059 '\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217', 00060 '\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227', 00061 '\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237', 00062 '\240', '\241', '\242', '\243', '\244', '\245', '\246', '\247', 00063 '\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257', 00064 '\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267', 00065 '\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277', 00066 '\300', '\301', '\302', '\303', '\304', '\305', '\306', '\307', 00067 '\310', '\311', '\312', '\313', '\314', '\315', '\316', '\317', 00068 '\320', '\321', '\322', '\323', '\324', '\325', '\326', '\327', 00069 '\330', '\331', '\332', '\333', '\334', '\335', '\336', '\337', 00070 '\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347', 00071 '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357', 00072 '\360', '\361', '\362', '\363', 
'\364', '\365', '\366', '\367', 00073 '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\377', 00074 }; 00075 #else 00076 # error >>> "You lose. You will need a translation table for your character set." <<< 00077 #endif 00078 00079 int 00080 rb_memcicmp(const void *x, const void *y, long len) 00081 { 00082 const unsigned char *p1 = x, *p2 = y; 00083 int tmp; 00084 00085 while (len--) { 00086 if ((tmp = casetable[(unsigned)*p1++] - casetable[(unsigned)*p2++])) 00087 return tmp; 00088 } 00089 return 0; 00090 } 00091 00092 #undef rb_memcmp 00093 00094 int 00095 rb_memcmp(const void *p1, const void *p2, long len) 00096 { 00097 return memcmp(p1, p2, len); 00098 } 00099 00100 #ifdef HAVE_MEMMEM 00101 static inline long 00102 rb_memsearch_ss(const unsigned char *xs, long m, const unsigned char *ys, long n) 00103 { 00104 const unsigned char *y; 00105 00106 if (y = memmem(ys, n, xs, m)) 00107 return y - ys; 00108 else 00109 return -1; 00110 } 00111 #else 00112 static inline long 00113 rb_memsearch_ss(const unsigned char *xs, long m, const unsigned char *ys, long n) 00114 { 00115 const unsigned char *x = xs, *xe = xs + m; 00116 const unsigned char *y = ys, *ye = ys + n; 00117 #ifndef VALUE_MAX 00118 # if SIZEOF_VALUE == 8 00119 # define VALUE_MAX 0xFFFFFFFFFFFFFFFFULL 00120 # elif SIZEOF_VALUE == 4 00121 # define VALUE_MAX 0xFFFFFFFFUL 00122 # endif 00123 #endif 00124 VALUE hx, hy, mask = VALUE_MAX >> ((SIZEOF_VALUE - m) * CHAR_BIT); 00125 00126 if (m > SIZEOF_VALUE) 00127 rb_bug("!!too long pattern string!!"); 00128 00129 if (!(y = memchr(y, *x, n - m + 1))) 00130 return -1; 00131 00132 /* Prepare hash value */ 00133 for (hx = *x++, hy = *y++; x < xe; ++x, ++y) { 00134 hx <<= CHAR_BIT; 00135 hy <<= CHAR_BIT; 00136 hx |= *x; 00137 hy |= *y; 00138 } 00139 /* Searching */ 00140 while (hx != hy) { 00141 if (y == ye) 00142 return -1; 00143 hy <<= CHAR_BIT; 00144 hy |= *y; 00145 hy &= mask; 00146 y++; 00147 } 00148 return y - ys - m; 00149 } 00150 #endif 00151 00152 
static inline long 00153 rb_memsearch_qs(const unsigned char *xs, long m, const unsigned char *ys, long n) 00154 { 00155 const unsigned char *x = xs, *xe = xs + m; 00156 const unsigned char *y = ys; 00157 VALUE i, qstable[256]; 00158 00159 /* Preprocessing */ 00160 for (i = 0; i < 256; ++i) 00161 qstable[i] = m + 1; 00162 for (; x < xe; ++x) 00163 qstable[*x] = xe - x; 00164 /* Searching */ 00165 for (; y + m <= ys + n; y += *(qstable + y[m])) { 00166 if (*xs == *y && memcmp(xs, y, m) == 0) 00167 return y - ys; 00168 } 00169 return -1; 00170 } 00171 00172 static inline unsigned int 00173 rb_memsearch_qs_utf8_hash(const unsigned char *x) 00174 { 00175 register const unsigned int mix = 8353; 00176 register unsigned int h = *x; 00177 if (h < 0xC0) { 00178 return h + 256; 00179 } 00180 else if (h < 0xE0) { 00181 h *= mix; 00182 h += x[1]; 00183 } 00184 else if (h < 0xF0) { 00185 h *= mix; 00186 h += x[1]; 00187 h *= mix; 00188 h += x[2]; 00189 } 00190 else if (h < 0xF5) { 00191 h *= mix; 00192 h += x[1]; 00193 h *= mix; 00194 h += x[2]; 00195 h *= mix; 00196 h += x[3]; 00197 } 00198 else { 00199 return h + 256; 00200 } 00201 return (unsigned char)h; 00202 } 00203 00204 static inline long 00205 rb_memsearch_qs_utf8(const unsigned char *xs, long m, const unsigned char *ys, long n) 00206 { 00207 const unsigned char *x = xs, *xe = xs + m; 00208 const unsigned char *y = ys; 00209 VALUE i, qstable[512]; 00210 00211 /* Preprocessing */ 00212 for (i = 0; i < 512; ++i) { 00213 qstable[i] = m + 1; 00214 } 00215 for (; x < xe; ++x) { 00216 qstable[rb_memsearch_qs_utf8_hash(x)] = xe - x; 00217 } 00218 /* Searching */ 00219 for (; y + m <= ys + n; y += qstable[rb_memsearch_qs_utf8_hash(y+m)]) { 00220 if (*xs == *y && memcmp(xs, y, m) == 0) 00221 return y - ys; 00222 } 00223 return -1; 00224 } 00225 00226 long 00227 rb_memsearch(const void *x0, long m, const void *y0, long n, rb_encoding *enc) 00228 { 00229 const unsigned char *x = x0, *y = y0; 00230 00231 if (m > n) return -1; 
00232 else if (m == n) { 00233 return memcmp(x0, y0, m) == 0 ? 0 : -1; 00234 } 00235 else if (m < 1) { 00236 return 0; 00237 } 00238 else if (m == 1) { 00239 const unsigned char *ys; 00240 00241 if (ys = memchr(y, *x, n)) 00242 return ys - y; 00243 else 00244 return -1; 00245 } 00246 else if (m <= SIZEOF_VALUE) { 00247 return rb_memsearch_ss(x0, m, y0, n); 00248 } 00249 else if (enc == rb_utf8_encoding()){ 00250 return rb_memsearch_qs_utf8(x0, m, y0, n); 00251 } 00252 else { 00253 return rb_memsearch_qs(x0, m, y0, n); 00254 } 00255 } 00256 00257 #define REG_LITERAL FL_USER5 00258 #define REG_ENCODING_NONE FL_USER6 00259 00260 #define KCODE_FIXED FL_USER4 00261 00262 #define ARG_REG_OPTION_MASK \ 00263 (ONIG_OPTION_IGNORECASE|ONIG_OPTION_MULTILINE|ONIG_OPTION_EXTEND) 00264 #define ARG_ENCODING_FIXED 16 00265 #define ARG_ENCODING_NONE 32 00266 00267 static int 00268 char_to_option(int c) 00269 { 00270 int val; 00271 00272 switch (c) { 00273 case 'i': 00274 val = ONIG_OPTION_IGNORECASE; 00275 break; 00276 case 'x': 00277 val = ONIG_OPTION_EXTEND; 00278 break; 00279 case 'm': 00280 val = ONIG_OPTION_MULTILINE; 00281 break; 00282 default: 00283 val = 0; 00284 break; 00285 } 00286 return val; 00287 } 00288 00289 static char * 00290 option_to_str(char str[4], int options) 00291 { 00292 char *p = str; 00293 if (options & ONIG_OPTION_MULTILINE) *p++ = 'm'; 00294 if (options & ONIG_OPTION_IGNORECASE) *p++ = 'i'; 00295 if (options & ONIG_OPTION_EXTEND) *p++ = 'x'; 00296 *p = 0; 00297 return str; 00298 } 00299 00300 extern int 00301 rb_char_to_option_kcode(int c, int *option, int *kcode) 00302 { 00303 *option = 0; 00304 00305 switch (c) { 00306 case 'n': 00307 *kcode = rb_ascii8bit_encindex(); 00308 return (*option = ARG_ENCODING_NONE); 00309 case 'e': 00310 *kcode = rb_enc_find_index("EUC-JP"); 00311 break; 00312 case 's': 00313 *kcode = rb_enc_find_index("Windows-31J"); 00314 break; 00315 case 'u': 00316 *kcode = rb_utf8_encindex(); 00317 break; 00318 default: 00319 *kcode = 
-1; 00320 return (*option = char_to_option(c)); 00321 } 00322 *option = ARG_ENCODING_FIXED; 00323 return 1; 00324 } 00325 00326 static void 00327 rb_reg_check(VALUE re) 00328 { 00329 if (!RREGEXP(re)->ptr || !RREGEXP_SRC(re) || !RREGEXP_SRC_PTR(re)) { 00330 rb_raise(rb_eTypeError, "uninitialized Regexp"); 00331 } 00332 } 00333 00334 static void 00335 rb_reg_expr_str(VALUE str, const char *s, long len, 00336 rb_encoding *enc, rb_encoding *resenc) 00337 { 00338 const char *p, *pend; 00339 int cr = ENC_CODERANGE_UNKNOWN; 00340 int need_escape = 0; 00341 int c, clen; 00342 00343 p = s; pend = p + len; 00344 rb_str_coderange_scan_restartable(p, pend, enc, &cr); 00345 if (rb_enc_asciicompat(enc) && 00346 (cr == ENC_CODERANGE_VALID || cr == ENC_CODERANGE_7BIT)) { 00347 while (p < pend) { 00348 c = rb_enc_ascget(p, pend, &clen, enc); 00349 if (c == -1) { 00350 if (enc == resenc) { 00351 p += mbclen(p, pend, enc); 00352 } 00353 else { 00354 need_escape = 1; 00355 break; 00356 } 00357 } 00358 else if (c != '/' && rb_enc_isprint(c, enc)) { 00359 p += clen; 00360 } 00361 else { 00362 need_escape = 1; 00363 break; 00364 } 00365 } 00366 } 00367 else { 00368 need_escape = 1; 00369 } 00370 00371 if (!need_escape) { 00372 rb_str_buf_cat(str, s, len); 00373 } 00374 else { 00375 int unicode_p = rb_enc_unicode_p(enc); 00376 p = s; 00377 while (p<pend) { 00378 c = rb_enc_ascget(p, pend, &clen, enc); 00379 if (c == '\\' && p+clen < pend) { 00380 int n = clen + mbclen(p+clen, pend, enc); 00381 rb_str_buf_cat(str, p, n); 00382 p += n; 00383 continue; 00384 } 00385 else if (c == '/') { 00386 char c = '\\'; 00387 rb_str_buf_cat(str, &c, 1); 00388 rb_str_buf_cat(str, p, clen); 00389 } 00390 else if (c == -1) { 00391 clen = rb_enc_precise_mbclen(p, pend, enc); 00392 if (!MBCLEN_CHARFOUND_P(clen)) { 00393 c = (unsigned char)*p; 00394 clen = 1; 00395 goto hex; 00396 } 00397 if (resenc) { 00398 unsigned int c = rb_enc_mbc_to_codepoint(p, pend, enc); 00399 rb_str_buf_cat_escaped_char(str, c, 
unicode_p); 00400 } 00401 else { 00402 clen = MBCLEN_CHARFOUND_LEN(clen); 00403 rb_str_buf_cat(str, p, clen); 00404 } 00405 } 00406 else if (rb_enc_isprint(c, enc)) { 00407 rb_str_buf_cat(str, p, clen); 00408 } 00409 else if (!rb_enc_isspace(c, enc)) { 00410 char b[8]; 00411 00412 hex: 00413 snprintf(b, sizeof(b), "\\x%02X", c); 00414 rb_str_buf_cat(str, b, 4); 00415 } 00416 else { 00417 rb_str_buf_cat(str, p, clen); 00418 } 00419 p += clen; 00420 } 00421 } 00422 } 00423 00424 static VALUE 00425 rb_reg_desc(const char *s, long len, VALUE re) 00426 { 00427 rb_encoding *enc = rb_enc_get(re); 00428 VALUE str = rb_str_buf_new2("/"); 00429 rb_encoding *resenc = rb_default_internal_encoding(); 00430 if (resenc == NULL) resenc = rb_default_external_encoding(); 00431 00432 if (re && rb_enc_asciicompat(enc)) { 00433 rb_enc_copy(str, re); 00434 } 00435 else { 00436 rb_enc_associate(str, rb_usascii_encoding()); 00437 } 00438 rb_reg_expr_str(str, s, len, enc, resenc); 00439 rb_str_buf_cat2(str, "/"); 00440 if (re) { 00441 char opts[4]; 00442 rb_reg_check(re); 00443 if (*option_to_str(opts, RREGEXP(re)->ptr->options)) 00444 rb_str_buf_cat2(str, opts); 00445 if (RBASIC(re)->flags & REG_ENCODING_NONE) 00446 rb_str_buf_cat2(str, "n"); 00447 } 00448 OBJ_INFECT(str, re); 00449 return str; 00450 } 00451 00452 00453 /* 00454 * call-seq: 00455 * rxp.source -> str 00456 * 00457 * Returns the original string of the pattern. 00458 * 00459 * /ab+c/ix.source #=> "ab+c" 00460 * 00461 * Note that escape sequences are retained as is. 00462 * 00463 * /\x20\+/.source #=> "\\x20\\+" 00464 * 00465 */ 00466 00467 static VALUE 00468 rb_reg_source(VALUE re) 00469 { 00470 VALUE str; 00471 00472 rb_reg_check(re); 00473 str = rb_enc_str_new(RREGEXP_SRC_PTR(re),RREGEXP_SRC_LEN(re), rb_enc_get(re)); 00474 if (OBJ_TAINTED(re)) OBJ_TAINT(str); 00475 return str; 00476 } 00477 00478 /* 00479 * call-seq: 00480 * rxp.inspect -> string 00481 * 00482 * Produce a nicely formatted string-version of _rxp_. 
Perhaps surprisingly, 00483 * <code>#inspect</code> actually produces the more natural version of 00484 * the string than <code>#to_s</code>. 00485 * 00486 * /ab+c/ix.inspect #=> "/ab+c/ix" 00487 * 00488 */ 00489 00490 static VALUE 00491 rb_reg_inspect(VALUE re) 00492 { 00493 if (!RREGEXP(re)->ptr || !RREGEXP_SRC(re) || !RREGEXP_SRC_PTR(re)) { 00494 return rb_any_to_s(re); 00495 } 00496 return rb_reg_desc(RREGEXP_SRC_PTR(re), RREGEXP_SRC_LEN(re), re); 00497 } 00498 00499 00500 /* 00501 * call-seq: 00502 * rxp.to_s -> str 00503 * 00504 * Returns a string containing the regular expression and its options (using the 00505 * <code>(?opts:source)</code> notation. This string can be fed back in to 00506 * <code>Regexp::new</code> to a regular expression with the same semantics as 00507 * the original. (However, <code>Regexp#==</code> may not return true when 00508 * comparing the two, as the source of the regular expression itself may 00509 * differ, as the example shows). <code>Regexp#inspect</code> produces a 00510 * generally more readable version of <i>rxp</i>. 
00511 * 00512 * r1 = /ab+c/ix #=> /ab+c/ix 00513 * s1 = r1.to_s #=> "(?ix-m:ab+c)" 00514 * r2 = Regexp.new(s1) #=> /(?ix-m:ab+c)/ 00515 * r1 == r2 #=> false 00516 * r1.source #=> "ab+c" 00517 * r2.source #=> "(?ix-m:ab+c)" 00518 */ 00519 00520 static VALUE 00521 rb_reg_to_s(VALUE re) 00522 { 00523 int options, opt; 00524 const int embeddable = ONIG_OPTION_MULTILINE|ONIG_OPTION_IGNORECASE|ONIG_OPTION_EXTEND; 00525 long len; 00526 const UChar* ptr; 00527 VALUE str = rb_str_buf_new2("(?"); 00528 char optbuf[5]; 00529 rb_encoding *enc = rb_enc_get(re); 00530 00531 rb_reg_check(re); 00532 00533 rb_enc_copy(str, re); 00534 options = RREGEXP(re)->ptr->options; 00535 ptr = (UChar*)RREGEXP_SRC_PTR(re); 00536 len = RREGEXP_SRC_LEN(re); 00537 again: 00538 if (len >= 4 && ptr[0] == '(' && ptr[1] == '?') { 00539 int err = 1; 00540 ptr += 2; 00541 if ((len -= 2) > 0) { 00542 do { 00543 opt = char_to_option((int )*ptr); 00544 if (opt != 0) { 00545 options |= opt; 00546 } 00547 else { 00548 break; 00549 } 00550 ++ptr; 00551 } while (--len > 0); 00552 } 00553 if (len > 1 && *ptr == '-') { 00554 ++ptr; 00555 --len; 00556 do { 00557 opt = char_to_option((int )*ptr); 00558 if (opt != 0) { 00559 options &= ~opt; 00560 } 00561 else { 00562 break; 00563 } 00564 ++ptr; 00565 } while (--len > 0); 00566 } 00567 if (*ptr == ')') { 00568 --len; 00569 ++ptr; 00570 goto again; 00571 } 00572 if (*ptr == ':' && ptr[len-1] == ')') { 00573 Regexp *rp; 00574 00575 ++ptr; 00576 len -= 2; 00577 err = onig_new(&rp, ptr, ptr + len, ONIG_OPTION_DEFAULT, 00578 enc, OnigDefaultSyntax, NULL); 00579 onig_free(rp); 00580 } 00581 if (err) { 00582 options = RREGEXP(re)->ptr->options; 00583 ptr = (UChar*)RREGEXP_SRC_PTR(re); 00584 len = RREGEXP_SRC_LEN(re); 00585 } 00586 } 00587 00588 if (*option_to_str(optbuf, options)) rb_str_buf_cat2(str, optbuf); 00589 00590 if ((options & embeddable) != embeddable) { 00591 optbuf[0] = '-'; 00592 option_to_str(optbuf + 1, ~options); 00593 rb_str_buf_cat2(str, optbuf); 00594 
} 00595 00596 rb_str_buf_cat2(str, ":"); 00597 rb_reg_expr_str(str, (char*)ptr, len, enc, NULL); 00598 rb_str_buf_cat2(str, ")"); 00599 rb_enc_copy(str, re); 00600 00601 OBJ_INFECT(str, re); 00602 return str; 00603 } 00604 00605 static void 00606 rb_reg_raise(const char *s, long len, const char *err, VALUE re) 00607 { 00608 volatile VALUE desc = rb_reg_desc(s, len, re); 00609 00610 rb_raise(rb_eRegexpError, "%s: %s", err, RSTRING_PTR(desc)); 00611 } 00612 00613 static VALUE 00614 rb_enc_reg_error_desc(const char *s, long len, rb_encoding *enc, int options, const char *err) 00615 { 00616 char opts[6]; 00617 VALUE desc = rb_str_buf_new2(err); 00618 rb_encoding *resenc = rb_default_internal_encoding(); 00619 if (resenc == NULL) resenc = rb_default_external_encoding(); 00620 00621 rb_enc_associate(desc, enc); 00622 rb_str_buf_cat2(desc, ": /"); 00623 rb_reg_expr_str(desc, s, len, enc, resenc); 00624 opts[0] = '/'; 00625 option_to_str(opts + 1, options); 00626 rb_str_buf_cat2(desc, opts); 00627 return rb_exc_new3(rb_eRegexpError, desc); 00628 } 00629 00630 static void 00631 rb_enc_reg_raise(const char *s, long len, rb_encoding *enc, int options, const char *err) 00632 { 00633 rb_exc_raise(rb_enc_reg_error_desc(s, len, enc, options, err)); 00634 } 00635 00636 static VALUE 00637 rb_reg_error_desc(VALUE str, int options, const char *err) 00638 { 00639 return rb_enc_reg_error_desc(RSTRING_PTR(str), RSTRING_LEN(str), 00640 rb_enc_get(str), options, err); 00641 } 00642 00643 static void 00644 rb_reg_raise_str(VALUE str, int options, const char *err) 00645 { 00646 rb_exc_raise(rb_reg_error_desc(str, options, err)); 00647 } 00648 00649 00650 /* 00651 * call-seq: 00652 * rxp.casefold? -> true or false 00653 * 00654 * Returns the value of the case-insensitive flag. 00655 * 00656 * /a/.casefold? #=> false 00657 * /a/i.casefold? #=> true 00658 * /(?i:a)/.casefold? 
#=> false 00659 */ 00660 00661 static VALUE 00662 rb_reg_casefold_p(VALUE re) 00663 { 00664 rb_reg_check(re); 00665 if (RREGEXP(re)->ptr->options & ONIG_OPTION_IGNORECASE) return Qtrue; 00666 return Qfalse; 00667 } 00668 00669 00670 /* 00671 * call-seq: 00672 * rxp.options -> fixnum 00673 * 00674 * Returns the set of bits corresponding to the options used when creating this 00675 * Regexp (see <code>Regexp::new</code> for details. Note that additional bits 00676 * may be set in the returned options: these are used internally by the regular 00677 * expression code. These extra bits are ignored if the options are passed to 00678 * <code>Regexp::new</code>. 00679 * 00680 * Regexp::IGNORECASE #=> 1 00681 * Regexp::EXTENDED #=> 2 00682 * Regexp::MULTILINE #=> 4 00683 * 00684 * /cat/.options #=> 0 00685 * /cat/ix.options #=> 3 00686 * Regexp.new('cat', true).options #=> 1 00687 * /\xa1\xa2/e.options #=> 16 00688 * 00689 * r = /cat/ix 00690 * Regexp.new(r.source, r.options) #=> /cat/ix 00691 */ 00692 00693 static VALUE 00694 rb_reg_options_m(VALUE re) 00695 { 00696 int options = rb_reg_options(re); 00697 return INT2NUM(options); 00698 } 00699 00700 static int 00701 reg_names_iter(const OnigUChar *name, const OnigUChar *name_end, 00702 int back_num, int *back_refs, OnigRegex regex, void *arg) 00703 { 00704 VALUE ary = (VALUE)arg; 00705 rb_ary_push(ary, rb_str_new((const char *)name, name_end-name)); 00706 return 0; 00707 } 00708 00709 /* 00710 * call-seq: 00711 * rxp.names -> [name1, name2, ...] 00712 * 00713 * Returns a list of names of captures as an array of strings. 
00714 * 00715 * /(?<foo>.)(?<bar>.)(?<baz>.)/.names 00716 * #=> ["foo", "bar", "baz"] 00717 * 00718 * /(?<foo>.)(?<foo>.)/.names 00719 * #=> ["foo"] 00720 * 00721 * /(.)(.)/.names 00722 * #=> [] 00723 */ 00724 00725 static VALUE 00726 rb_reg_names(VALUE re) 00727 { 00728 VALUE ary = rb_ary_new(); 00729 rb_reg_check(re); 00730 onig_foreach_name(RREGEXP(re)->ptr, reg_names_iter, (void*)ary); 00731 return ary; 00732 } 00733 00734 static int 00735 reg_named_captures_iter(const OnigUChar *name, const OnigUChar *name_end, 00736 int back_num, int *back_refs, OnigRegex regex, void *arg) 00737 { 00738 VALUE hash = (VALUE)arg; 00739 VALUE ary = rb_ary_new2(back_num); 00740 int i; 00741 00742 for (i = 0; i < back_num; i++) 00743 rb_ary_store(ary, i, INT2NUM(back_refs[i])); 00744 00745 rb_hash_aset(hash, rb_str_new((const char*)name, name_end-name),ary); 00746 00747 return 0; 00748 } 00749 00750 /* 00751 * call-seq: 00752 * rxp.named_captures -> hash 00753 * 00754 * Returns a hash representing information about named captures of <i>rxp</i>. 00755 * 00756 * A key of the hash is a name of the named captures. 00757 * A value of the hash is an array which is list of indexes of corresponding 00758 * named captures. 00759 * 00760 * /(?<foo>.)(?<bar>.)/.named_captures 00761 * #=> {"foo"=>[1], "bar"=>[2]} 00762 * 00763 * /(?<foo>.)(?<foo>.)/.named_captures 00764 * #=> {"foo"=>[1, 2]} 00765 * 00766 * If there are no named captures, an empty hash is returned. 
00767 * 00768 * /(.)(.)/.named_captures 00769 * #=> {} 00770 */ 00771 00772 static VALUE 00773 rb_reg_named_captures(VALUE re) 00774 { 00775 VALUE hash = rb_hash_new(); 00776 rb_reg_check(re); 00777 onig_foreach_name(RREGEXP(re)->ptr, reg_named_captures_iter, (void*)hash); 00778 return hash; 00779 } 00780 00781 static int 00782 onig_new_with_source(regex_t** reg, const UChar* pattern, const UChar* pattern_end, 00783 OnigOptionType option, OnigEncoding enc, const OnigSyntaxType* syntax, 00784 OnigErrorInfo* einfo, const char *sourcefile, int sourceline) 00785 { 00786 int r; 00787 00788 *reg = (regex_t* )malloc(sizeof(regex_t)); 00789 if (IS_NULL(*reg)) return ONIGERR_MEMORY; 00790 00791 r = onig_reg_init(*reg, option, ONIGENC_CASE_FOLD_DEFAULT, enc, syntax); 00792 if (r) goto err; 00793 00794 r = onig_compile(*reg, pattern, pattern_end, einfo, sourcefile, sourceline); 00795 if (r) { 00796 err: 00797 onig_free(*reg); 00798 *reg = NULL; 00799 } 00800 return r; 00801 } 00802 00803 static Regexp* 00804 make_regexp(const char *s, long len, rb_encoding *enc, int flags, onig_errmsg_buffer err, 00805 const char *sourcefile, int sourceline) 00806 { 00807 Regexp *rp; 00808 int r; 00809 OnigErrorInfo einfo; 00810 00811 /* Handle escaped characters first. */ 00812 00813 /* Build a copy of the string (in dest) with the 00814 escaped characters translated, and generate the regex 00815 from that. 00816 */ 00817 00818 r = onig_new_with_source(&rp, (UChar*)s, (UChar*)(s + len), flags, 00819 enc, OnigDefaultSyntax, &einfo, sourcefile, sourceline); 00820 if (r) { 00821 onig_error_code_to_str((UChar*)err, r, &einfo); 00822 return 0; 00823 } 00824 return rp; 00825 } 00826 00827 00828 /* 00829 * Document-class: MatchData 00830 * 00831 * <code>MatchData</code> is the type of the special variable <code>$~</code>, 00832 * and is the type of the object returned by <code>Regexp#match</code> and 00833 * <code>Regexp.last_match</code>. 
It encapsulates all the results of a pattern 00834 * match, results normally accessed through the special variables 00835 * <code>$&</code>, <code>$'</code>, <code>$`</code>, <code>$1</code>, 00836 * <code>$2</code>, and so on. 00837 * 00838 */ 00839 00840 VALUE rb_cMatch; 00841 00842 static VALUE 00843 match_alloc(VALUE klass) 00844 { 00845 NEWOBJ_OF(match, struct RMatch, klass, T_MATCH); 00846 00847 match->str = 0; 00848 match->rmatch = 0; 00849 match->regexp = 0; 00850 match->rmatch = ALLOC(struct rmatch); 00851 MEMZERO(match->rmatch, struct rmatch, 1); 00852 00853 return (VALUE)match; 00854 } 00855 00856 typedef struct { 00857 long byte_pos; 00858 long char_pos; 00859 } pair_t; 00860 00861 static int 00862 pair_byte_cmp(const void *pair1, const void *pair2) 00863 { 00864 long diff = ((pair_t*)pair1)->byte_pos - ((pair_t*)pair2)->byte_pos; 00865 #if SIZEOF_LONG > SIZEOF_INT 00866 return diff ? diff > 0 ? 1 : -1 : 0; 00867 #else 00868 return (int)diff; 00869 #endif 00870 } 00871 00872 static void 00873 update_char_offset(VALUE match) 00874 { 00875 struct rmatch *rm = RMATCH(match)->rmatch; 00876 struct re_registers *regs; 00877 int i, num_regs, num_pos; 00878 long c; 00879 char *s, *p, *q; 00880 rb_encoding *enc; 00881 pair_t *pairs; 00882 00883 if (rm->char_offset_updated) 00884 return; 00885 00886 regs = &rm->regs; 00887 num_regs = rm->regs.num_regs; 00888 00889 if (rm->char_offset_num_allocated < num_regs) { 00890 REALLOC_N(rm->char_offset, struct rmatch_offset, num_regs); 00891 rm->char_offset_num_allocated = num_regs; 00892 } 00893 00894 enc = rb_enc_get(RMATCH(match)->str); 00895 if (rb_enc_mbmaxlen(enc) == 1) { 00896 for (i = 0; i < num_regs; i++) { 00897 rm->char_offset[i].beg = BEG(i); 00898 rm->char_offset[i].end = END(i); 00899 } 00900 rm->char_offset_updated = 1; 00901 return; 00902 } 00903 00904 pairs = ALLOCA_N(pair_t, num_regs*2); 00905 num_pos = 0; 00906 for (i = 0; i < num_regs; i++) { 00907 if (BEG(i) < 0) 00908 continue; 00909 
pairs[num_pos++].byte_pos = BEG(i); 00910 pairs[num_pos++].byte_pos = END(i); 00911 } 00912 qsort(pairs, num_pos, sizeof(pair_t), pair_byte_cmp); 00913 00914 s = p = RSTRING_PTR(RMATCH(match)->str); 00915 c = 0; 00916 for (i = 0; i < num_pos; i++) { 00917 q = s + pairs[i].byte_pos; 00918 c += rb_enc_strlen(p, q, enc); 00919 pairs[i].char_pos = c; 00920 p = q; 00921 } 00922 00923 for (i = 0; i < num_regs; i++) { 00924 pair_t key, *found; 00925 if (BEG(i) < 0) { 00926 rm->char_offset[i].beg = -1; 00927 rm->char_offset[i].end = -1; 00928 continue; 00929 } 00930 00931 key.byte_pos = BEG(i); 00932 found = bsearch(&key, pairs, num_pos, sizeof(pair_t), pair_byte_cmp); 00933 rm->char_offset[i].beg = found->char_pos; 00934 00935 key.byte_pos = END(i); 00936 found = bsearch(&key, pairs, num_pos, sizeof(pair_t), pair_byte_cmp); 00937 rm->char_offset[i].end = found->char_pos; 00938 } 00939 00940 rm->char_offset_updated = 1; 00941 } 00942 00943 static void 00944 match_check(VALUE match) 00945 { 00946 if (!RMATCH(match)->regexp) { 00947 rb_raise(rb_eTypeError, "uninitialized Match"); 00948 } 00949 } 00950 00951 /* :nodoc: */ 00952 static VALUE 00953 match_init_copy(VALUE obj, VALUE orig) 00954 { 00955 struct rmatch *rm; 00956 00957 if (!OBJ_INIT_COPY(obj, orig)) return obj; 00958 00959 RMATCH(obj)->str = RMATCH(orig)->str; 00960 RMATCH(obj)->regexp = RMATCH(orig)->regexp; 00961 00962 rm = RMATCH(obj)->rmatch; 00963 onig_region_copy(&rm->regs, RMATCH_REGS(orig)); 00964 00965 if (!RMATCH(orig)->rmatch->char_offset_updated) { 00966 rm->char_offset_updated = 0; 00967 } 00968 else { 00969 if (rm->char_offset_num_allocated < rm->regs.num_regs) { 00970 REALLOC_N(rm->char_offset, struct rmatch_offset, rm->regs.num_regs); 00971 rm->char_offset_num_allocated = rm->regs.num_regs; 00972 } 00973 MEMCPY(rm->char_offset, RMATCH(orig)->rmatch->char_offset, 00974 struct rmatch_offset, rm->regs.num_regs); 00975 rm->char_offset_updated = 1; 00976 } 00977 00978 return obj; 00979 } 00980 00981 00982 
/* 00983 * call-seq: 00984 * mtch.regexp -> regexp 00985 * 00986 * Returns the regexp. 00987 * 00988 * m = /a.*b/.match("abc") 00989 * m.regexp #=> /a.*b/ 00990 */ 00991 00992 static VALUE 00993 match_regexp(VALUE match) 00994 { 00995 match_check(match); 00996 return RMATCH(match)->regexp; 00997 } 00998 00999 /* 01000 * call-seq: 01001 * mtch.names -> [name1, name2, ...] 01002 * 01003 * Returns a list of names of captures as an array of strings. 01004 * It is same as mtch.regexp.names. 01005 * 01006 * /(?<foo>.)(?<bar>.)(?<baz>.)/.match("hoge").names 01007 * #=> ["foo", "bar", "baz"] 01008 * 01009 * m = /(?<x>.)(?<y>.)?/.match("a") #=> #<MatchData "a" x:"a" y:nil> 01010 * m.names #=> ["x", "y"] 01011 */ 01012 01013 static VALUE 01014 match_names(VALUE match) 01015 { 01016 match_check(match); 01017 return rb_reg_names(RMATCH(match)->regexp); 01018 } 01019 01020 /* 01021 * call-seq: 01022 * mtch.length -> integer 01023 * mtch.size -> integer 01024 * 01025 * Returns the number of elements in the match array. 
01026 * 01027 * m = /(.)(.)(\d+)(\d)/.match("THX1138.") 01028 * m.length #=> 5 01029 * m.size #=> 5 01030 */ 01031 01032 static VALUE 01033 match_size(VALUE match) 01034 { 01035 match_check(match); 01036 return INT2FIX(RMATCH_REGS(match)->num_regs); 01037 } 01038 01039 static int 01040 match_backref_number(VALUE match, VALUE backref) 01041 { 01042 const char *name; 01043 int num; 01044 01045 struct re_registers *regs = RMATCH_REGS(match); 01046 VALUE regexp = RMATCH(match)->regexp; 01047 01048 match_check(match); 01049 switch (TYPE(backref)) { 01050 default: 01051 return NUM2INT(backref); 01052 01053 case T_SYMBOL: 01054 name = rb_id2name(SYM2ID(backref)); 01055 break; 01056 01057 case T_STRING: 01058 name = StringValueCStr(backref); 01059 break; 01060 } 01061 01062 num = onig_name_to_backref_number(RREGEXP(regexp)->ptr, 01063 (const unsigned char*)name, 01064 (const unsigned char*)name + strlen(name), 01065 regs); 01066 01067 if (num < 1) { 01068 rb_raise(rb_eIndexError, "undefined group name reference: %s", name); 01069 } 01070 01071 return num; 01072 } 01073 01074 int 01075 rb_reg_backref_number(VALUE match, VALUE backref) 01076 { 01077 return match_backref_number(match, backref); 01078 } 01079 01080 /* 01081 * call-seq: 01082 * mtch.offset(n) -> array 01083 * 01084 * Returns a two-element array containing the beginning and ending offsets of 01085 * the <em>n</em>th match. 01086 * <em>n</em> can be a string or symbol to reference a named capture. 
01087 * 01088 * m = /(.)(.)(\d+)(\d)/.match("THX1138.") 01089 * m.offset(0) #=> [1, 7] 01090 * m.offset(4) #=> [6, 7] 01091 * 01092 * m = /(?<foo>.)(.)(?<bar>.)/.match("hoge") 01093 * p m.offset(:foo) #=> [0, 1] 01094 * p m.offset(:bar) #=> [2, 3] 01095 * 01096 */ 01097 01098 static VALUE 01099 match_offset(VALUE match, VALUE n) 01100 { 01101 int i = match_backref_number(match, n); 01102 struct re_registers *regs = RMATCH_REGS(match); 01103 01104 match_check(match); 01105 if (i < 0 || regs->num_regs <= i) 01106 rb_raise(rb_eIndexError, "index %d out of matches", i); 01107 01108 if (BEG(i) < 0) 01109 return rb_assoc_new(Qnil, Qnil); 01110 01111 update_char_offset(match); 01112 return rb_assoc_new(INT2FIX(RMATCH(match)->rmatch->char_offset[i].beg), 01113 INT2FIX(RMATCH(match)->rmatch->char_offset[i].end)); 01114 } 01115 01116 01117 /* 01118 * call-seq: 01119 * mtch.begin(n) -> integer 01120 * 01121 * Returns the offset of the start of the <em>n</em>th element of the match 01122 * array in the string. 01123 * <em>n</em> can be a string or symbol to reference a named capture. 
01124 * 01125 * m = /(.)(.)(\d+)(\d)/.match("THX1138.") 01126 * m.begin(0) #=> 1 01127 * m.begin(2) #=> 2 01128 * 01129 * m = /(?<foo>.)(.)(?<bar>.)/.match("hoge") 01130 * p m.begin(:foo) #=> 0 01131 * p m.begin(:bar) #=> 2 01132 */ 01133 01134 static VALUE 01135 match_begin(VALUE match, VALUE n) 01136 { 01137 int i = match_backref_number(match, n); 01138 struct re_registers *regs = RMATCH_REGS(match); 01139 01140 match_check(match); 01141 if (i < 0 || regs->num_regs <= i) 01142 rb_raise(rb_eIndexError, "index %d out of matches", i); 01143 01144 if (BEG(i) < 0) 01145 return Qnil; 01146 01147 update_char_offset(match); 01148 return INT2FIX(RMATCH(match)->rmatch->char_offset[i].beg); 01149 } 01150 01151 01152 /* 01153 * call-seq: 01154 * mtch.end(n) -> integer 01155 * 01156 * Returns the offset of the character immediately following the end of the 01157 * <em>n</em>th element of the match array in the string. 01158 * <em>n</em> can be a string or symbol to reference a named capture. 01159 * 01160 * m = /(.)(.)(\d+)(\d)/.match("THX1138.") 01161 * m.end(0) #=> 7 01162 * m.end(2) #=> 3 01163 * 01164 * m = /(?<foo>.)(.)(?<bar>.)/.match("hoge") 01165 * p m.end(:foo) #=> 1 01166 * p m.end(:bar) #=> 3 01167 */ 01168 01169 static VALUE 01170 match_end(VALUE match, VALUE n) 01171 { 01172 int i = match_backref_number(match, n); 01173 struct re_registers *regs = RMATCH_REGS(match); 01174 01175 match_check(match); 01176 if (i < 0 || regs->num_regs <= i) 01177 rb_raise(rb_eIndexError, "index %d out of matches", i); 01178 01179 if (BEG(i) < 0) 01180 return Qnil; 01181 01182 update_char_offset(match); 01183 return INT2FIX(RMATCH(match)->rmatch->char_offset[i].end); 01184 } 01185 01186 #define MATCH_BUSY FL_USER2 01187 01188 void 01189 rb_match_busy(VALUE match) 01190 { 01191 FL_SET(match, MATCH_BUSY); 01192 } 01193 01194 /* 01195 * call-seq: 01196 * rxp.fixed_encoding? 
-> true or false 01197 * 01198 * Returns false if rxp is applicable to 01199 * a string with any ASCII compatible encoding. 01200 * Returns true otherwise. 01201 * 01202 * r = /a/ 01203 * r.fixed_encoding? #=> false 01204 * r =~ "\u{6666} a" #=> 2 01205 * r =~ "\xa1\xa2 a".force_encoding("euc-jp") #=> 2 01206 * r =~ "abc".force_encoding("euc-jp") #=> 0 01207 * 01208 * r = /a/u 01209 * r.fixed_encoding? #=> true 01210 * r.encoding #=> #<Encoding:UTF-8> 01211 * r =~ "\u{6666} a" #=> 2 01212 * r =~ "\xa1\xa2".force_encoding("euc-jp") #=> ArgumentError 01213 * r =~ "abc".force_encoding("euc-jp") #=> 0 01214 * 01215 * r = /\u{6666}/ 01216 * r.fixed_encoding? #=> true 01217 * r.encoding #=> #<Encoding:UTF-8> 01218 * r =~ "\u{6666} a" #=> 0 01219 * r =~ "\xa1\xa2".force_encoding("euc-jp") #=> ArgumentError 01220 * r =~ "abc".force_encoding("euc-jp") #=> nil 01221 */ 01222 01223 static VALUE 01224 rb_reg_fixed_encoding_p(VALUE re) 01225 { 01226 if (FL_TEST(re, KCODE_FIXED)) 01227 return Qtrue; 01228 else 01229 return Qfalse; 01230 } 01231 01232 static VALUE 01233 rb_reg_preprocess(const char *p, const char *end, rb_encoding *enc, 01234 rb_encoding **fixed_enc, onig_errmsg_buffer err); 01235 01236 01237 static void 01238 reg_enc_error(VALUE re, VALUE str) 01239 { 01240 rb_raise(rb_eEncCompatError, 01241 "incompatible encoding regexp match (%s regexp with %s string)", 01242 rb_enc_name(rb_enc_get(re)), 01243 rb_enc_name(rb_enc_get(str))); 01244 } 01245 01246 static rb_encoding* 01247 rb_reg_prepare_enc(VALUE re, VALUE str, int warn) 01248 { 01249 rb_encoding *enc = 0; 01250 01251 if (rb_enc_str_coderange(str) == ENC_CODERANGE_BROKEN) { 01252 rb_raise(rb_eArgError, 01253 "invalid byte sequence in %s", 01254 rb_enc_name(rb_enc_get(str))); 01255 } 01256 01257 rb_reg_check(re); 01258 enc = rb_enc_get(str); 01259 if (!rb_enc_str_asciicompat_p(str)) { 01260 if (RREGEXP(re)->ptr->enc != enc) { 01261 reg_enc_error(re, str); 01262 } 01263 } 01264 else if (rb_reg_fixed_encoding_p(re)) 
{ 01265 if (RREGEXP(re)->ptr->enc != enc && 01266 (!rb_enc_asciicompat(RREGEXP(re)->ptr->enc) || 01267 rb_enc_str_coderange(str) != ENC_CODERANGE_7BIT)) { 01268 reg_enc_error(re, str); 01269 } 01270 enc = RREGEXP(re)->ptr->enc; 01271 } 01272 if (warn && (RBASIC(re)->flags & REG_ENCODING_NONE) && 01273 enc != rb_ascii8bit_encoding() && 01274 rb_enc_str_coderange(str) != ENC_CODERANGE_7BIT) { 01275 rb_warn("regexp match /.../n against to %s string", 01276 rb_enc_name(enc)); 01277 } 01278 return enc; 01279 } 01280 01281 regex_t * 01282 rb_reg_prepare_re(VALUE re, VALUE str) 01283 { 01284 regex_t *reg = RREGEXP(re)->ptr; 01285 onig_errmsg_buffer err = ""; 01286 int r; 01287 OnigErrorInfo einfo; 01288 const char *pattern; 01289 VALUE unescaped; 01290 rb_encoding *fixed_enc = 0; 01291 rb_encoding *enc = rb_reg_prepare_enc(re, str, 1); 01292 01293 if (reg->enc == enc) return reg; 01294 01295 rb_reg_check(re); 01296 reg = RREGEXP(re)->ptr; 01297 pattern = RREGEXP_SRC_PTR(re); 01298 01299 unescaped = rb_reg_preprocess( 01300 pattern, pattern + RREGEXP_SRC_LEN(re), enc, 01301 &fixed_enc, err); 01302 01303 if (unescaped == Qnil) { 01304 rb_raise(rb_eArgError, "regexp preprocess failed: %s", err); 01305 } 01306 01307 r = onig_new(®, (UChar* )RSTRING_PTR(unescaped), 01308 (UChar* )(RSTRING_PTR(unescaped) + RSTRING_LEN(unescaped)), 01309 reg->options, enc, 01310 OnigDefaultSyntax, &einfo); 01311 if (r) { 01312 onig_error_code_to_str((UChar*)err, r, &einfo); 01313 rb_reg_raise(pattern, RREGEXP_SRC_LEN(re), err, re); 01314 } 01315 01316 RB_GC_GUARD(unescaped); 01317 return reg; 01318 } 01319 01320 long 01321 rb_reg_adjust_startpos(VALUE re, VALUE str, long pos, int reverse) 01322 { 01323 long range; 01324 rb_encoding *enc; 01325 UChar *p, *string; 01326 01327 enc = rb_reg_prepare_enc(re, str, 0); 01328 01329 if (reverse) { 01330 range = -pos; 01331 } 01332 else { 01333 range = RSTRING_LEN(str) - pos; 01334 } 01335 01336 if (pos > 0 && ONIGENC_MBC_MAXLEN(enc) != 1 && pos < 
RSTRING_LEN(str)) { 01337 string = (UChar*)RSTRING_PTR(str); 01338 01339 if (range > 0) { 01340 p = onigenc_get_right_adjust_char_head(enc, string, string + pos, string + RSTRING_LEN(str)); 01341 } 01342 else { 01343 p = ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, string, string + pos, string + RSTRING_LEN(str)); 01344 } 01345 return p - string; 01346 } 01347 01348 return pos; 01349 } 01350 01351 long 01352 rb_reg_search(VALUE re, VALUE str, long pos, int reverse) 01353 { 01354 long result; 01355 VALUE match; 01356 struct re_registers regi, *regs = ®i; 01357 char *range = RSTRING_PTR(str); 01358 regex_t *reg; 01359 int tmpreg; 01360 01361 if (pos > RSTRING_LEN(str) || pos < 0) { 01362 rb_backref_set(Qnil); 01363 return -1; 01364 } 01365 01366 reg = rb_reg_prepare_re(re, str); 01367 tmpreg = reg != RREGEXP(re)->ptr; 01368 if (!tmpreg) RREGEXP(re)->usecnt++; 01369 01370 match = rb_backref_get(); 01371 if (!NIL_P(match)) { 01372 if (FL_TEST(match, MATCH_BUSY)) { 01373 match = Qnil; 01374 } 01375 else { 01376 regs = RMATCH_REGS(match); 01377 } 01378 } 01379 if (NIL_P(match)) { 01380 MEMZERO(regs, struct re_registers, 1); 01381 } 01382 if (!reverse) { 01383 range += RSTRING_LEN(str); 01384 } 01385 result = onig_search(reg, 01386 (UChar*)(RSTRING_PTR(str)), 01387 ((UChar*)(RSTRING_PTR(str)) + RSTRING_LEN(str)), 01388 ((UChar*)(RSTRING_PTR(str)) + pos), 01389 ((UChar*)range), 01390 regs, ONIG_OPTION_NONE); 01391 if (!tmpreg) RREGEXP(re)->usecnt--; 01392 if (tmpreg) { 01393 if (RREGEXP(re)->usecnt) { 01394 onig_free(reg); 01395 } 01396 else { 01397 onig_free(RREGEXP(re)->ptr); 01398 RREGEXP(re)->ptr = reg; 01399 } 01400 } 01401 if (result < 0) { 01402 if (regs == ®i) 01403 onig_region_free(regs, 0); 01404 if (result == ONIG_MISMATCH) { 01405 rb_backref_set(Qnil); 01406 return result; 01407 } 01408 else { 01409 onig_errmsg_buffer err = ""; 01410 onig_error_code_to_str((UChar*)err, (int)result); 01411 rb_reg_raise(RREGEXP_SRC_PTR(re), RREGEXP_SRC_LEN(re), err, re); 01412 } 01413 } 
01414 01415 if (NIL_P(match)) { 01416 match = match_alloc(rb_cMatch); 01417 onig_region_copy(RMATCH_REGS(match), regs); 01418 onig_region_free(regs, 0); 01419 } 01420 else { 01421 if (rb_safe_level() >= 3) 01422 OBJ_TAINT(match); 01423 else 01424 FL_UNSET(match, FL_TAINT); 01425 } 01426 01427 RMATCH(match)->str = rb_str_new4(str); 01428 RMATCH(match)->regexp = re; 01429 RMATCH(match)->rmatch->char_offset_updated = 0; 01430 rb_backref_set(match); 01431 01432 OBJ_INFECT(match, re); 01433 OBJ_INFECT(match, str); 01434 01435 return result; 01436 } 01437 01438 VALUE 01439 rb_reg_nth_defined(int nth, VALUE match) 01440 { 01441 struct re_registers *regs; 01442 if (NIL_P(match)) return Qnil; 01443 match_check(match); 01444 regs = RMATCH_REGS(match); 01445 if (nth >= regs->num_regs) { 01446 return Qnil; 01447 } 01448 if (nth < 0) { 01449 nth += regs->num_regs; 01450 if (nth <= 0) return Qnil; 01451 } 01452 if (BEG(nth) == -1) return Qfalse; 01453 return Qtrue; 01454 } 01455 01456 VALUE 01457 rb_reg_nth_match(int nth, VALUE match) 01458 { 01459 VALUE str; 01460 long start, end, len; 01461 struct re_registers *regs; 01462 01463 if (NIL_P(match)) return Qnil; 01464 match_check(match); 01465 regs = RMATCH_REGS(match); 01466 if (nth >= regs->num_regs) { 01467 return Qnil; 01468 } 01469 if (nth < 0) { 01470 nth += regs->num_regs; 01471 if (nth <= 0) return Qnil; 01472 } 01473 start = BEG(nth); 01474 if (start == -1) return Qnil; 01475 end = END(nth); 01476 len = end - start; 01477 str = rb_str_subseq(RMATCH(match)->str, start, len); 01478 OBJ_INFECT(str, match); 01479 return str; 01480 } 01481 01482 VALUE 01483 rb_reg_last_match(VALUE match) 01484 { 01485 return rb_reg_nth_match(0, match); 01486 } 01487 01488 01489 /* 01490 * call-seq: 01491 * mtch.pre_match -> str 01492 * 01493 * Returns the portion of the original string before the current match. 01494 * Equivalent to the special variable <code>$`</code>. 
01495 * 01496 * m = /(.)(.)(\d+)(\d)/.match("THX1138.") 01497 * m.pre_match #=> "T" 01498 */ 01499 01500 VALUE 01501 rb_reg_match_pre(VALUE match) 01502 { 01503 VALUE str; 01504 struct re_registers *regs; 01505 01506 if (NIL_P(match)) return Qnil; 01507 match_check(match); 01508 regs = RMATCH_REGS(match); 01509 if (BEG(0) == -1) return Qnil; 01510 str = rb_str_subseq(RMATCH(match)->str, 0, BEG(0)); 01511 if (OBJ_TAINTED(match)) OBJ_TAINT(str); 01512 return str; 01513 } 01514 01515 01516 /* 01517 * call-seq: 01518 * mtch.post_match -> str 01519 * 01520 * Returns the portion of the original string after the current match. 01521 * Equivalent to the special variable <code>$'</code>. 01522 * 01523 * m = /(.)(.)(\d+)(\d)/.match("THX1138: The Movie") 01524 * m.post_match #=> ": The Movie" 01525 */ 01526 01527 VALUE 01528 rb_reg_match_post(VALUE match) 01529 { 01530 VALUE str; 01531 long pos; 01532 struct re_registers *regs; 01533 01534 if (NIL_P(match)) return Qnil; 01535 match_check(match); 01536 regs = RMATCH_REGS(match); 01537 if (BEG(0) == -1) return Qnil; 01538 str = RMATCH(match)->str; 01539 pos = END(0); 01540 str = rb_str_subseq(str, pos, RSTRING_LEN(str) - pos); 01541 if (OBJ_TAINTED(match)) OBJ_TAINT(str); 01542 return str; 01543 } 01544 01545 VALUE 01546 rb_reg_match_last(VALUE match) 01547 { 01548 int i; 01549 struct re_registers *regs; 01550 01551 if (NIL_P(match)) return Qnil; 01552 match_check(match); 01553 regs = RMATCH_REGS(match); 01554 if (BEG(0) == -1) return Qnil; 01555 01556 for (i=regs->num_regs-1; BEG(i) == -1 && i > 0; i--) 01557 ; 01558 if (i == 0) return Qnil; 01559 return rb_reg_nth_match(i, match); 01560 } 01561 01562 static VALUE 01563 last_match_getter(void) 01564 { 01565 return rb_reg_last_match(rb_backref_get()); 01566 } 01567 01568 static VALUE 01569 prematch_getter(void) 01570 { 01571 return rb_reg_match_pre(rb_backref_get()); 01572 } 01573 01574 static VALUE 01575 postmatch_getter(void) 01576 { 01577 return 
rb_reg_match_post(rb_backref_get()); 01578 } 01579 01580 static VALUE 01581 last_paren_match_getter(void) 01582 { 01583 return rb_reg_match_last(rb_backref_get()); 01584 } 01585 01586 static VALUE 01587 match_array(VALUE match, int start) 01588 { 01589 struct re_registers *regs; 01590 VALUE ary; 01591 VALUE target; 01592 int i; 01593 int taint = OBJ_TAINTED(match); 01594 01595 match_check(match); 01596 regs = RMATCH_REGS(match); 01597 ary = rb_ary_new2(regs->num_regs); 01598 target = RMATCH(match)->str; 01599 01600 for (i=start; i<regs->num_regs; i++) { 01601 if (regs->beg[i] == -1) { 01602 rb_ary_push(ary, Qnil); 01603 } 01604 else { 01605 VALUE str = rb_str_subseq(target, regs->beg[i], regs->end[i]-regs->beg[i]); 01606 if (taint) OBJ_TAINT(str); 01607 rb_ary_push(ary, str); 01608 } 01609 } 01610 return ary; 01611 } 01612 01613 01614 /* [MG]:FIXME: I put parens around the /.../.match() in the first line of the 01615 second example to prevent the '*' followed by a '/' from ending the 01616 comment. */ 01617 01618 /* 01619 * call-seq: 01620 * mtch.to_a -> anArray 01621 * 01622 * Returns the array of matches. 01623 * 01624 * m = /(.)(.)(\d+)(\d)/.match("THX1138.") 01625 * m.to_a #=> ["HX1138", "H", "X", "113", "8"] 01626 * 01627 * Because <code>to_a</code> is called when expanding 01628 * <code>*</code><em>variable</em>, there's a useful assignment 01629 * shortcut for extracting matched fields. This is slightly slower than 01630 * accessing the fields directly (as an intermediate array is 01631 * generated). 01632 * 01633 * all,f1,f2,f3 = *(/(.)(.)(\d+)(\d)/.match("THX1138.")) 01634 * all #=> "HX1138" 01635 * f1 #=> "H" 01636 * f2 #=> "X" 01637 * f3 #=> "113" 01638 */ 01639 01640 static VALUE 01641 match_to_a(VALUE match) 01642 { 01643 return match_array(match, 0); 01644 } 01645 01646 01647 /* 01648 * call-seq: 01649 * mtch.captures -> array 01650 * 01651 * Returns the array of captures; equivalent to <code>mtch.to_a[1..-1]</code>. 
01652 * 01653 * f1,f2,f3,f4 = /(.)(.)(\d+)(\d)/.match("THX1138.").captures 01654 * f1 #=> "H" 01655 * f2 #=> "X" 01656 * f3 #=> "113" 01657 * f4 #=> "8" 01658 */ 01659 static VALUE 01660 match_captures(VALUE match) 01661 { 01662 return match_array(match, 1); 01663 } 01664 01665 static int 01666 name_to_backref_number(struct re_registers *regs, VALUE regexp, const char* name, const char* name_end) 01667 { 01668 int num; 01669 01670 num = onig_name_to_backref_number(RREGEXP(regexp)->ptr, 01671 (const unsigned char* )name, (const unsigned char* )name_end, regs); 01672 if (num >= 1) { 01673 return num; 01674 } 01675 else { 01676 VALUE s = rb_str_new(name, (long )(name_end - name)); 01677 rb_raise(rb_eIndexError, "undefined group name reference: %s", 01678 StringValuePtr(s)); 01679 } 01680 01681 UNREACHABLE; 01682 } 01683 01684 /* 01685 * call-seq: 01686 * mtch[i] -> str or nil 01687 * mtch[start, length] -> array 01688 * mtch[range] -> array 01689 * mtch[name] -> str or nil 01690 * 01691 * Match Reference -- <code>MatchData</code> acts as an array, and may be 01692 * accessed using the normal array indexing techniques. <code>mtch[0]</code> 01693 * is equivalent to the special variable <code>$&</code>, and returns the 01694 * entire matched string. <code>mtch[1]</code>, <code>mtch[2]</code>, and so 01695 * on return the values of the matched backreferences (portions of the 01696 * pattern between parentheses). 
01697 * 01698 * m = /(.)(.)(\d+)(\d)/.match("THX1138.") 01699 * m #=> #<MatchData "HX1138" 1:"H" 2:"X" 3:"113" 4:"8"> 01700 * m[0] #=> "HX1138" 01701 * m[1, 2] #=> ["H", "X"] 01702 * m[1..3] #=> ["H", "X", "113"] 01703 * m[-3, 2] #=> ["X", "113"] 01704 * 01705 * m = /(?<foo>a+)b/.match("ccaaab") 01706 * m #=> #<MatchData "aaab" foo:"aaa"> 01707 * m["foo"] #=> "aaa" 01708 * m[:foo] #=> "aaa" 01709 */ 01710 01711 static VALUE 01712 match_aref(int argc, VALUE *argv, VALUE match) 01713 { 01714 VALUE idx, rest; 01715 01716 match_check(match); 01717 rb_scan_args(argc, argv, "11", &idx, &rest); 01718 01719 if (NIL_P(rest)) { 01720 if (FIXNUM_P(idx)) { 01721 if (FIX2INT(idx) >= 0) { 01722 return rb_reg_nth_match(FIX2INT(idx), match); 01723 } 01724 } 01725 else { 01726 const char *p; 01727 int num; 01728 01729 switch (TYPE(idx)) { 01730 case T_SYMBOL: 01731 p = rb_id2name(SYM2ID(idx)); 01732 goto name_to_backref; 01733 break; 01734 case T_STRING: 01735 p = StringValuePtr(idx); 01736 01737 name_to_backref: 01738 num = name_to_backref_number(RMATCH_REGS(match), 01739 RMATCH(match)->regexp, p, p + strlen(p)); 01740 return rb_reg_nth_match(num, match); 01741 break; 01742 01743 default: 01744 break; 01745 } 01746 } 01747 } 01748 01749 return rb_ary_aref(argc, argv, match_to_a(match)); 01750 } 01751 01752 static VALUE 01753 match_entry(VALUE match, long n) 01754 { 01755 /* n should not exceed num_regs */ 01756 return rb_reg_nth_match((int)n, match); 01757 } 01758 01759 01760 /* 01761 * call-seq: 01762 * 01763 * mtch.values_at([index]*) -> array 01764 * 01765 * Uses each <i>index</i> to access the matching values, returning an array of 01766 * the corresponding matches. 
01767 * 01768 * m = /(.)(.)(\d+)(\d)/.match("THX1138: The Movie") 01769 * m.to_a #=> ["HX1138", "H", "X", "113", "8"] 01770 * m.values_at(0, 2, -2) #=> ["HX1138", "X", "113"] 01771 */ 01772 01773 static VALUE 01774 match_values_at(int argc, VALUE *argv, VALUE match) 01775 { 01776 struct re_registers *regs; 01777 01778 match_check(match); 01779 regs = RMATCH_REGS(match); 01780 return rb_get_values_at(match, regs->num_regs, argc, argv, match_entry); 01781 } 01782 01783 01784 /* 01785 * call-seq: 01786 * mtch.to_s -> str 01787 * 01788 * Returns the entire matched string. 01789 * 01790 * m = /(.)(.)(\d+)(\d)/.match("THX1138.") 01791 * m.to_s #=> "HX1138" 01792 */ 01793 01794 static VALUE 01795 match_to_s(VALUE match) 01796 { 01797 VALUE str = rb_reg_last_match(match); 01798 01799 match_check(match); 01800 if (NIL_P(str)) str = rb_str_new(0,0); 01801 if (OBJ_TAINTED(match)) OBJ_TAINT(str); 01802 if (OBJ_TAINTED(RMATCH(match)->str)) OBJ_TAINT(str); 01803 return str; 01804 } 01805 01806 01807 /* 01808 * call-seq: 01809 * mtch.string -> str 01810 * 01811 * Returns a frozen copy of the string passed in to <code>match</code>. 01812 * 01813 * m = /(.)(.)(\d+)(\d)/.match("THX1138.") 01814 * m.string #=> "THX1138." 
01815 */ 01816 01817 static VALUE 01818 match_string(VALUE match) 01819 { 01820 match_check(match); 01821 return RMATCH(match)->str; /* str is frozen */ 01822 } 01823 01824 struct backref_name_tag { 01825 const UChar *name; 01826 long len; 01827 }; 01828 01829 static int 01830 match_inspect_name_iter(const OnigUChar *name, const OnigUChar *name_end, 01831 int back_num, int *back_refs, OnigRegex regex, void *arg0) 01832 { 01833 struct backref_name_tag *arg = (struct backref_name_tag *)arg0; 01834 int i; 01835 01836 for (i = 0; i < back_num; i++) { 01837 arg[back_refs[i]].name = name; 01838 arg[back_refs[i]].len = name_end - name; 01839 } 01840 return 0; 01841 } 01842 01843 /* 01844 * call-seq: 01845 * mtch.inspect -> str 01846 * 01847 * Returns a printable version of <i>mtch</i>. 01848 * 01849 * puts /.$/.match("foo").inspect 01850 * #=> #<MatchData "o"> 01851 * 01852 * puts /(.)(.)(.)/.match("foo").inspect 01853 * #=> #<MatchData "foo" 1:"f" 2:"o" 3:"o"> 01854 * 01855 * puts /(.)(.)?(.)/.match("fo").inspect 01856 * #=> #<MatchData "fo" 1:"f" 2:nil 3:"o"> 01857 * 01858 * puts /(?<foo>.)(?<bar>.)(?<baz>.)/.match("hoge").inspect 01859 * #=> #<MatchData "hog" foo:"h" bar:"o" baz:"g"> 01860 * 01861 */ 01862 01863 static VALUE 01864 match_inspect(VALUE match) 01865 { 01866 const char *cname = rb_obj_classname(match); 01867 VALUE str; 01868 int i; 01869 struct re_registers *regs = RMATCH_REGS(match); 01870 int num_regs = regs->num_regs; 01871 struct backref_name_tag *names; 01872 VALUE regexp = RMATCH(match)->regexp; 01873 01874 if (regexp == 0) { 01875 return rb_sprintf("#<%s:%p>", cname, (void*)match); 01876 } 01877 01878 names = ALLOCA_N(struct backref_name_tag, num_regs); 01879 MEMZERO(names, struct backref_name_tag, num_regs); 01880 01881 onig_foreach_name(RREGEXP(regexp)->ptr, 01882 match_inspect_name_iter, names); 01883 01884 str = rb_str_buf_new2("#<"); 01885 rb_str_buf_cat2(str, cname); 01886 01887 for (i = 0; i < num_regs; i++) { 01888 VALUE v; 01889 
rb_str_buf_cat2(str, " "); 01890 if (0 < i) { 01891 if (names[i].name) 01892 rb_str_buf_cat(str, (const char *)names[i].name, names[i].len); 01893 else { 01894 rb_str_catf(str, "%d", i); 01895 } 01896 rb_str_buf_cat2(str, ":"); 01897 } 01898 v = rb_reg_nth_match(i, match); 01899 if (v == Qnil) 01900 rb_str_buf_cat2(str, "nil"); 01901 else 01902 rb_str_buf_append(str, rb_str_inspect(v)); 01903 } 01904 rb_str_buf_cat2(str, ">"); 01905 01906 return str; 01907 } 01908 01909 VALUE rb_cRegexp; 01910 01911 static int 01912 read_escaped_byte(const char **pp, const char *end, onig_errmsg_buffer err) 01913 { 01914 const char *p = *pp; 01915 int code; 01916 int meta_prefix = 0, ctrl_prefix = 0; 01917 size_t len; 01918 01919 if (p == end || *p++ != '\\') { 01920 errcpy(err, "too short escaped multibyte character"); 01921 return -1; 01922 } 01923 01924 again: 01925 if (p == end) { 01926 errcpy(err, "too short escape sequence"); 01927 return -1; 01928 } 01929 switch (*p++) { 01930 case '\\': code = '\\'; break; 01931 case 'n': code = '\n'; break; 01932 case 't': code = '\t'; break; 01933 case 'r': code = '\r'; break; 01934 case 'f': code = '\f'; break; 01935 case 'v': code = '\013'; break; 01936 case 'a': code = '\007'; break; 01937 case 'e': code = '\033'; break; 01938 01939 /* \OOO */ 01940 case '0': case '1': case '2': case '3': 01941 case '4': case '5': case '6': case '7': 01942 p--; 01943 code = scan_oct(p, end < p+3 ? end-p : 3, &len); 01944 p += len; 01945 break; 01946 01947 case 'x': /* \xHH */ 01948 code = scan_hex(p, end < p+2 ? 
end-p : 2, &len); 01949 if (len < 1) { 01950 errcpy(err, "invalid hex escape"); 01951 return -1; 01952 } 01953 p += len; 01954 break; 01955 01956 case 'M': /* \M-X, \M-\C-X, \M-\cX */ 01957 if (meta_prefix) { 01958 errcpy(err, "duplicate meta escape"); 01959 return -1; 01960 } 01961 meta_prefix = 1; 01962 if (p+1 < end && *p++ == '-' && (*p & 0x80) == 0) { 01963 if (*p == '\\') { 01964 p++; 01965 goto again; 01966 } 01967 else { 01968 code = *p++; 01969 break; 01970 } 01971 } 01972 errcpy(err, "too short meta escape"); 01973 return -1; 01974 01975 case 'C': /* \C-X, \C-\M-X */ 01976 if (p == end || *p++ != '-') { 01977 errcpy(err, "too short control escape"); 01978 return -1; 01979 } 01980 case 'c': /* \cX, \c\M-X */ 01981 if (ctrl_prefix) { 01982 errcpy(err, "duplicate control escape"); 01983 return -1; 01984 } 01985 ctrl_prefix = 1; 01986 if (p < end && (*p & 0x80) == 0) { 01987 if (*p == '\\') { 01988 p++; 01989 goto again; 01990 } 01991 else { 01992 code = *p++; 01993 break; 01994 } 01995 } 01996 errcpy(err, "too short control escape"); 01997 return -1; 01998 01999 default: 02000 errcpy(err, "unexpected escape sequence"); 02001 return -1; 02002 } 02003 if (code < 0 || 0xff < code) { 02004 errcpy(err, "invalid escape code"); 02005 return -1; 02006 } 02007 02008 if (ctrl_prefix) 02009 code &= 0x1f; 02010 if (meta_prefix) 02011 code |= 0x80; 02012 02013 *pp = p; 02014 return code; 02015 } 02016 02017 static int 02018 unescape_escaped_nonascii(const char **pp, const char *end, rb_encoding *enc, 02019 VALUE buf, rb_encoding **encp, onig_errmsg_buffer err) 02020 { 02021 const char *p = *pp; 02022 int chmaxlen = rb_enc_mbmaxlen(enc); 02023 char *chbuf = ALLOCA_N(char, chmaxlen); 02024 int chlen = 0; 02025 int byte; 02026 int l; 02027 02028 memset(chbuf, 0, chmaxlen); 02029 02030 byte = read_escaped_byte(&p, end, err); 02031 if (byte == -1) { 02032 return -1; 02033 } 02034 02035 chbuf[chlen++] = byte; 02036 while (chlen < chmaxlen && 02037 
MBCLEN_NEEDMORE_P(rb_enc_precise_mbclen(chbuf, chbuf+chlen, enc))) { 02038 byte = read_escaped_byte(&p, end, err); 02039 if (byte == -1) { 02040 return -1; 02041 } 02042 chbuf[chlen++] = byte; 02043 } 02044 02045 l = rb_enc_precise_mbclen(chbuf, chbuf+chlen, enc); 02046 if (MBCLEN_INVALID_P(l)) { 02047 errcpy(err, "invalid multibyte escape"); 02048 return -1; 02049 } 02050 if (1 < chlen || (chbuf[0] & 0x80)) { 02051 rb_str_buf_cat(buf, chbuf, chlen); 02052 02053 if (*encp == 0) 02054 *encp = enc; 02055 else if (*encp != enc) { 02056 errcpy(err, "escaped non ASCII character in UTF-8 regexp"); 02057 return -1; 02058 } 02059 } 02060 else { 02061 char escbuf[5]; 02062 snprintf(escbuf, sizeof(escbuf), "\\x%02X", chbuf[0]&0xff); 02063 rb_str_buf_cat(buf, escbuf, 4); 02064 } 02065 *pp = p; 02066 return 0; 02067 } 02068 02069 static int 02070 check_unicode_range(unsigned long code, onig_errmsg_buffer err) 02071 { 02072 if ((0xd800 <= code && code <= 0xdfff) || /* Surrogates */ 02073 0x10ffff < code) { 02074 errcpy(err, "invalid Unicode range"); 02075 return -1; 02076 } 02077 return 0; 02078 } 02079 02080 static int 02081 append_utf8(unsigned long uv, 02082 VALUE buf, rb_encoding **encp, onig_errmsg_buffer err) 02083 { 02084 if (check_unicode_range(uv, err) != 0) 02085 return -1; 02086 if (uv < 0x80) { 02087 char escbuf[5]; 02088 snprintf(escbuf, sizeof(escbuf), "\\x%02X", (int)uv); 02089 rb_str_buf_cat(buf, escbuf, 4); 02090 } 02091 else { 02092 int len; 02093 char utf8buf[6]; 02094 len = rb_uv_to_utf8(utf8buf, uv); 02095 rb_str_buf_cat(buf, utf8buf, len); 02096 02097 if (*encp == 0) 02098 *encp = rb_utf8_encoding(); 02099 else if (*encp != rb_utf8_encoding()) { 02100 errcpy(err, "UTF-8 character in non UTF-8 regexp"); 02101 return -1; 02102 } 02103 } 02104 return 0; 02105 } 02106 02107 static int 02108 unescape_unicode_list(const char **pp, const char *end, 02109 VALUE buf, rb_encoding **encp, onig_errmsg_buffer err) 02110 { 02111 const char *p = *pp; 02112 int 
has_unicode = 0; 02113 unsigned long code; 02114 size_t len; 02115 02116 while (p < end && ISSPACE(*p)) p++; 02117 02118 while (1) { 02119 code = ruby_scan_hex(p, end-p, &len); 02120 if (len == 0) 02121 break; 02122 if (6 < len) { /* max 10FFFF */ 02123 errcpy(err, "invalid Unicode range"); 02124 return -1; 02125 } 02126 p += len; 02127 if (append_utf8(code, buf, encp, err) != 0) 02128 return -1; 02129 has_unicode = 1; 02130 02131 while (p < end && ISSPACE(*p)) p++; 02132 } 02133 02134 if (has_unicode == 0) { 02135 errcpy(err, "invalid Unicode list"); 02136 return -1; 02137 } 02138 02139 *pp = p; 02140 02141 return 0; 02142 } 02143 02144 static int 02145 unescape_unicode_bmp(const char **pp, const char *end, 02146 VALUE buf, rb_encoding **encp, onig_errmsg_buffer err) 02147 { 02148 const char *p = *pp; 02149 size_t len; 02150 unsigned long code; 02151 02152 if (end < p+4) { 02153 errcpy(err, "invalid Unicode escape"); 02154 return -1; 02155 } 02156 code = ruby_scan_hex(p, 4, &len); 02157 if (len != 4) { 02158 errcpy(err, "invalid Unicode escape"); 02159 return -1; 02160 } 02161 if (append_utf8(code, buf, encp, err) != 0) 02162 return -1; 02163 *pp = p + 4; 02164 return 0; 02165 } 02166 02167 static int 02168 unescape_nonascii(const char *p, const char *end, rb_encoding *enc, 02169 VALUE buf, rb_encoding **encp, int *has_property, 02170 onig_errmsg_buffer err) 02171 { 02172 char c; 02173 char smallbuf[2]; 02174 02175 while (p < end) { 02176 int chlen = rb_enc_precise_mbclen(p, end, enc); 02177 if (!MBCLEN_CHARFOUND_P(chlen)) { 02178 errcpy(err, "invalid multibyte character"); 02179 return -1; 02180 } 02181 chlen = MBCLEN_CHARFOUND_LEN(chlen); 02182 if (1 < chlen || (*p & 0x80)) { 02183 rb_str_buf_cat(buf, p, chlen); 02184 p += chlen; 02185 if (*encp == 0) 02186 *encp = enc; 02187 else if (*encp != enc) { 02188 errcpy(err, "non ASCII character in UTF-8 regexp"); 02189 return -1; 02190 } 02191 continue; 02192 } 02193 02194 switch (c = *p++) { 02195 case '\\': 02196 if 
(p == end) { 02197 errcpy(err, "too short escape sequence"); 02198 return -1; 02199 } 02200 switch (c = *p++) { 02201 case '1': case '2': case '3': 02202 case '4': case '5': case '6': case '7': /* \O, \OO, \OOO or backref */ 02203 { 02204 size_t octlen; 02205 if (ruby_scan_oct(p-1, end-(p-1), &octlen) <= 0177) { 02206 /* backref or 7bit octal. 02207 no need to unescape anyway. 02208 re-escaping may break backref */ 02209 goto escape_asis; 02210 } 02211 } 02212 /* xxx: How about more than 199 subexpressions? */ 02213 02214 case '0': /* \0, \0O, \0OO */ 02215 02216 case 'x': /* \xHH */ 02217 case 'c': /* \cX, \c\M-X */ 02218 case 'C': /* \C-X, \C-\M-X */ 02219 case 'M': /* \M-X, \M-\C-X, \M-\cX */ 02220 p = p-2; 02221 if (unescape_escaped_nonascii(&p, end, enc, buf, encp, err) != 0) 02222 return -1; 02223 break; 02224 02225 case 'u': 02226 if (p == end) { 02227 errcpy(err, "too short escape sequence"); 02228 return -1; 02229 } 02230 if (*p == '{') { 02231 /* \u{H HH HHH HHHH HHHHH HHHHHH ...} */ 02232 p++; 02233 if (unescape_unicode_list(&p, end, buf, encp, err) != 0) 02234 return -1; 02235 if (p == end || *p++ != '}') { 02236 errcpy(err, "invalid Unicode list"); 02237 return -1; 02238 } 02239 break; 02240 } 02241 else { 02242 /* \uHHHH */ 02243 if (unescape_unicode_bmp(&p, end, buf, encp, err) != 0) 02244 return -1; 02245 break; 02246 } 02247 02248 case 'p': /* \p{Hiragana} */ 02249 case 'P': 02250 if (!*encp) { 02251 *has_property = 1; 02252 } 02253 goto escape_asis; 02254 02255 default: /* \n, \\, \d, \9, etc. 
*/ 02256 escape_asis: 02257 smallbuf[0] = '\\'; 02258 smallbuf[1] = c; 02259 rb_str_buf_cat(buf, smallbuf, 2); 02260 break; 02261 } 02262 break; 02263 02264 default: 02265 rb_str_buf_cat(buf, &c, 1); 02266 break; 02267 } 02268 } 02269 02270 return 0; 02271 } 02272 02273 static VALUE 02274 rb_reg_preprocess(const char *p, const char *end, rb_encoding *enc, 02275 rb_encoding **fixed_enc, onig_errmsg_buffer err) 02276 { 02277 VALUE buf; 02278 int has_property = 0; 02279 02280 buf = rb_str_buf_new(0); 02281 02282 if (rb_enc_asciicompat(enc)) 02283 *fixed_enc = 0; 02284 else { 02285 *fixed_enc = enc; 02286 rb_enc_associate(buf, enc); 02287 } 02288 02289 if (unescape_nonascii(p, end, enc, buf, fixed_enc, &has_property, err) != 0) 02290 return Qnil; 02291 02292 if (has_property && !*fixed_enc) { 02293 *fixed_enc = enc; 02294 } 02295 02296 if (*fixed_enc) { 02297 rb_enc_associate(buf, *fixed_enc); 02298 } 02299 02300 return buf; 02301 } 02302 02303 VALUE 02304 rb_reg_check_preprocess(VALUE str) 02305 { 02306 rb_encoding *fixed_enc = 0; 02307 onig_errmsg_buffer err = ""; 02308 VALUE buf; 02309 char *p, *end; 02310 rb_encoding *enc; 02311 02312 StringValue(str); 02313 p = RSTRING_PTR(str); 02314 end = p + RSTRING_LEN(str); 02315 enc = rb_enc_get(str); 02316 02317 buf = rb_reg_preprocess(p, end, enc, &fixed_enc, err); 02318 RB_GC_GUARD(str); 02319 02320 if (buf == Qnil) { 02321 return rb_reg_error_desc(str, 0, err); 02322 } 02323 return Qnil; 02324 } 02325 02326 static VALUE 02327 rb_reg_preprocess_dregexp(VALUE ary, int options) 02328 { 02329 rb_encoding *fixed_enc = 0; 02330 rb_encoding *regexp_enc = 0; 02331 onig_errmsg_buffer err = ""; 02332 int i; 02333 VALUE result = 0; 02334 rb_encoding *ascii8bit = rb_ascii8bit_encoding(); 02335 02336 if (RARRAY_LEN(ary) == 0) { 02337 rb_raise(rb_eArgError, "no arguments given"); 02338 } 02339 02340 for (i = 0; i < RARRAY_LEN(ary); i++) { 02341 VALUE str = RARRAY_PTR(ary)[i]; 02342 VALUE buf; 02343 char *p, *end; 02344 rb_encoding 
*src_enc; 02345 02346 src_enc = rb_enc_get(str); 02347 if (options & ARG_ENCODING_NONE && 02348 src_enc != ascii8bit) { 02349 if (rb_enc_str_coderange(str) != ENC_CODERANGE_7BIT) 02350 rb_raise(rb_eRegexpError, "/.../n has a non escaped non ASCII character in non ASCII-8BIT script"); 02351 else 02352 src_enc = ascii8bit; 02353 } 02354 02355 StringValue(str); 02356 p = RSTRING_PTR(str); 02357 end = p + RSTRING_LEN(str); 02358 02359 buf = rb_reg_preprocess(p, end, src_enc, &fixed_enc, err); 02360 02361 if (buf == Qnil) 02362 rb_raise(rb_eArgError, "%s", err); 02363 02364 if (fixed_enc != 0) { 02365 if (regexp_enc != 0 && regexp_enc != fixed_enc) { 02366 rb_raise(rb_eRegexpError, "encoding mismatch in dynamic regexp : %s and %s", 02367 rb_enc_name(regexp_enc), rb_enc_name(fixed_enc)); 02368 } 02369 regexp_enc = fixed_enc; 02370 } 02371 02372 if (!result) 02373 result = rb_str_new3(str); 02374 else 02375 rb_str_buf_append(result, str); 02376 } 02377 if (regexp_enc) { 02378 rb_enc_associate(result, regexp_enc); 02379 } 02380 02381 return result; 02382 } 02383 02384 static int 02385 rb_reg_initialize(VALUE obj, const char *s, long len, rb_encoding *enc, 02386 int options, onig_errmsg_buffer err, 02387 const char *sourcefile, int sourceline) 02388 { 02389 struct RRegexp *re = RREGEXP(obj); 02390 VALUE unescaped; 02391 rb_encoding *fixed_enc = 0; 02392 rb_encoding *a_enc = rb_ascii8bit_encoding(); 02393 02394 if (!OBJ_UNTRUSTED(obj) && rb_safe_level() >= 4) 02395 rb_raise(rb_eSecurityError, "Insecure: can't modify regexp"); 02396 rb_check_frozen(obj); 02397 if (FL_TEST(obj, REG_LITERAL)) 02398 rb_raise(rb_eSecurityError, "can't modify literal regexp"); 02399 if (re->ptr) 02400 rb_raise(rb_eTypeError, "already initialized regexp"); 02401 re->ptr = 0; 02402 02403 if (rb_enc_dummy_p(enc)) { 02404 errcpy(err, "can't make regexp with dummy encoding"); 02405 return -1; 02406 } 02407 02408 unescaped = rb_reg_preprocess(s, s+len, enc, &fixed_enc, err); 02409 if (unescaped == Qnil) 
02410 return -1; 02411 02412 if (fixed_enc) { 02413 if ((fixed_enc != enc && (options & ARG_ENCODING_FIXED)) || 02414 (fixed_enc != a_enc && (options & ARG_ENCODING_NONE))) { 02415 errcpy(err, "incompatible character encoding"); 02416 return -1; 02417 } 02418 if (fixed_enc != a_enc) { 02419 options |= ARG_ENCODING_FIXED; 02420 enc = fixed_enc; 02421 } 02422 } 02423 else if (!(options & ARG_ENCODING_FIXED)) { 02424 enc = rb_usascii_encoding(); 02425 } 02426 02427 rb_enc_associate((VALUE)re, enc); 02428 if ((options & ARG_ENCODING_FIXED) || fixed_enc) { 02429 re->basic.flags |= KCODE_FIXED; 02430 } 02431 if (options & ARG_ENCODING_NONE) { 02432 re->basic.flags |= REG_ENCODING_NONE; 02433 } 02434 02435 re->ptr = make_regexp(RSTRING_PTR(unescaped), RSTRING_LEN(unescaped), enc, 02436 options & ARG_REG_OPTION_MASK, err, 02437 sourcefile, sourceline); 02438 if (!re->ptr) return -1; 02439 re->src = rb_enc_str_new(s, len, enc); 02440 OBJ_FREEZE(re->src); 02441 RB_GC_GUARD(unescaped); 02442 return 0; 02443 } 02444 02445 static int 02446 rb_reg_initialize_str(VALUE obj, VALUE str, int options, onig_errmsg_buffer err, 02447 const char *sourcefile, int sourceline) 02448 { 02449 int ret; 02450 rb_encoding *enc = rb_enc_get(str); 02451 if (options & ARG_ENCODING_NONE) { 02452 rb_encoding *ascii8bit = rb_ascii8bit_encoding(); 02453 if (enc != ascii8bit) { 02454 if (rb_enc_str_coderange(str) != ENC_CODERANGE_7BIT) { 02455 errcpy(err, "/.../n has a non escaped non ASCII character in non ASCII-8BIT script"); 02456 return -1; 02457 } 02458 enc = ascii8bit; 02459 } 02460 } 02461 ret = rb_reg_initialize(obj, RSTRING_PTR(str), RSTRING_LEN(str), enc, 02462 options, err, sourcefile, sourceline); 02463 OBJ_INFECT(obj, str); 02464 RB_GC_GUARD(str); 02465 return ret; 02466 } 02467 02468 static VALUE 02469 rb_reg_s_alloc(VALUE klass) 02470 { 02471 NEWOBJ_OF(re, struct RRegexp, klass, T_REGEXP); 02472 02473 re->ptr = 0; 02474 re->src = 0; 02475 re->usecnt = 0; 02476 02477 return (VALUE)re; 
02478 } 02479 02480 VALUE 02481 rb_reg_alloc(void) 02482 { 02483 return rb_reg_s_alloc(rb_cRegexp); 02484 } 02485 02486 VALUE 02487 rb_reg_new_str(VALUE s, int options) 02488 { 02489 return rb_reg_init_str(rb_reg_alloc(), s, options); 02490 } 02491 02492 VALUE 02493 rb_reg_init_str(VALUE re, VALUE s, int options) 02494 { 02495 onig_errmsg_buffer err = ""; 02496 02497 if (rb_reg_initialize_str(re, s, options, err, NULL, 0) != 0) { 02498 rb_reg_raise_str(s, options, err); 02499 } 02500 02501 return re; 02502 } 02503 02504 VALUE 02505 rb_reg_new_ary(VALUE ary, int opt) 02506 { 02507 return rb_reg_new_str(rb_reg_preprocess_dregexp(ary, opt), opt); 02508 } 02509 02510 VALUE 02511 rb_enc_reg_new(const char *s, long len, rb_encoding *enc, int options) 02512 { 02513 VALUE re = rb_reg_alloc(); 02514 onig_errmsg_buffer err = ""; 02515 02516 if (rb_reg_initialize(re, s, len, enc, options, err, NULL, 0) != 0) { 02517 rb_enc_reg_raise(s, len, enc, options, err); 02518 } 02519 02520 return re; 02521 } 02522 02523 VALUE 02524 rb_reg_new(const char *s, long len, int options) 02525 { 02526 return rb_enc_reg_new(s, len, rb_ascii8bit_encoding(), options); 02527 } 02528 02529 VALUE 02530 rb_reg_compile(VALUE str, int options, const char *sourcefile, int sourceline) 02531 { 02532 VALUE re = rb_reg_alloc(); 02533 onig_errmsg_buffer err = ""; 02534 02535 if (!str) str = rb_str_new(0,0); 02536 if (rb_reg_initialize_str(re, str, options, err, sourcefile, sourceline) != 0) { 02537 rb_set_errinfo(rb_reg_error_desc(str, options, err)); 02538 return Qnil; 02539 } 02540 FL_SET(re, REG_LITERAL); 02541 return re; 02542 } 02543 02544 static VALUE reg_cache; 02545 02546 VALUE 02547 rb_reg_regcomp(VALUE str) 02548 { 02549 volatile VALUE save_str = str; 02550 if (reg_cache && RREGEXP_SRC_LEN(reg_cache) == RSTRING_LEN(str) 02551 && ENCODING_GET(reg_cache) == ENCODING_GET(str) 02552 && memcmp(RREGEXP_SRC_PTR(reg_cache), RSTRING_PTR(str), RSTRING_LEN(str)) == 0) 02553 return reg_cache; 02554 02555 
return reg_cache = rb_reg_new_str(save_str, 0); 02556 } 02557 02558 static st_index_t reg_hash(VALUE re); 02559 /* 02560 * call-seq: 02561 * rxp.hash -> fixnum 02562 * 02563 * Produce a hash based on the text and options of this regular expression. 02564 */ 02565 02566 static VALUE 02567 rb_reg_hash(VALUE re) 02568 { 02569 st_index_t hashval = reg_hash(re); 02570 return LONG2FIX(hashval); 02571 } 02572 02573 static st_index_t 02574 reg_hash(VALUE re) 02575 { 02576 st_index_t hashval; 02577 02578 rb_reg_check(re); 02579 hashval = RREGEXP(re)->ptr->options; 02580 hashval = rb_hash_uint(hashval, rb_memhash(RREGEXP_SRC_PTR(re), RREGEXP_SRC_LEN(re))); 02581 return rb_hash_end(hashval); 02582 } 02583 02584 02585 /* 02586 * call-seq: 02587 * rxp == other_rxp -> true or false 02588 * rxp.eql?(other_rxp) -> true or false 02589 * 02590 * Equality---Two regexps are equal if their patterns are identical, they have 02591 * the same character set code, and their <code>casefold?</code> values are the 02592 * same. 02593 * 02594 * /abc/ == /abc/x #=> false 02595 * /abc/ == /abc/i #=> false 02596 * /abc/ == /abc/u #=> false 02597 * /abc/u == /abc/n #=> false 02598 */ 02599 02600 static VALUE 02601 rb_reg_equal(VALUE re1, VALUE re2) 02602 { 02603 if (re1 == re2) return Qtrue; 02604 if (!RB_TYPE_P(re2, T_REGEXP)) return Qfalse; 02605 rb_reg_check(re1); rb_reg_check(re2); 02606 if (FL_TEST(re1, KCODE_FIXED) != FL_TEST(re2, KCODE_FIXED)) return Qfalse; 02607 if (RREGEXP(re1)->ptr->options != RREGEXP(re2)->ptr->options) return Qfalse; 02608 if (RREGEXP_SRC_LEN(re1) != RREGEXP_SRC_LEN(re2)) return Qfalse; 02609 if (ENCODING_GET(re1) != ENCODING_GET(re2)) return Qfalse; 02610 if (memcmp(RREGEXP_SRC_PTR(re1), RREGEXP_SRC_PTR(re2), RREGEXP_SRC_LEN(re1)) == 0) { 02611 return Qtrue; 02612 } 02613 return Qfalse; 02614 } 02615 02616 /* 02617 * call-seq: 02618 * mtch.hash -> integer 02619 * 02620 * Produce a hash based on the target string, regexp and matched 02621 * positions of this matchdata. 
02622 */ 02623 02624 static VALUE 02625 match_hash(VALUE match) 02626 { 02627 const struct re_registers *regs; 02628 st_index_t hashval = rb_hash_start(rb_str_hash(RMATCH(match)->str)); 02629 02630 rb_hash_uint(hashval, reg_hash(RMATCH(match)->regexp)); 02631 regs = RMATCH_REGS(match); 02632 hashval = rb_hash_uint(hashval, regs->num_regs); 02633 hashval = rb_hash_uint(hashval, rb_memhash(regs->beg, regs->num_regs * sizeof(*regs->beg))); 02634 hashval = rb_hash_uint(hashval, rb_memhash(regs->end, regs->num_regs * sizeof(*regs->end))); 02635 hashval = rb_hash_end(hashval); 02636 return LONG2FIX(hashval); 02637 } 02638 02639 /* 02640 * call-seq: 02641 * mtch == mtch2 -> true or false 02642 * 02643 * Equality---Two matchdata are equal if their target strings, 02644 * patterns, and matched positions are identical. 02645 */ 02646 02647 static VALUE 02648 match_equal(VALUE match1, VALUE match2) 02649 { 02650 const struct re_registers *regs1, *regs2; 02651 if (match1 == match2) return Qtrue; 02652 if (!RB_TYPE_P(match2, T_MATCH)) return Qfalse; 02653 if (!rb_str_equal(RMATCH(match1)->str, RMATCH(match2)->str)) return Qfalse; 02654 if (!rb_reg_equal(RMATCH(match1)->regexp, RMATCH(match2)->regexp)) return Qfalse; 02655 regs1 = RMATCH_REGS(match1); 02656 regs2 = RMATCH_REGS(match2); 02657 if (regs1->num_regs != regs2->num_regs) return Qfalse; 02658 if (memcmp(regs1->beg, regs2->beg, regs1->num_regs * sizeof(*regs1->beg))) return Qfalse; 02659 if (memcmp(regs1->end, regs2->end, regs1->num_regs * sizeof(*regs1->end))) return Qfalse; 02660 return Qtrue; 02661 } 02662 02663 static VALUE 02664 reg_operand(VALUE s, int check) 02665 { 02666 if (SYMBOL_P(s)) { 02667 return rb_sym_to_s(s); 02668 } 02669 else { 02670 return (check ? 
rb_str_to_str : rb_check_string_type)(s); 02671 } 02672 } 02673 02674 static long 02675 reg_match_pos(VALUE re, VALUE *strp, long pos) 02676 { 02677 VALUE str = *strp; 02678 02679 if (NIL_P(str)) { 02680 rb_backref_set(Qnil); 02681 return -1; 02682 } 02683 *strp = str = reg_operand(str, TRUE); 02684 if (pos != 0) { 02685 if (pos < 0) { 02686 VALUE l = rb_str_length(str); 02687 pos += NUM2INT(l); 02688 if (pos < 0) { 02689 return pos; 02690 } 02691 } 02692 pos = rb_str_offset(str, pos); 02693 } 02694 return rb_reg_search(re, str, pos, 0); 02695 } 02696 02697 /* 02698 * call-seq: 02699 * rxp =~ str -> integer or nil 02700 * 02701 * Match---Matches <i>rxp</i> against <i>str</i>. 02702 * 02703 * /at/ =~ "input data" #=> 7 02704 * /ax/ =~ "input data" #=> nil 02705 * 02706 * If <code>=~</code> is used with a regexp literal with named captures, 02707 * captured strings (or nil) is assigned to local variables named by 02708 * the capture names. 02709 * 02710 * /(?<lhs>\w+)\s*=\s*(?<rhs>\w+)/ =~ " x = y " 02711 * p lhs #=> "x" 02712 * p rhs #=> "y" 02713 * 02714 * If it is not matched, nil is assigned for the variables. 02715 * 02716 * /(?<lhs>\w+)\s*=\s*(?<rhs>\w+)/ =~ " x = " 02717 * p lhs #=> nil 02718 * p rhs #=> nil 02719 * 02720 * This assignment is implemented in the Ruby parser. 02721 * The parser detects 'regexp-literal =~ expression' for the assignment. 02722 * The regexp must be a literal without interpolation and placed at left hand side. 02723 * 02724 * The assignment does not occur if the regexp is not a literal. 02725 * 02726 * re = /(?<lhs>\w+)\s*=\s*(?<rhs>\w+)/ 02727 * re =~ " x = y " 02728 * p lhs # undefined local variable 02729 * p rhs # undefined local variable 02730 * 02731 * A regexp interpolation, <code>#{}</code>, also disables 02732 * the assignment. 
02733 * 02734 * rhs_pat = /(?<rhs>\w+)/ 02735 * /(?<lhs>\w+)\s*=\s*#{rhs_pat}/ =~ "x = y" 02736 * p lhs # undefined local variable 02737 * 02738 * The assignment does not occur if the regexp is placed at the right hand side. 02739 * 02740 * " x = y " =~ /(?<lhs>\w+)\s*=\s*(?<rhs>\w+)/ 02741 * p lhs, rhs # undefined local variable 02742 * 02743 */ 02744 02745 VALUE 02746 rb_reg_match(VALUE re, VALUE str) 02747 { 02748 long pos = reg_match_pos(re, &str, 0); 02749 if (pos < 0) return Qnil; 02750 pos = rb_str_sublen(str, pos); 02751 return LONG2FIX(pos); 02752 } 02753 02754 /* 02755 * call-seq: 02756 * rxp === str -> true or false 02757 * 02758 * Case Equality---Used in case statements. 02759 * 02760 * a = "HELLO" 02761 * case a 02762 * when /^[a-z]*$/; print "Lower case\n" 02763 * when /^[A-Z]*$/; print "Upper case\n" 02764 * else; print "Mixed case\n" 02765 * end 02766 * #=> "Upper case" 02767 * 02768 * Following a regular expression literal with the #=== operator allows you to 02769 * compare against a String. 02770 * 02771 * /^[a-z]*$/ === "HELLO" #=> false 02772 * /^[A-Z]*$/ === "HELLO" #=> true 02773 */ 02774 02775 VALUE 02776 rb_reg_eqq(VALUE re, VALUE str) 02777 { 02778 long start; 02779 02780 str = reg_operand(str, FALSE); 02781 if (NIL_P(str)) { 02782 rb_backref_set(Qnil); 02783 return Qfalse; 02784 } 02785 start = rb_reg_search(re, str, 0, 0); 02786 if (start < 0) { 02787 return Qfalse; 02788 } 02789 return Qtrue; 02790 } 02791 02792 02793 /* 02794 * call-seq: 02795 * ~ rxp -> integer or nil 02796 * 02797 * Match---Matches <i>rxp</i> against the contents of <code>$_</code>. 02798 * Equivalent to <code><i>rxp</i> =~ $_</code>. 
02799 * 02800 * $_ = "input data" 02801 * ~ /at/ #=> 7 02802 */ 02803 02804 VALUE 02805 rb_reg_match2(VALUE re) 02806 { 02807 long start; 02808 VALUE line = rb_lastline_get(); 02809 02810 if (!RB_TYPE_P(line, T_STRING)) { 02811 rb_backref_set(Qnil); 02812 return Qnil; 02813 } 02814 02815 start = rb_reg_search(re, line, 0, 0); 02816 if (start < 0) { 02817 return Qnil; 02818 } 02819 start = rb_str_sublen(line, start); 02820 return LONG2FIX(start); 02821 } 02822 02823 02824 /* 02825 * call-seq: 02826 * rxp.match(str) -> matchdata or nil 02827 * rxp.match(str,pos) -> matchdata or nil 02828 * 02829 * Returns a <code>MatchData</code> object describing the match, or 02830 * <code>nil</code> if there was no match. This is equivalent to retrieving the 02831 * value of the special variable <code>$~</code> following a normal match. 02832 * If the second parameter is present, it specifies the position in the string 02833 * to begin the search. 02834 * 02835 * /(.)(.)(.)/.match("abc")[2] #=> "b" 02836 * /(.)(.)/.match("abc", 1)[2] #=> "c" 02837 * 02838 * If a block is given, invoke the block with MatchData if match succeed, so 02839 * that you can write 02840 * 02841 * pat.match(str) {|m| ...} 02842 * 02843 * instead of 02844 * 02845 * if m = pat.match(str) 02846 * ... 02847 * end 02848 * 02849 * The return value is a value from block execution in this case. 
02850 */ 02851 02852 static VALUE 02853 rb_reg_match_m(int argc, VALUE *argv, VALUE re) 02854 { 02855 VALUE result, str, initpos; 02856 long pos; 02857 02858 if (rb_scan_args(argc, argv, "11", &str, &initpos) == 2) { 02859 pos = NUM2LONG(initpos); 02860 } 02861 else { 02862 pos = 0; 02863 } 02864 02865 pos = reg_match_pos(re, &str, pos); 02866 if (pos < 0) { 02867 rb_backref_set(Qnil); 02868 return Qnil; 02869 } 02870 result = rb_backref_get(); 02871 rb_match_busy(result); 02872 if (!NIL_P(result) && rb_block_given_p()) { 02873 return rb_yield(result); 02874 } 02875 return result; 02876 } 02877 02878 /* 02879 * Document-method: compile 02880 * 02881 * Synonym for <code>Regexp.new</code> 02882 */ 02883 02884 02885 /* 02886 * call-seq: 02887 * Regexp.new(string, [options [, kcode]]) -> regexp 02888 * Regexp.new(regexp) -> regexp 02889 * Regexp.compile(string, [options [, kcode]]) -> regexp 02890 * Regexp.compile(regexp) -> regexp 02891 * 02892 * Constructs a new regular expression from +pattern+, which can be either a 02893 * String or a Regexp (in which case that regexp's options are propagated), 02894 * and new options may not be specified (a change as of Ruby 1.8). 02895 * 02896 * If +options+ is a Fixnum, it should be one or more of the constants 02897 * Regexp::EXTENDED, Regexp::IGNORECASE, and Regexp::MULTILINE, 02898 * <em>or</em>-ed together. Otherwise, if +options+ is not 02899 * +nil+ or +false+, the regexp will be case insensitive. 02900 * 02901 * When the +kcode+ parameter is `n' or `N' sets the regexp no encoding. 02902 * It means that the regexp is for binary strings. 
02903 * 02904 * r1 = Regexp.new('^a-z+:\\s+\w+') #=> /^a-z+:\s+\w+/ 02905 * r2 = Regexp.new('cat', true) #=> /cat/i 02906 * r3 = Regexp.new(r2) #=> /cat/i 02907 * r4 = Regexp.new('dog', Regexp::EXTENDED | Regexp::IGNORECASE) #=> /dog/ix 02908 */ 02909 02910 static VALUE 02911 rb_reg_initialize_m(int argc, VALUE *argv, VALUE self) 02912 { 02913 onig_errmsg_buffer err = ""; 02914 int flags = 0; 02915 VALUE str; 02916 rb_encoding *enc; 02917 const char *ptr; 02918 long len; 02919 02920 rb_check_arity(argc, 1, 3); 02921 if (RB_TYPE_P(argv[0], T_REGEXP)) { 02922 VALUE re = argv[0]; 02923 02924 if (argc > 1) { 02925 rb_warn("flags ignored"); 02926 } 02927 rb_reg_check(re); 02928 flags = rb_reg_options(re); 02929 ptr = RREGEXP_SRC_PTR(re); 02930 len = RREGEXP_SRC_LEN(re); 02931 enc = rb_enc_get(re); 02932 if (rb_reg_initialize(self, ptr, len, enc, flags, err, NULL, 0)) { 02933 str = rb_enc_str_new(ptr, len, enc); 02934 rb_reg_raise_str(str, flags, err); 02935 } 02936 } 02937 else { 02938 if (argc >= 2) { 02939 if (FIXNUM_P(argv[1])) flags = FIX2INT(argv[1]); 02940 else if (RTEST(argv[1])) flags = ONIG_OPTION_IGNORECASE; 02941 } 02942 enc = 0; 02943 if (argc == 3 && !NIL_P(argv[2])) { 02944 char *kcode = StringValuePtr(argv[2]); 02945 if (kcode[0] == 'n' || kcode[0] == 'N') { 02946 enc = rb_ascii8bit_encoding(); 02947 flags |= ARG_ENCODING_NONE; 02948 } 02949 else { 02950 rb_warn("encoding option is ignored - %s", kcode); 02951 } 02952 } 02953 str = argv[0]; 02954 ptr = StringValuePtr(str); 02955 if (enc 02956 ? 
rb_reg_initialize(self, ptr, RSTRING_LEN(str), enc, flags, err, NULL, 0) 02957 : rb_reg_initialize_str(self, str, flags, err, NULL, 0)) { 02958 rb_reg_raise_str(str, flags, err); 02959 } 02960 } 02961 return self; 02962 } 02963 02964 VALUE 02965 rb_reg_quote(VALUE str) 02966 { 02967 rb_encoding *enc = rb_enc_get(str); 02968 char *s, *send, *t; 02969 VALUE tmp; 02970 int c, clen; 02971 int ascii_only = rb_enc_str_asciionly_p(str); 02972 02973 s = RSTRING_PTR(str); 02974 send = s + RSTRING_LEN(str); 02975 while (s < send) { 02976 c = rb_enc_ascget(s, send, &clen, enc); 02977 if (c == -1) { 02978 s += mbclen(s, send, enc); 02979 continue; 02980 } 02981 switch (c) { 02982 case '[': case ']': case '{': case '}': 02983 case '(': case ')': case '|': case '-': 02984 case '*': case '.': case '\\': 02985 case '?': case '+': case '^': case '$': 02986 case ' ': case '#': 02987 case '\t': case '\f': case '\v': case '\n': case '\r': 02988 goto meta_found; 02989 } 02990 s += clen; 02991 } 02992 tmp = rb_str_new3(str); 02993 if (ascii_only) { 02994 rb_enc_associate(tmp, rb_usascii_encoding()); 02995 } 02996 return tmp; 02997 02998 meta_found: 02999 tmp = rb_str_new(0, RSTRING_LEN(str)*2); 03000 if (ascii_only) { 03001 rb_enc_associate(tmp, rb_usascii_encoding()); 03002 } 03003 else { 03004 rb_enc_copy(tmp, str); 03005 } 03006 t = RSTRING_PTR(tmp); 03007 /* copy upto metacharacter */ 03008 memcpy(t, RSTRING_PTR(str), s - RSTRING_PTR(str)); 03009 t += s - RSTRING_PTR(str); 03010 03011 while (s < send) { 03012 c = rb_enc_ascget(s, send, &clen, enc); 03013 if (c == -1) { 03014 int n = mbclen(s, send, enc); 03015 03016 while (n--) 03017 *t++ = *s++; 03018 continue; 03019 } 03020 s += clen; 03021 switch (c) { 03022 case '[': case ']': case '{': case '}': 03023 case '(': case ')': case '|': case '-': 03024 case '*': case '.': case '\\': 03025 case '?': case '+': case '^': case '$': 03026 case '#': 03027 t += rb_enc_mbcput('\\', t, enc); 03028 break; 03029 case ' ': 03030 t += 
rb_enc_mbcput('\\', t, enc); 03031 t += rb_enc_mbcput(' ', t, enc); 03032 continue; 03033 case '\t': 03034 t += rb_enc_mbcput('\\', t, enc); 03035 t += rb_enc_mbcput('t', t, enc); 03036 continue; 03037 case '\n': 03038 t += rb_enc_mbcput('\\', t, enc); 03039 t += rb_enc_mbcput('n', t, enc); 03040 continue; 03041 case '\r': 03042 t += rb_enc_mbcput('\\', t, enc); 03043 t += rb_enc_mbcput('r', t, enc); 03044 continue; 03045 case '\f': 03046 t += rb_enc_mbcput('\\', t, enc); 03047 t += rb_enc_mbcput('f', t, enc); 03048 continue; 03049 case '\v': 03050 t += rb_enc_mbcput('\\', t, enc); 03051 t += rb_enc_mbcput('v', t, enc); 03052 continue; 03053 } 03054 t += rb_enc_mbcput(c, t, enc); 03055 } 03056 rb_str_resize(tmp, t - RSTRING_PTR(tmp)); 03057 OBJ_INFECT(tmp, str); 03058 return tmp; 03059 } 03060 03061 03062 /* 03063 * call-seq: 03064 * Regexp.escape(str) -> string 03065 * Regexp.quote(str) -> string 03066 * 03067 * Escapes any characters that would have special meaning in a regular 03068 * expression. Returns a new escaped string, or self if no characters are 03069 * escaped. For any string, 03070 * <code>Regexp.new(Regexp.escape(<i>str</i>))=~<i>str</i></code> will be true. 03071 * 03072 * Regexp.escape('\*?{}.') #=> \\\*\?\{\}\. 
03073 * 03074 */ 03075 03076 static VALUE 03077 rb_reg_s_quote(VALUE c, VALUE str) 03078 { 03079 return rb_reg_quote(reg_operand(str, TRUE)); 03080 } 03081 03082 int 03083 rb_reg_options(VALUE re) 03084 { 03085 int options; 03086 03087 rb_reg_check(re); 03088 options = RREGEXP(re)->ptr->options & ARG_REG_OPTION_MASK; 03089 if (RBASIC(re)->flags & KCODE_FIXED) options |= ARG_ENCODING_FIXED; 03090 if (RBASIC(re)->flags & REG_ENCODING_NONE) options |= ARG_ENCODING_NONE; 03091 return options; 03092 } 03093 03094 VALUE 03095 rb_check_regexp_type(VALUE re) 03096 { 03097 return rb_check_convert_type(re, T_REGEXP, "Regexp", "to_regexp"); 03098 } 03099 03100 /* 03101 * call-seq: 03102 * Regexp.try_convert(obj) -> re or nil 03103 * 03104 * Try to convert <i>obj</i> into a Regexp, using to_regexp method. 03105 * Returns converted regexp or nil if <i>obj</i> cannot be converted 03106 * for any reason. 03107 * 03108 * Regexp.try_convert(/re/) #=> /re/ 03109 * Regexp.try_convert("re") #=> nil 03110 * 03111 * o = Object.new 03112 * Regexp.try_convert(o) #=> nil 03113 * def o.to_regexp() /foo/ end 03114 * Regexp.try_convert(o) #=> /foo/ 03115 * 03116 */ 03117 static VALUE 03118 rb_reg_s_try_convert(VALUE dummy, VALUE re) 03119 { 03120 return rb_check_regexp_type(re); 03121 } 03122 03123 static VALUE 03124 rb_reg_s_union(VALUE self, VALUE args0) 03125 { 03126 long argc = RARRAY_LEN(args0); 03127 03128 if (argc == 0) { 03129 VALUE args[1]; 03130 args[0] = rb_str_new2("(?!)"); 03131 return rb_class_new_instance(1, args, rb_cRegexp); 03132 } 03133 else if (argc == 1) { 03134 VALUE arg = rb_ary_entry(args0, 0); 03135 VALUE re = rb_check_regexp_type(arg); 03136 if (!NIL_P(re)) 03137 return re; 03138 else { 03139 VALUE quoted; 03140 quoted = rb_reg_s_quote(Qnil, arg); 03141 return rb_reg_new_str(quoted, 0); 03142 } 03143 } 03144 else { 03145 int i; 03146 VALUE source = rb_str_buf_new(0); 03147 rb_encoding *result_enc; 03148 03149 int has_asciionly = 0; 03150 rb_encoding 
*has_ascii_compat_fixed = 0; 03151 rb_encoding *has_ascii_incompat = 0; 03152 03153 for (i = 0; i < argc; i++) { 03154 volatile VALUE v; 03155 VALUE e = rb_ary_entry(args0, i); 03156 03157 if (0 < i) 03158 rb_str_buf_cat_ascii(source, "|"); 03159 03160 v = rb_check_regexp_type(e); 03161 if (!NIL_P(v)) { 03162 rb_encoding *enc = rb_enc_get(v); 03163 if (!rb_enc_asciicompat(enc)) { 03164 if (!has_ascii_incompat) 03165 has_ascii_incompat = enc; 03166 else if (has_ascii_incompat != enc) 03167 rb_raise(rb_eArgError, "incompatible encodings: %s and %s", 03168 rb_enc_name(has_ascii_incompat), rb_enc_name(enc)); 03169 } 03170 else if (rb_reg_fixed_encoding_p(v)) { 03171 if (!has_ascii_compat_fixed) 03172 has_ascii_compat_fixed = enc; 03173 else if (has_ascii_compat_fixed != enc) 03174 rb_raise(rb_eArgError, "incompatible encodings: %s and %s", 03175 rb_enc_name(has_ascii_compat_fixed), rb_enc_name(enc)); 03176 } 03177 else { 03178 has_asciionly = 1; 03179 } 03180 v = rb_reg_to_s(v); 03181 } 03182 else { 03183 rb_encoding *enc; 03184 StringValue(e); 03185 enc = rb_enc_get(e); 03186 if (!rb_enc_str_asciicompat_p(e)) { 03187 if (!has_ascii_incompat) 03188 has_ascii_incompat = enc; 03189 else if (has_ascii_incompat != enc) 03190 rb_raise(rb_eArgError, "incompatible encodings: %s and %s", 03191 rb_enc_name(has_ascii_incompat), rb_enc_name(enc)); 03192 } 03193 else if (rb_enc_str_asciionly_p(e)) { 03194 has_asciionly = 1; 03195 } 03196 else { 03197 if (!has_ascii_compat_fixed) 03198 has_ascii_compat_fixed = enc; 03199 else if (has_ascii_compat_fixed != enc) 03200 rb_raise(rb_eArgError, "incompatible encodings: %s and %s", 03201 rb_enc_name(has_ascii_compat_fixed), rb_enc_name(enc)); 03202 } 03203 v = rb_reg_s_quote(Qnil, e); 03204 } 03205 if (has_ascii_incompat) { 03206 if (has_asciionly) { 03207 rb_raise(rb_eArgError, "ASCII incompatible encoding: %s", 03208 rb_enc_name(has_ascii_incompat)); 03209 } 03210 if (has_ascii_compat_fixed) { 03211 rb_raise(rb_eArgError, "incompatible 
encodings: %s and %s", 03212 rb_enc_name(has_ascii_incompat), rb_enc_name(has_ascii_compat_fixed)); 03213 } 03214 } 03215 03216 if (i == 0) { 03217 rb_enc_copy(source, v); 03218 } 03219 rb_str_append(source, v); 03220 } 03221 03222 if (has_ascii_incompat) { 03223 result_enc = has_ascii_incompat; 03224 } 03225 else if (has_ascii_compat_fixed) { 03226 result_enc = has_ascii_compat_fixed; 03227 } 03228 else { 03229 result_enc = rb_ascii8bit_encoding(); 03230 } 03231 03232 rb_enc_associate(source, result_enc); 03233 return rb_class_new_instance(1, &source, rb_cRegexp); 03234 } 03235 } 03236 03237 /* 03238 * call-seq: 03239 * Regexp.union(pat1, pat2, ...) -> new_regexp 03240 * Regexp.union(pats_ary) -> new_regexp 03241 * 03242 * Return a <code>Regexp</code> object that is the union of the given 03243 * <em>pattern</em>s, i.e., will match any of its parts. The <em>pattern</em>s 03244 * can be Regexp objects, in which case their options will be preserved, or 03245 * Strings. If no patterns are given, returns <code>/(?!)/</code>. 03246 * The behavior is unspecified if any given <em>pattern</em> contains capture. 
03247 * 03248 * Regexp.union #=> /(?!)/ 03249 * Regexp.union("penzance") #=> /penzance/ 03250 * Regexp.union("a+b*c") #=> /a\+b\*c/ 03251 * Regexp.union("skiing", "sledding") #=> /skiing|sledding/ 03252 * Regexp.union(["skiing", "sledding"]) #=> /skiing|sledding/ 03253 * Regexp.union(/dogs/, /cats/i) #=> /(?-mix:dogs)|(?i-mx:cats)/ 03254 */ 03255 static VALUE 03256 rb_reg_s_union_m(VALUE self, VALUE args) 03257 { 03258 VALUE v; 03259 if (RARRAY_LEN(args) == 1 && 03260 !NIL_P(v = rb_check_array_type(rb_ary_entry(args, 0)))) { 03261 return rb_reg_s_union(self, v); 03262 } 03263 return rb_reg_s_union(self, args); 03264 } 03265 03266 /* :nodoc: */ 03267 static VALUE 03268 rb_reg_init_copy(VALUE copy, VALUE re) 03269 { 03270 onig_errmsg_buffer err = ""; 03271 const char *s; 03272 long len; 03273 03274 if (!OBJ_INIT_COPY(copy, re)) return copy; 03275 rb_reg_check(re); 03276 s = RREGEXP_SRC_PTR(re); 03277 len = RREGEXP_SRC_LEN(re); 03278 if (rb_reg_initialize(copy, s, len, rb_enc_get(re), rb_reg_options(re), 03279 err, NULL, 0) != 0) { 03280 rb_reg_raise(s, len, err, re); 03281 } 03282 return copy; 03283 } 03284 03285 VALUE 03286 rb_reg_regsub(VALUE str, VALUE src, struct re_registers *regs, VALUE regexp) 03287 { 03288 VALUE val = 0; 03289 char *p, *s, *e; 03290 int no, clen; 03291 rb_encoding *str_enc = rb_enc_get(str); 03292 rb_encoding *src_enc = rb_enc_get(src); 03293 int acompat = rb_enc_asciicompat(str_enc); 03294 #define ASCGET(s,e,cl) (acompat ? 
(*(cl)=1,ISASCII((s)[0])?(s)[0]:-1) : rb_enc_ascget((s), (e), (cl), str_enc)) 03295 03296 p = s = RSTRING_PTR(str); 03297 e = s + RSTRING_LEN(str); 03298 03299 while (s < e) { 03300 int c = ASCGET(s, e, &clen); 03301 char *ss; 03302 03303 if (c == -1) { 03304 s += mbclen(s, e, str_enc); 03305 continue; 03306 } 03307 ss = s; 03308 s += clen; 03309 03310 if (c != '\\' || s == e) continue; 03311 03312 if (!val) { 03313 val = rb_str_buf_new(ss-p); 03314 } 03315 rb_enc_str_buf_cat(val, p, ss-p, str_enc); 03316 03317 c = ASCGET(s, e, &clen); 03318 if (c == -1) { 03319 s += mbclen(s, e, str_enc); 03320 rb_enc_str_buf_cat(val, ss, s-ss, str_enc); 03321 p = s; 03322 continue; 03323 } 03324 s += clen; 03325 03326 p = s; 03327 switch (c) { 03328 case '1': case '2': case '3': case '4': 03329 case '5': case '6': case '7': case '8': case '9': 03330 if (onig_noname_group_capture_is_active(RREGEXP(regexp)->ptr)) { 03331 no = c - '0'; 03332 } 03333 else { 03334 continue; 03335 } 03336 break; 03337 03338 case 'k': 03339 if (s < e && ASCGET(s, e, &clen) == '<') { 03340 char *name, *name_end; 03341 03342 name_end = name = s + clen; 03343 while (name_end < e) { 03344 c = ASCGET(name_end, e, &clen); 03345 if (c == '>') break; 03346 name_end += c == -1 ? 
mbclen(name_end, e, str_enc) : clen; 03347 } 03348 if (name_end < e) { 03349 no = name_to_backref_number(regs, regexp, name, name_end); 03350 p = s = name_end + clen; 03351 break; 03352 } 03353 else { 03354 rb_raise(rb_eRuntimeError, "invalid group name reference format"); 03355 } 03356 } 03357 03358 rb_enc_str_buf_cat(val, ss, s-ss, str_enc); 03359 continue; 03360 03361 case '0': 03362 case '&': 03363 no = 0; 03364 break; 03365 03366 case '`': 03367 rb_enc_str_buf_cat(val, RSTRING_PTR(src), BEG(0), src_enc); 03368 continue; 03369 03370 case '\'': 03371 rb_enc_str_buf_cat(val, RSTRING_PTR(src)+END(0), RSTRING_LEN(src)-END(0), src_enc); 03372 continue; 03373 03374 case '+': 03375 no = regs->num_regs-1; 03376 while (BEG(no) == -1 && no > 0) no--; 03377 if (no == 0) continue; 03378 break; 03379 03380 case '\\': 03381 rb_enc_str_buf_cat(val, s-clen, clen, str_enc); 03382 continue; 03383 03384 default: 03385 rb_enc_str_buf_cat(val, ss, s-ss, str_enc); 03386 continue; 03387 } 03388 03389 if (no >= 0) { 03390 if (no >= regs->num_regs) continue; 03391 if (BEG(no) == -1) continue; 03392 rb_enc_str_buf_cat(val, RSTRING_PTR(src)+BEG(no), END(no)-BEG(no), src_enc); 03393 } 03394 } 03395 03396 if (!val) return str; 03397 if (p < e) { 03398 rb_enc_str_buf_cat(val, p, e-p, str_enc); 03399 } 03400 03401 return val; 03402 } 03403 03404 static VALUE 03405 kcode_getter(void) 03406 { 03407 rb_warn("variable $KCODE is no longer effective"); 03408 return Qnil; 03409 } 03410 03411 static void 03412 kcode_setter(VALUE val, ID id) 03413 { 03414 rb_warn("variable $KCODE is no longer effective; ignored"); 03415 } 03416 03417 static VALUE 03418 ignorecase_getter(void) 03419 { 03420 rb_warn("variable $= is no longer effective"); 03421 return Qfalse; 03422 } 03423 03424 static void 03425 ignorecase_setter(VALUE val, ID id) 03426 { 03427 rb_warn("variable $= is no longer effective; ignored"); 03428 } 03429 03430 static VALUE 03431 match_getter(void) 03432 { 03433 VALUE match = rb_backref_get(); 
03434 03435 if (NIL_P(match)) return Qnil; 03436 rb_match_busy(match); 03437 return match; 03438 } 03439 03440 static void 03441 match_setter(VALUE val) 03442 { 03443 if (!NIL_P(val)) { 03444 Check_Type(val, T_MATCH); 03445 } 03446 rb_backref_set(val); 03447 } 03448 03449 /* 03450 * call-seq: 03451 * Regexp.last_match -> matchdata 03452 * Regexp.last_match(n) -> str 03453 * 03454 * The first form returns the MatchData object generated by the 03455 * last successful pattern match. Equivalent to reading the special global 03456 * variable <code>$~</code> (see Special global variables in Regexp for 03457 * details). 03458 * 03459 * The second form returns the <i>n</i>th field in this MatchData object. 03460 * _n_ can be a string or symbol to reference a named capture. 03461 * 03462 * Note that the last_match is local to the thread and method scope of the 03463 * method that did the pattern match. 03464 * 03465 * /c(.)t/ =~ 'cat' #=> 0 03466 * Regexp.last_match #=> #<MatchData "cat" 1:"a"> 03467 * Regexp.last_match(0) #=> "cat" 03468 * Regexp.last_match(1) #=> "a" 03469 * Regexp.last_match(2) #=> nil 03470 * 03471 * /(?<lhs>\w+)\s*=\s*(?<rhs>\w+)/ =~ "var = val" 03472 * Regexp.last_match #=> #<MatchData "var = val" lhs:"var" rhs:"val"> 03473 * Regexp.last_match(:lhs) #=> "var" 03474 * Regexp.last_match(:rhs) #=> "val" 03475 */ 03476 03477 static VALUE 03478 rb_reg_s_last_match(int argc, VALUE *argv) 03479 { 03480 VALUE nth; 03481 03482 if (argc > 0 && rb_scan_args(argc, argv, "01", &nth) == 1) { 03483 VALUE match = rb_backref_get(); 03484 int n; 03485 if (NIL_P(match)) return Qnil; 03486 n = match_backref_number(match, nth); 03487 return rb_reg_nth_match(n, match); 03488 } 03489 return match_getter(); 03490 } 03491 03492 static void 03493 re_warn(const char *s) 03494 { 03495 rb_warn("%s", s); 03496 } 03497 03498 /* 03499 * Document-class: RegexpError 03500 * 03501 * Raised when given an invalid regexp expression. 
03502 * 03503 * Regexp.new("?") 03504 * 03505 * <em>raises the exception:</em> 03506 * 03507 * RegexpError: target of repeat operator is not specified: /?/ 03508 */ 03509 03510 /* 03511 * Document-class: Regexp 03512 * 03513 * A <code>Regexp</code> holds a regular expression, used to match a pattern 03514 * against strings. Regexps are created using the <code>/.../</code> and 03515 * <code>%r{...}</code> literals, and by the <code>Regexp::new</code> 03516 * constructor. 03517 * 03518 * :include: doc/re.rdoc 03519 */ 03520 03521 void 03522 Init_Regexp(void) 03523 { 03524 rb_eRegexpError = rb_define_class("RegexpError", rb_eStandardError); 03525 03526 onigenc_set_default_caseconv_table((UChar*)casetable); 03527 onigenc_set_default_encoding(ONIG_ENCODING_ASCII); 03528 onig_set_warn_func(re_warn); 03529 onig_set_verb_warn_func(re_warn); 03530 03531 rb_define_virtual_variable("$~", match_getter, match_setter); 03532 rb_define_virtual_variable("$&", last_match_getter, 0); 03533 rb_define_virtual_variable("$`", prematch_getter, 0); 03534 rb_define_virtual_variable("$'", postmatch_getter, 0); 03535 rb_define_virtual_variable("$+", last_paren_match_getter, 0); 03536 03537 rb_define_virtual_variable("$=", ignorecase_getter, ignorecase_setter); 03538 rb_define_virtual_variable("$KCODE", kcode_getter, kcode_setter); 03539 rb_define_virtual_variable("$-K", kcode_getter, kcode_setter); 03540 03541 rb_cRegexp = rb_define_class("Regexp", rb_cObject); 03542 rb_define_alloc_func(rb_cRegexp, rb_reg_s_alloc); 03543 rb_define_singleton_method(rb_cRegexp, "compile", rb_class_new_instance, -1); 03544 rb_define_singleton_method(rb_cRegexp, "quote", rb_reg_s_quote, 1); 03545 rb_define_singleton_method(rb_cRegexp, "escape", rb_reg_s_quote, 1); 03546 rb_define_singleton_method(rb_cRegexp, "union", rb_reg_s_union_m, -2); 03547 rb_define_singleton_method(rb_cRegexp, "last_match", rb_reg_s_last_match, -1); 03548 rb_define_singleton_method(rb_cRegexp, "try_convert", rb_reg_s_try_convert, 1); 
03549 03550 rb_define_method(rb_cRegexp, "initialize", rb_reg_initialize_m, -1); 03551 rb_define_method(rb_cRegexp, "initialize_copy", rb_reg_init_copy, 1); 03552 rb_define_method(rb_cRegexp, "hash", rb_reg_hash, 0); 03553 rb_define_method(rb_cRegexp, "eql?", rb_reg_equal, 1); 03554 rb_define_method(rb_cRegexp, "==", rb_reg_equal, 1); 03555 rb_define_method(rb_cRegexp, "=~", rb_reg_match, 1); 03556 rb_define_method(rb_cRegexp, "===", rb_reg_eqq, 1); 03557 rb_define_method(rb_cRegexp, "~", rb_reg_match2, 0); 03558 rb_define_method(rb_cRegexp, "match", rb_reg_match_m, -1); 03559 rb_define_method(rb_cRegexp, "to_s", rb_reg_to_s, 0); 03560 rb_define_method(rb_cRegexp, "inspect", rb_reg_inspect, 0); 03561 rb_define_method(rb_cRegexp, "source", rb_reg_source, 0); 03562 rb_define_method(rb_cRegexp, "casefold?", rb_reg_casefold_p, 0); 03563 rb_define_method(rb_cRegexp, "options", rb_reg_options_m, 0); 03564 rb_define_method(rb_cRegexp, "encoding", rb_obj_encoding, 0); /* in encoding.c */ 03565 rb_define_method(rb_cRegexp, "fixed_encoding?", rb_reg_fixed_encoding_p, 0); 03566 rb_define_method(rb_cRegexp, "names", rb_reg_names, 0); 03567 rb_define_method(rb_cRegexp, "named_captures", rb_reg_named_captures, 0); 03568 03569 /* see Regexp.options and Regexp.new */ 03570 rb_define_const(rb_cRegexp, "IGNORECASE", INT2FIX(ONIG_OPTION_IGNORECASE)); 03571 /* see Regexp.options and Regexp.new */ 03572 rb_define_const(rb_cRegexp, "EXTENDED", INT2FIX(ONIG_OPTION_EXTEND)); 03573 /* see Regexp.options and Regexp.new */ 03574 rb_define_const(rb_cRegexp, "MULTILINE", INT2FIX(ONIG_OPTION_MULTILINE)); 03575 /* see Regexp.options and Regexp.new */ 03576 rb_define_const(rb_cRegexp, "FIXEDENCODING", INT2FIX(ARG_ENCODING_FIXED)); 03577 /* see Regexp.options and Regexp.new */ 03578 rb_define_const(rb_cRegexp, "NOENCODING", INT2FIX(ARG_ENCODING_NONE)); 03579 03580 rb_global_variable(®_cache); 03581 03582 rb_cMatch = rb_define_class("MatchData", rb_cObject); 03583 rb_define_alloc_func(rb_cMatch, 
match_alloc); 03584 rb_undef_method(CLASS_OF(rb_cMatch), "new"); 03585 03586 rb_define_method(rb_cMatch, "initialize_copy", match_init_copy, 1); 03587 rb_define_method(rb_cMatch, "regexp", match_regexp, 0); 03588 rb_define_method(rb_cMatch, "names", match_names, 0); 03589 rb_define_method(rb_cMatch, "size", match_size, 0); 03590 rb_define_method(rb_cMatch, "length", match_size, 0); 03591 rb_define_method(rb_cMatch, "offset", match_offset, 1); 03592 rb_define_method(rb_cMatch, "begin", match_begin, 1); 03593 rb_define_method(rb_cMatch, "end", match_end, 1); 03594 rb_define_method(rb_cMatch, "to_a", match_to_a, 0); 03595 rb_define_method(rb_cMatch, "[]", match_aref, -1); 03596 rb_define_method(rb_cMatch, "captures", match_captures, 0); 03597 rb_define_method(rb_cMatch, "values_at", match_values_at, -1); 03598 rb_define_method(rb_cMatch, "pre_match", rb_reg_match_pre, 0); 03599 rb_define_method(rb_cMatch, "post_match", rb_reg_match_post, 0); 03600 rb_define_method(rb_cMatch, "to_s", match_to_s, 0); 03601 rb_define_method(rb_cMatch, "inspect", match_inspect, 0); 03602 rb_define_method(rb_cMatch, "string", match_string, 0); 03603 rb_define_method(rb_cMatch, "hash", match_hash, 0); 03604 rb_define_method(rb_cMatch, "eql?", match_equal, 1); 03605 rb_define_method(rb_cMatch, "==", match_equal, 1); 03606 } 03607