Ruby 2.0.0p247 (2013-06-27 revision 41674)
re.c
Go to the documentation of this file.
00001 /**********************************************************************
00002 
00003   re.c -
00004 
00005   $Author: marcandre $
00006   created at: Mon Aug  9 18:24:49 JST 1993
00007 
00008   Copyright (C) 1993-2007 Yukihiro Matsumoto
00009 
00010 **********************************************************************/
00011 
00012 #include "ruby/ruby.h"
00013 #include "ruby/re.h"
00014 #include "ruby/encoding.h"
00015 #include "ruby/util.h"
00016 #include "internal.h"
00017 #include "regint.h"
00018 #include <ctype.h>
00019 
/* Exception class used for regexp errors: passed to rb_raise() and
 * rb_exc_new3() by the error helpers in this file. */
VALUE rb_eRegexpError;

/* Fixed-size character buffer sized by ONIG_MAX_ERROR_MESSAGE_LEN. */
typedef char onig_errmsg_buffer[ONIG_MAX_ERROR_MESSAGE_LEN];
/* Copy msg into err with strlcpy(), bounded by ONIG_MAX_ERROR_MESSAGE_LEN. */
#define errcpy(err, msg) strlcpy((err), (msg), ONIG_MAX_ERROR_MESSAGE_LEN)

/* Begin / end entry for capture slot `no`.  Both macros expand to a use of
 * a variable named `regs`, which must be in scope wherever they are used. */
#define BEG(no) (regs->beg[(no)])
#define END(no) (regs->end[(no)])
00027 
#if 'a' == 97   /* it's ascii */
/*
 * Byte-indexed folding table used by rb_memcicmp(): the entries for
 * 'A'..'Z' hold the corresponding lower-case letters 'a'..'z'; every other
 * entry maps a byte to itself.
 *
 * The element type is plain char, so on platforms where char is signed the
 * entries '\200'..'\377' are stored as negative values.
 */
static const char casetable[] = {
        '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007',
        '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017',
        '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027',
        '\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037',
        /* ' '     '!'     '"'     '#'     '$'     '%'     '&'     ''' */
        '\040', '\041', '\042', '\043', '\044', '\045', '\046', '\047',
        /* '('     ')'     '*'     '+'     ','     '-'     '.'     '/' */
        '\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057',
        /* '0'     '1'     '2'     '3'     '4'     '5'     '6'     '7' */
        '\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067',
        /* '8'     '9'     ':'     ';'     '<'     '='     '>'     '?' */
        '\070', '\071', '\072', '\073', '\074', '\075', '\076', '\077',
        /* '@'     'A'     'B'     'C'     'D'     'E'     'F'     'G' */
        '\100', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
        /* 'H'     'I'     'J'     'K'     'L'     'M'     'N'     'O' */
        '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
        /* 'P'     'Q'     'R'     'S'     'T'     'U'     'V'     'W' */
        '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
        /* 'X'     'Y'     'Z'     '['     '\'     ']'     '^'     '_' */
        '\170', '\171', '\172', '\133', '\134', '\135', '\136', '\137',
        /* '`'     'a'     'b'     'c'     'd'     'e'     'f'     'g' */
        '\140', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
        /* 'h'     'i'     'j'     'k'     'l'     'm'     'n'     'o' */
        '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
        /* 'p'     'q'     'r'     's'     't'     'u'     'v'     'w' */
        '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
        /* 'x'     'y'     'z'     '{'     '|'     '}'     '~'     DEL */
        '\170', '\171', '\172', '\173', '\174', '\175', '\176', '\177',
        /* Bytes 0200..0377: no folding, each maps to itself. */
        '\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207',
        '\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217',
        '\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227',
        '\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237',
        '\240', '\241', '\242', '\243', '\244', '\245', '\246', '\247',
        '\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257',
        '\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267',
        '\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277',
        '\300', '\301', '\302', '\303', '\304', '\305', '\306', '\307',
        '\310', '\311', '\312', '\313', '\314', '\315', '\316', '\317',
        '\320', '\321', '\322', '\323', '\324', '\325', '\326', '\327',
        '\330', '\331', '\332', '\333', '\334', '\335', '\336', '\337',
        '\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347',
        '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357',
        '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\367',
        '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\377',
};
#else
/* The table above is only valid for an ASCII execution character set. */
# error >>> "You lose. You will need a translation table for your character set." <<<
#endif
00078 
00079 int
00080 rb_memcicmp(const void *x, const void *y, long len)
00081 {
00082     const unsigned char *p1 = x, *p2 = y;
00083     int tmp;
00084 
00085     while (len--) {
00086         if ((tmp = casetable[(unsigned)*p1++] - casetable[(unsigned)*p2++]))
00087             return tmp;
00088     }
00089     return 0;
00090 }
00091 
/* Drop any macro of the same name so a real function can be defined below. */
#undef rb_memcmp

/*
 * Function form of rb_memcmp: compares len bytes of p1 and p2 and returns
 * exactly what memcmp() returns for them.
 */
int
rb_memcmp(const void *p1, const void *p2, long len)
{
    const int order = memcmp(p1, p2, len);

    return order;
}
00099 
00100 #ifdef HAVE_MEMMEM
00101 static inline long
00102 rb_memsearch_ss(const unsigned char *xs, long m, const unsigned char *ys, long n)
00103 {
00104     const unsigned char *y;
00105 
00106     if (y = memmem(ys, n, xs, m))
00107         return y - ys;
00108     else
00109         return -1;
00110 }
00111 #else
00112 static inline long
00113 rb_memsearch_ss(const unsigned char *xs, long m, const unsigned char *ys, long n)
00114 {
00115     const unsigned char *x = xs, *xe = xs + m;
00116     const unsigned char *y = ys, *ye = ys + n;
00117 #ifndef VALUE_MAX
00118 # if SIZEOF_VALUE == 8
00119 #  define VALUE_MAX 0xFFFFFFFFFFFFFFFFULL
00120 # elif SIZEOF_VALUE == 4
00121 #  define VALUE_MAX 0xFFFFFFFFUL
00122 # endif
00123 #endif
00124     VALUE hx, hy, mask = VALUE_MAX >> ((SIZEOF_VALUE - m) * CHAR_BIT);
00125 
00126     if (m > SIZEOF_VALUE)
00127         rb_bug("!!too long pattern string!!");
00128 
00129     if (!(y = memchr(y, *x, n - m + 1)))
00130         return -1;
00131 
00132     /* Prepare hash value */
00133     for (hx = *x++, hy = *y++; x < xe; ++x, ++y) {
00134         hx <<= CHAR_BIT;
00135         hy <<= CHAR_BIT;
00136         hx |= *x;
00137         hy |= *y;
00138     }
00139     /* Searching */
00140     while (hx != hy) {
00141         if (y == ye)
00142             return -1;
00143         hy <<= CHAR_BIT;
00144         hy |= *y;
00145         hy &= mask;
00146         y++;
00147     }
00148     return y - ys - m;
00149 }
00150 #endif
00151 
/*
 * Quick Search over raw bytes: find the m-byte pattern xs inside the
 * n-byte buffer ys.
 *
 * A 256-entry shift table is built from the pattern; after each failed
 * comparison the window advances by the shift associated with the byte
 * that immediately follows the window.
 *
 * Unlike a plain Quick Search loop, the byte after the window is only
 * inspected when it actually lies inside the buffer, so ys[n] is never
 * read.  Shift values are at most m + 1 and are kept in a long table.
 *
 * Returns the byte offset of the first occurrence, or -1 if there is none.
 */
static inline long
rb_memsearch_qs(const unsigned char *xs, long m, const unsigned char *ys, long n)
{
    const unsigned char *x = xs, *xe = xs + m;
    const unsigned char *y = ys, *ye = ys + n;
    long i, qstable[256];

    /* Preprocessing: bytes absent from the pattern skip the whole window. */
    for (i = 0; i < 256; ++i)
        qstable[i] = m + 1;
    for (; x < xe; ++x)
        qstable[*x] = xe - x;

    /* Searching */
    while (ye - y >= m) {
        if (*xs == *y && memcmp(xs, y, m) == 0)
            return y - ys;
        if (ye - y == m)
            break;              /* window is flush with the end: no next byte */
        y += qstable[y[m]];
    }
    return -1;
}
00171 
/*
 * Bucket index (0..511) for the shift table of rb_memsearch_qs_utf8().
 *
 * Bytes below 0xC0 and bytes from 0xF5 upwards map to 256 + byte.
 * A byte in 0xC0..0xF4 is mixed with the 1, 2 or 3 bytes that follow it
 * (multiply by 8353, add the next byte) and the low 8 bits of the result
 * are returned.  Those following bytes are read unconditionally.
 */
static inline unsigned int
rb_memsearch_qs_utf8_hash(const unsigned char *x)
{
    const unsigned int mix = 8353;
    const unsigned int lead = x[0];
    unsigned int h = lead;
    int trail, k;

    if (lead < 0xC0 || lead >= 0xF5) {
        return lead + 256;
    }

    /* Number of following bytes folded into the hash. */
    trail = (lead < 0xE0) ? 1 : (lead < 0xF0) ? 2 : 3;
    for (k = 1; k <= trail; k++) {
        h = h * mix + x[k];
    }
    return (unsigned char)h;
}
00203 
/*
 * Quick Search variant for UTF-8 text: find the m-byte pattern xs inside
 * the n-byte buffer ys.
 *
 * Works like rb_memsearch_qs(), but the shift table has 512 buckets and is
 * indexed by rb_memsearch_qs_utf8_hash() of the position following the
 * window rather than by a single byte.
 *
 * The position after the window is hashed only when it lies inside the
 * buffer, so ys + n itself is never passed to the hash function.
 * NOTE(review): the hash helper still reads up to three bytes after a lead
 * byte; this presumably relies on the input being well-formed UTF-8.
 *
 * Returns the byte offset of the first occurrence, or -1 if there is none.
 */
static inline long
rb_memsearch_qs_utf8(const unsigned char *xs, long m, const unsigned char *ys, long n)
{
    const unsigned char *x = xs, *xe = xs + m;
    const unsigned char *y = ys, *ye = ys + n;
    long i, qstable[512];

    /* Preprocessing: unseen buckets skip the whole window. */
    for (i = 0; i < 512; ++i) {
        qstable[i] = m + 1;
    }
    for (; x < xe; ++x) {
        qstable[rb_memsearch_qs_utf8_hash(x)] = xe - x;
    }

    /* Searching */
    while (ye - y >= m) {
        if (*xs == *y && memcmp(xs, y, m) == 0)
            return y - ys;
        if (ye - y == m)
            break;              /* window is flush with the end: nothing to hash */
        y += qstable[rb_memsearch_qs_utf8_hash(y + m)];
    }
    return -1;
}
00225 
00226 long
00227 rb_memsearch(const void *x0, long m, const void *y0, long n, rb_encoding *enc)
00228 {
00229     const unsigned char *x = x0, *y = y0;
00230 
00231     if (m > n) return -1;
00232     else if (m == n) {
00233         return memcmp(x0, y0, m) == 0 ? 0 : -1;
00234     }
00235     else if (m < 1) {
00236         return 0;
00237     }
00238     else if (m == 1) {
00239         const unsigned char *ys;
00240 
00241         if (ys = memchr(y, *x, n))
00242             return ys - y;
00243         else
00244             return -1;
00245     }
00246     else if (m <= SIZEOF_VALUE) {
00247         return rb_memsearch_ss(x0, m, y0, n);
00248     }
00249     else if (enc == rb_utf8_encoding()){
00250         return rb_memsearch_qs_utf8(x0, m, y0, n);
00251     }
00252     else {
00253         return rb_memsearch_qs(x0, m, y0, n);
00254     }
00255 }
00256 
/* Per-object flag bits, named after the FL_USERn bits they alias. */
#define REG_LITERAL FL_USER5
/* Tested on RBASIC(re)->flags in rb_reg_desc() to append an "n" option. */
#define REG_ENCODING_NONE FL_USER6

#define KCODE_FIXED FL_USER4

/* The three Oniguruma option bits that char_to_option() can produce. */
#define ARG_REG_OPTION_MASK \
    (ONIG_OPTION_IGNORECASE|ONIG_OPTION_MULTILINE|ONIG_OPTION_EXTEND)
/* Values stored into *option by rb_char_to_option_kcode(): FIXED for the
 * 'e'/'s'/'u' option characters, NONE for 'n'. */
#define ARG_ENCODING_FIXED    16
#define ARG_ENCODING_NONE     32
00266 
00267 static int
00268 char_to_option(int c)
00269 {
00270     int val;
00271 
00272     switch (c) {
00273       case 'i':
00274         val = ONIG_OPTION_IGNORECASE;
00275         break;
00276       case 'x':
00277         val = ONIG_OPTION_EXTEND;
00278         break;
00279       case 'm':
00280         val = ONIG_OPTION_MULTILINE;
00281         break;
00282       default:
00283         val = 0;
00284         break;
00285     }
00286     return val;
00287 }
00288 
00289 static char *
00290 option_to_str(char str[4], int options)
00291 {
00292     char *p = str;
00293     if (options & ONIG_OPTION_MULTILINE) *p++ = 'm';
00294     if (options & ONIG_OPTION_IGNORECASE) *p++ = 'i';
00295     if (options & ONIG_OPTION_EXTEND) *p++ = 'x';
00296     *p = 0;
00297     return str;
00298 }
00299 
00300 extern int
00301 rb_char_to_option_kcode(int c, int *option, int *kcode)
00302 {
00303     *option = 0;
00304 
00305     switch (c) {
00306       case 'n':
00307         *kcode = rb_ascii8bit_encindex();
00308         return (*option = ARG_ENCODING_NONE);
00309       case 'e':
00310         *kcode = rb_enc_find_index("EUC-JP");
00311         break;
00312       case 's':
00313         *kcode = rb_enc_find_index("Windows-31J");
00314         break;
00315       case 'u':
00316         *kcode = rb_utf8_encindex();
00317         break;
00318       default:
00319         *kcode = -1;
00320         return (*option = char_to_option(c));
00321     }
00322     *option = ARG_ENCODING_FIXED;
00323     return 1;
00324 }
00325 
00326 static void
00327 rb_reg_check(VALUE re)
00328 {
00329     if (!RREGEXP(re)->ptr || !RREGEXP_SRC(re) || !RREGEXP_SRC_PTR(re)) {
00330         rb_raise(rb_eTypeError, "uninitialized Regexp");
00331     }
00332 }
00333 
/*
 * Append the regexp source s[0, len) (encoded in enc) to str, escaping
 * whatever cannot be shown as is.
 *
 * A first pass decides whether any escaping is needed at all; if not, the
 * source is appended verbatim.  Otherwise a second pass copies it piece by
 * piece:
 *   - a backslash and the character after it are copied untouched;
 *   - '/' gets a backslash in front of it;
 *   - a byte that does not form a valid character becomes \xHH;
 *   - a valid non-ASCII character is escaped via
 *     rb_str_buf_cat_escaped_char() when resenc is non-NULL, and copied
 *     raw when resenc is NULL;
 *   - printable ASCII and ASCII white space are copied raw;
 *   - any other ASCII character becomes \xHH.
 */
static void
rb_reg_expr_str(VALUE str, const char *s, long len,
        rb_encoding *enc, rb_encoding *resenc)
{
    const char *p, *pend;
    int cr = ENC_CODERANGE_UNKNOWN;
    int need_escape = 0;
    int c, clen;

    p = s; pend = p + len;
    rb_str_coderange_scan_restartable(p, pend, enc, &cr);
    /* Pass 1: look for the first thing that would need escaping. */
    if (rb_enc_asciicompat(enc) &&
        (cr == ENC_CODERANGE_VALID || cr == ENC_CODERANGE_7BIT)) {
        while (p < pend) {
            c = rb_enc_ascget(p, pend, &clen, enc);
            if (c == -1) {
                /* Not an ASCII character: acceptable only when the source
                 * encoding is the same object as the result encoding. */
                if (enc == resenc) {
                    p += mbclen(p, pend, enc);
                }
                else {
                    need_escape = 1;
                    break;
                }
            }
            else if (c != '/' && rb_enc_isprint(c, enc)) {
                p += clen;
            }
            else {
                need_escape = 1;
                break;
            }
        }
    }
    else {
        /* Not ASCII-compatible, or code range neither VALID nor 7BIT. */
        need_escape = 1;
    }

    if (!need_escape) {
        rb_str_buf_cat(str, s, len);
    }
    else {
        /* Pass 2: restart from the beginning and escape as we go. */
        int unicode_p = rb_enc_unicode_p(enc);
        p = s;
        while (p<pend) {
            c = rb_enc_ascget(p, pend, &clen, enc);
            if (c == '\\' && p+clen < pend) {
                /* Keep an existing escape (backslash + next char) intact. */
                int n = clen + mbclen(p+clen, pend, enc);
                rb_str_buf_cat(str, p, n);
                p += n;
                continue;
            }
            else if (c == '/') {
                /* This inner `c` shadows the outer int c on purpose. */
                char c = '\\';
                rb_str_buf_cat(str, &c, 1);
                rb_str_buf_cat(str, p, clen);
            }
            else if (c == -1) {
                clen = rb_enc_precise_mbclen(p, pend, enc);
                if (!MBCLEN_CHARFOUND_P(clen)) {
                    /* No character found here: emit this one byte as \xHH. */
                    c = (unsigned char)*p;
                    clen = 1;
                    goto hex;
                }
                if (resenc) {
                    /* NOTE(review): clen keeps the raw value returned by
                     * rb_enc_precise_mbclen() on this path and is later
                     * used for `p += clen` - confirm that this equals the
                     * character length. */
                    unsigned int c = rb_enc_mbc_to_codepoint(p, pend, enc);
                    rb_str_buf_cat_escaped_char(str, c, unicode_p);
                }
                else {
                    clen = MBCLEN_CHARFOUND_LEN(clen);
                    rb_str_buf_cat(str, p, clen);
                }
            }
            else if (rb_enc_isprint(c, enc)) {
                rb_str_buf_cat(str, p, clen);
            }
            else if (!rb_enc_isspace(c, enc)) {
                char b[8];

                /* Also entered by `goto hex` from the invalid-byte case. */
              hex:
                snprintf(b, sizeof(b), "\\x%02X", c);
                rb_str_buf_cat(str, b, 4);
            }
            else {
                /* ASCII white space is copied as is. */
                rb_str_buf_cat(str, p, clen);
            }
            p += clen;
        }
    }
}
00423 
00424 static VALUE
00425 rb_reg_desc(const char *s, long len, VALUE re)
00426 {
00427     rb_encoding *enc = rb_enc_get(re);
00428     VALUE str = rb_str_buf_new2("/");
00429     rb_encoding *resenc = rb_default_internal_encoding();
00430     if (resenc == NULL) resenc = rb_default_external_encoding();
00431 
00432     if (re && rb_enc_asciicompat(enc)) {
00433         rb_enc_copy(str, re);
00434     }
00435     else {
00436         rb_enc_associate(str, rb_usascii_encoding());
00437     }
00438     rb_reg_expr_str(str, s, len, enc, resenc);
00439     rb_str_buf_cat2(str, "/");
00440     if (re) {
00441         char opts[4];
00442         rb_reg_check(re);
00443         if (*option_to_str(opts, RREGEXP(re)->ptr->options))
00444             rb_str_buf_cat2(str, opts);
00445         if (RBASIC(re)->flags & REG_ENCODING_NONE)
00446             rb_str_buf_cat2(str, "n");
00447     }
00448     OBJ_INFECT(str, re);
00449     return str;
00450 }
00451 
00452 
00453 /*
00454  *  call-seq:
00455  *      rxp.source   -> str
00456  *
00457  *  Returns the original string of the pattern.
00458  *
00459  *      /ab+c/ix.source #=> "ab+c"
00460  *
00461  *  Note that escape sequences are retained as is.
00462  *
00463  *     /\x20\+/.source  #=> "\\x20\\+"
00464  *
00465  */
00466 
00467 static VALUE
00468 rb_reg_source(VALUE re)
00469 {
00470     VALUE str;
00471 
00472     rb_reg_check(re);
00473     str = rb_enc_str_new(RREGEXP_SRC_PTR(re),RREGEXP_SRC_LEN(re), rb_enc_get(re));
00474     if (OBJ_TAINTED(re)) OBJ_TAINT(str);
00475     return str;
00476 }
00477 
00478 /*
00479  * call-seq:
00480  *    rxp.inspect   -> string
00481  *
 * Produces a nicely formatted string version of _rxp_. Perhaps surprisingly,
 * <code>#inspect</code> actually produces a more natural version of
 * the string than <code>#to_s</code> does.
00485  *
00486  *      /ab+c/ix.inspect        #=> "/ab+c/ix"
00487  *
00488  */
00489 
00490 static VALUE
00491 rb_reg_inspect(VALUE re)
00492 {
00493     if (!RREGEXP(re)->ptr || !RREGEXP_SRC(re) || !RREGEXP_SRC_PTR(re)) {
00494         return rb_any_to_s(re);
00495     }
00496     return rb_reg_desc(RREGEXP_SRC_PTR(re), RREGEXP_SRC_LEN(re), re);
00497 }
00498 
00499 
00500 /*
00501  *  call-seq:
00502  *     rxp.to_s   -> str
00503  *
 *  Returns a string containing the regular expression and its options (using the
 *  <code>(?opts:source)</code> notation). This string can be fed back in to
 *  <code>Regexp::new</code> to produce a regular expression with the same semantics as
00507  *  the original. (However, <code>Regexp#==</code> may not return true when
00508  *  comparing the two, as the source of the regular expression itself may
00509  *  differ, as the example shows).  <code>Regexp#inspect</code> produces a
00510  *  generally more readable version of <i>rxp</i>.
00511  *
00512  *      r1 = /ab+c/ix           #=> /ab+c/ix
00513  *      s1 = r1.to_s            #=> "(?ix-m:ab+c)"
00514  *      r2 = Regexp.new(s1)     #=> /(?ix-m:ab+c)/
00515  *      r1 == r2                #=> false
00516  *      r1.source               #=> "ab+c"
00517  *      r2.source               #=> "(?ix-m:ab+c)"
00518  */
00519 
static VALUE
rb_reg_to_s(VALUE re)
{
    int options, opt;
    /* The option bits that can be written inside a (?...) group. */
    const int embeddable = ONIG_OPTION_MULTILINE|ONIG_OPTION_IGNORECASE|ONIG_OPTION_EXTEND;
    long len;
    const UChar* ptr;
    VALUE str = rb_str_buf_new2("(?");
    /* Room for '-' + up to three option letters + NUL. */
    char optbuf[5];
    rb_encoding *enc = rb_enc_get(re);

    rb_reg_check(re);

    rb_enc_copy(str, re);
    options = RREGEXP(re)->ptr->options;
    ptr = (UChar*)RREGEXP_SRC_PTR(re);
    len = RREGEXP_SRC_LEN(re);
    /*
     * If the source itself starts with an option group, "(?opts-opts)" or
     * "(?opts-opts:...)", fold those options into `options` and strip the
     * group so the output does not nest a redundant group.
     */
  again:
    if (len >= 4 && ptr[0] == '(' && ptr[1] == '?') {
        int err = 1;
        ptr += 2;
        if ((len -= 2) > 0) {
            /* Letters before '-' switch options on. */
            do {
                opt = char_to_option((int )*ptr);
                if (opt != 0) {
                    options |= opt;
                }
                else {
                    break;
                }
                ++ptr;
            } while (--len > 0);
        }
        if (len > 1 && *ptr == '-') {
            ++ptr;
            --len;
            /* Letters after '-' switch options off. */
            do {
                opt = char_to_option((int )*ptr);
                if (opt != 0) {
                    options &= ~opt;
                }
                else {
                    break;
                }
                ++ptr;
            } while (--len > 0);
        }
        if (*ptr == ')') {
            /* "(?opts)" prefix: skip it and look for another one. */
            --len;
            ++ptr;
            goto again;
        }
        if (*ptr == ':' && ptr[len-1] == ')') {
            Regexp *rp;

            /* "(?opts:body)": drop the ':' and the final ')' and accept the
             * stripped form only if body compiles on its own. */
            ++ptr;
            len -= 2;
            err = onig_new(&rp, ptr, ptr + len, ONIG_OPTION_DEFAULT,
                           enc, OnigDefaultSyntax, NULL);
            onig_free(rp);
        }
        if (err) {
            /* Could not strip the prefix: fall back to the full source and
             * the regexp's own options. */
            options = RREGEXP(re)->ptr->options;
            ptr = (UChar*)RREGEXP_SRC_PTR(re);
            len = RREGEXP_SRC_LEN(re);
        }
    }

    /* Options that are on ... */
    if (*option_to_str(optbuf, options)) rb_str_buf_cat2(str, optbuf);

    /* ... followed by "-" and the ones that are off, unless all are on. */
    if ((options & embeddable) != embeddable) {
        optbuf[0] = '-';
        option_to_str(optbuf + 1, ~options);
        rb_str_buf_cat2(str, optbuf);
    }

    rb_str_buf_cat2(str, ":");
    rb_reg_expr_str(str, (char*)ptr, len, enc, NULL);
    rb_str_buf_cat2(str, ")");
    rb_enc_copy(str, re);

    OBJ_INFECT(str, re);
    return str;
}
00604 
00605 static void
00606 rb_reg_raise(const char *s, long len, const char *err, VALUE re)
00607 {
00608     volatile VALUE desc = rb_reg_desc(s, len, re);
00609 
00610     rb_raise(rb_eRegexpError, "%s: %s", err, RSTRING_PTR(desc));
00611 }
00612 
00613 static VALUE
00614 rb_enc_reg_error_desc(const char *s, long len, rb_encoding *enc, int options, const char *err)
00615 {
00616     char opts[6];
00617     VALUE desc = rb_str_buf_new2(err);
00618     rb_encoding *resenc = rb_default_internal_encoding();
00619     if (resenc == NULL) resenc = rb_default_external_encoding();
00620 
00621     rb_enc_associate(desc, enc);
00622     rb_str_buf_cat2(desc, ": /");
00623     rb_reg_expr_str(desc, s, len, enc, resenc);
00624     opts[0] = '/';
00625     option_to_str(opts + 1, options);
00626     rb_str_buf_cat2(desc, opts);
00627     return rb_exc_new3(rb_eRegexpError, desc);
00628 }
00629 
00630 static void
00631 rb_enc_reg_raise(const char *s, long len, rb_encoding *enc, int options, const char *err)
00632 {
00633     rb_exc_raise(rb_enc_reg_error_desc(s, len, enc, options, err));
00634 }
00635 
00636 static VALUE
00637 rb_reg_error_desc(VALUE str, int options, const char *err)
00638 {
00639     return rb_enc_reg_error_desc(RSTRING_PTR(str), RSTRING_LEN(str),
00640                                  rb_enc_get(str), options, err);
00641 }
00642 
00643 static void
00644 rb_reg_raise_str(VALUE str, int options, const char *err)
00645 {
00646     rb_exc_raise(rb_reg_error_desc(str, options, err));
00647 }
00648 
00649 
00650 /*
00651  *  call-seq:
00652  *     rxp.casefold?   -> true or false
00653  *
00654  *  Returns the value of the case-insensitive flag.
00655  *
00656  *      /a/.casefold?           #=> false
00657  *      /a/i.casefold?          #=> true
00658  *      /(?i:a)/.casefold?      #=> false
00659  */
00660 
00661 static VALUE
00662 rb_reg_casefold_p(VALUE re)
00663 {
00664     rb_reg_check(re);
00665     if (RREGEXP(re)->ptr->options & ONIG_OPTION_IGNORECASE) return Qtrue;
00666     return Qfalse;
00667 }
00668 
00669 
00670 /*
00671  *  call-seq:
00672  *     rxp.options   -> fixnum
00673  *
 *  Returns the set of bits corresponding to the options used when creating this
 *  Regexp (see <code>Regexp::new</code> for details). Note that additional bits
00676  *  may be set in the returned options: these are used internally by the regular
00677  *  expression code. These extra bits are ignored if the options are passed to
00678  *  <code>Regexp::new</code>.
00679  *
00680  *     Regexp::IGNORECASE                  #=> 1
00681  *     Regexp::EXTENDED                    #=> 2
00682  *     Regexp::MULTILINE                   #=> 4
00683  *
00684  *     /cat/.options                       #=> 0
00685  *     /cat/ix.options                     #=> 3
00686  *     Regexp.new('cat', true).options     #=> 1
00687  *     /\xa1\xa2/e.options                 #=> 16
00688  *
00689  *     r = /cat/ix
00690  *     Regexp.new(r.source, r.options)     #=> /cat/ix
00691  */
00692 
00693 static VALUE
00694 rb_reg_options_m(VALUE re)
00695 {
00696     int options = rb_reg_options(re);
00697     return INT2NUM(options);
00698 }
00699 
00700 static int
00701 reg_names_iter(const OnigUChar *name, const OnigUChar *name_end,
00702           int back_num, int *back_refs, OnigRegex regex, void *arg)
00703 {
00704     VALUE ary = (VALUE)arg;
00705     rb_ary_push(ary, rb_str_new((const char *)name, name_end-name));
00706     return 0;
00707 }
00708 
00709 /*
00710  * call-seq:
00711  *    rxp.names   -> [name1, name2, ...]
00712  *
00713  * Returns a list of names of captures as an array of strings.
00714  *
00715  *     /(?<foo>.)(?<bar>.)(?<baz>.)/.names
00716  *     #=> ["foo", "bar", "baz"]
00717  *
00718  *     /(?<foo>.)(?<foo>.)/.names
00719  *     #=> ["foo"]
00720  *
00721  *     /(.)(.)/.names
00722  *     #=> []
00723  */
00724 
00725 static VALUE
00726 rb_reg_names(VALUE re)
00727 {
00728     VALUE ary = rb_ary_new();
00729     rb_reg_check(re);
00730     onig_foreach_name(RREGEXP(re)->ptr, reg_names_iter, (void*)ary);
00731     return ary;
00732 }
00733 
00734 static int
00735 reg_named_captures_iter(const OnigUChar *name, const OnigUChar *name_end,
00736           int back_num, int *back_refs, OnigRegex regex, void *arg)
00737 {
00738     VALUE hash = (VALUE)arg;
00739     VALUE ary = rb_ary_new2(back_num);
00740     int i;
00741 
00742     for (i = 0; i < back_num; i++)
00743         rb_ary_store(ary, i, INT2NUM(back_refs[i]));
00744 
00745     rb_hash_aset(hash, rb_str_new((const char*)name, name_end-name),ary);
00746 
00747     return 0;
00748 }
00749 
00750 /*
00751  * call-seq:
00752  *    rxp.named_captures  -> hash
00753  *
00754  * Returns a hash representing information about named captures of <i>rxp</i>.
00755  *
 * Each key of the hash is the name of a named capture.
 * Each value of the hash is an array listing the indexes of the
 * corresponding named captures.
00759  *
00760  *    /(?<foo>.)(?<bar>.)/.named_captures
00761  *    #=> {"foo"=>[1], "bar"=>[2]}
00762  *
00763  *    /(?<foo>.)(?<foo>.)/.named_captures
00764  *    #=> {"foo"=>[1, 2]}
00765  *
00766  * If there are no named captures, an empty hash is returned.
00767  *
00768  *    /(.)(.)/.named_captures
00769  *    #=> {}
00770  */
00771 
00772 static VALUE
00773 rb_reg_named_captures(VALUE re)
00774 {
00775     VALUE hash = rb_hash_new();
00776     rb_reg_check(re);
00777     onig_foreach_name(RREGEXP(re)->ptr, reg_named_captures_iter, (void*)hash);
00778     return hash;
00779 }
00780 
00781 static int
00782 onig_new_with_source(regex_t** reg, const UChar* pattern, const UChar* pattern_end,
00783           OnigOptionType option, OnigEncoding enc, const OnigSyntaxType* syntax,
00784           OnigErrorInfo* einfo, const char *sourcefile, int sourceline)
00785 {
00786   int r;
00787 
00788   *reg = (regex_t* )malloc(sizeof(regex_t));
00789   if (IS_NULL(*reg)) return ONIGERR_MEMORY;
00790 
00791   r = onig_reg_init(*reg, option, ONIGENC_CASE_FOLD_DEFAULT, enc, syntax);
00792   if (r) goto err;
00793 
00794   r = onig_compile(*reg, pattern, pattern_end, einfo, sourcefile, sourceline);
00795   if (r) {
00796   err:
00797     onig_free(*reg);
00798     *reg = NULL;
00799   }
00800   return r;
00801 }
00802 
00803 static Regexp*
00804 make_regexp(const char *s, long len, rb_encoding *enc, int flags, onig_errmsg_buffer err,
00805         const char *sourcefile, int sourceline)
00806 {
00807     Regexp *rp;
00808     int r;
00809     OnigErrorInfo einfo;
00810 
00811     /* Handle escaped characters first. */
00812 
00813     /* Build a copy of the string (in dest) with the
00814        escaped characters translated,  and generate the regex
00815        from that.
00816     */
00817 
00818     r = onig_new_with_source(&rp, (UChar*)s, (UChar*)(s + len), flags,
00819                  enc, OnigDefaultSyntax, &einfo, sourcefile, sourceline);
00820     if (r) {
00821         onig_error_code_to_str((UChar*)err, r, &einfo);
00822         return 0;
00823     }
00824     return rp;
00825 }
00826 
00827 
00828 /*
00829  *  Document-class: MatchData
00830  *
00831  *  <code>MatchData</code> is the type of the special variable <code>$~</code>,
00832  *  and is the type of the object returned by <code>Regexp#match</code> and
00833  *  <code>Regexp.last_match</code>. It encapsulates all the results of a pattern
00834  *  match, results normally accessed through the special variables
00835  *  <code>$&</code>, <code>$'</code>, <code>$`</code>, <code>$1</code>,
00836  *  <code>$2</code>, and so on.
00837  *
00838  */
00839 
/* Global holding the MatchData class object (name per the Document-class
 * comment; it is not assigned anywhere in this part of the file). */
VALUE rb_cMatch;
00841 
00842 static VALUE
00843 match_alloc(VALUE klass)
00844 {
00845     NEWOBJ_OF(match, struct RMatch, klass, T_MATCH);
00846 
00847     match->str = 0;
00848     match->rmatch = 0;
00849     match->regexp = 0;
00850     match->rmatch = ALLOC(struct rmatch);
00851     MEMZERO(match->rmatch, struct rmatch, 1);
00852 
00853     return (VALUE)match;
00854 }
00855 
/* One boundary position of a match, as a byte offset and as the
 * corresponding character offset. */
typedef struct {
    long byte_pos;
    long char_pos;
} pair_t;

/*
 * qsort()/bsearch() comparator ordering pair_t entries by byte_pos.
 *
 * Returns -1, 0 or 1.  The two offsets are compared directly rather than
 * subtracted, so the result never depends on narrowing a long difference
 * to int or on the SIZEOF_LONG / SIZEOF_INT configuration macros.
 */
static int
pair_byte_cmp(const void *pair1, const void *pair2)
{
    const long lhs = ((const pair_t *)pair1)->byte_pos;
    const long rhs = ((const pair_t *)pair2)->byte_pos;

    return (lhs > rhs) - (lhs < rhs);
}
00871 
00872 static void
00873 update_char_offset(VALUE match)
00874 {
00875     struct rmatch *rm = RMATCH(match)->rmatch;
00876     struct re_registers *regs;
00877     int i, num_regs, num_pos;
00878     long c;
00879     char *s, *p, *q;
00880     rb_encoding *enc;
00881     pair_t *pairs;
00882 
00883     if (rm->char_offset_updated)
00884         return;
00885 
00886     regs = &rm->regs;
00887     num_regs = rm->regs.num_regs;
00888 
00889     if (rm->char_offset_num_allocated < num_regs) {
00890         REALLOC_N(rm->char_offset, struct rmatch_offset, num_regs);
00891         rm->char_offset_num_allocated = num_regs;
00892     }
00893 
00894     enc = rb_enc_get(RMATCH(match)->str);
00895     if (rb_enc_mbmaxlen(enc) == 1) {
00896         for (i = 0; i < num_regs; i++) {
00897             rm->char_offset[i].beg = BEG(i);
00898             rm->char_offset[i].end = END(i);
00899         }
00900         rm->char_offset_updated = 1;
00901         return;
00902     }
00903 
00904     pairs = ALLOCA_N(pair_t, num_regs*2);
00905     num_pos = 0;
00906     for (i = 0; i < num_regs; i++) {
00907         if (BEG(i) < 0)
00908             continue;
00909         pairs[num_pos++].byte_pos = BEG(i);
00910         pairs[num_pos++].byte_pos = END(i);
00911     }
00912     qsort(pairs, num_pos, sizeof(pair_t), pair_byte_cmp);
00913 
00914     s = p = RSTRING_PTR(RMATCH(match)->str);
00915     c = 0;
00916     for (i = 0; i < num_pos; i++) {
00917         q = s + pairs[i].byte_pos;
00918         c += rb_enc_strlen(p, q, enc);
00919         pairs[i].char_pos = c;
00920         p = q;
00921     }
00922 
00923     for (i = 0; i < num_regs; i++) {
00924         pair_t key, *found;
00925         if (BEG(i) < 0) {
00926             rm->char_offset[i].beg = -1;
00927             rm->char_offset[i].end = -1;
00928             continue;
00929         }
00930 
00931         key.byte_pos = BEG(i);
00932         found = bsearch(&key, pairs, num_pos, sizeof(pair_t), pair_byte_cmp);
00933         rm->char_offset[i].beg = found->char_pos;
00934 
00935         key.byte_pos = END(i);
00936         found = bsearch(&key, pairs, num_pos, sizeof(pair_t), pair_byte_cmp);
00937         rm->char_offset[i].end = found->char_pos;
00938     }
00939 
00940     rm->char_offset_updated = 1;
00941 }
00942 
00943 static void
00944 match_check(VALUE match)
00945 {
00946     if (!RMATCH(match)->regexp) {
00947         rb_raise(rb_eTypeError, "uninitialized Match");
00948     }
00949 }
00950 
00951 /* :nodoc: */
00952 static VALUE
00953 match_init_copy(VALUE obj, VALUE orig)
00954 {
00955     struct rmatch *rm;
00956 
00957     if (!OBJ_INIT_COPY(obj, orig)) return obj;
00958 
00959     RMATCH(obj)->str = RMATCH(orig)->str;
00960     RMATCH(obj)->regexp = RMATCH(orig)->regexp;
00961 
00962     rm = RMATCH(obj)->rmatch;
00963     onig_region_copy(&rm->regs, RMATCH_REGS(orig));
00964 
00965     if (!RMATCH(orig)->rmatch->char_offset_updated) {
00966         rm->char_offset_updated = 0;
00967     }
00968     else {
00969         if (rm->char_offset_num_allocated < rm->regs.num_regs) {
00970             REALLOC_N(rm->char_offset, struct rmatch_offset, rm->regs.num_regs);
00971             rm->char_offset_num_allocated = rm->regs.num_regs;
00972         }
00973         MEMCPY(rm->char_offset, RMATCH(orig)->rmatch->char_offset,
00974                struct rmatch_offset, rm->regs.num_regs);
00975         rm->char_offset_updated = 1;
00976     }
00977 
00978     return obj;
00979 }
00980 
00981 
00982 /*
00983  * call-seq:
00984  *    mtch.regexp   -> regexp
00985  *
00986  * Returns the regexp.
00987  *
00988  *     m = /a.*b/.match("abc")
00989  *     m.regexp #=> /a.*b/
00990  */
00991 
00992 static VALUE
00993 match_regexp(VALUE match)
00994 {
00995     match_check(match);
00996     return RMATCH(match)->regexp;
00997 }
00998 
00999 /*
01000  * call-seq:
01001  *    mtch.names   -> [name1, name2, ...]
01002  *
01003  * Returns a list of names of captures as an array of strings.
01004  * It is the same as mtch.regexp.names.
01005  *
01006  *     /(?<foo>.)(?<bar>.)(?<baz>.)/.match("hoge").names
01007  *     #=> ["foo", "bar", "baz"]
01008  *
01009  *     m = /(?<x>.)(?<y>.)?/.match("a") #=> #<MatchData "a" x:"a" y:nil>
01010  *     m.names                          #=> ["x", "y"]
01011  */
01012 
01013 static VALUE
01014 match_names(VALUE match)
01015 {
01016     match_check(match);
01017     return rb_reg_names(RMATCH(match)->regexp);
01018 }
01019 
01020 /*
01021  *  call-seq:
01022  *     mtch.length   -> integer
01023  *     mtch.size     -> integer
01024  *
01025  *  Returns the number of elements in the match array.
01026  *
01027  *     m = /(.)(.)(\d+)(\d)/.match("THX1138.")
01028  *     m.length   #=> 5
01029  *     m.size     #=> 5
01030  */
01031 
01032 static VALUE
01033 match_size(VALUE match)
01034 {
01035     match_check(match);
01036     return INT2FIX(RMATCH_REGS(match)->num_regs);
01037 }
01038 
01039 static int
01040 match_backref_number(VALUE match, VALUE backref)
01041 {
01042     const char *name;
01043     int num;
01044 
01045     struct re_registers *regs = RMATCH_REGS(match);
01046     VALUE regexp = RMATCH(match)->regexp;
01047 
01048     match_check(match);
01049     switch (TYPE(backref)) {
01050       default:
01051         return NUM2INT(backref);
01052 
01053       case T_SYMBOL:
01054         name = rb_id2name(SYM2ID(backref));
01055         break;
01056 
01057       case T_STRING:
01058         name = StringValueCStr(backref);
01059         break;
01060     }
01061 
01062     num = onig_name_to_backref_number(RREGEXP(regexp)->ptr,
01063               (const unsigned char*)name,
01064               (const unsigned char*)name + strlen(name),
01065               regs);
01066 
01067     if (num < 1) {
01068         rb_raise(rb_eIndexError, "undefined group name reference: %s", name);
01069     }
01070 
01071     return num;
01072 }
01073 
01074 int
01075 rb_reg_backref_number(VALUE match, VALUE backref)
01076 {
01077     return match_backref_number(match, backref);
01078 }
01079 
01080 /*
01081  *  call-seq:
01082  *     mtch.offset(n)   -> array
01083  *
01084  *  Returns a two-element array containing the beginning and ending offsets of
01085  *  the <em>n</em>th match.
01086  *  <em>n</em> can be a string or symbol to reference a named capture.
01087  *
01088  *     m = /(.)(.)(\d+)(\d)/.match("THX1138.")
01089  *     m.offset(0)      #=> [1, 7]
01090  *     m.offset(4)      #=> [6, 7]
01091  *
01092  *     m = /(?<foo>.)(.)(?<bar>.)/.match("hoge")
01093  *     p m.offset(:foo) #=> [0, 1]
01094  *     p m.offset(:bar) #=> [2, 3]
01095  *
01096  */
01097 
01098 static VALUE
01099 match_offset(VALUE match, VALUE n)
01100 {
01101     int i = match_backref_number(match, n);
01102     struct re_registers *regs = RMATCH_REGS(match);
01103 
01104     match_check(match);
01105     if (i < 0 || regs->num_regs <= i)
01106         rb_raise(rb_eIndexError, "index %d out of matches", i);
01107 
01108     if (BEG(i) < 0)
01109         return rb_assoc_new(Qnil, Qnil);
01110 
01111     update_char_offset(match);
01112     return rb_assoc_new(INT2FIX(RMATCH(match)->rmatch->char_offset[i].beg),
01113                         INT2FIX(RMATCH(match)->rmatch->char_offset[i].end));
01114 }
01115 
01116 
01117 /*
01118  *  call-seq:
01119  *     mtch.begin(n)   -> integer
01120  *
01121  *  Returns the offset of the start of the <em>n</em>th element of the match
01122  *  array in the string.
01123  *  <em>n</em> can be a string or symbol to reference a named capture.
01124  *
01125  *     m = /(.)(.)(\d+)(\d)/.match("THX1138.")
01126  *     m.begin(0)       #=> 1
01127  *     m.begin(2)       #=> 2
01128  *
01129  *     m = /(?<foo>.)(.)(?<bar>.)/.match("hoge")
01130  *     p m.begin(:foo)  #=> 0
01131  *     p m.begin(:bar)  #=> 2
01132  */
01133 
01134 static VALUE
01135 match_begin(VALUE match, VALUE n)
01136 {
01137     int i = match_backref_number(match, n);
01138     struct re_registers *regs = RMATCH_REGS(match);
01139 
01140     match_check(match);
01141     if (i < 0 || regs->num_regs <= i)
01142         rb_raise(rb_eIndexError, "index %d out of matches", i);
01143 
01144     if (BEG(i) < 0)
01145         return Qnil;
01146 
01147     update_char_offset(match);
01148     return INT2FIX(RMATCH(match)->rmatch->char_offset[i].beg);
01149 }
01150 
01151 
01152 /*
01153  *  call-seq:
01154  *     mtch.end(n)   -> integer
01155  *
01156  *  Returns the offset of the character immediately following the end of the
01157  *  <em>n</em>th element of the match array in the string.
01158  *  <em>n</em> can be a string or symbol to reference a named capture.
01159  *
01160  *     m = /(.)(.)(\d+)(\d)/.match("THX1138.")
01161  *     m.end(0)         #=> 7
01162  *     m.end(2)         #=> 3
01163  *
01164  *     m = /(?<foo>.)(.)(?<bar>.)/.match("hoge")
01165  *     p m.end(:foo)    #=> 1
01166  *     p m.end(:bar)    #=> 3
01167  */
01168 
01169 static VALUE
01170 match_end(VALUE match, VALUE n)
01171 {
01172     int i = match_backref_number(match, n);
01173     struct re_registers *regs = RMATCH_REGS(match);
01174 
01175     match_check(match);
01176     if (i < 0 || regs->num_regs <= i)
01177         rb_raise(rb_eIndexError, "index %d out of matches", i);
01178 
01179     if (BEG(i) < 0)
01180         return Qnil;
01181 
01182     update_char_offset(match);
01183     return INT2FIX(RMATCH(match)->rmatch->char_offset[i].end);
01184 }
01185 
01186 #define MATCH_BUSY FL_USER2
01187 
01188 void
01189 rb_match_busy(VALUE match)
01190 {
01191     FL_SET(match, MATCH_BUSY);
01192 }
01193 
01194 /*
01195  *  call-seq:
01196  *     rxp.fixed_encoding?   -> true or false
01197  *
01198  *  Returns false if rxp is applicable to
01199  *  a string with any ASCII compatible encoding.
01200  *  Returns true otherwise.
01201  *
01202  *      r = /a/
01203  *      r.fixed_encoding?                               #=> false
01204  *      r =~ "\u{6666} a"                               #=> 2
01205  *      r =~ "\xa1\xa2 a".force_encoding("euc-jp")      #=> 2
01206  *      r =~ "abc".force_encoding("euc-jp")             #=> 0
01207  *
01208  *      r = /a/u
01209  *      r.fixed_encoding?                               #=> true
01210  *      r.encoding                                      #=> #<Encoding:UTF-8>
01211  *      r =~ "\u{6666} a"                               #=> 2
01212  *      r =~ "\xa1\xa2".force_encoding("euc-jp")        #=> ArgumentError
01213  *      r =~ "abc".force_encoding("euc-jp")             #=> 0
01214  *
01215  *      r = /\u{6666}/
01216  *      r.fixed_encoding?                               #=> true
01217  *      r.encoding                                      #=> #<Encoding:UTF-8>
01218  *      r =~ "\u{6666} a"                               #=> 0
01219  *      r =~ "\xa1\xa2".force_encoding("euc-jp")        #=> ArgumentError
01220  *      r =~ "abc".force_encoding("euc-jp")             #=> nil
01221  */
01222 
01223 static VALUE
01224 rb_reg_fixed_encoding_p(VALUE re)
01225 {
01226     if (FL_TEST(re, KCODE_FIXED))
01227         return Qtrue;
01228     else
01229         return Qfalse;
01230 }
01231 
01232 static VALUE
01233 rb_reg_preprocess(const char *p, const char *end, rb_encoding *enc,
01234         rb_encoding **fixed_enc, onig_errmsg_buffer err);
01235 
01236 
01237 static void
01238 reg_enc_error(VALUE re, VALUE str)
01239 {
01240     rb_raise(rb_eEncCompatError,
01241              "incompatible encoding regexp match (%s regexp with %s string)",
01242              rb_enc_name(rb_enc_get(re)),
01243              rb_enc_name(rb_enc_get(str)));
01244 }
01245 
01246 static rb_encoding*
01247 rb_reg_prepare_enc(VALUE re, VALUE str, int warn)
01248 {
01249     rb_encoding *enc = 0;
01250 
01251     if (rb_enc_str_coderange(str) == ENC_CODERANGE_BROKEN) {
01252         rb_raise(rb_eArgError,
01253             "invalid byte sequence in %s",
01254             rb_enc_name(rb_enc_get(str)));
01255     }
01256 
01257     rb_reg_check(re);
01258     enc = rb_enc_get(str);
01259     if (!rb_enc_str_asciicompat_p(str)) {
01260         if (RREGEXP(re)->ptr->enc != enc) {
01261             reg_enc_error(re, str);
01262         }
01263     }
01264     else if (rb_reg_fixed_encoding_p(re)) {
01265         if (RREGEXP(re)->ptr->enc != enc &&
01266             (!rb_enc_asciicompat(RREGEXP(re)->ptr->enc) ||
01267              rb_enc_str_coderange(str) != ENC_CODERANGE_7BIT)) {
01268             reg_enc_error(re, str);
01269         }
01270         enc = RREGEXP(re)->ptr->enc;
01271     }
01272     if (warn && (RBASIC(re)->flags & REG_ENCODING_NONE) &&
01273         enc != rb_ascii8bit_encoding() &&
01274         rb_enc_str_coderange(str) != ENC_CODERANGE_7BIT) {
01275         rb_warn("regexp match /.../n against to %s string",
01276                 rb_enc_name(enc));
01277     }
01278     return enc;
01279 }
01280 
01281 regex_t *
01282 rb_reg_prepare_re(VALUE re, VALUE str)
01283 {
01284     regex_t *reg = RREGEXP(re)->ptr;
01285     onig_errmsg_buffer err = "";
01286     int r;
01287     OnigErrorInfo einfo;
01288     const char *pattern;
01289     VALUE unescaped;
01290     rb_encoding *fixed_enc = 0;
01291     rb_encoding *enc = rb_reg_prepare_enc(re, str, 1);
01292 
01293     if (reg->enc == enc) return reg;
01294 
01295     rb_reg_check(re);
01296     reg = RREGEXP(re)->ptr;
01297     pattern = RREGEXP_SRC_PTR(re);
01298 
01299     unescaped = rb_reg_preprocess(
01300         pattern, pattern + RREGEXP_SRC_LEN(re), enc,
01301         &fixed_enc, err);
01302 
01303     if (unescaped == Qnil) {
01304         rb_raise(rb_eArgError, "regexp preprocess failed: %s", err);
01305     }
01306 
01307     r = onig_new(&reg, (UChar* )RSTRING_PTR(unescaped),
01308                  (UChar* )(RSTRING_PTR(unescaped) + RSTRING_LEN(unescaped)),
01309                  reg->options, enc,
01310                  OnigDefaultSyntax, &einfo);
01311     if (r) {
01312         onig_error_code_to_str((UChar*)err, r, &einfo);
01313         rb_reg_raise(pattern, RREGEXP_SRC_LEN(re), err, re);
01314     }
01315 
01316     RB_GC_GUARD(unescaped);
01317     return reg;
01318 }
01319 
01320 long
01321 rb_reg_adjust_startpos(VALUE re, VALUE str, long pos, int reverse)
01322 {
01323     long range;
01324     rb_encoding *enc;
01325     UChar *p, *string;
01326 
01327     enc = rb_reg_prepare_enc(re, str, 0);
01328 
01329     if (reverse) {
01330         range = -pos;
01331     }
01332     else {
01333         range = RSTRING_LEN(str) - pos;
01334     }
01335 
01336     if (pos > 0 && ONIGENC_MBC_MAXLEN(enc) != 1 && pos < RSTRING_LEN(str)) {
01337          string = (UChar*)RSTRING_PTR(str);
01338 
01339          if (range > 0) {
01340               p = onigenc_get_right_adjust_char_head(enc, string, string + pos, string + RSTRING_LEN(str));
01341          }
01342          else {
01343               p = ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, string, string + pos, string + RSTRING_LEN(str));
01344          }
01345          return p - string;
01346     }
01347 
01348     return pos;
01349 }
01350 
01351 long
01352 rb_reg_search(VALUE re, VALUE str, long pos, int reverse)
01353 {
01354     long result;
01355     VALUE match;
01356     struct re_registers regi, *regs = &regi;
01357     char *range = RSTRING_PTR(str);
01358     regex_t *reg;
01359     int tmpreg;
01360 
01361     if (pos > RSTRING_LEN(str) || pos < 0) {
01362         rb_backref_set(Qnil);
01363         return -1;
01364     }
01365 
01366     reg = rb_reg_prepare_re(re, str);
01367     tmpreg = reg != RREGEXP(re)->ptr;
01368     if (!tmpreg) RREGEXP(re)->usecnt++;
01369 
01370     match = rb_backref_get();
01371     if (!NIL_P(match)) {
01372         if (FL_TEST(match, MATCH_BUSY)) {
01373             match = Qnil;
01374         }
01375         else {
01376             regs = RMATCH_REGS(match);
01377         }
01378     }
01379     if (NIL_P(match)) {
01380         MEMZERO(regs, struct re_registers, 1);
01381     }
01382     if (!reverse) {
01383         range += RSTRING_LEN(str);
01384     }
01385     result = onig_search(reg,
01386                          (UChar*)(RSTRING_PTR(str)),
01387                          ((UChar*)(RSTRING_PTR(str)) + RSTRING_LEN(str)),
01388                          ((UChar*)(RSTRING_PTR(str)) + pos),
01389                          ((UChar*)range),
01390                          regs, ONIG_OPTION_NONE);
01391     if (!tmpreg) RREGEXP(re)->usecnt--;
01392     if (tmpreg) {
01393         if (RREGEXP(re)->usecnt) {
01394             onig_free(reg);
01395         }
01396         else {
01397             onig_free(RREGEXP(re)->ptr);
01398             RREGEXP(re)->ptr = reg;
01399         }
01400     }
01401     if (result < 0) {
01402         if (regs == &regi)
01403             onig_region_free(regs, 0);
01404         if (result == ONIG_MISMATCH) {
01405             rb_backref_set(Qnil);
01406             return result;
01407         }
01408         else {
01409             onig_errmsg_buffer err = "";
01410             onig_error_code_to_str((UChar*)err, (int)result);
01411             rb_reg_raise(RREGEXP_SRC_PTR(re), RREGEXP_SRC_LEN(re), err, re);
01412         }
01413     }
01414 
01415     if (NIL_P(match)) {
01416         match = match_alloc(rb_cMatch);
01417         onig_region_copy(RMATCH_REGS(match), regs);
01418         onig_region_free(regs, 0);
01419     }
01420     else {
01421         if (rb_safe_level() >= 3)
01422             OBJ_TAINT(match);
01423         else
01424             FL_UNSET(match, FL_TAINT);
01425     }
01426 
01427     RMATCH(match)->str = rb_str_new4(str);
01428     RMATCH(match)->regexp = re;
01429     RMATCH(match)->rmatch->char_offset_updated = 0;
01430     rb_backref_set(match);
01431 
01432     OBJ_INFECT(match, re);
01433     OBJ_INFECT(match, str);
01434 
01435     return result;
01436 }
01437 
01438 VALUE
01439 rb_reg_nth_defined(int nth, VALUE match)
01440 {
01441     struct re_registers *regs;
01442     if (NIL_P(match)) return Qnil;
01443     match_check(match);
01444     regs = RMATCH_REGS(match);
01445     if (nth >= regs->num_regs) {
01446         return Qnil;
01447     }
01448     if (nth < 0) {
01449         nth += regs->num_regs;
01450         if (nth <= 0) return Qnil;
01451     }
01452     if (BEG(nth) == -1) return Qfalse;
01453     return Qtrue;
01454 }
01455 
01456 VALUE
01457 rb_reg_nth_match(int nth, VALUE match)
01458 {
01459     VALUE str;
01460     long start, end, len;
01461     struct re_registers *regs;
01462 
01463     if (NIL_P(match)) return Qnil;
01464     match_check(match);
01465     regs = RMATCH_REGS(match);
01466     if (nth >= regs->num_regs) {
01467         return Qnil;
01468     }
01469     if (nth < 0) {
01470         nth += regs->num_regs;
01471         if (nth <= 0) return Qnil;
01472     }
01473     start = BEG(nth);
01474     if (start == -1) return Qnil;
01475     end = END(nth);
01476     len = end - start;
01477     str = rb_str_subseq(RMATCH(match)->str, start, len);
01478     OBJ_INFECT(str, match);
01479     return str;
01480 }
01481 
01482 VALUE
01483 rb_reg_last_match(VALUE match)
01484 {
01485     return rb_reg_nth_match(0, match);
01486 }
01487 
01488 
01489 /*
01490  *  call-seq:
01491  *     mtch.pre_match   -> str
01492  *
01493  *  Returns the portion of the original string before the current match.
01494  *  Equivalent to the special variable <code>$`</code>.
01495  *
01496  *     m = /(.)(.)(\d+)(\d)/.match("THX1138.")
01497  *     m.pre_match   #=> "T"
01498  */
01499 
01500 VALUE
01501 rb_reg_match_pre(VALUE match)
01502 {
01503     VALUE str;
01504     struct re_registers *regs;
01505 
01506     if (NIL_P(match)) return Qnil;
01507     match_check(match);
01508     regs = RMATCH_REGS(match);
01509     if (BEG(0) == -1) return Qnil;
01510     str = rb_str_subseq(RMATCH(match)->str, 0, BEG(0));
01511     if (OBJ_TAINTED(match)) OBJ_TAINT(str);
01512     return str;
01513 }
01514 
01515 
01516 /*
01517  *  call-seq:
01518  *     mtch.post_match   -> str
01519  *
01520  *  Returns the portion of the original string after the current match.
01521  *  Equivalent to the special variable <code>$'</code>.
01522  *
01523  *     m = /(.)(.)(\d+)(\d)/.match("THX1138: The Movie")
01524  *     m.post_match   #=> ": The Movie"
01525  */
01526 
01527 VALUE
01528 rb_reg_match_post(VALUE match)
01529 {
01530     VALUE str;
01531     long pos;
01532     struct re_registers *regs;
01533 
01534     if (NIL_P(match)) return Qnil;
01535     match_check(match);
01536     regs = RMATCH_REGS(match);
01537     if (BEG(0) == -1) return Qnil;
01538     str = RMATCH(match)->str;
01539     pos = END(0);
01540     str = rb_str_subseq(str, pos, RSTRING_LEN(str) - pos);
01541     if (OBJ_TAINTED(match)) OBJ_TAINT(str);
01542     return str;
01543 }
01544 
01545 VALUE
01546 rb_reg_match_last(VALUE match)
01547 {
01548     int i;
01549     struct re_registers *regs;
01550 
01551     if (NIL_P(match)) return Qnil;
01552     match_check(match);
01553     regs = RMATCH_REGS(match);
01554     if (BEG(0) == -1) return Qnil;
01555 
01556     for (i=regs->num_regs-1; BEG(i) == -1 && i > 0; i--)
01557         ;
01558     if (i == 0) return Qnil;
01559     return rb_reg_nth_match(i, match);
01560 }
01561 
01562 static VALUE
01563 last_match_getter(void)
01564 {
01565     return rb_reg_last_match(rb_backref_get());
01566 }
01567 
01568 static VALUE
01569 prematch_getter(void)
01570 {
01571     return rb_reg_match_pre(rb_backref_get());
01572 }
01573 
01574 static VALUE
01575 postmatch_getter(void)
01576 {
01577     return rb_reg_match_post(rb_backref_get());
01578 }
01579 
01580 static VALUE
01581 last_paren_match_getter(void)
01582 {
01583     return rb_reg_match_last(rb_backref_get());
01584 }
01585 
01586 static VALUE
01587 match_array(VALUE match, int start)
01588 {
01589     struct re_registers *regs;
01590     VALUE ary;
01591     VALUE target;
01592     int i;
01593     int taint = OBJ_TAINTED(match);
01594 
01595     match_check(match);
01596     regs = RMATCH_REGS(match);
01597     ary = rb_ary_new2(regs->num_regs);
01598     target = RMATCH(match)->str;
01599 
01600     for (i=start; i<regs->num_regs; i++) {
01601         if (regs->beg[i] == -1) {
01602             rb_ary_push(ary, Qnil);
01603         }
01604         else {
01605             VALUE str = rb_str_subseq(target, regs->beg[i], regs->end[i]-regs->beg[i]);
01606             if (taint) OBJ_TAINT(str);
01607             rb_ary_push(ary, str);
01608         }
01609     }
01610     return ary;
01611 }
01612 
01613 
01614 /* [MG]:FIXME: I put parens around the /.../.match() in the first line of the
01615    second example to prevent the '*' followed by a '/' from ending the
01616    comment. */
01617 
01618 /*
01619  *  call-seq:
01620  *     mtch.to_a   -> anArray
01621  *
01622  *  Returns the array of matches.
01623  *
01624  *     m = /(.)(.)(\d+)(\d)/.match("THX1138.")
01625  *     m.to_a   #=> ["HX1138", "H", "X", "113", "8"]
01626  *
01627  *  Because <code>to_a</code> is called when expanding
01628  *  <code>*</code><em>variable</em>, there's a useful assignment
01629  *  shortcut for extracting matched fields. This is slightly slower than
01630  *  accessing the fields directly (as an intermediate array is
01631  *  generated).
01632  *
01633  *     all,f1,f2,f3 = *(/(.)(.)(\d+)(\d)/.match("THX1138."))
01634  *     all   #=> "HX1138"
01635  *     f1    #=> "H"
01636  *     f2    #=> "X"
01637  *     f3    #=> "113"
01638  */
01639 
01640 static VALUE
01641 match_to_a(VALUE match)
01642 {
01643     return match_array(match, 0);
01644 }
01645 
01646 
01647 /*
01648  *  call-seq:
01649  *     mtch.captures   -> array
01650  *
01651  *  Returns the array of captures; equivalent to <code>mtch.to_a[1..-1]</code>.
01652  *
01653  *     f1,f2,f3,f4 = /(.)(.)(\d+)(\d)/.match("THX1138.").captures
01654  *     f1    #=> "H"
01655  *     f2    #=> "X"
01656  *     f3    #=> "113"
01657  *     f4    #=> "8"
01658  */
01659 static VALUE
01660 match_captures(VALUE match)
01661 {
01662     return match_array(match, 1);
01663 }
01664 
01665 static int
01666 name_to_backref_number(struct re_registers *regs, VALUE regexp, const char* name, const char* name_end)
01667 {
01668     int num;
01669 
01670     num = onig_name_to_backref_number(RREGEXP(regexp)->ptr,
01671         (const unsigned char* )name, (const unsigned char* )name_end, regs);
01672     if (num >= 1) {
01673         return num;
01674     }
01675     else {
01676         VALUE s = rb_str_new(name, (long )(name_end - name));
01677         rb_raise(rb_eIndexError, "undefined group name reference: %s",
01678                                  StringValuePtr(s));
01679     }
01680 
01681     UNREACHABLE;
01682 }
01683 
01684 /*
01685  *  call-seq:
01686  *     mtch[i]               -> str or nil
01687  *     mtch[start, length]   -> array
01688  *     mtch[range]           -> array
01689  *     mtch[name]            -> str or nil
01690  *
01691  *  Match Reference -- <code>MatchData</code> acts as an array, and may be
01692  *  accessed using the normal array indexing techniques.  <code>mtch[0]</code>
01693  *  is equivalent to the special variable <code>$&</code>, and returns the
01694  *  entire matched string.  <code>mtch[1]</code>, <code>mtch[2]</code>, and so
01695  *  on return the values of the matched backreferences (portions of the
01696  *  pattern between parentheses).
01697  *
01698  *     m = /(.)(.)(\d+)(\d)/.match("THX1138.")
01699  *     m          #=> #<MatchData "HX1138" 1:"H" 2:"X" 3:"113" 4:"8">
01700  *     m[0]       #=> "HX1138"
01701  *     m[1, 2]    #=> ["H", "X"]
01702  *     m[1..3]    #=> ["H", "X", "113"]
01703  *     m[-3, 2]   #=> ["X", "113"]
01704  *
01705  *     m = /(?<foo>a+)b/.match("ccaaab")
01706  *     m          #=> #<MatchData "aaab" foo:"aaa">
01707  *     m["foo"]   #=> "aaa"
01708  *     m[:foo]    #=> "aaa"
01709  */
01710 
01711 static VALUE
01712 match_aref(int argc, VALUE *argv, VALUE match)
01713 {
01714     VALUE idx, rest;
01715 
01716     match_check(match);
01717     rb_scan_args(argc, argv, "11", &idx, &rest);
01718 
01719     if (NIL_P(rest)) {
01720         if (FIXNUM_P(idx)) {
01721             if (FIX2INT(idx) >= 0) {
01722                 return rb_reg_nth_match(FIX2INT(idx), match);
01723             }
01724         }
01725         else {
01726             const char *p;
01727             int num;
01728 
01729             switch (TYPE(idx)) {
01730               case T_SYMBOL:
01731                 p = rb_id2name(SYM2ID(idx));
01732                 goto name_to_backref;
01733                 break;
01734               case T_STRING:
01735                 p = StringValuePtr(idx);
01736 
01737               name_to_backref:
01738                 num = name_to_backref_number(RMATCH_REGS(match),
01739                                              RMATCH(match)->regexp, p, p + strlen(p));
01740                 return rb_reg_nth_match(num, match);
01741                 break;
01742 
01743               default:
01744                 break;
01745             }
01746         }
01747     }
01748 
01749     return rb_ary_aref(argc, argv, match_to_a(match));
01750 }
01751 
01752 static VALUE
01753 match_entry(VALUE match, long n)
01754 {
01755     /* n should not exceed num_regs */
01756     return rb_reg_nth_match((int)n, match);
01757 }
01758 
01759 
01760 /*
01761  *  call-seq:
01762  *
01763  *     mtch.values_at([index]*)   -> array
01764  *
01765  *  Uses each <i>index</i> to access the matching values, returning an array of
01766  *  the corresponding matches.
01767  *
01768  *     m = /(.)(.)(\d+)(\d)/.match("THX1138: The Movie")
01769  *     m.to_a               #=> ["HX1138", "H", "X", "113", "8"]
01770  *     m.values_at(0, 2, -2)   #=> ["HX1138", "X", "113"]
01771  */
01772 
01773 static VALUE
01774 match_values_at(int argc, VALUE *argv, VALUE match)
01775 {
01776     struct re_registers *regs;
01777 
01778     match_check(match);
01779     regs = RMATCH_REGS(match);
01780     return rb_get_values_at(match, regs->num_regs, argc, argv, match_entry);
01781 }
01782 
01783 
01784 /*
01785  *  call-seq:
01786  *     mtch.to_s   -> str
01787  *
01788  *  Returns the entire matched string.
01789  *
01790  *     m = /(.)(.)(\d+)(\d)/.match("THX1138.")
01791  *     m.to_s   #=> "HX1138"
01792  */
01793 
01794 static VALUE
01795 match_to_s(VALUE match)
01796 {
01797     VALUE str = rb_reg_last_match(match);
01798 
01799     match_check(match);
01800     if (NIL_P(str)) str = rb_str_new(0,0);
01801     if (OBJ_TAINTED(match)) OBJ_TAINT(str);
01802     if (OBJ_TAINTED(RMATCH(match)->str)) OBJ_TAINT(str);
01803     return str;
01804 }
01805 
01806 
01807 /*
01808  *  call-seq:
01809  *     mtch.string   -> str
01810  *
01811  *  Returns a frozen copy of the string passed in to <code>match</code>.
01812  *
01813  *     m = /(.)(.)(\d+)(\d)/.match("THX1138.")
01814  *     m.string   #=> "THX1138."
01815  */
01816 
01817 static VALUE
01818 match_string(VALUE match)
01819 {
01820     match_check(match);
01821     return RMATCH(match)->str;  /* str is frozen */
01822 }
01823 
/* Group name recorded per register index; filled in by match_inspect_name_iter() and read by match_inspect(). */
01824 struct backref_name_tag {
01825     const UChar *name; /* start of the name bytes; match_inspect() treats a null pointer as "unnamed" */
01826     long len; /* length of the name, stored as name_end - name */
01827 };
01828 
01829 static int
01830 match_inspect_name_iter(const OnigUChar *name, const OnigUChar *name_end,
01831           int back_num, int *back_refs, OnigRegex regex, void *arg0)
01832 {
01833     struct backref_name_tag *arg = (struct backref_name_tag *)arg0;
01834     int i;
01835 
01836     for (i = 0; i < back_num; i++) {
01837         arg[back_refs[i]].name = name;
01838         arg[back_refs[i]].len = name_end - name;
01839     }
01840     return 0;
01841 }
01842 
01843 /*
01844  * call-seq:
01845  *    mtch.inspect   -> str
01846  *
01847  * Returns a printable version of <i>mtch</i>.
01848  *
01849  *     puts /.$/.match("foo").inspect
01850  *     #=> #<MatchData "o">
01851  *
01852  *     puts /(.)(.)(.)/.match("foo").inspect
01853  *     #=> #<MatchData "foo" 1:"f" 2:"o" 3:"o">
01854  *
01855  *     puts /(.)(.)?(.)/.match("fo").inspect
01856  *     #=> #<MatchData "fo" 1:"f" 2:nil 3:"o">
01857  *
01858  *     puts /(?<foo>.)(?<bar>.)(?<baz>.)/.match("hoge").inspect
01859  *     #=> #<MatchData "hog" foo:"h" bar:"o" baz:"g">
01860  *
01861  */
01862 
01863 static VALUE
01864 match_inspect(VALUE match)
01865 {
01866     const char *cname = rb_obj_classname(match);
01867     VALUE str;
01868     int i;
01869     struct re_registers *regs = RMATCH_REGS(match);
01870     int num_regs = regs->num_regs;
01871     struct backref_name_tag *names;
01872     VALUE regexp = RMATCH(match)->regexp;
01873 
01874     if (regexp == 0) {
01875         return rb_sprintf("#<%s:%p>", cname, (void*)match);
01876     }
01877 
01878     names = ALLOCA_N(struct backref_name_tag, num_regs);
01879     MEMZERO(names, struct backref_name_tag, num_regs);
01880 
01881     onig_foreach_name(RREGEXP(regexp)->ptr,
01882             match_inspect_name_iter, names);
01883 
01884     str = rb_str_buf_new2("#<");
01885     rb_str_buf_cat2(str, cname);
01886 
01887     for (i = 0; i < num_regs; i++) {
01888         VALUE v;
01889         rb_str_buf_cat2(str, " ");
01890         if (0 < i) {
01891             if (names[i].name)
01892                 rb_str_buf_cat(str, (const char *)names[i].name, names[i].len);
01893             else {
01894                 rb_str_catf(str, "%d", i);
01895             }
01896             rb_str_buf_cat2(str, ":");
01897         }
01898         v = rb_reg_nth_match(i, match);
01899         if (v == Qnil)
01900             rb_str_buf_cat2(str, "nil");
01901         else
01902             rb_str_buf_append(str, rb_str_inspect(v));
01903     }
01904     rb_str_buf_cat2(str, ">");
01905 
01906     return str;
01907 }
01908 
01909 VALUE rb_cRegexp;
01910 
01911 static int
01912 read_escaped_byte(const char **pp, const char *end, onig_errmsg_buffer err)
01913 {
01914     const char *p = *pp;
01915     int code;
01916     int meta_prefix = 0, ctrl_prefix = 0;
01917     size_t len;
01918 
01919     if (p == end || *p++ != '\\') {
01920         errcpy(err, "too short escaped multibyte character");
01921         return -1;
01922     }
01923 
01924 again:
01925     if (p == end) {
01926         errcpy(err, "too short escape sequence");
01927         return -1;
01928     }
01929     switch (*p++) {
01930       case '\\': code = '\\'; break;
01931       case 'n': code = '\n'; break;
01932       case 't': code = '\t'; break;
01933       case 'r': code = '\r'; break;
01934       case 'f': code = '\f'; break;
01935       case 'v': code = '\013'; break;
01936       case 'a': code = '\007'; break;
01937       case 'e': code = '\033'; break;
01938 
01939       /* \OOO */
01940       case '0': case '1': case '2': case '3':
01941       case '4': case '5': case '6': case '7':
01942         p--;
01943         code = scan_oct(p, end < p+3 ? end-p : 3, &len);
01944         p += len;
01945         break;
01946 
01947       case 'x': /* \xHH */
01948         code = scan_hex(p, end < p+2 ? end-p : 2, &len);
01949         if (len < 1) {
01950             errcpy(err, "invalid hex escape");
01951             return -1;
01952         }
01953         p += len;
01954         break;
01955 
01956       case 'M': /* \M-X, \M-\C-X, \M-\cX */
01957         if (meta_prefix) {
01958             errcpy(err, "duplicate meta escape");
01959             return -1;
01960         }
01961         meta_prefix = 1;
01962         if (p+1 < end && *p++ == '-' && (*p & 0x80) == 0) {
01963             if (*p == '\\') {
01964                 p++;
01965                 goto again;
01966             }
01967             else {
01968                 code = *p++;
01969                 break;
01970             }
01971         }
01972         errcpy(err, "too short meta escape");
01973         return -1;
01974 
01975       case 'C': /* \C-X, \C-\M-X */
01976         if (p == end || *p++ != '-') {
01977             errcpy(err, "too short control escape");
01978             return -1;
01979         }
01980       case 'c': /* \cX, \c\M-X */
01981         if (ctrl_prefix) {
01982             errcpy(err, "duplicate control escape");
01983             return -1;
01984         }
01985         ctrl_prefix = 1;
01986         if (p < end && (*p & 0x80) == 0) {
01987             if (*p == '\\') {
01988                 p++;
01989                 goto again;
01990             }
01991             else {
01992                 code = *p++;
01993                 break;
01994             }
01995         }
01996         errcpy(err, "too short control escape");
01997         return -1;
01998 
01999       default:
02000         errcpy(err, "unexpected escape sequence");
02001         return -1;
02002     }
02003     if (code < 0 || 0xff < code) {
02004         errcpy(err, "invalid escape code");
02005         return -1;
02006     }
02007 
02008     if (ctrl_prefix)
02009         code &= 0x1f;
02010     if (meta_prefix)
02011         code |= 0x80;
02012 
02013     *pp = p;
02014     return code;
02015 }
02016 
02017 static int
02018 unescape_escaped_nonascii(const char **pp, const char *end, rb_encoding *enc,
02019         VALUE buf, rb_encoding **encp, onig_errmsg_buffer err)
02020 {
02021     const char *p = *pp;
02022     int chmaxlen = rb_enc_mbmaxlen(enc);
02023     char *chbuf = ALLOCA_N(char, chmaxlen);
02024     int chlen = 0;
02025     int byte;
02026     int l;
02027 
02028     memset(chbuf, 0, chmaxlen);
02029 
02030     byte = read_escaped_byte(&p, end, err);
02031     if (byte == -1) {
02032         return -1;
02033     }
02034 
02035     chbuf[chlen++] = byte;
02036     while (chlen < chmaxlen &&
02037            MBCLEN_NEEDMORE_P(rb_enc_precise_mbclen(chbuf, chbuf+chlen, enc))) {
02038         byte = read_escaped_byte(&p, end, err);
02039         if (byte == -1) {
02040             return -1;
02041         }
02042         chbuf[chlen++] = byte;
02043     }
02044 
02045     l = rb_enc_precise_mbclen(chbuf, chbuf+chlen, enc);
02046     if (MBCLEN_INVALID_P(l)) {
02047         errcpy(err, "invalid multibyte escape");
02048         return -1;
02049     }
02050     if (1 < chlen || (chbuf[0] & 0x80)) {
02051         rb_str_buf_cat(buf, chbuf, chlen);
02052 
02053         if (*encp == 0)
02054             *encp = enc;
02055         else if (*encp != enc) {
02056             errcpy(err, "escaped non ASCII character in UTF-8 regexp");
02057             return -1;
02058         }
02059     }
02060     else {
02061         char escbuf[5];
02062         snprintf(escbuf, sizeof(escbuf), "\\x%02X", chbuf[0]&0xff);
02063         rb_str_buf_cat(buf, escbuf, 4);
02064     }
02065     *pp = p;
02066     return 0;
02067 }
02068 
02069 static int
02070 check_unicode_range(unsigned long code, onig_errmsg_buffer err)
02071 {
02072     if ((0xd800 <= code && code <= 0xdfff) || /* Surrogates */
02073         0x10ffff < code) {
02074         errcpy(err, "invalid Unicode range");
02075         return -1;
02076     }
02077     return 0;
02078 }
02079 
02080 static int
02081 append_utf8(unsigned long uv,
02082         VALUE buf, rb_encoding **encp, onig_errmsg_buffer err)
02083 {
02084     if (check_unicode_range(uv, err) != 0)
02085         return -1;
02086     if (uv < 0x80) {
02087         char escbuf[5];
02088         snprintf(escbuf, sizeof(escbuf), "\\x%02X", (int)uv);
02089         rb_str_buf_cat(buf, escbuf, 4);
02090     }
02091     else {
02092         int len;
02093         char utf8buf[6];
02094         len = rb_uv_to_utf8(utf8buf, uv);
02095         rb_str_buf_cat(buf, utf8buf, len);
02096 
02097         if (*encp == 0)
02098             *encp = rb_utf8_encoding();
02099         else if (*encp != rb_utf8_encoding()) {
02100             errcpy(err, "UTF-8 character in non UTF-8 regexp");
02101             return -1;
02102         }
02103     }
02104     return 0;
02105 }
02106 
02107 static int
02108 unescape_unicode_list(const char **pp, const char *end,
02109         VALUE buf, rb_encoding **encp, onig_errmsg_buffer err)
02110 {
02111     const char *p = *pp;
02112     int has_unicode = 0;
02113     unsigned long code;
02114     size_t len;
02115 
02116     while (p < end && ISSPACE(*p)) p++;
02117 
02118     while (1) {
02119         code = ruby_scan_hex(p, end-p, &len);
02120         if (len == 0)
02121             break;
02122         if (6 < len) { /* max 10FFFF */
02123             errcpy(err, "invalid Unicode range");
02124             return -1;
02125         }
02126         p += len;
02127         if (append_utf8(code, buf, encp, err) != 0)
02128             return -1;
02129         has_unicode = 1;
02130 
02131         while (p < end && ISSPACE(*p)) p++;
02132     }
02133 
02134     if (has_unicode == 0) {
02135         errcpy(err, "invalid Unicode list");
02136         return -1;
02137     }
02138 
02139     *pp = p;
02140 
02141     return 0;
02142 }
02143 
02144 static int
02145 unescape_unicode_bmp(const char **pp, const char *end,
02146         VALUE buf, rb_encoding **encp, onig_errmsg_buffer err)
02147 {
02148     const char *p = *pp;
02149     size_t len;
02150     unsigned long code;
02151 
02152     if (end < p+4) {
02153         errcpy(err, "invalid Unicode escape");
02154         return -1;
02155     }
02156     code = ruby_scan_hex(p, 4, &len);
02157     if (len != 4) {
02158         errcpy(err, "invalid Unicode escape");
02159         return -1;
02160     }
02161     if (append_utf8(code, buf, encp, err) != 0)
02162         return -1;
02163     *pp = p + 4;
02164     return 0;
02165 }
02166 
02167 static int
02168 unescape_nonascii(const char *p, const char *end, rb_encoding *enc,
02169         VALUE buf, rb_encoding **encp, int *has_property,
02170         onig_errmsg_buffer err)
02171 {
02172     char c;
02173     char smallbuf[2];
02174 
02175     while (p < end) {
02176         int chlen = rb_enc_precise_mbclen(p, end, enc);
02177         if (!MBCLEN_CHARFOUND_P(chlen)) {
02178             errcpy(err, "invalid multibyte character");
02179             return -1;
02180         }
02181         chlen = MBCLEN_CHARFOUND_LEN(chlen);
02182         if (1 < chlen || (*p & 0x80)) {
02183             rb_str_buf_cat(buf, p, chlen);
02184             p += chlen;
02185             if (*encp == 0)
02186                 *encp = enc;
02187             else if (*encp != enc) {
02188                 errcpy(err, "non ASCII character in UTF-8 regexp");
02189                 return -1;
02190             }
02191             continue;
02192         }
02193 
02194         switch (c = *p++) {
02195           case '\\':
02196             if (p == end) {
02197                 errcpy(err, "too short escape sequence");
02198                 return -1;
02199             }
02200             switch (c = *p++) {
02201               case '1': case '2': case '3':
02202               case '4': case '5': case '6': case '7': /* \O, \OO, \OOO or backref */
02203                 {
02204                     size_t octlen;
02205                     if (ruby_scan_oct(p-1, end-(p-1), &octlen) <= 0177) {
02206                         /* backref or 7bit octal.
02207                            no need to unescape anyway.
02208                            re-escaping may break backref */
02209                         goto escape_asis;
02210                     }
02211                 }
02212                 /* xxx: How about more than 199 subexpressions? */
02213 
02214               case '0': /* \0, \0O, \0OO */
02215 
02216               case 'x': /* \xHH */
02217               case 'c': /* \cX, \c\M-X */
02218               case 'C': /* \C-X, \C-\M-X */
02219               case 'M': /* \M-X, \M-\C-X, \M-\cX */
02220                 p = p-2;
02221                 if (unescape_escaped_nonascii(&p, end, enc, buf, encp, err) != 0)
02222                     return -1;
02223                 break;
02224 
02225               case 'u':
02226                 if (p == end) {
02227                     errcpy(err, "too short escape sequence");
02228                     return -1;
02229                 }
02230                 if (*p == '{') {
02231                     /* \u{H HH HHH HHHH HHHHH HHHHHH ...} */
02232                     p++;
02233                     if (unescape_unicode_list(&p, end, buf, encp, err) != 0)
02234                         return -1;
02235                     if (p == end || *p++ != '}') {
02236                         errcpy(err, "invalid Unicode list");
02237                         return -1;
02238                     }
02239                     break;
02240                 }
02241                 else {
02242                     /* \uHHHH */
02243                     if (unescape_unicode_bmp(&p, end, buf, encp, err) != 0)
02244                         return -1;
02245                     break;
02246                 }
02247 
02248               case 'p': /* \p{Hiragana} */
02249               case 'P':
02250                 if (!*encp) {
02251                     *has_property = 1;
02252                 }
02253                 goto escape_asis;
02254 
02255               default: /* \n, \\, \d, \9, etc. */
02256 escape_asis:
02257                 smallbuf[0] = '\\';
02258                 smallbuf[1] = c;
02259                 rb_str_buf_cat(buf, smallbuf, 2);
02260                 break;
02261             }
02262             break;
02263 
02264           default:
02265             rb_str_buf_cat(buf, &c, 1);
02266             break;
02267         }
02268     }
02269 
02270     return 0;
02271 }
02272 
02273 static VALUE
02274 rb_reg_preprocess(const char *p, const char *end, rb_encoding *enc,
02275         rb_encoding **fixed_enc, onig_errmsg_buffer err)
02276 {
02277     VALUE buf;
02278     int has_property = 0;
02279 
02280     buf = rb_str_buf_new(0);
02281 
02282     if (rb_enc_asciicompat(enc))
02283         *fixed_enc = 0;
02284     else {
02285         *fixed_enc = enc;
02286         rb_enc_associate(buf, enc);
02287     }
02288 
02289     if (unescape_nonascii(p, end, enc, buf, fixed_enc, &has_property, err) != 0)
02290         return Qnil;
02291 
02292     if (has_property && !*fixed_enc) {
02293         *fixed_enc = enc;
02294     }
02295 
02296     if (*fixed_enc) {
02297         rb_enc_associate(buf, *fixed_enc);
02298     }
02299 
02300     return buf;
02301 }
02302 
02303 VALUE
02304 rb_reg_check_preprocess(VALUE str)
02305 {
02306     rb_encoding *fixed_enc = 0;
02307     onig_errmsg_buffer err = "";
02308     VALUE buf;
02309     char *p, *end;
02310     rb_encoding *enc;
02311 
02312     StringValue(str);
02313     p = RSTRING_PTR(str);
02314     end = p + RSTRING_LEN(str);
02315     enc = rb_enc_get(str);
02316 
02317     buf = rb_reg_preprocess(p, end, enc, &fixed_enc, err);
02318     RB_GC_GUARD(str);
02319 
02320     if (buf == Qnil) {
02321         return rb_reg_error_desc(str, 0, err);
02322     }
02323     return Qnil;
02324 }
02325 
02326 static VALUE
02327 rb_reg_preprocess_dregexp(VALUE ary, int options)
02328 {
02329     rb_encoding *fixed_enc = 0;
02330     rb_encoding *regexp_enc = 0;
02331     onig_errmsg_buffer err = "";
02332     int i;
02333     VALUE result = 0;
02334     rb_encoding *ascii8bit = rb_ascii8bit_encoding();
02335 
02336     if (RARRAY_LEN(ary) == 0) {
02337         rb_raise(rb_eArgError, "no arguments given");
02338     }
02339 
02340     for (i = 0; i < RARRAY_LEN(ary); i++) {
02341         VALUE str = RARRAY_PTR(ary)[i];
02342         VALUE buf;
02343         char *p, *end;
02344         rb_encoding *src_enc;
02345 
02346         src_enc = rb_enc_get(str);
02347         if (options & ARG_ENCODING_NONE &&
02348                 src_enc != ascii8bit) {
02349             if (rb_enc_str_coderange(str) != ENC_CODERANGE_7BIT)
02350                 rb_raise(rb_eRegexpError, "/.../n has a non escaped non ASCII character in non ASCII-8BIT script");
02351             else
02352                 src_enc = ascii8bit;
02353         }
02354 
02355         StringValue(str);
02356         p = RSTRING_PTR(str);
02357         end = p + RSTRING_LEN(str);
02358 
02359         buf = rb_reg_preprocess(p, end, src_enc, &fixed_enc, err);
02360 
02361         if (buf == Qnil)
02362             rb_raise(rb_eArgError, "%s", err);
02363 
02364         if (fixed_enc != 0) {
02365             if (regexp_enc != 0 && regexp_enc != fixed_enc) {
02366                 rb_raise(rb_eRegexpError, "encoding mismatch in dynamic regexp : %s and %s",
02367                          rb_enc_name(regexp_enc), rb_enc_name(fixed_enc));
02368             }
02369             regexp_enc = fixed_enc;
02370         }
02371 
02372         if (!result)
02373             result = rb_str_new3(str);
02374         else
02375             rb_str_buf_append(result, str);
02376     }
02377     if (regexp_enc) {
02378         rb_enc_associate(result, regexp_enc);
02379     }
02380 
02381     return result;
02382 }
02383 
02384 static int
02385 rb_reg_initialize(VALUE obj, const char *s, long len, rb_encoding *enc,
02386                   int options, onig_errmsg_buffer err,
02387                   const char *sourcefile, int sourceline)
02388 {
02389     struct RRegexp *re = RREGEXP(obj);
02390     VALUE unescaped;
02391     rb_encoding *fixed_enc = 0;
02392     rb_encoding *a_enc = rb_ascii8bit_encoding();
02393 
02394     if (!OBJ_UNTRUSTED(obj) && rb_safe_level() >= 4)
02395         rb_raise(rb_eSecurityError, "Insecure: can't modify regexp");
02396     rb_check_frozen(obj);
02397     if (FL_TEST(obj, REG_LITERAL))
02398         rb_raise(rb_eSecurityError, "can't modify literal regexp");
02399     if (re->ptr)
02400         rb_raise(rb_eTypeError, "already initialized regexp");
02401     re->ptr = 0;
02402 
02403     if (rb_enc_dummy_p(enc)) {
02404         errcpy(err, "can't make regexp with dummy encoding");
02405         return -1;
02406     }
02407 
02408     unescaped = rb_reg_preprocess(s, s+len, enc, &fixed_enc, err);
02409     if (unescaped == Qnil)
02410         return -1;
02411 
02412     if (fixed_enc) {
02413         if ((fixed_enc != enc && (options & ARG_ENCODING_FIXED)) ||
02414             (fixed_enc != a_enc && (options & ARG_ENCODING_NONE))) {
02415             errcpy(err, "incompatible character encoding");
02416             return -1;
02417         }
02418         if (fixed_enc != a_enc) {
02419             options |= ARG_ENCODING_FIXED;
02420             enc = fixed_enc;
02421         }
02422     }
02423     else if (!(options & ARG_ENCODING_FIXED)) {
02424        enc = rb_usascii_encoding();
02425     }
02426 
02427     rb_enc_associate((VALUE)re, enc);
02428     if ((options & ARG_ENCODING_FIXED) || fixed_enc) {
02429         re->basic.flags |= KCODE_FIXED;
02430     }
02431     if (options & ARG_ENCODING_NONE) {
02432         re->basic.flags |= REG_ENCODING_NONE;
02433     }
02434 
02435     re->ptr = make_regexp(RSTRING_PTR(unescaped), RSTRING_LEN(unescaped), enc,
02436                           options & ARG_REG_OPTION_MASK, err,
02437                           sourcefile, sourceline);
02438     if (!re->ptr) return -1;
02439     re->src = rb_enc_str_new(s, len, enc);
02440     OBJ_FREEZE(re->src);
02441     RB_GC_GUARD(unescaped);
02442     return 0;
02443 }
02444 
02445 static int
02446 rb_reg_initialize_str(VALUE obj, VALUE str, int options, onig_errmsg_buffer err,
02447         const char *sourcefile, int sourceline)
02448 {
02449     int ret;
02450     rb_encoding *enc = rb_enc_get(str);
02451     if (options & ARG_ENCODING_NONE) {
02452         rb_encoding *ascii8bit = rb_ascii8bit_encoding();
02453         if (enc != ascii8bit) {
02454             if (rb_enc_str_coderange(str) != ENC_CODERANGE_7BIT) {
02455                 errcpy(err, "/.../n has a non escaped non ASCII character in non ASCII-8BIT script");
02456                 return -1;
02457             }
02458             enc = ascii8bit;
02459         }
02460     }
02461     ret = rb_reg_initialize(obj, RSTRING_PTR(str), RSTRING_LEN(str), enc,
02462                             options, err, sourcefile, sourceline);
02463     OBJ_INFECT(obj, str);
02464     RB_GC_GUARD(str);
02465     return ret;
02466 }
02467 
02468 static VALUE
02469 rb_reg_s_alloc(VALUE klass)
02470 {
02471     NEWOBJ_OF(re, struct RRegexp, klass, T_REGEXP);
02472 
02473     re->ptr = 0;
02474     re->src = 0;
02475     re->usecnt = 0;
02476 
02477     return (VALUE)re;
02478 }
02479 
02480 VALUE
02481 rb_reg_alloc(void)
02482 {
02483     return rb_reg_s_alloc(rb_cRegexp);
02484 }
02485 
02486 VALUE
02487 rb_reg_new_str(VALUE s, int options)
02488 {
02489     return rb_reg_init_str(rb_reg_alloc(), s, options);
02490 }
02491 
02492 VALUE
02493 rb_reg_init_str(VALUE re, VALUE s, int options)
02494 {
02495     onig_errmsg_buffer err = "";
02496 
02497     if (rb_reg_initialize_str(re, s, options, err, NULL, 0) != 0) {
02498         rb_reg_raise_str(s, options, err);
02499     }
02500 
02501     return re;
02502 }
02503 
02504 VALUE
02505 rb_reg_new_ary(VALUE ary, int opt)
02506 {
02507     return rb_reg_new_str(rb_reg_preprocess_dregexp(ary, opt), opt);
02508 }
02509 
02510 VALUE
02511 rb_enc_reg_new(const char *s, long len, rb_encoding *enc, int options)
02512 {
02513     VALUE re = rb_reg_alloc();
02514     onig_errmsg_buffer err = "";
02515 
02516     if (rb_reg_initialize(re, s, len, enc, options, err, NULL, 0) != 0) {
02517         rb_enc_reg_raise(s, len, enc, options, err);
02518     }
02519 
02520     return re;
02521 }
02522 
02523 VALUE
02524 rb_reg_new(const char *s, long len, int options)
02525 {
02526     return rb_enc_reg_new(s, len, rb_ascii8bit_encoding(), options);
02527 }
02528 
02529 VALUE
02530 rb_reg_compile(VALUE str, int options, const char *sourcefile, int sourceline)
02531 {
02532     VALUE re = rb_reg_alloc();
02533     onig_errmsg_buffer err = "";
02534 
02535     if (!str) str = rb_str_new(0,0);
02536     if (rb_reg_initialize_str(re, str, options, err, sourcefile, sourceline) != 0) {
02537         rb_set_errinfo(rb_reg_error_desc(str, options, err));
02538         return Qnil;
02539     }
02540     FL_SET(re, REG_LITERAL);
02541     return re;
02542 }
02543 
02544 static VALUE reg_cache;
02545 
02546 VALUE
02547 rb_reg_regcomp(VALUE str)
02548 {
02549     volatile VALUE save_str = str;
02550     if (reg_cache && RREGEXP_SRC_LEN(reg_cache) == RSTRING_LEN(str)
02551         && ENCODING_GET(reg_cache) == ENCODING_GET(str)
02552         && memcmp(RREGEXP_SRC_PTR(reg_cache), RSTRING_PTR(str), RSTRING_LEN(str)) == 0)
02553         return reg_cache;
02554 
02555     return reg_cache = rb_reg_new_str(save_str, 0);
02556 }
02557 
02558 static st_index_t reg_hash(VALUE re);
02559 /*
02560  * call-seq:
02561  *   rxp.hash   -> fixnum
02562  *
02563  * Produce a hash based on the text and options of this regular expression.
02564  */
02565 
02566 static VALUE
02567 rb_reg_hash(VALUE re)
02568 {
02569     st_index_t hashval = reg_hash(re);
02570     return LONG2FIX(hashval);
02571 }
02572 
02573 static st_index_t
02574 reg_hash(VALUE re)
02575 {
02576     st_index_t hashval;
02577 
02578     rb_reg_check(re);
02579     hashval = RREGEXP(re)->ptr->options;
02580     hashval = rb_hash_uint(hashval, rb_memhash(RREGEXP_SRC_PTR(re), RREGEXP_SRC_LEN(re)));
02581     return rb_hash_end(hashval);
02582 }
02583 
02584 
02585 /*
02586  *  call-seq:
02587  *     rxp == other_rxp      -> true or false
02588  *     rxp.eql?(other_rxp)   -> true or false
02589  *
02590  *  Equality---Two regexps are equal if their patterns are identical, they have
02591  *  the same character set code, and their <code>casefold?</code> values are the
02592  *  same.
02593  *
02594  *     /abc/  == /abc/x   #=> false
02595  *     /abc/  == /abc/i   #=> false
02596  *     /abc/  == /abc/u   #=> false
02597  *     /abc/u == /abc/n   #=> false
02598  */
02599 
02600 static VALUE
02601 rb_reg_equal(VALUE re1, VALUE re2)
02602 {
02603     if (re1 == re2) return Qtrue;
02604     if (!RB_TYPE_P(re2, T_REGEXP)) return Qfalse;
02605     rb_reg_check(re1); rb_reg_check(re2);
02606     if (FL_TEST(re1, KCODE_FIXED) != FL_TEST(re2, KCODE_FIXED)) return Qfalse;
02607     if (RREGEXP(re1)->ptr->options != RREGEXP(re2)->ptr->options) return Qfalse;
02608     if (RREGEXP_SRC_LEN(re1) != RREGEXP_SRC_LEN(re2)) return Qfalse;
02609     if (ENCODING_GET(re1) != ENCODING_GET(re2)) return Qfalse;
02610     if (memcmp(RREGEXP_SRC_PTR(re1), RREGEXP_SRC_PTR(re2), RREGEXP_SRC_LEN(re1)) == 0) {
02611         return Qtrue;
02612     }
02613     return Qfalse;
02614 }
02615 
02616 /*
02617  * call-seq:
02618  *    mtch.hash   -> integer
02619  *
02620  * Produce a hash based on the target string, regexp and matched
02621  * positions of this matchdata.
02622  */
02623 
02624 static VALUE
02625 match_hash(VALUE match)
02626 {
02627     const struct re_registers *regs;
02628     st_index_t hashval = rb_hash_start(rb_str_hash(RMATCH(match)->str));
02629 
02630     rb_hash_uint(hashval, reg_hash(RMATCH(match)->regexp));
02631     regs = RMATCH_REGS(match);
02632     hashval = rb_hash_uint(hashval, regs->num_regs);
02633     hashval = rb_hash_uint(hashval, rb_memhash(regs->beg, regs->num_regs * sizeof(*regs->beg)));
02634     hashval = rb_hash_uint(hashval, rb_memhash(regs->end, regs->num_regs * sizeof(*regs->end)));
02635     hashval = rb_hash_end(hashval);
02636     return LONG2FIX(hashval);
02637 }
02638 
02639 /*
02640  * call-seq:
02641  *    mtch == mtch2   -> true or false
02642  *
02643  *  Equality---Two matchdata are equal if their target strings,
02644  *  patterns, and matched positions are identical.
02645  */
02646 
02647 static VALUE
02648 match_equal(VALUE match1, VALUE match2)
02649 {
02650     const struct re_registers *regs1, *regs2;
02651     if (match1 == match2) return Qtrue;
02652     if (!RB_TYPE_P(match2, T_MATCH)) return Qfalse;
02653     if (!rb_str_equal(RMATCH(match1)->str, RMATCH(match2)->str)) return Qfalse;
02654     if (!rb_reg_equal(RMATCH(match1)->regexp, RMATCH(match2)->regexp)) return Qfalse;
02655     regs1 = RMATCH_REGS(match1);
02656     regs2 = RMATCH_REGS(match2);
02657     if (regs1->num_regs != regs2->num_regs) return Qfalse;
02658     if (memcmp(regs1->beg, regs2->beg, regs1->num_regs * sizeof(*regs1->beg))) return Qfalse;
02659     if (memcmp(regs1->end, regs2->end, regs1->num_regs * sizeof(*regs1->end))) return Qfalse;
02660     return Qtrue;
02661 }
02662 
02663 static VALUE
02664 reg_operand(VALUE s, int check)
02665 {
02666     if (SYMBOL_P(s)) {
02667         return rb_sym_to_s(s);
02668     }
02669     else {
02670         return (check ? rb_str_to_str : rb_check_string_type)(s);
02671     }
02672 }
02673 
02674 static long
02675 reg_match_pos(VALUE re, VALUE *strp, long pos)
02676 {
02677     VALUE str = *strp;
02678 
02679     if (NIL_P(str)) {
02680         rb_backref_set(Qnil);
02681         return -1;
02682     }
02683     *strp = str = reg_operand(str, TRUE);
02684     if (pos != 0) {
02685         if (pos < 0) {
02686             VALUE l = rb_str_length(str);
02687             pos += NUM2INT(l);
02688             if (pos < 0) {
02689                 return pos;
02690             }
02691         }
02692         pos = rb_str_offset(str, pos);
02693     }
02694     return rb_reg_search(re, str, pos, 0);
02695 }
02696 
02697 /*
02698  *  call-seq:
02699  *     rxp =~ str    -> integer or nil
02700  *
02701  *  Match---Matches <i>rxp</i> against <i>str</i>.
02702  *
02703  *     /at/ =~ "input data"   #=> 7
02704  *     /ax/ =~ "input data"   #=> nil
02705  *
02706  *  If <code>=~</code> is used with a regexp literal with named captures,
02707  *  captured strings (or nil) is assigned to local variables named by
02708  *  the capture names.
02709  *
02710  *     /(?<lhs>\w+)\s*=\s*(?<rhs>\w+)/ =~ "  x = y  "
02711  *     p lhs    #=> "x"
02712  *     p rhs    #=> "y"
02713  *
02714  *  If it is not matched, nil is assigned for the variables.
02715  *
02716  *     /(?<lhs>\w+)\s*=\s*(?<rhs>\w+)/ =~ "  x = "
02717  *     p lhs    #=> nil
02718  *     p rhs    #=> nil
02719  *
02720  *  This assignment is implemented in the Ruby parser.
02721  *  The parser detects 'regexp-literal =~ expression' for the assignment.
02722  *  The regexp must be a literal without interpolation and placed at left hand side.
02723  *
02724  *  The assignment does not occur if the regexp is not a literal.
02725  *
02726  *     re = /(?<lhs>\w+)\s*=\s*(?<rhs>\w+)/
02727  *     re =~ "  x = y  "
02728  *     p lhs    # undefined local variable
02729  *     p rhs    # undefined local variable
02730  *
02731  *  A regexp interpolation, <code>#{}</code>, also disables
02732  *  the assignment.
02733  *
02734  *     rhs_pat = /(?<rhs>\w+)/
02735  *     /(?<lhs>\w+)\s*=\s*#{rhs_pat}/ =~ "x = y"
02736  *     p lhs    # undefined local variable
02737  *
02738  *  The assignment does not occur if the regexp is placed at the right hand side.
02739  *
02740  *    "  x = y  " =~ /(?<lhs>\w+)\s*=\s*(?<rhs>\w+)/
02741  *    p lhs, rhs # undefined local variable
02742  *
02743  */
02744 
02745 VALUE
02746 rb_reg_match(VALUE re, VALUE str)
02747 {
02748     long pos = reg_match_pos(re, &str, 0);
02749     if (pos < 0) return Qnil;
02750     pos = rb_str_sublen(str, pos);
02751     return LONG2FIX(pos);
02752 }
02753 
02754 /*
02755  *  call-seq:
02756  *     rxp === str   -> true or false
02757  *
02758  *  Case Equality---Used in case statements.
02759  *
02760  *     a = "HELLO"
02761  *     case a
02762  *     when /^[a-z]*$/; print "Lower case\n"
02763  *     when /^[A-Z]*$/; print "Upper case\n"
02764  *     else;            print "Mixed case\n"
02765  *     end
02766  *     #=> "Upper case"
02767  *
02768  *  Following a regular expression literal with the #=== operator allows you to
02769  *  compare against a String.
02770  *
02771  *      /^[a-z]*$/ === "HELLO" #=> false
02772  *      /^[A-Z]*$/ === "HELLO" #=> true
02773  */
02774 
02775 VALUE
02776 rb_reg_eqq(VALUE re, VALUE str)
02777 {
02778     long start;
02779 
02780     str = reg_operand(str, FALSE);
02781     if (NIL_P(str)) {
02782         rb_backref_set(Qnil);
02783         return Qfalse;
02784     }
02785     start = rb_reg_search(re, str, 0, 0);
02786     if (start < 0) {
02787         return Qfalse;
02788     }
02789     return Qtrue;
02790 }
02791 
02792 
02793 /*
02794  *  call-seq:
02795  *     ~ rxp   -> integer or nil
02796  *
02797  *  Match---Matches <i>rxp</i> against the contents of <code>$_</code>.
02798  *  Equivalent to <code><i>rxp</i> =~ $_</code>.
02799  *
02800  *     $_ = "input data"
02801  *     ~ /at/   #=> 7
02802  */
02803 
02804 VALUE
02805 rb_reg_match2(VALUE re)
02806 {
02807     long start;
02808     VALUE line = rb_lastline_get();
02809 
02810     if (!RB_TYPE_P(line, T_STRING)) {
02811         rb_backref_set(Qnil);
02812         return Qnil;
02813     }
02814 
02815     start = rb_reg_search(re, line, 0, 0);
02816     if (start < 0) {
02817         return Qnil;
02818     }
02819     start = rb_str_sublen(line, start);
02820     return LONG2FIX(start);
02821 }
02822 
02823 
02824 /*
02825  *  call-seq:
02826  *     rxp.match(str)       -> matchdata or nil
02827  *     rxp.match(str,pos)   -> matchdata or nil
02828  *
02829  *  Returns a <code>MatchData</code> object describing the match, or
02830  *  <code>nil</code> if there was no match. This is equivalent to retrieving the
02831  *  value of the special variable <code>$~</code> following a normal match.
02832  *  If the second parameter is present, it specifies the position in the string
02833  *  to begin the search.
02834  *
02835  *     /(.)(.)(.)/.match("abc")[2]   #=> "b"
02836  *     /(.)(.)/.match("abc", 1)[2]   #=> "c"
02837  *
02838  *  If a block is given, invoke the block with MatchData if match succeed, so
02839  *  that you can write
02840  *
02841  *     pat.match(str) {|m| ...}
02842  *
02843  *  instead of
02844  *
02845  *     if m = pat.match(str)
02846  *       ...
02847  *     end
02848  *
02849  *  The return value is a value from block execution in this case.
02850  */
02851 
02852 static VALUE
02853 rb_reg_match_m(int argc, VALUE *argv, VALUE re)
02854 {
02855     VALUE result, str, initpos;
02856     long pos;
02857 
02858     if (rb_scan_args(argc, argv, "11", &str, &initpos) == 2) {
02859         pos = NUM2LONG(initpos);
02860     }
02861     else {
02862         pos = 0;
02863     }
02864 
02865     pos = reg_match_pos(re, &str, pos);
02866     if (pos < 0) {
02867         rb_backref_set(Qnil);
02868         return Qnil;
02869     }
02870     result = rb_backref_get();
02871     rb_match_busy(result);
02872     if (!NIL_P(result) && rb_block_given_p()) {
02873         return rb_yield(result);
02874     }
02875     return result;
02876 }
02877 
02878 /*
02879  * Document-method: compile
02880  *
02881  * Synonym for <code>Regexp.new</code>
02882  */
02883 
02884 
02885 /*
02886  *  call-seq:
02887  *     Regexp.new(string, [options [, kcode]])        -> regexp
02888  *     Regexp.new(regexp)                            -> regexp
02889  *     Regexp.compile(string, [options [, kcode]])    -> regexp
02890  *     Regexp.compile(regexp)                        -> regexp
02891  *
02892  *  Constructs a new regular expression from +pattern+, which can be either a
02893  *  String or a Regexp (in which case that regexp's options are propagated),
02894  *  and new options may not be specified (a change as of Ruby 1.8).
02895  *
02896  *  If +options+ is a Fixnum, it should be one or more of the constants
02897  *  Regexp::EXTENDED, Regexp::IGNORECASE, and Regexp::MULTILINE,
02898  *  <em>or</em>-ed together.  Otherwise, if +options+ is not
02899  *  +nil+ or +false+, the regexp will be case insensitive.
02900  *
02901  *  When the +kcode+ parameter is `n' or `N' sets the regexp no encoding.
02902  *  It means that the regexp is for binary strings.
02903  *
02904  *    r1 = Regexp.new('^a-z+:\\s+\w+') #=> /^a-z+:\s+\w+/
02905  *    r2 = Regexp.new('cat', true)     #=> /cat/i
02906  *    r3 = Regexp.new(r2)              #=> /cat/i
02907  *    r4 = Regexp.new('dog', Regexp::EXTENDED | Regexp::IGNORECASE) #=> /dog/ix
02908  */
02909 
02910 static VALUE
02911 rb_reg_initialize_m(int argc, VALUE *argv, VALUE self)
02912 {
02913     onig_errmsg_buffer err = "";
02914     int flags = 0;
02915     VALUE str;
02916     rb_encoding *enc;
02917     const char *ptr;
02918     long len;
02919 
02920     rb_check_arity(argc, 1, 3);
02921     if (RB_TYPE_P(argv[0], T_REGEXP)) {
02922         VALUE re = argv[0];
02923 
02924         if (argc > 1) {
02925             rb_warn("flags ignored");
02926         }
02927         rb_reg_check(re);
02928         flags = rb_reg_options(re);
02929         ptr = RREGEXP_SRC_PTR(re);
02930         len = RREGEXP_SRC_LEN(re);
02931         enc = rb_enc_get(re);
02932         if (rb_reg_initialize(self, ptr, len, enc, flags, err, NULL, 0)) {
02933             str = rb_enc_str_new(ptr, len, enc);
02934             rb_reg_raise_str(str, flags, err);
02935         }
02936     }
02937     else {
02938         if (argc >= 2) {
02939             if (FIXNUM_P(argv[1])) flags = FIX2INT(argv[1]);
02940             else if (RTEST(argv[1])) flags = ONIG_OPTION_IGNORECASE;
02941         }
02942         enc = 0;
02943         if (argc == 3 && !NIL_P(argv[2])) {
02944             char *kcode = StringValuePtr(argv[2]);
02945             if (kcode[0] == 'n' || kcode[0] == 'N') {
02946                 enc = rb_ascii8bit_encoding();
02947                 flags |= ARG_ENCODING_NONE;
02948             }
02949             else {
02950                 rb_warn("encoding option is ignored - %s", kcode);
02951             }
02952         }
02953         str = argv[0];
02954         ptr = StringValuePtr(str);
02955         if (enc
02956             ? rb_reg_initialize(self, ptr, RSTRING_LEN(str), enc, flags, err, NULL, 0)
02957             : rb_reg_initialize_str(self, str, flags, err, NULL, 0)) {
02958             rb_reg_raise_str(str, flags, err);
02959         }
02960     }
02961     return self;
02962 }
02963 
02964 VALUE
02965 rb_reg_quote(VALUE str)
02966 {
02967     rb_encoding *enc = rb_enc_get(str);
02968     char *s, *send, *t;
02969     VALUE tmp;
02970     int c, clen;
02971     int ascii_only = rb_enc_str_asciionly_p(str);
02972 
02973     s = RSTRING_PTR(str);
02974     send = s + RSTRING_LEN(str);
02975     while (s < send) {
02976         c = rb_enc_ascget(s, send, &clen, enc);
02977         if (c == -1) {
02978             s += mbclen(s, send, enc);
02979             continue;
02980         }
02981         switch (c) {
02982           case '[': case ']': case '{': case '}':
02983           case '(': case ')': case '|': case '-':
02984           case '*': case '.': case '\\':
02985           case '?': case '+': case '^': case '$':
02986           case ' ': case '#':
02987           case '\t': case '\f': case '\v': case '\n': case '\r':
02988             goto meta_found;
02989         }
02990         s += clen;
02991     }
02992     tmp = rb_str_new3(str);
02993     if (ascii_only) {
02994         rb_enc_associate(tmp, rb_usascii_encoding());
02995     }
02996     return tmp;
02997 
02998   meta_found:
02999     tmp = rb_str_new(0, RSTRING_LEN(str)*2);
03000     if (ascii_only) {
03001         rb_enc_associate(tmp, rb_usascii_encoding());
03002     }
03003     else {
03004         rb_enc_copy(tmp, str);
03005     }
03006     t = RSTRING_PTR(tmp);
03007     /* copy upto metacharacter */
03008     memcpy(t, RSTRING_PTR(str), s - RSTRING_PTR(str));
03009     t += s - RSTRING_PTR(str);
03010 
03011     while (s < send) {
03012         c = rb_enc_ascget(s, send, &clen, enc);
03013         if (c == -1) {
03014             int n = mbclen(s, send, enc);
03015 
03016             while (n--)
03017                 *t++ = *s++;
03018             continue;
03019         }
03020         s += clen;
03021         switch (c) {
03022           case '[': case ']': case '{': case '}':
03023           case '(': case ')': case '|': case '-':
03024           case '*': case '.': case '\\':
03025           case '?': case '+': case '^': case '$':
03026           case '#':
03027             t += rb_enc_mbcput('\\', t, enc);
03028             break;
03029           case ' ':
03030             t += rb_enc_mbcput('\\', t, enc);
03031             t += rb_enc_mbcput(' ', t, enc);
03032             continue;
03033           case '\t':
03034             t += rb_enc_mbcput('\\', t, enc);
03035             t += rb_enc_mbcput('t', t, enc);
03036             continue;
03037           case '\n':
03038             t += rb_enc_mbcput('\\', t, enc);
03039             t += rb_enc_mbcput('n', t, enc);
03040             continue;
03041           case '\r':
03042             t += rb_enc_mbcput('\\', t, enc);
03043             t += rb_enc_mbcput('r', t, enc);
03044             continue;
03045           case '\f':
03046             t += rb_enc_mbcput('\\', t, enc);
03047             t += rb_enc_mbcput('f', t, enc);
03048             continue;
03049           case '\v':
03050             t += rb_enc_mbcput('\\', t, enc);
03051             t += rb_enc_mbcput('v', t, enc);
03052             continue;
03053         }
03054         t += rb_enc_mbcput(c, t, enc);
03055     }
03056     rb_str_resize(tmp, t - RSTRING_PTR(tmp));
03057     OBJ_INFECT(tmp, str);
03058     return tmp;
03059 }
03060 
03061 
03062 /*
03063  *  call-seq:
03064  *     Regexp.escape(str)   -> string
03065  *     Regexp.quote(str)    -> string
03066  *
03067  *  Escapes any characters that would have special meaning in a regular
03068  *  expression. Returns a new string: the escaped form of <i>str</i>, or a
03069  *  copy of <i>str</i> if no characters need escaping.  For any string,
03070  *  <code>Regexp.new(Regexp.escape(<i>str</i>))=~<i>str</i></code> will be true.
03071  *
03072  *     Regexp.escape('\*?{}.')   #=> \\\*\?\{\}\.
03073  *
03074  */
03075 
03076 static VALUE
03077 rb_reg_s_quote(VALUE c, VALUE str)
03078 {
03079     return rb_reg_quote(reg_operand(str, TRUE));
03080 }
03081 
03082 int
03083 rb_reg_options(VALUE re)
03084 {
03085     int options;
03086 
03087     rb_reg_check(re);
03088     options = RREGEXP(re)->ptr->options & ARG_REG_OPTION_MASK;
03089     if (RBASIC(re)->flags & KCODE_FIXED) options |= ARG_ENCODING_FIXED;
03090     if (RBASIC(re)->flags & REG_ENCODING_NONE) options |= ARG_ENCODING_NONE;
03091     return options;
03092 }
03093 
03094 VALUE
03095 rb_check_regexp_type(VALUE re)
03096 {
03097     return rb_check_convert_type(re, T_REGEXP, "Regexp", "to_regexp");
03098 }
03099 
03100 /*
03101  *  call-seq:
03102  *     Regexp.try_convert(obj) -> re or nil
03103  *
03104  *  Try to convert <i>obj</i> into a Regexp, using to_regexp method.
03105  *  Returns converted regexp or nil if <i>obj</i> cannot be converted
03106  *  for any reason.
03107  *
03108  *     Regexp.try_convert(/re/)         #=> /re/
03109  *     Regexp.try_convert("re")         #=> nil
03110  *
03111  *     o = Object.new
03112  *     Regexp.try_convert(o)            #=> nil
03113  *     def o.to_regexp() /foo/ end
03114  *     Regexp.try_convert(o)            #=> /foo/
03115  *
03116  */
03117 static VALUE
03118 rb_reg_s_try_convert(VALUE dummy, VALUE re)
03119 {
03120     return rb_check_regexp_type(re);
03121 }
03122 
03123 static VALUE
03124 rb_reg_s_union(VALUE self, VALUE args0)
03125 {
03126     long argc = RARRAY_LEN(args0);
03127 
03128     if (argc == 0) {
03129         VALUE args[1];
03130         args[0] = rb_str_new2("(?!)");
03131         return rb_class_new_instance(1, args, rb_cRegexp);
03132     }
03133     else if (argc == 1) {
03134         VALUE arg = rb_ary_entry(args0, 0);
03135         VALUE re = rb_check_regexp_type(arg);
03136         if (!NIL_P(re))
03137             return re;
03138         else {
03139             VALUE quoted;
03140             quoted = rb_reg_s_quote(Qnil, arg);
03141             return rb_reg_new_str(quoted, 0);
03142         }
03143     }
03144     else {
03145         int i;
03146         VALUE source = rb_str_buf_new(0);
03147         rb_encoding *result_enc;
03148 
03149         int has_asciionly = 0;
03150         rb_encoding *has_ascii_compat_fixed = 0;
03151         rb_encoding *has_ascii_incompat = 0;
03152 
03153         for (i = 0; i < argc; i++) {
03154             volatile VALUE v;
03155             VALUE e = rb_ary_entry(args0, i);
03156 
03157             if (0 < i)
03158                 rb_str_buf_cat_ascii(source, "|");
03159 
03160             v = rb_check_regexp_type(e);
03161             if (!NIL_P(v)) {
03162                 rb_encoding *enc = rb_enc_get(v);
03163                 if (!rb_enc_asciicompat(enc)) {
03164                     if (!has_ascii_incompat)
03165                         has_ascii_incompat = enc;
03166                     else if (has_ascii_incompat != enc)
03167                         rb_raise(rb_eArgError, "incompatible encodings: %s and %s",
03168                             rb_enc_name(has_ascii_incompat), rb_enc_name(enc));
03169                 }
03170                 else if (rb_reg_fixed_encoding_p(v)) {
03171                     if (!has_ascii_compat_fixed)
03172                         has_ascii_compat_fixed = enc;
03173                     else if (has_ascii_compat_fixed != enc)
03174                         rb_raise(rb_eArgError, "incompatible encodings: %s and %s",
03175                             rb_enc_name(has_ascii_compat_fixed), rb_enc_name(enc));
03176                 }
03177                 else {
03178                     has_asciionly = 1;
03179                 }
03180                 v = rb_reg_to_s(v);
03181             }
03182             else {
03183                 rb_encoding *enc;
03184                 StringValue(e);
03185                 enc = rb_enc_get(e);
03186                 if (!rb_enc_str_asciicompat_p(e)) {
03187                     if (!has_ascii_incompat)
03188                         has_ascii_incompat = enc;
03189                     else if (has_ascii_incompat != enc)
03190                         rb_raise(rb_eArgError, "incompatible encodings: %s and %s",
03191                             rb_enc_name(has_ascii_incompat), rb_enc_name(enc));
03192                 }
03193                 else if (rb_enc_str_asciionly_p(e)) {
03194                     has_asciionly = 1;
03195                 }
03196                 else {
03197                     if (!has_ascii_compat_fixed)
03198                         has_ascii_compat_fixed = enc;
03199                     else if (has_ascii_compat_fixed != enc)
03200                         rb_raise(rb_eArgError, "incompatible encodings: %s and %s",
03201                             rb_enc_name(has_ascii_compat_fixed), rb_enc_name(enc));
03202                 }
03203                 v = rb_reg_s_quote(Qnil, e);
03204             }
03205             if (has_ascii_incompat) {
03206                 if (has_asciionly) {
03207                     rb_raise(rb_eArgError, "ASCII incompatible encoding: %s",
03208                         rb_enc_name(has_ascii_incompat));
03209                 }
03210                 if (has_ascii_compat_fixed) {
03211                     rb_raise(rb_eArgError, "incompatible encodings: %s and %s",
03212                         rb_enc_name(has_ascii_incompat), rb_enc_name(has_ascii_compat_fixed));
03213                 }
03214             }
03215 
03216             if (i == 0) {
03217                 rb_enc_copy(source, v);
03218             }
03219             rb_str_append(source, v);
03220         }
03221 
03222         if (has_ascii_incompat) {
03223             result_enc = has_ascii_incompat;
03224         }
03225         else if (has_ascii_compat_fixed) {
03226             result_enc = has_ascii_compat_fixed;
03227         }
03228         else {
03229             result_enc = rb_ascii8bit_encoding();
03230         }
03231 
03232         rb_enc_associate(source, result_enc);
03233         return rb_class_new_instance(1, &source, rb_cRegexp);
03234     }
03235 }
03236 
03237 /*
03238  *  call-seq:
03239  *     Regexp.union(pat1, pat2, ...)            -> new_regexp
03240  *     Regexp.union(pats_ary)                   -> new_regexp
03241  *
03242  *  Return a <code>Regexp</code> object that is the union of the given
03243  *  <em>pattern</em>s, i.e., will match any of its parts. The <em>pattern</em>s
03244  *  can be Regexp objects, in which case their options will be preserved, or
03245  *  Strings. If no patterns are given, returns <code>/(?!)/</code>.
03246  *  The behavior is unspecified if any given <em>pattern</em> contains a capture.
03247  *
03248  *     Regexp.union                         #=> /(?!)/
03249  *     Regexp.union("penzance")             #=> /penzance/
03250  *     Regexp.union("a+b*c")                #=> /a\+b\*c/
03251  *     Regexp.union("skiing", "sledding")   #=> /skiing|sledding/
03252  *     Regexp.union(["skiing", "sledding"]) #=> /skiing|sledding/
03253  *     Regexp.union(/dogs/, /cats/i)        #=> /(?-mix:dogs)|(?i-mx:cats)/
03254  */
03255 static VALUE
03256 rb_reg_s_union_m(VALUE self, VALUE args)
03257 {
03258     VALUE v;
03259     if (RARRAY_LEN(args) == 1 &&
03260         !NIL_P(v = rb_check_array_type(rb_ary_entry(args, 0)))) {
03261         return rb_reg_s_union(self, v);
03262     }
03263     return rb_reg_s_union(self, args);
03264 }
03265 
03266 /* :nodoc: */
03267 static VALUE
03268 rb_reg_init_copy(VALUE copy, VALUE re)
03269 {
03270     onig_errmsg_buffer err = "";
03271     const char *s;
03272     long len;
03273 
03274     if (!OBJ_INIT_COPY(copy, re)) return copy;
03275     rb_reg_check(re);
03276     s = RREGEXP_SRC_PTR(re);
03277     len = RREGEXP_SRC_LEN(re);
03278     if (rb_reg_initialize(copy, s, len, rb_enc_get(re), rb_reg_options(re),
03279                 err, NULL, 0) != 0) {
03280         rb_reg_raise(s, len, err, re);
03281     }
03282     return copy;
03283 }
03284 
03285 VALUE
03286 rb_reg_regsub(VALUE str, VALUE src, struct re_registers *regs, VALUE regexp)
03287 {
          /*
           * Expands the backslash sequences of the replacement template str
           * using the match described by regs over the subject string src.
           *
           * Returns str itself when no backslash sequence had to be
           * processed (val is never allocated); otherwise returns a new
           * buffer holding the expanded text.
           *
           * Pointer roles: s is the scan position, e the end of str, p the
           * start of template text that has not been copied to val yet, and
           * ss (inside the loop) the position of the current backslash.
           */
03288     VALUE val = 0;
03289     char *p, *s, *e;
03290     int no, clen;
03291     rb_encoding *str_enc = rb_enc_get(str);
03292     rb_encoding *src_enc = rb_enc_get(src);
03293     int acompat = rb_enc_asciicompat(str_enc);
          /*
           * ASCGET: for an ASCII-compatible str_enc it looks at one byte only,
           * sets *cl to 1 and yields the byte if it is ASCII, -1 otherwise;
           * for other encodings it defers to rb_enc_ascget().
           */
03294 #define ASCGET(s,e,cl) (acompat ? (*(cl)=1,ISASCII((s)[0])?(s)[0]:-1) : rb_enc_ascget((s), (e), (cl), str_enc))
03295 
03296     p = s = RSTRING_PTR(str);
03297     e = s + RSTRING_LEN(str);
03298 
03299     while (s < e) {
03300         int c = ASCGET(s, e, &clen);
03301         char *ss;
03302 
03303         if (c == -1) {
              /* non-ASCII character: step over it, it is copied later via p */
03304             s += mbclen(s, e, str_enc);
03305             continue;
03306         }
03307         ss = s;
03308         s += clen;
03309 
              /* only a backslash followed by at least one more character is special */
03310         if (c != '\\' || s == e) continue;
03311 
03312         if (!val) {
03313             val = rb_str_buf_new(ss-p);
03314         }
              /* flush the literal text that precedes the backslash */
03315         rb_enc_str_buf_cat(val, p, ss-p, str_enc);
03316 
03317         c = ASCGET(s, e, &clen);
03318         if (c == -1) {
              /* backslash + non-ASCII character: copied through unchanged */
03319             s += mbclen(s, e, str_enc);
03320             rb_enc_str_buf_cat(val, ss, s-ss, str_enc);
03321             p = s;
03322             continue;
03323         }
03324         s += clen;
03325 
              /* by default the two-character escape is consumed (not copied) */
03326         p = s;
03327         switch (c) {
                /* \1 .. \9: numbered group, dropped when numbered captures are inactive */
03328           case '1': case '2': case '3': case '4':
03329           case '5': case '6': case '7': case '8': case '9':
03330             if (onig_noname_group_capture_is_active(RREGEXP(regexp)->ptr)) {
03331                 no = c - '0';
03332             }
03333             else {
03334                 continue;
03335             }
03336             break;
03337 
                /* \k<name>: named group; a missing '>' raises, \k without '<' is copied as is */
03338           case 'k':
03339             if (s < e && ASCGET(s, e, &clen) == '<') {
03340                 char *name, *name_end;
03341 
03342                 name_end = name = s + clen;
03343                 while (name_end < e) {
03344                     c = ASCGET(name_end, e, &clen);
03345                     if (c == '>') break;
03346                     name_end += c == -1 ? mbclen(name_end, e, str_enc) : clen;
03347                 }
03348                 if (name_end < e) {
03349                     no = name_to_backref_number(regs, regexp, name, name_end);
                      /* clen is the length of the '>' just read; resume after it */
03350                     p = s = name_end + clen;
03351                     break;
03352                 }
03353                 else {
03354                     rb_raise(rb_eRuntimeError, "invalid group name reference format");
03355                 }
03356             }
03357 
03358             rb_enc_str_buf_cat(val, ss, s-ss, str_enc);
03359             continue;
03360 
                /* \0 and \&: the whole match */
03361           case '0':
03362           case '&':
03363             no = 0;
03364             break;
03365 
                /* \`: the part of src before the match */
03366           case '`':
03367             rb_enc_str_buf_cat(val, RSTRING_PTR(src), BEG(0), src_enc);
03368             continue;
03369 
                /* \': the part of src after the match */
03370           case '\'':
03371             rb_enc_str_buf_cat(val, RSTRING_PTR(src)+END(0), RSTRING_LEN(src)-END(0), src_enc);
03372             continue;
03373 
                /* \+: the highest-numbered group whose start is set; nothing if there is none */
03374           case '+':
03375             no = regs->num_regs-1;
03376             while (BEG(no) == -1 && no > 0) no--;
03377             if (no == 0) continue;
03378             break;
03379 
                /* \\: a single backslash (the second one, clen bytes long) */
03380           case '\\':
03381             rb_enc_str_buf_cat(val, s-clen, clen, str_enc);
03382             continue;
03383 
                /* any other escape is copied through unchanged */
03384           default:
03385             rb_enc_str_buf_cat(val, ss, s-ss, str_enc);
03386             continue;
03387         }
03388 
              /* append the text of group no; out-of-range or unset groups add nothing */
03389         if (no >= 0) {
03390             if (no >= regs->num_regs) continue;
03391             if (BEG(no) == -1) continue;
03392             rb_enc_str_buf_cat(val, RSTRING_PTR(src)+BEG(no), END(no)-BEG(no), src_enc);
03393         }
03394     }
03395 
          /* no escape was processed: the template is returned untouched */
03396     if (!val) return str;
          /* append the literal tail that follows the last escape */
03397     if (p < e) {
03398         rb_enc_str_buf_cat(val, p, e-p, str_enc);
03399     }
03400 
03401     return val;
03402 }
03403 
03404 static VALUE
03405 kcode_getter(void)
03406 {
          /*
           * Getter installed for the virtual variables $KCODE and $-K in
           * Init_Regexp(): emits a warning and always yields nil.
           */
03407     rb_warn("variable $KCODE is no longer effective");
03408     return Qnil;
03409 }
03410 
03411 static void
03412 kcode_setter(VALUE val, ID id)
03413 {
          /*
           * Setter installed for $KCODE and $-K in Init_Regexp(): val and id
           * are not used, the assignment only produces a warning.
           */
03414     rb_warn("variable $KCODE is no longer effective; ignored");
03415 }
03416 
03417 static VALUE
03418 ignorecase_getter(void)
03419 {
          /*
           * Getter installed for the virtual variable $= in Init_Regexp():
           * emits a warning and always yields false.
           */
03420     rb_warn("variable $= is no longer effective");
03421     return Qfalse;
03422 }
03423 
03424 static void
03425 ignorecase_setter(VALUE val, ID id)
03426 {
          /*
           * Setter installed for $= in Init_Regexp(): val and id are not
           * used, the assignment only produces a warning.
           */
03427     rb_warn("variable $= is no longer effective; ignored");
03428 }
03429 
03430 static VALUE
03431 match_getter(void)
03432 {
03433     VALUE match = rb_backref_get();
03434 
03435     if (NIL_P(match)) return Qnil;
03436     rb_match_busy(match);
03437     return match;
03438 }
03439 
03440 static void
03441 match_setter(VALUE val)
03442 {
03443     if (!NIL_P(val)) {
03444         Check_Type(val, T_MATCH);
03445     }
03446     rb_backref_set(val);
03447 }
03448 
03449 /*
03450  *  call-seq:
03451  *     Regexp.last_match           -> matchdata
03452  *     Regexp.last_match(n)        -> str
03453  *
03454  *  The first form returns the MatchData object generated by the
03455  *  last successful pattern match.  Equivalent to reading the special global
03456  *  variable <code>$~</code> (see Special global variables in Regexp for
03457  *  details).
03458  *
03459  *  The second form returns the <i>n</i>th field in this MatchData object.
03460  *  _n_ can be a string or symbol to reference a named capture.
03461  *
03462  *  Note that the last_match is local to the thread and method scope of the
03463  *  method that did the pattern match.
03464  *
03465  *     /c(.)t/ =~ 'cat'        #=> 0
03466  *     Regexp.last_match       #=> #<MatchData "cat" 1:"a">
03467  *     Regexp.last_match(0)    #=> "cat"
03468  *     Regexp.last_match(1)    #=> "a"
03469  *     Regexp.last_match(2)    #=> nil
03470  *
03471  *     /(?<lhs>\w+)\s*=\s*(?<rhs>\w+)/ =~ "var = val"
03472  *     Regexp.last_match       #=> #<MatchData "var = val" lhs:"var" rhs:"val">
03473  *     Regexp.last_match(:lhs) #=> "var"
03474  *     Regexp.last_match(:rhs) #=> "val"
03475  */
03476 
03477 static VALUE
03478 rb_reg_s_last_match(int argc, VALUE *argv)
03479 {
03480     VALUE nth;
03481 
03482     if (argc > 0 && rb_scan_args(argc, argv, "01", &nth) == 1) {
03483         VALUE match = rb_backref_get();
03484         int n;
03485         if (NIL_P(match)) return Qnil;
03486         n = match_backref_number(match, nth);
03487         return rb_reg_nth_match(n, match);
03488     }
03489     return match_getter();
03490 }
03491 
03492 static void
03493 re_warn(const char *s)
03494 {
          /*
           * Warning callback handed to onig_set_warn_func() and
           * onig_set_verb_warn_func() in Init_Regexp().  The message goes
           * through "%s" so that it is never interpreted as a format string.
           */
03495     rb_warn("%s", s);
03496 }
03497 
03498 /*
03499  *  Document-class: RegexpError
03500  *
03501  *  Raised when given an invalid regexp expression.
03502  *
03503  *     Regexp.new("?")
03504  *
03505  *  <em>raises the exception:</em>
03506  *
03507  *     RegexpError: target of repeat operator is not specified: /?/
03508  */
03509 
03510 /*
03511  *  Document-class: Regexp
03512  *
03513  *  A <code>Regexp</code> holds a regular expression, used to match a pattern
03514  *  against strings. Regexps are created using the <code>/.../</code> and
03515  *  <code>%r{...}</code> literals, and by the <code>Regexp::new</code>
03516  *  constructor.
03517  *
03518  *  :include: doc/re.rdoc
03519  */
03520 
03521 void
03522 Init_Regexp(void)
03523 {
03524     rb_eRegexpError = rb_define_class("RegexpError", rb_eStandardError);
03525 
          /*
           * Regexp engine setup: casetable becomes the default case
           * conversion table, ASCII the default encoding, and both engine
           * warning hooks are routed to re_warn().
           */
03526     onigenc_set_default_caseconv_table((UChar*)casetable);
03527     onigenc_set_default_encoding(ONIG_ENCODING_ASCII);
03528     onig_set_warn_func(re_warn);
03529     onig_set_verb_warn_func(re_warn);
03530 
          /* match-related virtual variables; only $~ has a setter */
03531     rb_define_virtual_variable("$~", match_getter, match_setter);
03532     rb_define_virtual_variable("$&", last_match_getter, 0);
03533     rb_define_virtual_variable("$`", prematch_getter, 0);
03534     rb_define_virtual_variable("$'", postmatch_getter, 0);
03535     rb_define_virtual_variable("$+", last_paren_match_getter, 0);
03536 
          /* variables whose accessors only emit a "no longer effective" warning */
03537     rb_define_virtual_variable("$=", ignorecase_getter, ignorecase_setter);
03538     rb_define_virtual_variable("$KCODE", kcode_getter, kcode_setter);
03539     rb_define_virtual_variable("$-K", kcode_getter, kcode_setter);
03540 
03541     rb_cRegexp = rb_define_class("Regexp", rb_cObject);
          /* Regexp: allocator and class-level methods ("quote" and "escape" share one function) */
03542     rb_define_alloc_func(rb_cRegexp, rb_reg_s_alloc);
03543     rb_define_singleton_method(rb_cRegexp, "compile", rb_class_new_instance, -1);
03544     rb_define_singleton_method(rb_cRegexp, "quote", rb_reg_s_quote, 1);
03545     rb_define_singleton_method(rb_cRegexp, "escape", rb_reg_s_quote, 1);
03546     rb_define_singleton_method(rb_cRegexp, "union", rb_reg_s_union_m, -2);
03547     rb_define_singleton_method(rb_cRegexp, "last_match", rb_reg_s_last_match, -1);
03548     rb_define_singleton_method(rb_cRegexp, "try_convert", rb_reg_s_try_convert, 1);
03549 
          /* Regexp: instance methods ("eql?" and "==" share rb_reg_equal) */
03550     rb_define_method(rb_cRegexp, "initialize", rb_reg_initialize_m, -1);
03551     rb_define_method(rb_cRegexp, "initialize_copy", rb_reg_init_copy, 1);
03552     rb_define_method(rb_cRegexp, "hash", rb_reg_hash, 0);
03553     rb_define_method(rb_cRegexp, "eql?", rb_reg_equal, 1);
03554     rb_define_method(rb_cRegexp, "==", rb_reg_equal, 1);
03555     rb_define_method(rb_cRegexp, "=~", rb_reg_match, 1);
03556     rb_define_method(rb_cRegexp, "===", rb_reg_eqq, 1);
03557     rb_define_method(rb_cRegexp, "~", rb_reg_match2, 0);
03558     rb_define_method(rb_cRegexp, "match", rb_reg_match_m, -1);
03559     rb_define_method(rb_cRegexp, "to_s", rb_reg_to_s, 0);
03560     rb_define_method(rb_cRegexp, "inspect", rb_reg_inspect, 0);
03561     rb_define_method(rb_cRegexp, "source", rb_reg_source, 0);
03562     rb_define_method(rb_cRegexp, "casefold?", rb_reg_casefold_p, 0);
03563     rb_define_method(rb_cRegexp, "options", rb_reg_options_m, 0);
03564     rb_define_method(rb_cRegexp, "encoding", rb_obj_encoding, 0); /* in encoding.c */
03565     rb_define_method(rb_cRegexp, "fixed_encoding?", rb_reg_fixed_encoding_p, 0);
03566     rb_define_method(rb_cRegexp, "names", rb_reg_names, 0);
03567     rb_define_method(rb_cRegexp, "named_captures", rb_reg_named_captures, 0);
03568 
03569     /* see Regexp.options and Regexp.new */
03570     rb_define_const(rb_cRegexp, "IGNORECASE", INT2FIX(ONIG_OPTION_IGNORECASE));
03571     /* see Regexp.options and Regexp.new */
03572     rb_define_const(rb_cRegexp, "EXTENDED", INT2FIX(ONIG_OPTION_EXTEND));
03573     /* see Regexp.options and Regexp.new */
03574     rb_define_const(rb_cRegexp, "MULTILINE", INT2FIX(ONIG_OPTION_MULTILINE));
03575     /* see Regexp.options and Regexp.new */
03576     rb_define_const(rb_cRegexp, "FIXEDENCODING", INT2FIX(ARG_ENCODING_FIXED));
03577     /* see Regexp.options and Regexp.new */
03578     rb_define_const(rb_cRegexp, "NOENCODING", INT2FIX(ARG_ENCODING_NONE));
03579 
          /* reg_cache is registered with rb_global_variable() so the cached object stays reachable for the GC */
03580     rb_global_variable(&reg_cache);
03581 
03582     rb_cMatch  = rb_define_class("MatchData", rb_cObject);
          /* MatchData: "new" is undefined on the class itself, so instances cannot be created via MatchData.new */
03583     rb_define_alloc_func(rb_cMatch, match_alloc);
03584     rb_undef_method(CLASS_OF(rb_cMatch), "new");
03585 
          /* MatchData: instance methods ("size"/"length" and "eql?"/"==" share implementations) */
03586     rb_define_method(rb_cMatch, "initialize_copy", match_init_copy, 1);
03587     rb_define_method(rb_cMatch, "regexp", match_regexp, 0);
03588     rb_define_method(rb_cMatch, "names", match_names, 0);
03589     rb_define_method(rb_cMatch, "size", match_size, 0);
03590     rb_define_method(rb_cMatch, "length", match_size, 0);
03591     rb_define_method(rb_cMatch, "offset", match_offset, 1);
03592     rb_define_method(rb_cMatch, "begin", match_begin, 1);
03593     rb_define_method(rb_cMatch, "end", match_end, 1);
03594     rb_define_method(rb_cMatch, "to_a", match_to_a, 0);
03595     rb_define_method(rb_cMatch, "[]", match_aref, -1);
03596     rb_define_method(rb_cMatch, "captures", match_captures, 0);
03597     rb_define_method(rb_cMatch, "values_at", match_values_at, -1);
03598     rb_define_method(rb_cMatch, "pre_match", rb_reg_match_pre, 0);
03599     rb_define_method(rb_cMatch, "post_match", rb_reg_match_post, 0);
03600     rb_define_method(rb_cMatch, "to_s", match_to_s, 0);
03601     rb_define_method(rb_cMatch, "inspect", match_inspect, 0);
03602     rb_define_method(rb_cMatch, "string", match_string, 0);
03603     rb_define_method(rb_cMatch, "hash", match_hash, 0);
03604     rb_define_method(rb_cMatch, "eql?", match_equal, 1);
03605     rb_define_method(rb_cMatch, "==", match_equal, 1);
03606 }
03607