Ruby
2.0.0p247 (2013-06-27 revision 41674)
|
00001 /********************************************************************** 00002 regparse.c - Onigmo (Oniguruma-mod) (regular expression library) 00003 **********************************************************************/ 00004 /*- 00005 * Copyright (c) 2002-2008 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> 00006 * Copyright (c) 2011-2013 K.Takata <kentkt AT csc DOT jp> 00007 * All rights reserved. 00008 * 00009 * Redistribution and use in source and binary forms, with or without 00010 * modification, are permitted provided that the following conditions 00011 * are met: 00012 * 1. Redistributions of source code must retain the above copyright 00013 * notice, this list of conditions and the following disclaimer. 00014 * 2. Redistributions in binary form must reproduce the above copyright 00015 * notice, this list of conditions and the following disclaimer in the 00016 * documentation and/or other materials provided with the distribution. 00017 * 00018 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 00019 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 00020 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 00021 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 00022 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 00023 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 00024 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 00025 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 00026 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 00027 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 00028 * SUCH DAMAGE. 
00029 */ 00030 00031 #include "regparse.h" 00032 00033 #define WARN_BUFSIZE 256 00034 00035 #define CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS 00036 00037 00038 const OnigSyntaxType OnigSyntaxRuby = { 00039 (( SYN_GNU_REGEX_OP | ONIG_SYN_OP_QMARK_NON_GREEDY | 00040 ONIG_SYN_OP_ESC_OCTAL3 | ONIG_SYN_OP_ESC_X_HEX2 | 00041 ONIG_SYN_OP_ESC_X_BRACE_HEX8 | ONIG_SYN_OP_ESC_CONTROL_CHARS | 00042 ONIG_SYN_OP_ESC_C_CONTROL ) 00043 & ~ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END ) 00044 , ( ONIG_SYN_OP2_QMARK_GROUP_EFFECT | 00045 ONIG_SYN_OP2_OPTION_RUBY | 00046 ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP | ONIG_SYN_OP2_ESC_K_NAMED_BACKREF | 00047 ONIG_SYN_OP2_ESC_G_SUBEXP_CALL | 00048 ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY | 00049 ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT | 00050 ONIG_SYN_OP2_PLUS_POSSESSIVE_REPEAT | 00051 ONIG_SYN_OP2_CCLASS_SET_OP | ONIG_SYN_OP2_ESC_CAPITAL_C_BAR_CONTROL | 00052 ONIG_SYN_OP2_ESC_CAPITAL_M_BAR_META | ONIG_SYN_OP2_ESC_V_VTAB | 00053 ONIG_SYN_OP2_ESC_H_XDIGIT | 00054 ONIG_SYN_OP2_ESC_CAPITAL_X_EXTENDED_GRAPHEME_CLUSTER | 00055 ONIG_SYN_OP2_QMARK_LPAREN_CONDITION | 00056 ONIG_SYN_OP2_ESC_CAPITAL_R_LINEBREAK | 00057 ONIG_SYN_OP2_ESC_CAPITAL_K_KEEP ) 00058 , ( SYN_GNU_REGEX_BV | 00059 ONIG_SYN_ALLOW_INTERVAL_LOW_ABBREV | 00060 ONIG_SYN_DIFFERENT_LEN_ALT_LOOK_BEHIND | 00061 ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP | 00062 ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME | 00063 ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY | 00064 ONIG_SYN_WARN_CC_OP_NOT_ESCAPED | 00065 ONIG_SYN_WARN_CC_DUP | 00066 ONIG_SYN_WARN_REDUNDANT_NESTED_REPEAT ) 00067 , ( ONIG_OPTION_ASCII_RANGE | ONIG_OPTION_POSIX_BRACKET_ALL_RANGE | 00068 ONIG_OPTION_WORD_BOUND_ALL_RANGE ) 00069 , 00070 { 00071 (OnigCodePoint )'\\' /* esc */ 00072 , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar '.' */ 00073 , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anytime '*' */ 00074 , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* zero or one time '?' 
*/ 00075 , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* one or more time '+' */ 00076 , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar anytime */ 00077 } 00078 }; 00079 00080 const OnigSyntaxType* OnigDefaultSyntax = ONIG_SYNTAX_RUBY; 00081 00082 extern void onig_null_warn(const char* s ARG_UNUSED) { } 00083 00084 #ifdef DEFAULT_WARN_FUNCTION 00085 static OnigWarnFunc onig_warn = (OnigWarnFunc )DEFAULT_WARN_FUNCTION; 00086 #else 00087 static OnigWarnFunc onig_warn = onig_null_warn; 00088 #endif 00089 00090 #ifdef DEFAULT_VERB_WARN_FUNCTION 00091 static OnigWarnFunc onig_verb_warn = (OnigWarnFunc )DEFAULT_VERB_WARN_FUNCTION; 00092 #else 00093 static OnigWarnFunc onig_verb_warn = onig_null_warn; 00094 #endif 00095 00096 extern void onig_set_warn_func(OnigWarnFunc f) 00097 { 00098 onig_warn = f; 00099 } 00100 00101 extern void onig_set_verb_warn_func(OnigWarnFunc f) 00102 { 00103 onig_verb_warn = f; 00104 } 00105 00106 static void CC_DUP_WARN(ScanEnv *env); 00107 00108 static void 00109 bbuf_free(BBuf* bbuf) 00110 { 00111 if (IS_NOT_NULL(bbuf)) { 00112 if (IS_NOT_NULL(bbuf->p)) xfree(bbuf->p); 00113 xfree(bbuf); 00114 } 00115 } 00116 00117 static int 00118 bbuf_clone(BBuf** rto, BBuf* from) 00119 { 00120 int r; 00121 BBuf *to; 00122 00123 *rto = to = (BBuf* )xmalloc(sizeof(BBuf)); 00124 CHECK_NULL_RETURN_MEMERR(to); 00125 r = BBUF_INIT(to, from->alloc); 00126 if (r != 0) return r; 00127 to->used = from->used; 00128 xmemcpy(to->p, from->p, from->used); 00129 return 0; 00130 } 00131 00132 #define BACKREF_REL_TO_ABS(rel_no, env) \ 00133 ((env)->num_mem + 1 + (rel_no)) 00134 00135 #define ONOFF(v,f,negative) (negative) ? ((v) &= ~(f)) : ((v) |= (f)) 00136 00137 #define MBCODE_START_POS(enc) \ 00138 (OnigCodePoint )(ONIGENC_MBC_MINLEN(enc) > 1 ? 
0 : 0x80) 00139 00140 #define SET_ALL_MULTI_BYTE_RANGE(enc, pbuf) \ 00141 add_code_range_to_buf(pbuf, env, MBCODE_START_POS(enc), ONIG_LAST_CODE_POINT) 00142 00143 #define ADD_ALL_MULTI_BYTE_RANGE(enc, mbuf) do {\ 00144 if (! ONIGENC_IS_SINGLEBYTE(enc)) {\ 00145 r = SET_ALL_MULTI_BYTE_RANGE(enc, &(mbuf));\ 00146 if (r) return r;\ 00147 }\ 00148 } while (0) 00149 00150 00151 #define BITSET_SET_BIT_CHKDUP(bs, pos) do { \ 00152 if (BITSET_AT(bs, pos)) CC_DUP_WARN(env); \ 00153 BS_ROOM(bs, pos) |= BS_BIT(pos); \ 00154 } while (0) 00155 00156 #define BITSET_IS_EMPTY(bs,empty) do {\ 00157 int i;\ 00158 empty = 1;\ 00159 for (i = 0; i < BITSET_SIZE; i++) {\ 00160 if ((bs)[i] != 0) {\ 00161 empty = 0; break;\ 00162 }\ 00163 }\ 00164 } while (0) 00165 00166 static void 00167 bitset_set_range(ScanEnv *env, BitSetRef bs, int from, int to) 00168 { 00169 int i; 00170 for (i = from; i <= to && i < SINGLE_BYTE_SIZE; i++) { 00171 BITSET_SET_BIT_CHKDUP(bs, i); 00172 } 00173 } 00174 00175 #if 0 00176 static void 00177 bitset_set_all(BitSetRef bs) 00178 { 00179 int i; 00180 for (i = 0; i < BITSET_SIZE; i++) { bs[i] = ~((Bits )0); } 00181 } 00182 #endif 00183 00184 static void 00185 bitset_invert(BitSetRef bs) 00186 { 00187 int i; 00188 for (i = 0; i < BITSET_SIZE; i++) { bs[i] = ~(bs[i]); } 00189 } 00190 00191 static void 00192 bitset_invert_to(BitSetRef from, BitSetRef to) 00193 { 00194 int i; 00195 for (i = 0; i < BITSET_SIZE; i++) { to[i] = ~(from[i]); } 00196 } 00197 00198 static void 00199 bitset_and(BitSetRef dest, BitSetRef bs) 00200 { 00201 int i; 00202 for (i = 0; i < BITSET_SIZE; i++) { dest[i] &= bs[i]; } 00203 } 00204 00205 static void 00206 bitset_or(BitSetRef dest, BitSetRef bs) 00207 { 00208 int i; 00209 for (i = 0; i < BITSET_SIZE; i++) { dest[i] |= bs[i]; } 00210 } 00211 00212 static void 00213 bitset_copy(BitSetRef dest, BitSetRef bs) 00214 { 00215 int i; 00216 for (i = 0; i < BITSET_SIZE; i++) { dest[i] = bs[i]; } 00217 } 00218 00219 extern int 00220 
onig_strncmp(const UChar* s1, const UChar* s2, int n) 00221 { 00222 int x; 00223 00224 while (n-- > 0) { 00225 x = *s2++ - *s1++; 00226 if (x) return x; 00227 } 00228 return 0; 00229 } 00230 00231 extern void 00232 onig_strcpy(UChar* dest, const UChar* src, const UChar* end) 00233 { 00234 ptrdiff_t len = end - src; 00235 if (len > 0) { 00236 xmemcpy(dest, src, len); 00237 dest[len] = (UChar )0; 00238 } 00239 } 00240 00241 #ifdef USE_NAMED_GROUP 00242 static UChar* 00243 strdup_with_null(OnigEncoding enc, UChar* s, UChar* end) 00244 { 00245 ptrdiff_t slen; 00246 int term_len, i; 00247 UChar *r; 00248 00249 slen = end - s; 00250 term_len = ONIGENC_MBC_MINLEN(enc); 00251 00252 r = (UChar* )xmalloc(slen + term_len); 00253 CHECK_NULL_RETURN(r); 00254 xmemcpy(r, s, slen); 00255 00256 for (i = 0; i < term_len; i++) 00257 r[slen + i] = (UChar )0; 00258 00259 return r; 00260 } 00261 #endif 00262 00263 /* scan pattern methods */ 00264 #define PEND_VALUE 0 00265 00266 #ifdef __GNUC__ 00267 /* get rid of Wunused-but-set-variable and Wuninitialized */ 00268 #define PFETCH_READY UChar* pfetch_prev = NULL; (void)pfetch_prev 00269 #else 00270 #define PFETCH_READY UChar* pfetch_prev 00271 #endif 00272 #define PEND (p < end ? 0 : 1) 00273 #define PUNFETCH p = pfetch_prev 00274 #define PINC do { \ 00275 pfetch_prev = p; \ 00276 p += enclen(enc, p, end); \ 00277 } while (0) 00278 #define PFETCH(c) do { \ 00279 c = ((enc->max_enc_len == 1) ? *p : ONIGENC_MBC_TO_CODE(enc, p, end)); \ 00280 pfetch_prev = p; \ 00281 p += enclen(enc, p, end); \ 00282 } while (0) 00283 00284 #define PPEEK (p < end ? 
ONIGENC_MBC_TO_CODE(enc, p, end) : PEND_VALUE) 00285 #define PPEEK_IS(c) (PPEEK == (OnigCodePoint )c) 00286 00287 static UChar* 00288 strcat_capa(UChar* dest, UChar* dest_end, const UChar* src, const UChar* src_end, 00289 size_t capa) 00290 { 00291 UChar* r; 00292 00293 if (dest) 00294 r = (UChar* )xrealloc(dest, capa + 1); 00295 else 00296 r = (UChar* )xmalloc(capa + 1); 00297 00298 CHECK_NULL_RETURN(r); 00299 onig_strcpy(r + (dest_end - dest), src, src_end); 00300 return r; 00301 } 00302 00303 /* dest on static area */ 00304 static UChar* 00305 strcat_capa_from_static(UChar* dest, UChar* dest_end, 00306 const UChar* src, const UChar* src_end, size_t capa) 00307 { 00308 UChar* r; 00309 00310 r = (UChar* )xmalloc(capa + 1); 00311 CHECK_NULL_RETURN(r); 00312 onig_strcpy(r, dest, dest_end); 00313 onig_strcpy(r + (dest_end - dest), src, src_end); 00314 return r; 00315 } 00316 00317 00318 #ifdef USE_ST_LIBRARY 00319 00320 #include "ruby/st.h" 00321 00322 typedef struct { 00323 const UChar* s; 00324 const UChar* end; 00325 } st_str_end_key; 00326 00327 static int 00328 str_end_cmp(st_data_t xp, st_data_t yp) 00329 { 00330 const st_str_end_key *x, *y; 00331 const UChar *p, *q; 00332 int c; 00333 00334 x = (const st_str_end_key *)xp; 00335 y = (const st_str_end_key *)yp; 00336 if ((x->end - x->s) != (y->end - y->s)) 00337 return 1; 00338 00339 p = x->s; 00340 q = y->s; 00341 while (p < x->end) { 00342 c = (int )*p - (int )*q; 00343 if (c != 0) return c; 00344 00345 p++; q++; 00346 } 00347 00348 return 0; 00349 } 00350 00351 static st_index_t 00352 str_end_hash(st_data_t xp) 00353 { 00354 const st_str_end_key *x = (const st_str_end_key *)xp; 00355 const UChar *p; 00356 st_index_t val = 0; 00357 00358 p = x->s; 00359 while (p < x->end) { 00360 val = val * 997 + (int )*p++; 00361 } 00362 00363 return val + (val >> 5); 00364 } 00365 00366 extern hash_table_type* 00367 onig_st_init_strend_table_with_size(st_index_t size) 00368 { 00369 static const struct st_hash_type hashType 
= { 00370 str_end_cmp, 00371 str_end_hash, 00372 }; 00373 00374 return (hash_table_type* ) 00375 onig_st_init_table_with_size(&hashType, size); 00376 } 00377 00378 extern int 00379 onig_st_lookup_strend(hash_table_type* table, const UChar* str_key, 00380 const UChar* end_key, hash_data_type *value) 00381 { 00382 st_str_end_key key; 00383 00384 key.s = (UChar* )str_key; 00385 key.end = (UChar* )end_key; 00386 00387 return onig_st_lookup(table, (st_data_t )(&key), value); 00388 } 00389 00390 extern int 00391 onig_st_insert_strend(hash_table_type* table, const UChar* str_key, 00392 const UChar* end_key, hash_data_type value) 00393 { 00394 st_str_end_key* key; 00395 int result; 00396 00397 key = (st_str_end_key* )xmalloc(sizeof(st_str_end_key)); 00398 key->s = (UChar* )str_key; 00399 key->end = (UChar* )end_key; 00400 result = onig_st_insert(table, (st_data_t )key, value); 00401 if (result) { 00402 xfree(key); 00403 } 00404 return result; 00405 } 00406 00407 #endif /* USE_ST_LIBRARY */ 00408 00409 00410 #ifdef USE_NAMED_GROUP 00411 00412 #define INIT_NAME_BACKREFS_ALLOC_NUM 8 00413 00414 typedef struct { 00415 UChar* name; 00416 size_t name_len; /* byte length */ 00417 int back_num; /* number of backrefs */ 00418 int back_alloc; 00419 int back_ref1; 00420 int* back_refs; 00421 } NameEntry; 00422 00423 #ifdef USE_ST_LIBRARY 00424 00425 typedef st_table NameTable; 00426 typedef st_data_t HashDataType; /* 1.6 st.h doesn't define st_data_t type */ 00427 00428 #ifdef ONIG_DEBUG 00429 static int 00430 i_print_name_entry(UChar* key, NameEntry* e, void* arg) 00431 { 00432 int i; 00433 FILE* fp = (FILE* )arg; 00434 00435 fprintf(fp, "%s: ", e->name); 00436 if (e->back_num == 0) 00437 fputs("-", fp); 00438 else if (e->back_num == 1) 00439 fprintf(fp, "%d", e->back_ref1); 00440 else { 00441 for (i = 0; i < e->back_num; i++) { 00442 if (i > 0) fprintf(fp, ", "); 00443 fprintf(fp, "%d", e->back_refs[i]); 00444 } 00445 } 00446 fputs("\n", fp); 00447 return ST_CONTINUE; 00448 } 00449 
00450 extern int 00451 onig_print_names(FILE* fp, regex_t* reg) 00452 { 00453 NameTable* t = (NameTable* )reg->name_table; 00454 00455 if (IS_NOT_NULL(t)) { 00456 fprintf(fp, "name table\n"); 00457 onig_st_foreach(t, i_print_name_entry, (HashDataType )fp); 00458 fputs("\n", fp); 00459 } 00460 return 0; 00461 } 00462 #endif /* ONIG_DEBUG */ 00463 00464 static int 00465 i_free_name_entry(UChar* key, NameEntry* e, void* arg ARG_UNUSED) 00466 { 00467 xfree(e->name); 00468 if (IS_NOT_NULL(e->back_refs)) xfree(e->back_refs); 00469 xfree(key); 00470 xfree(e); 00471 return ST_DELETE; 00472 } 00473 00474 static int 00475 names_clear(regex_t* reg) 00476 { 00477 NameTable* t = (NameTable* )reg->name_table; 00478 00479 if (IS_NOT_NULL(t)) { 00480 onig_st_foreach(t, i_free_name_entry, 0); 00481 } 00482 return 0; 00483 } 00484 00485 extern int 00486 onig_names_free(regex_t* reg) 00487 { 00488 int r; 00489 NameTable* t; 00490 00491 r = names_clear(reg); 00492 if (r) return r; 00493 00494 t = (NameTable* )reg->name_table; 00495 if (IS_NOT_NULL(t)) onig_st_free_table(t); 00496 reg->name_table = (void* )NULL; 00497 return 0; 00498 } 00499 00500 static NameEntry* 00501 name_find(regex_t* reg, const UChar* name, const UChar* name_end) 00502 { 00503 NameEntry* e; 00504 NameTable* t = (NameTable* )reg->name_table; 00505 00506 e = (NameEntry* )NULL; 00507 if (IS_NOT_NULL(t)) { 00508 onig_st_lookup_strend(t, name, name_end, (HashDataType* )((void* )(&e))); 00509 } 00510 return e; 00511 } 00512 00513 typedef struct { 00514 int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*); 00515 regex_t* reg; 00516 void* arg; 00517 int ret; 00518 OnigEncoding enc; 00519 } INamesArg; 00520 00521 static int 00522 i_names(UChar* key ARG_UNUSED, NameEntry* e, INamesArg* arg) 00523 { 00524 int r = (*(arg->func))(e->name, 00525 e->name + e->name_len, 00526 e->back_num, 00527 (e->back_num > 1 ? 
e->back_refs : &(e->back_ref1)), 00528 arg->reg, arg->arg); 00529 if (r != 0) { 00530 arg->ret = r; 00531 return ST_STOP; 00532 } 00533 return ST_CONTINUE; 00534 } 00535 00536 extern int 00537 onig_foreach_name(regex_t* reg, 00538 int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*), void* arg) 00539 { 00540 INamesArg narg; 00541 NameTable* t = (NameTable* )reg->name_table; 00542 00543 narg.ret = 0; 00544 if (IS_NOT_NULL(t)) { 00545 narg.func = func; 00546 narg.reg = reg; 00547 narg.arg = arg; 00548 narg.enc = reg->enc; /* should be pattern encoding. */ 00549 onig_st_foreach(t, i_names, (HashDataType )&narg); 00550 } 00551 return narg.ret; 00552 } 00553 00554 static int 00555 i_renumber_name(UChar* key ARG_UNUSED, NameEntry* e, GroupNumRemap* map) 00556 { 00557 int i; 00558 00559 if (e->back_num > 1) { 00560 for (i = 0; i < e->back_num; i++) { 00561 e->back_refs[i] = map[e->back_refs[i]].new_val; 00562 } 00563 } 00564 else if (e->back_num == 1) { 00565 e->back_ref1 = map[e->back_ref1].new_val; 00566 } 00567 00568 return ST_CONTINUE; 00569 } 00570 00571 extern int 00572 onig_renumber_name_table(regex_t* reg, GroupNumRemap* map) 00573 { 00574 NameTable* t = (NameTable* )reg->name_table; 00575 00576 if (IS_NOT_NULL(t)) { 00577 onig_st_foreach(t, i_renumber_name, (HashDataType )map); 00578 } 00579 return 0; 00580 } 00581 00582 00583 extern int 00584 onig_number_of_names(regex_t* reg) 00585 { 00586 NameTable* t = (NameTable* )reg->name_table; 00587 00588 if (IS_NOT_NULL(t)) 00589 return (int )t->num_entries; 00590 else 00591 return 0; 00592 } 00593 00594 #else /* USE_ST_LIBRARY */ 00595 00596 #define INIT_NAMES_ALLOC_NUM 8 00597 00598 typedef struct { 00599 NameEntry* e; 00600 int num; 00601 int alloc; 00602 } NameTable; 00603 00604 #ifdef ONIG_DEBUG 00605 extern int 00606 onig_print_names(FILE* fp, regex_t* reg) 00607 { 00608 int i, j; 00609 NameEntry* e; 00610 NameTable* t = (NameTable* )reg->name_table; 00611 00612 if (IS_NOT_NULL(t) && t->num > 0) { 00613 
fprintf(fp, "name table\n"); 00614 for (i = 0; i < t->num; i++) { 00615 e = &(t->e[i]); 00616 fprintf(fp, "%s: ", e->name); 00617 if (e->back_num == 0) { 00618 fputs("-", fp); 00619 } 00620 else if (e->back_num == 1) { 00621 fprintf(fp, "%d", e->back_ref1); 00622 } 00623 else { 00624 for (j = 0; j < e->back_num; j++) { 00625 if (j > 0) fprintf(fp, ", "); 00626 fprintf(fp, "%d", e->back_refs[j]); 00627 } 00628 } 00629 fputs("\n", fp); 00630 } 00631 fputs("\n", fp); 00632 } 00633 return 0; 00634 } 00635 #endif 00636 00637 static int 00638 names_clear(regex_t* reg) 00639 { 00640 int i; 00641 NameEntry* e; 00642 NameTable* t = (NameTable* )reg->name_table; 00643 00644 if (IS_NOT_NULL(t)) { 00645 for (i = 0; i < t->num; i++) { 00646 e = &(t->e[i]); 00647 if (IS_NOT_NULL(e->name)) { 00648 xfree(e->name); 00649 e->name = NULL; 00650 e->name_len = 0; 00651 e->back_num = 0; 00652 e->back_alloc = 0; 00653 if (IS_NOT_NULL(e->back_refs)) xfree(e->back_refs); 00654 e->back_refs = (int* )NULL; 00655 } 00656 } 00657 if (IS_NOT_NULL(t->e)) { 00658 xfree(t->e); 00659 t->e = NULL; 00660 } 00661 t->num = 0; 00662 } 00663 return 0; 00664 } 00665 00666 extern int 00667 onig_names_free(regex_t* reg) 00668 { 00669 int r; 00670 NameTable* t; 00671 00672 r = names_clear(reg); 00673 if (r) return r; 00674 00675 t = (NameTable* )reg->name_table; 00676 if (IS_NOT_NULL(t)) xfree(t); 00677 reg->name_table = NULL; 00678 return 0; 00679 } 00680 00681 static NameEntry* 00682 name_find(regex_t* reg, const UChar* name, const UChar* name_end) 00683 { 00684 int i, len; 00685 NameEntry* e; 00686 NameTable* t = (NameTable* )reg->name_table; 00687 00688 if (IS_NOT_NULL(t)) { 00689 len = name_end - name; 00690 for (i = 0; i < t->num; i++) { 00691 e = &(t->e[i]); 00692 if (len == e->name_len && onig_strncmp(name, e->name, len) == 0) 00693 return e; 00694 } 00695 } 00696 return (NameEntry* )NULL; 00697 } 00698 00699 extern int 00700 onig_foreach_name(regex_t* reg, 00701 int (*func)(const UChar*, const 
UChar*,int,int*,regex_t*,void*), void* arg) 00702 { 00703 int i, r; 00704 NameEntry* e; 00705 NameTable* t = (NameTable* )reg->name_table; 00706 00707 if (IS_NOT_NULL(t)) { 00708 for (i = 0; i < t->num; i++) { 00709 e = &(t->e[i]); 00710 r = (*func)(e->name, e->name + e->name_len, e->back_num, 00711 (e->back_num > 1 ? e->back_refs : &(e->back_ref1)), 00712 reg, arg); 00713 if (r != 0) return r; 00714 } 00715 } 00716 return 0; 00717 } 00718 00719 extern int 00720 onig_number_of_names(regex_t* reg) 00721 { 00722 NameTable* t = (NameTable* )reg->name_table; 00723 00724 if (IS_NOT_NULL(t)) 00725 return t->num; 00726 else 00727 return 0; 00728 } 00729 00730 #endif /* else USE_ST_LIBRARY */ 00731 00732 static int 00733 name_add(regex_t* reg, UChar* name, UChar* name_end, int backref, ScanEnv* env) 00734 { 00735 int alloc; 00736 NameEntry* e; 00737 NameTable* t = (NameTable* )reg->name_table; 00738 00739 if (name_end - name <= 0) 00740 return ONIGERR_EMPTY_GROUP_NAME; 00741 00742 e = name_find(reg, name, name_end); 00743 if (IS_NULL(e)) { 00744 #ifdef USE_ST_LIBRARY 00745 if (IS_NULL(t)) { 00746 t = onig_st_init_strend_table_with_size(5); 00747 reg->name_table = (void* )t; 00748 } 00749 e = (NameEntry* )xmalloc(sizeof(NameEntry)); 00750 CHECK_NULL_RETURN_MEMERR(e); 00751 00752 e->name = strdup_with_null(reg->enc, name, name_end); 00753 if (IS_NULL(e->name)) { 00754 xfree(e); 00755 return ONIGERR_MEMORY; 00756 } 00757 onig_st_insert_strend(t, e->name, (e->name + (name_end - name)), 00758 (HashDataType )e); 00759 00760 e->name_len = name_end - name; 00761 e->back_num = 0; 00762 e->back_alloc = 0; 00763 e->back_refs = (int* )NULL; 00764 00765 #else 00766 00767 if (IS_NULL(t)) { 00768 alloc = INIT_NAMES_ALLOC_NUM; 00769 t = (NameTable* )xmalloc(sizeof(NameTable)); 00770 CHECK_NULL_RETURN_MEMERR(t); 00771 t->e = NULL; 00772 t->alloc = 0; 00773 t->num = 0; 00774 00775 t->e = (NameEntry* )xmalloc(sizeof(NameEntry) * alloc); 00776 if (IS_NULL(t->e)) { 00777 xfree(t); 00778 return 
ONIGERR_MEMORY; 00779 } 00780 t->alloc = alloc; 00781 reg->name_table = t; 00782 goto clear; 00783 } 00784 else if (t->num == t->alloc) { 00785 int i; 00786 NameEntry* p; 00787 00788 alloc = t->alloc * 2; 00789 p = (NameEntry* )xrealloc(t->e, sizeof(NameEntry) * alloc); 00790 CHECK_NULL_RETURN_MEMERR(p); 00791 t->e = p; 00792 t->alloc = alloc; 00793 00794 clear: 00795 for (i = t->num; i < t->alloc; i++) { 00796 t->e[i].name = NULL; 00797 t->e[i].name_len = 0; 00798 t->e[i].back_num = 0; 00799 t->e[i].back_alloc = 0; 00800 t->e[i].back_refs = (int* )NULL; 00801 } 00802 } 00803 e = &(t->e[t->num]); 00804 t->num++; 00805 e->name = strdup_with_null(reg->enc, name, name_end); 00806 if (IS_NULL(e->name)) return ONIGERR_MEMORY; 00807 e->name_len = name_end - name; 00808 #endif 00809 } 00810 00811 if (e->back_num >= 1 && 00812 ! IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME)) { 00813 onig_scan_env_set_error_string(env, ONIGERR_MULTIPLEX_DEFINED_NAME, 00814 name, name_end); 00815 return ONIGERR_MULTIPLEX_DEFINED_NAME; 00816 } 00817 00818 e->back_num++; 00819 if (e->back_num == 1) { 00820 e->back_ref1 = backref; 00821 } 00822 else { 00823 if (e->back_num == 2) { 00824 alloc = INIT_NAME_BACKREFS_ALLOC_NUM; 00825 e->back_refs = (int* )xmalloc(sizeof(int) * alloc); 00826 CHECK_NULL_RETURN_MEMERR(e->back_refs); 00827 e->back_alloc = alloc; 00828 e->back_refs[0] = e->back_ref1; 00829 e->back_refs[1] = backref; 00830 } 00831 else { 00832 if (e->back_num > e->back_alloc) { 00833 int* p; 00834 alloc = e->back_alloc * 2; 00835 p = (int* )xrealloc(e->back_refs, sizeof(int) * alloc); 00836 CHECK_NULL_RETURN_MEMERR(p); 00837 e->back_refs = p; 00838 e->back_alloc = alloc; 00839 } 00840 e->back_refs[e->back_num - 1] = backref; 00841 } 00842 } 00843 00844 return 0; 00845 } 00846 00847 extern int 00848 onig_name_to_group_numbers(regex_t* reg, const UChar* name, 00849 const UChar* name_end, int** nums) 00850 { 00851 NameEntry* e = name_find(reg, name, name_end); 00852 
00853 if (IS_NULL(e)) return ONIGERR_UNDEFINED_NAME_REFERENCE; 00854 00855 switch (e->back_num) { 00856 case 0: 00857 *nums = 0; 00858 break; 00859 case 1: 00860 *nums = &(e->back_ref1); 00861 break; 00862 default: 00863 *nums = e->back_refs; 00864 break; 00865 } 00866 return e->back_num; 00867 } 00868 00869 extern int 00870 onig_name_to_backref_number(regex_t* reg, const UChar* name, 00871 const UChar* name_end, OnigRegion *region) 00872 { 00873 int i, n, *nums; 00874 00875 n = onig_name_to_group_numbers(reg, name, name_end, &nums); 00876 if (n < 0) 00877 return n; 00878 else if (n == 0) 00879 return ONIGERR_PARSER_BUG; 00880 else if (n == 1) 00881 return nums[0]; 00882 else { 00883 if (IS_NOT_NULL(region)) { 00884 for (i = n - 1; i >= 0; i--) { 00885 if (region->beg[nums[i]] != ONIG_REGION_NOTPOS) 00886 return nums[i]; 00887 } 00888 } 00889 return nums[n - 1]; 00890 } 00891 } 00892 00893 #else /* USE_NAMED_GROUP */ 00894 00895 extern int 00896 onig_name_to_group_numbers(regex_t* reg, const UChar* name, 00897 const UChar* name_end, int** nums) 00898 { 00899 return ONIG_NO_SUPPORT_CONFIG; 00900 } 00901 00902 extern int 00903 onig_name_to_backref_number(regex_t* reg, const UChar* name, 00904 const UChar* name_end, OnigRegion* region) 00905 { 00906 return ONIG_NO_SUPPORT_CONFIG; 00907 } 00908 00909 extern int 00910 onig_foreach_name(regex_t* reg, 00911 int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*), void* arg) 00912 { 00913 return ONIG_NO_SUPPORT_CONFIG; 00914 } 00915 00916 extern int 00917 onig_number_of_names(regex_t* reg) 00918 { 00919 return 0; 00920 } 00921 #endif /* else USE_NAMED_GROUP */ 00922 00923 extern int 00924 onig_noname_group_capture_is_active(regex_t* reg) 00925 { 00926 if (ONIG_IS_OPTION_ON(reg->options, ONIG_OPTION_DONT_CAPTURE_GROUP)) 00927 return 0; 00928 00929 #ifdef USE_NAMED_GROUP 00930 if (onig_number_of_names(reg) > 0 && 00931 IS_SYNTAX_BV(reg->syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) && 00932 
!ONIG_IS_OPTION_ON(reg->options, ONIG_OPTION_CAPTURE_GROUP)) { 00933 return 0; 00934 } 00935 #endif 00936 00937 return 1; 00938 } 00939 00940 00941 #define INIT_SCANENV_MEMNODES_ALLOC_SIZE 16 00942 00943 static void 00944 scan_env_clear(ScanEnv* env) 00945 { 00946 int i; 00947 00948 BIT_STATUS_CLEAR(env->capture_history); 00949 BIT_STATUS_CLEAR(env->bt_mem_start); 00950 BIT_STATUS_CLEAR(env->bt_mem_end); 00951 BIT_STATUS_CLEAR(env->backrefed_mem); 00952 env->error = (UChar* )NULL; 00953 env->error_end = (UChar* )NULL; 00954 env->num_call = 0; 00955 env->num_mem = 0; 00956 #ifdef USE_NAMED_GROUP 00957 env->num_named = 0; 00958 #endif 00959 env->mem_alloc = 0; 00960 env->mem_nodes_dynamic = (Node** )NULL; 00961 00962 for (i = 0; i < SCANENV_MEMNODES_SIZE; i++) 00963 env->mem_nodes_static[i] = NULL_NODE; 00964 00965 #ifdef USE_COMBINATION_EXPLOSION_CHECK 00966 env->num_comb_exp_check = 0; 00967 env->comb_exp_max_regnum = 0; 00968 env->curr_max_regnum = 0; 00969 env->has_recursion = 0; 00970 #endif 00971 env->warnings_flag = 0; 00972 } 00973 00974 static int 00975 scan_env_add_mem_entry(ScanEnv* env) 00976 { 00977 int i, need, alloc; 00978 Node** p; 00979 00980 need = env->num_mem + 1; 00981 if (need >= SCANENV_MEMNODES_SIZE) { 00982 if (env->mem_alloc <= need) { 00983 if (IS_NULL(env->mem_nodes_dynamic)) { 00984 alloc = INIT_SCANENV_MEMNODES_ALLOC_SIZE; 00985 p = (Node** )xmalloc(sizeof(Node*) * alloc); 00986 xmemcpy(p, env->mem_nodes_static, 00987 sizeof(Node*) * SCANENV_MEMNODES_SIZE); 00988 } 00989 else { 00990 alloc = env->mem_alloc * 2; 00991 p = (Node** )xrealloc(env->mem_nodes_dynamic, sizeof(Node*) * alloc); 00992 } 00993 CHECK_NULL_RETURN_MEMERR(p); 00994 00995 for (i = env->num_mem + 1; i < alloc; i++) 00996 p[i] = NULL_NODE; 00997 00998 env->mem_nodes_dynamic = p; 00999 env->mem_alloc = alloc; 01000 } 01001 } 01002 01003 env->num_mem++; 01004 return env->num_mem; 01005 } 01006 01007 static int 01008 scan_env_set_mem_node(ScanEnv* env, int num, Node* node) 
01009 { 01010 if (env->num_mem >= num) 01011 SCANENV_MEM_NODES(env)[num] = node; 01012 else 01013 return ONIGERR_PARSER_BUG; 01014 return 0; 01015 } 01016 01017 01018 #ifdef USE_PARSE_TREE_NODE_RECYCLE 01019 typedef struct _FreeNode { 01020 struct _FreeNode* next; 01021 } FreeNode; 01022 01023 static FreeNode* FreeNodeList = (FreeNode* )NULL; 01024 #endif 01025 01026 extern void 01027 onig_node_free(Node* node) 01028 { 01029 start: 01030 if (IS_NULL(node)) return ; 01031 01032 switch (NTYPE(node)) { 01033 case NT_STR: 01034 if (NSTR(node)->capa != 0 && 01035 IS_NOT_NULL(NSTR(node)->s) && NSTR(node)->s != NSTR(node)->buf) { 01036 xfree(NSTR(node)->s); 01037 } 01038 break; 01039 01040 case NT_LIST: 01041 case NT_ALT: 01042 onig_node_free(NCAR(node)); 01043 { 01044 Node* next_node = NCDR(node); 01045 01046 #ifdef USE_PARSE_TREE_NODE_RECYCLE 01047 { 01048 FreeNode* n = (FreeNode* )node; 01049 01050 THREAD_ATOMIC_START; 01051 n->next = FreeNodeList; 01052 FreeNodeList = n; 01053 THREAD_ATOMIC_END; 01054 } 01055 #else 01056 xfree(node); 01057 #endif 01058 node = next_node; 01059 goto start; 01060 } 01061 break; 01062 01063 case NT_CCLASS: 01064 { 01065 CClassNode* cc = NCCLASS(node); 01066 01067 if (IS_NCCLASS_SHARE(cc)) return ; 01068 if (cc->mbuf) 01069 bbuf_free(cc->mbuf); 01070 } 01071 break; 01072 01073 case NT_QTFR: 01074 if (NQTFR(node)->target) 01075 onig_node_free(NQTFR(node)->target); 01076 break; 01077 01078 case NT_ENCLOSE: 01079 if (NENCLOSE(node)->target) 01080 onig_node_free(NENCLOSE(node)->target); 01081 break; 01082 01083 case NT_BREF: 01084 if (IS_NOT_NULL(NBREF(node)->back_dynamic)) 01085 xfree(NBREF(node)->back_dynamic); 01086 break; 01087 01088 case NT_ANCHOR: 01089 if (NANCHOR(node)->target) 01090 onig_node_free(NANCHOR(node)->target); 01091 break; 01092 } 01093 01094 #ifdef USE_PARSE_TREE_NODE_RECYCLE 01095 { 01096 FreeNode* n = (FreeNode* )node; 01097 01098 THREAD_ATOMIC_START; 01099 n->next = FreeNodeList; 01100 FreeNodeList = n; 01101 
THREAD_ATOMIC_END; 01102 } 01103 #else 01104 xfree(node); 01105 #endif 01106 } 01107 01108 #ifdef USE_PARSE_TREE_NODE_RECYCLE 01109 extern int 01110 onig_free_node_list(void) 01111 { 01112 FreeNode* n; 01113 01114 /* THREAD_ATOMIC_START; */ 01115 while (IS_NOT_NULL(FreeNodeList)) { 01116 n = FreeNodeList; 01117 FreeNodeList = FreeNodeList->next; 01118 xfree(n); 01119 } 01120 /* THREAD_ATOMIC_END; */ 01121 return 0; 01122 } 01123 #endif 01124 01125 static Node* 01126 node_new(void) 01127 { 01128 Node* node; 01129 01130 #ifdef USE_PARSE_TREE_NODE_RECYCLE 01131 THREAD_ATOMIC_START; 01132 if (IS_NOT_NULL(FreeNodeList)) { 01133 node = (Node* )FreeNodeList; 01134 FreeNodeList = FreeNodeList->next; 01135 THREAD_ATOMIC_END; 01136 return node; 01137 } 01138 THREAD_ATOMIC_END; 01139 #endif 01140 01141 node = (Node* )xmalloc(sizeof(Node)); 01142 /* xmemset(node, 0, sizeof(Node)); */ 01143 return node; 01144 } 01145 01146 01147 static void 01148 initialize_cclass(CClassNode* cc) 01149 { 01150 BITSET_CLEAR(cc->bs); 01151 /* cc->base.flags = 0; */ 01152 cc->flags = 0; 01153 cc->mbuf = NULL; 01154 } 01155 01156 static Node* 01157 node_new_cclass(void) 01158 { 01159 Node* node = node_new(); 01160 CHECK_NULL_RETURN(node); 01161 01162 SET_NTYPE(node, NT_CCLASS); 01163 initialize_cclass(NCCLASS(node)); 01164 return node; 01165 } 01166 01167 static Node* 01168 node_new_cclass_by_codepoint_range(int not, OnigCodePoint sb_out, 01169 const OnigCodePoint ranges[]) 01170 { 01171 int n, i; 01172 CClassNode* cc; 01173 OnigCodePoint j; 01174 01175 Node* node = node_new_cclass(); 01176 CHECK_NULL_RETURN(node); 01177 01178 cc = NCCLASS(node); 01179 if (not != 0) NCCLASS_SET_NOT(cc); 01180 01181 BITSET_CLEAR(cc->bs); 01182 if (sb_out > 0 && IS_NOT_NULL(ranges)) { 01183 n = ONIGENC_CODE_RANGE_NUM(ranges); 01184 for (i = 0; i < n; i++) { 01185 for (j = ONIGENC_CODE_RANGE_FROM(ranges, i); 01186 j <= (OnigCodePoint )ONIGENC_CODE_RANGE_TO(ranges, i); j++) { 01187 if (j >= sb_out) goto sb_end; 01188 
01189 BITSET_SET_BIT(cc->bs, j); 01190 } 01191 } 01192 } 01193 01194 sb_end: 01195 if (IS_NULL(ranges)) { 01196 is_null: 01197 cc->mbuf = NULL; 01198 } 01199 else { 01200 BBuf* bbuf; 01201 01202 n = ONIGENC_CODE_RANGE_NUM(ranges); 01203 if (n == 0) goto is_null; 01204 01205 bbuf = (BBuf* )xmalloc(sizeof(BBuf)); 01206 CHECK_NULL_RETURN(bbuf); 01207 bbuf->alloc = n + 1; 01208 bbuf->used = n + 1; 01209 bbuf->p = (UChar* )((void* )ranges); 01210 01211 cc->mbuf = bbuf; 01212 } 01213 01214 return node; 01215 } 01216 01217 static Node* 01218 node_new_ctype(int type, int not, int ascii_range) 01219 { 01220 Node* node = node_new(); 01221 CHECK_NULL_RETURN(node); 01222 01223 SET_NTYPE(node, NT_CTYPE); 01224 NCTYPE(node)->ctype = type; 01225 NCTYPE(node)->not = not; 01226 NCTYPE(node)->ascii_range = ascii_range; 01227 return node; 01228 } 01229 01230 static Node* 01231 node_new_anychar(void) 01232 { 01233 Node* node = node_new(); 01234 CHECK_NULL_RETURN(node); 01235 01236 SET_NTYPE(node, NT_CANY); 01237 return node; 01238 } 01239 01240 static Node* 01241 node_new_list(Node* left, Node* right) 01242 { 01243 Node* node = node_new(); 01244 CHECK_NULL_RETURN(node); 01245 01246 SET_NTYPE(node, NT_LIST); 01247 NCAR(node) = left; 01248 NCDR(node) = right; 01249 return node; 01250 } 01251 01252 extern Node* 01253 onig_node_new_list(Node* left, Node* right) 01254 { 01255 return node_new_list(left, right); 01256 } 01257 01258 extern Node* 01259 onig_node_list_add(Node* list, Node* x) 01260 { 01261 Node *n; 01262 01263 n = onig_node_new_list(x, NULL); 01264 if (IS_NULL(n)) return NULL_NODE; 01265 01266 if (IS_NOT_NULL(list)) { 01267 while (IS_NOT_NULL(NCDR(list))) 01268 list = NCDR(list); 01269 01270 NCDR(list) = n; 01271 } 01272 01273 return n; 01274 } 01275 01276 extern Node* 01277 onig_node_new_alt(Node* left, Node* right) 01278 { 01279 Node* node = node_new(); 01280 CHECK_NULL_RETURN(node); 01281 01282 SET_NTYPE(node, NT_ALT); 01283 NCAR(node) = left; 01284 NCDR(node) = right; 01285 
return node; 01286 } 01287 01288 extern Node* 01289 onig_node_new_anchor(int type) 01290 { 01291 Node* node = node_new(); 01292 CHECK_NULL_RETURN(node); 01293 01294 SET_NTYPE(node, NT_ANCHOR); 01295 NANCHOR(node)->type = type; 01296 NANCHOR(node)->target = NULL; 01297 NANCHOR(node)->char_len = -1; 01298 NANCHOR(node)->ascii_range = 0; 01299 return node; 01300 } 01301 01302 static Node* 01303 node_new_backref(int back_num, int* backrefs, int by_name, 01304 #ifdef USE_BACKREF_WITH_LEVEL 01305 int exist_level, int nest_level, 01306 #endif 01307 ScanEnv* env) 01308 { 01309 int i; 01310 Node* node = node_new(); 01311 01312 CHECK_NULL_RETURN(node); 01313 01314 SET_NTYPE(node, NT_BREF); 01315 NBREF(node)->state = 0; 01316 NBREF(node)->back_num = back_num; 01317 NBREF(node)->back_dynamic = (int* )NULL; 01318 if (by_name != 0) 01319 NBREF(node)->state |= NST_NAME_REF; 01320 01321 #ifdef USE_BACKREF_WITH_LEVEL 01322 if (exist_level != 0) { 01323 NBREF(node)->state |= NST_NEST_LEVEL; 01324 NBREF(node)->nest_level = nest_level; 01325 } 01326 #endif 01327 01328 for (i = 0; i < back_num; i++) { 01329 if (backrefs[i] <= env->num_mem && 01330 IS_NULL(SCANENV_MEM_NODES(env)[backrefs[i]])) { 01331 NBREF(node)->state |= NST_RECURSION; /* /...(\1).../ */ 01332 break; 01333 } 01334 } 01335 01336 if (back_num <= NODE_BACKREFS_SIZE) { 01337 for (i = 0; i < back_num; i++) 01338 NBREF(node)->back_static[i] = backrefs[i]; 01339 } 01340 else { 01341 int* p = (int* )xmalloc(sizeof(int) * back_num); 01342 if (IS_NULL(p)) { 01343 onig_node_free(node); 01344 return NULL; 01345 } 01346 NBREF(node)->back_dynamic = p; 01347 for (i = 0; i < back_num; i++) 01348 p[i] = backrefs[i]; 01349 } 01350 return node; 01351 } 01352 01353 #ifdef USE_SUBEXP_CALL 01354 static Node* 01355 node_new_call(UChar* name, UChar* name_end, int gnum) 01356 { 01357 Node* node = node_new(); 01358 CHECK_NULL_RETURN(node); 01359 01360 SET_NTYPE(node, NT_CALL); 01361 NCALL(node)->state = 0; 01362 NCALL(node)->target = 
NULL_NODE; 01363 NCALL(node)->name = name; 01364 NCALL(node)->name_end = name_end; 01365 NCALL(node)->group_num = gnum; /* call by number if gnum != 0 */ 01366 return node; 01367 } 01368 #endif 01369 01370 static Node* 01371 node_new_quantifier(int lower, int upper, int by_number) 01372 { 01373 Node* node = node_new(); 01374 CHECK_NULL_RETURN(node); 01375 01376 SET_NTYPE(node, NT_QTFR); 01377 NQTFR(node)->state = 0; 01378 NQTFR(node)->target = NULL; 01379 NQTFR(node)->lower = lower; 01380 NQTFR(node)->upper = upper; 01381 NQTFR(node)->greedy = 1; 01382 NQTFR(node)->target_empty_info = NQ_TARGET_ISNOT_EMPTY; 01383 NQTFR(node)->head_exact = NULL_NODE; 01384 NQTFR(node)->next_head_exact = NULL_NODE; 01385 NQTFR(node)->is_refered = 0; 01386 if (by_number != 0) 01387 NQTFR(node)->state |= NST_BY_NUMBER; 01388 01389 #ifdef USE_COMBINATION_EXPLOSION_CHECK 01390 NQTFR(node)->comb_exp_check_num = 0; 01391 #endif 01392 01393 return node; 01394 } 01395 01396 static Node* 01397 node_new_enclose(int type) 01398 { 01399 Node* node = node_new(); 01400 CHECK_NULL_RETURN(node); 01401 01402 SET_NTYPE(node, NT_ENCLOSE); 01403 NENCLOSE(node)->type = type; 01404 NENCLOSE(node)->state = 0; 01405 NENCLOSE(node)->regnum = 0; 01406 NENCLOSE(node)->option = 0; 01407 NENCLOSE(node)->target = NULL; 01408 NENCLOSE(node)->call_addr = -1; 01409 NENCLOSE(node)->opt_count = 0; 01410 return node; 01411 } 01412 01413 extern Node* 01414 onig_node_new_enclose(int type) 01415 { 01416 return node_new_enclose(type); 01417 } 01418 01419 static Node* 01420 node_new_enclose_memory(OnigOptionType option, int is_named) 01421 { 01422 Node* node = node_new_enclose(ENCLOSE_MEMORY); 01423 CHECK_NULL_RETURN(node); 01424 if (is_named != 0) 01425 SET_ENCLOSE_STATUS(node, NST_NAMED_GROUP); 01426 01427 #ifdef USE_SUBEXP_CALL 01428 NENCLOSE(node)->option = option; 01429 #endif 01430 return node; 01431 } 01432 01433 static Node* 01434 node_new_option(OnigOptionType option) 01435 { 01436 Node* node = 
node_new_enclose(ENCLOSE_OPTION); 01437 CHECK_NULL_RETURN(node); 01438 NENCLOSE(node)->option = option; 01439 return node; 01440 } 01441 01442 extern int 01443 onig_node_str_cat(Node* node, const UChar* s, const UChar* end) 01444 { 01445 ptrdiff_t addlen = end - s; 01446 01447 if (addlen > 0) { 01448 ptrdiff_t len = NSTR(node)->end - NSTR(node)->s; 01449 01450 if (NSTR(node)->capa > 0 || (len + addlen > NODE_STR_BUF_SIZE - 1)) { 01451 UChar* p; 01452 ptrdiff_t capa = len + addlen + NODE_STR_MARGIN; 01453 01454 if (capa <= NSTR(node)->capa) { 01455 onig_strcpy(NSTR(node)->s + len, s, end); 01456 } 01457 else { 01458 if (NSTR(node)->s == NSTR(node)->buf) 01459 p = strcat_capa_from_static(NSTR(node)->s, NSTR(node)->end, 01460 s, end, capa); 01461 else 01462 p = strcat_capa(NSTR(node)->s, NSTR(node)->end, s, end, capa); 01463 01464 CHECK_NULL_RETURN_MEMERR(p); 01465 NSTR(node)->s = p; 01466 NSTR(node)->capa = (int )capa; 01467 } 01468 } 01469 else { 01470 onig_strcpy(NSTR(node)->s + len, s, end); 01471 } 01472 NSTR(node)->end = NSTR(node)->s + len + addlen; 01473 } 01474 01475 return 0; 01476 } 01477 01478 extern int 01479 onig_node_str_set(Node* node, const UChar* s, const UChar* end) 01480 { 01481 onig_node_str_clear(node); 01482 return onig_node_str_cat(node, s, end); 01483 } 01484 01485 static int 01486 node_str_cat_char(Node* node, UChar c) 01487 { 01488 UChar s[1]; 01489 01490 s[0] = c; 01491 return onig_node_str_cat(node, s, s + 1); 01492 } 01493 01494 static int 01495 node_str_cat_codepoint(Node* node, OnigEncoding enc, OnigCodePoint c) 01496 { 01497 UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN]; 01498 int num = ONIGENC_CODE_TO_MBC(enc, c, buf); 01499 if (num < 0) return num; 01500 return onig_node_str_cat(node, buf, buf + num); 01501 } 01502 01503 extern void 01504 onig_node_conv_to_str_node(Node* node, int flag) 01505 { 01506 SET_NTYPE(node, NT_STR); 01507 NSTR(node)->flag = flag; 01508 NSTR(node)->capa = 0; 01509 NSTR(node)->s = NSTR(node)->buf; 01510 
NSTR(node)->end = NSTR(node)->buf; 01511 } 01512 01513 extern void 01514 onig_node_str_clear(Node* node) 01515 { 01516 if (NSTR(node)->capa != 0 && 01517 IS_NOT_NULL(NSTR(node)->s) && NSTR(node)->s != NSTR(node)->buf) { 01518 xfree(NSTR(node)->s); 01519 } 01520 01521 NSTR(node)->capa = 0; 01522 NSTR(node)->flag = 0; 01523 NSTR(node)->s = NSTR(node)->buf; 01524 NSTR(node)->end = NSTR(node)->buf; 01525 } 01526 01527 static Node* 01528 node_new_str(const UChar* s, const UChar* end) 01529 { 01530 Node* node = node_new(); 01531 CHECK_NULL_RETURN(node); 01532 01533 SET_NTYPE(node, NT_STR); 01534 NSTR(node)->capa = 0; 01535 NSTR(node)->flag = 0; 01536 NSTR(node)->s = NSTR(node)->buf; 01537 NSTR(node)->end = NSTR(node)->buf; 01538 if (onig_node_str_cat(node, s, end)) { 01539 onig_node_free(node); 01540 return NULL; 01541 } 01542 return node; 01543 } 01544 01545 extern Node* 01546 onig_node_new_str(const UChar* s, const UChar* end) 01547 { 01548 return node_new_str(s, end); 01549 } 01550 01551 static Node* 01552 node_new_str_raw(UChar* s, UChar* end) 01553 { 01554 Node* node = node_new_str(s, end); 01555 if (IS_NOT_NULL(node)) 01556 NSTRING_SET_RAW(node); 01557 return node; 01558 } 01559 01560 static Node* 01561 node_new_empty(void) 01562 { 01563 return node_new_str(NULL, NULL); 01564 } 01565 01566 static Node* 01567 node_new_str_raw_char(UChar c) 01568 { 01569 UChar p[1]; 01570 01571 p[0] = c; 01572 return node_new_str_raw(p, p + 1); 01573 } 01574 01575 static Node* 01576 str_node_split_last_char(StrNode* sn, OnigEncoding enc) 01577 { 01578 const UChar *p; 01579 Node* n = NULL_NODE; 01580 01581 if (sn->end > sn->s) { 01582 p = onigenc_get_prev_char_head(enc, sn->s, sn->end, sn->end); 01583 if (p && p > sn->s) { /* can be split. 
*/ 01584 n = node_new_str(p, sn->end); 01585 if (IS_NOT_NULL(n) && (sn->flag & NSTR_RAW) != 0) 01586 NSTRING_SET_RAW(n); 01587 sn->end = (UChar* )p; 01588 } 01589 } 01590 return n; 01591 } 01592 01593 static int 01594 str_node_can_be_split(StrNode* sn, OnigEncoding enc) 01595 { 01596 if (sn->end > sn->s) { 01597 return ((enclen(enc, sn->s, sn->end) < sn->end - sn->s) ? 1 : 0); 01598 } 01599 return 0; 01600 } 01601 01602 #ifdef USE_PAD_TO_SHORT_BYTE_CHAR 01603 static int 01604 node_str_head_pad(StrNode* sn, int num, UChar val) 01605 { 01606 UChar buf[NODE_STR_BUF_SIZE]; 01607 int i, len; 01608 01609 len = sn->end - sn->s; 01610 onig_strcpy(buf, sn->s, sn->end); 01611 onig_strcpy(&(sn->s[num]), buf, buf + len); 01612 sn->end += num; 01613 01614 for (i = 0; i < num; i++) { 01615 sn->s[i] = val; 01616 } 01617 } 01618 #endif 01619 01620 extern int 01621 onig_scan_unsigned_number(UChar** src, const UChar* end, OnigEncoding enc) 01622 { 01623 unsigned int num, val; 01624 OnigCodePoint c; 01625 UChar* p = *src; 01626 PFETCH_READY; 01627 01628 num = 0; 01629 while (!PEND) { 01630 PFETCH(c); 01631 if (ONIGENC_IS_CODE_DIGIT(enc, c)) { 01632 val = (unsigned int )DIGITVAL(c); 01633 if ((INT_MAX_LIMIT - val) / 10UL < num) 01634 return -1; /* overflow */ 01635 01636 num = num * 10 + val; 01637 } 01638 else { 01639 PUNFETCH; 01640 break; 01641 } 01642 } 01643 *src = p; 01644 return num; 01645 } 01646 01647 static int 01648 scan_unsigned_hexadecimal_number(UChar** src, UChar* end, int minlen, 01649 int maxlen, OnigEncoding enc) 01650 { 01651 OnigCodePoint c; 01652 unsigned int num, val; 01653 int restlen; 01654 UChar* p = *src; 01655 PFETCH_READY; 01656 01657 restlen = maxlen - minlen; 01658 num = 0; 01659 while (!PEND && maxlen-- != 0) { 01660 PFETCH(c); 01661 if (ONIGENC_IS_CODE_XDIGIT(enc, c)) { 01662 val = (unsigned int )XDIGITVAL(enc,c); 01663 if ((INT_MAX_LIMIT - val) / 16UL < num) 01664 return -1; /* overflow */ 01665 01666 num = (num << 4) + XDIGITVAL(enc,c); 01667 } 01668 
else { 01669 PUNFETCH; 01670 break; 01671 } 01672 } 01673 if (maxlen > restlen) 01674 return -2; /* not enough digits */ 01675 *src = p; 01676 return num; 01677 } 01678 01679 static int 01680 scan_unsigned_octal_number(UChar** src, UChar* end, int maxlen, 01681 OnigEncoding enc) 01682 { 01683 OnigCodePoint c; 01684 unsigned int num, val; 01685 UChar* p = *src; 01686 PFETCH_READY; 01687 01688 num = 0; 01689 while (!PEND && maxlen-- != 0) { 01690 PFETCH(c); 01691 if (ONIGENC_IS_CODE_DIGIT(enc, c) && c < '8') { 01692 val = ODIGITVAL(c); 01693 if ((INT_MAX_LIMIT - val) / 8UL < num) 01694 return -1; /* overflow */ 01695 01696 num = (num << 3) + val; 01697 } 01698 else { 01699 PUNFETCH; 01700 break; 01701 } 01702 } 01703 *src = p; 01704 return num; 01705 } 01706 01707 01708 #define BBUF_WRITE_CODE_POINT(bbuf,pos,code) \ 01709 BBUF_WRITE(bbuf, pos, &(code), SIZE_CODE_POINT) 01710 01711 /* data format: 01712 [n][from-1][to-1][from-2][to-2] ... [from-n][to-n] 01713 (all data size is OnigCodePoint) 01714 */ 01715 static int 01716 new_code_range(BBuf** pbuf) 01717 { 01718 #define INIT_MULTI_BYTE_RANGE_SIZE (SIZE_CODE_POINT * 5) 01719 int r; 01720 OnigCodePoint n; 01721 BBuf* bbuf; 01722 01723 bbuf = *pbuf = (BBuf* )xmalloc(sizeof(BBuf)); 01724 CHECK_NULL_RETURN_MEMERR(*pbuf); 01725 r = BBUF_INIT(*pbuf, INIT_MULTI_BYTE_RANGE_SIZE); 01726 if (r) return r; 01727 01728 n = 0; 01729 BBUF_WRITE_CODE_POINT(bbuf, 0, n); 01730 return 0; 01731 } 01732 01733 static int 01734 add_code_range_to_buf0(BBuf** pbuf, ScanEnv* env, OnigCodePoint from, OnigCodePoint to, 01735 int checkdup) 01736 { 01737 int r, inc_n, pos; 01738 OnigCodePoint low, high, bound, x; 01739 OnigCodePoint n, *data; 01740 BBuf* bbuf; 01741 01742 if (from > to) { 01743 n = from; from = to; to = n; 01744 } 01745 01746 if (IS_NULL(*pbuf)) { 01747 r = new_code_range(pbuf); 01748 if (r) return r; 01749 bbuf = *pbuf; 01750 n = 0; 01751 } 01752 else { 01753 bbuf = *pbuf; 01754 GET_CODE_POINT(n, bbuf->p); 01755 } 01756 data = 
(OnigCodePoint* )(bbuf->p); 01757 data++; 01758 01759 bound = (from == 0) ? 0 : n; 01760 for (low = 0; low < bound; ) { 01761 x = (low + bound) >> 1; 01762 if (from - 1 > data[x*2 + 1]) 01763 low = x + 1; 01764 else 01765 bound = x; 01766 } 01767 01768 high = (to == ONIG_LAST_CODE_POINT) ? n : low; 01769 for (bound = n; high < bound; ) { 01770 x = (high + bound) >> 1; 01771 if (to + 1 >= data[x*2]) 01772 high = x + 1; 01773 else 01774 bound = x; 01775 } 01776 /* data[(low-1)*2+1] << from <= data[low*2] 01777 * data[(high-1)*2+1] <= to << data[high*2] 01778 */ 01779 01780 inc_n = low + 1 - high; 01781 if (n + inc_n > ONIG_MAX_MULTI_BYTE_RANGES_NUM) 01782 return ONIGERR_TOO_MANY_MULTI_BYTE_RANGES; 01783 01784 if (inc_n != 1) { 01785 if (checkdup && from <= data[low*2+1] 01786 && (data[low*2] <= from || data[low*2+1] <= to)) 01787 CC_DUP_WARN(env); 01788 if (from > data[low*2]) 01789 from = data[low*2]; 01790 if (to < data[(high - 1)*2 + 1]) 01791 to = data[(high - 1)*2 + 1]; 01792 } 01793 01794 if (inc_n != 0) { 01795 int from_pos = SIZE_CODE_POINT * (1 + high * 2); 01796 int to_pos = SIZE_CODE_POINT * (1 + (low + 1) * 2); 01797 01798 if (inc_n > 0) { 01799 if (high < n) { 01800 int size = (n - high) * 2 * SIZE_CODE_POINT; 01801 BBUF_MOVE_RIGHT(bbuf, from_pos, to_pos, size); 01802 } 01803 } 01804 else { 01805 BBUF_MOVE_LEFT_REDUCE(bbuf, from_pos, to_pos); 01806 } 01807 } 01808 01809 pos = SIZE_CODE_POINT * (1 + low * 2); 01810 BBUF_ENSURE_SIZE(bbuf, pos + SIZE_CODE_POINT * 2); 01811 BBUF_WRITE_CODE_POINT(bbuf, pos, from); 01812 BBUF_WRITE_CODE_POINT(bbuf, pos + SIZE_CODE_POINT, to); 01813 n += inc_n; 01814 BBUF_WRITE_CODE_POINT(bbuf, 0, n); 01815 01816 return 0; 01817 } 01818 01819 static int 01820 add_code_range_to_buf(BBuf** pbuf, ScanEnv* env, OnigCodePoint from, OnigCodePoint to) 01821 { 01822 return add_code_range_to_buf0(pbuf, env, from, to, 1); 01823 } 01824 01825 static int 01826 add_code_range0(BBuf** pbuf, ScanEnv* env, OnigCodePoint from, OnigCodePoint to, 
int checkdup) 01827 { 01828 if (from > to) { 01829 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC)) 01830 return 0; 01831 else 01832 return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS; 01833 } 01834 01835 return add_code_range_to_buf0(pbuf, env, from, to, checkdup); 01836 } 01837 01838 static int 01839 add_code_range(BBuf** pbuf, ScanEnv* env, OnigCodePoint from, OnigCodePoint to) 01840 { 01841 return add_code_range0(pbuf, env, from, to, 1); 01842 } 01843 01844 static int 01845 not_code_range_buf(OnigEncoding enc, BBuf* bbuf, BBuf** pbuf, ScanEnv* env) 01846 { 01847 int r, i, n; 01848 OnigCodePoint pre, from, *data, to = 0; 01849 01850 *pbuf = (BBuf* )NULL; 01851 if (IS_NULL(bbuf)) { 01852 set_all: 01853 return SET_ALL_MULTI_BYTE_RANGE(enc, pbuf); 01854 } 01855 01856 data = (OnigCodePoint* )(bbuf->p); 01857 GET_CODE_POINT(n, data); 01858 data++; 01859 if (n <= 0) goto set_all; 01860 01861 r = 0; 01862 pre = MBCODE_START_POS(enc); 01863 for (i = 0; i < n; i++) { 01864 from = data[i*2]; 01865 to = data[i*2+1]; 01866 if (pre <= from - 1) { 01867 r = add_code_range_to_buf(pbuf, env, pre, from - 1); 01868 if (r != 0) return r; 01869 } 01870 if (to == ONIG_LAST_CODE_POINT) break; 01871 pre = to + 1; 01872 } 01873 if (to < ONIG_LAST_CODE_POINT) { 01874 r = add_code_range_to_buf(pbuf, env, to + 1, ONIG_LAST_CODE_POINT); 01875 } 01876 return r; 01877 } 01878 01879 #define SWAP_BBUF_NOT(bbuf1, not1, bbuf2, not2) do {\ 01880 BBuf *tbuf; \ 01881 int tnot; \ 01882 tnot = not1; not1 = not2; not2 = tnot; \ 01883 tbuf = bbuf1; bbuf1 = bbuf2; bbuf2 = tbuf; \ 01884 } while (0) 01885 01886 static int 01887 or_code_range_buf(OnigEncoding enc, BBuf* bbuf1, int not1, 01888 BBuf* bbuf2, int not2, BBuf** pbuf, ScanEnv* env) 01889 { 01890 int r; 01891 OnigCodePoint i, n1, *data1; 01892 OnigCodePoint from, to; 01893 01894 *pbuf = (BBuf* )NULL; 01895 if (IS_NULL(bbuf1) && IS_NULL(bbuf2)) { 01896 if (not1 != 0 || not2 != 0) 01897 return SET_ALL_MULTI_BYTE_RANGE(enc, pbuf); 01898 return 
0; 01899 } 01900 01901 r = 0; 01902 if (IS_NULL(bbuf2)) 01903 SWAP_BBUF_NOT(bbuf1, not1, bbuf2, not2); 01904 01905 if (IS_NULL(bbuf1)) { 01906 if (not1 != 0) { 01907 return SET_ALL_MULTI_BYTE_RANGE(enc, pbuf); 01908 } 01909 else { 01910 if (not2 == 0) { 01911 return bbuf_clone(pbuf, bbuf2); 01912 } 01913 else { 01914 return not_code_range_buf(enc, bbuf2, pbuf, env); 01915 } 01916 } 01917 } 01918 01919 if (not1 != 0) 01920 SWAP_BBUF_NOT(bbuf1, not1, bbuf2, not2); 01921 01922 data1 = (OnigCodePoint* )(bbuf1->p); 01923 GET_CODE_POINT(n1, data1); 01924 data1++; 01925 01926 if (not2 == 0 && not1 == 0) { /* 1 OR 2 */ 01927 r = bbuf_clone(pbuf, bbuf2); 01928 } 01929 else if (not1 == 0) { /* 1 OR (not 2) */ 01930 r = not_code_range_buf(enc, bbuf2, pbuf, env); 01931 } 01932 if (r != 0) return r; 01933 01934 for (i = 0; i < n1; i++) { 01935 from = data1[i*2]; 01936 to = data1[i*2+1]; 01937 r = add_code_range_to_buf(pbuf, env, from, to); 01938 if (r != 0) return r; 01939 } 01940 return 0; 01941 } 01942 01943 static int 01944 and_code_range1(BBuf** pbuf, ScanEnv* env, OnigCodePoint from1, OnigCodePoint to1, 01945 OnigCodePoint* data, int n) 01946 { 01947 int i, r; 01948 OnigCodePoint from2, to2; 01949 01950 for (i = 0; i < n; i++) { 01951 from2 = data[i*2]; 01952 to2 = data[i*2+1]; 01953 if (from2 < from1) { 01954 if (to2 < from1) continue; 01955 else { 01956 from1 = to2 + 1; 01957 } 01958 } 01959 else if (from2 <= to1) { 01960 if (to2 < to1) { 01961 if (from1 <= from2 - 1) { 01962 r = add_code_range_to_buf(pbuf, env, from1, from2-1); 01963 if (r != 0) return r; 01964 } 01965 from1 = to2 + 1; 01966 } 01967 else { 01968 to1 = from2 - 1; 01969 } 01970 } 01971 else { 01972 from1 = from2; 01973 } 01974 if (from1 > to1) break; 01975 } 01976 if (from1 <= to1) { 01977 r = add_code_range_to_buf(pbuf, env, from1, to1); 01978 if (r != 0) return r; 01979 } 01980 return 0; 01981 } 01982 01983 static int 01984 and_code_range_buf(BBuf* bbuf1, int not1, BBuf* bbuf2, int not2, BBuf** pbuf, 
ScanEnv* env) 01985 { 01986 int r; 01987 OnigCodePoint i, j, n1, n2, *data1, *data2; 01988 OnigCodePoint from, to, from1, to1, from2, to2; 01989 01990 *pbuf = (BBuf* )NULL; 01991 if (IS_NULL(bbuf1)) { 01992 if (not1 != 0 && IS_NOT_NULL(bbuf2)) /* not1 != 0 -> not2 == 0 */ 01993 return bbuf_clone(pbuf, bbuf2); 01994 return 0; 01995 } 01996 else if (IS_NULL(bbuf2)) { 01997 if (not2 != 0) 01998 return bbuf_clone(pbuf, bbuf1); 01999 return 0; 02000 } 02001 02002 if (not1 != 0) 02003 SWAP_BBUF_NOT(bbuf1, not1, bbuf2, not2); 02004 02005 data1 = (OnigCodePoint* )(bbuf1->p); 02006 data2 = (OnigCodePoint* )(bbuf2->p); 02007 GET_CODE_POINT(n1, data1); 02008 GET_CODE_POINT(n2, data2); 02009 data1++; 02010 data2++; 02011 02012 if (not2 == 0 && not1 == 0) { /* 1 AND 2 */ 02013 for (i = 0; i < n1; i++) { 02014 from1 = data1[i*2]; 02015 to1 = data1[i*2+1]; 02016 for (j = 0; j < n2; j++) { 02017 from2 = data2[j*2]; 02018 to2 = data2[j*2+1]; 02019 if (from2 > to1) break; 02020 if (to2 < from1) continue; 02021 from = MAX(from1, from2); 02022 to = MIN(to1, to2); 02023 r = add_code_range_to_buf(pbuf, env, from, to); 02024 if (r != 0) return r; 02025 } 02026 } 02027 } 02028 else if (not1 == 0) { /* 1 AND (not 2) */ 02029 for (i = 0; i < n1; i++) { 02030 from1 = data1[i*2]; 02031 to1 = data1[i*2+1]; 02032 r = and_code_range1(pbuf, env, from1, to1, data2, n2); 02033 if (r != 0) return r; 02034 } 02035 } 02036 02037 return 0; 02038 } 02039 02040 static int 02041 and_cclass(CClassNode* dest, CClassNode* cc, ScanEnv* env) 02042 { 02043 OnigEncoding enc = env->enc; 02044 int r, not1, not2; 02045 BBuf *buf1, *buf2, *pbuf = 0; 02046 BitSetRef bsr1, bsr2; 02047 BitSet bs1, bs2; 02048 02049 not1 = IS_NCCLASS_NOT(dest); 02050 bsr1 = dest->bs; 02051 buf1 = dest->mbuf; 02052 not2 = IS_NCCLASS_NOT(cc); 02053 bsr2 = cc->bs; 02054 buf2 = cc->mbuf; 02055 02056 if (not1 != 0) { 02057 bitset_invert_to(bsr1, bs1); 02058 bsr1 = bs1; 02059 } 02060 if (not2 != 0) { 02061 bitset_invert_to(bsr2, bs2); 02062 
bsr2 = bs2; 02063 } 02064 bitset_and(bsr1, bsr2); 02065 if (bsr1 != dest->bs) { 02066 bitset_copy(dest->bs, bsr1); 02067 bsr1 = dest->bs; 02068 } 02069 if (not1 != 0) { 02070 bitset_invert(dest->bs); 02071 } 02072 02073 if (! ONIGENC_IS_SINGLEBYTE(enc)) { 02074 if (not1 != 0 && not2 != 0) { 02075 r = or_code_range_buf(enc, buf1, 0, buf2, 0, &pbuf, env); 02076 } 02077 else { 02078 r = and_code_range_buf(buf1, not1, buf2, not2, &pbuf, env); 02079 if (r == 0 && not1 != 0) { 02080 BBuf *tbuf = 0; 02081 r = not_code_range_buf(enc, pbuf, &tbuf, env); 02082 bbuf_free(pbuf); 02083 pbuf = tbuf; 02084 } 02085 } 02086 if (r != 0) { 02087 bbuf_free(pbuf); 02088 return r; 02089 } 02090 02091 dest->mbuf = pbuf; 02092 bbuf_free(buf1); 02093 return r; 02094 } 02095 return 0; 02096 } 02097 02098 static int 02099 or_cclass(CClassNode* dest, CClassNode* cc, ScanEnv* env) 02100 { 02101 OnigEncoding enc = env->enc; 02102 int r, not1, not2; 02103 BBuf *buf1, *buf2, *pbuf = 0; 02104 BitSetRef bsr1, bsr2; 02105 BitSet bs1, bs2; 02106 02107 not1 = IS_NCCLASS_NOT(dest); 02108 bsr1 = dest->bs; 02109 buf1 = dest->mbuf; 02110 not2 = IS_NCCLASS_NOT(cc); 02111 bsr2 = cc->bs; 02112 buf2 = cc->mbuf; 02113 02114 if (not1 != 0) { 02115 bitset_invert_to(bsr1, bs1); 02116 bsr1 = bs1; 02117 } 02118 if (not2 != 0) { 02119 bitset_invert_to(bsr2, bs2); 02120 bsr2 = bs2; 02121 } 02122 bitset_or(bsr1, bsr2); 02123 if (bsr1 != dest->bs) { 02124 bitset_copy(dest->bs, bsr1); 02125 bsr1 = dest->bs; 02126 } 02127 if (not1 != 0) { 02128 bitset_invert(dest->bs); 02129 } 02130 02131 if (! 
ONIGENC_IS_SINGLEBYTE(enc)) { 02132 if (not1 != 0 && not2 != 0) { 02133 r = and_code_range_buf(buf1, 0, buf2, 0, &pbuf, env); 02134 } 02135 else { 02136 r = or_code_range_buf(enc, buf1, not1, buf2, not2, &pbuf, env); 02137 if (r == 0 && not1 != 0) { 02138 BBuf *tbuf = 0; 02139 r = not_code_range_buf(enc, pbuf, &tbuf, env); 02140 bbuf_free(pbuf); 02141 pbuf = tbuf; 02142 } 02143 } 02144 if (r != 0) { 02145 bbuf_free(pbuf); 02146 return r; 02147 } 02148 02149 dest->mbuf = pbuf; 02150 bbuf_free(buf1); 02151 return r; 02152 } 02153 else 02154 return 0; 02155 } 02156 02157 static void UNKNOWN_ESC_WARN(ScanEnv *env, int c); 02158 02159 static int 02160 conv_backslash_value(int c, ScanEnv* env) 02161 { 02162 if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_CONTROL_CHARS)) { 02163 switch (c) { 02164 case 'n': return '\n'; 02165 case 't': return '\t'; 02166 case 'r': return '\r'; 02167 case 'f': return '\f'; 02168 case 'a': return '\007'; 02169 case 'b': return '\010'; 02170 case 'e': return '\033'; 02171 case 'v': 02172 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ESC_V_VTAB)) 02173 return '\v'; 02174 break; 02175 02176 default: 02177 if (('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z')) 02178 UNKNOWN_ESC_WARN(env, c); 02179 break; 02180 } 02181 } 02182 return c; 02183 } 02184 02185 #ifdef USE_NO_INVALID_QUANTIFIER 02186 #define is_invalid_quantifier_target(node) 0 02187 #else 02188 static int 02189 is_invalid_quantifier_target(Node* node) 02190 { 02191 switch (NTYPE(node)) { 02192 case NT_ANCHOR: 02193 return 1; 02194 break; 02195 02196 case NT_ENCLOSE: 02197 /* allow enclosed elements */ 02198 /* return is_invalid_quantifier_target(NENCLOSE(node)->target); */ 02199 break; 02200 02201 case NT_LIST: 02202 do { 02203 if (! 
is_invalid_quantifier_target(NCAR(node))) return 0; 02204 } while (IS_NOT_NULL(node = NCDR(node))); 02205 return 0; 02206 break; 02207 02208 case NT_ALT: 02209 do { 02210 if (is_invalid_quantifier_target(NCAR(node))) return 1; 02211 } while (IS_NOT_NULL(node = NCDR(node))); 02212 break; 02213 02214 default: 02215 break; 02216 } 02217 return 0; 02218 } 02219 #endif 02220 02221 /* ?:0, *:1, +:2, ??:3, *?:4, +?:5 */ 02222 static int 02223 popular_quantifier_num(QtfrNode* q) 02224 { 02225 if (q->greedy) { 02226 if (q->lower == 0) { 02227 if (q->upper == 1) return 0; 02228 else if (IS_REPEAT_INFINITE(q->upper)) return 1; 02229 } 02230 else if (q->lower == 1) { 02231 if (IS_REPEAT_INFINITE(q->upper)) return 2; 02232 } 02233 } 02234 else { 02235 if (q->lower == 0) { 02236 if (q->upper == 1) return 3; 02237 else if (IS_REPEAT_INFINITE(q->upper)) return 4; 02238 } 02239 else if (q->lower == 1) { 02240 if (IS_REPEAT_INFINITE(q->upper)) return 5; 02241 } 02242 } 02243 return -1; 02244 } 02245 02246 02247 enum ReduceType { 02248 RQ_ASIS = 0, /* as is */ 02249 RQ_DEL = 1, /* delete parent */ 02250 RQ_A, /* to '*' */ 02251 RQ_AQ, /* to '*?' */ 02252 RQ_QQ, /* to '??' */ 02253 RQ_P_QQ, /* to '+)??' */ 02254 RQ_PQ_Q /* to '+?)?' */ 02255 }; 02256 02257 static enum ReduceType const ReduceTypeTable[6][6] = { 02258 {RQ_DEL, RQ_A, RQ_A, RQ_QQ, RQ_AQ, RQ_ASIS}, /* '?' */ 02259 {RQ_DEL, RQ_DEL, RQ_DEL, RQ_P_QQ, RQ_P_QQ, RQ_DEL}, /* '*' */ 02260 {RQ_A, RQ_A, RQ_DEL, RQ_ASIS, RQ_P_QQ, RQ_DEL}, /* '+' */ 02261 {RQ_DEL, RQ_AQ, RQ_AQ, RQ_DEL, RQ_AQ, RQ_AQ}, /* '??' */ 02262 {RQ_DEL, RQ_DEL, RQ_DEL, RQ_DEL, RQ_DEL, RQ_DEL}, /* '*?' */ 02263 {RQ_ASIS, RQ_PQ_Q, RQ_DEL, RQ_AQ, RQ_AQ, RQ_DEL} /* '+?' 
*/ 02264 }; 02265 02266 extern void 02267 onig_reduce_nested_quantifier(Node* pnode, Node* cnode) 02268 { 02269 int pnum, cnum; 02270 QtfrNode *p, *c; 02271 02272 p = NQTFR(pnode); 02273 c = NQTFR(cnode); 02274 pnum = popular_quantifier_num(p); 02275 cnum = popular_quantifier_num(c); 02276 if (pnum < 0 || cnum < 0) return ; 02277 02278 switch (ReduceTypeTable[cnum][pnum]) { 02279 case RQ_DEL: 02280 *pnode = *cnode; 02281 break; 02282 case RQ_A: 02283 p->target = c->target; 02284 p->lower = 0; p->upper = REPEAT_INFINITE; p->greedy = 1; 02285 break; 02286 case RQ_AQ: 02287 p->target = c->target; 02288 p->lower = 0; p->upper = REPEAT_INFINITE; p->greedy = 0; 02289 break; 02290 case RQ_QQ: 02291 p->target = c->target; 02292 p->lower = 0; p->upper = 1; p->greedy = 0; 02293 break; 02294 case RQ_P_QQ: 02295 p->target = cnode; 02296 p->lower = 0; p->upper = 1; p->greedy = 0; 02297 c->lower = 1; c->upper = REPEAT_INFINITE; c->greedy = 1; 02298 return ; 02299 break; 02300 case RQ_PQ_Q: 02301 p->target = cnode; 02302 p->lower = 0; p->upper = 1; p->greedy = 1; 02303 c->lower = 1; c->upper = REPEAT_INFINITE; c->greedy = 0; 02304 return ; 02305 break; 02306 case RQ_ASIS: 02307 p->target = cnode; 02308 return ; 02309 break; 02310 } 02311 02312 c->target = NULL_NODE; 02313 onig_node_free(cnode); 02314 } 02315 02316 02317 enum TokenSyms { 02318 TK_EOT = 0, /* end of token */ 02319 TK_RAW_BYTE = 1, 02320 TK_CHAR, 02321 TK_STRING, 02322 TK_CODE_POINT, 02323 TK_ANYCHAR, 02324 TK_CHAR_TYPE, 02325 TK_BACKREF, 02326 TK_CALL, 02327 TK_ANCHOR, 02328 TK_OP_REPEAT, 02329 TK_INTERVAL, 02330 TK_ANYCHAR_ANYTIME, /* SQL '%' == .* */ 02331 TK_ALT, 02332 TK_SUBEXP_OPEN, 02333 TK_SUBEXP_CLOSE, 02334 TK_CC_OPEN, 02335 TK_QUOTE_OPEN, 02336 TK_CHAR_PROPERTY, /* \p{...}, \P{...} */ 02337 TK_LINEBREAK, 02338 TK_EXTENDED_GRAPHEME_CLUSTER, 02339 TK_KEEP, 02340 /* in cc */ 02341 TK_CC_CLOSE, 02342 TK_CC_RANGE, 02343 TK_POSIX_BRACKET_OPEN, 02344 TK_CC_AND, /* && */ 02345 TK_CC_CC_OPEN /* [ */ 02346 }; 02347 
02348 typedef struct { 02349 enum TokenSyms type; 02350 int escaped; 02351 int base; /* is number: 8, 16 (used in [....]) */ 02352 UChar* backp; 02353 union { 02354 UChar* s; 02355 int c; 02356 OnigCodePoint code; 02357 struct { 02358 int subtype; 02359 int ascii_range; 02360 } anchor; 02361 struct { 02362 int lower; 02363 int upper; 02364 int greedy; 02365 int possessive; 02366 } repeat; 02367 struct { 02368 int num; 02369 int ref1; 02370 int* refs; 02371 int by_name; 02372 #ifdef USE_BACKREF_WITH_LEVEL 02373 int exist_level; 02374 int level; /* \k<name+n> */ 02375 #endif 02376 } backref; 02377 struct { 02378 UChar* name; 02379 UChar* name_end; 02380 int gnum; 02381 int rel; 02382 } call; 02383 struct { 02384 int ctype; 02385 int not; 02386 } prop; 02387 } u; 02388 } OnigToken; 02389 02390 02391 static int 02392 fetch_range_quantifier(UChar** src, UChar* end, OnigToken* tok, ScanEnv* env) 02393 { 02394 int low, up, syn_allow, non_low = 0; 02395 int r = 0; 02396 OnigCodePoint c; 02397 OnigEncoding enc = env->enc; 02398 UChar* p = *src; 02399 PFETCH_READY; 02400 02401 syn_allow = IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_INVALID_INTERVAL); 02402 02403 if (PEND) { 02404 if (syn_allow) 02405 return 1; /* "....{" : OK! */ 02406 else 02407 return ONIGERR_END_PATTERN_AT_LEFT_BRACE; /* "....{" syntax error */ 02408 } 02409 02410 if (! 
syn_allow) { 02411 c = PPEEK; 02412 if (c == ')' || c == '(' || c == '|') { 02413 return ONIGERR_END_PATTERN_AT_LEFT_BRACE; 02414 } 02415 } 02416 02417 low = onig_scan_unsigned_number(&p, end, env->enc); 02418 if (low < 0) return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE; 02419 if (low > ONIG_MAX_REPEAT_NUM) 02420 return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE; 02421 02422 if (p == *src) { /* can't read low */ 02423 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_INTERVAL_LOW_ABBREV)) { 02424 /* allow {,n} as {0,n} */ 02425 low = 0; 02426 non_low = 1; 02427 } 02428 else 02429 goto invalid; 02430 } 02431 02432 if (PEND) goto invalid; 02433 PFETCH(c); 02434 if (c == ',') { 02435 UChar* prev = p; 02436 up = onig_scan_unsigned_number(&p, end, env->enc); 02437 if (up < 0) return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE; 02438 if (up > ONIG_MAX_REPEAT_NUM) 02439 return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE; 02440 02441 if (p == prev) { 02442 if (non_low != 0) 02443 goto invalid; 02444 up = REPEAT_INFINITE; /* {n,} : {n,infinite} */ 02445 } 02446 } 02447 else { 02448 if (non_low != 0) 02449 goto invalid; 02450 02451 PUNFETCH; 02452 up = low; /* {n} : exact n times */ 02453 r = 2; /* fixed */ 02454 } 02455 02456 if (PEND) goto invalid; 02457 PFETCH(c); 02458 if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_BRACE_INTERVAL)) { 02459 if (c != MC_ESC(env->syntax)) goto invalid; 02460 PFETCH(c); 02461 } 02462 if (c != '}') goto invalid; 02463 02464 if (!IS_REPEAT_INFINITE(up) && low > up) { 02465 return ONIGERR_UPPER_SMALLER_THAN_LOWER_IN_REPEAT_RANGE; 02466 } 02467 02468 tok->type = TK_INTERVAL; 02469 tok->u.repeat.lower = low; 02470 tok->u.repeat.upper = up; 02471 *src = p; 02472 return r; /* 0: normal {n,m}, 2: fixed {n} */ 02473 02474 invalid: 02475 if (syn_allow) 02476 return 1; /* OK */ 02477 else 02478 return ONIGERR_INVALID_REPEAT_RANGE_PATTERN; 02479 } 02480 02481 /* \M-, \C-, \c, or \... 
*/ 02482 static int 02483 fetch_escaped_value(UChar** src, UChar* end, ScanEnv* env) 02484 { 02485 int v; 02486 OnigCodePoint c; 02487 OnigEncoding enc = env->enc; 02488 UChar* p = *src; 02489 PFETCH_READY; 02490 02491 if (PEND) return ONIGERR_END_PATTERN_AT_ESCAPE; 02492 02493 PFETCH(c); 02494 switch (c) { 02495 case 'M': 02496 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ESC_CAPITAL_M_BAR_META)) { 02497 if (PEND) return ONIGERR_END_PATTERN_AT_META; 02498 PFETCH(c); 02499 if (c != '-') return ONIGERR_META_CODE_SYNTAX; 02500 if (PEND) return ONIGERR_END_PATTERN_AT_META; 02501 PFETCH(c); 02502 if (c == MC_ESC(env->syntax)) { 02503 v = fetch_escaped_value(&p, end, env); 02504 if (v < 0) return v; 02505 c = (OnigCodePoint )v; 02506 } 02507 c = ((c & 0xff) | 0x80); 02508 } 02509 else 02510 goto backslash; 02511 break; 02512 02513 case 'C': 02514 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ESC_CAPITAL_C_BAR_CONTROL)) { 02515 if (PEND) return ONIGERR_END_PATTERN_AT_CONTROL; 02516 PFETCH(c); 02517 if (c != '-') return ONIGERR_CONTROL_CODE_SYNTAX; 02518 goto control; 02519 } 02520 else 02521 goto backslash; 02522 02523 case 'c': 02524 if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_C_CONTROL)) { 02525 control: 02526 if (PEND) return ONIGERR_END_PATTERN_AT_CONTROL; 02527 PFETCH(c); 02528 if (c == '?') { 02529 c = 0177; 02530 } 02531 else { 02532 if (c == MC_ESC(env->syntax)) { 02533 v = fetch_escaped_value(&p, end, env); 02534 if (v < 0) return v; 02535 c = (OnigCodePoint )v; 02536 } 02537 c &= 0x9f; 02538 } 02539 break; 02540 } 02541 /* fall through */ 02542 02543 default: 02544 { 02545 backslash: 02546 c = conv_backslash_value(c, env); 02547 } 02548 break; 02549 } 02550 02551 *src = p; 02552 return c; 02553 } 02554 02555 static int fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env); 02556 02557 static OnigCodePoint 02558 get_name_end_code_point(OnigCodePoint start) 02559 { 02560 switch (start) { 02561 case '<': return (OnigCodePoint )'>'; break; 02562 case 
'\'': return (OnigCodePoint )'\''; break; 02563 case '(': return (OnigCodePoint )')'; break; 02564 case '{': return (OnigCodePoint )'}'; break; 02565 default: 02566 break; 02567 } 02568 02569 return (OnigCodePoint )0; 02570 } 02571 02572 #ifdef USE_NAMED_GROUP 02573 #ifdef USE_BACKREF_WITH_LEVEL 02574 /* 02575 \k<name+n>, \k<name-n> 02576 \k<num+n>, \k<num-n> 02577 \k<-num+n>, \k<-num-n> 02578 */ 02579 static int 02580 fetch_name_with_level(OnigCodePoint start_code, UChar** src, UChar* end, 02581 UChar** rname_end, ScanEnv* env, 02582 int* rback_num, int* rlevel) 02583 { 02584 int r, sign, is_num, exist_level; 02585 OnigCodePoint end_code; 02586 OnigCodePoint c = 0; 02587 OnigEncoding enc = env->enc; 02588 UChar *name_end; 02589 UChar *pnum_head; 02590 UChar *p = *src; 02591 PFETCH_READY; 02592 02593 *rback_num = 0; 02594 is_num = exist_level = 0; 02595 sign = 1; 02596 pnum_head = *src; 02597 02598 end_code = get_name_end_code_point(start_code); 02599 02600 name_end = end; 02601 r = 0; 02602 if (PEND) { 02603 return ONIGERR_EMPTY_GROUP_NAME; 02604 } 02605 else { 02606 PFETCH(c); 02607 if (c == end_code) 02608 return ONIGERR_EMPTY_GROUP_NAME; 02609 02610 if (ONIGENC_IS_CODE_DIGIT(enc, c)) { 02611 is_num = 1; 02612 } 02613 else if (c == '-') { 02614 is_num = 2; 02615 sign = -1; 02616 pnum_head = p; 02617 } 02618 else if (!ONIGENC_IS_CODE_WORD(enc, c)) { 02619 r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME; 02620 } 02621 } 02622 02623 while (!PEND) { 02624 name_end = p; 02625 PFETCH(c); 02626 if (c == end_code || c == ')' || c == '+' || c == '-') { 02627 if (is_num == 2) r = ONIGERR_INVALID_GROUP_NAME; 02628 break; 02629 } 02630 02631 if (is_num != 0) { 02632 if (ONIGENC_IS_CODE_DIGIT(enc, c)) { 02633 is_num = 1; 02634 } 02635 else { 02636 r = ONIGERR_INVALID_GROUP_NAME; 02637 is_num = 0; 02638 } 02639 } 02640 else if (!ONIGENC_IS_CODE_WORD(enc, c)) { 02641 r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME; 02642 } 02643 } 02644 02645 if (r == 0 && c != end_code) { 02646 if (c == '+' || 
c == '-') { 02647 int level; 02648 int flag = (c == '-' ? -1 : 1); 02649 02650 PFETCH(c); 02651 if (! ONIGENC_IS_CODE_DIGIT(enc, c)) goto err; 02652 PUNFETCH; 02653 level = onig_scan_unsigned_number(&p, end, enc); 02654 if (level < 0) return ONIGERR_TOO_BIG_NUMBER; 02655 *rlevel = (level * flag); 02656 exist_level = 1; 02657 02658 PFETCH(c); 02659 if (c == end_code) 02660 goto end; 02661 } 02662 02663 err: 02664 r = ONIGERR_INVALID_GROUP_NAME; 02665 name_end = end; 02666 } 02667 02668 end: 02669 if (r == 0) { 02670 if (is_num != 0) { 02671 *rback_num = onig_scan_unsigned_number(&pnum_head, name_end, enc); 02672 if (*rback_num < 0) return ONIGERR_TOO_BIG_NUMBER; 02673 else if (*rback_num == 0) goto err; 02674 02675 *rback_num *= sign; 02676 } 02677 02678 *rname_end = name_end; 02679 *src = p; 02680 return (exist_level ? 1 : 0); 02681 } 02682 else { 02683 onig_scan_env_set_error_string(env, r, *src, name_end); 02684 return r; 02685 } 02686 } 02687 #endif /* USE_BACKREF_WITH_LEVEL */ 02688 02689 /* 02690 ref: 0 -> define name (don't allow number name) 02691 1 -> reference name (allow number name) 02692 */ 02693 static int 02694 fetch_name(OnigCodePoint start_code, UChar** src, UChar* end, 02695 UChar** rname_end, ScanEnv* env, int* rback_num, int ref) 02696 { 02697 int r, is_num, sign; 02698 OnigCodePoint end_code; 02699 OnigCodePoint c = 0; 02700 OnigEncoding enc = env->enc; 02701 UChar *name_end; 02702 UChar *pnum_head; 02703 UChar *p = *src; 02704 PFETCH_READY; 02705 02706 *rback_num = 0; 02707 02708 end_code = get_name_end_code_point(start_code); 02709 02710 name_end = end; 02711 pnum_head = *src; 02712 r = 0; 02713 is_num = 0; 02714 sign = 1; 02715 if (PEND) { 02716 return ONIGERR_EMPTY_GROUP_NAME; 02717 } 02718 else { 02719 PFETCH(c); 02720 if (c == end_code) 02721 return ONIGERR_EMPTY_GROUP_NAME; 02722 02723 if (ONIGENC_IS_CODE_DIGIT(enc, c)) { 02724 if (ref == 1) 02725 is_num = 1; 02726 else { 02727 r = ONIGERR_INVALID_GROUP_NAME; 02728 is_num = 0; 02729 } 
02730 } 02731 else if (c == '-') { 02732 if (ref == 1) { 02733 is_num = 2; 02734 sign = -1; 02735 pnum_head = p; 02736 } 02737 else { 02738 r = ONIGERR_INVALID_GROUP_NAME; 02739 is_num = 0; 02740 } 02741 } 02742 else if (!ONIGENC_IS_CODE_WORD(enc, c)) { 02743 r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME; 02744 } 02745 } 02746 02747 if (r == 0) { 02748 while (!PEND) { 02749 name_end = p; 02750 PFETCH(c); 02751 if (c == end_code || c == ')') { 02752 if (is_num == 2) r = ONIGERR_INVALID_GROUP_NAME; 02753 break; 02754 } 02755 02756 if (is_num != 0) { 02757 if (ONIGENC_IS_CODE_DIGIT(enc, c)) { 02758 is_num = 1; 02759 } 02760 else { 02761 if (!ONIGENC_IS_CODE_WORD(enc, c)) 02762 r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME; 02763 else 02764 r = ONIGERR_INVALID_GROUP_NAME; 02765 02766 is_num = 0; 02767 } 02768 } 02769 else { 02770 if (!ONIGENC_IS_CODE_WORD(enc, c)) { 02771 r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME; 02772 } 02773 } 02774 } 02775 02776 if (c != end_code) { 02777 r = ONIGERR_INVALID_GROUP_NAME; 02778 name_end = end; 02779 } 02780 02781 if (is_num != 0) { 02782 *rback_num = onig_scan_unsigned_number(&pnum_head, name_end, enc); 02783 if (*rback_num < 0) return ONIGERR_TOO_BIG_NUMBER; 02784 else if (*rback_num == 0) { 02785 r = ONIGERR_INVALID_GROUP_NAME; 02786 goto err; 02787 } 02788 02789 *rback_num *= sign; 02790 } 02791 02792 *rname_end = name_end; 02793 *src = p; 02794 return 0; 02795 } 02796 else { 02797 while (!PEND) { 02798 name_end = p; 02799 PFETCH(c); 02800 if (c == end_code || c == ')') 02801 break; 02802 } 02803 if (PEND) 02804 name_end = end; 02805 02806 err: 02807 onig_scan_env_set_error_string(env, r, *src, name_end); 02808 return r; 02809 } 02810 } 02811 #else 02812 static int 02813 fetch_name(OnigCodePoint start_code, UChar** src, UChar* end, 02814 UChar** rname_end, ScanEnv* env, int* rback_num, int ref) 02815 { 02816 int r, is_num, sign; 02817 OnigCodePoint end_code; 02818 OnigCodePoint c = 0; 02819 UChar *name_end; 02820 OnigEncoding enc = env->enc; 02821 
UChar *pnum_head; 02822 UChar *p = *src; 02823 PFETCH_READY; 02824 02825 *rback_num = 0; 02826 02827 end_code = get_name_end_code_point(start_code); 02828 02829 *rname_end = name_end = end; 02830 r = 0; 02831 pnum_head = *src; 02832 is_num = 0; 02833 sign = 1; 02834 02835 if (PEND) { 02836 return ONIGERR_EMPTY_GROUP_NAME; 02837 } 02838 else { 02839 PFETCH(c); 02840 if (c == end_code) 02841 return ONIGERR_EMPTY_GROUP_NAME; 02842 02843 if (ONIGENC_IS_CODE_DIGIT(enc, c)) { 02844 is_num = 1; 02845 } 02846 else if (c == '-') { 02847 is_num = 2; 02848 sign = -1; 02849 pnum_head = p; 02850 } 02851 else { 02852 r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME; 02853 } 02854 } 02855 02856 while (!PEND) { 02857 name_end = p; 02858 02859 PFETCH(c); 02860 if (c == end_code || c == ')') break; 02861 if (! ONIGENC_IS_CODE_DIGIT(enc, c)) 02862 r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME; 02863 } 02864 if (r == 0 && c != end_code) { 02865 r = ONIGERR_INVALID_GROUP_NAME; 02866 name_end = end; 02867 } 02868 02869 if (r == 0) { 02870 *rback_num = onig_scan_unsigned_number(&pnum_head, name_end, enc); 02871 if (*rback_num < 0) return ONIGERR_TOO_BIG_NUMBER; 02872 else if (*rback_num == 0) { 02873 r = ONIGERR_INVALID_GROUP_NAME; 02874 goto err; 02875 } 02876 *rback_num *= sign; 02877 02878 *rname_end = name_end; 02879 *src = p; 02880 return 0; 02881 } 02882 else { 02883 err: 02884 onig_scan_env_set_error_string(env, r, *src, name_end); 02885 return r; 02886 } 02887 } 02888 #endif /* USE_NAMED_GROUP */ 02889 02890 void onig_vsnprintf_with_pattern(UChar buf[], int bufsize, OnigEncoding enc, 02891 UChar* pat, UChar* pat_end, const UChar *fmt, va_list args); 02892 02893 static void 02894 onig_syntax_warn(ScanEnv *env, const char *fmt, ...) 
02895 { 02896 va_list args; 02897 UChar buf[WARN_BUFSIZE]; 02898 va_start(args, fmt); 02899 onig_vsnprintf_with_pattern(buf, WARN_BUFSIZE, env->enc, 02900 env->pattern, env->pattern_end, 02901 (const UChar *)fmt, args); 02902 va_end(args); 02903 if (env->sourcefile == NULL) 02904 rb_warn("%s", (char *)buf); 02905 else 02906 rb_compile_warn(env->sourcefile, env->sourceline, "%s", (char *)buf); 02907 } 02908 02909 static void 02910 CC_ESC_WARN(ScanEnv *env, UChar *c) 02911 { 02912 if (onig_warn == onig_null_warn) return ; 02913 02914 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_WARN_CC_OP_NOT_ESCAPED) && 02915 IS_SYNTAX_BV(env->syntax, ONIG_SYN_BACKSLASH_ESCAPE_IN_CC)) { 02916 onig_syntax_warn(env, "character class has '%s' without escape", c); 02917 } 02918 } 02919 02920 static void 02921 CLOSE_BRACKET_WITHOUT_ESC_WARN(ScanEnv* env, UChar* c) 02922 { 02923 if (onig_warn == onig_null_warn) return ; 02924 02925 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_WARN_CC_OP_NOT_ESCAPED)) { 02926 onig_syntax_warn(env, "regular expression has '%s' without escape", c); 02927 } 02928 } 02929 02930 static void 02931 CC_DUP_WARN(ScanEnv *env) 02932 { 02933 if (onig_warn == onig_null_warn || !RTEST(ruby_verbose)) return ; 02934 02935 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_WARN_CC_DUP) && 02936 !(env->warnings_flag & ONIG_SYN_WARN_CC_DUP)) { 02937 env->warnings_flag |= ONIG_SYN_WARN_CC_DUP; 02938 onig_syntax_warn(env, "character class has duplicated range"); 02939 } 02940 } 02941 02942 static void 02943 UNKNOWN_ESC_WARN(ScanEnv *env, int c) 02944 { 02945 if (onig_warn == onig_null_warn || !RTEST(ruby_verbose)) return ; 02946 onig_syntax_warn(env, "Unknown escape \\%c is ignored", c); 02947 } 02948 02949 static UChar* 02950 find_str_position(OnigCodePoint s[], int n, UChar* from, UChar* to, 02951 UChar **next, OnigEncoding enc) 02952 { 02953 int i; 02954 OnigCodePoint x; 02955 UChar *q; 02956 UChar *p = from; 02957 02958 while (p < to) { 02959 x = ONIGENC_MBC_TO_CODE(enc, p, to); 02960 q = p + 
enclen(enc, p, to); 02961 if (x == s[0]) { 02962 for (i = 1; i < n && q < to; i++) { 02963 x = ONIGENC_MBC_TO_CODE(enc, q, to); 02964 if (x != s[i]) break; 02965 q += enclen(enc, q, to); 02966 } 02967 if (i >= n) { 02968 if (IS_NOT_NULL(next)) 02969 *next = q; 02970 return p; 02971 } 02972 } 02973 p = q; 02974 } 02975 return NULL_UCHARP; 02976 } 02977 02978 static int 02979 str_exist_check_with_esc(OnigCodePoint s[], int n, UChar* from, UChar* to, 02980 OnigCodePoint bad, OnigEncoding enc, const OnigSyntaxType* syn) 02981 { 02982 int i, in_esc; 02983 OnigCodePoint x; 02984 UChar *q; 02985 UChar *p = from; 02986 02987 in_esc = 0; 02988 while (p < to) { 02989 if (in_esc) { 02990 in_esc = 0; 02991 p += enclen(enc, p, to); 02992 } 02993 else { 02994 x = ONIGENC_MBC_TO_CODE(enc, p, to); 02995 q = p + enclen(enc, p, to); 02996 if (x == s[0]) { 02997 for (i = 1; i < n && q < to; i++) { 02998 x = ONIGENC_MBC_TO_CODE(enc, q, to); 02999 if (x != s[i]) break; 03000 q += enclen(enc, q, to); 03001 } 03002 if (i >= n) return 1; 03003 p += enclen(enc, p, to); 03004 } 03005 else { 03006 x = ONIGENC_MBC_TO_CODE(enc, p, to); 03007 if (x == bad) return 0; 03008 else if (x == MC_ESC(syn)) in_esc = 1; 03009 p = q; 03010 } 03011 } 03012 } 03013 return 0; 03014 } 03015 03016 static int 03017 fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) 03018 { 03019 int num; 03020 OnigCodePoint c, c2; 03021 const OnigSyntaxType* syn = env->syntax; 03022 OnigEncoding enc = env->enc; 03023 UChar* prev; 03024 UChar* p = *src; 03025 PFETCH_READY; 03026 03027 if (PEND) { 03028 tok->type = TK_EOT; 03029 return tok->type; 03030 } 03031 03032 PFETCH(c); 03033 tok->type = TK_CHAR; 03034 tok->base = 0; 03035 tok->u.c = c; 03036 tok->escaped = 0; 03037 03038 if (c == ']') { 03039 tok->type = TK_CC_CLOSE; 03040 } 03041 else if (c == '-') { 03042 tok->type = TK_CC_RANGE; 03043 } 03044 else if (c == MC_ESC(syn)) { 03045 if (! 
IS_SYNTAX_BV(syn, ONIG_SYN_BACKSLASH_ESCAPE_IN_CC)) 03046 goto end; 03047 03048 if (PEND) return ONIGERR_END_PATTERN_AT_ESCAPE; 03049 03050 PFETCH(c); 03051 tok->escaped = 1; 03052 tok->u.c = c; 03053 switch (c) { 03054 case 'w': 03055 tok->type = TK_CHAR_TYPE; 03056 tok->u.prop.ctype = ONIGENC_CTYPE_WORD; 03057 tok->u.prop.not = 0; 03058 break; 03059 case 'W': 03060 tok->type = TK_CHAR_TYPE; 03061 tok->u.prop.ctype = ONIGENC_CTYPE_WORD; 03062 tok->u.prop.not = 1; 03063 break; 03064 case 'd': 03065 tok->type = TK_CHAR_TYPE; 03066 tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT; 03067 tok->u.prop.not = 0; 03068 break; 03069 case 'D': 03070 tok->type = TK_CHAR_TYPE; 03071 tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT; 03072 tok->u.prop.not = 1; 03073 break; 03074 case 's': 03075 tok->type = TK_CHAR_TYPE; 03076 tok->u.prop.ctype = ONIGENC_CTYPE_SPACE; 03077 tok->u.prop.not = 0; 03078 break; 03079 case 'S': 03080 tok->type = TK_CHAR_TYPE; 03081 tok->u.prop.ctype = ONIGENC_CTYPE_SPACE; 03082 tok->u.prop.not = 1; 03083 break; 03084 case 'h': 03085 if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break; 03086 tok->type = TK_CHAR_TYPE; 03087 tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT; 03088 tok->u.prop.not = 0; 03089 break; 03090 case 'H': 03091 if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break; 03092 tok->type = TK_CHAR_TYPE; 03093 tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT; 03094 tok->u.prop.not = 1; 03095 break; 03096 03097 case 'p': 03098 case 'P': 03099 c2 = PPEEK; 03100 if (c2 == '{' && 03101 IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY)) { 03102 PINC; 03103 tok->type = TK_CHAR_PROPERTY; 03104 tok->u.prop.not = (c == 'P' ? 1 : 0); 03105 03106 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT)) { 03107 PFETCH(c2); 03108 if (c2 == '^') { 03109 tok->u.prop.not = (tok->u.prop.not == 0 ? 
1 : 0); 03110 } 03111 else 03112 PUNFETCH; 03113 } 03114 } 03115 else { 03116 onig_syntax_warn(env, "invalid Unicode Property \\%c", c); 03117 } 03118 break; 03119 03120 case 'x': 03121 if (PEND) break; 03122 03123 prev = p; 03124 if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_BRACE_HEX8)) { 03125 PINC; 03126 num = scan_unsigned_hexadecimal_number(&p, end, 0, 8, enc); 03127 if (num < 0) return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE; 03128 if (!PEND) { 03129 c2 = PPEEK; 03130 if (ONIGENC_IS_CODE_XDIGIT(enc, c2)) 03131 return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE; 03132 } 03133 03134 if (p > prev + enclen(enc, prev, end) && !PEND && (PPEEK_IS('}'))) { 03135 PINC; 03136 tok->type = TK_CODE_POINT; 03137 tok->base = 16; 03138 tok->u.code = (OnigCodePoint )num; 03139 } 03140 else { 03141 /* can't read nothing or invalid format */ 03142 p = prev; 03143 } 03144 } 03145 else if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_HEX2)) { 03146 num = scan_unsigned_hexadecimal_number(&p, end, 0, 2, enc); 03147 if (num < 0) return ONIGERR_TOO_BIG_NUMBER; 03148 if (p == prev) { /* can't read nothing. */ 03149 num = 0; /* but, it's not error */ 03150 } 03151 tok->type = TK_RAW_BYTE; 03152 tok->base = 16; 03153 tok->u.c = num; 03154 } 03155 break; 03156 03157 case 'u': 03158 if (PEND) break; 03159 03160 prev = p; 03161 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_U_HEX4)) { 03162 num = scan_unsigned_hexadecimal_number(&p, end, 4, 4, enc); 03163 if (num < -1) return ONIGERR_TOO_SHORT_DIGITS; 03164 else if (num < 0) return ONIGERR_TOO_BIG_NUMBER; 03165 if (p == prev) { /* can't read nothing. 
*/ 03166 num = 0; /* but, it's not error */ 03167 } 03168 tok->type = TK_CODE_POINT; 03169 tok->base = 16; 03170 tok->u.code = (OnigCodePoint )num; 03171 } 03172 break; 03173 03174 case '0': 03175 case '1': case '2': case '3': case '4': case '5': case '6': case '7': 03176 if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_OCTAL3)) { 03177 PUNFETCH; 03178 prev = p; 03179 num = scan_unsigned_octal_number(&p, end, 3, enc); 03180 if (num < 0) return ONIGERR_TOO_BIG_NUMBER; 03181 if (p == prev) { /* can't read nothing. */ 03182 num = 0; /* but, it's not error */ 03183 } 03184 tok->type = TK_RAW_BYTE; 03185 tok->base = 8; 03186 tok->u.c = num; 03187 } 03188 break; 03189 03190 default: 03191 PUNFETCH; 03192 num = fetch_escaped_value(&p, end, env); 03193 if (num < 0) return num; 03194 if (tok->u.c != num) { 03195 tok->u.code = (OnigCodePoint )num; 03196 tok->type = TK_CODE_POINT; 03197 } 03198 break; 03199 } 03200 } 03201 else if (c == '[') { 03202 if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_POSIX_BRACKET) && (PPEEK_IS(':'))) { 03203 OnigCodePoint send[] = { (OnigCodePoint )':', (OnigCodePoint )']' }; 03204 tok->backp = p; /* point at '[' is read */ 03205 PINC; 03206 if (str_exist_check_with_esc(send, 2, p, end, 03207 (OnigCodePoint )']', enc, syn)) { 03208 tok->type = TK_POSIX_BRACKET_OPEN; 03209 } 03210 else { 03211 PUNFETCH; 03212 goto cc_in_cc; 03213 } 03214 } 03215 else { 03216 cc_in_cc: 03217 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_CCLASS_SET_OP)) { 03218 tok->type = TK_CC_CC_OPEN; 03219 } 03220 else { 03221 CC_ESC_WARN(env, (UChar* )"["); 03222 } 03223 } 03224 } 03225 else if (c == '&') { 03226 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_CCLASS_SET_OP) && 03227 !PEND && (PPEEK_IS('&'))) { 03228 PINC; 03229 tok->type = TK_CC_AND; 03230 } 03231 } 03232 03233 end: 03234 *src = p; 03235 return tok->type; 03236 } 03237 03238 #ifdef USE_NAMED_GROUP 03239 static int 03240 fetch_named_backref_token(OnigCodePoint c, OnigToken* tok, UChar** src, 03241 UChar* end, ScanEnv* env) 03242 { 03243 int r, num; 03244 
const OnigSyntaxType* syn = env->syntax; 03245 UChar* prev; 03246 UChar* p = *src; 03247 UChar* name_end; 03248 int* backs; 03249 int back_num; 03250 03251 prev = p; 03252 03253 #ifdef USE_BACKREF_WITH_LEVEL 03254 name_end = NULL_UCHARP; /* no need. escape gcc warning. */ 03255 r = fetch_name_with_level(c, &p, end, &name_end, 03256 env, &back_num, &tok->u.backref.level); 03257 if (r == 1) tok->u.backref.exist_level = 1; 03258 else tok->u.backref.exist_level = 0; 03259 #else 03260 r = fetch_name(&p, end, &name_end, env, &back_num, 1); 03261 #endif 03262 if (r < 0) return r; 03263 03264 if (back_num != 0) { 03265 if (back_num < 0) { 03266 back_num = BACKREF_REL_TO_ABS(back_num, env); 03267 if (back_num <= 0) 03268 return ONIGERR_INVALID_BACKREF; 03269 } 03270 03271 if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) { 03272 if (back_num > env->num_mem || 03273 IS_NULL(SCANENV_MEM_NODES(env)[back_num])) 03274 return ONIGERR_INVALID_BACKREF; 03275 } 03276 tok->type = TK_BACKREF; 03277 tok->u.backref.by_name = 0; 03278 tok->u.backref.num = 1; 03279 tok->u.backref.ref1 = back_num; 03280 } 03281 else { 03282 num = onig_name_to_group_numbers(env->reg, prev, name_end, &backs); 03283 if (num <= 0) { 03284 onig_scan_env_set_error_string(env, 03285 ONIGERR_UNDEFINED_NAME_REFERENCE, prev, name_end); 03286 return ONIGERR_UNDEFINED_NAME_REFERENCE; 03287 } 03288 if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) { 03289 int i; 03290 for (i = 0; i < num; i++) { 03291 if (backs[i] > env->num_mem || 03292 IS_NULL(SCANENV_MEM_NODES(env)[backs[i]])) 03293 return ONIGERR_INVALID_BACKREF; 03294 } 03295 } 03296 03297 tok->type = TK_BACKREF; 03298 tok->u.backref.by_name = 1; 03299 if (num == 1) { 03300 tok->u.backref.num = 1; 03301 tok->u.backref.ref1 = backs[0]; 03302 } 03303 else { 03304 tok->u.backref.num = num; 03305 tok->u.backref.refs = backs; 03306 } 03307 } 03308 *src = p; 03309 return 0; 03310 } 03311 #endif 03312 03313 static int 03314 fetch_token(OnigToken* tok, UChar** 
src, UChar* end, ScanEnv* env) 03315 { 03316 int r, num; 03317 OnigCodePoint c; 03318 OnigEncoding enc = env->enc; 03319 const OnigSyntaxType* syn = env->syntax; 03320 UChar* prev; 03321 UChar* p = *src; 03322 PFETCH_READY; 03323 03324 start: 03325 if (PEND) { 03326 tok->type = TK_EOT; 03327 return tok->type; 03328 } 03329 03330 tok->type = TK_STRING; 03331 tok->base = 0; 03332 tok->backp = p; 03333 03334 PFETCH(c); 03335 if (IS_MC_ESC_CODE(c, syn)) { 03336 if (PEND) return ONIGERR_END_PATTERN_AT_ESCAPE; 03337 03338 tok->backp = p; 03339 PFETCH(c); 03340 03341 tok->u.c = c; 03342 tok->escaped = 1; 03343 switch (c) { 03344 case '*': 03345 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_ASTERISK_ZERO_INF)) break; 03346 tok->type = TK_OP_REPEAT; 03347 tok->u.repeat.lower = 0; 03348 tok->u.repeat.upper = REPEAT_INFINITE; 03349 goto greedy_check; 03350 break; 03351 03352 case '+': 03353 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_PLUS_ONE_INF)) break; 03354 tok->type = TK_OP_REPEAT; 03355 tok->u.repeat.lower = 1; 03356 tok->u.repeat.upper = REPEAT_INFINITE; 03357 goto greedy_check; 03358 break; 03359 03360 case '?': 03361 if (! 
IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_QMARK_ZERO_ONE)) break; 03362 tok->type = TK_OP_REPEAT; 03363 tok->u.repeat.lower = 0; 03364 tok->u.repeat.upper = 1; 03365 greedy_check: 03366 if (!PEND && PPEEK_IS('?') && 03367 IS_SYNTAX_OP(syn, ONIG_SYN_OP_QMARK_NON_GREEDY)) { 03368 PFETCH(c); 03369 tok->u.repeat.greedy = 0; 03370 tok->u.repeat.possessive = 0; 03371 } 03372 else { 03373 possessive_check: 03374 if (!PEND && PPEEK_IS('+') && 03375 ((IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_PLUS_POSSESSIVE_REPEAT) && 03376 tok->type != TK_INTERVAL) || 03377 (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_PLUS_POSSESSIVE_INTERVAL) && 03378 tok->type == TK_INTERVAL))) { 03379 PFETCH(c); 03380 tok->u.repeat.greedy = 1; 03381 tok->u.repeat.possessive = 1; 03382 } 03383 else { 03384 tok->u.repeat.greedy = 1; 03385 tok->u.repeat.possessive = 0; 03386 } 03387 } 03388 break; 03389 03390 case '{': 03391 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_BRACE_INTERVAL)) break; 03392 r = fetch_range_quantifier(&p, end, tok, env); 03393 if (r < 0) return r; /* error */ 03394 if (r == 0) goto greedy_check; 03395 else if (r == 2) { /* {n} */ 03396 if (IS_SYNTAX_BV(syn, ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY)) 03397 goto possessive_check; 03398 03399 goto greedy_check; 03400 } 03401 /* r == 1 : normal char */ 03402 break; 03403 03404 case '|': 03405 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_VBAR_ALT)) break; 03406 tok->type = TK_ALT; 03407 break; 03408 03409 case '(': 03410 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LPAREN_SUBEXP)) break; 03411 tok->type = TK_SUBEXP_OPEN; 03412 break; 03413 03414 case ')': 03415 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LPAREN_SUBEXP)) break; 03416 tok->type = TK_SUBEXP_CLOSE; 03417 break; 03418 03419 case 'w': 03420 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_W_WORD)) break; 03421 tok->type = TK_CHAR_TYPE; 03422 tok->u.prop.ctype = ONIGENC_CTYPE_WORD; 03423 tok->u.prop.not = 0; 03424 break; 03425 03426 case 'W': 03427 if (! 
IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_W_WORD)) break; 03428 tok->type = TK_CHAR_TYPE; 03429 tok->u.prop.ctype = ONIGENC_CTYPE_WORD; 03430 tok->u.prop.not = 1; 03431 break; 03432 03433 case 'b': 03434 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_B_WORD_BOUND)) break; 03435 tok->type = TK_ANCHOR; 03436 tok->u.anchor.subtype = ANCHOR_WORD_BOUND; 03437 tok->u.anchor.ascii_range = IS_ASCII_RANGE(env->option) 03438 && ! IS_WORD_BOUND_ALL_RANGE(env->option); 03439 break; 03440 03441 case 'B': 03442 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_B_WORD_BOUND)) break; 03443 tok->type = TK_ANCHOR; 03444 tok->u.anchor.subtype = ANCHOR_NOT_WORD_BOUND; 03445 tok->u.anchor.ascii_range = IS_ASCII_RANGE(env->option) 03446 && ! IS_WORD_BOUND_ALL_RANGE(env->option); 03447 break; 03448 03449 #ifdef USE_WORD_BEGIN_END 03450 case '<': 03451 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END)) break; 03452 tok->type = TK_ANCHOR; 03453 tok->u.anchor.subtype = ANCHOR_WORD_BEGIN; 03454 tok->u.anchor.ascii_range = IS_ASCII_RANGE(env->option); 03455 break; 03456 03457 case '>': 03458 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END)) break; 03459 tok->type = TK_ANCHOR; 03460 tok->u.anchor.subtype = ANCHOR_WORD_END; 03461 tok->u.anchor.ascii_range = IS_ASCII_RANGE(env->option); 03462 break; 03463 #endif 03464 03465 case 's': 03466 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_S_WHITE_SPACE)) break; 03467 tok->type = TK_CHAR_TYPE; 03468 tok->u.prop.ctype = ONIGENC_CTYPE_SPACE; 03469 tok->u.prop.not = 0; 03470 break; 03471 03472 case 'S': 03473 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_S_WHITE_SPACE)) break; 03474 tok->type = TK_CHAR_TYPE; 03475 tok->u.prop.ctype = ONIGENC_CTYPE_SPACE; 03476 tok->u.prop.not = 1; 03477 break; 03478 03479 case 'd': 03480 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_D_DIGIT)) break; 03481 tok->type = TK_CHAR_TYPE; 03482 tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT; 03483 tok->u.prop.not = 0; 03484 break; 03485 03486 case 'D': 03487 if (! 
IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_D_DIGIT)) break; 03488 tok->type = TK_CHAR_TYPE; 03489 tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT; 03490 tok->u.prop.not = 1; 03491 break; 03492 03493 case 'h': 03494 if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break; 03495 tok->type = TK_CHAR_TYPE; 03496 tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT; 03497 tok->u.prop.not = 0; 03498 break; 03499 03500 case 'H': 03501 if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break; 03502 tok->type = TK_CHAR_TYPE; 03503 tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT; 03504 tok->u.prop.not = 1; 03505 break; 03506 03507 case 'A': 03508 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break; 03509 begin_buf: 03510 tok->type = TK_ANCHOR; 03511 tok->u.anchor.subtype = ANCHOR_BEGIN_BUF; 03512 break; 03513 03514 case 'Z': 03515 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break; 03516 tok->type = TK_ANCHOR; 03517 tok->u.anchor.subtype = ANCHOR_SEMI_END_BUF; 03518 break; 03519 03520 case 'z': 03521 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break; 03522 end_buf: 03523 tok->type = TK_ANCHOR; 03524 tok->u.anchor.subtype = ANCHOR_END_BUF; 03525 break; 03526 03527 case 'G': 03528 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_CAPITAL_G_BEGIN_ANCHOR)) break; 03529 tok->type = TK_ANCHOR; 03530 tok->u.anchor.subtype = ANCHOR_BEGIN_POSITION; 03531 break; 03532 03533 case '`': 03534 if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_GNU_BUF_ANCHOR)) break; 03535 goto begin_buf; 03536 break; 03537 03538 case '\'': 03539 if (! 
IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_GNU_BUF_ANCHOR)) break; 03540 goto end_buf; 03541 break; 03542 03543 case 'x': 03544 if (PEND) break; 03545 03546 prev = p; 03547 if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_BRACE_HEX8)) { 03548 PINC; 03549 num = scan_unsigned_hexadecimal_number(&p, end, 0, 8, enc); 03550 if (num < 0) return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE; 03551 if (!PEND) { 03552 if (ONIGENC_IS_CODE_XDIGIT(enc, PPEEK)) 03553 return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE; 03554 } 03555 03556 if ((p > prev + enclen(enc, prev, end)) && !PEND && PPEEK_IS('}')) { 03557 PINC; 03558 tok->type = TK_CODE_POINT; 03559 tok->u.code = (OnigCodePoint )num; 03560 } 03561 else { 03562 /* can't read nothing or invalid format */ 03563 p = prev; 03564 } 03565 } 03566 else if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_HEX2)) { 03567 num = scan_unsigned_hexadecimal_number(&p, end, 0, 2, enc); 03568 if (num < 0) return ONIGERR_TOO_BIG_NUMBER; 03569 if (p == prev) { /* can't read nothing. */ 03570 num = 0; /* but, it's not error */ 03571 } 03572 tok->type = TK_RAW_BYTE; 03573 tok->base = 16; 03574 tok->u.c = num; 03575 } 03576 break; 03577 03578 case 'u': 03579 if (PEND) break; 03580 03581 prev = p; 03582 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_U_HEX4)) { 03583 num = scan_unsigned_hexadecimal_number(&p, end, 4, 4, enc); 03584 if (num < -1) return ONIGERR_TOO_SHORT_DIGITS; 03585 else if (num < 0) return ONIGERR_TOO_BIG_NUMBER; 03586 if (p == prev) { /* can't read nothing. 
*/ 03587 num = 0; /* but, it's not error */ 03588 } 03589 tok->type = TK_CODE_POINT; 03590 tok->base = 16; 03591 tok->u.code = (OnigCodePoint )num; 03592 } 03593 break; 03594 03595 case '1': case '2': case '3': case '4': 03596 case '5': case '6': case '7': case '8': case '9': 03597 PUNFETCH; 03598 prev = p; 03599 num = onig_scan_unsigned_number(&p, end, enc); 03600 if (num < 0 || num > ONIG_MAX_BACKREF_NUM) { 03601 goto skip_backref; 03602 } 03603 03604 if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_DECIMAL_BACKREF) && 03605 (num <= env->num_mem || num <= 9)) { /* This spec. from GNU regex */ 03606 if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) { 03607 if (num > env->num_mem || IS_NULL(SCANENV_MEM_NODES(env)[num])) 03608 return ONIGERR_INVALID_BACKREF; 03609 } 03610 03611 tok->type = TK_BACKREF; 03612 tok->u.backref.num = 1; 03613 tok->u.backref.ref1 = num; 03614 tok->u.backref.by_name = 0; 03615 #ifdef USE_BACKREF_WITH_LEVEL 03616 tok->u.backref.exist_level = 0; 03617 #endif 03618 break; 03619 } 03620 03621 skip_backref: 03622 if (c == '8' || c == '9') { 03623 /* normal char */ 03624 p = prev; PINC; 03625 break; 03626 } 03627 03628 p = prev; 03629 /* fall through */ 03630 case '0': 03631 if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_OCTAL3)) { 03632 prev = p; 03633 num = scan_unsigned_octal_number(&p, end, (c == '0' ? 2:3), enc); 03634 if (num < 0) return ONIGERR_TOO_BIG_NUMBER; 03635 if (p == prev) { /* can't read nothing. 
*/ 03636 num = 0; /* but, it's not error */ 03637 } 03638 tok->type = TK_RAW_BYTE; 03639 tok->base = 8; 03640 tok->u.c = num; 03641 } 03642 else if (c != '0') { 03643 PINC; 03644 } 03645 break; 03646 03647 #ifdef USE_NAMED_GROUP 03648 case 'k': 03649 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_K_NAMED_BACKREF)) { 03650 PFETCH(c); 03651 if (c == '<' || c == '\'') { 03652 r = fetch_named_backref_token(c, tok, &p, end, env); 03653 if (r < 0) return r; 03654 } 03655 else { 03656 PUNFETCH; 03657 onig_syntax_warn(env, "invalid back reference"); 03658 } 03659 } 03660 break; 03661 #endif 03662 03663 #if defined(USE_SUBEXP_CALL) || defined(USE_NAMED_GROUP) 03664 case 'g': 03665 #ifdef USE_NAMED_GROUP 03666 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_G_BRACE_BACKREF)) { 03667 PFETCH(c); 03668 if (c == '{') { 03669 r = fetch_named_backref_token(c, tok, &p, end, env); 03670 if (r < 0) return r; 03671 } 03672 else 03673 PUNFETCH; 03674 } 03675 #endif 03676 #ifdef USE_SUBEXP_CALL 03677 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_G_SUBEXP_CALL)) { 03678 PFETCH(c); 03679 if (c == '<' || c == '\'') { 03680 int gnum = -1, rel = 0; 03681 UChar* name_end; 03682 OnigCodePoint cnext; 03683 03684 cnext = PPEEK; 03685 if (cnext == '0') { 03686 PINC; 03687 if (PPEEK_IS(get_name_end_code_point(c))) { /* \g<0>, \g'0' */ 03688 PINC; 03689 name_end = p; 03690 gnum = 0; 03691 } 03692 } 03693 else if (cnext == '+') { 03694 PINC; 03695 rel = 1; 03696 } 03697 prev = p; 03698 if (gnum < 0) { 03699 r = fetch_name((OnigCodePoint )c, &p, end, &name_end, env, &gnum, 1); 03700 if (r < 0) return r; 03701 } 03702 03703 tok->type = TK_CALL; 03704 tok->u.call.name = prev; 03705 tok->u.call.name_end = name_end; 03706 tok->u.call.gnum = gnum; 03707 tok->u.call.rel = rel; 03708 } 03709 else { 03710 onig_syntax_warn(env, "invalid subexp call"); 03711 PUNFETCH; 03712 } 03713 } 03714 #endif 03715 break; 03716 #endif 03717 03718 case 'Q': 03719 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_CAPITAL_Q_QUOTE)) { 03720 tok->type = 
TK_QUOTE_OPEN; 03721 } 03722 break; 03723 03724 case 'p': 03725 case 'P': 03726 if (PPEEK_IS('{') && 03727 IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY)) { 03728 PINC; 03729 tok->type = TK_CHAR_PROPERTY; 03730 tok->u.prop.not = (c == 'P' ? 1 : 0); 03731 03732 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT)) { 03733 PFETCH(c); 03734 if (c == '^') { 03735 tok->u.prop.not = (tok->u.prop.not == 0 ? 1 : 0); 03736 } 03737 else 03738 PUNFETCH; 03739 } 03740 } 03741 else { 03742 onig_syntax_warn(env, "invalid Unicode Property \\%c", c); 03743 } 03744 break; 03745 03746 case 'R': 03747 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_CAPITAL_R_LINEBREAK)) { 03748 tok->type = TK_LINEBREAK; 03749 } 03750 break; 03751 03752 case 'X': 03753 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_CAPITAL_X_EXTENDED_GRAPHEME_CLUSTER)) { 03754 tok->type = TK_EXTENDED_GRAPHEME_CLUSTER; 03755 } 03756 break; 03757 03758 case 'K': 03759 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_CAPITAL_K_KEEP)) { 03760 tok->type = TK_KEEP; 03761 } 03762 break; 03763 03764 default: 03765 PUNFETCH; 03766 num = fetch_escaped_value(&p, end, env); 03767 if (num < 0) return num; 03768 /* set_raw: */ 03769 if (tok->u.c != num) { 03770 tok->type = TK_CODE_POINT; 03771 tok->u.code = (OnigCodePoint )num; 03772 } 03773 else { /* string */ 03774 p = tok->backp + enclen(enc, tok->backp, end); 03775 } 03776 break; 03777 } 03778 } 03779 else { 03780 tok->u.c = c; 03781 tok->escaped = 0; 03782 03783 #ifdef USE_VARIABLE_META_CHARS 03784 if ((c != ONIG_INEFFECTIVE_META_CHAR) && 03785 IS_SYNTAX_OP(syn, ONIG_SYN_OP_VARIABLE_META_CHARACTERS)) { 03786 if (c == MC_ANYCHAR(syn)) 03787 goto any_char; 03788 else if (c == MC_ANYTIME(syn)) 03789 goto anytime; 03790 else if (c == MC_ZERO_OR_ONE_TIME(syn)) 03791 goto zero_or_one_time; 03792 else if (c == MC_ONE_OR_MORE_TIME(syn)) 03793 goto one_or_more_time; 03794 else if (c == MC_ANYCHAR_ANYTIME(syn)) { 03795 tok->type = TK_ANYCHAR_ANYTIME; 03796 goto out; 03797 } 03798 } 03799 
#endif 03800 03801 switch (c) { 03802 case '.': 03803 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_DOT_ANYCHAR)) break; 03804 #ifdef USE_VARIABLE_META_CHARS 03805 any_char: 03806 #endif 03807 tok->type = TK_ANYCHAR; 03808 break; 03809 03810 case '*': 03811 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ASTERISK_ZERO_INF)) break; 03812 #ifdef USE_VARIABLE_META_CHARS 03813 anytime: 03814 #endif 03815 tok->type = TK_OP_REPEAT; 03816 tok->u.repeat.lower = 0; 03817 tok->u.repeat.upper = REPEAT_INFINITE; 03818 goto greedy_check; 03819 break; 03820 03821 case '+': 03822 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_PLUS_ONE_INF)) break; 03823 #ifdef USE_VARIABLE_META_CHARS 03824 one_or_more_time: 03825 #endif 03826 tok->type = TK_OP_REPEAT; 03827 tok->u.repeat.lower = 1; 03828 tok->u.repeat.upper = REPEAT_INFINITE; 03829 goto greedy_check; 03830 break; 03831 03832 case '?': 03833 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_QMARK_ZERO_ONE)) break; 03834 #ifdef USE_VARIABLE_META_CHARS 03835 zero_or_one_time: 03836 #endif 03837 tok->type = TK_OP_REPEAT; 03838 tok->u.repeat.lower = 0; 03839 tok->u.repeat.upper = 1; 03840 goto greedy_check; 03841 break; 03842 03843 case '{': 03844 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_BRACE_INTERVAL)) break; 03845 r = fetch_range_quantifier(&p, end, tok, env); 03846 if (r < 0) return r; /* error */ 03847 if (r == 0) goto greedy_check; 03848 else if (r == 2) { /* {n} */ 03849 if (IS_SYNTAX_BV(syn, ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY)) 03850 goto possessive_check; 03851 03852 goto greedy_check; 03853 } 03854 /* r == 1 : normal char */ 03855 break; 03856 03857 case '|': 03858 if (! 
IS_SYNTAX_OP(syn, ONIG_SYN_OP_VBAR_ALT)) break; 03859 tok->type = TK_ALT; 03860 break; 03861 03862 case '(': 03863 if (PPEEK_IS('?') && 03864 IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_QMARK_GROUP_EFFECT)) { 03865 PINC; 03866 if (PPEEK_IS('#')) { 03867 PFETCH(c); 03868 while (1) { 03869 if (PEND) return ONIGERR_END_PATTERN_IN_GROUP; 03870 PFETCH(c); 03871 if (c == MC_ESC(syn)) { 03872 if (!PEND) PFETCH(c); 03873 } 03874 else { 03875 if (c == ')') break; 03876 } 03877 } 03878 goto start; 03879 } 03880 #ifdef USE_PERL_SUBEXP_CALL 03881 /* (?&name), (?n), (?R), (?0), (?+n), (?-n) */ 03882 c = PPEEK; 03883 if ((c == '&' || c == 'R' || ONIGENC_IS_CODE_DIGIT(enc, c)) && 03884 IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_SUBEXP_CALL)) { 03885 /* (?&name), (?n), (?R), (?0) */ 03886 int gnum; 03887 UChar *name; 03888 UChar *name_end; 03889 03890 if (c == 'R' || c == '0') { 03891 PINC; /* skip 'R' / '0' */ 03892 if (!PPEEK_IS(')')) return ONIGERR_INVALID_GROUP_NAME; 03893 PINC; /* skip ')' */ 03894 name_end = name = p; 03895 gnum = 0; 03896 } 03897 else { 03898 int numref = 1; 03899 if (c == '&') { /* (?&name) */ 03900 PINC; 03901 numref = 0; /* don't allow number name */ 03902 } 03903 name = p; 03904 r = fetch_name((OnigCodePoint )'(', &p, end, &name_end, env, &gnum, numref); 03905 if (r < 0) return r; 03906 } 03907 03908 tok->type = TK_CALL; 03909 tok->u.call.name = name; 03910 tok->u.call.name_end = name_end; 03911 tok->u.call.gnum = gnum; 03912 tok->u.call.rel = 0; 03913 break; 03914 } 03915 else if ((c == '-' || c == '+') && 03916 IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_SUBEXP_CALL)) { 03917 /* (?+n), (?-n) */ 03918 int gnum; 03919 UChar *name; 03920 UChar *name_end; 03921 OnigCodePoint cnext; 03922 PFETCH_READY; 03923 03924 PINC; /* skip '-' / '+' */ 03925 cnext = PPEEK; 03926 if (ONIGENC_IS_CODE_DIGIT(enc, cnext)) { 03927 if (c == '-') PUNFETCH; 03928 name = p; 03929 r = fetch_name((OnigCodePoint )'(', &p, end, &name_end, env, &gnum, 1); 03930 if (r < 0) return r; 03931 
03932 tok->type = TK_CALL; 03933 tok->u.call.name = name; 03934 tok->u.call.name_end = name_end; 03935 tok->u.call.gnum = gnum; 03936 tok->u.call.rel = 1; 03937 break; 03938 } 03939 } 03940 #endif /* USE_PERL_SUBEXP_CALL */ 03941 #ifdef USE_CAPITAL_P_NAMED_GROUP 03942 if (PPEEK_IS('P') && 03943 IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_CAPITAL_P_NAMED_GROUP)) { 03944 int gnum; 03945 UChar *name; 03946 UChar *name_end; 03947 PFETCH_READY; 03948 03949 PINC; /* skip 'P' */ 03950 PFETCH(c); 03951 if (c == '=') { /* (?P=name): backref */ 03952 r = fetch_named_backref_token((OnigCodePoint )'(', tok, &p, end, env); 03953 if (r < 0) return r; 03954 break; 03955 } 03956 else if (c == '>') { /* (?P>name): subexp call */ 03957 name = p; 03958 r = fetch_name((OnigCodePoint )'(', &p, end, &name_end, env, &gnum, 0); 03959 if (r < 0) return r; 03960 03961 tok->type = TK_CALL; 03962 tok->u.call.name = name; 03963 tok->u.call.name_end = name_end; 03964 tok->u.call.gnum = gnum; 03965 tok->u.call.rel = 0; 03966 break; 03967 } 03968 PUNFETCH; 03969 } 03970 #endif /* USE_CAPITAL_P_NAMED_GROUP */ 03971 PUNFETCH; 03972 } 03973 03974 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LPAREN_SUBEXP)) break; 03975 tok->type = TK_SUBEXP_OPEN; 03976 break; 03977 03978 case ')': 03979 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LPAREN_SUBEXP)) break; 03980 tok->type = TK_SUBEXP_CLOSE; 03981 break; 03982 03983 case '^': 03984 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LINE_ANCHOR)) break; 03985 tok->type = TK_ANCHOR; 03986 tok->u.anchor.subtype = (IS_SINGLELINE(env->option) 03987 ? ANCHOR_BEGIN_BUF : ANCHOR_BEGIN_LINE); 03988 break; 03989 03990 case '$': 03991 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LINE_ANCHOR)) break; 03992 tok->type = TK_ANCHOR; 03993 tok->u.anchor.subtype = (IS_SINGLELINE(env->option) 03994 ? ANCHOR_SEMI_END_BUF : ANCHOR_END_LINE); 03995 break; 03996 03997 case '[': 03998 if (! 
IS_SYNTAX_OP(syn, ONIG_SYN_OP_BRACKET_CC)) break; 03999 tok->type = TK_CC_OPEN; 04000 break; 04001 04002 case ']': 04003 if (*src > env->pattern) /* /].../ is allowed. */ 04004 CLOSE_BRACKET_WITHOUT_ESC_WARN(env, (UChar* )"]"); 04005 break; 04006 04007 case '#': 04008 if (IS_EXTEND(env->option)) { 04009 while (!PEND) { 04010 PFETCH(c); 04011 if (ONIGENC_IS_CODE_NEWLINE(enc, c)) 04012 break; 04013 } 04014 goto start; 04015 break; 04016 } 04017 break; 04018 04019 case ' ': case '\t': case '\n': case '\r': case '\f': 04020 if (IS_EXTEND(env->option)) 04021 goto start; 04022 break; 04023 04024 default: 04025 /* string */ 04026 break; 04027 } 04028 } 04029 04030 #ifdef USE_VARIABLE_META_CHARS 04031 out: 04032 #endif 04033 *src = p; 04034 return tok->type; 04035 } 04036

/*
 * Add the code points described by the range table `mbr` (or, when `not`
 * is non-zero, every code point NOT described by it) to `cc`.
 *
 * Code points below `sb_out` are recorded in the bitset cc->bs; code points
 * at or above `sb_out` are recorded as ranges in cc->mbuf.  The negated set
 * is capped at 0x7fffffff.
 *
 * `ctype` is unused here (ARG_UNUSED).
 *
 * Returns 0 on success, or the non-zero value returned by
 * add_code_range_to_buf() on failure.
 */
static int
add_ctype_to_cc_by_range(CClassNode* cc, int ctype ARG_UNUSED, int not,
                         ScanEnv* env,
                         OnigCodePoint sb_out, const OnigCodePoint mbr[])
{
  int i, r;
  OnigCodePoint j;

  int n = ONIGENC_CODE_RANGE_NUM(mbr);

  if (not == 0) {
    /* Walk the ranges one code point at a time while they stay below
       sb_out, setting bits in the bitset. */
    for (i = 0; i < n; i++) {
      for (j = ONIGENC_CODE_RANGE_FROM(mbr, i);
           j <= ONIGENC_CODE_RANGE_TO(mbr, i); j++) {
        if (j >= sb_out) {
          if (j > ONIGENC_CODE_RANGE_FROM(mbr, i)) {
            /* Range i straddles sb_out: its upper part goes to mbuf and
               the range is then finished, hence the i++. */
            r = add_code_range_to_buf(&(cc->mbuf), env, j,
                                      ONIGENC_CODE_RANGE_TO(mbr, i));
            if (r != 0) return r;
            i++;
          }

          goto sb_end;
        }
        BITSET_SET_BIT_CHKDUP(cc->bs, j);
      }
    }

  sb_end:
    /* Every remaining range goes to mbuf as a whole. */
    for ( ; i < n; i++) {
      r = add_code_range_to_buf(&(cc->mbuf), env,
                                ONIGENC_CODE_RANGE_FROM(mbr, i),
                                ONIGENC_CODE_RANGE_TO(mbr, i));
      if (r != 0) return r;
    }
  }
  else {
    OnigCodePoint prev = 0;

    /* Bitset part: set every code point in the gaps between ranges,
       stopping as soon as sb_out is reached. */
    for (i = 0; i < n; i++) {
      for (j = prev;
           j < ONIGENC_CODE_RANGE_FROM(mbr, i); j++) {
        if (j >= sb_out) {
          goto sb_end2;
        }
        BITSET_SET_BIT_CHKDUP(cc->bs, j);
      }
      prev = ONIGENC_CODE_RANGE_TO(mbr, i) + 1;
    }
    for (j = prev; j < sb_out; j++) {
      BITSET_SET_BIT_CHKDUP(cc->bs, j);
    }

  sb_end2:
    /* mbuf part: add the gaps between ranges, starting from sb_out. */
    prev = sb_out;

    for (i = 0; i < n; i++) {
      if (prev < ONIGENC_CODE_RANGE_FROM(mbr, i)) {
        r = add_code_range_to_buf(&(cc->mbuf), env, prev,
                                  ONIGENC_CODE_RANGE_FROM(mbr, i) - 1);
        if (r != 0) return r;
      }
      prev = ONIGENC_CODE_RANGE_TO(mbr, i) + 1;
    }
    if (prev < 0x7fffffff) {
      r = add_code_range_to_buf(&(cc->mbuf), env, prev, 0x7fffffff);
      if (r != 0) return r;
    }
  }

  return 0;
}

/*
 * Add the character type `ctype` (negated when `not` is non-zero) to `cc`.
 *
 * When the ASCII-range option is set in env->option and `char_prop` is 0,
 * the type is restricted to the ASCII range: a positive type is intersected
 * with 0x00-0x7F, and a negated type additionally receives
 * 0x80-ONIG_LAST_CODE_POINT.  A non-zero `char_prop` disables that
 * restriction.
 *
 * If the encoding supplies a code range table for `ctype`, that table is
 * used.  If it reports ONIG_NO_SUPPORT_CONFIG, the type is evaluated per
 * code point over the single-byte range instead.
 *
 * Returns 0 on success; otherwise an error code from the encoding or from a
 * helper, or ONIGERR_PARSER_BUG for a ctype the fallback does not handle.
 */
static int
add_ctype_to_cc(CClassNode* cc, int ctype, int not, int char_prop, ScanEnv* env)
{
  int maxcode, ascii_range;
  int c, r;
  const OnigCodePoint *ranges;
  OnigCodePoint sb_out;
  OnigEncoding enc = env->enc;
  OnigOptionType option = env->option;

  ascii_range = IS_ASCII_RANGE(option) && (char_prop == 0);

  r = ONIGENC_GET_CTYPE_CODE_RANGE(enc, ctype, &sb_out, &ranges);
  if (r == 0) {
    if (ascii_range) {
      /* Build the type in a scratch class so it can be masked before it is
         merged into cc. */
      CClassNode ccwork;
      initialize_cclass(&ccwork);
      r = add_ctype_to_cc_by_range(&ccwork, ctype, not, env, sb_out,
                                   ranges);
      if (r == 0) {
        if (not) {
          r = add_code_range_to_buf0(&(ccwork.mbuf), env, 0x80, ONIG_LAST_CODE_POINT, FALSE);
        }
        else {
          CClassNode ccascii;
          initialize_cclass(&ccascii);
          if (ONIGENC_MBC_MINLEN(env->enc) > 1) {
            /* A failure here used to be ignored, which would have masked
               ccwork against an empty class. */
            r = add_code_range(&(ccascii.mbuf), env, 0x00, 0x7F);
          }
          else {
            bitset_set_range(env, ccascii.bs, 0x00, 0x7F);
          }
          if (r == 0) {
            r = and_cclass(&ccwork, &ccascii, env);
          }
          if (IS_NOT_NULL(ccascii.mbuf)) bbuf_free(ccascii.mbuf);
        }
        if (r == 0) {
          r = or_cclass(cc, &ccwork, env);
        }
      }
      /* Released on every path: add_ctype_to_cc_by_range() may have
         allocated mbuf before failing. */
      if (IS_NOT_NULL(ccwork.mbuf)) bbuf_free(ccwork.mbuf);
    }
    else {
      r = add_ctype_to_cc_by_range(cc, ctype, not, env, sb_out, ranges);
    }
    return r;
  }
  else if (r != ONIG_NO_SUPPORT_CONFIG) {
    return r;
  }

  /* Fallback: no range table for this ctype; test each single-byte code. */
  maxcode = ascii_range ? 0x80 : SINGLE_BYTE_SIZE;
  r = 0;
  switch (ctype) {
  case ONIGENC_CTYPE_ALPHA:
  case ONIGENC_CTYPE_BLANK:
  case ONIGENC_CTYPE_CNTRL:
  case ONIGENC_CTYPE_DIGIT:
  case ONIGENC_CTYPE_LOWER:
  case ONIGENC_CTYPE_PUNCT:
  case ONIGENC_CTYPE_SPACE:
  case ONIGENC_CTYPE_UPPER:
  case ONIGENC_CTYPE_XDIGIT:
  case ONIGENC_CTYPE_ASCII:
  case ONIGENC_CTYPE_ALNUM:
    if (not != 0) {
      for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
        if (! ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype))
          BITSET_SET_BIT_CHKDUP(cc->bs, c);
      }
      ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf);
    }
    else {
      for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
        if (ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype))
          BITSET_SET_BIT_CHKDUP(cc->bs, c);
      }
    }
    break;

  case ONIGENC_CTYPE_GRAPH:
  case ONIGENC_CTYPE_PRINT:
    if (not != 0) {
      for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
        if (! ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype)
            || c >= maxcode)
          BITSET_SET_BIT_CHKDUP(cc->bs, c);
      }
      if (ascii_range)
        ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf);
    }
    else {
      for (c = 0; c < maxcode; c++) {
        if (ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype))
          BITSET_SET_BIT_CHKDUP(cc->bs, c);
      }
      if (! ascii_range)
        ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf);
    }
    break;

  case ONIGENC_CTYPE_WORD:
    if (not == 0) {
      for (c = 0; c < maxcode; c++) {
        if (ONIGENC_IS_CODE_WORD(enc, c)) BITSET_SET_BIT_CHKDUP(cc->bs, c);
      }
      if (! ascii_range)
        ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf);
    }
    else {
      for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
        if ((ONIGENC_CODE_TO_MBCLEN(enc, c) > 0) /* check invalid code point */
            && (! ONIGENC_IS_CODE_WORD(enc, c) || c >= maxcode))
          BITSET_SET_BIT_CHKDUP(cc->bs, c);
      }
      if (ascii_range)
        ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf);
    }
    break;

  default:
    return ONIGERR_PARSER_BUG;
    break;
  }

  return r;
}

/*
 * Try to parse a POSIX bracket name (optionally preceded by '^') followed
 * by ":]" at *src, and add the corresponding ctype to `cc`.
 *
 * Returns:
 *   0  a known name followed by ":]" was consumed; *src is advanced past it.
 *   1  the text is not a POSIX bracket; *src is left unchanged.
 *   ONIGERR_INVALID_POSIX_BRACKET_TYPE
 *      a known name is not followed by ":]", or an unknown name is followed
 *      by ":]" within POSIX_BRACKET_CHECK_LIMIT_LENGTH characters.
 *   any non-zero result of add_ctype_to_cc().
 */
static int
parse_posix_bracket(CClassNode* cc, UChar** src, UChar* end, ScanEnv* env)
{
#define POSIX_BRACKET_CHECK_LIMIT_LENGTH  20
#define POSIX_BRACKET_NAME_MIN_LEN         4

  static const PosixBracketEntryType PBS[] = {
    { (UChar* )"alnum",  ONIGENC_CTYPE_ALNUM,  5 },
    { (UChar* )"alpha",  ONIGENC_CTYPE_ALPHA,  5 },
    { (UChar* )"blank",  ONIGENC_CTYPE_BLANK,  5 },
    { (UChar* )"cntrl",  ONIGENC_CTYPE_CNTRL,  5 },
    { (UChar* )"digit",  ONIGENC_CTYPE_DIGIT,  5 },
    { (UChar* )"graph",  ONIGENC_CTYPE_GRAPH,  5 },
    { (UChar* )"lower",  ONIGENC_CTYPE_LOWER,  5 },
    { (UChar* )"print",  ONIGENC_CTYPE_PRINT,  5 },
    { (UChar* )"punct",  ONIGENC_CTYPE_PUNCT,  5 },
    { (UChar* )"space",  ONIGENC_CTYPE_SPACE,  5 },
    { (UChar* )"upper",  ONIGENC_CTYPE_UPPER,  5 },
    { (UChar* )"xdigit", ONIGENC_CTYPE_XDIGIT, 6 },
    { (UChar* )"ascii",  ONIGENC_CTYPE_ASCII,  5 },
    { (UChar* )"word",   ONIGENC_CTYPE_WORD,   4 },
    { (UChar* )NULL,     -1, 0 }
  };

  const PosixBracketEntryType *pb;
  int not, i, r;
  OnigCodePoint c;
  OnigEncoding enc = env->enc;
  UChar *p = *src;
  PFETCH_READY;

  if (PPEEK_IS('^')) {
    PINC;
    not = 1;
  }
  else
    not = 0;

  /* Too short to hold even the shortest name plus the closing ":]". */
  if (onigenc_strlen(enc, p, end) < POSIX_BRACKET_NAME_MIN_LEN + 3)
    goto not_posix_bracket;

  for (pb = PBS; IS_NOT_NULL(pb->name); pb++) {
    if (onigenc_with_ascii_strncmp(enc, p, end, pb->name, pb->len) == 0) {
      p = (UChar* )onigenc_step(enc, p, end, pb->len);
      if (onigenc_with_ascii_strncmp(enc, p, end, (UChar* )":]", 2) != 0)
        return ONIGERR_INVALID_POSIX_BRACKET_TYPE;

      r = add_ctype_to_cc(cc, pb->ctype, not,
                          IS_POSIX_BRACKET_ALL_RANGE(env->option),
                          env);
      if (r != 0) return r;

      PINC; PINC;  /* skip ":]" */
      *src = p;
      return 0;
    }
  }

 not_posix_bracket:
  /* Unknown name: look ahead a bounded distance for ":]" so that something
     shaped like a POSIX bracket is reported as an error rather than being
     silently taken as literal characters. */
  c = 0;
  i = 0;
  while (!PEND && ((c = PPEEK) != ':') && c != ']') {
    PINC;
    if (++i > POSIX_BRACKET_CHECK_LIMIT_LENGTH) break;
  }
  if (c == ':' && ! PEND) {
    PINC;
    if (! PEND) {
      PFETCH(c);
      if (c == ']')
        return ONIGERR_INVALID_POSIX_BRACKET_TYPE;
    }
  }

  return 1;  /* 1: is not POSIX bracket, but no error. */
}

/*
 * Read a property name terminated by '}' starting at *src and convert it to
 * a ctype with ONIGENC_PROPERTY_NAME_TO_CTYPE().
 *
 * On success returns the ctype (>= 0) and advances *src past the '}'.
 * On failure returns a negative error code, records the offending text with
 * onig_scan_env_set_error_string(), and leaves *src unchanged.  Failure
 * covers an unknown name, a '(' ')' '{' or '|' inside the name, and a name
 * that is never terminated before `end`.
 */
static int
fetch_char_property_to_ctype(UChar** src, UChar* end, ScanEnv* env)
{
  int r;
  OnigCodePoint c;
  OnigEncoding enc = env->enc;
  UChar *prev, *start, *p = *src;
  PFETCH_READY;

  /* Start from the error value: if the loop runs off the end of the pattern
     without seeing '}', the name is invalid.  (Starting from 0 made an
     unterminated name come back as the valid ctype 0.) */
  r = ONIGERR_INVALID_CHAR_PROPERTY_NAME;
  start = prev = p;

  while (!PEND) {
    prev = p;
    PFETCH(c);
    if (c == '}') {
      r = ONIGENC_PROPERTY_NAME_TO_CTYPE(enc, start, prev);
      if (r < 0) break;

      *src = p;
      return r;
    }
    else if (c == '(' || c == ')' || c == '{' || c == '|') {
      r = ONIGERR_INVALID_CHAR_PROPERTY_NAME;
      break;
    }
  }

  onig_scan_env_set_error_string(env, r, *src, prev);
  return r;
}

/*
 * Build a character-class node for the property token `tok`: the property
 * name is read from *src, and the class is marked negated when
 * tok->u.prop.not is set.
 *
 * Returns 0 with the new node in *np, or an error code.
 * NOTE(review): when add_ctype_to_cc() fails, *np still holds the allocated
 * node; presumably the caller releases it -- confirm against the callers.
 */
static int
parse_char_property(Node** np, OnigToken* tok, UChar** src, UChar* end,
                    ScanEnv* env)
{
  int r, ctype;
  CClassNode* cc;

  ctype = fetch_char_property_to_ctype(src, end, env);
  if (ctype < 0) return ctype;

  *np = node_new_cclass();
  CHECK_NULL_RETURN_MEMERR(*np);
  cc = NCCLASS(*np);
  r = add_ctype_to_cc(cc, ctype, 0, 1, env);
  if (r != 0) return r;
  if (tok->u.prop.not != 0) NCCLASS_SET_NOT(cc);

  return 0;
}


/* Parser state while reading the items of a character class. */
enum CCSTATE {
  CCS_VALUE,     /* an item was read; a single value may be pending in vs */
  CCS_RANGE,     /* a range operator was read; the range end is expected */
  CCS_COMPLETE,  /* a range was just completed */
  CCS_START      /* at the start of the class, or just after "&&" */
};

/* Kind of the most recently read item. */
enum CCVALTYPE {
  CCV_SB,          /* single value recorded in the bitset */
  CCV_CODE_POINT,  /* single value recorded as a code range in mbuf */
  CCV_CLASS        /* a ctype / POSIX bracket / property, already added */
};

/*
 * State transition after a class item (ctype, POSIX bracket, property) has
 * been added to `cc`.  A pending single value is flushed into `cc` first.
 *
 * Returns 0 on success, ONIGERR_CHAR_CLASS_VALUE_AT_END_OF_RANGE when the
 * class appears as the end of a range, ONIGERR_INVALID_CODE_POINT_VALUE for
 * a pending single-byte value above 0xff, or an error from add_code_range().
 */
static int
next_state_class(CClassNode* cc, OnigCodePoint* vs, enum CCVALTYPE* type,
                 enum CCSTATE* state, ScanEnv* env)
{
  int r;

  if (*state == CCS_RANGE)
    return ONIGERR_CHAR_CLASS_VALUE_AT_END_OF_RANGE;

  if (*state == CCS_VALUE && *type != CCV_CLASS) {
    if (*type == CCV_SB) {
      /* The bitset only covers single-byte values. */
      if (*vs > 0xff)
        return ONIGERR_INVALID_CODE_POINT_VALUE;

      BITSET_SET_BIT_CHKDUP(cc->bs, (int )(*vs));
    }
    else if (*type == CCV_CODE_POINT) {
      r = add_code_range(&(cc->mbuf), env, *vs, *vs);
      if (r < 0) return r;
    }
  }

  /* Stay in CCS_START when nothing was pending.  Moving to CCS_VALUE here
     would let a following range operator start a range whose lower bound
     (*vs) was never assigned. */
  if (*state != CCS_START)
    *state = CCS_VALUE;

  *type  = CCV_CLASS;
  return 0;
}

/*
 * State transition after a single value `v` of kind `intype` has been read.
 *
 * In CCS_VALUE the previously pending value is flushed into `cc`.  In
 * CCS_RANGE the range *vs..v is added.  In every case `v` then becomes the
 * pending value (*vs, *vs_israw, *type are overwritten).
 *
 * Returns 0 on success, or ONIGERR_INVALID_CODE_POINT_VALUE,
 * ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS, or an error from add_code_range().
 */
static int
next_state_val(CClassNode* cc, OnigCodePoint *vs, OnigCodePoint v,
               int* vs_israw, int v_israw,
               enum CCVALTYPE intype, enum CCVALTYPE* type,
               enum CCSTATE* state, ScanEnv* env)
{
  int r;

  switch (*state) {
  case CCS_VALUE:
    if (*type == CCV_SB) {
      /* The bitset only covers single-byte values. */
      if (*vs > 0xff)
        return ONIGERR_INVALID_CODE_POINT_VALUE;

      BITSET_SET_BIT_CHKDUP(cc->bs, (int )(*vs));
    }
    else if (*type == CCV_CODE_POINT) {
      r = add_code_range(&(cc->mbuf), env, *vs, *vs);
      if (r < 0) return r;
    }
    break;

  case CCS_RANGE:
    if (intype == *type) {
      if (intype == CCV_SB) {
        if (*vs > 0xff || v > 0xff)
          return ONIGERR_INVALID_CODE_POINT_VALUE;

        if (*vs > v) {
          if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC))
            goto ccs_range_end;
          else
            return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS;
        }
        bitset_set_range(env, cc->bs, (int )*vs, (int )v);
      }
      else {
        r = add_code_range(&(cc->mbuf), env, *vs, v);
        if (r < 0) return r;
      }
    }
    else {
      /* The two ends are of different kinds: record the range both in the
         bitset (clamped to 0xff) and in mbuf. */
#if 0
      if (intype == CCV_CODE_POINT && *type == CCV_SB) {
#endif
        if (*vs > v) {
          if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC))
            goto ccs_range_end;
          else
            return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS;
        }
        bitset_set_range(env, cc->bs, (int )*vs, (int )(v < 0xff ? v : 0xff));
        r = add_code_range(&(cc->mbuf), env, (OnigCodePoint )*vs, v);
        if (r < 0) return r;
#if 0
      }
      else
        return ONIGERR_MISMATCH_CODE_LENGTH_IN_CLASS_RANGE;
#endif
    }
  ccs_range_end:
    *state = CCS_COMPLETE;
    break;

  case CCS_COMPLETE:
  case CCS_START:
    *state = CCS_VALUE;
    break;

  default:
    break;
  }

  *vs_israw = v_israw;
  *vs       = v;
  *type     = intype;
  return 0;
}

/*
 * Report whether code point `c` occurs in [from, end).
 *
 * When `ignore_escaped` is non-zero, the character that immediately follows
 * the syntax's escape character is skipped and never compared against `c`.
 *
 * Returns 1 if found, 0 otherwise.
 */
static int
code_exist_check(OnigCodePoint c, UChar* from, UChar* end, int ignore_escaped,
                 ScanEnv* env)
{
  int in_esc;
  OnigCodePoint code;
  OnigEncoding enc = env->enc;
  UChar* p = from;
  PFETCH_READY;

  in_esc = 0;
  while (! PEND) {
    PFETCH(code);
    if (ignore_escaped && in_esc) {
      /* `code` is the escaped character: it has been consumed above and
         must not be compared.  (Previously the flag was cleared without
         consuming it, so it was still examined on the next iteration.) */
      in_esc = 0;
      continue;
    }
    if (code == c) return 1;
    if (code == MC_ESC(env->syntax)) in_esc = 1;
  }
  return 0;
}

/*
 * Parse the body of a character class up to its closing bracket and store
 * the resulting class node in *np.  Handles a leading '^', single values,
 * raw bytes, code points, ranges, POSIX brackets, ctypes, properties,
 * nested classes and the "&&" intersection operator.
 *
 * On success returns 0 and advances *src past the closing bracket.
 * On failure returns a negative error code; *np is either NULL_NODE or the
 * partially built node.  The scratch class used for "&&" is released on
 * every error path through the `err` label.
 */
static int
parse_char_class(Node** np, OnigToken* tok, UChar** src, UChar* end,
                 ScanEnv* env)
{
  int r, neg, len, fetched, and_start;
  OnigCodePoint v, vs;
  UChar *p;
  Node* node;
  CClassNode *cc, *prev_cc;
  CClassNode work_cc;

  enum CCSTATE state;
  enum CCVALTYPE val_type, in_type;
  int val_israw, in_israw;

  prev_cc = (CClassNode* )NULL;
  *np = NULL_NODE;
  r = fetch_token_in_cc(tok, src, end, env);
  if (r == TK_CHAR && tok->u.c == '^' && tok->escaped == 0) {
    neg = 1;
    r = fetch_token_in_cc(tok, src, end, env);
  }
  else {
    neg = 0;
  }

  if (r < 0) return r;
  if (r == TK_CC_CLOSE) {
    /* A ']' right at the start is a literal only if another ']' follows
       somewhere in the pattern; otherwise the class is empty. */
    if (! code_exist_check((OnigCodePoint )']',
                           *src, env->pattern_end, 1, env))
      return ONIGERR_EMPTY_CHAR_CLASS;

    CC_ESC_WARN(env, (UChar* )"]");
    r = tok->type = TK_CHAR;  /* allow []...] */
  }

  *np = node = node_new_cclass();
  CHECK_NULL_RETURN_MEMERR(node);
  cc = NCCLASS(node);

  and_start = 0;
  state = CCS_START;
  p = *src;
  while (r != TK_CC_CLOSE) {
    fetched = 0;
    switch (r) {
    case TK_CHAR:
      if ((tok->u.code >= SINGLE_BYTE_SIZE) ||
          (len = ONIGENC_CODE_TO_MBCLEN(env->enc, tok->u.c)) > 1) {
        in_type = CCV_CODE_POINT;
      }
      else if (len < 0) {
        r = len;
        goto err;
      }
      else {
      sb_char:
        in_type = CCV_SB;
      }
      v = (OnigCodePoint )tok->u.c;
      in_israw = 0;
      goto val_entry2;
      break;

    case TK_RAW_BYTE:
      /* tok->base != 0 : octal or hexadec. */
      if (! ONIGENC_IS_SINGLEBYTE(env->enc) && tok->base != 0) {
        /* Collect consecutive raw bytes of the same base and try to read
           them as one multi-byte character. */
        UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
        UChar* bufe = buf + ONIGENC_CODE_TO_MBC_MAXLEN;
        UChar* psave = p;
        int i, base = tok->base;

        buf[0] = (UChar )tok->u.c;
        for (i = 1; i < ONIGENC_MBC_MAXLEN(env->enc); i++) {
          r = fetch_token_in_cc(tok, &p, end, env);
          if (r < 0) goto err;
          if (r != TK_RAW_BYTE || tok->base != base) {
            fetched = 1;
            break;
          }
          buf[i] = (UChar )tok->u.c;
        }

        if (i < ONIGENC_MBC_MINLEN(env->enc)) {
          r = ONIGERR_TOO_SHORT_MULTI_BYTE_STRING;
          goto err;
        }

        len = enclen(env->enc, buf, buf+i);
        if (i < len) {
          r = ONIGERR_TOO_SHORT_MULTI_BYTE_STRING;
          goto err;
        }
        else if (i > len) { /* fetch back */
          /* Too many bytes were consumed: rewind and re-read only the
             tokens that belong to this character. */
          p = psave;
          for (i = 1; i < len; i++) {
            r = fetch_token_in_cc(tok, &p, end, env);
          }
          fetched = 0;
        }

        if (i == 1) {
          v = (OnigCodePoint )buf[0];
          goto raw_single;
        }
        else {
          v = ONIGENC_MBC_TO_CODE(env->enc, buf, bufe);
          in_type = CCV_CODE_POINT;
        }
      }
      else {
        v = (OnigCodePoint )tok->u.c;
      raw_single:
        in_type = CCV_SB;
      }
      in_israw = 1;
      goto val_entry2;
      break;

    case TK_CODE_POINT:
      v = tok->u.code;
      in_israw = 1;
    val_entry:
      len = ONIGENC_CODE_TO_MBCLEN(env->enc, v);
      if (len < 0) {
        r = len;
        goto err;
      }
      in_type = (len == 1 ? CCV_SB : CCV_CODE_POINT);
    val_entry2:
      r = next_state_val(cc, &vs, v, &val_israw, in_israw, in_type, &val_type,
                         &state, env);
      if (r != 0) goto err;
      break;

    case TK_POSIX_BRACKET_OPEN:
      r = parse_posix_bracket(cc, &p, end, env);
      if (r < 0) goto err;
      if (r == 1) {  /* is not POSIX bracket */
        CC_ESC_WARN(env, (UChar* )"[");
        p = tok->backp;
        v = (OnigCodePoint )tok->u.c;
        in_israw = 0;
        goto val_entry;
      }
      goto next_class;
      break;

    case TK_CHAR_TYPE:
      r = add_ctype_to_cc(cc, tok->u.prop.ctype, tok->u.prop.not, 0, env);
      /* Leave through err (not a bare return) so the scratch class used
         after "&&" is released. */
      if (r != 0) goto err;

    next_class:
      r = next_state_class(cc, &vs, &val_type, &state, env);
      if (r != 0) goto err;
      break;

    case TK_CHAR_PROPERTY:
      {
        int ctype;

        ctype = fetch_char_property_to_ctype(&p, end, env);
        if (ctype < 0) {
          r = ctype;
          goto err;
        }
        r = add_ctype_to_cc(cc, ctype, tok->u.prop.not, 1, env);
        if (r != 0) goto err;
        goto next_class;
      }
      break;

    case TK_CC_RANGE:
      if (state == CCS_VALUE) {
        r = fetch_token_in_cc(tok, &p, end, env);
        if (r < 0) goto err;
        fetched = 1;
        if (r == TK_CC_CLOSE) { /* allow [x-] */
        range_end_val:
          v = (OnigCodePoint )'-';
          in_israw = 0;
          goto val_entry;
        }
        else if (r == TK_CC_AND) {
          CC_ESC_WARN(env, (UChar* )"-");
          goto range_end_val;
        }
        state = CCS_RANGE;
      }
      else if (state == CCS_START) {
        /* [-xa] is allowed */
        v = (OnigCodePoint )tok->u.c;
        in_israw = 0;

        r = fetch_token_in_cc(tok, &p, end, env);
        if (r < 0) goto err;
        fetched = 1;
        /* [--x] or [a&&-x] is warned. */
        if (r == TK_CC_RANGE || and_start != 0)
          CC_ESC_WARN(env, (UChar* )"-");

        goto val_entry;
      }
      else if (state == CCS_RANGE) {
        CC_ESC_WARN(env, (UChar* )"-");
        goto sb_char;  /* [!--x] is allowed */
      }
      else { /* CCS_COMPLETE */
        r = fetch_token_in_cc(tok, &p, end, env);
        if (r < 0) goto err;
        fetched = 1;
        if (r == TK_CC_CLOSE) goto range_end_val; /* allow [a-b-] */
        else if (r == TK_CC_AND) {
          CC_ESC_WARN(env, (UChar* )"-");
          goto range_end_val;
        }

        if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_DOUBLE_RANGE_OP_IN_CC)) {
          CC_ESC_WARN(env, (UChar* )"-");
          goto range_end_val;   /* [0-9-a] is allowed as [0-9\-a] */
        }
        r = ONIGERR_UNMATCHED_RANGE_SPECIFIER_IN_CHAR_CLASS;
        goto err;
      }
      break;

    case TK_CC_CC_OPEN: /* [ */
      {
        Node *anode;
        CClassNode* acc;

        r = parse_char_class(&anode, tok, &p, end, env);
        if (r == 0) {
          acc = NCCLASS(anode);
          r = or_cclass(cc, acc, env);
        }
        onig_node_free(anode);
        if (r != 0) goto err;
      }
      break;

    case TK_CC_AND: /* && */
      {
        if (state == CCS_VALUE) {
          /* Flush the pending value before starting the next operand. */
          r = next_state_val(cc, &vs, 0, &val_israw, 0, val_type,
                             &val_type, &state, env);
          if (r != 0) goto err;
        }
        /* initialize local variables */
        and_start = 1;
        state = CCS_START;

        if (IS_NOT_NULL(prev_cc)) {
          r = and_cclass(prev_cc, cc, env);
          if (r != 0) goto err;
          bbuf_free(cc->mbuf);
        }
        else {
          /* First "&&": keep the node's class as the accumulated result
             and collect the following operands in the scratch class. */
          prev_cc = cc;
          cc = &work_cc;
        }
        initialize_cclass(cc);
      }
      break;

    case TK_EOT:
      r = ONIGERR_PREMATURE_END_OF_CHAR_CLASS;
      goto err;
      break;
    default:
      r = ONIGERR_PARSER_BUG;
      goto err;
      break;
    }

    if (fetched)
      r = tok->type;
    else {
      r = fetch_token_in_cc(tok, &p, end, env);
      if (r < 0) goto err;
    }
  }

  if (state == CCS_VALUE) {
    r = next_state_val(cc, &vs, 0, &val_israw, 0, val_type,
                       &val_type, &state, env);
    if (r != 0) goto err;
  }

  if (IS_NOT_NULL(prev_cc)) {
    r = and_cclass(prev_cc, cc, env);
    if (r != 0) goto err;
    bbuf_free(cc->mbuf);
    cc = prev_cc;
  }

  if (neg != 0)
    NCCLASS_SET_NOT(cc);
  else
    NCCLASS_CLEAR_NOT(cc);
  if (IS_NCCLASS_NOT(cc) &&
      IS_SYNTAX_BV(env->syntax, ONIG_SYN_NOT_NEWLINE_IN_NEGATIVE_CC)) {
    int is_empty;

    is_empty = (IS_NULL(cc->mbuf) ? 1 : 0);
    if (is_empty != 0)
      BITSET_IS_EMPTY(cc->bs, is_empty);

    if (is_empty == 0) {
#define NEWLINE_CODE    0x0a

      if (ONIGENC_IS_CODE_NEWLINE(env->enc, NEWLINE_CODE)) {
        if (ONIGENC_CODE_TO_MBCLEN(env->enc, NEWLINE_CODE) == 1)
          BITSET_SET_BIT_CHKDUP(cc->bs, NEWLINE_CODE);
        else {
          r = add_code_range(&(cc->mbuf), env, NEWLINE_CODE, NEWLINE_CODE);
          if (r < 0) goto err;
        }
      }
    }
  }
  *src = p;
  return 0;

 err:
  /* cc differs from the node's class only while it points at work_cc. */
  if (cc != NCCLASS(*np))
    bbuf_free(cc->mbuf);
  return r;
}

static int parse_subexp(Node** top, OnigToken* tok, int term,
                        UChar** src, UChar* end, ScanEnv* env);

04832 static int 04833 parse_enclose(Node** np, OnigToken* tok, int term, UChar** src, UChar* end, 04834 ScanEnv* env) 04835 { 04836 int r = 0, num; 04837 Node *target, *work1 = NULL, *work2 = NULL; 04838 OnigOptionType option; 04839 OnigCodePoint c; 04840 OnigEncoding enc = env->enc; 04841 04842 #ifdef USE_NAMED_GROUP 04843 int list_capture; 04844 #endif 04845 04846 UChar* p = *src; 04847 PFETCH_READY; 04848 04849 *np = NULL; 04850 if (PEND) return ONIGERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS; 04851 04852 option = env->option; 04853 if (PPEEK_IS('?') && 04854 IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_GROUP_EFFECT)) { 04855 PINC; 04856
if (PEND) return ONIGERR_END_PATTERN_IN_GROUP; 04857 04858 PFETCH(c); 04859 switch (c) { 04860 case ':': /* (?:...) grouping only */ 04861 group: 04862 r = fetch_token(tok, &p, end, env); 04863 if (r < 0) return r; 04864 r = parse_subexp(np, tok, term, &p, end, env); 04865 if (r < 0) return r; 04866 *src = p; 04867 return 1; /* group */ 04868 break; 04869 04870 case '=': 04871 *np = onig_node_new_anchor(ANCHOR_PREC_READ); 04872 break; 04873 case '!': /* preceding read */ 04874 *np = onig_node_new_anchor(ANCHOR_PREC_READ_NOT); 04875 break; 04876 case '>': /* (?>...) stop backtrack */ 04877 *np = node_new_enclose(ENCLOSE_STOP_BACKTRACK); 04878 break; 04879 04880 #ifdef USE_NAMED_GROUP 04881 case '\'': 04882 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP)) { 04883 goto named_group1; 04884 } 04885 else 04886 return ONIGERR_UNDEFINED_GROUP_OPTION; 04887 break; 04888 04889 #ifdef USE_CAPITAL_P_NAMED_GROUP 04890 case 'P': /* (?P<name>...) */ 04891 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_CAPITAL_P_NAMED_GROUP)) { 04892 PFETCH(c); 04893 if (c == '<') goto named_group1; 04894 } 04895 return ONIGERR_UNDEFINED_GROUP_OPTION; 04896 break; 04897 #endif 04898 #endif 04899 04900 case '<': /* look behind (?<=...), (?<!...) */ 04901 PFETCH(c); 04902 if (c == '=') 04903 *np = onig_node_new_anchor(ANCHOR_LOOK_BEHIND); 04904 else if (c == '!') 04905 *np = onig_node_new_anchor(ANCHOR_LOOK_BEHIND_NOT); 04906 #ifdef USE_NAMED_GROUP 04907 else { /* (?<name>...) 
*/ 04908 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP)) { 04909 UChar *name; 04910 UChar *name_end; 04911 04912 PUNFETCH; 04913 c = '<'; 04914 04915 named_group1: 04916 list_capture = 0; 04917 04918 named_group2: 04919 name = p; 04920 r = fetch_name((OnigCodePoint )c, &p, end, &name_end, env, &num, 0); 04921 if (r < 0) return r; 04922 04923 num = scan_env_add_mem_entry(env); 04924 if (num < 0) return num; 04925 if (list_capture != 0 && num >= (int )BIT_STATUS_BITS_NUM) 04926 return ONIGERR_GROUP_NUMBER_OVER_FOR_CAPTURE_HISTORY; 04927 04928 r = name_add(env->reg, name, name_end, num, env); 04929 if (r != 0) return r; 04930 *np = node_new_enclose_memory(env->option, 1); 04931 CHECK_NULL_RETURN_MEMERR(*np); 04932 NENCLOSE(*np)->regnum = num; 04933 if (list_capture != 0) 04934 BIT_STATUS_ON_AT_SIMPLE(env->capture_history, num); 04935 env->num_named++; 04936 } 04937 else { 04938 return ONIGERR_UNDEFINED_GROUP_OPTION; 04939 } 04940 } 04941 #else 04942 else { 04943 return ONIGERR_UNDEFINED_GROUP_OPTION; 04944 } 04945 #endif 04946 break; 04947 04948 case '@': 04949 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ATMARK_CAPTURE_HISTORY)) { 04950 #ifdef USE_NAMED_GROUP 04951 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP)) { 04952 PFETCH(c); 04953 if (c == '<' || c == '\'') { 04954 list_capture = 1; 04955 goto named_group2; /* (?@<name>...) 
*/ 04956 } 04957 PUNFETCH; 04958 } 04959 #endif 04960 *np = node_new_enclose_memory(env->option, 0); 04961 CHECK_NULL_RETURN_MEMERR(*np); 04962 num = scan_env_add_mem_entry(env); 04963 if (num < 0) { 04964 onig_node_free(*np); 04965 return num; 04966 } 04967 else if (num >= (int )BIT_STATUS_BITS_NUM) { 04968 onig_node_free(*np); 04969 return ONIGERR_GROUP_NUMBER_OVER_FOR_CAPTURE_HISTORY; 04970 } 04971 NENCLOSE(*np)->regnum = num; 04972 BIT_STATUS_ON_AT_SIMPLE(env->capture_history, num); 04973 } 04974 else { 04975 return ONIGERR_UNDEFINED_GROUP_OPTION; 04976 } 04977 break; 04978 04979 case '(': /* conditional expression: (?(cond)yes), (?(cond)yes|no) */ 04980 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LPAREN_CONDITION)) { 04981 UChar *name = NULL; 04982 UChar *name_end; 04983 PFETCH(c); 04984 if (ONIGENC_IS_CODE_DIGIT(enc, c)) { /* (n) */ 04985 PUNFETCH; 04986 r = fetch_name((OnigCodePoint )'(', &p, end, &name_end, env, &num, 1); 04987 if (r < 0) return r; 04988 if (num < 0) { 04989 num = BACKREF_REL_TO_ABS(num, env); 04990 if (num <= 0) 04991 return ONIGERR_INVALID_BACKREF; 04992 } 04993 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_STRICT_CHECK_BACKREF)) { 04994 if (num > env->num_mem || 04995 IS_NULL(SCANENV_MEM_NODES(env)[num])) 04996 return ONIGERR_INVALID_BACKREF; 04997 } 04998 } 04999 #ifdef USE_NAMED_GROUP 05000 else if (c == '<' || c == '\'') { /* (<name>), ('name') */ 05001 int nums; 05002 int *backs; 05003 05004 name = p; 05005 r = fetch_name((OnigCodePoint )c, &p, end, &name_end, env, &num, 0); 05006 if (r < 0) return r; 05007 PFETCH(c); 05008 if (c != ')') return ONIGERR_UNDEFINED_GROUP_OPTION; 05009 05010 nums = onig_name_to_group_numbers(env->reg, name, name_end, &backs); 05011 if (nums <= 0) { 05012 onig_scan_env_set_error_string(env, 05013 ONIGERR_UNDEFINED_NAME_REFERENCE, name, name_end); 05014 return ONIGERR_UNDEFINED_NAME_REFERENCE; 05015 } 05016 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_STRICT_CHECK_BACKREF)) { 05017 int i; 05018 for (i = 0; i < 
nums; i++) { 05019 if (backs[i] > env->num_mem || 05020 IS_NULL(SCANENV_MEM_NODES(env)[backs[i]])) 05021 return ONIGERR_INVALID_BACKREF; 05022 } 05023 } 05024 num = backs[0]; /* XXX: use left most named group as Perl */ 05025 } 05026 #endif 05027 else 05028 return ONIGERR_INVALID_CONDITION_PATTERN; 05029 *np = node_new_enclose(ENCLOSE_CONDITION); 05030 CHECK_NULL_RETURN_MEMERR(*np); 05031 NENCLOSE(*np)->regnum = num; 05032 if (IS_NOT_NULL(name)) NENCLOSE(*np)->state |= NST_NAME_REF; 05033 } 05034 else 05035 return ONIGERR_UNDEFINED_GROUP_OPTION; 05036 break; 05037 05038 #if 0 05039 case '|': /* branch reset: (?|...) */ 05040 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_VBAR_BRANCH_RESET)) { 05041 /* TODO */ 05042 } 05043 else 05044 return ONIGERR_UNDEFINED_GROUP_OPTION; 05045 break; 05046 #endif 05047 05048 case '^': /* loads default options */ 05049 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL)) { 05050 /* d-imsx */ 05051 ONOFF(option, ONIG_OPTION_ASCII_RANGE, 1); 05052 ONOFF(option, ONIG_OPTION_IGNORECASE, 1); 05053 ONOFF(option, ONIG_OPTION_SINGLELINE, 0); 05054 ONOFF(option, ONIG_OPTION_MULTILINE, 1); 05055 ONOFF(option, ONIG_OPTION_EXTEND, 1); 05056 PFETCH(c); 05057 } 05058 #if 0 05059 else if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_RUBY)) { 05060 /* d-imx */ 05061 ONOFF(option, ONIG_OPTION_ASCII_RANGE, 0); 05062 ONOFF(option, ONIG_OPTION_POSIX_BRACKET_ALL_RANGE, 0); 05063 ONOFF(option, ONIG_OPTION_WORD_BOUND_ALL_RANGE, 0); 05064 ONOFF(option, ONIG_OPTION_IGNORECASE, 1); 05065 ONOFF(option, ONIG_OPTION_MULTILINE, 1); 05066 ONOFF(option, ONIG_OPTION_EXTEND, 1); 05067 PFETCH(c); 05068 } 05069 #endif 05070 else { 05071 return ONIGERR_UNDEFINED_GROUP_OPTION; 05072 } 05073 /* fall through */ 05074 #ifdef USE_POSIXLINE_OPTION 05075 case 'p': 05076 #endif 05077 case '-': case 'i': case 'm': case 's': case 'x': 05078 case 'a': case 'd': case 'l': case 'u': 05079 { 05080 int neg = 0; 05081 05082 while (1) { 05083 switch (c) { 05084 case ':': 
05085 case ')': 05086 break; 05087 05088 case '-': neg = 1; break; 05089 case 'x': ONOFF(option, ONIG_OPTION_EXTEND, neg); break; 05090 case 'i': ONOFF(option, ONIG_OPTION_IGNORECASE, neg); break; 05091 case 's': 05092 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL)) { 05093 ONOFF(option, ONIG_OPTION_MULTILINE, neg); 05094 } 05095 else 05096 return ONIGERR_UNDEFINED_GROUP_OPTION; 05097 break; 05098 05099 case 'm': 05100 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL)) { 05101 ONOFF(option, ONIG_OPTION_SINGLELINE, (neg == 0 ? 1 : 0)); 05102 } 05103 else if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_RUBY)) { 05104 ONOFF(option, ONIG_OPTION_MULTILINE, neg); 05105 } 05106 else 05107 return ONIGERR_UNDEFINED_GROUP_OPTION; 05108 break; 05109 #ifdef USE_POSIXLINE_OPTION 05110 case 'p': 05111 ONOFF(option, ONIG_OPTION_MULTILINE|ONIG_OPTION_SINGLELINE, neg); 05112 break; 05113 #endif 05114 05115 case 'a': /* limits \d, \s, \w and POSIX brackets to ASCII range */ 05116 if ((IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL) || 05117 IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_RUBY)) && 05118 (neg == 0)) { 05119 ONOFF(option, ONIG_OPTION_ASCII_RANGE, 0); 05120 ONOFF(option, ONIG_OPTION_POSIX_BRACKET_ALL_RANGE, 1); 05121 ONOFF(option, ONIG_OPTION_WORD_BOUND_ALL_RANGE, 1); 05122 } 05123 else 05124 return ONIGERR_UNDEFINED_GROUP_OPTION; 05125 break; 05126 05127 case 'u': 05128 if ((IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL) || 05129 IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_RUBY)) && 05130 (neg == 0)) { 05131 ONOFF(option, ONIG_OPTION_ASCII_RANGE, 1); 05132 ONOFF(option, ONIG_OPTION_POSIX_BRACKET_ALL_RANGE, 1); 05133 ONOFF(option, ONIG_OPTION_WORD_BOUND_ALL_RANGE, 1); 05134 } 05135 else 05136 return ONIGERR_UNDEFINED_GROUP_OPTION; 05137 break; 05138 05139 case 'd': 05140 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL) && 05141 (neg == 0)) { 05142 ONOFF(option, ONIG_OPTION_ASCII_RANGE, 1); 05143 } 05144 else if 
(IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_RUBY) && 05145 (neg == 0)) { 05146 ONOFF(option, ONIG_OPTION_ASCII_RANGE, 0); 05147 ONOFF(option, ONIG_OPTION_POSIX_BRACKET_ALL_RANGE, 0); 05148 ONOFF(option, ONIG_OPTION_WORD_BOUND_ALL_RANGE, 0); 05149 } 05150 else 05151 return ONIGERR_UNDEFINED_GROUP_OPTION; 05152 break; 05153 05154 case 'l': 05155 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL) && (neg == 0)) { 05156 ONOFF(option, ONIG_OPTION_ASCII_RANGE, 1); 05157 } 05158 else 05159 return ONIGERR_UNDEFINED_GROUP_OPTION; 05160 break; 05161 05162 default: 05163 return ONIGERR_UNDEFINED_GROUP_OPTION; 05164 } 05165 05166 if (c == ')') { 05167 *np = node_new_option(option); 05168 CHECK_NULL_RETURN_MEMERR(*np); 05169 *src = p; 05170 return 2; /* option only */ 05171 } 05172 else if (c == ':') { 05173 OnigOptionType prev = env->option; 05174 05175 env->option = option; 05176 r = fetch_token(tok, &p, end, env); 05177 if (r < 0) return r; 05178 r = parse_subexp(&target, tok, term, &p, end, env); 05179 env->option = prev; 05180 if (r < 0) return r; 05181 *np = node_new_option(option); 05182 CHECK_NULL_RETURN_MEMERR(*np); 05183 NENCLOSE(*np)->target = target; 05184 *src = p; 05185 return 0; 05186 } 05187 05188 if (PEND) return ONIGERR_END_PATTERN_IN_GROUP; 05189 PFETCH(c); 05190 } 05191 } 05192 break; 05193 05194 default: 05195 return ONIGERR_UNDEFINED_GROUP_OPTION; 05196 } 05197 } 05198 else { 05199 if (ONIG_IS_OPTION_ON(env->option, ONIG_OPTION_DONT_CAPTURE_GROUP)) 05200 goto group; 05201 05202 *np = node_new_enclose_memory(env->option, 0); 05203 CHECK_NULL_RETURN_MEMERR(*np); 05204 num = scan_env_add_mem_entry(env); 05205 if (num < 0) return num; 05206 NENCLOSE(*np)->regnum = num; 05207 } 05208 05209 CHECK_NULL_RETURN_MEMERR(*np); 05210 r = fetch_token(tok, &p, end, env); 05211 if (r < 0) return r; 05212 r = parse_subexp(&target, tok, term, &p, end, env); 05213 if (r < 0) { 05214 onig_node_free(target); 05215 return r; 05216 } 05217 05218 if (NTYPE(*np) == 
NT_ANCHOR) 05219 NANCHOR(*np)->target = target; 05220 else { 05221 NENCLOSE(*np)->target = target; 05222 if (NENCLOSE(*np)->type == ENCLOSE_MEMORY) { 05223 /* Don't move this to previous of parse_subexp() */ 05224 r = scan_env_set_mem_node(env, NENCLOSE(*np)->regnum, *np); 05225 if (r != 0) return r; 05226 } 05227 else if (NENCLOSE(*np)->type == ENCLOSE_CONDITION) { 05228 if (NTYPE(target) != NT_ALT) { 05229 /* convert (?(cond)yes) to (?(cond)yes|empty) */ 05230 work1 = node_new_empty(); 05231 if (IS_NULL(work1)) goto err; 05232 work2 = onig_node_new_alt(work1, NULL_NODE); 05233 if (IS_NULL(work2)) goto err; 05234 work1 = onig_node_new_alt(target, work2); 05235 if (IS_NULL(work1)) goto err; 05236 NENCLOSE(*np)->target = work1; 05237 } 05238 } 05239 } 05240 05241 *src = p; 05242 return 0; 05243 05244 err: 05245 onig_node_free(work1); 05246 onig_node_free(work2); 05247 onig_node_free(*np); 05248 *np = NULL; 05249 return ONIGERR_MEMORY; 05250 } 05251 05252 static const char* const PopularQStr[] = { 05253 "?", "*", "+", "??", "*?", "+?" 05254 }; 05255 05256 static const char* const ReduceQStr[] = { 05257 "", "", "*", "*?", "??", "+ and ??", "+? and ?" 05258 }; 05259 05260 static int 05261 set_quantifier(Node* qnode, Node* target, int group, ScanEnv* env) 05262 { 05263 QtfrNode* qn; 05264 05265 qn = NQTFR(qnode); 05266 if (qn->lower == 1 && qn->upper == 1) { 05267 return 1; 05268 } 05269 05270 switch (NTYPE(target)) { 05271 case NT_STR: 05272 if (! group) { 05273 StrNode* sn = NSTR(target); 05274 if (str_node_can_be_split(sn, env->enc)) { 05275 Node* n = str_node_split_last_char(sn, env->enc); 05276 if (IS_NOT_NULL(n)) { 05277 qn->target = n; 05278 return 2; 05279 } 05280 } 05281 } 05282 break; 05283 05284 case NT_QTFR: 05285 { /* check redundant double repeat. */ 05286 /* verbose warn (?:.?)? etc... but not warn (.?)? etc... 
*/ 05287 QtfrNode* qnt = NQTFR(target); 05288 int nestq_num = popular_quantifier_num(qn); 05289 int targetq_num = popular_quantifier_num(qnt); 05290 05291 #ifdef USE_WARNING_REDUNDANT_NESTED_REPEAT_OPERATOR 05292 if (!IS_QUANTIFIER_BY_NUMBER(qn) && !IS_QUANTIFIER_BY_NUMBER(qnt) && 05293 IS_SYNTAX_BV(env->syntax, ONIG_SYN_WARN_REDUNDANT_NESTED_REPEAT)) { 05294 UChar buf[WARN_BUFSIZE]; 05295 05296 switch (ReduceTypeTable[targetq_num][nestq_num]) { 05297 case RQ_ASIS: 05298 break; 05299 05300 case RQ_DEL: 05301 if (onig_verb_warn != onig_null_warn) { 05302 onig_snprintf_with_pattern(buf, WARN_BUFSIZE, env->enc, 05303 env->pattern, env->pattern_end, 05304 (UChar* )"redundant nested repeat operator"); 05305 (*onig_verb_warn)((char* )buf); 05306 } 05307 goto warn_exit; 05308 break; 05309 05310 default: 05311 if (onig_verb_warn != onig_null_warn) { 05312 onig_snprintf_with_pattern(buf, WARN_BUFSIZE, env->enc, 05313 env->pattern, env->pattern_end, 05314 (UChar* )"nested repeat operator %s and %s was replaced with '%s'", 05315 PopularQStr[targetq_num], PopularQStr[nestq_num], 05316 ReduceQStr[ReduceTypeTable[targetq_num][nestq_num]]); 05317 (*onig_verb_warn)((char* )buf); 05318 } 05319 goto warn_exit; 05320 break; 05321 } 05322 } 05323 05324 warn_exit: 05325 #endif 05326 if (targetq_num >= 0) { 05327 if (nestq_num >= 0) { 05328 onig_reduce_nested_quantifier(qnode, target); 05329 goto q_exit; 05330 } 05331 else if (targetq_num == 1 || targetq_num == 2) { /* * or + */ 05332 /* (?:a*){n,m}, (?:a+){n,m} => (?:a*){n,n}, (?:a+){n,n} */ 05333 if (! IS_REPEAT_INFINITE(qn->upper) && qn->upper > 1 && qn->greedy) { 05334 qn->upper = (qn->lower == 0 ? 
#ifdef USE_SHARED_CCLASS_TABLE

#define THRESHOLD_RANGE_NUM_FOR_SHARE_CCLASS     8

/* for ctype node hash table */

/* Key of the shared character-class table: encoding, negation flag, ctype. */
typedef struct {
  OnigEncoding enc;
  int not;
  int type;
} type_cclass_key;

/* st comparison callback: 0 when all three key fields are equal, else 1. */
static int type_cclass_cmp(type_cclass_key* x, type_cclass_key* y)
{
  if (x->type != y->type) return 1;
  if (x->enc  != y->enc)  return 1;
  if (x->not  != y->not)  return 1;
  return 0;
}

/*
 * st hash callback: folds the bytes of `enc` and `type`, then adds `not`.
 *
 * The accumulator is unsigned so that the repeated multiplication wraps
 * with defined behaviour; with a signed int the overflow was undefined.
 * The value is only ever compared with other values produced by this
 * same function, so the exact numbers are not part of any interface.
 */
static st_index_t type_cclass_hash(type_cclass_key* key)
{
  int i;
  unsigned int val;
  const UChar *p;

  val = 0;

  p = (const UChar* )&(key->enc);
  for (i = 0; i < (int )sizeof(key->enc); i++) {
    val = val * 997u + (unsigned int )*p++;
  }

  p = (const UChar* )(&key->type);
  for (i = 0; i < (int )sizeof(key->type); i++) {
    val = val * 997u + (unsigned int )*p++;
  }

  val += (unsigned int )key->not;
  return (st_index_t )(val + (val >> 5));
}

static const struct st_hash_type type_type_cclass_hash = {
    type_cclass_cmp,
    type_cclass_hash,
};

/* Lazily created table of shared ctype character-class nodes. */
static st_table* OnigTypeCClassTable;


/*
 * onig_st_foreach() callback that releases one table entry (node and key)
 * and asks the table to drop it.
 */
static int
i_free_shared_class(type_cclass_key* key, Node* node, void* arg ARG_UNUSED)
{
  if (IS_NOT_NULL(node)) {
    CClassNode* cc = NCCLASS(node);
    /* Release the multibyte range buffer through bbuf_free(), as the rest
       of this file does, instead of a bare xfree() of the BBuf itself.
       NOTE(review): presumably bbuf_free() also releases the buffer's
       storage and tolerates NULL -- confirm against its definition. */
    if (IS_NOT_NULL(cc->mbuf)) bbuf_free(cc->mbuf);
    xfree(node);
  }

  if (IS_NOT_NULL(key)) xfree(key);
  return ST_DELETE;
}

/* Free every shared character-class node and the table itself.
   Always returns 0. */
extern int
onig_free_shared_cclass_table(void)
{
  THREAD_ATOMIC_START;
  if (IS_NOT_NULL(OnigTypeCClassTable)) {
    onig_st_foreach(OnigTypeCClassTable, i_free_shared_class, 0);
    onig_st_free_table(OnigTypeCClassTable);
    OnigTypeCClassTable = NULL;
  }
  THREAD_ATOMIC_END;

  return 0;
}

#endif /* USE_SHARED_CCLASS_TABLE */
return 0; 05425 } 05426 05427 #endif /* USE_SHARED_CCLASS_TABLE */ 05428 05429 05430 #ifndef CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS 05431 static int 05432 clear_not_flag_cclass(CClassNode* cc, OnigEncoding enc) 05433 { 05434 BBuf *tbuf; 05435 int r; 05436 05437 if (IS_NCCLASS_NOT(cc)) { 05438 bitset_invert(cc->bs); 05439 05440 if (! ONIGENC_IS_SINGLEBYTE(enc)) { 05441 r = not_code_range_buf(enc, cc->mbuf, &tbuf); 05442 if (r != 0) return r; 05443 05444 bbuf_free(cc->mbuf); 05445 cc->mbuf = tbuf; 05446 } 05447 05448 NCCLASS_CLEAR_NOT(cc); 05449 } 05450 05451 return 0; 05452 } 05453 #endif /* CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS */ 05454 05455 typedef struct { 05456 ScanEnv* env; 05457 CClassNode* cc; 05458 Node* alt_root; 05459 Node** ptail; 05460 } IApplyCaseFoldArg; 05461 05462 static int 05463 i_apply_case_fold(OnigCodePoint from, OnigCodePoint to[], 05464 int to_len, void* arg) 05465 { 05466 IApplyCaseFoldArg* iarg; 05467 ScanEnv* env; 05468 CClassNode* cc; 05469 BitSetRef bs; 05470 05471 iarg = (IApplyCaseFoldArg* )arg; 05472 env = iarg->env; 05473 cc = iarg->cc; 05474 bs = cc->bs; 05475 05476 if (to_len == 1) { 05477 int is_in = onig_is_code_in_cc(env->enc, from, cc); 05478 #ifdef CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS 05479 if ((is_in != 0 && !IS_NCCLASS_NOT(cc)) || 05480 (is_in == 0 && IS_NCCLASS_NOT(cc))) { 05481 if (ONIGENC_MBC_MINLEN(env->enc) > 1 || *to >= SINGLE_BYTE_SIZE) { 05482 add_code_range0(&(cc->mbuf), env, *to, *to, 0); 05483 } 05484 else { 05485 BITSET_SET_BIT(bs, *to); 05486 } 05487 } 05488 #else 05489 if (is_in != 0) { 05490 if (ONIGENC_MBC_MINLEN(env->enc) > 1 || *to >= SINGLE_BYTE_SIZE) { 05491 if (IS_NCCLASS_NOT(cc)) clear_not_flag_cclass(cc, env->enc); 05492 add_code_range0(&(cc->mbuf), env, *to, *to, 0); 05493 } 05494 else { 05495 if (IS_NCCLASS_NOT(cc)) { 05496 BITSET_CLEAR_BIT(bs, *to); 05497 } 05498 else 05499 BITSET_SET_BIT(bs, *to); 05500 } 05501 } 05502 #endif /* CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS */ 05503 
} 05504 else { 05505 int r, i, len; 05506 UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN]; 05507 Node *snode = NULL_NODE; 05508 05509 if (onig_is_code_in_cc(env->enc, from, cc) 05510 #ifdef CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS 05511 && !IS_NCCLASS_NOT(cc) 05512 #endif 05513 ) { 05514 for (i = 0; i < to_len; i++) { 05515 len = ONIGENC_CODE_TO_MBC(env->enc, to[i], buf); 05516 if (i == 0) { 05517 snode = onig_node_new_str(buf, buf + len); 05518 CHECK_NULL_RETURN_MEMERR(snode); 05519 05520 /* char-class expanded multi-char only 05521 compare with string folded at match time. */ 05522 NSTRING_SET_AMBIG(snode); 05523 } 05524 else { 05525 r = onig_node_str_cat(snode, buf, buf + len); 05526 if (r < 0) { 05527 onig_node_free(snode); 05528 return r; 05529 } 05530 } 05531 } 05532 05533 *(iarg->ptail) = onig_node_new_alt(snode, NULL_NODE); 05534 CHECK_NULL_RETURN_MEMERR(*(iarg->ptail)); 05535 iarg->ptail = &(NCDR((*(iarg->ptail)))); 05536 } 05537 } 05538 05539 return 0; 05540 } 05541 05542 static int 05543 node_linebreak(Node** np, ScanEnv* env) 05544 { 05545 /* same as (?>\x0D\x0A|[\x0A-\x0D\x{85}\x{2028}\x{2029}]) */ 05546 Node* left = NULL; 05547 Node* right = NULL; 05548 Node* target1 = NULL; 05549 Node* target2 = NULL; 05550 CClassNode* cc; 05551 int num1, num2; 05552 UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN * 2]; 05553 05554 /* \x0D\x0A */ 05555 num1 = ONIGENC_CODE_TO_MBC(env->enc, 0x0D, buf); 05556 if (num1 < 0) return num1; 05557 num2 = ONIGENC_CODE_TO_MBC(env->enc, 0x0A, buf + num1); 05558 if (num2 < 0) return num2; 05559 left = node_new_str_raw(buf, buf + num1 + num2); 05560 if (IS_NULL(left)) goto err; 05561 05562 /* [\x0A-\x0D] or [\x0A-\x0D\x{85}\x{2028}\x{2029}] */ 05563 right = node_new_cclass(); 05564 if (IS_NULL(right)) goto err; 05565 cc = NCCLASS(right); 05566 if (ONIGENC_MBC_MINLEN(env->enc) > 1) { 05567 add_code_range(&(cc->mbuf), env, 0x0A, 0x0D); 05568 } 05569 else { 05570 bitset_set_range(env, cc->bs, 0x0A, 0x0D); 05571 } 05572 05573 /* TODO: move this block 
to enc/unicode.c */ 05574 if (ONIGENC_IS_UNICODE(env->enc)) { 05575 /* UTF-8, UTF-16BE/LE, UTF-32BE/LE */ 05576 add_code_range(&(cc->mbuf), env, 0x85, 0x85); 05577 add_code_range(&(cc->mbuf), env, 0x2028, 0x2029); 05578 } 05579 05580 /* ...|... */ 05581 target1 = onig_node_new_alt(right, NULL_NODE); 05582 if (IS_NULL(target1)) goto err; 05583 right = NULL; 05584 target2 = onig_node_new_alt(left, target1); 05585 if (IS_NULL(target2)) goto err; 05586 left = NULL; 05587 target1 = NULL; 05588 05589 /* (?>...) */ 05590 *np = node_new_enclose(ENCLOSE_STOP_BACKTRACK); 05591 if (IS_NULL(*np)) goto err; 05592 NENCLOSE(*np)->target = target2; 05593 return ONIG_NORMAL; 05594 05595 err: 05596 onig_node_free(left); 05597 onig_node_free(right); 05598 onig_node_free(target1); 05599 onig_node_free(target2); 05600 return ONIGERR_MEMORY; 05601 } 05602 05603 static int 05604 node_extended_grapheme_cluster(Node** np, ScanEnv* env) 05605 { 05606 /* same as (?>\P{M}\p{M}*) */ 05607 Node* np1 = NULL; 05608 Node* np2 = NULL; 05609 Node* qn = NULL; 05610 Node* list1 = NULL; 05611 Node* list2 = NULL; 05612 int r = 0; 05613 05614 #ifdef USE_UNICODE_PROPERTIES 05615 if (ONIGENC_IS_UNICODE(env->enc)) { 05616 /* UTF-8, UTF-16BE/LE, UTF-32BE/LE */ 05617 CClassNode* cc1; 05618 CClassNode* cc2; 05619 UChar* propname = (UChar* )"M"; 05620 int ctype = env->enc->property_name_to_ctype(ONIG_ENCODING_ASCII, 05621 propname, propname + 1); 05622 if (ctype >= 0) { 05623 /* \P{M} */ 05624 np1 = node_new_cclass(); 05625 if (IS_NULL(np1)) goto err; 05626 cc1 = NCCLASS(np1); 05627 r = add_ctype_to_cc(cc1, ctype, 0, 1, env); 05628 if (r != 0) goto err; 05629 NCCLASS_SET_NOT(cc1); 05630 05631 /* \p{M}* */ 05632 np2 = node_new_cclass(); 05633 if (IS_NULL(np2)) goto err; 05634 cc2 = NCCLASS(np2); 05635 r = add_ctype_to_cc(cc2, ctype, 0, 1, env); 05636 if (r != 0) goto err; 05637 05638 qn = node_new_quantifier(0, REPEAT_INFINITE, 0); 05639 if (IS_NULL(qn)) goto err; 05640 NQTFR(qn)->target = np2; 05641 np2 = NULL; 
05642 05643 /* \P{M}\p{M}* */ 05644 list2 = node_new_list(qn, NULL_NODE); 05645 if (IS_NULL(list2)) goto err; 05646 qn = NULL; 05647 list1 = node_new_list(np1, list2); 05648 if (IS_NULL(list1)) goto err; 05649 np1 = NULL; 05650 list2 = NULL; 05651 05652 /* (?>...) */ 05653 *np = node_new_enclose(ENCLOSE_STOP_BACKTRACK); 05654 if (IS_NULL(*np)) goto err; 05655 NENCLOSE(*np)->target = list1; 05656 return ONIG_NORMAL; 05657 } 05658 } 05659 #endif /* USE_UNICODE_PROPERTIES */ 05660 if (IS_NULL(*np)) { 05661 /* PerlSyntax: (?s:.), RubySyntax: (?m:.) */ 05662 OnigOptionType option; 05663 np1 = node_new_anychar(); 05664 if (IS_NULL(np1)) goto err; 05665 05666 option = env->option; 05667 ONOFF(option, ONIG_OPTION_MULTILINE, 0); 05668 *np = node_new_option(option); 05669 if (IS_NULL(*np)) goto err; 05670 NENCLOSE(*np)->target = np1; 05671 } 05672 return ONIG_NORMAL; 05673 05674 err: 05675 onig_node_free(np1); 05676 onig_node_free(np2); 05677 onig_node_free(qn); 05678 onig_node_free(list1); 05679 onig_node_free(list2); 05680 return (r == 0) ? 
/*
 * Count the set bits in the low 32 bits of `bits`.
 *
 * Adjacent bit fields are summed pairwise, doubling the field width on each
 * of the five passes (1, 2, 4, 8 and 16 bits).
 */
static int
countbits(unsigned int bits)
{
  static const unsigned int field_mask[] = {
    0x55555555, 0x33333333, 0x0f0f0f0f, 0x00ff00ff, 0x0000ffff
  };
  int pass;
  int width = 1;

  for (pass = 0; pass < 5; pass++) {
    bits = (bits & field_mask[pass]) + ((bits >> width) & field_mask[pass]);
    width *= 2;
  }
  return bits;
}
*/ 05739 return 0; 05740 } 05741 05742 05743 static int 05744 parse_exp(Node** np, OnigToken* tok, int term, 05745 UChar** src, UChar* end, ScanEnv* env) 05746 { 05747 int r, len, group = 0; 05748 Node* qn; 05749 Node** targetp; 05750 05751 *np = NULL; 05752 if (tok->type == (enum TokenSyms )term) 05753 goto end_of_token; 05754 05755 switch (tok->type) { 05756 case TK_ALT: 05757 case TK_EOT: 05758 end_of_token: 05759 *np = node_new_empty(); 05760 return tok->type; 05761 break; 05762 05763 case TK_SUBEXP_OPEN: 05764 r = parse_enclose(np, tok, TK_SUBEXP_CLOSE, src, end, env); 05765 if (r < 0) return r; 05766 if (r == 1) group = 1; 05767 else if (r == 2) { /* option only */ 05768 Node* target; 05769 OnigOptionType prev = env->option; 05770 05771 env->option = NENCLOSE(*np)->option; 05772 r = fetch_token(tok, src, end, env); 05773 if (r < 0) return r; 05774 r = parse_subexp(&target, tok, term, src, end, env); 05775 env->option = prev; 05776 if (r < 0) { 05777 onig_node_free(target); 05778 return r; 05779 } 05780 NENCLOSE(*np)->target = target; 05781 return tok->type; 05782 } 05783 break; 05784 05785 case TK_SUBEXP_CLOSE: 05786 if (! 
IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_UNMATCHED_CLOSE_SUBEXP)) 05787 return ONIGERR_UNMATCHED_CLOSE_PARENTHESIS; 05788 05789 if (tok->escaped) goto tk_raw_byte; 05790 else goto tk_byte; 05791 break; 05792 05793 case TK_LINEBREAK: 05794 r = node_linebreak(np, env); 05795 if (r < 0) return r; 05796 break; 05797 05798 case TK_EXTENDED_GRAPHEME_CLUSTER: 05799 r = node_extended_grapheme_cluster(np, env); 05800 if (r < 0) return r; 05801 break; 05802 05803 case TK_KEEP: 05804 *np = onig_node_new_anchor(ANCHOR_KEEP); 05805 CHECK_NULL_RETURN_MEMERR(*np); 05806 break; 05807 05808 case TK_STRING: 05809 tk_byte: 05810 { 05811 *np = node_new_str(tok->backp, *src); 05812 CHECK_NULL_RETURN_MEMERR(*np); 05813 05814 string_loop: 05815 while (1) { 05816 r = fetch_token(tok, src, end, env); 05817 if (r < 0) return r; 05818 if (r == TK_STRING) { 05819 r = onig_node_str_cat(*np, tok->backp, *src); 05820 } 05821 #ifndef NUMBERED_CHAR_IS_NOT_CASE_AMBIG 05822 else if (r == TK_CODE_POINT) { 05823 r = node_str_cat_codepoint(*np, env->enc, tok->u.code); 05824 } 05825 #endif 05826 else { 05827 break; 05828 } 05829 if (r < 0) return r; 05830 } 05831 05832 string_end: 05833 targetp = np; 05834 goto repeat; 05835 } 05836 break; 05837 05838 case TK_RAW_BYTE: 05839 tk_raw_byte: 05840 { 05841 *np = node_new_str_raw_char((UChar )tok->u.c); 05842 CHECK_NULL_RETURN_MEMERR(*np); 05843 len = 1; 05844 while (1) { 05845 if (len >= ONIGENC_MBC_MINLEN(env->enc)) { 05846 if (len == enclen(env->enc, NSTR(*np)->s, NSTR(*np)->end)) { 05847 r = fetch_token(tok, src, end, env); 05848 NSTRING_CLEAR_RAW(*np); 05849 goto string_end; 05850 } 05851 } 05852 05853 r = fetch_token(tok, src, end, env); 05854 if (r < 0) return r; 05855 if (r != TK_RAW_BYTE) { 05856 /* Don't use this, it is wrong for little endian encodings. 
*/ 05857 #ifdef USE_PAD_TO_SHORT_BYTE_CHAR 05858 int rem; 05859 if (len < ONIGENC_MBC_MINLEN(env->enc)) { 05860 rem = ONIGENC_MBC_MINLEN(env->enc) - len; 05861 (void )node_str_head_pad(NSTR(*np), rem, (UChar )0); 05862 if (len + rem == enclen(env->enc, NSTR(*np)->s)) { 05863 NSTRING_CLEAR_RAW(*np); 05864 goto string_end; 05865 } 05866 } 05867 #endif 05868 return ONIGERR_TOO_SHORT_MULTI_BYTE_STRING; 05869 } 05870 05871 r = node_str_cat_char(*np, (UChar )tok->u.c); 05872 if (r < 0) return r; 05873 05874 len++; 05875 } 05876 } 05877 break; 05878 05879 case TK_CODE_POINT: 05880 { 05881 *np = node_new_empty(); 05882 CHECK_NULL_RETURN_MEMERR(*np); 05883 r = node_str_cat_codepoint(*np, env->enc, tok->u.code); 05884 if (r != 0) return r; 05885 #ifdef NUMBERED_CHAR_IS_NOT_CASE_AMBIG 05886 NSTRING_SET_RAW(*np); 05887 #else 05888 goto string_loop; 05889 #endif 05890 } 05891 break; 05892 05893 case TK_QUOTE_OPEN: 05894 { 05895 OnigCodePoint end_op[2]; 05896 UChar *qstart, *qend, *nextp; 05897 05898 end_op[0] = (OnigCodePoint )MC_ESC(env->syntax); 05899 end_op[1] = (OnigCodePoint )'E'; 05900 qstart = *src; 05901 qend = find_str_position(end_op, 2, qstart, end, &nextp, env->enc); 05902 if (IS_NULL(qend)) { 05903 nextp = qend = end; 05904 } 05905 *np = node_new_str(qstart, qend); 05906 CHECK_NULL_RETURN_MEMERR(*np); 05907 *src = nextp; 05908 } 05909 break; 05910 05911 case TK_CHAR_TYPE: 05912 { 05913 switch (tok->u.prop.ctype) { 05914 case ONIGENC_CTYPE_WORD: 05915 *np = node_new_ctype(tok->u.prop.ctype, tok->u.prop.not, 05916 IS_ASCII_RANGE(env->option)); 05917 CHECK_NULL_RETURN_MEMERR(*np); 05918 break; 05919 05920 case ONIGENC_CTYPE_SPACE: 05921 case ONIGENC_CTYPE_DIGIT: 05922 case ONIGENC_CTYPE_XDIGIT: 05923 { 05924 CClassNode* cc; 05925 05926 #ifdef USE_SHARED_CCLASS_TABLE 05927 const OnigCodePoint *mbr; 05928 OnigCodePoint sb_out; 05929 05930 r = ONIGENC_GET_CTYPE_CODE_RANGE(env->enc, tok->u.prop.ctype, 05931 &sb_out, &mbr); 05932 if (r == 0 && 05933 ! 
IS_ASCII_RANGE(env->option) && 05934 ONIGENC_CODE_RANGE_NUM(mbr) 05935 >= THRESHOLD_RANGE_NUM_FOR_SHARE_CCLASS) { 05936 type_cclass_key key; 05937 type_cclass_key* new_key; 05938 05939 key.enc = env->enc; 05940 key.not = tok->u.prop.not; 05941 key.type = tok->u.prop.ctype; 05942 05943 THREAD_ATOMIC_START; 05944 05945 if (IS_NULL(OnigTypeCClassTable)) { 05946 OnigTypeCClassTable 05947 = onig_st_init_table_with_size(&type_type_cclass_hash, 10); 05948 if (IS_NULL(OnigTypeCClassTable)) { 05949 THREAD_ATOMIC_END; 05950 return ONIGERR_MEMORY; 05951 } 05952 } 05953 else { 05954 if (onig_st_lookup(OnigTypeCClassTable, (st_data_t )&key, 05955 (st_data_t* )np)) { 05956 THREAD_ATOMIC_END; 05957 break; 05958 } 05959 } 05960 05961 *np = node_new_cclass_by_codepoint_range(tok->u.prop.not, 05962 sb_out, mbr); 05963 if (IS_NULL(*np)) { 05964 THREAD_ATOMIC_END; 05965 return ONIGERR_MEMORY; 05966 } 05967 05968 cc = NCCLASS(*np); 05969 NCCLASS_SET_SHARE(cc); 05970 new_key = (type_cclass_key* )xmalloc(sizeof(type_cclass_key)); 05971 xmemcpy(new_key, &key, sizeof(type_cclass_key)); 05972 onig_st_add_direct(OnigTypeCClassTable, (st_data_t )new_key, 05973 (st_data_t )*np); 05974 05975 THREAD_ATOMIC_END; 05976 } 05977 else { 05978 #endif 05979 *np = node_new_cclass(); 05980 CHECK_NULL_RETURN_MEMERR(*np); 05981 cc = NCCLASS(*np); 05982 r = add_ctype_to_cc(cc, tok->u.prop.ctype, 0, 0, env); 05983 if (r != 0) return r; 05984 if (tok->u.prop.not != 0) NCCLASS_SET_NOT(cc); 05985 #ifdef USE_SHARED_CCLASS_TABLE 05986 } 05987 #endif 05988 } 05989 break; 05990 05991 default: 05992 return ONIGERR_PARSER_BUG; 05993 break; 05994 } 05995 } 05996 break; 05997 05998 case TK_CHAR_PROPERTY: 05999 r = parse_char_property(np, tok, src, end, env); 06000 if (r != 0) return r; 06001 break; 06002 06003 case TK_CC_OPEN: 06004 { 06005 CClassNode* cc; 06006 OnigCodePoint code; 06007 06008 r = parse_char_class(np, tok, src, end, env); 06009 if (r != 0) return r; 06010 06011 cc = NCCLASS(*np); 06012 if 
(is_onechar_cclass(cc, &code)) { 06013 onig_node_free(*np); 06014 *np = node_new_empty(); 06015 CHECK_NULL_RETURN_MEMERR(*np); 06016 r = node_str_cat_codepoint(*np, env->enc, code); 06017 if (r != 0) return r; 06018 goto string_loop; 06019 } 06020 if (IS_IGNORECASE(env->option)) { 06021 IApplyCaseFoldArg iarg; 06022 06023 iarg.env = env; 06024 iarg.cc = cc; 06025 iarg.alt_root = NULL_NODE; 06026 iarg.ptail = &(iarg.alt_root); 06027 06028 r = ONIGENC_APPLY_ALL_CASE_FOLD(env->enc, env->case_fold_flag, 06029 i_apply_case_fold, &iarg); 06030 if (r != 0) { 06031 onig_node_free(iarg.alt_root); 06032 return r; 06033 } 06034 if (IS_NOT_NULL(iarg.alt_root)) { 06035 Node* work = onig_node_new_alt(*np, iarg.alt_root); 06036 if (IS_NULL(work)) { 06037 onig_node_free(iarg.alt_root); 06038 return ONIGERR_MEMORY; 06039 } 06040 *np = work; 06041 } 06042 } 06043 } 06044 break; 06045 06046 case TK_ANYCHAR: 06047 *np = node_new_anychar(); 06048 CHECK_NULL_RETURN_MEMERR(*np); 06049 break; 06050 06051 case TK_ANYCHAR_ANYTIME: 06052 *np = node_new_anychar(); 06053 CHECK_NULL_RETURN_MEMERR(*np); 06054 qn = node_new_quantifier(0, REPEAT_INFINITE, 0); 06055 CHECK_NULL_RETURN_MEMERR(qn); 06056 NQTFR(qn)->target = *np; 06057 *np = qn; 06058 break; 06059 06060 case TK_BACKREF: 06061 len = tok->u.backref.num; 06062 *np = node_new_backref(len, 06063 (len > 1 ? 
tok->u.backref.refs : &(tok->u.backref.ref1)), 06064 tok->u.backref.by_name, 06065 #ifdef USE_BACKREF_WITH_LEVEL 06066 tok->u.backref.exist_level, 06067 tok->u.backref.level, 06068 #endif 06069 env); 06070 CHECK_NULL_RETURN_MEMERR(*np); 06071 break; 06072 06073 #ifdef USE_SUBEXP_CALL 06074 case TK_CALL: 06075 { 06076 int gnum = tok->u.call.gnum; 06077 06078 if (gnum < 0 || tok->u.call.rel != 0) { 06079 if (gnum > 0) gnum--; 06080 gnum = BACKREF_REL_TO_ABS(gnum, env); 06081 if (gnum <= 0) 06082 return ONIGERR_INVALID_BACKREF; 06083 } 06084 *np = node_new_call(tok->u.call.name, tok->u.call.name_end, gnum); 06085 CHECK_NULL_RETURN_MEMERR(*np); 06086 env->num_call++; 06087 } 06088 break; 06089 #endif 06090 06091 case TK_ANCHOR: 06092 *np = onig_node_new_anchor(tok->u.anchor.subtype); 06093 CHECK_NULL_RETURN_MEMERR(*np); 06094 NANCHOR(*np)->ascii_range = tok->u.anchor.ascii_range; 06095 break; 06096 06097 case TK_OP_REPEAT: 06098 case TK_INTERVAL: 06099 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_CONTEXT_INDEP_REPEAT_OPS)) { 06100 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_CONTEXT_INVALID_REPEAT_OPS)) 06101 return ONIGERR_TARGET_OF_REPEAT_OPERATOR_NOT_SPECIFIED; 06102 else 06103 *np = node_new_empty(); 06104 } 06105 else { 06106 goto tk_byte; 06107 } 06108 break; 06109 06110 default: 06111 return ONIGERR_PARSER_BUG; 06112 break; 06113 } 06114 06115 { 06116 targetp = np; 06117 06118 re_entry: 06119 r = fetch_token(tok, src, end, env); 06120 if (r < 0) return r; 06121 06122 repeat: 06123 if (r == TK_OP_REPEAT || r == TK_INTERVAL) { 06124 if (is_invalid_quantifier_target(*targetp)) 06125 return ONIGERR_TARGET_OF_REPEAT_OPERATOR_INVALID; 06126 06127 qn = node_new_quantifier(tok->u.repeat.lower, tok->u.repeat.upper, 06128 (r == TK_INTERVAL ? 
1 : 0)); 06129 CHECK_NULL_RETURN_MEMERR(qn); 06130 NQTFR(qn)->greedy = tok->u.repeat.greedy; 06131 r = set_quantifier(qn, *targetp, group, env); 06132 if (r < 0) { 06133 onig_node_free(qn); 06134 return r; 06135 } 06136 06137 if (tok->u.repeat.possessive != 0) { 06138 Node* en; 06139 en = node_new_enclose(ENCLOSE_STOP_BACKTRACK); 06140 if (IS_NULL(en)) { 06141 onig_node_free(qn); 06142 return ONIGERR_MEMORY; 06143 } 06144 NENCLOSE(en)->target = qn; 06145 qn = en; 06146 } 06147 06148 if (r == 0) { 06149 *targetp = qn; 06150 } 06151 else if (r == 1) { 06152 onig_node_free(qn); 06153 } 06154 else if (r == 2) { /* split case: /abc+/ */ 06155 Node *tmp; 06156 06157 *targetp = node_new_list(*targetp, NULL); 06158 if (IS_NULL(*targetp)) { 06159 onig_node_free(qn); 06160 return ONIGERR_MEMORY; 06161 } 06162 tmp = NCDR(*targetp) = node_new_list(qn, NULL); 06163 if (IS_NULL(tmp)) { 06164 onig_node_free(qn); 06165 return ONIGERR_MEMORY; 06166 } 06167 targetp = &(NCAR(tmp)); 06168 } 06169 goto re_entry; 06170 } 06171 } 06172 06173 return r; 06174 } 06175 06176 static int 06177 parse_branch(Node** top, OnigToken* tok, int term, 06178 UChar** src, UChar* end, ScanEnv* env) 06179 { 06180 int r; 06181 Node *node, **headp; 06182 06183 *top = NULL; 06184 r = parse_exp(&node, tok, term, src, end, env); 06185 if (r < 0) { 06186 onig_node_free(node); 06187 return r; 06188 } 06189 06190 if (r == TK_EOT || r == term || r == TK_ALT) { 06191 *top = node; 06192 } 06193 else { 06194 *top = node_new_list(node, NULL); 06195 headp = &(NCDR(*top)); 06196 while (r != TK_EOT && r != term && r != TK_ALT) { 06197 r = parse_exp(&node, tok, term, src, end, env); 06198 if (r < 0) { 06199 onig_node_free(node); 06200 return r; 06201 } 06202 06203 if (NTYPE(node) == NT_LIST) { 06204 *headp = node; 06205 while (IS_NOT_NULL(NCDR(node))) node = NCDR(node); 06206 headp = &(NCDR(node)); 06207 } 06208 else { 06209 *headp = node_new_list(node, NULL); 06210 headp = &(NCDR(*headp)); 06211 } 06212 } 06213 } 06214 
06215 return r; 06216 } 06217 06218 /* term_tok: TK_EOT or TK_SUBEXP_CLOSE */ 06219 static int 06220 parse_subexp(Node** top, OnigToken* tok, int term, 06221 UChar** src, UChar* end, ScanEnv* env) 06222 { 06223 int r; 06224 Node *node, **headp; 06225 06226 *top = NULL; 06227 r = parse_branch(&node, tok, term, src, end, env); 06228 if (r < 0) { 06229 onig_node_free(node); 06230 return r; 06231 } 06232 06233 if (r == term) { 06234 *top = node; 06235 } 06236 else if (r == TK_ALT) { 06237 *top = onig_node_new_alt(node, NULL); 06238 headp = &(NCDR(*top)); 06239 while (r == TK_ALT) { 06240 r = fetch_token(tok, src, end, env); 06241 if (r < 0) return r; 06242 r = parse_branch(&node, tok, term, src, end, env); 06243 if (r < 0) { 06244 onig_node_free(node); 06245 return r; 06246 } 06247 06248 *headp = onig_node_new_alt(node, NULL); 06249 headp = &(NCDR(*headp)); 06250 } 06251 06252 if (tok->type != (enum TokenSyms )term) 06253 goto err; 06254 } 06255 else { 06256 onig_node_free(node); 06257 err: 06258 if (term == TK_SUBEXP_CLOSE) 06259 return ONIGERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS; 06260 else 06261 return ONIGERR_PARSER_BUG; 06262 } 06263 06264 return r; 06265 } 06266 06267 static int 06268 parse_regexp(Node** top, UChar** src, UChar* end, ScanEnv* env) 06269 { 06270 int r; 06271 OnigToken tok; 06272 06273 r = fetch_token(&tok, src, end, env); 06274 if (r < 0) return r; 06275 r = parse_subexp(top, &tok, TK_EOT, src, end, env); 06276 if (r < 0) return r; 06277 06278 #ifdef USE_SUBEXP_CALL 06279 if (env->num_call > 0) { 06280 /* Capture the pattern itself. It is used for (?R), (?0) and \g<0>. 
*/ 06281 const int num = 0; 06282 Node* np; 06283 np = node_new_enclose_memory(env->option, 0); 06284 CHECK_NULL_RETURN_MEMERR(np); 06285 NENCLOSE(np)->regnum = num; 06286 NENCLOSE(np)->target = *top; 06287 r = scan_env_set_mem_node(env, num, np); 06288 if (r != 0) return r; 06289 *top = np; 06290 } 06291 #endif 06292 return 0; 06293 } 06294 06295 extern int 06296 onig_parse_make_tree(Node** root, const UChar* pattern, const UChar* end, 06297 regex_t* reg, ScanEnv* env) 06298 { 06299 int r; 06300 UChar* p; 06301 06302 #ifdef USE_NAMED_GROUP 06303 names_clear(reg); 06304 #endif 06305 06306 scan_env_clear(env); 06307 env->option = reg->options; 06308 env->case_fold_flag = reg->case_fold_flag; 06309 env->enc = reg->enc; 06310 env->syntax = reg->syntax; 06311 env->pattern = (UChar* )pattern; 06312 env->pattern_end = (UChar* )end; 06313 env->reg = reg; 06314 06315 *root = NULL; 06316 p = (UChar* )pattern; 06317 r = parse_regexp(root, &p, (UChar* )end, env); 06318 reg->num_mem = env->num_mem; 06319 return r; 06320 } 06321 06322 extern void 06323 onig_scan_env_set_error_string(ScanEnv* env, int ecode ARG_UNUSED, 06324 UChar* arg, UChar* arg_end) 06325 { 06326 env->error = arg; 06327 env->error_end = arg_end; 06328 } 06329