Ruby
2.0.0p247(2013-06-27revision41674)
|
00001 /********************************************************************** 00002 00003 encoding.c - 00004 00005 $Author: nobu $ 00006 created at: Thu May 24 17:23:27 JST 2007 00007 00008 Copyright (C) 2007 Yukihiro Matsumoto 00009 00010 **********************************************************************/ 00011 00012 #include "ruby/ruby.h" 00013 #include "ruby/encoding.h" 00014 #include "internal.h" 00015 #include "regenc.h" 00016 #include <ctype.h> 00017 #ifndef NO_LOCALE_CHARMAP 00018 #ifdef __CYGWIN__ 00019 #include <windows.h> 00020 #endif 00021 #ifdef HAVE_LANGINFO_H 00022 #include <langinfo.h> 00023 #endif 00024 #endif 00025 #include "ruby/util.h" 00026 00027 #if defined __GNUC__ && __GNUC__ >= 4 00028 #pragma GCC visibility push(default) 00029 int rb_enc_register(const char *name, rb_encoding *encoding); 00030 void rb_enc_set_base(const char *name, const char *orig); 00031 void rb_encdb_declare(const char *name); 00032 int rb_encdb_replicate(const char *name, const char *orig); 00033 int rb_encdb_dummy(const char *name); 00034 int rb_encdb_alias(const char *alias, const char *orig); 00035 void rb_encdb_set_unicode(int index); 00036 #pragma GCC visibility pop 00037 #endif 00038 00039 static ID id_encoding; 00040 VALUE rb_cEncoding; 00041 static VALUE rb_encoding_list; 00042 00043 struct rb_encoding_entry { 00044 const char *name; 00045 rb_encoding *enc; 00046 rb_encoding *base; 00047 }; 00048 00049 static struct { 00050 struct rb_encoding_entry *list; 00051 int count; 00052 int size; 00053 st_table *names; 00054 } enc_table; 00055 00056 void rb_enc_init(void); 00057 00058 #define ENCODING_COUNT ENCINDEX_BUILTIN_MAX 00059 #define UNSPECIFIED_ENCODING INT_MAX 00060 00061 #define ENCODING_NAMELEN_MAX 63 00062 #define valid_encoding_name_p(name) ((name) && strlen(name) <= ENCODING_NAMELEN_MAX) 00063 00064 #define enc_autoload_p(enc) (!rb_enc_mbmaxlen(enc)) 00065 00066 static int load_encoding(const char *name); 00067 00068 static size_t 00069 
enc_memsize(const void *p) 00070 { 00071 return 0; 00072 } 00073 00074 static const rb_data_type_t encoding_data_type = { 00075 "encoding", 00076 {0, 0, enc_memsize,}, 00077 }; 00078 00079 #define is_data_encoding(obj) (RTYPEDDATA_P(obj) && RTYPEDDATA_TYPE(obj) == &encoding_data_type) 00080 00081 static VALUE 00082 enc_new(rb_encoding *encoding) 00083 { 00084 return TypedData_Wrap_Struct(rb_cEncoding, &encoding_data_type, encoding); 00085 } 00086 00087 static VALUE 00088 rb_enc_from_encoding_index(int idx) 00089 { 00090 VALUE list, enc; 00091 00092 if (!(list = rb_encoding_list)) { 00093 rb_bug("rb_enc_from_encoding_index(%d): no rb_encoding_list", idx); 00094 } 00095 enc = rb_ary_entry(list, idx); 00096 if (NIL_P(enc)) { 00097 rb_bug("rb_enc_from_encoding_index(%d): not created yet", idx); 00098 } 00099 return enc; 00100 } 00101 00102 VALUE 00103 rb_enc_from_encoding(rb_encoding *encoding) 00104 { 00105 int idx; 00106 if (!encoding) return Qnil; 00107 idx = ENC_TO_ENCINDEX(encoding); 00108 return rb_enc_from_encoding_index(idx); 00109 } 00110 00111 static int enc_autoload(rb_encoding *); 00112 00113 static int 00114 check_encoding(rb_encoding *enc) 00115 { 00116 int index = rb_enc_to_index(enc); 00117 if (rb_enc_from_index(index) != enc) 00118 return -1; 00119 if (enc_autoload_p(enc)) { 00120 index = enc_autoload(enc); 00121 } 00122 return index; 00123 } 00124 00125 static int 00126 enc_check_encoding(VALUE obj) 00127 { 00128 if (SPECIAL_CONST_P(obj) || !rb_typeddata_is_kind_of(obj, &encoding_data_type)) { 00129 return -1; 00130 } 00131 return check_encoding(RDATA(obj)->data); 00132 } 00133 00134 static int 00135 must_encoding(VALUE enc) 00136 { 00137 int index = enc_check_encoding(enc); 00138 if (index < 0) { 00139 rb_raise(rb_eTypeError, "wrong argument type %s (expected Encoding)", 00140 rb_obj_classname(enc)); 00141 } 00142 return index; 00143 } 00144 00145 int 00146 rb_to_encoding_index(VALUE enc) 00147 { 00148 int idx; 00149 00150 idx = 
enc_check_encoding(enc); 00151 if (idx >= 0) { 00152 return idx; 00153 } 00154 else if (NIL_P(enc = rb_check_string_type(enc))) { 00155 return -1; 00156 } 00157 if (!rb_enc_asciicompat(rb_enc_get(enc))) { 00158 return -1; 00159 } 00160 return rb_enc_find_index(StringValueCStr(enc)); 00161 } 00162 00163 /* Returns encoding index or UNSPECIFIED_ENCODING */ 00164 static int 00165 str_find_encindex(VALUE enc) 00166 { 00167 int idx; 00168 00169 StringValue(enc); 00170 if (!rb_enc_asciicompat(rb_enc_get(enc))) { 00171 rb_raise(rb_eArgError, "invalid name encoding (non ASCII)"); 00172 } 00173 idx = rb_enc_find_index(StringValueCStr(enc)); 00174 return idx; 00175 } 00176 00177 static int 00178 str_to_encindex(VALUE enc) 00179 { 00180 int idx = str_find_encindex(enc); 00181 if (idx < 0) { 00182 rb_raise(rb_eArgError, "unknown encoding name - %s", RSTRING_PTR(enc)); 00183 } 00184 return idx; 00185 } 00186 00187 static rb_encoding * 00188 str_to_encoding(VALUE enc) 00189 { 00190 return rb_enc_from_index(str_to_encindex(enc)); 00191 } 00192 00193 rb_encoding * 00194 rb_to_encoding(VALUE enc) 00195 { 00196 if (enc_check_encoding(enc) >= 0) return RDATA(enc)->data; 00197 return str_to_encoding(enc); 00198 } 00199 00200 rb_encoding * 00201 rb_find_encoding(VALUE enc) 00202 { 00203 int idx; 00204 if (enc_check_encoding(enc) >= 0) return RDATA(enc)->data; 00205 idx = str_find_encindex(enc); 00206 if (idx < 0) return NULL; 00207 return rb_enc_from_index(idx); 00208 } 00209 00210 void 00211 rb_gc_mark_encodings(void) 00212 { 00213 } 00214 00215 static int 00216 enc_table_expand(int newsize) 00217 { 00218 struct rb_encoding_entry *ent; 00219 int count = newsize; 00220 00221 if (enc_table.size >= newsize) return newsize; 00222 newsize = (newsize + 7) / 8 * 8; 00223 ent = realloc(enc_table.list, sizeof(*enc_table.list) * newsize); 00224 if (!ent) return -1; 00225 memset(ent + enc_table.size, 0, sizeof(*ent)*(newsize - enc_table.size)); 00226 enc_table.list = ent; 00227 enc_table.size = 
newsize; 00228 return count; 00229 } 00230 00231 static int 00232 enc_register_at(int index, const char *name, rb_encoding *encoding) 00233 { 00234 struct rb_encoding_entry *ent = &enc_table.list[index]; 00235 VALUE list; 00236 00237 if (!valid_encoding_name_p(name)) return -1; 00238 if (!ent->name) { 00239 ent->name = name = strdup(name); 00240 } 00241 else if (STRCASECMP(name, ent->name)) { 00242 return -1; 00243 } 00244 if (!ent->enc) { 00245 ent->enc = xmalloc(sizeof(rb_encoding)); 00246 } 00247 if (encoding) { 00248 *ent->enc = *encoding; 00249 } 00250 else { 00251 memset(ent->enc, 0, sizeof(*ent->enc)); 00252 } 00253 encoding = ent->enc; 00254 encoding->name = name; 00255 encoding->ruby_encoding_index = index; 00256 st_insert(enc_table.names, (st_data_t)name, (st_data_t)index); 00257 list = rb_encoding_list; 00258 if (list && NIL_P(rb_ary_entry(list, index))) { 00259 /* initialize encoding data */ 00260 rb_ary_store(list, index, enc_new(encoding)); 00261 } 00262 return index; 00263 } 00264 00265 static int 00266 enc_register(const char *name, rb_encoding *encoding) 00267 { 00268 int index = enc_table.count; 00269 00270 if ((index = enc_table_expand(index + 1)) < 0) return -1; 00271 enc_table.count = index; 00272 return enc_register_at(index - 1, name, encoding); 00273 } 00274 00275 static void set_encoding_const(const char *, rb_encoding *); 00276 int rb_enc_registered(const char *name); 00277 00278 int 00279 rb_enc_register(const char *name, rb_encoding *encoding) 00280 { 00281 int index = rb_enc_registered(name); 00282 00283 if (index >= 0) { 00284 rb_encoding *oldenc = rb_enc_from_index(index); 00285 if (STRCASECMP(name, rb_enc_name(oldenc))) { 00286 index = enc_register(name, encoding); 00287 } 00288 else if (enc_autoload_p(oldenc) || !ENC_DUMMY_P(oldenc)) { 00289 enc_register_at(index, name, encoding); 00290 } 00291 else { 00292 rb_raise(rb_eArgError, "encoding %s is already registered", name); 00293 } 00294 } 00295 else { 00296 index = 
enc_register(name, encoding); 00297 set_encoding_const(name, rb_enc_from_index(index)); 00298 } 00299 return index; 00300 } 00301 00302 void 00303 rb_encdb_declare(const char *name) 00304 { 00305 int idx = rb_enc_registered(name); 00306 if (idx < 0) { 00307 idx = enc_register(name, 0); 00308 } 00309 set_encoding_const(name, rb_enc_from_index(idx)); 00310 } 00311 00312 static void 00313 enc_check_duplication(const char *name) 00314 { 00315 if (rb_enc_registered(name) >= 0) { 00316 rb_raise(rb_eArgError, "encoding %s is already registered", name); 00317 } 00318 } 00319 00320 static rb_encoding* 00321 set_base_encoding(int index, rb_encoding *base) 00322 { 00323 rb_encoding *enc = enc_table.list[index].enc; 00324 00325 enc_table.list[index].base = base; 00326 if (rb_enc_dummy_p(base)) ENC_SET_DUMMY(enc); 00327 return enc; 00328 } 00329 00330 /* for encdb.h 00331 * Set base encoding for encodings which are not replicas 00332 * but not in their own files. 00333 */ 00334 void 00335 rb_enc_set_base(const char *name, const char *orig) 00336 { 00337 int idx = rb_enc_registered(name); 00338 int origidx = rb_enc_registered(orig); 00339 set_base_encoding(idx, rb_enc_from_index(origidx)); 00340 } 00341 00342 int 00343 rb_enc_replicate(const char *name, rb_encoding *encoding) 00344 { 00345 int idx; 00346 00347 enc_check_duplication(name); 00348 idx = enc_register(name, encoding); 00349 set_base_encoding(idx, encoding); 00350 set_encoding_const(name, rb_enc_from_index(idx)); 00351 return idx; 00352 } 00353 00354 /* 00355 * call-seq: 00356 * enc.replicate(name) -> encoding 00357 * 00358 * Returns a replicated encoding of _enc_ whose name is _name_. 00359 * The new encoding should have the same byte structure of _enc_. 00360 * If _name_ is used by another encoding, raise ArgumentError. 
00361 * 00362 */ 00363 static VALUE 00364 enc_replicate(VALUE encoding, VALUE name) 00365 { 00366 return rb_enc_from_encoding_index( 00367 rb_enc_replicate(StringValueCStr(name), 00368 rb_to_encoding(encoding))); 00369 } 00370 00371 static int 00372 enc_replicate_with_index(const char *name, rb_encoding *origenc, int idx) 00373 { 00374 if (idx < 0) { 00375 idx = enc_register(name, origenc); 00376 } 00377 else { 00378 idx = enc_register_at(idx, name, origenc); 00379 } 00380 if (idx >= 0) { 00381 set_base_encoding(idx, origenc); 00382 set_encoding_const(name, rb_enc_from_index(idx)); 00383 } 00384 return idx; 00385 } 00386 00387 int 00388 rb_encdb_replicate(const char *name, const char *orig) 00389 { 00390 int origidx = rb_enc_registered(orig); 00391 int idx = rb_enc_registered(name); 00392 00393 if (origidx < 0) { 00394 origidx = enc_register(orig, 0); 00395 } 00396 return enc_replicate_with_index(name, rb_enc_from_index(origidx), idx); 00397 } 00398 00399 int 00400 rb_define_dummy_encoding(const char *name) 00401 { 00402 int index = rb_enc_replicate(name, rb_ascii8bit_encoding()); 00403 rb_encoding *enc = enc_table.list[index].enc; 00404 00405 ENC_SET_DUMMY(enc); 00406 return index; 00407 } 00408 00409 int 00410 rb_encdb_dummy(const char *name) 00411 { 00412 int index = enc_replicate_with_index(name, rb_ascii8bit_encoding(), 00413 rb_enc_registered(name)); 00414 rb_encoding *enc = enc_table.list[index].enc; 00415 00416 ENC_SET_DUMMY(enc); 00417 return index; 00418 } 00419 00420 /* 00421 * call-seq: 00422 * enc.dummy? -> true or false 00423 * 00424 * Returns true for dummy encodings. 00425 * A dummy encoding is an encoding for which character handling is not properly 00426 * implemented. 00427 * It is used for stateful encodings. 00428 * 00429 * Encoding::ISO_2022_JP.dummy? #=> true 00430 * Encoding::UTF_8.dummy? #=> false 00431 * 00432 */ 00433 static VALUE 00434 enc_dummy_p(VALUE enc) 00435 { 00436 return ENC_DUMMY_P(enc_table.list[must_encoding(enc)].enc) ? 
Qtrue : Qfalse; 00437 } 00438 00439 /* 00440 * call-seq: 00441 * enc.ascii_compatible? -> true or false 00442 * 00443 * Returns whether ASCII-compatible or not. 00444 * 00445 * Encoding::UTF_8.ascii_compatible? #=> true 00446 * Encoding::UTF_16BE.ascii_compatible? #=> false 00447 * 00448 */ 00449 static VALUE 00450 enc_ascii_compatible_p(VALUE enc) 00451 { 00452 return rb_enc_asciicompat(enc_table.list[must_encoding(enc)].enc) ? Qtrue : Qfalse; 00453 } 00454 00455 /* 00456 * Returns 1 when the encoding is Unicode series other than UTF-7 else 0. 00457 */ 00458 int 00459 rb_enc_unicode_p(rb_encoding *enc) 00460 { 00461 return ONIGENC_IS_UNICODE(enc); 00462 } 00463 00464 static st_data_t 00465 enc_dup_name(st_data_t name) 00466 { 00467 return (st_data_t)strdup((const char *)name); 00468 } 00469 00470 /* 00471 * Returns copied alias name when the key is added for st_table, 00472 * else returns NULL. 00473 */ 00474 static int 00475 enc_alias_internal(const char *alias, int idx) 00476 { 00477 return st_insert2(enc_table.names, (st_data_t)alias, (st_data_t)idx, 00478 enc_dup_name); 00479 } 00480 00481 static int 00482 enc_alias(const char *alias, int idx) 00483 { 00484 if (!valid_encoding_name_p(alias)) return -1; 00485 if (!enc_alias_internal(alias, idx)) 00486 set_encoding_const(alias, rb_enc_from_index(idx)); 00487 return idx; 00488 } 00489 00490 int 00491 rb_enc_alias(const char *alias, const char *orig) 00492 { 00493 int idx; 00494 00495 enc_check_duplication(alias); 00496 if (!enc_table.list) { 00497 rb_enc_init(); 00498 } 00499 if ((idx = rb_enc_find_index(orig)) < 0) { 00500 return -1; 00501 } 00502 return enc_alias(alias, idx); 00503 } 00504 00505 int 00506 rb_encdb_alias(const char *alias, const char *orig) 00507 { 00508 int idx = rb_enc_registered(orig); 00509 00510 if (idx < 0) { 00511 idx = enc_register(orig, 0); 00512 } 00513 return enc_alias(alias, idx); 00514 } 00515 00516 void 00517 rb_encdb_set_unicode(int index) 00518 { 00519 
rb_enc_from_index(index)->flags |= ONIGENC_FLAG_UNICODE; 00520 } 00521 00522 enum { 00523 ENCINDEX_ASCII, 00524 ENCINDEX_UTF_8, 00525 ENCINDEX_US_ASCII, 00526 ENCINDEX_BUILTIN_MAX 00527 }; 00528 00529 extern rb_encoding OnigEncodingUTF_8; 00530 extern rb_encoding OnigEncodingUS_ASCII; 00531 00532 void 00533 rb_enc_init(void) 00534 { 00535 enc_table_expand(ENCODING_COUNT + 1); 00536 if (!enc_table.names) { 00537 enc_table.names = st_init_strcasetable(); 00538 } 00539 #define ENC_REGISTER(enc) enc_register_at(ENCINDEX_##enc, rb_enc_name(&OnigEncoding##enc), &OnigEncoding##enc) 00540 ENC_REGISTER(ASCII); 00541 ENC_REGISTER(UTF_8); 00542 ENC_REGISTER(US_ASCII); 00543 #undef ENC_REGISTER 00544 enc_table.count = ENCINDEX_BUILTIN_MAX; 00545 } 00546 00547 rb_encoding * 00548 rb_enc_from_index(int index) 00549 { 00550 if (!enc_table.list) { 00551 rb_enc_init(); 00552 } 00553 if (index < 0 || enc_table.count <= index) { 00554 return 0; 00555 } 00556 return enc_table.list[index].enc; 00557 } 00558 00559 int 00560 rb_enc_registered(const char *name) 00561 { 00562 st_data_t idx = 0; 00563 00564 if (!name) return -1; 00565 if (!enc_table.list) return -1; 00566 if (st_lookup(enc_table.names, (st_data_t)name, &idx)) { 00567 return (int)idx; 00568 } 00569 return -1; 00570 } 00571 00572 static VALUE 00573 require_enc(VALUE enclib) 00574 { 00575 int safe = rb_safe_level(); 00576 return rb_require_safe(enclib, safe > 3 ? 
3 : safe); 00577 } 00578 00579 static int 00580 load_encoding(const char *name) 00581 { 00582 VALUE enclib = rb_sprintf("enc/%s.so", name); 00583 VALUE verbose = ruby_verbose; 00584 VALUE debug = ruby_debug; 00585 VALUE loaded; 00586 char *s = RSTRING_PTR(enclib) + 4, *e = RSTRING_END(enclib) - 3; 00587 int idx; 00588 00589 while (s < e) { 00590 if (!ISALNUM(*s)) *s = '_'; 00591 else if (ISUPPER(*s)) *s = (char)TOLOWER(*s); 00592 ++s; 00593 } 00594 FL_UNSET(enclib, FL_TAINT|FL_UNTRUSTED); 00595 OBJ_FREEZE(enclib); 00596 ruby_verbose = Qfalse; 00597 ruby_debug = Qfalse; 00598 loaded = rb_protect(require_enc, enclib, 0); 00599 ruby_verbose = verbose; 00600 ruby_debug = debug; 00601 rb_set_errinfo(Qnil); 00602 if (NIL_P(loaded)) return -1; 00603 if ((idx = rb_enc_registered(name)) < 0) return -1; 00604 if (enc_autoload_p(enc_table.list[idx].enc)) return -1; 00605 return idx; 00606 } 00607 00608 static int 00609 enc_autoload(rb_encoding *enc) 00610 { 00611 int i; 00612 rb_encoding *base = enc_table.list[ENC_TO_ENCINDEX(enc)].base; 00613 00614 if (base) { 00615 i = 0; 00616 do { 00617 if (i >= enc_table.count) return -1; 00618 } while (enc_table.list[i].enc != base && (++i, 1)); 00619 if (enc_autoload_p(base)) { 00620 if (enc_autoload(base) < 0) return -1; 00621 } 00622 i = ENC_TO_ENCINDEX(enc); 00623 enc_register_at(i, rb_enc_name(enc), base); 00624 } 00625 else { 00626 i = load_encoding(rb_enc_name(enc)); 00627 } 00628 return i; 00629 } 00630 00631 /* Return encoding index or UNSPECIFIED_ENCODING from encoding name */ 00632 int 00633 rb_enc_find_index(const char *name) 00634 { 00635 int i = rb_enc_registered(name); 00636 rb_encoding *enc; 00637 00638 if (i < 0) { 00639 i = load_encoding(name); 00640 } 00641 else if (!(enc = rb_enc_from_index(i))) { 00642 if (i != UNSPECIFIED_ENCODING) { 00643 rb_raise(rb_eArgError, "encoding %s is not registered", name); 00644 } 00645 } 00646 else if (enc_autoload_p(enc)) { 00647 if (enc_autoload(enc) < 0) { 00648 rb_warn("failed to 
load encoding (%s); use ASCII-8BIT instead", 00649 name); 00650 return 0; 00651 } 00652 } 00653 return i; 00654 } 00655 00656 rb_encoding * 00657 rb_enc_find(const char *name) 00658 { 00659 int idx = rb_enc_find_index(name); 00660 if (idx < 0) idx = 0; 00661 return rb_enc_from_index(idx); 00662 } 00663 00664 static inline int 00665 enc_capable(VALUE obj) 00666 { 00667 if (SPECIAL_CONST_P(obj)) return SYMBOL_P(obj); 00668 switch (BUILTIN_TYPE(obj)) { 00669 case T_STRING: 00670 case T_REGEXP: 00671 case T_FILE: 00672 return TRUE; 00673 case T_DATA: 00674 if (is_data_encoding(obj)) return TRUE; 00675 default: 00676 return FALSE; 00677 } 00678 } 00679 00680 ID 00681 rb_id_encoding(void) 00682 { 00683 CONST_ID(id_encoding, "encoding"); 00684 return id_encoding; 00685 } 00686 00687 int 00688 rb_enc_get_index(VALUE obj) 00689 { 00690 int i = -1; 00691 VALUE tmp; 00692 00693 if (SPECIAL_CONST_P(obj)) { 00694 if (!SYMBOL_P(obj)) return -1; 00695 obj = rb_id2str(SYM2ID(obj)); 00696 } 00697 switch (BUILTIN_TYPE(obj)) { 00698 as_default: 00699 default: 00700 case T_STRING: 00701 case T_REGEXP: 00702 i = ENCODING_GET_INLINED(obj); 00703 if (i == ENCODING_INLINE_MAX) { 00704 VALUE iv; 00705 00706 iv = rb_ivar_get(obj, rb_id_encoding()); 00707 i = NUM2INT(iv); 00708 } 00709 break; 00710 case T_FILE: 00711 tmp = rb_funcall(obj, rb_intern("internal_encoding"), 0, 0); 00712 if (NIL_P(tmp)) obj = rb_funcall(obj, rb_intern("external_encoding"), 0, 0); 00713 else obj = tmp; 00714 if (NIL_P(obj)) break; 00715 case T_DATA: 00716 if (is_data_encoding(obj)) { 00717 i = enc_check_encoding(obj); 00718 } 00719 else { 00720 goto as_default; 00721 } 00722 break; 00723 } 00724 return i; 00725 } 00726 00727 static void 00728 enc_set_index(VALUE obj, int idx) 00729 { 00730 if (idx < ENCODING_INLINE_MAX) { 00731 ENCODING_SET_INLINED(obj, idx); 00732 return; 00733 } 00734 ENCODING_SET_INLINED(obj, ENCODING_INLINE_MAX); 00735 rb_ivar_set(obj, rb_id_encoding(), INT2NUM(idx)); 00736 } 00737 00738 void 
00739 rb_enc_set_index(VALUE obj, int idx) 00740 { 00741 rb_check_frozen(obj); 00742 enc_set_index(obj, idx); 00743 } 00744 00745 VALUE 00746 rb_enc_associate_index(VALUE obj, int idx) 00747 { 00748 /* enc_check_capable(obj);*/ 00749 rb_check_frozen(obj); 00750 if (rb_enc_get_index(obj) == idx) 00751 return obj; 00752 if (SPECIAL_CONST_P(obj)) { 00753 rb_raise(rb_eArgError, "cannot set encoding"); 00754 } 00755 if (!ENC_CODERANGE_ASCIIONLY(obj) || 00756 !rb_enc_asciicompat(rb_enc_from_index(idx))) { 00757 ENC_CODERANGE_CLEAR(obj); 00758 } 00759 enc_set_index(obj, idx); 00760 return obj; 00761 } 00762 00763 VALUE 00764 rb_enc_associate(VALUE obj, rb_encoding *enc) 00765 { 00766 return rb_enc_associate_index(obj, rb_enc_to_index(enc)); 00767 } 00768 00769 rb_encoding* 00770 rb_enc_get(VALUE obj) 00771 { 00772 return rb_enc_from_index(rb_enc_get_index(obj)); 00773 } 00774 00775 rb_encoding* 00776 rb_enc_check(VALUE str1, VALUE str2) 00777 { 00778 rb_encoding *enc = rb_enc_compatible(str1, str2); 00779 if (!enc) 00780 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s", 00781 rb_enc_name(rb_enc_get(str1)), 00782 rb_enc_name(rb_enc_get(str2))); 00783 return enc; 00784 } 00785 00786 rb_encoding* 00787 rb_enc_compatible(VALUE str1, VALUE str2) 00788 { 00789 int idx1, idx2; 00790 rb_encoding *enc1, *enc2; 00791 int isstr1, isstr2; 00792 00793 idx1 = rb_enc_get_index(str1); 00794 idx2 = rb_enc_get_index(str2); 00795 00796 if (idx1 < 0 || idx2 < 0) 00797 return 0; 00798 00799 if (idx1 == idx2) { 00800 return rb_enc_from_index(idx1); 00801 } 00802 enc1 = rb_enc_from_index(idx1); 00803 enc2 = rb_enc_from_index(idx2); 00804 00805 isstr2 = RB_TYPE_P(str2, T_STRING); 00806 if (isstr2 && RSTRING_LEN(str2) == 0) 00807 return enc1; 00808 isstr1 = RB_TYPE_P(str1, T_STRING); 00809 if (isstr1 && RSTRING_LEN(str1) == 0) 00810 return (rb_enc_asciicompat(enc1) && rb_enc_str_asciionly_p(str2)) ? 
enc1 : enc2; 00811 if (!rb_enc_asciicompat(enc1) || !rb_enc_asciicompat(enc2)) { 00812 return 0; 00813 } 00814 00815 /* objects whose encoding is the same of contents */ 00816 if (!isstr2 && idx2 == ENCINDEX_US_ASCII) 00817 return enc1; 00818 if (!isstr1 && idx1 == ENCINDEX_US_ASCII) 00819 return enc2; 00820 00821 if (!isstr1) { 00822 VALUE tmp = str1; 00823 int idx0 = idx1; 00824 str1 = str2; 00825 str2 = tmp; 00826 idx1 = idx2; 00827 idx2 = idx0; 00828 idx0 = isstr1; 00829 isstr1 = isstr2; 00830 isstr2 = idx0; 00831 } 00832 if (isstr1) { 00833 int cr1, cr2; 00834 00835 cr1 = rb_enc_str_coderange(str1); 00836 if (isstr2) { 00837 cr2 = rb_enc_str_coderange(str2); 00838 if (cr1 != cr2) { 00839 /* may need to handle ENC_CODERANGE_BROKEN */ 00840 if (cr1 == ENC_CODERANGE_7BIT) return enc2; 00841 if (cr2 == ENC_CODERANGE_7BIT) return enc1; 00842 } 00843 if (cr2 == ENC_CODERANGE_7BIT) { 00844 return enc1; 00845 } 00846 } 00847 if (cr1 == ENC_CODERANGE_7BIT) 00848 return enc2; 00849 } 00850 return 0; 00851 } 00852 00853 void 00854 rb_enc_copy(VALUE obj1, VALUE obj2) 00855 { 00856 rb_enc_associate_index(obj1, rb_enc_get_index(obj2)); 00857 } 00858 00859 00860 /* 00861 * call-seq: 00862 * obj.encoding -> encoding 00863 * 00864 * Returns the Encoding object that represents the encoding of obj. 
00865 */ 00866 00867 VALUE 00868 rb_obj_encoding(VALUE obj) 00869 { 00870 int idx = rb_enc_get_index(obj); 00871 if (idx < 0) { 00872 rb_raise(rb_eTypeError, "unknown encoding"); 00873 } 00874 return rb_enc_from_encoding_index(idx); 00875 } 00876 00877 int 00878 rb_enc_fast_mbclen(const char *p, const char *e, rb_encoding *enc) 00879 { 00880 return ONIGENC_MBC_ENC_LEN(enc, (UChar*)p, (UChar*)e); 00881 } 00882 00883 int 00884 rb_enc_mbclen(const char *p, const char *e, rb_encoding *enc) 00885 { 00886 int n = ONIGENC_PRECISE_MBC_ENC_LEN(enc, (UChar*)p, (UChar*)e); 00887 if (MBCLEN_CHARFOUND_P(n) && MBCLEN_CHARFOUND_LEN(n) <= e-p) 00888 return MBCLEN_CHARFOUND_LEN(n); 00889 else { 00890 int min = rb_enc_mbminlen(enc); 00891 return min <= e-p ? min : (int)(e-p); 00892 } 00893 } 00894 00895 int 00896 rb_enc_precise_mbclen(const char *p, const char *e, rb_encoding *enc) 00897 { 00898 int n; 00899 if (e <= p) 00900 return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(1); 00901 n = ONIGENC_PRECISE_MBC_ENC_LEN(enc, (UChar*)p, (UChar*)e); 00902 if (e-p < n) 00903 return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(n-(int)(e-p)); 00904 return n; 00905 } 00906 00907 int 00908 rb_enc_ascget(const char *p, const char *e, int *len, rb_encoding *enc) 00909 { 00910 unsigned int c, l; 00911 if (e <= p) 00912 return -1; 00913 if (rb_enc_asciicompat(enc)) { 00914 c = (unsigned char)*p; 00915 if (!ISASCII(c)) 00916 return -1; 00917 if (len) *len = 1; 00918 return c; 00919 } 00920 l = rb_enc_precise_mbclen(p, e, enc); 00921 if (!MBCLEN_CHARFOUND_P(l)) 00922 return -1; 00923 c = rb_enc_mbc_to_codepoint(p, e, enc); 00924 if (!rb_enc_isascii(c, enc)) 00925 return -1; 00926 if (len) *len = l; 00927 return c; 00928 } 00929 00930 unsigned int 00931 rb_enc_codepoint_len(const char *p, const char *e, int *len_p, rb_encoding *enc) 00932 { 00933 int r; 00934 if (e <= p) 00935 rb_raise(rb_eArgError, "empty string"); 00936 r = rb_enc_precise_mbclen(p, e, enc); 00937 if (!MBCLEN_CHARFOUND_P(r)) { 00938 
rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(enc)); 00939 } 00940 if (len_p) *len_p = MBCLEN_CHARFOUND_LEN(r); 00941 return rb_enc_mbc_to_codepoint(p, e, enc); 00942 } 00943 00944 #undef rb_enc_codepoint 00945 unsigned int 00946 rb_enc_codepoint(const char *p, const char *e, rb_encoding *enc) 00947 { 00948 return rb_enc_codepoint_len(p, e, 0, enc); 00949 } 00950 00951 int 00952 rb_enc_codelen(int c, rb_encoding *enc) 00953 { 00954 int n = ONIGENC_CODE_TO_MBCLEN(enc,c); 00955 if (n == 0) { 00956 rb_raise(rb_eArgError, "invalid codepoint 0x%x in %s", c, rb_enc_name(enc)); 00957 } 00958 return n; 00959 } 00960 00961 int 00962 rb_enc_toupper(int c, rb_encoding *enc) 00963 { 00964 return (ONIGENC_IS_ASCII_CODE(c)?ONIGENC_ASCII_CODE_TO_UPPER_CASE(c):(c)); 00965 } 00966 00967 int 00968 rb_enc_tolower(int c, rb_encoding *enc) 00969 { 00970 return (ONIGENC_IS_ASCII_CODE(c)?ONIGENC_ASCII_CODE_TO_LOWER_CASE(c):(c)); 00971 } 00972 00973 /* 00974 * call-seq: 00975 * enc.inspect -> string 00976 * 00977 * Returns a string which represents the encoding for programmers. 00978 * 00979 * Encoding::UTF_8.inspect #=> "#<Encoding:UTF-8>" 00980 * Encoding::ISO_2022_JP.inspect #=> "#<Encoding:ISO-2022-JP (dummy)>" 00981 */ 00982 static VALUE 00983 enc_inspect(VALUE self) 00984 { 00985 VALUE str = rb_sprintf("#<%s:%s%s>", rb_obj_classname(self), 00986 rb_enc_name((rb_encoding*)DATA_PTR(self)), 00987 (enc_dummy_p(self) ? " (dummy)" : "")); 00988 ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT); 00989 return str; 00990 } 00991 00992 /* 00993 * call-seq: 00994 * enc.name -> string 00995 * 00996 * Returns the name of the encoding. 
00997 * 00998 * Encoding::UTF_8.name #=> "UTF-8" 00999 */ 01000 static VALUE 01001 enc_name(VALUE self) 01002 { 01003 return rb_usascii_str_new2(rb_enc_name((rb_encoding*)DATA_PTR(self))); 01004 } 01005 01006 static int 01007 enc_names_i(st_data_t name, st_data_t idx, st_data_t args) 01008 { 01009 VALUE *arg = (VALUE *)args; 01010 01011 if ((int)idx == (int)arg[0]) { 01012 VALUE str = rb_usascii_str_new2((char *)name); 01013 OBJ_FREEZE(str); 01014 rb_ary_push(arg[1], str); 01015 } 01016 return ST_CONTINUE; 01017 } 01018 01019 /* 01020 * call-seq: 01021 * enc.names -> array 01022 * 01023 * Returns the list of name and aliases of the encoding. 01024 * 01025 * Encoding::WINDOWS_31J.names #=> ["Windows-31J", "CP932", "csWindows31J"] 01026 */ 01027 static VALUE 01028 enc_names(VALUE self) 01029 { 01030 VALUE args[2]; 01031 01032 args[0] = (VALUE)rb_to_encoding_index(self); 01033 args[1] = rb_ary_new2(0); 01034 st_foreach(enc_table.names, enc_names_i, (st_data_t)args); 01035 return args[1]; 01036 } 01037 01038 /* 01039 * call-seq: 01040 * Encoding.list -> [enc1, enc2, ...] 01041 * 01042 * Returns the list of loaded encodings. 01043 * 01044 * Encoding.list 01045 * #=> [#<Encoding:ASCII-8BIT>, #<Encoding:UTF-8>, 01046 * #<Encoding:ISO-2022-JP (dummy)>] 01047 * 01048 * Encoding.find("US-ASCII") 01049 * #=> #<Encoding:US-ASCII> 01050 * 01051 * Encoding.list 01052 * #=> [#<Encoding:ASCII-8BIT>, #<Encoding:UTF-8>, 01053 * #<Encoding:US-ASCII>, #<Encoding:ISO-2022-JP (dummy)>] 01054 * 01055 */ 01056 static VALUE 01057 enc_list(VALUE klass) 01058 { 01059 VALUE ary = rb_ary_new2(0); 01060 rb_ary_replace(ary, rb_encoding_list); 01061 return ary; 01062 } 01063 01064 /* 01065 * call-seq: 01066 * Encoding.find(string) -> enc 01067 * Encoding.find(symbol) -> enc 01068 * 01069 * Search the encoding with specified <i>name</i>. 01070 * <i>name</i> should be a string or symbol. 
01071 * 01072 * Encoding.find("US-ASCII") #=> #<Encoding:US-ASCII> 01073 * Encoding.find(:Shift_JIS) #=> #<Encoding:Shift_JIS> 01074 * 01075 * Names which this method accept are encoding names and aliases 01076 * including following special aliases 01077 * 01078 * "external":: default external encoding 01079 * "internal":: default internal encoding 01080 * "locale":: locale encoding 01081 * "filesystem":: filesystem encoding 01082 * 01083 * An ArgumentError is raised when no encoding with <i>name</i>. 01084 * Only <code>Encoding.find("internal")</code> however returns nil 01085 * when no encoding named "internal", in other words, when Ruby has no 01086 * default internal encoding. 01087 */ 01088 static VALUE 01089 enc_find(VALUE klass, VALUE enc) 01090 { 01091 int idx; 01092 if (RB_TYPE_P(enc, T_DATA) && is_data_encoding(enc)) 01093 return enc; 01094 idx = str_to_encindex(enc); 01095 if (idx == UNSPECIFIED_ENCODING) return Qnil; 01096 return rb_enc_from_encoding_index(idx); 01097 } 01098 01099 /* 01100 * call-seq: 01101 * Encoding.compatible?(obj1, obj2) -> enc or nil 01102 * 01103 * Checks the compatibility of two objects. 01104 * 01105 * If the objects are both strings they are compatible when they are 01106 * concatenatable. The encoding of the concatenated string will be returned 01107 * if they are compatible, nil if they are not. 
01108 * 01109 * Encoding.compatible?("\xa1".force_encoding("iso-8859-1"), "b") 01110 * #=> #<Encoding:ISO-8859-1> 01111 * 01112 * Encoding.compatible?( 01113 * "\xa1".force_encoding("iso-8859-1"), 01114 * "\xa1\xa1".force_encoding("euc-jp")) 01115 * #=> nil 01116 * 01117 * If the objects are non-strings their encodings are compatible when they 01118 * have an encoding and: 01119 * * Either encoding is US-ASCII compatible 01120 * * One of the encodings is a 7-bit encoding 01121 * 01122 */ 01123 static VALUE 01124 enc_compatible_p(VALUE klass, VALUE str1, VALUE str2) 01125 { 01126 rb_encoding *enc; 01127 01128 if (!enc_capable(str1)) return Qnil; 01129 if (!enc_capable(str2)) return Qnil; 01130 enc = rb_enc_compatible(str1, str2); 01131 if (!enc) return Qnil; 01132 return rb_enc_from_encoding(enc); 01133 } 01134 01135 /* :nodoc: */ 01136 static VALUE 01137 enc_dump(int argc, VALUE *argv, VALUE self) 01138 { 01139 rb_scan_args(argc, argv, "01", 0); 01140 return enc_name(self); 01141 } 01142 01143 /* :nodoc: */ 01144 static VALUE 01145 enc_load(VALUE klass, VALUE str) 01146 { 01147 return enc_find(klass, str); 01148 } 01149 01150 rb_encoding * 01151 rb_ascii8bit_encoding(void) 01152 { 01153 if (!enc_table.list) { 01154 rb_enc_init(); 01155 } 01156 return enc_table.list[ENCINDEX_ASCII].enc; 01157 } 01158 01159 int 01160 rb_ascii8bit_encindex(void) 01161 { 01162 return ENCINDEX_ASCII; 01163 } 01164 01165 rb_encoding * 01166 rb_utf8_encoding(void) 01167 { 01168 if (!enc_table.list) { 01169 rb_enc_init(); 01170 } 01171 return enc_table.list[ENCINDEX_UTF_8].enc; 01172 } 01173 01174 int 01175 rb_utf8_encindex(void) 01176 { 01177 return ENCINDEX_UTF_8; 01178 } 01179 01180 rb_encoding * 01181 rb_usascii_encoding(void) 01182 { 01183 if (!enc_table.list) { 01184 rb_enc_init(); 01185 } 01186 return enc_table.list[ENCINDEX_US_ASCII].enc; 01187 } 01188 01189 int 01190 rb_usascii_encindex(void) 01191 { 01192 return ENCINDEX_US_ASCII; 01193 } 01194 01195 int 01196 
rb_locale_encindex(void) 01197 { 01198 VALUE charmap = rb_locale_charmap(rb_cEncoding); 01199 int idx; 01200 01201 if (NIL_P(charmap)) 01202 idx = rb_usascii_encindex(); 01203 else if ((idx = rb_enc_find_index(StringValueCStr(charmap))) < 0) 01204 idx = rb_ascii8bit_encindex(); 01205 01206 if (rb_enc_registered("locale") < 0) enc_alias_internal("locale", idx); 01207 01208 return idx; 01209 } 01210 01211 rb_encoding * 01212 rb_locale_encoding(void) 01213 { 01214 return rb_enc_from_index(rb_locale_encindex()); 01215 } 01216 01217 static int 01218 enc_set_filesystem_encoding(void) 01219 { 01220 int idx; 01221 #if defined NO_LOCALE_CHARMAP 01222 idx = rb_enc_to_index(rb_default_external_encoding()); 01223 #elif defined _WIN32 || defined __CYGWIN__ 01224 char cp[sizeof(int) * 8 / 3 + 4]; 01225 snprintf(cp, sizeof cp, "CP%d", AreFileApisANSI() ? GetACP() : GetOEMCP()); 01226 idx = rb_enc_find_index(cp); 01227 if (idx < 0) idx = rb_ascii8bit_encindex(); 01228 #else 01229 idx = rb_enc_to_index(rb_default_external_encoding()); 01230 #endif 01231 01232 enc_alias_internal("filesystem", idx); 01233 return idx; 01234 } 01235 01236 int 01237 rb_filesystem_encindex(void) 01238 { 01239 int idx = rb_enc_registered("filesystem"); 01240 if (idx < 0) 01241 idx = rb_ascii8bit_encindex(); 01242 return idx; 01243 } 01244 01245 rb_encoding * 01246 rb_filesystem_encoding(void) 01247 { 01248 return rb_enc_from_index(rb_filesystem_encindex()); 01249 } 01250 01251 struct default_encoding { 01252 int index; /* -2 => not yet set, -1 => nil */ 01253 rb_encoding *enc; 01254 }; 01255 01256 static struct default_encoding default_external = {0}; 01257 01258 static int 01259 enc_set_default_encoding(struct default_encoding *def, VALUE encoding, const char *name) 01260 { 01261 int overridden = FALSE; 01262 01263 if (def->index != -2) 01264 /* Already set */ 01265 overridden = TRUE; 01266 01267 if (NIL_P(encoding)) { 01268 def->index = -1; 01269 def->enc = 0; 01270 st_insert(enc_table.names, 
(st_data_t)strdup(name), 01271 (st_data_t)UNSPECIFIED_ENCODING); 01272 } 01273 else { 01274 def->index = rb_enc_to_index(rb_to_encoding(encoding)); 01275 def->enc = 0; 01276 enc_alias_internal(name, def->index); 01277 } 01278 01279 if (def == &default_external) 01280 enc_set_filesystem_encoding(); 01281 01282 return overridden; 01283 } 01284 01285 rb_encoding * 01286 rb_default_external_encoding(void) 01287 { 01288 if (default_external.enc) return default_external.enc; 01289 01290 if (default_external.index >= 0) { 01291 default_external.enc = rb_enc_from_index(default_external.index); 01292 return default_external.enc; 01293 } 01294 else { 01295 return rb_locale_encoding(); 01296 } 01297 } 01298 01299 VALUE 01300 rb_enc_default_external(void) 01301 { 01302 return rb_enc_from_encoding(rb_default_external_encoding()); 01303 } 01304 01305 /* 01306 * call-seq: 01307 * Encoding.default_external -> enc 01308 * 01309 * Returns default external encoding. 01310 * 01311 * The default external encoding is used by default for strings created from 01312 * the following locations: 01313 * 01314 * * CSV 01315 * * File data read from disk 01316 * * SDBM 01317 * * StringIO 01318 * * Zlib::GzipReader 01319 * * Zlib::GzipWriter 01320 * * String#inspect 01321 * * Regexp#inspect 01322 * 01323 * While strings created from these locations will have this encoding, the 01324 * encoding may not be valid. Be sure to check String#valid_encoding?. 01325 * 01326 * File data written to disk will be transcoded to the default external 01327 * encoding when written. 01328 * 01329 * The default external encoding is initialized by the locale or -E option. 
01330 */ 01331 static VALUE 01332 get_default_external(VALUE klass) 01333 { 01334 return rb_enc_default_external(); 01335 } 01336 01337 void 01338 rb_enc_set_default_external(VALUE encoding) 01339 { 01340 if (NIL_P(encoding)) { 01341 rb_raise(rb_eArgError, "default external can not be nil"); 01342 } 01343 enc_set_default_encoding(&default_external, encoding, 01344 "external"); 01345 } 01346 01347 /* 01348 * call-seq: 01349 * Encoding.default_external = enc 01350 * 01351 * Sets default external encoding. You should not set 01352 * Encoding::default_external in ruby code as strings created before changing 01353 * the value may have a different encoding from strings created after the value 01354 * was changed., instead you should use <tt>ruby -E</tt> to invoke ruby with 01355 * the correct default_external. 01356 * 01357 * See Encoding::default_external for information on how the default external 01358 * encoding is used. 01359 */ 01360 static VALUE 01361 set_default_external(VALUE klass, VALUE encoding) 01362 { 01363 rb_warning("setting Encoding.default_external"); 01364 rb_enc_set_default_external(encoding); 01365 return encoding; 01366 } 01367 01368 static struct default_encoding default_internal = {-2}; 01369 01370 rb_encoding * 01371 rb_default_internal_encoding(void) 01372 { 01373 if (!default_internal.enc && default_internal.index >= 0) { 01374 default_internal.enc = rb_enc_from_index(default_internal.index); 01375 } 01376 return default_internal.enc; /* can be NULL */ 01377 } 01378 01379 VALUE 01380 rb_enc_default_internal(void) 01381 { 01382 /* Note: These functions cope with default_internal not being set */ 01383 return rb_enc_from_encoding(rb_default_internal_encoding()); 01384 } 01385 01386 /* 01387 * call-seq: 01388 * Encoding.default_internal -> enc 01389 * 01390 * Returns default internal encoding. 
Strings will be transcoded to the 01391 * default internal encoding in the following places if the default internal 01392 * encoding is not nil: 01393 * 01394 * * CSV 01395 * * Etc.sysconfdir and Etc.systmpdir 01396 * * File data read from disk 01397 * * File names from Dir 01398 * * Integer#chr 01399 * * String#inspect and Regexp#inspect 01400 * * Strings returned from Curses 01401 * * Strings returned from Readline 01402 * * Strings returned from SDBM 01403 * * Time#zone 01404 * * Values from ENV 01405 * * Values in ARGV including $PROGRAM_NAME 01406 * * __FILE__ 01407 * 01408 * Additionally String#encode and String#encode! use the default internal 01409 * encoding if no encoding is given. 01410 * 01411 * The locale encoding (__ENCODING__), not default_internal, is used as the 01412 * encoding of created strings. 01413 * 01414 * Encoding::default_internal is initialized by the source file's 01415 * internal_encoding or -E option. 01416 */ 01417 static VALUE 01418 get_default_internal(VALUE klass) 01419 { 01420 return rb_enc_default_internal(); 01421 } 01422 01423 void 01424 rb_enc_set_default_internal(VALUE encoding) 01425 { 01426 enc_set_default_encoding(&default_internal, encoding, 01427 "internal"); 01428 } 01429 01430 /* 01431 * call-seq: 01432 * Encoding.default_internal = enc or nil 01433 * 01434 * Sets default internal encoding or removes default internal encoding when 01435 * passed nil. You should not set Encoding::default_internal in ruby code as 01436 * strings created before changing the value may have a different encoding 01437 * from strings created after the change. Instead you should use 01438 * <tt>ruby -E</tt> to invoke ruby with the correct default_internal. 01439 * 01440 * See Encoding::default_internal for information on how the default internal 01441 * encoding is used. 
01442 */ 01443 static VALUE 01444 set_default_internal(VALUE klass, VALUE encoding) 01445 { 01446 rb_warning("setting Encoding.default_internal"); 01447 rb_enc_set_default_internal(encoding); 01448 return encoding; 01449 } 01450 01451 /* 01452 * call-seq: 01453 * Encoding.locale_charmap -> string 01454 * 01455 * Returns the locale charmap name. 01456 * It returns nil if no appropriate information. 01457 * 01458 * Debian GNU/Linux 01459 * LANG=C 01460 * Encoding.locale_charmap #=> "ANSI_X3.4-1968" 01461 * LANG=ja_JP.EUC-JP 01462 * Encoding.locale_charmap #=> "EUC-JP" 01463 * 01464 * SunOS 5 01465 * LANG=C 01466 * Encoding.locale_charmap #=> "646" 01467 * LANG=ja 01468 * Encoding.locale_charmap #=> "eucJP" 01469 * 01470 * The result is highly platform dependent. 01471 * So Encoding.find(Encoding.locale_charmap) may cause an error. 01472 * If you need some encoding object even for unknown locale, 01473 * Encoding.find("locale") can be used. 01474 * 01475 */ 01476 VALUE 01477 rb_locale_charmap(VALUE klass) 01478 { 01479 #if defined NO_LOCALE_CHARMAP 01480 return rb_usascii_str_new2("ASCII-8BIT"); 01481 #elif defined _WIN32 || defined __CYGWIN__ 01482 const char *codeset = 0; 01483 char cp[sizeof(int) * 3 + 4]; 01484 # ifdef __CYGWIN__ 01485 const char *nl_langinfo_codeset(void); 01486 codeset = nl_langinfo_codeset(); 01487 # endif 01488 if (!codeset) { 01489 UINT codepage = GetConsoleCP(); 01490 if (!codepage) codepage = GetACP(); 01491 snprintf(cp, sizeof(cp), "CP%d", codepage); 01492 codeset = cp; 01493 } 01494 return rb_usascii_str_new2(codeset); 01495 #elif defined HAVE_LANGINFO_H 01496 char *codeset; 01497 codeset = nl_langinfo(CODESET); 01498 return rb_usascii_str_new2(codeset); 01499 #else 01500 return Qnil; 01501 #endif 01502 } 01503 01504 static void 01505 set_encoding_const(const char *name, rb_encoding *enc) 01506 { 01507 VALUE encoding = rb_enc_from_encoding(enc); 01508 char *s = (char *)name; 01509 int haslower = 0, hasupper = 0, valid = 0; 01510 01511 if 
(ISDIGIT(*s)) return; 01512 if (ISUPPER(*s)) { 01513 hasupper = 1; 01514 while (*++s && (ISALNUM(*s) || *s == '_')) { 01515 if (ISLOWER(*s)) haslower = 1; 01516 } 01517 } 01518 if (!*s) { 01519 if (s - name > ENCODING_NAMELEN_MAX) return; 01520 valid = 1; 01521 rb_define_const(rb_cEncoding, name, encoding); 01522 } 01523 if (!valid || haslower) { 01524 size_t len = s - name; 01525 if (len > ENCODING_NAMELEN_MAX) return; 01526 if (!haslower || !hasupper) { 01527 do { 01528 if (ISLOWER(*s)) haslower = 1; 01529 if (ISUPPER(*s)) hasupper = 1; 01530 } while (*++s && (!haslower || !hasupper)); 01531 len = s - name; 01532 } 01533 len += strlen(s); 01534 if (len++ > ENCODING_NAMELEN_MAX) return; 01535 MEMCPY(s = ALLOCA_N(char, len), name, char, len); 01536 name = s; 01537 if (!valid) { 01538 if (ISLOWER(*s)) *s = ONIGENC_ASCII_CODE_TO_UPPER_CASE((int)*s); 01539 for (; *s; ++s) { 01540 if (!ISALNUM(*s)) *s = '_'; 01541 } 01542 if (hasupper) { 01543 rb_define_const(rb_cEncoding, name, encoding); 01544 } 01545 } 01546 if (haslower) { 01547 for (s = (char *)name; *s; ++s) { 01548 if (ISLOWER(*s)) *s = ONIGENC_ASCII_CODE_TO_UPPER_CASE((int)*s); 01549 } 01550 rb_define_const(rb_cEncoding, name, encoding); 01551 } 01552 } 01553 } 01554 01555 static int 01556 rb_enc_name_list_i(st_data_t name, st_data_t idx, st_data_t arg) 01557 { 01558 VALUE ary = (VALUE)arg; 01559 VALUE str = rb_usascii_str_new2((char *)name); 01560 OBJ_FREEZE(str); 01561 rb_ary_push(ary, str); 01562 return ST_CONTINUE; 01563 } 01564 01565 /* 01566 * call-seq: 01567 * Encoding.name_list -> ["enc1", "enc2", ...] 01568 * 01569 * Returns the list of available encoding names. 
01570 * 01571 * Encoding.name_list 01572 * #=> ["US-ASCII", "ASCII-8BIT", "UTF-8", 01573 * "ISO-8859-1", "Shift_JIS", "EUC-JP", 01574 * "Windows-31J", 01575 * "BINARY", "CP932", "eucJP"] 01576 * 01577 */ 01578 01579 static VALUE 01580 rb_enc_name_list(VALUE klass) 01581 { 01582 VALUE ary = rb_ary_new2(enc_table.names->num_entries); 01583 st_foreach(enc_table.names, rb_enc_name_list_i, (st_data_t)ary); 01584 return ary; 01585 } 01586 01587 static int 01588 rb_enc_aliases_enc_i(st_data_t name, st_data_t orig, st_data_t arg) 01589 { 01590 VALUE *p = (VALUE *)arg; 01591 VALUE aliases = p[0], ary = p[1]; 01592 int idx = (int)orig; 01593 VALUE key, str = rb_ary_entry(ary, idx); 01594 01595 if (NIL_P(str)) { 01596 rb_encoding *enc = rb_enc_from_index(idx); 01597 01598 if (!enc) return ST_CONTINUE; 01599 if (STRCASECMP((char*)name, rb_enc_name(enc)) == 0) { 01600 return ST_CONTINUE; 01601 } 01602 str = rb_usascii_str_new2(rb_enc_name(enc)); 01603 OBJ_FREEZE(str); 01604 rb_ary_store(ary, idx, str); 01605 } 01606 key = rb_usascii_str_new2((char *)name); 01607 OBJ_FREEZE(key); 01608 rb_hash_aset(aliases, key, str); 01609 return ST_CONTINUE; 01610 } 01611 01612 /* 01613 * call-seq: 01614 * Encoding.aliases -> {"alias1" => "orig1", "alias2" => "orig2", ...} 01615 * 01616 * Returns the hash of available encoding alias and original encoding name. 01617 * 01618 * Encoding.aliases 01619 * #=> {"BINARY"=>"ASCII-8BIT", "ASCII"=>"US-ASCII", "ANSI_X3.4-1986"=>"US-ASCII", 01620 * "SJIS"=>"Shift_JIS", "eucJP"=>"EUC-JP", "CP932"=>"Windows-31J"} 01621 * 01622 */ 01623 01624 static VALUE 01625 rb_enc_aliases(VALUE klass) 01626 { 01627 VALUE aliases[2]; 01628 aliases[0] = rb_hash_new(); 01629 aliases[1] = rb_ary_new(); 01630 st_foreach(enc_table.names, rb_enc_aliases_enc_i, (st_data_t)aliases); 01631 return aliases[0]; 01632 } 01633 01634 /* 01635 * An Encoding instance represents a character encoding usable in Ruby. It is 01636 * defined as a constant under the Encoding namespace. 
* It has a name and, optionally, aliases:
 *
 *   Encoding::ISO_8859_1.name
 *   #=> "ISO-8859-1"
 *
 *   Encoding::ISO_8859_1.names
 *   #=> ["ISO-8859-1", "ISO8859-1"]
 *
 * Ruby methods dealing with encodings return or accept Encoding instances as
 * arguments (when a method accepts an Encoding instance as an argument, it
 * can be passed an Encoding name or alias instead).
 *
 *   "some string".encoding
 *   #=> #<Encoding:UTF-8>
 *
 *   string = "some string".encode(Encoding::ISO_8859_1)
 *   #=> "some string"
 *   string.encoding
 *   #=> #<Encoding:ISO-8859-1>
 *
 *   "some string".encode "ISO-8859-1"
 *   #=> "some string"
 *
 * <code>Encoding::ASCII_8BIT</code> is a special encoding that is usually
 * used for a byte string, not a character string.  But as the name suggests,
 * its characters in the range of ASCII are considered as ASCII characters.
 * This is useful when you use ASCII-8BIT characters with other ASCII
 * compatible characters.
 *
 * == Changing an encoding
 *
 * The associated Encoding of a String can be changed in two different ways.
 *
 * First, it is possible to set the Encoding of a string to a new Encoding
 * without changing the internal byte representation of the string, with
 * String#force_encoding.  This is how you can tell Ruby the correct encoding
 * of a string.
 *
 *   string
 *   #=> "R\xC3\xA9sum\xC3\xA9"
 *   string.encoding
 *   #=> #<Encoding:ISO-8859-1>
 *   string.force_encoding(Encoding::UTF_8)
 *   #=> "R\u00E9sum\u00E9"
 *
 * Second, it is possible to transcode a string, i.e. translate its internal
 * byte representation to another encoding.  Its associated encoding is also
 * set to the other encoding.  See String#encode for the various forms of
 * transcoding, and the Encoding::Converter class for additional control over
 * the transcoding process.
 *
 *   string
 *   #=> "R\u00E9sum\u00E9"
 *   string.encoding
 *   #=> #<Encoding:UTF-8>
 *   string = string.encode!(Encoding::ISO_8859_1)
 *   #=> "R\xE9sum\xE9"
 *   string.encoding
 *   #=> #<Encoding:ISO-8859-1>
 *
 * == Script encoding
 *
 * All Ruby script code has an associated Encoding which any String literal
 * created in the source code will be associated to.
 *
 * The default script encoding is <code>Encoding::US_ASCII</code>, but it can
 * be changed by a magic comment on the first line of the source code file (or
 * second line, if there is a shebang line on the first).  The comment must
 * contain the word <code>coding</code> or <code>encoding</code>, followed
 * by a colon, space and the Encoding name or alias:
 *
 *   # encoding: UTF-8
 *
 *   "some string".encoding
 *   #=> #<Encoding:UTF-8>
 *
 * The <code>__ENCODING__</code> keyword returns the script encoding of the file
 * in which the keyword is written:
 *
 *   # encoding: ISO-8859-1
 *
 *   __ENCODING__
 *   #=> #<Encoding:ISO-8859-1>
 *
 * <code>ruby -K</code> will change the default locale encoding, but this is
 * not recommended.  Ruby source files should declare their script encoding by
 * a magic comment even when they only depend on US-ASCII strings or regular
 * expressions.
 *
 * == Locale encoding
 *
 * The default encoding of the environment.  Usually derived from locale.
 *
 * See Encoding.locale_charmap, Encoding.find('locale')
 *
 * == Filesystem encoding
 *
 * The default encoding of strings from the filesystem of the environment.
 * This is used for strings of file names or paths.
*
 * See Encoding.find('filesystem')
 *
 * == External encoding
 *
 * Each IO object has an external encoding which indicates the encoding that
 * Ruby will use to read its data.  By default Ruby sets the external encoding
 * of an IO object to the default external encoding.  The default external
 * encoding is set by the locale encoding or the interpreter <code>-E</code>
 * option.  Encoding.default_external returns the current value of the
 * default external encoding.
 *
 *   ENV["LANG"]
 *   #=> "UTF-8"
 *   Encoding.default_external
 *   #=> #<Encoding:UTF-8>
 *
 *   $ ruby -E ISO-8859-1 -e "p Encoding.default_external"
 *   #<Encoding:ISO-8859-1>
 *
 *   $ LANG=C ruby -e 'p Encoding.default_external'
 *   #<Encoding:US-ASCII>
 *
 * The default external encoding may also be set through
 * Encoding.default_external=, but you should not do this as strings created
 * before and after the change will have inconsistent encodings.  Instead use
 * <code>ruby -E</code> to invoke ruby with the correct external encoding.
 *
 * When you know that the actual encoding of the data of an IO object is not
 * the default external encoding, you can reset its external encoding with
 * IO#set_encoding or set it at IO object creation (see IO.new options).
 *
 * == Internal encoding
 *
 * To process the data of an IO object which has an encoding different
 * from its external encoding, you can set its internal encoding.  Ruby will use
 * this internal encoding to transcode the data when it is read from the IO
 * object.
 *
 * Conversely, when data is written to the IO object it is transcoded from the
 * internal encoding to the external encoding of the IO object.
 *
 * The internal encoding of an IO object can be set with
 * IO#set_encoding or at IO object creation (see IO.new options).
*
 * The internal encoding is optional and, when not set, the Ruby default
 * internal encoding is used.  If not explicitly set, this default internal
 * encoding is +nil+, meaning that by default no transcoding occurs.
 *
 * The default internal encoding can be set with the interpreter option
 * <code>-E</code>.  Encoding.default_internal returns the current default
 * internal encoding.
 *
 *   $ ruby -e 'p Encoding.default_internal'
 *   nil
 *
 *   $ ruby -E ISO-8859-1:UTF-8 -e "p [Encoding.default_external, \
 *     Encoding.default_internal]"
 *   [#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>]
 *
 * The default internal encoding may also be set through
 * Encoding.default_internal=, but you should not do this as strings created
 * before and after the change will have inconsistent encodings.  Instead use
 * <code>ruby -E</code> to invoke ruby with the correct internal encoding.
 *
 * == IO encoding example
 *
 * In the following example a UTF-8 encoded string "R\u00E9sum\u00E9" is
 * transcoded for output to ISO-8859-1 encoding, then read back in and
 * transcoded to UTF-8:
 *
 *   string = "R\u00E9sum\u00E9"
 *
 *   open("transcoded.txt", "w:ISO-8859-1") do |io|
 *     io.write(string)
 *   end
 *
 *   puts "raw text:"
 *   p File.binread("transcoded.txt")
 *   puts
 *
 *   open("transcoded.txt", "r:ISO-8859-1:UTF-8") do |io|
 *     puts "transcoded text:"
 *     p io.read
 *   end
 *
 * While writing the file, the internal encoding is not specified as it is
 * only necessary for reading.  While reading the file, both the internal and
 * external encoding must be specified to obtain the correct result.
01824 * 01825 * $ ruby t.rb 01826 * raw text: 01827 * "R\xE9sum\xE9" 01828 * 01829 * transcoded text: 01830 * "R\u00E9sum\u00E9" 01831 * 01832 */ 01833 01834 void 01835 Init_Encoding(void) 01836 { 01837 #undef rb_intern 01838 #define rb_intern(str) rb_intern_const(str) 01839 VALUE list; 01840 int i; 01841 01842 rb_cEncoding = rb_define_class("Encoding", rb_cObject); 01843 rb_undef_alloc_func(rb_cEncoding); 01844 rb_undef_method(CLASS_OF(rb_cEncoding), "new"); 01845 rb_define_method(rb_cEncoding, "to_s", enc_name, 0); 01846 rb_define_method(rb_cEncoding, "inspect", enc_inspect, 0); 01847 rb_define_method(rb_cEncoding, "name", enc_name, 0); 01848 rb_define_method(rb_cEncoding, "names", enc_names, 0); 01849 rb_define_method(rb_cEncoding, "dummy?", enc_dummy_p, 0); 01850 rb_define_method(rb_cEncoding, "ascii_compatible?", enc_ascii_compatible_p, 0); 01851 rb_define_method(rb_cEncoding, "replicate", enc_replicate, 1); 01852 rb_define_singleton_method(rb_cEncoding, "list", enc_list, 0); 01853 rb_define_singleton_method(rb_cEncoding, "name_list", rb_enc_name_list, 0); 01854 rb_define_singleton_method(rb_cEncoding, "aliases", rb_enc_aliases, 0); 01855 rb_define_singleton_method(rb_cEncoding, "find", enc_find, 1); 01856 rb_define_singleton_method(rb_cEncoding, "compatible?", enc_compatible_p, 2); 01857 01858 rb_define_method(rb_cEncoding, "_dump", enc_dump, -1); 01859 rb_define_singleton_method(rb_cEncoding, "_load", enc_load, 1); 01860 01861 rb_define_singleton_method(rb_cEncoding, "default_external", get_default_external, 0); 01862 rb_define_singleton_method(rb_cEncoding, "default_external=", set_default_external, 1); 01863 rb_define_singleton_method(rb_cEncoding, "default_internal", get_default_internal, 0); 01864 rb_define_singleton_method(rb_cEncoding, "default_internal=", set_default_internal, 1); 01865 rb_define_singleton_method(rb_cEncoding, "locale_charmap", rb_locale_charmap, 0); 01866 01867 list = rb_ary_new2(enc_table.count); 01868 RBASIC(list)->klass = 0; 01869 
rb_encoding_list = list; 01870 rb_gc_register_mark_object(list); 01871 01872 for (i = 0; i < enc_table.count; ++i) { 01873 rb_ary_push(list, enc_new(enc_table.list[i].enc)); 01874 } 01875 } 01876 01877 /* locale insensitive ctype functions */ 01878 01879 #define ctype_test(c, ctype) \ 01880 (rb_isascii(c) && ONIGENC_IS_ASCII_CODE_CTYPE((c), (ctype))) 01881 01882 int rb_isalnum(int c) { return ctype_test(c, ONIGENC_CTYPE_ALNUM); } 01883 int rb_isalpha(int c) { return ctype_test(c, ONIGENC_CTYPE_ALPHA); } 01884 int rb_isblank(int c) { return ctype_test(c, ONIGENC_CTYPE_BLANK); } 01885 int rb_iscntrl(int c) { return ctype_test(c, ONIGENC_CTYPE_CNTRL); } 01886 int rb_isdigit(int c) { return ctype_test(c, ONIGENC_CTYPE_DIGIT); } 01887 int rb_isgraph(int c) { return ctype_test(c, ONIGENC_CTYPE_GRAPH); } 01888 int rb_islower(int c) { return ctype_test(c, ONIGENC_CTYPE_LOWER); } 01889 int rb_isprint(int c) { return ctype_test(c, ONIGENC_CTYPE_PRINT); } 01890 int rb_ispunct(int c) { return ctype_test(c, ONIGENC_CTYPE_PUNCT); } 01891 int rb_isspace(int c) { return ctype_test(c, ONIGENC_CTYPE_SPACE); } 01892 int rb_isupper(int c) { return ctype_test(c, ONIGENC_CTYPE_UPPER); } 01893 int rb_isxdigit(int c) { return ctype_test(c, ONIGENC_CTYPE_XDIGIT); } 01894 01895 int 01896 rb_tolower(int c) 01897 { 01898 return rb_isascii(c) ? ONIGENC_ASCII_CODE_TO_LOWER_CASE(c) : c; 01899 } 01900 01901 int 01902 rb_toupper(int c) 01903 { 01904 return rb_isascii(c) ? ONIGENC_ASCII_CODE_TO_UPPER_CASE(c) : c; 01905 } 01906 01907