Ruby
2.0.0p247 (2013-06-27 revision 41674)
|
00001 /********************************************************************** 00002 00003 transcode.c - 00004 00005 $Author: drbrain $ 00006 created at: Tue Oct 30 16:10:22 JST 2007 00007 00008 Copyright (C) 2007 Martin Duerst 00009 00010 **********************************************************************/ 00011 00012 #include "ruby/ruby.h" 00013 #include "ruby/encoding.h" 00014 #include "internal.h" 00015 #include "transcode_data.h" 00016 #include <ctype.h> 00017 00018 #define ENABLE_ECONV_NEWLINE_OPTION 1 00019 00020 /* VALUE rb_cEncoding = rb_define_class("Encoding", rb_cObject); */ /* NOTE(review): the rb_eXxx and rb_cXxx globals below are presumably the exception classes and the converter class; they are only declared here and are not assigned in the visible part of the file. */ 00021 VALUE rb_eUndefinedConversionError; 00022 VALUE rb_eInvalidByteSequenceError; 00023 VALUE rb_eConverterNotFoundError; 00024 00025 VALUE rb_cEncodingConverter; 00026 /* Cached VALUEs named sym_xxx; none of them is assigned in the visible part of the file. */ 00027 static VALUE sym_invalid, sym_undef, sym_replace, sym_fallback, sym_aref; 00028 static VALUE sym_xml, sym_text, sym_attr; 00029 static VALUE sym_universal_newline; 00030 static VALUE sym_crlf_newline; 00031 static VALUE sym_cr_newline; 00032 #ifdef ENABLE_ECONV_NEWLINE_OPTION 00033 static VALUE sym_newline, sym_universal, sym_crlf, sym_cr, sym_lf; 00034 #endif 00035 static VALUE sym_partial_input; 00036 00037 static VALUE sym_invalid_byte_sequence; 00038 static VALUE sym_undefined_conversion; 00039 static VALUE sym_destination_buffer_full; 00040 static VALUE sym_source_buffer_empty; 00041 static VALUE sym_finished; 00042 static VALUE sym_after_output; 00043 static VALUE sym_incomplete_input; 00044 /* Forward declaration only; the definition is not in the visible part of the file. */ 00045 static unsigned char * 00046 allocate_converted_string(const char *sname, const char *dname, 00047 const unsigned char *str, size_t len, 00048 unsigned char *caller_dst_buf, size_t caller_dst_bufsize, 00049 size_t *dst_len_ptr); 00050 00051 /* dynamic structure, one per conversion (similar to iconv_t) */ 00052 /* may carry conversion state (e.g. 
for iso-2022-jp) */ 00053 typedef struct rb_transcoding { 00054 const rb_transcoder *transcoder; 00055 00056 int flags; 00057 /* resume_position selects the resume_labelN that transcode_restartable0 jumps to on re-entry; 0 means start from the top */ 00058 int resume_position; /* next_table, next_info, next_byte and output_index keep the in-flight lookup state across a suspend */ 00059 unsigned int next_table; 00060 VALUE next_info; 00061 unsigned char next_byte; 00062 unsigned int output_index; 00063 00064 ssize_t recognized_len; /* already interpreted */ 00065 ssize_t readagain_len; /* not yet interpreted */ 00066 union { 00067 unsigned char ary[8]; /* max_input <= sizeof(ary) */ 00068 unsigned char *ptr; /* length: max_input */ 00069 } readbuf; /* recognized_len + readagain_len used */ 00070 /* writebuf holds output that did not fit in the caller buffer; bytes writebuf_off..writebuf_len are still pending */ 00071 ssize_t writebuf_off; 00072 ssize_t writebuf_len; 00073 union { 00074 unsigned char ary[8]; /* max_output <= sizeof(ary) */ 00075 unsigned char *ptr; /* length: max_output */ 00076 } writebuf; 00077 00078 union rb_transcoding_state_t { /* opaque data for stateful encoding */ 00079 void *ptr; 00080 char ary[sizeof(double) > sizeof(void*) ? sizeof(double) : sizeof(void*)]; 00081 double dummy_for_alignment; 00082 } state; 00083 } rb_transcoding; /* The macros below pick the embedded array when the transcoder limit fits in it and the heap pointer otherwise; the same size tests are repeated in the open, close and memsize functions. */ 00084 #define TRANSCODING_READBUF(tc) \ 00085 ((tc)->transcoder->max_input <= (int)sizeof((tc)->readbuf.ary) ? \ 00086 (tc)->readbuf.ary : \ 00087 (tc)->readbuf.ptr) 00088 #define TRANSCODING_WRITEBUF(tc) \ 00089 ((tc)->transcoder->max_output <= (int)sizeof((tc)->writebuf.ary) ? \ 00090 (tc)->writebuf.ary : \ 00091 (tc)->writebuf.ptr) 00092 #define TRANSCODING_WRITEBUF_SIZE(tc) \ 00093 ((tc)->transcoder->max_output <= (int)sizeof((tc)->writebuf.ary) ? \ 00094 sizeof((tc)->writebuf.ary) : \ 00095 (size_t)(tc)->transcoder->max_output) 00096 #define TRANSCODING_STATE_EMBED_MAX ((int)sizeof(union rb_transcoding_state_t)) 00097 #define TRANSCODING_STATE(tc) \ 00098 ((tc)->transcoder->state_size <= (int)sizeof((tc)->state) ? 
\ 00099 (tc)->state.ary : \ 00100 (tc)->state.ptr) 00101 /* One stage of a conversion chain: a transcoding plus the buffer it writes into. Pending data is out_data_start..out_data_end inside out_buf_start..out_buf_end; trans_sweep feeds that range to the following stage as its input. */ 00102 typedef struct { 00103 struct rb_transcoding *tc; 00104 unsigned char *out_buf_start; 00105 unsigned char *out_data_start; 00106 unsigned char *out_data_end; 00107 unsigned char *out_buf_end; 00108 rb_econv_result_t last_result; 00109 } rb_econv_elem_t; 00110 /* A converter: num_trans chained stages stored in elems (capacity num_allocated), plus replacement-string and last-error bookkeeping. */ 00111 struct rb_econv_t { 00112 int flags; 00113 const char *source_encoding_name; 00114 const char *destination_encoding_name; 00115 00116 int started; 00117 00118 const unsigned char *replacement_str; 00119 size_t replacement_len; 00120 const char *replacement_enc; 00121 int replacement_allocated; 00122 00123 unsigned char *in_buf_start; 00124 unsigned char *in_data_start; 00125 unsigned char *in_data_end; 00126 unsigned char *in_buf_end; 00127 rb_econv_elem_t *elems; 00128 int num_allocated; 00129 int num_trans; /* num_finished: trans_sweep stores i+1 here when stage i reports econv_finished */ 00130 int num_finished; /* last_tc: set by rb_econv_add_transcoder_at to the last stage that is not a decorator */ 00131 struct rb_transcoding *last_tc; 00132 00133 /* last error */ 00134 struct { 00135 rb_econv_result_t result; 00136 struct rb_transcoding *error_tc; 00137 const char *source_encoding; 00138 const char *destination_encoding; 00139 const unsigned char *error_bytes_start; 00140 size_t error_bytes_len; 00141 size_t readagain_len; 00142 } last_error; 00143 00144 /* The following fields are only for Encoding::Converter. 00145 * rb_econv_open set them NULL. 
*/ 00146 rb_encoding *source_encoding; 00147 rb_encoding *destination_encoding; 00148 }; 00149 00150 /* 00151 * Dispatch data and logic 00152 */ 00153 /* A transcoder whose source name is the empty string counts as a decorator; dname is not inspected. */ 00154 #define DECORATOR_P(sname, dname) (*(sname) == '\0') 00155 /* Registry record for one (sname, dname) pair; transcoder stays NULL until rb_register_transcoder fills it in. */ 00156 typedef struct { 00157 const char *sname; 00158 const char *dname; 00159 const char *lib; /* null means no need to load a library */ 00160 const rb_transcoder *transcoder; 00161 } transcoder_entry_t; 00162 /* Two-level registry: sname maps to an inner st_table, which maps dname to a transcoder_entry_t pointer. */ 00163 static st_table *transcoder_table; 00164 /* Return the entry for (sname, dname), creating the inner table and a blank entry on first use. The name pointers are stored as given, not copied. */ 00165 static transcoder_entry_t * 00166 make_transcoder_entry(const char *sname, const char *dname) 00167 { 00168 st_data_t val; 00169 st_table *table2; 00170 00171 if (!st_lookup(transcoder_table, (st_data_t)sname, &val)) { 00172 val = (st_data_t)st_init_strcasetable(); 00173 st_add_direct(transcoder_table, (st_data_t)sname, val); 00174 } 00175 table2 = (st_table *)val; 00176 if (!st_lookup(table2, (st_data_t)dname, &val)) { 00177 transcoder_entry_t *entry = ALLOC(transcoder_entry_t); 00178 entry->sname = sname; 00179 entry->dname = dname; 00180 entry->lib = NULL; 00181 entry->transcoder = NULL; 00182 val = (st_data_t)entry; 00183 st_add_direct(table2, (st_data_t)dname, val); 00184 } 00185 return (transcoder_entry_t *)val; 00186 } 00187 /* Lookup-only variant of make_transcoder_entry: returns NULL when either level has no match. */ 00188 static transcoder_entry_t * 00189 get_transcoder_entry(const char *sname, const char *dname) 00190 { 00191 st_data_t val; 00192 st_table *table2; 00193 00194 if (!st_lookup(transcoder_table, (st_data_t)sname, &val)) { 00195 return NULL; 00196 } 00197 table2 = (st_table *)val; 00198 if (!st_lookup(table2, (st_data_t)dname, &val)) { 00199 return NULL; 00200 } 00201 return (transcoder_entry_t *)val; 00202 } 00203 /* Attach tr to the entry for its (src_encoding, dst_encoding) pair; raises rb_eArgError if that entry already has a transcoder. */ 00204 void 00205 rb_register_transcoder(const rb_transcoder *tr) 00206 { 00207 const char *const sname = tr->src_encoding; 00208 const char *const dname = tr->dst_encoding; 00209 00210 transcoder_entry_t *entry; 00211 00212 entry = make_transcoder_entry(sname, dname); 00213 if (entry->transcoder) { 00214 rb_raise(rb_eArgError, "transcoder from %s 
to %s has been already registered", 00215 sname, dname); 00216 } 00217 00218 entry->transcoder = tr; 00219 } 00220 /* Record which library provides the (sname, dname) transcoder, creating the entry if needed. The lib pointer is stored, not copied. */ 00221 static void 00222 declare_transcoder(const char *sname, const char *dname, const char *lib) 00223 { 00224 transcoder_entry_t *entry; 00225 00226 entry = make_transcoder_entry(sname, dname); 00227 entry->lib = lib; 00228 } 00229 /* Prefix that load_transcoder_entry puts in front of entry->lib to build the path it requires. */ 00230 static const char transcoder_lib_prefix[] = "enc/trans/"; 00231 /* Public wrapper around declare_transcoder; raises rb_eArgError when lib is NULL. */ 00232 void 00233 rb_declare_transcoder(const char *enc1, const char *enc2, const char *lib) 00234 { 00235 if (!lib) { 00236 rb_raise(rb_eArgError, "invalid library name - (null)"); 00237 } 00238 declare_transcoder(enc1, enc2, lib); 00239 } 00240 /* Two encoding names are equal when STRCASECMP returns 0. */ 00241 #define encoding_equal(enc1, enc2) (STRCASECMP((enc1), (enc2)) == 0) 00242 /* Singly linked queue node holding an encoding name that still has to be expanded. */ 00243 typedef struct search_path_queue_tag { 00244 struct search_path_queue_tag *next; 00245 const char *enc; 00246 } search_path_queue_t; 00247 /* Breadth-first search state. visited maps each reached encoding to the encoding it was reached from (NULL for the start); queue_last_ptr is the slot where the next node gets appended; base_enc is the encoding currently being expanded. */ 00248 typedef struct { 00249 st_table *visited; 00250 search_path_queue_t *queue; 00251 search_path_queue_t **queue_last_ptr; 00252 const char *base_enc; 00253 } search_path_bfs_t; 00254 /* st_foreach callback over one inner table: append destination dname to the queue unless it was already visited, and record base_enc as its predecessor. */ 00255 static int 00256 transcode_search_path_i(st_data_t key, st_data_t val, st_data_t arg) 00257 { 00258 const char *dname = (const char *)key; 00259 search_path_bfs_t *bfs = (search_path_bfs_t *)arg; 00260 search_path_queue_t *q; 00261 00262 if (st_lookup(bfs->visited, (st_data_t)dname, &val)) { 00263 return ST_CONTINUE; 00264 } 00265 00266 q = ALLOC(search_path_queue_t); 00267 q->enc = dname; 00268 q->next = NULL; 00269 *bfs->queue_last_ptr = q; 00270 bfs->queue_last_ptr = &q->next; 00271 00272 st_add_direct(bfs->visited, (st_data_t)dname, (st_data_t)bfs->base_enc); 00273 return ST_CONTINUE; 00274 } 00275 /* Breadth-first search for a chain of registered transcoders from sname to dname. On success the callback is invoked once per hop, starting with the hop that ends at dname and with depth counting down to 0, and the hop count is returned. Returns -1 when the two names are equal or no chain exists. */ 00276 static int 00277 transcode_search_path(const char *sname, const char *dname, 00278 void (*callback)(const char *sname, const char *dname, int depth, void *arg), 00279 void *arg) 00280 { 00281 search_path_bfs_t bfs; 00282 search_path_queue_t *q; 00283 st_data_t val; 00284 st_table *table2; 00285 int 
found; 00286 int pathlen = -1; 00287 00288 if (encoding_equal(sname, dname)) 00289 return -1; 00290 /* seed the queue and the visited table with the start encoding; its predecessor is NULL */ 00291 q = ALLOC(search_path_queue_t); 00292 q->enc = sname; 00293 q->next = NULL; 00294 bfs.queue_last_ptr = &q->next; 00295 bfs.queue = q; 00296 00297 bfs.visited = st_init_strcasetable(); 00298 st_add_direct(bfs.visited, (st_data_t)sname, (st_data_t)NULL); 00299 00300 while (bfs.queue) { /* pop the head; when the queue becomes empty the append slot goes back to the head pointer */ 00301 q = bfs.queue; 00302 bfs.queue = q->next; 00303 if (!bfs.queue) 00304 bfs.queue_last_ptr = &bfs.queue; 00305 /* nothing is registered with this encoding as source */ 00306 if (!st_lookup(transcoder_table, (st_data_t)q->enc, &val)) { 00307 xfree(q); 00308 continue; 00309 } 00310 table2 = (st_table *)val; 00311 /* a transcoder straight to dname exists: record the predecessor and stop searching */ 00312 if (st_lookup(table2, (st_data_t)dname, &val)) { 00313 st_add_direct(bfs.visited, (st_data_t)dname, (st_data_t)q->enc); 00314 xfree(q); 00315 found = 1; 00316 goto cleanup; 00317 } 00318 /* otherwise enqueue every not-yet-visited destination reachable from q->enc */ 00319 bfs.base_enc = q->enc; 00320 st_foreach(table2, transcode_search_path_i, (st_data_t)&bfs); 00321 bfs.base_enc = NULL; 00322 00323 xfree(q); 00324 } 00325 found = 0; 00326 00327 cleanup: /* free the nodes still waiting in the queue */ 00328 while (bfs.queue) { 00329 q = bfs.queue; 00330 bfs.queue = q->next; 00331 xfree(q); 00332 } 00333 00334 if (found) { 00335 const char *enc = dname; 00336 int depth; 00337 pathlen = 0; /* first walk: count hops by following predecessors back until the NULL stored for sname */ 00338 while (1) { 00339 st_lookup(bfs.visited, (st_data_t)enc, &val); 00340 if (!val) 00341 break; 00342 pathlen++; 00343 enc = (const char *)val; 00344 } 00345 depth = pathlen; 00346 enc = dname; /* second walk: report each hop, last hop first, with depth pathlen-1 down to 0 */ 00347 while (1) { 00348 st_lookup(bfs.visited, (st_data_t)enc, &val); 00349 if (!val) 00350 break; 00351 callback((const char *)val, enc, --depth, arg); 00352 enc = (const char *)val; 00353 } 00354 } 00355 00356 st_free_table(bfs.visited); 00357 00358 return pathlen; /* is -1 if not found */ 00359 } 00360 /* Return the transcoder for entry. If none is attached yet and a library was declared, that library is required first; returns NULL when the require fails or no transcoder is attached afterwards. */ 00361 static const rb_transcoder * 00362 load_transcoder_entry(transcoder_entry_t *entry) 00363 { 00364 if (entry->transcoder) 00365 return entry->transcoder; 00366 00367 if (entry->lib) { 00368 const char *const lib = entry->lib; 00369 const size_t len = strlen(lib); 
00370 const size_t total_len = sizeof(transcoder_lib_prefix) - 1 + len; 00371 const VALUE fn = rb_str_new(0, total_len); 00372 char *const path = RSTRING_PTR(fn); 00373 const int safe = rb_safe_level(); 00374 /* lib is cleared before the require, so the load is attempted at most once per entry */ 00375 entry->lib = NULL; 00376 /* build the path: prefix followed by the library name */ 00377 memcpy(path, transcoder_lib_prefix, sizeof(transcoder_lib_prefix) - 1); 00378 memcpy(path + sizeof(transcoder_lib_prefix) - 1, lib, len); 00379 rb_str_set_len(fn, total_len); 00380 FL_UNSET(fn, FL_TAINT|FL_UNTRUSTED); 00381 OBJ_FREEZE(fn); /* require with the safe level capped at 3 */ 00382 if (!rb_require_safe(fn, safe > 3 ? 3 : safe)) 00383 return NULL; 00384 } 00385 /* NOTE(review): the required library is presumably expected to call rb_register_transcoder for this entry - confirm */ 00386 if (entry->transcoder) 00387 return entry->transcoder; 00388 00389 return NULL; 00390 } 00391 /* Default replacement bytes for a destination encoding: the three bytes EF BF BD (U+FFFD in UTF-8) when encname is UTF-8, otherwise a single question mark labelled US-ASCII. The byte length and the encoding name of the replacement are stored through len_ret and repl_encname_ptr. */ 00392 static const char* 00393 get_replacement_character(const char *encname, size_t *len_ret, const char **repl_encname_ptr) 00394 { 00395 if (encoding_equal(encname, "UTF-8")) { 00396 *len_ret = 3; 00397 *repl_encname_ptr = "UTF-8"; 00398 return "\xEF\xBF\xBD"; 00399 } 00400 else { 00401 *len_ret = 1; 00402 *repl_encname_ptr = "US-ASCII"; 00403 return "?"; 00404 } 00405 } 00406 00407 /* 00408 * Transcoding engine logic 00409 */ 00410 /* Return a pointer to the first byte of the character being processed and store its length in char_len_ptr. When fewer than recognized_len bytes precede inchar_start in this input, the earlier bytes are in the read buffer: the new bytes are appended there and the buffer is returned. Otherwise the character is contiguous in the input and a pointer into it is returned. */ 00411 static const unsigned char * 00412 transcode_char_start(rb_transcoding *tc, 00413 const unsigned char *in_start, 00414 const unsigned char *inchar_start, 00415 const unsigned char *in_p, 00416 size_t *char_len_ptr) 00417 { 00418 const unsigned char *ptr; 00419 if (inchar_start - in_start < tc->recognized_len) { 00420 MEMCPY(TRANSCODING_READBUF(tc) + tc->recognized_len, 00421 inchar_start, unsigned char, in_p - inchar_start); 00422 ptr = TRANSCODING_READBUF(tc); 00423 } 00424 else { 00425 ptr = inchar_start - tc->recognized_len; 00426 } 00427 *char_len_ptr = tc->recognized_len + (in_p - inchar_start); 00428 return ptr; 00429 } 00430 /* Core resumable conversion routine. Reads bytes from *in_pos up to in_stop, writes to *out_pos up to out_stop, and advances both positions. All state needed to continue is kept in tc, so after any SUSPEND the next call resumes at the recorded resume_position. */ 00431 static rb_econv_result_t 00432 transcode_restartable0(const unsigned char **in_pos, unsigned char **out_pos, 00433 const unsigned char *in_stop, unsigned char *out_stop, 00434 rb_transcoding *tc, 00435 const int opt) 00436 { 00437 
const rb_transcoder *tr = tc->transcoder; 00438 int unitlen = tr->input_unit_length; 00439 ssize_t readagain_len = 0; 00440 00441 const unsigned char *inchar_start; 00442 const unsigned char *in_p; 00443 00444 unsigned char *out_p; 00445 00446 in_p = inchar_start = *in_pos; 00447 00448 out_p = *out_pos; 00449 /* SUSPEND records the resume point, copies the consumed bytes of the current character into the read buffer, publishes the in/out positions, returns ret, and defines the label that the switch below jumps to on the next call. */ 00450 #define SUSPEND(ret, num) \ 00451 do { \ 00452 tc->resume_position = (num); \ 00453 if (0 < in_p - inchar_start) \ 00454 MEMMOVE(TRANSCODING_READBUF(tc)+tc->recognized_len, \ 00455 inchar_start, unsigned char, in_p - inchar_start); \ 00456 *in_pos = in_p; \ 00457 *out_pos = out_p; \ 00458 tc->recognized_len += in_p - inchar_start; \ 00459 if (readagain_len) { \ 00460 tc->recognized_len -= readagain_len; \ 00461 tc->readagain_len = readagain_len; \ 00462 } \ 00463 return (ret); \ 00464 resume_label ## num:; \ 00465 } while (0) 00466 #define SUSPEND_OBUF(num) \ 00467 do { \ 00468 while (out_stop - out_p < 1) { SUSPEND(econv_destination_buffer_full, num); } \ 00469 } while (0) 00470 00471 #define SUSPEND_AFTER_OUTPUT(num) \ 00472 if ((opt & ECONV_AFTER_OUTPUT) && *out_pos != out_p) { \ 00473 SUSPEND(econv_after_output, num); \ 00474 } 00475 00476 #define next_table (tc->next_table) 00477 #define next_info (tc->next_info) 00478 #define next_byte (tc->next_byte) 00479 #define writebuf_len (tc->writebuf_len) 00480 #define writebuf_off (tc->writebuf_off) 00481 /* re-entry dispatch: continue where the previous call suspended */ 00482 switch (tc->resume_position) { 00483 case 0: break; 00484 case 1: goto resume_label1; 00485 case 2: goto resume_label2; 00486 case 3: goto resume_label3; 00487 case 4: goto resume_label4; 00488 case 5: goto resume_label5; 00489 case 6: goto resume_label6; 00490 case 7: goto resume_label7; 00491 case 8: goto resume_label8; 00492 case 9: goto resume_label9; 00493 case 10: goto resume_label10; 00494 case 11: goto resume_label11; 00495 case 12: goto resume_label12; 00496 case 13: goto resume_label13; 00497 case 14: goto resume_label14; 00498 case 15: goto resume_label15; 00499 case 16: goto 
resume_label16; 00500 case 17: goto resume_label17; 00501 case 18: goto resume_label18; 00502 case 19: goto resume_label19; 00503 case 20: goto resume_label20; 00504 case 21: goto resume_label21; 00505 case 22: goto resume_label22; 00506 case 23: goto resume_label23; 00507 case 24: goto resume_label24; 00508 case 25: goto resume_label25; 00509 case 26: goto resume_label26; 00510 case 27: goto resume_label27; 00511 case 28: goto resume_label28; 00512 case 29: goto resume_label29; 00513 case 30: goto resume_label30; 00514 case 31: goto resume_label31; 00515 case 32: goto resume_label32; 00516 case 33: goto resume_label33; 00517 case 34: goto resume_label34; 00518 } 00519 /* main loop: each iteration starts a new input character at the root lookup table */ 00520 while (1) { 00521 inchar_start = in_p; 00522 tc->recognized_len = 0; 00523 next_table = tr->conv_tree_start; 00524 00525 SUSPEND_AFTER_OUTPUT(24); 00526 /* input exhausted: leave the loop unless the caller said more input may follow */ 00527 if (in_stop <= in_p) { 00528 if (!(opt & ECONV_PARTIAL_INPUT)) 00529 break; 00530 SUSPEND(econv_source_buffer_empty, 7); 00531 continue; 00532 } 00533 00534 #define BYTE_ADDR(index) (tr->byte_array + (index)) 00535 #define WORD_ADDR(index) (tr->word_array + INFO2WORDINDEX(index)) 00536 #define BL_BASE BYTE_ADDR(BYTE_LOOKUP_BASE(WORD_ADDR(next_table))) 00537 #define BL_INFO WORD_ADDR(BYTE_LOOKUP_INFO(WORD_ADDR(next_table))) 00538 #define BL_MIN_BYTE (BL_BASE[0]) 00539 #define BL_MAX_BYTE (BL_BASE[1]) 00540 #define BL_OFFSET(byte) (BL_BASE[2+(byte)-BL_MIN_BYTE]) 00541 #define BL_ACTION(byte) (BL_INFO[BL_OFFSET((byte))]) 00542 00543 next_byte = (unsigned char)*in_p++; 00544 follow_byte: /* a byte outside the min..max range of the current table is invalid; otherwise look up its action */ 00545 if (next_byte < BL_MIN_BYTE || BL_MAX_BYTE < next_byte) 00546 next_info = INVALID; 00547 else { 00548 next_info = (VALUE)BL_ACTION(next_byte); 00549 } 00550 follow_info: 00551 switch (next_info & 0x1F) { 00552 case NOMAP: /* copy the input character to the output unchanged, staging it in the write buffer */ 00553 { 00554 const unsigned char *p = inchar_start; 00555 writebuf_off = 0; 00556 while (p < in_p) { 00557 TRANSCODING_WRITEBUF(tc)[writebuf_off++] = (unsigned char)*p++; 00558 } 00559 writebuf_len = writebuf_off; 00560 writebuf_off = 
0; 00561 while (writebuf_off < writebuf_len) { 00562 SUSPEND_OBUF(3); 00563 *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++]; 00564 } 00565 } 00566 continue; 00567 /* low two bits of next_info clear: it names the next lookup table, so read one more byte and follow it */ case 0x00: case 0x04: case 0x08: case 0x0C: 00568 case 0x10: case 0x14: case 0x18: case 0x1C: 00569 SUSPEND_AFTER_OUTPUT(25); 00570 while (in_p >= in_stop) { 00571 if (!(opt & ECONV_PARTIAL_INPUT)) 00572 goto incomplete; 00573 SUSPEND(econv_source_buffer_empty, 5); 00574 } 00575 next_byte = (unsigned char)*in_p++; 00576 next_table = (unsigned int)next_info; 00577 goto follow_byte; 00578 case ZERObt: /* drop input */ 00579 continue; 00580 case ONEbt: 00581 SUSPEND_OBUF(9); *out_p++ = getBT1(next_info); 00582 continue; 00583 case TWObt: 00584 SUSPEND_OBUF(10); *out_p++ = getBT1(next_info); 00585 SUSPEND_OBUF(21); *out_p++ = getBT2(next_info); 00586 continue; 00587 case THREEbt: 00588 SUSPEND_OBUF(11); *out_p++ = getBT1(next_info); 00589 SUSPEND_OBUF(15); *out_p++ = getBT2(next_info); 00590 SUSPEND_OBUF(16); *out_p++ = getBT3(next_info); 00591 continue; 00592 case FOURbt: 00593 SUSPEND_OBUF(12); *out_p++ = getBT0(next_info); 00594 SUSPEND_OBUF(17); *out_p++ = getBT1(next_info); 00595 SUSPEND_OBUF(18); *out_p++ = getBT2(next_info); 00596 SUSPEND_OBUF(19); *out_p++ = getBT3(next_info); 00597 continue; 00598 case GB4bt: 00599 SUSPEND_OBUF(29); *out_p++ = getGB4bt0(next_info); 00600 SUSPEND_OBUF(30); *out_p++ = getGB4bt1(next_info); 00601 SUSPEND_OBUF(31); *out_p++ = getGB4bt2(next_info); 00602 SUSPEND_OBUF(32); *out_p++ = getGB4bt3(next_info); 00603 continue; 00604 case STR1: /* output_index lives in tc so the byte-by-byte copy survives a suspend */ 00605 tc->output_index = 0; 00606 while (tc->output_index < STR1_LENGTH(BYTE_ADDR(STR1_BYTEINDEX(next_info)))) { 00607 SUSPEND_OBUF(28); *out_p++ = BYTE_ADDR(STR1_BYTEINDEX(next_info))[1+tc->output_index]; 00608 tc->output_index++; 00609 } 00610 continue; 00611 case FUNii: /* the callback result becomes the new next_info and is dispatched again */ 00612 next_info = (VALUE)(*tr->func_ii)(TRANSCODING_STATE(tc), next_info); 00613 goto follow_info; 00614 case FUNsi: 00615 { 00616 const unsigned char *char_start; 
00617 size_t char_len; 00618 char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len); 00619 next_info = (VALUE)(*tr->func_si)(TRANSCODING_STATE(tc), char_start, (size_t)char_len); 00620 goto follow_info; 00621 } 00622 case FUNio: /* FUNio, FUNso and FUNsio share one shape: write straight to the output when max_output bytes fit, else write to the write buffer and drain it byte by byte */ 00623 SUSPEND_OBUF(13); 00624 if (tr->max_output <= out_stop - out_p) 00625 out_p += tr->func_io(TRANSCODING_STATE(tc), 00626 next_info, out_p, out_stop - out_p); 00627 else { 00628 writebuf_len = tr->func_io(TRANSCODING_STATE(tc), 00629 next_info, 00630 TRANSCODING_WRITEBUF(tc), TRANSCODING_WRITEBUF_SIZE(tc)); 00631 writebuf_off = 0; 00632 while (writebuf_off < writebuf_len) { 00633 SUSPEND_OBUF(20); 00634 *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++]; 00635 } 00636 } 00637 break; 00638 case FUNso: 00639 { 00640 const unsigned char *char_start; 00641 size_t char_len; 00642 SUSPEND_OBUF(14); 00643 if (tr->max_output <= out_stop - out_p) { 00644 char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len); 00645 out_p += tr->func_so(TRANSCODING_STATE(tc), 00646 char_start, (size_t)char_len, 00647 out_p, out_stop - out_p); 00648 } 00649 else { 00650 char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len); 00651 writebuf_len = tr->func_so(TRANSCODING_STATE(tc), 00652 char_start, (size_t)char_len, 00653 TRANSCODING_WRITEBUF(tc), TRANSCODING_WRITEBUF_SIZE(tc)); 00654 writebuf_off = 0; 00655 while (writebuf_off < writebuf_len) { 00656 SUSPEND_OBUF(22); 00657 *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++]; 00658 } 00659 } 00660 break; 00661 } 00662 case FUNsio: 00663 { 00664 const unsigned char *char_start; 00665 size_t char_len; 00666 SUSPEND_OBUF(33); 00667 if (tr->max_output <= out_stop - out_p) { 00668 char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len); 00669 out_p += tr->func_sio(TRANSCODING_STATE(tc), 00670 char_start, (size_t)char_len, next_info, 00671 out_p, out_stop - out_p); 00672 } 00673 else { 00674 char_start = transcode_char_start(tc, 
*in_pos, inchar_start, in_p, &char_len); 00675 writebuf_len = tr->func_sio(TRANSCODING_STATE(tc), 00676 char_start, (size_t)char_len, next_info, 00677 TRANSCODING_WRITEBUF(tc), TRANSCODING_WRITEBUF_SIZE(tc)); 00678 writebuf_off = 0; 00679 while (writebuf_off < writebuf_len) { 00680 SUSPEND_OBUF(34); 00681 *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++]; 00682 } 00683 } 00684 break; 00685 } 00686 case INVALID: /* at most one input unit seen so far: extend the reported bytes up to a whole unit (or to the end of input); otherwise hand the bytes past the last whole unit back as readagain */ 00687 if (tc->recognized_len + (in_p - inchar_start) <= unitlen) { 00688 if (tc->recognized_len + (in_p - inchar_start) < unitlen) 00689 SUSPEND_AFTER_OUTPUT(26); 00690 while ((opt & ECONV_PARTIAL_INPUT) && tc->recognized_len + (in_stop - inchar_start) < unitlen) { 00691 in_p = in_stop; 00692 SUSPEND(econv_source_buffer_empty, 8); 00693 } 00694 if (tc->recognized_len + (in_stop - inchar_start) <= unitlen) { 00695 in_p = in_stop; 00696 } 00697 else { 00698 in_p = inchar_start + (unitlen - tc->recognized_len); 00699 } 00700 } 00701 else { 00702 ssize_t invalid_len; /* including the last byte which causes invalid */ 00703 ssize_t discard_len; 00704 invalid_len = tc->recognized_len + (in_p - inchar_start); 00705 discard_len = ((invalid_len - 1) / unitlen) * unitlen; 00706 readagain_len = invalid_len - discard_len; 00707 } 00708 goto invalid; 00709 case UNDEF: 00710 goto undef; 00711 default: 00712 rb_raise(rb_eRuntimeError, "unknown transcoding instruction"); 00713 } 00714 continue; 00715 /* each error exit suspends; when the caller calls again the loop goes on with the next character */ 00716 invalid: 00717 SUSPEND(econv_invalid_byte_sequence, 1); 00718 continue; 00719 00720 incomplete: 00721 SUSPEND(econv_incomplete_input, 27); 00722 continue; 00723 00724 undef: 00725 SUSPEND(econv_undefined_conversion, 2); 00726 continue; 00727 } 00728 00729 /* cleanup */ 00730 if (tr->finish_func) { 00731 SUSPEND_OBUF(4); 00732 if (tr->max_output <= out_stop - out_p) { 00733 out_p += tr->finish_func(TRANSCODING_STATE(tc), 00734 out_p, out_stop - out_p); 00735 } 00736 else { 00737 writebuf_len = tr->finish_func(TRANSCODING_STATE(tc), 00738 TRANSCODING_WRITEBUF(tc), 
TRANSCODING_WRITEBUF_SIZE(tc)); 00739 writebuf_off = 0; 00740 while (writebuf_off < writebuf_len) { 00741 SUSPEND_OBUF(23); 00742 *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++]; 00743 } 00744 } 00745 } /* once this point is reached, every further call resumes inside the loop and reports econv_finished again */ 00746 while (1) 00747 SUSPEND(econv_finished, 6); 00748 #undef SUSPEND 00749 #undef next_table 00750 #undef next_info 00751 #undef next_byte 00752 #undef writebuf_len 00753 #undef writebuf_off 00754 } 00755 /* Wrapper around transcode_restartable0. If an earlier call left readagain bytes in the read buffer, they are copied to a stack buffer and replayed first, with ECONV_PARTIAL_INPUT forced on. Any result other than econv_source_buffer_empty is returned at once, after the unreplayed bytes are put back as readagain. Otherwise conversion continues with the caller input. */ 00756 static rb_econv_result_t 00757 transcode_restartable(const unsigned char **in_pos, unsigned char **out_pos, 00758 const unsigned char *in_stop, unsigned char *out_stop, 00759 rb_transcoding *tc, 00760 const int opt) 00761 { 00762 if (tc->readagain_len) { 00763 unsigned char *readagain_buf = ALLOCA_N(unsigned char, tc->readagain_len); 00764 const unsigned char *readagain_pos = readagain_buf; 00765 const unsigned char *readagain_stop = readagain_buf + tc->readagain_len; 00766 rb_econv_result_t res; 00767 00768 MEMCPY(readagain_buf, TRANSCODING_READBUF(tc) + tc->recognized_len, 00769 unsigned char, tc->readagain_len); 00770 tc->readagain_len = 0; 00771 res = transcode_restartable0(&readagain_pos, out_pos, readagain_stop, out_stop, tc, opt|ECONV_PARTIAL_INPUT); 00772 if (res != econv_source_buffer_empty) { 00773 MEMCPY(TRANSCODING_READBUF(tc) + tc->recognized_len + tc->readagain_len, 00774 readagain_pos, unsigned char, readagain_stop - readagain_pos); 00775 tc->readagain_len += readagain_stop - readagain_pos; 00776 return res; 00777 } 00778 } 00779 return transcode_restartable0(in_pos, out_pos, in_stop, out_stop, tc, opt); 00780 } 00781 /* Allocate and initialise a transcoding for tr. The state area, read buffer and write buffer are heap allocated only when the transcoder needs more than the embedded arrays hold. */ 00782 static rb_transcoding * 00783 rb_transcoding_open_by_transcoder(const rb_transcoder *tr, int flags) 00784 { 00785 rb_transcoding *tc; 00786 00787 tc = ALLOC(rb_transcoding); 00788 tc->transcoder = tr; 00789 tc->flags = flags; 00790 if (TRANSCODING_STATE_EMBED_MAX < tr->state_size) 00791 tc->state.ptr = xmalloc(tr->state_size); 00792 if (tr->state_init_func) { 00793 (tr->state_init_func)(TRANSCODING_STATE(tc)); /* xxx: 
check return value */ 00794 } 00795 tc->resume_position = 0; 00796 tc->recognized_len = 0; 00797 tc->readagain_len = 0; 00798 tc->writebuf_len = 0; 00799 tc->writebuf_off = 0; 00800 if ((int)sizeof(tc->readbuf.ary) < tr->max_input) { 00801 tc->readbuf.ptr = xmalloc(tr->max_input); 00802 } 00803 if ((int)sizeof(tc->writebuf.ary) < tr->max_output) { 00804 tc->writebuf.ptr = xmalloc(tr->max_output); 00805 } 00806 return tc; 00807 } 00808 /* Thin wrapper that forwards to transcode_restartable; only the argument order differs. */ 00809 static rb_econv_result_t 00810 rb_transcoding_convert(rb_transcoding *tc, 00811 const unsigned char **input_ptr, const unsigned char *input_stop, 00812 unsigned char **output_ptr, unsigned char *output_stop, 00813 int flags) 00814 { 00815 return transcode_restartable( 00816 input_ptr, output_ptr, 00817 input_stop, output_stop, 00818 tc, flags); 00819 } 00820 /* Release a transcoding: run the state finaliser of the transcoder if it has one, free the buffers that were allocated out of line, then free tc itself. */ 00821 static void 00822 rb_transcoding_close(rb_transcoding *tc) 00823 { 00824 const rb_transcoder *tr = tc->transcoder; 00825 if (tr->state_fini_func) { 00826 (tr->state_fini_func)(TRANSCODING_STATE(tc)); /* check return value? 
*/ 00827 } 00828 if (TRANSCODING_STATE_EMBED_MAX < tr->state_size) 00829 xfree(tc->state.ptr); 00830 if ((int)sizeof(tc->readbuf.ary) < tr->max_input) 00831 xfree(tc->readbuf.ptr); 00832 if ((int)sizeof(tc->writebuf.ary) < tr->max_output) 00833 xfree(tc->writebuf.ptr); 00834 xfree(tc); 00835 } 00836 /* Bytes held by tc: the struct itself plus any state, read buffer and write buffer that were allocated out of line. */ 00837 static size_t 00838 rb_transcoding_memsize(rb_transcoding *tc) 00839 { 00840 size_t size = sizeof(rb_transcoding); 00841 const rb_transcoder *tr = tc->transcoder; 00842 00843 if (TRANSCODING_STATE_EMBED_MAX < tr->state_size) { 00844 size += tr->state_size; 00845 } 00846 if ((int)sizeof(tc->readbuf.ary) < tr->max_input) { 00847 size += tr->max_input; 00848 } 00849 if ((int)sizeof(tc->writebuf.ary) < tr->max_output) { 00850 size += tr->max_output; 00851 } 00852 return size; 00853 } 00854 /* Allocate an empty converter with room for n_hint stages (at least 1). Counters start at 0, pointers at NULL, and last_error.result at econv_source_buffer_empty. */ 00855 static rb_econv_t * 00856 rb_econv_alloc(int n_hint) 00857 { 00858 rb_econv_t *ec; 00859 00860 if (n_hint <= 0) 00861 n_hint = 1; 00862 00863 ec = ALLOC(rb_econv_t); 00864 ec->flags = 0; 00865 ec->source_encoding_name = NULL; 00866 ec->destination_encoding_name = NULL; 00867 ec->started = 0; 00868 ec->replacement_str = NULL; 00869 ec->replacement_len = 0; 00870 ec->replacement_enc = NULL; 00871 ec->replacement_allocated = 0; 00872 ec->in_buf_start = NULL; 00873 ec->in_data_start = NULL; 00874 ec->in_data_end = NULL; 00875 ec->in_buf_end = NULL; 00876 ec->num_allocated = n_hint; 00877 ec->num_trans = 0; 00878 ec->elems = ALLOC_N(rb_econv_elem_t, ec->num_allocated); 00879 ec->num_finished = 0; 00880 ec->last_tc = NULL; 00881 ec->last_error.result = econv_source_buffer_empty; 00882 ec->last_error.error_tc = NULL; 00883 ec->last_error.source_encoding = NULL; 00884 ec->last_error.destination_encoding = NULL; 00885 ec->last_error.error_bytes_start = NULL; 00886 ec->last_error.error_bytes_len = 0; 00887 ec->last_error.readagain_len = 0; 00888 ec->source_encoding = NULL; 00889 ec->destination_encoding = NULL; 00890 return ec; 00891 } 00892 /* Insert a new stage for tr at index i of ec. Later stages are shifted up and elems is doubled when full; the new stage gets its own 4096-byte output buffer. If tr is not a decorator, last_tc is updated to the last non-decorator stage. Always returns 0. */ 00893 static int 00894 
rb_econv_add_transcoder_at(rb_econv_t *ec, const rb_transcoder *tr, int i) 00895 { 00896 int n, j; 00897 int bufsize = 4096; 00898 unsigned char *p; 00899 /* grow the stage array by doubling when it is full */ 00900 if (ec->num_trans == ec->num_allocated) { 00901 n = ec->num_allocated * 2; 00902 REALLOC_N(ec->elems, rb_econv_elem_t, n); 00903 ec->num_allocated = n; 00904 } 00905 00906 p = xmalloc(bufsize); 00907 /* open a slot at index i by shifting the stages from i onward up by one */ 00908 MEMMOVE(ec->elems+i+1, ec->elems+i, rb_econv_elem_t, ec->num_trans-i); 00909 00910 ec->elems[i].tc = rb_transcoding_open_by_transcoder(tr, 0); 00911 ec->elems[i].out_buf_start = p; 00912 ec->elems[i].out_buf_end = p + bufsize; 00913 ec->elems[i].out_data_start = p; 00914 ec->elems[i].out_data_end = p; 00915 ec->elems[i].last_result = econv_source_buffer_empty; 00916 00917 ec->num_trans++; 00918 /* scan from the end down to i for the last stage that is not a decorator and remember it in last_tc */ 00919 if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding)) 00920 for (j = ec->num_trans-1; i <= j; j--) { 00921 rb_transcoding *tc = ec->elems[j].tc; 00922 const rb_transcoder *tr2 = tc->transcoder; 00923 if (!DECORATOR_P(tr2->src_encoding, tr2->dst_encoding)) { 00924 ec->last_tc = tc; 00925 break; 00926 } 00927 } 00928 00929 return 0; 00930 } 00931 /* Build a converter whose stages are the n given entries, in order. Every entry is loaded up front, and NULL is returned before anything is allocated if one of them cannot be loaded. */ 00932 static rb_econv_t * 00933 rb_econv_open_by_transcoder_entries(int n, transcoder_entry_t **entries) 00934 { 00935 rb_econv_t *ec; 00936 int i, ret; 00937 00938 for (i = 0; i < n; i++) { 00939 const rb_transcoder *tr; 00940 tr = load_transcoder_entry(entries[i]); 00941 if (!tr) 00942 return NULL; 00943 } 00944 00945 ec = rb_econv_alloc(n); 00946 00947 for (i = 0; i < n; i++) { 00948 const rb_transcoder *tr = load_transcoder_entry(entries[i]); 00949 ret = rb_econv_add_transcoder_at(ec, tr, ec->num_trans); 00950 if (ret == -1) { 00951 rb_econv_close(ec); 00952 return NULL; 00953 } 00954 } 00955 00956 return ec; 00957 } 00958 /* Callback state for transcode_search_path: entries is allocated lazily by trans_open_i; num_additional reserves extra slots beyond the path length. */ 00959 struct trans_open_t { 00960 transcoder_entry_t **entries; 00961 int num_additional; 00962 }; 00963 /* Path callback: store the registry entry for hop number depth. transcode_search_path reports the deepest hop first, so the first call sizes the array as depth+1+num_additional. */ 00964 static void 00965 trans_open_i(const char *sname, const char *dname, int depth, void *arg) 00966 { 00967 struct trans_open_t *toarg = 
arg; 00968 00969 if (!toarg->entries) { 00970 toarg->entries = ALLOC_N(transcoder_entry_t *, depth+1+toarg->num_additional); 00971 } 00972 toarg->entries[depth] = get_transcoder_entry(sname, dname); 00973 } 00974 /* Open a converter from sname to dname without decorators. Two empty names give a converter with no stages. Returns NULL when no conversion path exists or a transcoder cannot be loaded. The sname and dname pointers are stored in the converter, not copied. */ 00975 static rb_econv_t * 00976 rb_econv_open0(const char *sname, const char *dname, int ecflags) 00977 { 00978 transcoder_entry_t **entries = NULL; 00979 int num_trans; 00980 rb_econv_t *ec; 00981 00982 int sidx, didx; 00983 /* NOTE(review): the results of the two rb_enc_from_index calls below are discarded; presumably they are made for a side effect such as loading the encoding - confirm */ 00984 if (*sname) { 00985 sidx = rb_enc_find_index(sname); 00986 if (0 <= sidx) { 00987 rb_enc_from_index(sidx); 00988 } 00989 } 00990 00991 if (*dname) { 00992 didx = rb_enc_find_index(dname); 00993 if (0 <= didx) { 00994 rb_enc_from_index(didx); 00995 } 00996 } 00997 00998 if (*sname == '\0' && *dname == '\0') { 00999 num_trans = 0; 01000 entries = NULL; 01001 } 01002 else { 01003 struct trans_open_t toarg; 01004 toarg.entries = NULL; 01005 toarg.num_additional = 0; 01006 num_trans = transcode_search_path(sname, dname, trans_open_i, (void *)&toarg); 01007 entries = toarg.entries; 01008 if (num_trans < 0) { 01009 xfree(entries); 01010 return NULL; 01011 } 01012 } 01013 /* the entries array is only needed while the stages are being built */ 01014 ec = rb_econv_open_by_transcoder_entries(num_trans, entries); 01015 xfree(entries); 01016 if (!ec) 01017 return NULL; 01018 01019 ec->flags = ecflags; 01020 ec->source_encoding_name = sname; 01021 ec->destination_encoding_name = dname; 01022 01023 return ec; 01024 } 01025 /* Capacity of the decorator-name array that rb_econv_open passes to decorator_names. */ 01026 #define MAX_ECFLAGS_DECORATORS 32 01027 /* Translate the decorator bits of ecflags into transcoder names stored in decorators_ret and return how many were stored. Returns -1 when the newline bits are not exactly one of the three supported decorators (or none), or when both XML text and XML attribute-content escaping are requested. */ 01028 static int 01029 decorator_names(int ecflags, const char **decorators_ret) 01030 { 01031 int num_decorators; 01032 01033 switch (ecflags & ECONV_NEWLINE_DECORATOR_MASK) { 01034 case ECONV_UNIVERSAL_NEWLINE_DECORATOR: 01035 case ECONV_CRLF_NEWLINE_DECORATOR: 01036 case ECONV_CR_NEWLINE_DECORATOR: 01037 case 0: 01038 break; 01039 default: 01040 return -1; 01041 } 01042 01043 if ((ecflags & ECONV_XML_TEXT_DECORATOR) && 01044 (ecflags & ECONV_XML_ATTR_CONTENT_DECORATOR)) 01045 return -1; 01046 01047 num_decorators = 0; 01048 01049 if (ecflags & 
ECONV_XML_TEXT_DECORATOR) 01050 decorators_ret[num_decorators++] = "xml_text_escape"; 01051 if (ecflags & ECONV_XML_ATTR_CONTENT_DECORATOR) 01052 decorators_ret[num_decorators++] = "xml_attr_content_escape"; 01053 if (ecflags & ECONV_XML_ATTR_QUOTE_DECORATOR) 01054 decorators_ret[num_decorators++] = "xml_attr_quote"; 01055 01056 if (ecflags & ECONV_CRLF_NEWLINE_DECORATOR) 01057 decorators_ret[num_decorators++] = "crlf_newline"; 01058 if (ecflags & ECONV_CR_NEWLINE_DECORATOR) 01059 decorators_ret[num_decorators++] = "cr_newline"; 01060 if (ecflags & ECONV_UNIVERSAL_NEWLINE_DECORATOR) 01061 decorators_ret[num_decorators++] = "universal_newline"; 01062 01063 return num_decorators; 01064 } 01065 /* Public open. Validates the decorator bits, opens the base converter with only the error-handler bits of ecflags, appends each requested decorator at the end of the chain, then ORs the remaining flag bits into ec->flags. Returns NULL on any failure; a partly built converter is closed first. */ 01066 rb_econv_t * 01067 rb_econv_open(const char *sname, const char *dname, int ecflags) 01068 { 01069 rb_econv_t *ec; 01070 int num_decorators; 01071 const char *decorators[MAX_ECFLAGS_DECORATORS]; 01072 int i; 01073 01074 num_decorators = decorator_names(ecflags, decorators); 01075 if (num_decorators == -1) 01076 return NULL; 01077 01078 ec = rb_econv_open0(sname, dname, ecflags & ECONV_ERROR_HANDLER_MASK); 01079 if (!ec) 01080 return NULL; 01081 01082 for (i = 0; i < num_decorators; i++) 01083 if (rb_econv_decorate_at_last(ec, decorators[i]) == -1) { 01084 rb_econv_close(ec); 01085 return NULL; 01086 } 01087 01088 ec->flags |= ecflags & ~ECONV_ERROR_HANDLER_MASK; 01089 01090 return ec; 01091 } 01092 /* Run the stages of ec from index start onward, repeating the pass for as long as some stage consumed input or produced output. Stage 0 reads the caller input and the last stage writes the caller output; every other hand-off goes through the stage buffers. Returns the index of the first stage reporting invalid byte sequence, incomplete input, undefined conversion or after_output, or -1 when a full pass makes no progress. */ 01093 static int 01094 trans_sweep(rb_econv_t *ec, 01095 const unsigned char **input_ptr, const unsigned char *input_stop, 01096 unsigned char **output_ptr, unsigned char *output_stop, 01097 int flags, 01098 int start) 01099 { 01100 int try; 01101 int i, f; 01102 01103 const unsigned char **ipp, *is, *iold; 01104 unsigned char **opp, *os, *oold; 01105 rb_econv_result_t res; 01106 01107 try = 1; 01108 while (try) { 01109 try = 0; 01110 for (i = start; i < ec->num_trans; i++) { 01111 rb_econv_elem_t *te = &ec->elems[i]; 01112 /* input: the caller buffer for stage 0, else the pending output of the previous stage */ 01113 if (i == 0) { 01114 ipp = input_ptr; 01115 is = 
input_stop; 01116 } 01117 else { 01118 rb_econv_elem_t *prev_te = &ec->elems[i-1]; 01119 ipp = (const unsigned char **)&prev_te->out_data_start; 01120 is = prev_te->out_data_end; 01121 } 01122 /* output: the caller buffer for the last stage, else the buffer of this stage */ 01123 if (i == ec->num_trans-1) { 01124 opp = output_ptr; 01125 os = output_stop; 01126 } 01127 else { /* slide pending data to the front of the stage buffer to make room at the end */ 01128 if (te->out_buf_start != te->out_data_start) { 01129 ssize_t len = te->out_data_end - te->out_data_start; 01130 ssize_t off = te->out_data_start - te->out_buf_start; 01131 MEMMOVE(te->out_buf_start, te->out_data_start, unsigned char, len); 01132 te->out_data_start = te->out_buf_start; 01133 te->out_data_end -= off; 01134 } 01135 opp = &te->out_data_end; 01136 os = te->out_buf_end; 01137 } 01138 01139 f = flags; /* only stage number num_finished may treat its input as complete; all others get ECONV_PARTIAL_INPUT */ 01140 if (ec->num_finished != i) 01141 f |= ECONV_PARTIAL_INPUT; /* ECONV_AFTER_OUTPUT is honoured once, by stage 0; after that the sweep starts at stage 1 with the flag cleared */ 01142 if (i == 0 && (flags & ECONV_AFTER_OUTPUT)) { 01143 start = 1; 01144 flags &= ~ECONV_AFTER_OUTPUT; 01145 } 01146 if (i != 0) 01147 f &= ~ECONV_AFTER_OUTPUT; 01148 iold = *ipp; 01149 oold = *opp; 01150 te->last_result = res = rb_transcoding_convert(te->tc, ipp, is, opp, os, f); /* any movement of the input or output position means another pass is worthwhile */ 01151 if (iold != *ipp || oold != *opp) 01152 try = 1; 01153 01154 switch (res) { 01155 case econv_invalid_byte_sequence: 01156 case econv_incomplete_input: 01157 case econv_undefined_conversion: 01158 case econv_after_output: 01159 return i; 01160 01161 case econv_destination_buffer_full: 01162 case econv_source_buffer_empty: 01163 break; 01164 01165 case econv_finished: 01166 ec->num_finished = i+1; 01167 break; 01168 } 01169 } 01170 } 01171 return -1; 01172 } 01173 01174 static rb_econv_result_t 01175 rb_trans_conv(rb_econv_t *ec, 01176 const unsigned char **input_ptr, const unsigned char *input_stop, 01177 unsigned char **output_ptr, unsigned char *output_stop, 01178 int flags, 01179 int *result_position_ptr) 01180 { 01181 int i; 01182 int needreport_index; 01183 int sweep_start; 01184 01185 unsigned char empty_buf; 01186 unsigned char *empty_ptr = &empty_buf; 01187 01188 if (!input_ptr) { 01189 input_ptr = (const 
unsigned char **)&empty_ptr; 01190 input_stop = empty_ptr; 01191 } 01192 01193 if (!output_ptr) { 01194 output_ptr = &empty_ptr; 01195 output_stop = empty_ptr; 01196 } 01197 01198 if (ec->elems[0].last_result == econv_after_output) 01199 ec->elems[0].last_result = econv_source_buffer_empty; 01200 01201 needreport_index = -1; 01202 for (i = ec->num_trans-1; 0 <= i; i--) { 01203 switch (ec->elems[i].last_result) { 01204 case econv_invalid_byte_sequence: 01205 case econv_incomplete_input: 01206 case econv_undefined_conversion: 01207 case econv_after_output: 01208 case econv_finished: 01209 sweep_start = i+1; 01210 needreport_index = i; 01211 goto found_needreport; 01212 01213 case econv_destination_buffer_full: 01214 case econv_source_buffer_empty: 01215 break; 01216 01217 default: 01218 rb_bug("unexpected transcode last result"); 01219 } 01220 } 01221 01222 /* /^[sd]+$/ is confirmed. but actually /^s*d*$/. */ 01223 01224 if (ec->elems[ec->num_trans-1].last_result == econv_destination_buffer_full && 01225 (flags & ECONV_AFTER_OUTPUT)) { 01226 rb_econv_result_t res; 01227 01228 res = rb_trans_conv(ec, NULL, NULL, output_ptr, output_stop, 01229 (flags & ~ECONV_AFTER_OUTPUT)|ECONV_PARTIAL_INPUT, 01230 result_position_ptr); 01231 01232 if (res == econv_source_buffer_empty) 01233 return econv_after_output; 01234 return res; 01235 } 01236 01237 sweep_start = 0; 01238 01239 found_needreport: 01240 01241 do { 01242 needreport_index = trans_sweep(ec, input_ptr, input_stop, output_ptr, output_stop, flags, sweep_start); 01243 sweep_start = needreport_index + 1; 01244 } while (needreport_index != -1 && needreport_index != ec->num_trans-1); 01245 01246 for (i = ec->num_trans-1; 0 <= i; i--) { 01247 if (ec->elems[i].last_result != econv_source_buffer_empty) { 01248 rb_econv_result_t res = ec->elems[i].last_result; 01249 if (res == econv_invalid_byte_sequence || 01250 res == econv_incomplete_input || 01251 res == econv_undefined_conversion || 01252 res == econv_after_output) { 01253 
ec->elems[i].last_result = econv_source_buffer_empty; 01254 } 01255 if (result_position_ptr) 01256 *result_position_ptr = i; 01257 return res; 01258 } 01259 } 01260 if (result_position_ptr) 01261 *result_position_ptr = -1; 01262 return econv_source_buffer_empty; 01263 } 01264 01265 static rb_econv_result_t 01266 rb_econv_convert0(rb_econv_t *ec, 01267 const unsigned char **input_ptr, const unsigned char *input_stop, 01268 unsigned char **output_ptr, unsigned char *output_stop, 01269 int flags) 01270 { 01271 rb_econv_result_t res; 01272 int result_position; 01273 int has_output = 0; 01274 01275 memset(&ec->last_error, 0, sizeof(ec->last_error)); 01276 01277 if (ec->num_trans == 0) { 01278 size_t len; 01279 if (ec->in_buf_start && ec->in_data_start != ec->in_data_end) { 01280 if (output_stop - *output_ptr < ec->in_data_end - ec->in_data_start) { 01281 len = output_stop - *output_ptr; 01282 memcpy(*output_ptr, ec->in_data_start, len); 01283 *output_ptr = output_stop; 01284 ec->in_data_start += len; 01285 res = econv_destination_buffer_full; 01286 goto gotresult; 01287 } 01288 len = ec->in_data_end - ec->in_data_start; 01289 memcpy(*output_ptr, ec->in_data_start, len); 01290 *output_ptr += len; 01291 ec->in_data_start = ec->in_data_end = ec->in_buf_start; 01292 if (flags & ECONV_AFTER_OUTPUT) { 01293 res = econv_after_output; 01294 goto gotresult; 01295 } 01296 } 01297 if (output_stop - *output_ptr < input_stop - *input_ptr) { 01298 len = output_stop - *output_ptr; 01299 } 01300 else { 01301 len = input_stop - *input_ptr; 01302 } 01303 if (0 < len && (flags & ECONV_AFTER_OUTPUT)) { 01304 *(*output_ptr)++ = *(*input_ptr)++; 01305 res = econv_after_output; 01306 goto gotresult; 01307 } 01308 memcpy(*output_ptr, *input_ptr, len); 01309 *output_ptr += len; 01310 *input_ptr += len; 01311 if (*input_ptr != input_stop) 01312 res = econv_destination_buffer_full; 01313 else if (flags & ECONV_PARTIAL_INPUT) 01314 res = econv_source_buffer_empty; 01315 else 01316 res = 
econv_finished; 01317 goto gotresult; 01318 } 01319 01320 if (ec->elems[ec->num_trans-1].out_data_start) { 01321 unsigned char *data_start = ec->elems[ec->num_trans-1].out_data_start; 01322 unsigned char *data_end = ec->elems[ec->num_trans-1].out_data_end; 01323 if (data_start != data_end) { 01324 size_t len; 01325 if (output_stop - *output_ptr < data_end - data_start) { 01326 len = output_stop - *output_ptr; 01327 memcpy(*output_ptr, data_start, len); 01328 *output_ptr = output_stop; 01329 ec->elems[ec->num_trans-1].out_data_start += len; 01330 res = econv_destination_buffer_full; 01331 goto gotresult; 01332 } 01333 len = data_end - data_start; 01334 memcpy(*output_ptr, data_start, len); 01335 *output_ptr += len; 01336 ec->elems[ec->num_trans-1].out_data_start = 01337 ec->elems[ec->num_trans-1].out_data_end = 01338 ec->elems[ec->num_trans-1].out_buf_start; 01339 has_output = 1; 01340 } 01341 } 01342 01343 if (ec->in_buf_start && 01344 ec->in_data_start != ec->in_data_end) { 01345 res = rb_trans_conv(ec, (const unsigned char **)&ec->in_data_start, ec->in_data_end, output_ptr, output_stop, 01346 (flags&~ECONV_AFTER_OUTPUT)|ECONV_PARTIAL_INPUT, &result_position); 01347 if (res != econv_source_buffer_empty) 01348 goto gotresult; 01349 } 01350 01351 if (has_output && 01352 (flags & ECONV_AFTER_OUTPUT) && 01353 *input_ptr != input_stop) { 01354 input_stop = *input_ptr; 01355 res = rb_trans_conv(ec, input_ptr, input_stop, output_ptr, output_stop, flags, &result_position); 01356 if (res == econv_source_buffer_empty) 01357 res = econv_after_output; 01358 } 01359 else if ((flags & ECONV_AFTER_OUTPUT) || 01360 ec->num_trans == 1) { 01361 res = rb_trans_conv(ec, input_ptr, input_stop, output_ptr, output_stop, flags, &result_position); 01362 } 01363 else { 01364 flags |= ECONV_AFTER_OUTPUT; 01365 do { 01366 res = rb_trans_conv(ec, input_ptr, input_stop, output_ptr, output_stop, flags, &result_position); 01367 } while (res == econv_after_output); 01368 } 01369 01370 gotresult: 
01371 ec->last_error.result = res; 01372 if (res == econv_invalid_byte_sequence || 01373 res == econv_incomplete_input || 01374 res == econv_undefined_conversion) { 01375 rb_transcoding *error_tc = ec->elems[result_position].tc; 01376 ec->last_error.error_tc = error_tc; 01377 ec->last_error.source_encoding = error_tc->transcoder->src_encoding; 01378 ec->last_error.destination_encoding = error_tc->transcoder->dst_encoding; 01379 ec->last_error.error_bytes_start = TRANSCODING_READBUF(error_tc); 01380 ec->last_error.error_bytes_len = error_tc->recognized_len; 01381 ec->last_error.readagain_len = error_tc->readagain_len; 01382 } 01383 01384 return res; 01385 } 01386 01387 static int output_replacement_character(rb_econv_t *ec); 01388 01389 static int 01390 output_hex_charref(rb_econv_t *ec) 01391 { 01392 int ret; 01393 unsigned char utfbuf[1024]; 01394 const unsigned char *utf; 01395 size_t utf_len; 01396 int utf_allocated = 0; 01397 char charef_buf[16]; 01398 const unsigned char *p; 01399 01400 if (encoding_equal(ec->last_error.source_encoding, "UTF-32BE")) { 01401 utf = ec->last_error.error_bytes_start; 01402 utf_len = ec->last_error.error_bytes_len; 01403 } 01404 else { 01405 utf = allocate_converted_string(ec->last_error.source_encoding, "UTF-32BE", 01406 ec->last_error.error_bytes_start, ec->last_error.error_bytes_len, 01407 utfbuf, sizeof(utfbuf), 01408 &utf_len); 01409 if (!utf) 01410 return -1; 01411 if (utf != utfbuf && utf != ec->last_error.error_bytes_start) 01412 utf_allocated = 1; 01413 } 01414 01415 if (utf_len % 4 != 0) 01416 goto fail; 01417 01418 p = utf; 01419 while (4 <= utf_len) { 01420 unsigned int u = 0; 01421 u += p[0] << 24; 01422 u += p[1] << 16; 01423 u += p[2] << 8; 01424 u += p[3]; 01425 snprintf(charef_buf, sizeof(charef_buf), "&#x%X;", u); 01426 01427 ret = rb_econv_insert_output(ec, (unsigned char *)charef_buf, strlen(charef_buf), "US-ASCII"); 01428 if (ret == -1) 01429 goto fail; 01430 01431 p += 4; 01432 utf_len -= 4; 01433 } 01434 
01435 if (utf_allocated) 01436 xfree((void *)utf); 01437 return 0; 01438 01439 fail: 01440 if (utf_allocated) 01441 xfree((void *)utf); 01442 return -1; 01443 } 01444 01445 rb_econv_result_t 01446 rb_econv_convert(rb_econv_t *ec, 01447 const unsigned char **input_ptr, const unsigned char *input_stop, 01448 unsigned char **output_ptr, unsigned char *output_stop, 01449 int flags) 01450 { 01451 rb_econv_result_t ret; 01452 01453 unsigned char empty_buf; 01454 unsigned char *empty_ptr = &empty_buf; 01455 01456 ec->started = 1; 01457 01458 if (!input_ptr) { 01459 input_ptr = (const unsigned char **)&empty_ptr; 01460 input_stop = empty_ptr; 01461 } 01462 01463 if (!output_ptr) { 01464 output_ptr = &empty_ptr; 01465 output_stop = empty_ptr; 01466 } 01467 01468 resume: 01469 ret = rb_econv_convert0(ec, input_ptr, input_stop, output_ptr, output_stop, flags); 01470 01471 if (ret == econv_invalid_byte_sequence || 01472 ret == econv_incomplete_input) { 01473 /* deal with invalid byte sequence */ 01474 /* todo: add more alternative behaviors */ 01475 switch (ec->flags & ECONV_INVALID_MASK) { 01476 case ECONV_INVALID_REPLACE: 01477 if (output_replacement_character(ec) == 0) 01478 goto resume; 01479 } 01480 } 01481 01482 if (ret == econv_undefined_conversion) { 01483 /* valid character in source encoding 01484 * but no related character(s) in destination encoding */ 01485 /* todo: add more alternative behaviors */ 01486 switch (ec->flags & ECONV_UNDEF_MASK) { 01487 case ECONV_UNDEF_REPLACE: 01488 if (output_replacement_character(ec) == 0) 01489 goto resume; 01490 break; 01491 01492 case ECONV_UNDEF_HEX_CHARREF: 01493 if (output_hex_charref(ec) == 0) 01494 goto resume; 01495 break; 01496 } 01497 } 01498 01499 return ret; 01500 } 01501 01502 const char * 01503 rb_econv_encoding_to_insert_output(rb_econv_t *ec) 01504 { 01505 rb_transcoding *tc = ec->last_tc; 01506 const rb_transcoder *tr; 01507 01508 if (tc == NULL) 01509 return ""; 01510 01511 tr = tc->transcoder; 01512 01513 if 
(tr->asciicompat_type == asciicompat_encoder) 01514 return tr->src_encoding; 01515 return tr->dst_encoding; 01516 } 01517 01518 static unsigned char * 01519 allocate_converted_string(const char *sname, const char *dname, 01520 const unsigned char *str, size_t len, 01521 unsigned char *caller_dst_buf, size_t caller_dst_bufsize, 01522 size_t *dst_len_ptr) 01523 { 01524 unsigned char *dst_str; 01525 size_t dst_len; 01526 size_t dst_bufsize; 01527 01528 rb_econv_t *ec; 01529 rb_econv_result_t res; 01530 01531 const unsigned char *sp; 01532 unsigned char *dp; 01533 01534 if (caller_dst_buf) 01535 dst_bufsize = caller_dst_bufsize; 01536 else if (len == 0) 01537 dst_bufsize = 1; 01538 else 01539 dst_bufsize = len; 01540 01541 ec = rb_econv_open(sname, dname, 0); 01542 if (ec == NULL) 01543 return NULL; 01544 if (caller_dst_buf) 01545 dst_str = caller_dst_buf; 01546 else 01547 dst_str = xmalloc(dst_bufsize); 01548 dst_len = 0; 01549 sp = str; 01550 dp = dst_str+dst_len; 01551 res = rb_econv_convert(ec, &sp, str+len, &dp, dst_str+dst_bufsize, 0); 01552 dst_len = dp - dst_str; 01553 while (res == econv_destination_buffer_full) { 01554 if (SIZE_MAX/2 < dst_bufsize) { 01555 goto fail; 01556 } 01557 dst_bufsize *= 2; 01558 if (dst_str == caller_dst_buf) { 01559 unsigned char *tmp; 01560 tmp = xmalloc(dst_bufsize); 01561 memcpy(tmp, dst_str, dst_bufsize/2); 01562 dst_str = tmp; 01563 } 01564 else { 01565 dst_str = xrealloc(dst_str, dst_bufsize); 01566 } 01567 dp = dst_str+dst_len; 01568 res = rb_econv_convert(ec, &sp, str+len, &dp, dst_str+dst_bufsize, 0); 01569 dst_len = dp - dst_str; 01570 } 01571 if (res != econv_finished) { 01572 goto fail; 01573 } 01574 rb_econv_close(ec); 01575 *dst_len_ptr = dst_len; 01576 return dst_str; 01577 01578 fail: 01579 if (dst_str != caller_dst_buf) 01580 xfree(dst_str); 01581 rb_econv_close(ec); 01582 return NULL; 01583 } 01584 01585 /* result: 0:success -1:failure */ 01586 int 01587 rb_econv_insert_output(rb_econv_t *ec, 01588 const unsigned 
char *str, size_t len, const char *str_encoding) 01589 { 01590 const char *insert_encoding = rb_econv_encoding_to_insert_output(ec); 01591 unsigned char insert_buf[4096]; 01592 const unsigned char *insert_str = NULL; 01593 size_t insert_len; 01594 01595 int last_trans_index; 01596 rb_transcoding *tc; 01597 01598 unsigned char **buf_start_p; 01599 unsigned char **data_start_p; 01600 unsigned char **data_end_p; 01601 unsigned char **buf_end_p; 01602 01603 size_t need; 01604 01605 ec->started = 1; 01606 01607 if (len == 0) 01608 return 0; 01609 01610 if (encoding_equal(insert_encoding, str_encoding)) { 01611 insert_str = str; 01612 insert_len = len; 01613 } 01614 else { 01615 insert_str = allocate_converted_string(str_encoding, insert_encoding, 01616 str, len, insert_buf, sizeof(insert_buf), &insert_len); 01617 if (insert_str == NULL) 01618 return -1; 01619 } 01620 01621 need = insert_len; 01622 01623 last_trans_index = ec->num_trans-1; 01624 if (ec->num_trans == 0) { 01625 tc = NULL; 01626 buf_start_p = &ec->in_buf_start; 01627 data_start_p = &ec->in_data_start; 01628 data_end_p = &ec->in_data_end; 01629 buf_end_p = &ec->in_buf_end; 01630 } 01631 else if (ec->elems[last_trans_index].tc->transcoder->asciicompat_type == asciicompat_encoder) { 01632 tc = ec->elems[last_trans_index].tc; 01633 need += tc->readagain_len; 01634 if (need < insert_len) 01635 goto fail; 01636 if (last_trans_index == 0) { 01637 buf_start_p = &ec->in_buf_start; 01638 data_start_p = &ec->in_data_start; 01639 data_end_p = &ec->in_data_end; 01640 buf_end_p = &ec->in_buf_end; 01641 } 01642 else { 01643 rb_econv_elem_t *ee = &ec->elems[last_trans_index-1]; 01644 buf_start_p = &ee->out_buf_start; 01645 data_start_p = &ee->out_data_start; 01646 data_end_p = &ee->out_data_end; 01647 buf_end_p = &ee->out_buf_end; 01648 } 01649 } 01650 else { 01651 rb_econv_elem_t *ee = &ec->elems[last_trans_index]; 01652 buf_start_p = &ee->out_buf_start; 01653 data_start_p = &ee->out_data_start; 01654 data_end_p = 
&ee->out_data_end; 01655 buf_end_p = &ee->out_buf_end; 01656 tc = ec->elems[last_trans_index].tc; 01657 } 01658 01659 if (*buf_start_p == NULL) { 01660 unsigned char *buf = xmalloc(need); 01661 *buf_start_p = buf; 01662 *data_start_p = buf; 01663 *data_end_p = buf; 01664 *buf_end_p = buf+need; 01665 } 01666 else if ((size_t)(*buf_end_p - *data_end_p) < need) { 01667 MEMMOVE(*buf_start_p, *data_start_p, unsigned char, *data_end_p - *data_start_p); 01668 *data_end_p = *buf_start_p + (*data_end_p - *data_start_p); 01669 *data_start_p = *buf_start_p; 01670 if ((size_t)(*buf_end_p - *data_end_p) < need) { 01671 unsigned char *buf; 01672 size_t s = (*data_end_p - *buf_start_p) + need; 01673 if (s < need) 01674 goto fail; 01675 buf = xrealloc(*buf_start_p, s); 01676 *data_start_p = buf; 01677 *data_end_p = buf + (*data_end_p - *buf_start_p); 01678 *buf_start_p = buf; 01679 *buf_end_p = buf + s; 01680 } 01681 } 01682 01683 memcpy(*data_end_p, insert_str, insert_len); 01684 *data_end_p += insert_len; 01685 if (tc && tc->transcoder->asciicompat_type == asciicompat_encoder) { 01686 memcpy(*data_end_p, TRANSCODING_READBUF(tc)+tc->recognized_len, tc->readagain_len); 01687 *data_end_p += tc->readagain_len; 01688 tc->readagain_len = 0; 01689 } 01690 01691 if (insert_str != str && insert_str != insert_buf) 01692 xfree((void*)insert_str); 01693 return 0; 01694 01695 fail: 01696 if (insert_str != str && insert_str != insert_buf) 01697 xfree((void*)insert_str); 01698 return -1; 01699 } 01700 01701 void 01702 rb_econv_close(rb_econv_t *ec) 01703 { 01704 int i; 01705 01706 if (ec->replacement_allocated) { 01707 xfree((void *)ec->replacement_str); 01708 } 01709 for (i = 0; i < ec->num_trans; i++) { 01710 rb_transcoding_close(ec->elems[i].tc); 01711 if (ec->elems[i].out_buf_start) 01712 xfree(ec->elems[i].out_buf_start); 01713 } 01714 xfree(ec->in_buf_start); 01715 xfree(ec->elems); 01716 xfree(ec); 01717 } 01718 01719 size_t 01720 rb_econv_memsize(rb_econv_t *ec) 01721 { 01722 size_t 
size = sizeof(rb_econv_t); 01723 int i; 01724 01725 if (ec->replacement_allocated) { 01726 size += ec->replacement_len; 01727 } 01728 for (i = 0; i < ec->num_trans; i++) { 01729 size += rb_transcoding_memsize(ec->elems[i].tc); 01730 01731 if (ec->elems[i].out_buf_start) { 01732 size += ec->elems[i].out_buf_end - ec->elems[i].out_buf_start; 01733 } 01734 } 01735 size += ec->in_buf_end - ec->in_buf_start; 01736 size += sizeof(rb_econv_elem_t) * ec->num_allocated; 01737 01738 return size; 01739 } 01740 01741 int 01742 rb_econv_putbackable(rb_econv_t *ec) 01743 { 01744 if (ec->num_trans == 0) 01745 return 0; 01746 #if SIZEOF_SIZE_T > SIZEOF_INT 01747 if (ec->elems[0].tc->readagain_len > INT_MAX) return INT_MAX; 01748 #endif 01749 return (int)ec->elems[0].tc->readagain_len; 01750 } 01751 01752 void 01753 rb_econv_putback(rb_econv_t *ec, unsigned char *p, int n) 01754 { 01755 rb_transcoding *tc; 01756 if (ec->num_trans == 0 || n == 0) 01757 return; 01758 tc = ec->elems[0].tc; 01759 memcpy(p, TRANSCODING_READBUF(tc) + tc->recognized_len + tc->readagain_len - n, n); 01760 tc->readagain_len -= n; 01761 } 01762 01763 struct asciicompat_encoding_t { 01764 const char *ascii_compat_name; 01765 const char *ascii_incompat_name; 01766 }; 01767 01768 static int 01769 asciicompat_encoding_i(st_data_t key, st_data_t val, st_data_t arg) 01770 { 01771 struct asciicompat_encoding_t *data = (struct asciicompat_encoding_t *)arg; 01772 transcoder_entry_t *entry = (transcoder_entry_t *)val; 01773 const rb_transcoder *tr; 01774 01775 if (DECORATOR_P(entry->sname, entry->dname)) 01776 return ST_CONTINUE; 01777 tr = load_transcoder_entry(entry); 01778 if (tr && tr->asciicompat_type == asciicompat_decoder) { 01779 data->ascii_compat_name = tr->dst_encoding; 01780 return ST_STOP; 01781 } 01782 return ST_CONTINUE; 01783 } 01784 01785 const char * 01786 rb_econv_asciicompat_encoding(const char *ascii_incompat_name) 01787 { 01788 st_data_t v; 01789 st_table *table2; 01790 struct 
asciicompat_encoding_t data; 01791 01792 if (!st_lookup(transcoder_table, (st_data_t)ascii_incompat_name, &v)) 01793 return NULL; 01794 table2 = (st_table *)v; 01795 01796 /* 01797 * Assumption: 01798 * There is at most one transcoder for 01799 * converting from ASCII incompatible encoding. 01800 * 01801 * For ISO-2022-JP, there is ISO-2022-JP -> stateless-ISO-2022-JP and no others. 01802 */ 01803 if (table2->num_entries != 1) 01804 return NULL; 01805 01806 data.ascii_incompat_name = ascii_incompat_name; 01807 data.ascii_compat_name = NULL; 01808 st_foreach(table2, asciicompat_encoding_i, (st_data_t)&data); 01809 return data.ascii_compat_name; 01810 } 01811 01812 VALUE 01813 rb_econv_substr_append(rb_econv_t *ec, VALUE src, long off, long len, VALUE dst, int flags) 01814 { 01815 unsigned const char *ss, *sp, *se; 01816 unsigned char *ds, *dp, *de; 01817 rb_econv_result_t res; 01818 int max_output; 01819 01820 if (NIL_P(dst)) { 01821 dst = rb_str_buf_new(len); 01822 if (ec->destination_encoding) 01823 rb_enc_associate(dst, ec->destination_encoding); 01824 } 01825 01826 if (ec->last_tc) 01827 max_output = ec->last_tc->transcoder->max_output; 01828 else 01829 max_output = 1; 01830 01831 res = econv_destination_buffer_full; 01832 while (res == econv_destination_buffer_full) { 01833 long dlen = RSTRING_LEN(dst); 01834 if (rb_str_capacity(dst) - dlen < (size_t)len + max_output) { 01835 unsigned long new_capa = (unsigned long)dlen + len + max_output; 01836 if (LONG_MAX < new_capa) 01837 rb_raise(rb_eArgError, "too long string"); 01838 rb_str_resize(dst, new_capa); 01839 rb_str_set_len(dst, dlen); 01840 } 01841 ss = sp = (const unsigned char *)RSTRING_PTR(src) + off; 01842 se = ss + len; 01843 ds = (unsigned char *)RSTRING_PTR(dst); 01844 de = ds + rb_str_capacity(dst); 01845 dp = ds += dlen; 01846 res = rb_econv_convert(ec, &sp, se, &dp, de, flags); 01847 off += sp - ss; 01848 len -= sp - ss; 01849 rb_str_set_len(dst, dlen + (dp - ds)); 01850 rb_econv_check_error(ec); 
01851 } 01852 01853 return dst; 01854 } 01855 01856 VALUE 01857 rb_econv_str_append(rb_econv_t *ec, VALUE src, VALUE dst, int flags) 01858 { 01859 return rb_econv_substr_append(ec, src, 0, RSTRING_LEN(src), dst, flags); 01860 } 01861 01862 VALUE 01863 rb_econv_substr_convert(rb_econv_t *ec, VALUE src, long byteoff, long bytesize, int flags) 01864 { 01865 return rb_econv_substr_append(ec, src, byteoff, bytesize, Qnil, flags); 01866 } 01867 01868 VALUE 01869 rb_econv_str_convert(rb_econv_t *ec, VALUE src, int flags) 01870 { 01871 return rb_econv_substr_append(ec, src, 0, RSTRING_LEN(src), Qnil, flags); 01872 } 01873 01874 static int 01875 rb_econv_add_converter(rb_econv_t *ec, const char *sname, const char *dname, int n) 01876 { 01877 transcoder_entry_t *entry; 01878 const rb_transcoder *tr; 01879 01880 if (ec->started != 0) 01881 return -1; 01882 01883 entry = get_transcoder_entry(sname, dname); 01884 if (!entry) 01885 return -1; 01886 01887 tr = load_transcoder_entry(entry); 01888 if (!tr) return -1; 01889 01890 return rb_econv_add_transcoder_at(ec, tr, n); 01891 } 01892 01893 static int 01894 rb_econv_decorate_at(rb_econv_t *ec, const char *decorator_name, int n) 01895 { 01896 return rb_econv_add_converter(ec, "", decorator_name, n); 01897 } 01898 01899 int 01900 rb_econv_decorate_at_first(rb_econv_t *ec, const char *decorator_name) 01901 { 01902 const rb_transcoder *tr; 01903 01904 if (ec->num_trans == 0) 01905 return rb_econv_decorate_at(ec, decorator_name, 0); 01906 01907 tr = ec->elems[0].tc->transcoder; 01908 01909 if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding) && 01910 tr->asciicompat_type == asciicompat_decoder) 01911 return rb_econv_decorate_at(ec, decorator_name, 1); 01912 01913 return rb_econv_decorate_at(ec, decorator_name, 0); 01914 } 01915 01916 int 01917 rb_econv_decorate_at_last(rb_econv_t *ec, const char *decorator_name) 01918 { 01919 const rb_transcoder *tr; 01920 01921 if (ec->num_trans == 0) 01922 return rb_econv_decorate_at(ec, 
decorator_name, 0); 01923 01924 tr = ec->elems[ec->num_trans-1].tc->transcoder; 01925 01926 if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding) && 01927 tr->asciicompat_type == asciicompat_encoder) 01928 return rb_econv_decorate_at(ec, decorator_name, ec->num_trans-1); 01929 01930 return rb_econv_decorate_at(ec, decorator_name, ec->num_trans); 01931 } 01932 01933 void 01934 rb_econv_binmode(rb_econv_t *ec) 01935 { 01936 const char *dname = 0; 01937 01938 switch (ec->flags & ECONV_NEWLINE_DECORATOR_MASK) { 01939 case ECONV_UNIVERSAL_NEWLINE_DECORATOR: 01940 dname = "universal_newline"; 01941 break; 01942 case ECONV_CRLF_NEWLINE_DECORATOR: 01943 dname = "crlf_newline"; 01944 break; 01945 case ECONV_CR_NEWLINE_DECORATOR: 01946 dname = "cr_newline"; 01947 break; 01948 } 01949 01950 if (dname) { 01951 const rb_transcoder *transcoder = get_transcoder_entry("", dname)->transcoder; 01952 int num_trans = ec->num_trans; 01953 int i, j = 0; 01954 01955 for (i=0; i < num_trans; i++) { 01956 if (transcoder == ec->elems[i].tc->transcoder) { 01957 rb_transcoding_close(ec->elems[i].tc); 01958 xfree(ec->elems[i].out_buf_start); 01959 ec->num_trans--; 01960 } 01961 else 01962 ec->elems[j++] = ec->elems[i]; 01963 } 01964 } 01965 01966 ec->flags &= ~ECONV_NEWLINE_DECORATOR_MASK; 01967 } 01968 01969 static VALUE 01970 econv_description(const char *sname, const char *dname, int ecflags, VALUE mesg) 01971 { 01972 int has_description = 0; 01973 01974 if (NIL_P(mesg)) 01975 mesg = rb_str_new(NULL, 0); 01976 01977 if (*sname != '\0' || *dname != '\0') { 01978 if (*sname == '\0') 01979 rb_str_cat2(mesg, dname); 01980 else if (*dname == '\0') 01981 rb_str_cat2(mesg, sname); 01982 else 01983 rb_str_catf(mesg, "%s to %s", sname, dname); 01984 has_description = 1; 01985 } 01986 01987 if (ecflags & (ECONV_NEWLINE_DECORATOR_MASK| 01988 ECONV_XML_TEXT_DECORATOR| 01989 ECONV_XML_ATTR_CONTENT_DECORATOR| 01990 ECONV_XML_ATTR_QUOTE_DECORATOR)) { 01991 const char *pre = ""; 01992 if (has_description) 
01993 rb_str_cat2(mesg, " with "); 01994 if (ecflags & ECONV_UNIVERSAL_NEWLINE_DECORATOR) { 01995 rb_str_cat2(mesg, pre); pre = ","; 01996 rb_str_cat2(mesg, "universal_newline"); 01997 } 01998 if (ecflags & ECONV_CRLF_NEWLINE_DECORATOR) { 01999 rb_str_cat2(mesg, pre); pre = ","; 02000 rb_str_cat2(mesg, "crlf_newline"); 02001 } 02002 if (ecflags & ECONV_CR_NEWLINE_DECORATOR) { 02003 rb_str_cat2(mesg, pre); pre = ","; 02004 rb_str_cat2(mesg, "cr_newline"); 02005 } 02006 if (ecflags & ECONV_XML_TEXT_DECORATOR) { 02007 rb_str_cat2(mesg, pre); pre = ","; 02008 rb_str_cat2(mesg, "xml_text"); 02009 } 02010 if (ecflags & ECONV_XML_ATTR_CONTENT_DECORATOR) { 02011 rb_str_cat2(mesg, pre); pre = ","; 02012 rb_str_cat2(mesg, "xml_attr_content"); 02013 } 02014 if (ecflags & ECONV_XML_ATTR_QUOTE_DECORATOR) { 02015 rb_str_cat2(mesg, pre); pre = ","; 02016 rb_str_cat2(mesg, "xml_attr_quote"); 02017 } 02018 has_description = 1; 02019 } 02020 if (!has_description) { 02021 rb_str_cat2(mesg, "no-conversion"); 02022 } 02023 02024 return mesg; 02025 } 02026 02027 VALUE 02028 rb_econv_open_exc(const char *sname, const char *dname, int ecflags) 02029 { 02030 VALUE mesg, exc; 02031 mesg = rb_str_new_cstr("code converter not found ("); 02032 econv_description(sname, dname, ecflags, mesg); 02033 rb_str_cat2(mesg, ")"); 02034 exc = rb_exc_new3(rb_eConverterNotFoundError, mesg); 02035 return exc; 02036 } 02037 02038 static VALUE 02039 make_econv_exception(rb_econv_t *ec) 02040 { 02041 VALUE mesg, exc; 02042 if (ec->last_error.result == econv_invalid_byte_sequence || 02043 ec->last_error.result == econv_incomplete_input) { 02044 const char *err = (const char *)ec->last_error.error_bytes_start; 02045 size_t error_len = ec->last_error.error_bytes_len; 02046 VALUE bytes = rb_str_new(err, error_len); 02047 VALUE dumped = rb_str_dump(bytes); 02048 size_t readagain_len = ec->last_error.readagain_len; 02049 VALUE bytes2 = Qnil; 02050 VALUE dumped2; 02051 int idx; 02052 if (ec->last_error.result == 
econv_incomplete_input) { 02053 mesg = rb_sprintf("incomplete %s on %s", 02054 StringValueCStr(dumped), 02055 ec->last_error.source_encoding); 02056 } 02057 else if (readagain_len) { 02058 bytes2 = rb_str_new(err+error_len, readagain_len); 02059 dumped2 = rb_str_dump(bytes2); 02060 mesg = rb_sprintf("%s followed by %s on %s", 02061 StringValueCStr(dumped), 02062 StringValueCStr(dumped2), 02063 ec->last_error.source_encoding); 02064 } 02065 else { 02066 mesg = rb_sprintf("%s on %s", 02067 StringValueCStr(dumped), 02068 ec->last_error.source_encoding); 02069 } 02070 02071 exc = rb_exc_new3(rb_eInvalidByteSequenceError, mesg); 02072 rb_ivar_set(exc, rb_intern("error_bytes"), bytes); 02073 rb_ivar_set(exc, rb_intern("readagain_bytes"), bytes2); 02074 rb_ivar_set(exc, rb_intern("incomplete_input"), ec->last_error.result == econv_incomplete_input ? Qtrue : Qfalse); 02075 02076 set_encs: 02077 rb_ivar_set(exc, rb_intern("source_encoding_name"), rb_str_new2(ec->last_error.source_encoding)); 02078 rb_ivar_set(exc, rb_intern("destination_encoding_name"), rb_str_new2(ec->last_error.destination_encoding)); 02079 idx = rb_enc_find_index(ec->last_error.source_encoding); 02080 if (0 <= idx) 02081 rb_ivar_set(exc, rb_intern("source_encoding"), rb_enc_from_encoding(rb_enc_from_index(idx))); 02082 idx = rb_enc_find_index(ec->last_error.destination_encoding); 02083 if (0 <= idx) 02084 rb_ivar_set(exc, rb_intern("destination_encoding"), rb_enc_from_encoding(rb_enc_from_index(idx))); 02085 return exc; 02086 } 02087 if (ec->last_error.result == econv_undefined_conversion) { 02088 VALUE bytes = rb_str_new((const char *)ec->last_error.error_bytes_start, 02089 ec->last_error.error_bytes_len); 02090 VALUE dumped = Qnil; 02091 int idx; 02092 if (strcmp(ec->last_error.source_encoding, "UTF-8") == 0) { 02093 rb_encoding *utf8 = rb_utf8_encoding(); 02094 const char *start, *end; 02095 int n; 02096 start = (const char *)ec->last_error.error_bytes_start; 02097 end = start + 
ec->last_error.error_bytes_len; 02098 n = rb_enc_precise_mbclen(start, end, utf8); 02099 if (MBCLEN_CHARFOUND_P(n) && 02100 (size_t)MBCLEN_CHARFOUND_LEN(n) == ec->last_error.error_bytes_len) { 02101 unsigned int cc = rb_enc_mbc_to_codepoint(start, end, utf8); 02102 dumped = rb_sprintf("U+%04X", cc); 02103 } 02104 } 02105 if (dumped == Qnil) 02106 dumped = rb_str_dump(bytes); 02107 if (strcmp(ec->last_error.source_encoding, 02108 ec->source_encoding_name) == 0 && 02109 strcmp(ec->last_error.destination_encoding, 02110 ec->destination_encoding_name) == 0) { 02111 mesg = rb_sprintf("%s from %s to %s", 02112 StringValueCStr(dumped), 02113 ec->last_error.source_encoding, 02114 ec->last_error.destination_encoding); 02115 } 02116 else { 02117 int i; 02118 mesg = rb_sprintf("%s to %s in conversion from %s", 02119 StringValueCStr(dumped), 02120 ec->last_error.destination_encoding, 02121 ec->source_encoding_name); 02122 for (i = 0; i < ec->num_trans; i++) { 02123 const rb_transcoder *tr = ec->elems[i].tc->transcoder; 02124 if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding)) 02125 rb_str_catf(mesg, " to %s", 02126 ec->elems[i].tc->transcoder->dst_encoding); 02127 } 02128 } 02129 exc = rb_exc_new3(rb_eUndefinedConversionError, mesg); 02130 idx = rb_enc_find_index(ec->last_error.source_encoding); 02131 if (0 <= idx) 02132 rb_enc_associate_index(bytes, idx); 02133 rb_ivar_set(exc, rb_intern("error_char"), bytes); 02134 goto set_encs; 02135 } 02136 return Qnil; 02137 } 02138 02139 static void 02140 more_output_buffer( 02141 VALUE destination, 02142 unsigned char *(*resize_destination)(VALUE, size_t, size_t), 02143 int max_output, 02144 unsigned char **out_start_ptr, 02145 unsigned char **out_pos, 02146 unsigned char **out_stop_ptr) 02147 { 02148 size_t len = (*out_pos - *out_start_ptr); 02149 size_t new_len = (len + max_output) * 2; 02150 *out_start_ptr = resize_destination(destination, len, new_len); 02151 *out_pos = *out_start_ptr + len; 02152 *out_stop_ptr = *out_start_ptr + 
new_len; 02153 } 02154 02155 static int 02156 make_replacement(rb_econv_t *ec) 02157 { 02158 rb_transcoding *tc; 02159 const rb_transcoder *tr; 02160 const unsigned char *replacement; 02161 const char *repl_enc; 02162 const char *ins_enc; 02163 size_t len; 02164 02165 if (ec->replacement_str) 02166 return 0; 02167 02168 ins_enc = rb_econv_encoding_to_insert_output(ec); 02169 02170 tc = ec->last_tc; 02171 if (*ins_enc) { 02172 tr = tc->transcoder; 02173 rb_enc_find(tr->dst_encoding); 02174 replacement = (const unsigned char *)get_replacement_character(ins_enc, &len, &repl_enc); 02175 } 02176 else { 02177 replacement = (unsigned char *)"?"; 02178 len = 1; 02179 repl_enc = ""; 02180 } 02181 02182 ec->replacement_str = replacement; 02183 ec->replacement_len = len; 02184 ec->replacement_enc = repl_enc; 02185 ec->replacement_allocated = 0; 02186 return 0; 02187 } 02188 02189 int 02190 rb_econv_set_replacement(rb_econv_t *ec, 02191 const unsigned char *str, size_t len, const char *encname) 02192 { 02193 unsigned char *str2; 02194 size_t len2; 02195 const char *encname2; 02196 02197 encname2 = rb_econv_encoding_to_insert_output(ec); 02198 02199 if (encoding_equal(encname, encname2)) { 02200 str2 = xmalloc(len); 02201 MEMCPY(str2, str, unsigned char, len); /* xxx: str may be invalid */ 02202 len2 = len; 02203 encname2 = encname; 02204 } 02205 else { 02206 str2 = allocate_converted_string(encname, encname2, str, len, NULL, 0, &len2); 02207 if (!str2) 02208 return -1; 02209 } 02210 02211 if (ec->replacement_allocated) { 02212 xfree((void *)ec->replacement_str); 02213 } 02214 ec->replacement_allocated = 1; 02215 ec->replacement_str = str2; 02216 ec->replacement_len = len2; 02217 ec->replacement_enc = encname2; 02218 return 0; 02219 } 02220 02221 static int 02222 output_replacement_character(rb_econv_t *ec) 02223 { 02224 int ret; 02225 02226 if (make_replacement(ec) == -1) 02227 return -1; 02228 02229 ret = rb_econv_insert_output(ec, ec->replacement_str, ec->replacement_len, 
ec->replacement_enc); 02230 if (ret == -1) 02231 return -1; 02232 02233 return 0; 02234 } 02235 02236 #if 1 02237 #define hash_fallback rb_hash_aref 02238 02239 static VALUE 02240 proc_fallback(VALUE fallback, VALUE c) 02241 { 02242 return rb_proc_call(fallback, rb_ary_new4(1, &c)); 02243 } 02244 02245 static VALUE 02246 method_fallback(VALUE fallback, VALUE c) 02247 { 02248 return rb_method_call(1, &c, fallback); 02249 } 02250 02251 static VALUE 02252 aref_fallback(VALUE fallback, VALUE c) 02253 { 02254 return rb_funcall3(fallback, sym_aref, 1, &c); 02255 } 02256 02257 static void 02258 transcode_loop(const unsigned char **in_pos, unsigned char **out_pos, 02259 const unsigned char *in_stop, unsigned char *out_stop, 02260 VALUE destination, 02261 unsigned char *(*resize_destination)(VALUE, size_t, size_t), 02262 const char *src_encoding, 02263 const char *dst_encoding, 02264 int ecflags, 02265 VALUE ecopts) 02266 { 02267 rb_econv_t *ec; 02268 rb_transcoding *last_tc; 02269 rb_econv_result_t ret; 02270 unsigned char *out_start = *out_pos; 02271 int max_output; 02272 VALUE exc; 02273 VALUE fallback = Qnil; 02274 VALUE (*fallback_func)(VALUE, VALUE) = 0; 02275 02276 ec = rb_econv_open_opts(src_encoding, dst_encoding, ecflags, ecopts); 02277 if (!ec) 02278 rb_exc_raise(rb_econv_open_exc(src_encoding, dst_encoding, ecflags)); 02279 02280 if (!NIL_P(ecopts) && RB_TYPE_P(ecopts, T_HASH)) { 02281 fallback = rb_hash_aref(ecopts, sym_fallback); 02282 if (RB_TYPE_P(fallback, T_HASH)) { 02283 fallback_func = hash_fallback; 02284 } 02285 else if (rb_obj_is_proc(fallback)) { 02286 fallback_func = proc_fallback; 02287 } 02288 else if (rb_obj_is_method(fallback)) { 02289 fallback_func = method_fallback; 02290 } 02291 else { 02292 fallback_func = aref_fallback; 02293 } 02294 } 02295 last_tc = ec->last_tc; 02296 max_output = last_tc ? 
last_tc->transcoder->max_output : 1; 02297 02298 resume: 02299 ret = rb_econv_convert(ec, in_pos, in_stop, out_pos, out_stop, 0); 02300 02301 if (!NIL_P(fallback) && ret == econv_undefined_conversion) { 02302 VALUE rep = rb_enc_str_new( 02303 (const char *)ec->last_error.error_bytes_start, 02304 ec->last_error.error_bytes_len, 02305 rb_enc_find(ec->last_error.source_encoding)); 02306 rep = (*fallback_func)(fallback, rep); 02307 if (rep != Qundef && !NIL_P(rep)) { 02308 StringValue(rep); 02309 ret = rb_econv_insert_output(ec, (const unsigned char *)RSTRING_PTR(rep), 02310 RSTRING_LEN(rep), rb_enc_name(rb_enc_get(rep))); 02311 if ((int)ret == -1) { 02312 rb_raise(rb_eArgError, "too big fallback string"); 02313 } 02314 goto resume; 02315 } 02316 } 02317 02318 if (ret == econv_invalid_byte_sequence || 02319 ret == econv_incomplete_input || 02320 ret == econv_undefined_conversion) { 02321 exc = make_econv_exception(ec); 02322 rb_econv_close(ec); 02323 rb_exc_raise(exc); 02324 } 02325 02326 if (ret == econv_destination_buffer_full) { 02327 more_output_buffer(destination, resize_destination, max_output, &out_start, out_pos, &out_stop); 02328 goto resume; 02329 } 02330 02331 rb_econv_close(ec); 02332 return; 02333 } 02334 #else 02335 /* sample transcode_loop implementation in byte-by-byte stream style */ 02336 static void 02337 transcode_loop(const unsigned char **in_pos, unsigned char **out_pos, 02338 const unsigned char *in_stop, unsigned char *out_stop, 02339 VALUE destination, 02340 unsigned char *(*resize_destination)(VALUE, size_t, size_t), 02341 const char *src_encoding, 02342 const char *dst_encoding, 02343 int ecflags, 02344 VALUE ecopts) 02345 { 02346 rb_econv_t *ec; 02347 rb_transcoding *last_tc; 02348 rb_econv_result_t ret; 02349 unsigned char *out_start = *out_pos; 02350 const unsigned char *ptr; 02351 int max_output; 02352 VALUE exc; 02353 02354 ec = rb_econv_open_opts(src_encoding, dst_encoding, ecflags, ecopts); 02355 if (!ec) 02356 
rb_exc_raise(rb_econv_open_exc(src_encoding, dst_encoding, ecflags)); 02357 02358 last_tc = ec->last_tc; 02359 max_output = last_tc ? last_tc->transcoder->max_output : 1; 02360 02361 ret = econv_source_buffer_empty; 02362 ptr = *in_pos; 02363 while (ret != econv_finished) { 02364 unsigned char input_byte; 02365 const unsigned char *p = &input_byte; 02366 02367 if (ret == econv_source_buffer_empty) { 02368 if (ptr < in_stop) { 02369 input_byte = *ptr; 02370 ret = rb_econv_convert(ec, &p, p+1, out_pos, out_stop, ECONV_PARTIAL_INPUT); 02371 } 02372 else { 02373 ret = rb_econv_convert(ec, NULL, NULL, out_pos, out_stop, 0); 02374 } 02375 } 02376 else { 02377 ret = rb_econv_convert(ec, NULL, NULL, out_pos, out_stop, ECONV_PARTIAL_INPUT); 02378 } 02379 if (&input_byte != p) 02380 ptr += p - &input_byte; 02381 switch (ret) { 02382 case econv_invalid_byte_sequence: 02383 case econv_incomplete_input: 02384 case econv_undefined_conversion: 02385 exc = make_econv_exception(ec); 02386 rb_econv_close(ec); 02387 rb_exc_raise(exc); 02388 break; 02389 02390 case econv_destination_buffer_full: 02391 more_output_buffer(destination, resize_destination, max_output, &out_start, out_pos, &out_stop); 02392 break; 02393 02394 case econv_source_buffer_empty: 02395 break; 02396 02397 case econv_finished: 02398 break; 02399 } 02400 } 02401 rb_econv_close(ec); 02402 *in_pos = in_stop; 02403 return; 02404 } 02405 #endif 02406 02407 02408 /* 02409 * String-specific code 02410 */ 02411 02412 static unsigned char * 02413 str_transcoding_resize(VALUE destination, size_t len, size_t new_len) 02414 { 02415 rb_str_resize(destination, new_len); 02416 return (unsigned char *)RSTRING_PTR(destination); 02417 } 02418 02419 static int 02420 econv_opts(VALUE opt, int ecflags) 02421 { 02422 VALUE v; 02423 02424 v = rb_hash_aref(opt, sym_invalid); 02425 if (NIL_P(v)) { 02426 } 02427 else if (v==sym_replace) { 02428 ecflags |= ECONV_INVALID_REPLACE; 02429 } 02430 else { 02431 rb_raise(rb_eArgError, "unknown 
value for invalid character option"); 02432 } 02433 02434 v = rb_hash_aref(opt, sym_undef); 02435 if (NIL_P(v)) { 02436 } 02437 else if (v==sym_replace) { 02438 ecflags |= ECONV_UNDEF_REPLACE; 02439 } 02440 else { 02441 rb_raise(rb_eArgError, "unknown value for undefined character option"); 02442 } 02443 02444 v = rb_hash_aref(opt, sym_replace); 02445 if (!NIL_P(v) && !(ecflags & ECONV_INVALID_REPLACE)) { 02446 ecflags |= ECONV_UNDEF_REPLACE; 02447 } 02448 02449 v = rb_hash_aref(opt, sym_xml); 02450 if (!NIL_P(v)) { 02451 if (v==sym_text) { 02452 ecflags |= ECONV_XML_TEXT_DECORATOR|ECONV_UNDEF_HEX_CHARREF; 02453 } 02454 else if (v==sym_attr) { 02455 ecflags |= ECONV_XML_ATTR_CONTENT_DECORATOR|ECONV_XML_ATTR_QUOTE_DECORATOR|ECONV_UNDEF_HEX_CHARREF; 02456 } 02457 else if (RB_TYPE_P(v, T_SYMBOL)) { 02458 rb_raise(rb_eArgError, "unexpected value for xml option: %s", rb_id2name(SYM2ID(v))); 02459 } 02460 else { 02461 rb_raise(rb_eArgError, "unexpected value for xml option"); 02462 } 02463 } 02464 02465 #ifdef ENABLE_ECONV_NEWLINE_OPTION 02466 v = rb_hash_aref(opt, sym_newline); 02467 if (!NIL_P(v)) { 02468 ecflags &= ~ECONV_NEWLINE_DECORATOR_MASK; 02469 if (v == sym_universal) { 02470 ecflags |= ECONV_UNIVERSAL_NEWLINE_DECORATOR; 02471 } 02472 else if (v == sym_crlf) { 02473 ecflags |= ECONV_CRLF_NEWLINE_DECORATOR; 02474 } 02475 else if (v == sym_cr) { 02476 ecflags |= ECONV_CR_NEWLINE_DECORATOR; 02477 } 02478 else if (v == sym_lf) { 02479 /* ecflags |= ECONV_LF_NEWLINE_DECORATOR; */ 02480 } 02481 else if (SYMBOL_P(v)) { 02482 rb_raise(rb_eArgError, "unexpected value for newline option: %s", 02483 rb_id2name(SYM2ID(v))); 02484 } 02485 else { 02486 rb_raise(rb_eArgError, "unexpected value for newline option"); 02487 } 02488 } 02489 else 02490 #endif 02491 { 02492 int setflags = 0, newlineflag = 0; 02493 02494 v = rb_hash_aref(opt, sym_universal_newline); 02495 if (RTEST(v)) 02496 setflags |= ECONV_UNIVERSAL_NEWLINE_DECORATOR; 02497 newlineflag |= !NIL_P(v); 02498 02499 v 
= rb_hash_aref(opt, sym_crlf_newline); 02500 if (RTEST(v)) 02501 setflags |= ECONV_CRLF_NEWLINE_DECORATOR; 02502 newlineflag |= !NIL_P(v); 02503 02504 v = rb_hash_aref(opt, sym_cr_newline); 02505 if (RTEST(v)) 02506 setflags |= ECONV_CR_NEWLINE_DECORATOR; 02507 newlineflag |= !NIL_P(v); 02508 02509 if (newlineflag) { 02510 ecflags &= ~ECONV_NEWLINE_DECORATOR_MASK; 02511 ecflags |= setflags; 02512 } 02513 } 02514 02515 return ecflags; 02516 } 02517 02518 int 02519 rb_econv_prepare_options(VALUE opthash, VALUE *opts, int ecflags) 02520 { 02521 VALUE newhash = Qnil; 02522 VALUE v; 02523 02524 if (NIL_P(opthash)) { 02525 *opts = Qnil; 02526 return ecflags; 02527 } 02528 ecflags = econv_opts(opthash, ecflags); 02529 02530 v = rb_hash_aref(opthash, sym_replace); 02531 if (!NIL_P(v)) { 02532 StringValue(v); 02533 if (rb_enc_str_coderange(v) == ENC_CODERANGE_BROKEN) { 02534 VALUE dumped = rb_str_dump(v); 02535 rb_raise(rb_eArgError, "replacement string is broken: %s as %s", 02536 StringValueCStr(dumped), 02537 rb_enc_name(rb_enc_get(v))); 02538 } 02539 v = rb_str_new_frozen(v); 02540 newhash = rb_hash_new(); 02541 rb_hash_aset(newhash, sym_replace, v); 02542 } 02543 02544 v = rb_hash_aref(opthash, sym_fallback); 02545 if (!NIL_P(v)) { 02546 VALUE h = rb_check_hash_type(v); 02547 if (NIL_P(h) 02548 ? 
(rb_obj_is_proc(v) || rb_obj_is_method(v) || rb_respond_to(v, sym_aref)) 02549 : (v = h, 1)) { 02550 if (NIL_P(newhash)) 02551 newhash = rb_hash_new(); 02552 rb_hash_aset(newhash, sym_fallback, v); 02553 } 02554 } 02555 02556 if (!NIL_P(newhash)) 02557 rb_hash_freeze(newhash); 02558 *opts = newhash; 02559 02560 return ecflags; 02561 } 02562 02563 int 02564 rb_econv_prepare_opts(VALUE opthash, VALUE *opts) 02565 { 02566 return rb_econv_prepare_options(opthash, opts, 0); 02567 } 02568 02569 rb_econv_t * 02570 rb_econv_open_opts(const char *source_encoding, const char *destination_encoding, int ecflags, VALUE opthash) 02571 { 02572 rb_econv_t *ec; 02573 VALUE replacement; 02574 02575 if (NIL_P(opthash)) { 02576 replacement = Qnil; 02577 } 02578 else { 02579 if (!RB_TYPE_P(opthash, T_HASH) || !OBJ_FROZEN(opthash)) 02580 rb_bug("rb_econv_open_opts called with invalid opthash"); 02581 replacement = rb_hash_aref(opthash, sym_replace); 02582 } 02583 02584 ec = rb_econv_open(source_encoding, destination_encoding, ecflags); 02585 if (!ec) 02586 return ec; 02587 02588 if (!NIL_P(replacement)) { 02589 int ret; 02590 rb_encoding *enc = rb_enc_get(replacement); 02591 02592 ret = rb_econv_set_replacement(ec, 02593 (const unsigned char *)RSTRING_PTR(replacement), 02594 RSTRING_LEN(replacement), 02595 rb_enc_name(enc)); 02596 if (ret == -1) { 02597 rb_econv_close(ec); 02598 return NULL; 02599 } 02600 } 02601 return ec; 02602 } 02603 02604 static int 02605 enc_arg(volatile VALUE *arg, const char **name_p, rb_encoding **enc_p) 02606 { 02607 rb_encoding *enc; 02608 const char *n; 02609 int encidx; 02610 VALUE encval; 02611 02612 if (((encidx = rb_to_encoding_index(encval = *arg)) < 0) || 02613 !(enc = rb_enc_from_index(encidx))) { 02614 enc = NULL; 02615 encidx = 0; 02616 n = StringValueCStr(*arg); 02617 } 02618 else { 02619 n = rb_enc_name(enc); 02620 } 02621 02622 *name_p = n; 02623 *enc_p = enc; 02624 02625 return encidx; 02626 } 02627 02628 static int 02629 
str_transcode_enc_args(VALUE str, volatile VALUE *arg1, volatile VALUE *arg2, 02630 const char **sname_p, rb_encoding **senc_p, 02631 const char **dname_p, rb_encoding **denc_p) 02632 { 02633 rb_encoding *senc, *denc; 02634 const char *sname, *dname; 02635 int sencidx, dencidx; 02636 02637 dencidx = enc_arg(arg1, &dname, &denc); 02638 02639 if (NIL_P(*arg2)) { 02640 sencidx = rb_enc_get_index(str); 02641 senc = rb_enc_from_index(sencidx); 02642 sname = rb_enc_name(senc); 02643 } 02644 else { 02645 sencidx = enc_arg(arg2, &sname, &senc); 02646 } 02647 02648 *sname_p = sname; 02649 *senc_p = senc; 02650 *dname_p = dname; 02651 *denc_p = denc; 02652 return dencidx; 02653 } 02654 02655 static int 02656 str_transcode0(int argc, VALUE *argv, VALUE *self, int ecflags, VALUE ecopts) 02657 { 02658 VALUE dest; 02659 VALUE str = *self; 02660 volatile VALUE arg1, arg2; 02661 long blen, slen; 02662 unsigned char *buf, *bp, *sp; 02663 const unsigned char *fromp; 02664 rb_encoding *senc, *denc; 02665 const char *sname, *dname; 02666 int dencidx; 02667 02668 rb_check_arity(argc, 0, 2); 02669 02670 if (argc == 0) { 02671 arg1 = rb_enc_default_internal(); 02672 if (NIL_P(arg1)) { 02673 if (!ecflags) return -1; 02674 arg1 = rb_obj_encoding(str); 02675 } 02676 ecflags |= ECONV_INVALID_REPLACE | ECONV_UNDEF_REPLACE; 02677 } 02678 else { 02679 arg1 = argv[0]; 02680 } 02681 arg2 = argc<=1 ? Qnil : argv[1]; 02682 dencidx = str_transcode_enc_args(str, &arg1, &arg2, &sname, &senc, &dname, &denc); 02683 02684 if ((ecflags & (ECONV_NEWLINE_DECORATOR_MASK| 02685 ECONV_XML_TEXT_DECORATOR| 02686 ECONV_XML_ATTR_CONTENT_DECORATOR| 02687 ECONV_XML_ATTR_QUOTE_DECORATOR)) == 0) { 02688 if (senc && senc == denc) { 02689 return NIL_P(arg2) ? 
-1 : dencidx; 02690 } 02691 if (senc && denc && rb_enc_asciicompat(senc) && rb_enc_asciicompat(denc)) { 02692 if (rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT) { 02693 return dencidx; 02694 } 02695 } 02696 if (encoding_equal(sname, dname)) { 02697 return NIL_P(arg2) ? -1 : dencidx; 02698 } 02699 } 02700 else { 02701 if (encoding_equal(sname, dname)) { 02702 sname = ""; 02703 dname = ""; 02704 } 02705 } 02706 02707 fromp = sp = (unsigned char *)RSTRING_PTR(str); 02708 slen = RSTRING_LEN(str); 02709 blen = slen + 30; /* len + margin */ 02710 dest = rb_str_tmp_new(blen); 02711 bp = (unsigned char *)RSTRING_PTR(dest); 02712 02713 transcode_loop(&fromp, &bp, (sp+slen), (bp+blen), dest, str_transcoding_resize, sname, dname, ecflags, ecopts); 02714 if (fromp != sp+slen) { 02715 rb_raise(rb_eArgError, "not fully converted, %"PRIdPTRDIFF" bytes left", sp+slen-fromp); 02716 } 02717 buf = (unsigned char *)RSTRING_PTR(dest); 02718 *bp = '\0'; 02719 rb_str_set_len(dest, bp - buf); 02720 02721 /* set encoding */ 02722 if (!denc) { 02723 dencidx = rb_define_dummy_encoding(dname); 02724 } 02725 *self = dest; 02726 02727 return dencidx; 02728 } 02729 02730 static int 02731 str_transcode(int argc, VALUE *argv, VALUE *self) 02732 { 02733 VALUE opt; 02734 int ecflags = 0; 02735 VALUE ecopts = Qnil; 02736 02737 argc = rb_scan_args(argc, argv, "02:", NULL, NULL, &opt); 02738 if (!NIL_P(opt)) { 02739 ecflags = rb_econv_prepare_opts(opt, &ecopts); 02740 } 02741 return str_transcode0(argc, argv, self, ecflags, ecopts); 02742 } 02743 02744 static inline VALUE 02745 str_encode_associate(VALUE str, int encidx) 02746 { 02747 int cr = 0; 02748 02749 rb_enc_associate_index(str, encidx); 02750 02751 /* transcoded string never be broken. 
*/ 02752 if (rb_enc_asciicompat(rb_enc_from_index(encidx))) { 02753 rb_str_coderange_scan_restartable(RSTRING_PTR(str), RSTRING_END(str), 0, &cr); 02754 } 02755 else { 02756 cr = ENC_CODERANGE_VALID; 02757 } 02758 ENC_CODERANGE_SET(str, cr); 02759 return str; 02760 } 02761 02762 /* 02763 * call-seq: 02764 * str.encode!(encoding [, options] ) -> str 02765 * str.encode!(dst_encoding, src_encoding [, options] ) -> str 02766 * 02767 * The first form transcodes the contents of <i>str</i> from 02768 * str.encoding to +encoding+. 02769 * The second form transcodes the contents of <i>str</i> from 02770 * src_encoding to dst_encoding. 02771 * The options Hash gives details for conversion. See String#encode 02772 * for details. 02773 * Returns the string even if no changes were made. 02774 */ 02775 02776 static VALUE 02777 str_encode_bang(int argc, VALUE *argv, VALUE str) 02778 { 02779 VALUE newstr; 02780 int encidx; 02781 02782 rb_check_frozen(str); 02783 02784 newstr = str; 02785 encidx = str_transcode(argc, argv, &newstr); 02786 02787 if (encidx < 0) return str; 02788 if (newstr == str) { 02789 rb_enc_associate_index(str, encidx); 02790 return str; 02791 } 02792 rb_str_shared_replace(str, newstr); 02793 return str_encode_associate(str, encidx); 02794 } 02795 02796 static VALUE encoded_dup(VALUE newstr, VALUE str, int encidx); 02797 02798 /* 02799 * call-seq: 02800 * str.encode(encoding [, options] ) -> str 02801 * str.encode(dst_encoding, src_encoding [, options] ) -> str 02802 * str.encode([options]) -> str 02803 * 02804 * The first form returns a copy of +str+ transcoded 02805 * to encoding +encoding+. 02806 * The second form returns a copy of +str+ transcoded 02807 * from src_encoding to dst_encoding. 02808 * The last form returns a copy of +str+ transcoded to 02809 * <tt>Encoding.default_internal</tt>. 
02810 * 02811 * By default, the first and second form raise 02812 * Encoding::UndefinedConversionError for characters that are 02813 * undefined in the destination encoding, and 02814 * Encoding::InvalidByteSequenceError for invalid byte sequences 02815 * in the source encoding. The last form by default does not raise 02816 * exceptions but uses replacement strings. 02817 * 02818 * Please note that conversion from an encoding +enc+ to the 02819 * same encoding +enc+ is a no-op, i.e. the receiver is returned without 02820 * any changes, and no exceptions are raised, even if there are invalid bytes. 02821 * 02822 * The +options+ Hash gives details for conversion and can have the following 02823 * keys: 02824 * 02825 * :invalid :: 02826 * If the value is +:replace+, #encode replaces invalid byte sequences in 02827 * +str+ with the replacement character. The default is to raise the 02828 * Encoding::InvalidByteSequenceError exception. 02829 * :undef :: 02830 * If the value is +:replace+, #encode replaces characters which are 02831 * undefined in the destination encoding with the replacement character. 02832 * The default is to raise the Encoding::UndefinedConversionError. 02833 * :replace :: 02834 * Sets the replacement string to the given value. The default replacement 02835 * string is "\uFFFD" for Unicode encoding forms, and "?" otherwise. 02836 * :fallback :: 02837 * Sets the replacement string by the given object for undefined 02838 * character. The object should be a Hash, a Proc, a Method, or an 02839 * object which has a [] method. 02840 * Its key is an undefined character encoded in the source encoding 02841 * of the current transcoder. Its value can be in any encoding as long as it 02842 * can be converted into the destination encoding of the transcoder. 02843 * :xml :: 02844 * The value must be +:text+ or +:attr+. 02845 * If the value is +:text+ #encode replaces undefined characters with their 02846 * (upper-case hexadecimal) numeric character references. 
'&', '<', and '>' 02847 * are converted to "&", "<", and ">", respectively. 02848 * If the value is +:attr+, #encode also quotes the replacement result 02849 * (using '"'), and replaces '"' with """. 02850 * :cr_newline :: 02851 * Replaces LF ("\n") with CR ("\r") if value is true. 02852 * :crlf_newline :: 02853 * Replaces LF ("\n") with CRLF ("\r\n") if value is true. 02854 * :universal_newline :: 02855 * Replaces CRLF ("\r\n") and CR ("\r") with LF ("\n") if value is true. 02856 */ 02857 02858 static VALUE 02859 str_encode(int argc, VALUE *argv, VALUE str) 02860 { 02861 VALUE newstr = str; 02862 int encidx = str_transcode(argc, argv, &newstr); 02863 return encoded_dup(newstr, str, encidx); 02864 } 02865 02866 VALUE 02867 rb_str_encode(VALUE str, VALUE to, int ecflags, VALUE ecopts) 02868 { 02869 int argc = 1; 02870 VALUE *argv = &to; 02871 VALUE newstr = str; 02872 int encidx = str_transcode0(argc, argv, &newstr, ecflags, ecopts); 02873 return encoded_dup(newstr, str, encidx); 02874 } 02875 02876 static VALUE 02877 encoded_dup(VALUE newstr, VALUE str, int encidx) 02878 { 02879 if (encidx < 0) return rb_str_dup(str); 02880 if (newstr == str) { 02881 newstr = rb_str_dup(str); 02882 rb_enc_associate_index(newstr, encidx); 02883 return newstr; 02884 } 02885 else { 02886 RBASIC(newstr)->klass = rb_obj_class(str); 02887 } 02888 return str_encode_associate(newstr, encidx); 02889 } 02890 02891 static void 02892 econv_free(void *ptr) 02893 { 02894 rb_econv_t *ec = ptr; 02895 rb_econv_close(ec); 02896 } 02897 02898 static size_t 02899 econv_memsize(const void *ptr) 02900 { 02901 return ptr ? 
sizeof(rb_econv_t) : 0; 02902 } 02903 02904 static const rb_data_type_t econv_data_type = { 02905 "econv", 02906 {NULL, econv_free, econv_memsize,}, 02907 }; 02908 02909 static VALUE 02910 econv_s_allocate(VALUE klass) 02911 { 02912 return TypedData_Wrap_Struct(klass, &econv_data_type, NULL); 02913 } 02914 02915 static rb_encoding * 02916 make_dummy_encoding(const char *name) 02917 { 02918 rb_encoding *enc; 02919 int idx; 02920 idx = rb_define_dummy_encoding(name); 02921 enc = rb_enc_from_index(idx); 02922 return enc; 02923 } 02924 02925 static rb_encoding * 02926 make_encoding(const char *name) 02927 { 02928 rb_encoding *enc; 02929 enc = rb_enc_find(name); 02930 if (!enc) 02931 enc = make_dummy_encoding(name); 02932 return enc; 02933 } 02934 02935 static VALUE 02936 make_encobj(const char *name) 02937 { 02938 return rb_enc_from_encoding(make_encoding(name)); 02939 } 02940 02941 /* 02942 * call-seq: 02943 * Encoding::Converter.asciicompat_encoding(string) -> encoding or nil 02944 * Encoding::Converter.asciicompat_encoding(encoding) -> encoding or nil 02945 * 02946 * Returns the corresponding ASCII compatible encoding. 02947 * 02948 * Returns nil if the argument is an ASCII compatible encoding. 02949 * 02950 * "corresponding ASCII compatible encoding" is an ASCII compatible encoding which 02951 * can represents exactly the same characters as the given ASCII incompatible encoding. 02952 * So, no conversion undefined error occurs when converting between the two encodings. 
02953 * 02954 * Encoding::Converter.asciicompat_encoding("ISO-2022-JP") #=> #<Encoding:stateless-ISO-2022-JP> 02955 * Encoding::Converter.asciicompat_encoding("UTF-16BE") #=> #<Encoding:UTF-8> 02956 * Encoding::Converter.asciicompat_encoding("UTF-8") #=> nil 02957 * 02958 */ 02959 static VALUE 02960 econv_s_asciicompat_encoding(VALUE klass, VALUE arg) 02961 { 02962 const char *arg_name, *result_name; 02963 rb_encoding *arg_enc, *result_enc; 02964 02965 enc_arg(&arg, &arg_name, &arg_enc); 02966 02967 result_name = rb_econv_asciicompat_encoding(arg_name); 02968 02969 if (result_name == NULL) 02970 return Qnil; 02971 02972 result_enc = make_encoding(result_name); 02973 02974 return rb_enc_from_encoding(result_enc); 02975 } 02976 02977 static void 02978 econv_args(int argc, VALUE *argv, 02979 volatile VALUE *snamev_p, volatile VALUE *dnamev_p, 02980 const char **sname_p, const char **dname_p, 02981 rb_encoding **senc_p, rb_encoding **denc_p, 02982 int *ecflags_p, 02983 VALUE *ecopts_p) 02984 { 02985 VALUE opt, flags_v, ecopts; 02986 int sidx, didx; 02987 const char *sname, *dname; 02988 rb_encoding *senc, *denc; 02989 int ecflags; 02990 02991 argc = rb_scan_args(argc, argv, "21:", snamev_p, dnamev_p, &flags_v, &opt); 02992 02993 if (!NIL_P(flags_v)) { 02994 if (!NIL_P(opt)) { 02995 rb_error_arity(argc + 1, 2, 3); 02996 } 02997 ecflags = NUM2INT(rb_to_int(flags_v)); 02998 ecopts = Qnil; 02999 } 03000 else if (!NIL_P(opt)) { 03001 ecflags = rb_econv_prepare_opts(opt, &ecopts); 03002 } 03003 else { 03004 ecflags = 0; 03005 ecopts = Qnil; 03006 } 03007 03008 senc = NULL; 03009 sidx = rb_to_encoding_index(*snamev_p); 03010 if (0 <= sidx) { 03011 senc = rb_enc_from_index(sidx); 03012 } 03013 else { 03014 StringValue(*snamev_p); 03015 } 03016 03017 denc = NULL; 03018 didx = rb_to_encoding_index(*dnamev_p); 03019 if (0 <= didx) { 03020 denc = rb_enc_from_index(didx); 03021 } 03022 else { 03023 StringValue(*dnamev_p); 03024 } 03025 03026 sname = senc ? 
rb_enc_name(senc) : StringValueCStr(*snamev_p); 03027 dname = denc ? rb_enc_name(denc) : StringValueCStr(*dnamev_p); 03028 03029 *sname_p = sname; 03030 *dname_p = dname; 03031 *senc_p = senc; 03032 *denc_p = denc; 03033 *ecflags_p = ecflags; 03034 *ecopts_p = ecopts; 03035 } 03036 03037 static int 03038 decorate_convpath(VALUE convpath, int ecflags) 03039 { 03040 int num_decorators; 03041 const char *decorators[MAX_ECFLAGS_DECORATORS]; 03042 int i; 03043 int n, len; 03044 03045 num_decorators = decorator_names(ecflags, decorators); 03046 if (num_decorators == -1) 03047 return -1; 03048 03049 len = n = RARRAY_LENINT(convpath); 03050 if (n != 0) { 03051 VALUE pair = RARRAY_PTR(convpath)[n-1]; 03052 if (RB_TYPE_P(pair, T_ARRAY)) { 03053 const char *sname = rb_enc_name(rb_to_encoding(RARRAY_PTR(pair)[0])); 03054 const char *dname = rb_enc_name(rb_to_encoding(RARRAY_PTR(pair)[1])); 03055 transcoder_entry_t *entry = get_transcoder_entry(sname, dname); 03056 const rb_transcoder *tr = load_transcoder_entry(entry); 03057 if (!tr) 03058 return -1; 03059 if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding) && 03060 tr->asciicompat_type == asciicompat_encoder) { 03061 n--; 03062 rb_ary_store(convpath, len + num_decorators - 1, pair); 03063 } 03064 } 03065 else { 03066 rb_ary_store(convpath, len + num_decorators - 1, pair); 03067 } 03068 } 03069 03070 for (i = 0; i < num_decorators; i++) 03071 rb_ary_store(convpath, n + i, rb_str_new_cstr(decorators[i])); 03072 03073 return 0; 03074 } 03075 03076 static void 03077 search_convpath_i(const char *sname, const char *dname, int depth, void *arg) 03078 { 03079 VALUE *ary_p = arg; 03080 VALUE v; 03081 03082 if (*ary_p == Qnil) { 03083 *ary_p = rb_ary_new(); 03084 } 03085 03086 if (DECORATOR_P(sname, dname)) { 03087 v = rb_str_new_cstr(dname); 03088 } 03089 else { 03090 v = rb_assoc_new(make_encobj(sname), make_encobj(dname)); 03091 } 03092 rb_ary_store(*ary_p, depth, v); 03093 } 03094 03095 /* 03096 * call-seq: 03097 * 
Encoding::Converter.search_convpath(source_encoding, destination_encoding) -> ary 03098 * Encoding::Converter.search_convpath(source_encoding, destination_encoding, opt) -> ary 03099 * 03100 * Returns a conversion path. 03101 * 03102 * p Encoding::Converter.search_convpath("ISO-8859-1", "EUC-JP") 03103 * #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>], 03104 * # [#<Encoding:UTF-8>, #<Encoding:EUC-JP>]] 03105 * 03106 * p Encoding::Converter.search_convpath("ISO-8859-1", "EUC-JP", universal_newline: true) 03107 * or 03108 * p Encoding::Converter.search_convpath("ISO-8859-1", "EUC-JP", newline: :universal) 03109 * #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>], 03110 * # [#<Encoding:UTF-8>, #<Encoding:EUC-JP>], 03111 * # "universal_newline"] 03112 * 03113 * p Encoding::Converter.search_convpath("ISO-8859-1", "UTF-32BE", universal_newline: true) 03114 * or 03115 * p Encoding::Converter.search_convpath("ISO-8859-1", "UTF-32BE", newline: :universal) 03116 * #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>], 03117 * # "universal_newline", 03118 * # [#<Encoding:UTF-8>, #<Encoding:UTF-32BE>]] 03119 */ 03120 static VALUE 03121 econv_s_search_convpath(int argc, VALUE *argv, VALUE klass) 03122 { 03123 volatile VALUE snamev, dnamev; 03124 const char *sname, *dname; 03125 rb_encoding *senc, *denc; 03126 int ecflags; 03127 VALUE ecopts; 03128 VALUE convpath; 03129 03130 econv_args(argc, argv, &snamev, &dnamev, &sname, &dname, &senc, &denc, &ecflags, &ecopts); 03131 03132 convpath = Qnil; 03133 transcode_search_path(sname, dname, search_convpath_i, &convpath); 03134 03135 if (NIL_P(convpath)) 03136 rb_exc_raise(rb_econv_open_exc(sname, dname, ecflags)); 03137 03138 if (decorate_convpath(convpath, ecflags) == -1) 03139 rb_exc_raise(rb_econv_open_exc(sname, dname, ecflags)); 03140 03141 return convpath; 03142 } 03143 03144 /* 03145 * Check the existence of a conversion path. 03146 * Returns the number of converters in the conversion path. 
03147 * result: >=0:success -1:failure 03148 */ 03149 int 03150 rb_econv_has_convpath_p(const char* from_encoding, const char* to_encoding) 03151 { 03152 VALUE convpath = Qnil; 03153 transcode_search_path(from_encoding, to_encoding, search_convpath_i, 03154 &convpath); 03155 return RTEST(convpath); 03156 } 03157 03158 struct rb_econv_init_by_convpath_t { 03159 rb_econv_t *ec; 03160 int index; 03161 int ret; 03162 }; 03163 03164 static void 03165 rb_econv_init_by_convpath_i(const char *sname, const char *dname, int depth, void *arg) 03166 { 03167 struct rb_econv_init_by_convpath_t *a = (struct rb_econv_init_by_convpath_t *)arg; 03168 int ret; 03169 03170 if (a->ret == -1) 03171 return; 03172 03173 ret = rb_econv_add_converter(a->ec, sname, dname, a->index); 03174 03175 a->ret = ret; 03176 return; 03177 } 03178 03179 static rb_econv_t * 03180 rb_econv_init_by_convpath(VALUE self, VALUE convpath, 03181 const char **sname_p, const char **dname_p, 03182 rb_encoding **senc_p, rb_encoding**denc_p) 03183 { 03184 rb_econv_t *ec; 03185 long i; 03186 int ret, first=1; 03187 VALUE elt; 03188 rb_encoding *senc = 0, *denc = 0; 03189 const char *sname, *dname; 03190 03191 ec = rb_econv_alloc(RARRAY_LENINT(convpath)); 03192 DATA_PTR(self) = ec; 03193 03194 for (i = 0; i < RARRAY_LEN(convpath); i++) { 03195 volatile VALUE snamev, dnamev; 03196 VALUE pair; 03197 elt = rb_ary_entry(convpath, i); 03198 if (!NIL_P(pair = rb_check_array_type(elt))) { 03199 if (RARRAY_LEN(pair) != 2) 03200 rb_raise(rb_eArgError, "not a 2-element array in convpath"); 03201 snamev = rb_ary_entry(pair, 0); 03202 enc_arg(&snamev, &sname, &senc); 03203 dnamev = rb_ary_entry(pair, 1); 03204 enc_arg(&dnamev, &dname, &denc); 03205 } 03206 else { 03207 sname = ""; 03208 dname = StringValueCStr(elt); 03209 } 03210 if (DECORATOR_P(sname, dname)) { 03211 ret = rb_econv_add_converter(ec, sname, dname, ec->num_trans); 03212 if (ret == -1) 03213 rb_raise(rb_eArgError, "decoration failed: %s", dname); 03214 } 03215 else 
{ 03216 int j = ec->num_trans; 03217 struct rb_econv_init_by_convpath_t arg; 03218 arg.ec = ec; 03219 arg.index = ec->num_trans; 03220 arg.ret = 0; 03221 ret = transcode_search_path(sname, dname, rb_econv_init_by_convpath_i, &arg); 03222 if (ret == -1 || arg.ret == -1) 03223 rb_raise(rb_eArgError, "adding conversion failed: %s to %s", sname, dname); 03224 if (first) { 03225 first = 0; 03226 *senc_p = senc; 03227 *sname_p = ec->elems[j].tc->transcoder->src_encoding; 03228 } 03229 *denc_p = denc; 03230 *dname_p = ec->elems[ec->num_trans-1].tc->transcoder->dst_encoding; 03231 } 03232 } 03233 03234 if (first) { 03235 *senc_p = NULL; 03236 *denc_p = NULL; 03237 *sname_p = ""; 03238 *dname_p = ""; 03239 } 03240 03241 ec->source_encoding_name = *sname_p; 03242 ec->destination_encoding_name = *dname_p; 03243 03244 return ec; 03245 } 03246 03247 /* 03248 * call-seq: 03249 * Encoding::Converter.new(source_encoding, destination_encoding) 03250 * Encoding::Converter.new(source_encoding, destination_encoding, opt) 03251 * Encoding::Converter.new(convpath) 03252 * 03253 * possible options elements: 03254 * hash form: 03255 * :invalid => nil # raise error on invalid byte sequence (default) 03256 * :invalid => :replace # replace invalid byte sequence 03257 * :undef => nil # raise error on undefined conversion (default) 03258 * :undef => :replace # replace undefined conversion 03259 * :replace => string # replacement string ("?" or "\uFFFD" if not specified) 03260 * :newline => :universal # decorator for converting CRLF and CR to LF 03261 * :newline => :crlf # decorator for converting LF to CRLF 03262 * :newline => :cr # decorator for converting LF to CR 03263 * :universal_newline => true # decorator for converting CRLF and CR to LF 03264 * :crlf_newline => true # decorator for converting LF to CRLF 03265 * :cr_newline => true # decorator for converting LF to CR 03266 * :xml => :text # escape as XML CharData. 
03267 * :xml => :attr # escape as XML AttValue 03268 * integer form: 03269 * Encoding::Converter::INVALID_REPLACE 03270 * Encoding::Converter::UNDEF_REPLACE 03271 * Encoding::Converter::UNDEF_HEX_CHARREF 03272 * Encoding::Converter::UNIVERSAL_NEWLINE_DECORATOR 03273 * Encoding::Converter::CRLF_NEWLINE_DECORATOR 03274 * Encoding::Converter::CR_NEWLINE_DECORATOR 03275 * Encoding::Converter::XML_TEXT_DECORATOR 03276 * Encoding::Converter::XML_ATTR_CONTENT_DECORATOR 03277 * Encoding::Converter::XML_ATTR_QUOTE_DECORATOR 03278 * 03279 * Encoding::Converter.new creates an instance of Encoding::Converter. 03280 * 03281 * Source_encoding and destination_encoding should be a string or 03282 * Encoding object. 03283 * 03284 * opt should be nil, a hash or an integer. 03285 * 03286 * convpath should be an array. 03287 * convpath may contain 03288 * - two-element arrays which contain encodings or encoding names, or 03289 * - strings representing decorator names. 03290 * 03291 * Encoding::Converter.new optionally takes an option. 03292 * The option should be a hash or an integer. 03293 * The option hash can contain :invalid => nil, etc. 03294 * The option integer should be logical-or of constants such as 03295 * Encoding::Converter::INVALID_REPLACE, etc. 03296 * 03297 * [:invalid => nil] 03298 * Raise error on invalid byte sequence. This is a default behavior. 03299 * [:invalid => :replace] 03300 * Replace invalid byte sequence by replacement string. 03301 * [:undef => nil] 03302 * Raise an error if a character in source_encoding is not defined in destination_encoding. 03303 * This is a default behavior. 03304 * [:undef => :replace] 03305 * Replace undefined character in destination_encoding with replacement string. 03306 * [:replace => string] 03307 * Specify the replacement string. 03308 * If not specified, "\uFFFD" is used for Unicode encodings and "?" for others. 03309 * [:universal_newline => true] 03310 * Convert CRLF and CR to LF. 
03311 * [:crlf_newline => true] 03312 * Convert LF to CRLF. 03313 * [:cr_newline => true] 03314 * Convert LF to CR. 03315 * [:xml => :text] 03316 * Escape as XML CharData. 03317 * This form can be used as an HTML 4.0 #PCDATA. 03318 * - '&' -> '&amp;' 03319 * - '<' -> '&lt;' 03320 * - '>' -> '&gt;' 03321 * - undefined characters in destination_encoding -> hexadecimal CharRef such as &#xHH; 03322 * [:xml => :attr] 03323 * Escape as XML AttValue. 03324 * The converted result is quoted as "...". 03325 * This form can be used as an HTML 4.0 attribute value. 03326 * - '&' -> '&amp;' 03327 * - '<' -> '&lt;' 03328 * - '>' -> '&gt;' 03329 * - '"' -> '&quot;' 03330 * - undefined characters in destination_encoding -> hexadecimal CharRef such as &#xHH; 03331 * 03332 * Examples: 03333 * # UTF-16BE to UTF-8 03334 * ec = Encoding::Converter.new("UTF-16BE", "UTF-8") 03335 * 03336 * # Usually, decorators such as newline conversion are inserted last. 03337 * ec = Encoding::Converter.new("UTF-16BE", "UTF-8", :universal_newline => true) 03338 * p ec.convpath #=> [[#<Encoding:UTF-16BE>, #<Encoding:UTF-8>], 03339 * # "universal_newline"] 03340 * 03341 * # But, if the last encoding is ASCII incompatible, 03342 * # decorators are inserted before the last conversion. 03343 * ec = Encoding::Converter.new("UTF-8", "UTF-16BE", :crlf_newline => true) 03344 * p ec.convpath #=> ["crlf_newline", 03345 * # [#<Encoding:UTF-8>, #<Encoding:UTF-16BE>]] 03346 * 03347 * # Conversion path can be specified directly. 
03348 * ec = Encoding::Converter.new(["universal_newline", ["EUC-JP", "UTF-8"], ["UTF-8", "UTF-16BE"]]) 03349 * p ec.convpath #=> ["universal_newline", 03350 * # [#<Encoding:EUC-JP>, #<Encoding:UTF-8>], 03351 * # [#<Encoding:UTF-8>, #<Encoding:UTF-16BE>]] 03352 */ 03353 static VALUE 03354 econv_init(int argc, VALUE *argv, VALUE self) 03355 { 03356 VALUE ecopts; 03357 volatile VALUE snamev, dnamev; 03358 const char *sname, *dname; 03359 rb_encoding *senc, *denc; 03360 rb_econv_t *ec; 03361 int ecflags; 03362 VALUE convpath; 03363 03364 if (rb_check_typeddata(self, &econv_data_type)) { 03365 rb_raise(rb_eTypeError, "already initialized"); 03366 } 03367 03368 if (argc == 1 && !NIL_P(convpath = rb_check_array_type(argv[0]))) { 03369 ec = rb_econv_init_by_convpath(self, convpath, &sname, &dname, &senc, &denc); 03370 ecflags = 0; 03371 ecopts = Qnil; 03372 } 03373 else { 03374 econv_args(argc, argv, &snamev, &dnamev, &sname, &dname, &senc, &denc, &ecflags, &ecopts); 03375 ec = rb_econv_open_opts(sname, dname, ecflags, ecopts); 03376 } 03377 03378 if (!ec) { 03379 rb_exc_raise(rb_econv_open_exc(sname, dname, ecflags)); 03380 } 03381 03382 if (!DECORATOR_P(sname, dname)) { 03383 if (!senc) 03384 senc = make_dummy_encoding(sname); 03385 if (!denc) 03386 denc = make_dummy_encoding(dname); 03387 } 03388 03389 ec->source_encoding = senc; 03390 ec->destination_encoding = denc; 03391 03392 DATA_PTR(self) = ec; 03393 03394 return self; 03395 } 03396 03397 /* 03398 * call-seq: 03399 * ec.inspect -> string 03400 * 03401 * Returns a printable version of <i>ec</i> 03402 * 03403 * ec = Encoding::Converter.new("iso-8859-1", "utf-8") 03404 * puts ec.inspect #=> #<Encoding::Converter: ISO-8859-1 to UTF-8> 03405 * 03406 */ 03407 static VALUE 03408 econv_inspect(VALUE self) 03409 { 03410 const char *cname = rb_obj_classname(self); 03411 rb_econv_t *ec; 03412 03413 TypedData_Get_Struct(self, rb_econv_t, &econv_data_type, ec); 03414 if (!ec) 03415 return rb_sprintf("#<%s: uninitialized>", 
cname); 03416 else { 03417 const char *sname = ec->source_encoding_name; 03418 const char *dname = ec->destination_encoding_name; 03419 VALUE str; 03420 str = rb_sprintf("#<%s: ", cname); 03421 econv_description(sname, dname, ec->flags, str); 03422 rb_str_cat2(str, ">"); 03423 return str; 03424 } 03425 } 03426 03427 static rb_econv_t * 03428 check_econv(VALUE self) 03429 { 03430 rb_econv_t *ec; 03431 03432 TypedData_Get_Struct(self, rb_econv_t, &econv_data_type, ec); 03433 if (!ec) { 03434 rb_raise(rb_eTypeError, "uninitialized encoding converter"); 03435 } 03436 return ec; 03437 } 03438 03439 /* 03440 * call-seq: 03441 * ec.source_encoding -> encoding 03442 * 03443 * Returns the source encoding as an Encoding object. 03444 */ 03445 static VALUE 03446 econv_source_encoding(VALUE self) 03447 { 03448 rb_econv_t *ec = check_econv(self); 03449 if (!ec->source_encoding) 03450 return Qnil; 03451 return rb_enc_from_encoding(ec->source_encoding); 03452 } 03453 03454 /* 03455 * call-seq: 03456 * ec.destination_encoding -> encoding 03457 * 03458 * Returns the destination encoding as an Encoding object. 03459 */ 03460 static VALUE 03461 econv_destination_encoding(VALUE self) 03462 { 03463 rb_econv_t *ec = check_econv(self); 03464 if (!ec->destination_encoding) 03465 return Qnil; 03466 return rb_enc_from_encoding(ec->destination_encoding); 03467 } 03468 03469 /* 03470 * call-seq: 03471 * ec.convpath -> ary 03472 * 03473 * Returns the conversion path of ec. 03474 * 03475 * The result is an array of conversions. 03476 * 03477 * ec = Encoding::Converter.new("ISO-8859-1", "EUC-JP", crlf_newline: true) 03478 * p ec.convpath 03479 * #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>], 03480 * # [#<Encoding:UTF-8>, #<Encoding:EUC-JP>], 03481 * # "crlf_newline"] 03482 * 03483 * Each element of the array is a pair of encodings or a string. 03484 * A pair means an encoding conversion. 03485 * A string means a decorator. 
03486 * 03487 * In the above example, [#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>] means 03488 * a converter from ISO-8859-1 to UTF-8. 03489 * "crlf_newline" means newline converter from LF to CRLF. 03490 */ 03491 static VALUE 03492 econv_convpath(VALUE self) 03493 { 03494 rb_econv_t *ec = check_econv(self); 03495 VALUE result; 03496 int i; 03497 03498 result = rb_ary_new(); 03499 for (i = 0; i < ec->num_trans; i++) { 03500 const rb_transcoder *tr = ec->elems[i].tc->transcoder; 03501 VALUE v; 03502 if (DECORATOR_P(tr->src_encoding, tr->dst_encoding)) 03503 v = rb_str_new_cstr(tr->dst_encoding); 03504 else 03505 v = rb_assoc_new(make_encobj(tr->src_encoding), make_encobj(tr->dst_encoding)); 03506 rb_ary_push(result, v); 03507 } 03508 return result; 03509 } 03510 03511 /* 03512 * call-seq: 03513 * ec == other -> true or false 03514 */ 03515 static VALUE 03516 econv_equal(VALUE self, VALUE other) 03517 { 03518 rb_econv_t *ec1 = check_econv(self); 03519 rb_econv_t *ec2; 03520 int i; 03521 03522 if (!rb_typeddata_is_kind_of(other, &econv_data_type)) { 03523 return Qnil; 03524 } 03525 ec2 = DATA_PTR(other); 03526 if (!ec2) return Qfalse; 03527 if (ec1->source_encoding_name != ec2->source_encoding_name && 03528 strcmp(ec1->source_encoding_name, ec2->source_encoding_name)) 03529 return Qfalse; 03530 if (ec1->destination_encoding_name != ec2->destination_encoding_name && 03531 strcmp(ec1->destination_encoding_name, ec2->destination_encoding_name)) 03532 return Qfalse; 03533 if (ec1->flags != ec2->flags) return Qfalse; 03534 if (ec1->replacement_enc != ec2->replacement_enc && 03535 strcmp(ec1->replacement_enc, ec2->replacement_enc)) 03536 return Qfalse; 03537 if (ec1->replacement_len != ec2->replacement_len) return Qfalse; 03538 if (ec1->replacement_str != ec2->replacement_str && 03539 memcmp(ec1->replacement_str, ec2->replacement_str, ec2->replacement_len)) 03540 return Qfalse; 03541 03542 if (ec1->num_trans != ec2->num_trans) return Qfalse; 03543 for (i = 0; i < 
ec1->num_trans; i++) { 03544 if (ec1->elems[i].tc->transcoder != ec2->elems[i].tc->transcoder) 03545 return Qfalse; 03546 } 03547 return Qtrue; 03548 } 03549 03550 static VALUE 03551 econv_result_to_symbol(rb_econv_result_t res) 03552 { 03553 switch (res) { 03554 case econv_invalid_byte_sequence: return sym_invalid_byte_sequence; 03555 case econv_incomplete_input: return sym_incomplete_input; 03556 case econv_undefined_conversion: return sym_undefined_conversion; 03557 case econv_destination_buffer_full: return sym_destination_buffer_full; 03558 case econv_source_buffer_empty: return sym_source_buffer_empty; 03559 case econv_finished: return sym_finished; 03560 case econv_after_output: return sym_after_output; 03561 default: return INT2NUM(res); /* should not be reached */ 03562 } 03563 } 03564 03565 /* 03566 * call-seq: 03567 * ec.primitive_convert(source_buffer, destination_buffer) -> symbol 03568 * ec.primitive_convert(source_buffer, destination_buffer, destination_byteoffset) -> symbol 03569 * ec.primitive_convert(source_buffer, destination_buffer, destination_byteoffset, destination_bytesize) -> symbol 03570 * ec.primitive_convert(source_buffer, destination_buffer, destination_byteoffset, destination_bytesize, opt) -> symbol 03571 * 03572 * possible opt elements: 03573 * hash form: 03574 * :partial_input => true # source buffer may be part of larger source 03575 * :after_output => true # stop conversion after output before input 03576 * integer form: 03577 * Encoding::Converter::PARTIAL_INPUT 03578 * Encoding::Converter::AFTER_OUTPUT 03579 * 03580 * possible results: 03581 * :invalid_byte_sequence 03582 * :incomplete_input 03583 * :undefined_conversion 03584 * :after_output 03585 * :destination_buffer_full 03586 * :source_buffer_empty 03587 * :finished 03588 * 03589 * primitive_convert converts source_buffer into destination_buffer. 03590 * 03591 * source_buffer should be a string or nil. 03592 * nil means an empty string. 
03593 * 03594 * destination_buffer should be a string. 03595 * 03596 * destination_byteoffset should be an integer or nil. 03597 * nil means the end of destination_buffer. 03598 * If it is omitted, nil is assumed. 03599 * 03600 * destination_bytesize should be an integer or nil. 03601 * nil means unlimited. 03602 * If it is omitted, nil is assumed. 03603 * 03604 * opt should be nil, a hash or an integer. 03605 * nil means no flags. 03606 * If it is omitted, nil is assumed. 03607 * 03608 * primitive_convert converts the content of source_buffer from beginning 03609 * and store the result into destination_buffer. 03610 * 03611 * destination_byteoffset and destination_bytesize specify the region which 03612 * the converted result is stored. 03613 * destination_byteoffset specifies the start position in destination_buffer in bytes. 03614 * If destination_byteoffset is nil, 03615 * destination_buffer.bytesize is used for appending the result. 03616 * destination_bytesize specifies maximum number of bytes. 03617 * If destination_bytesize is nil, 03618 * destination size is unlimited. 03619 * After conversion, destination_buffer is resized to 03620 * destination_byteoffset + actually produced number of bytes. 03621 * Also destination_buffer's encoding is set to destination_encoding. 03622 * 03623 * primitive_convert drops the converted part of source_buffer. 03624 * the dropped part is converted in destination_buffer or 03625 * buffered in Encoding::Converter object. 03626 * 03627 * primitive_convert stops conversion when one of following condition met. 03628 * - invalid byte sequence found in source buffer (:invalid_byte_sequence) 03629 * +primitive_errinfo+ and +last_error+ methods returns the detail of the error. 03630 * - unexpected end of source buffer (:incomplete_input) 03631 * this occur only when :partial_input is not specified. 03632 * +primitive_errinfo+ and +last_error+ methods returns the detail of the error. 
03633 * - character not representable in output encoding (:undefined_conversion) 03634 * +primitive_errinfo+ and +last_error+ methods returns the detail of the error. 03635 * - after some output is generated, before input is done (:after_output) 03636 * this occur only when :after_output is specified. 03637 * - destination buffer is full (:destination_buffer_full) 03638 * this occur only when destination_bytesize is non-nil. 03639 * - source buffer is empty (:source_buffer_empty) 03640 * this occur only when :partial_input is specified. 03641 * - conversion is finished (:finished) 03642 * 03643 * example: 03644 * ec = Encoding::Converter.new("UTF-8", "UTF-16BE") 03645 * ret = ec.primitive_convert(src="pi", dst="", nil, 100) 03646 * p [ret, src, dst] #=> [:finished, "", "\x00p\x00i"] 03647 * 03648 * ec = Encoding::Converter.new("UTF-8", "UTF-16BE") 03649 * ret = ec.primitive_convert(src="pi", dst="", nil, 1) 03650 * p [ret, src, dst] #=> [:destination_buffer_full, "i", "\x00"] 03651 * ret = ec.primitive_convert(src, dst="", nil, 1) 03652 * p [ret, src, dst] #=> [:destination_buffer_full, "", "p"] 03653 * ret = ec.primitive_convert(src, dst="", nil, 1) 03654 * p [ret, src, dst] #=> [:destination_buffer_full, "", "\x00"] 03655 * ret = ec.primitive_convert(src, dst="", nil, 1) 03656 * p [ret, src, dst] #=> [:finished, "", "i"] 03657 * 03658 */ 03659 static VALUE 03660 econv_primitive_convert(int argc, VALUE *argv, VALUE self) 03661 { 03662 VALUE input, output, output_byteoffset_v, output_bytesize_v, opt, flags_v; 03663 rb_econv_t *ec = check_econv(self); 03664 rb_econv_result_t res; 03665 const unsigned char *ip, *is; 03666 unsigned char *op, *os; 03667 long output_byteoffset, output_bytesize; 03668 unsigned long output_byteend; 03669 int flags; 03670 03671 argc = rb_scan_args(argc, argv, "23:", &input, &output, &output_byteoffset_v, &output_bytesize_v, &flags_v, &opt); 03672 03673 if (NIL_P(output_byteoffset_v)) 03674 output_byteoffset = 0; /* dummy */ 03675 else 
03676 output_byteoffset = NUM2LONG(output_byteoffset_v); 03677 03678 if (NIL_P(output_bytesize_v)) 03679 output_bytesize = 0; /* dummy */ 03680 else 03681 output_bytesize = NUM2LONG(output_bytesize_v); 03682 03683 if (!NIL_P(flags_v)) { 03684 if (!NIL_P(opt)) { 03685 rb_error_arity(argc + 1, 2, 5); 03686 } 03687 flags = NUM2INT(rb_to_int(flags_v)); 03688 } 03689 else if (!NIL_P(opt)) { 03690 VALUE v; 03691 flags = 0; 03692 v = rb_hash_aref(opt, sym_partial_input); 03693 if (RTEST(v)) 03694 flags |= ECONV_PARTIAL_INPUT; 03695 v = rb_hash_aref(opt, sym_after_output); 03696 if (RTEST(v)) 03697 flags |= ECONV_AFTER_OUTPUT; 03698 } 03699 else { 03700 flags = 0; 03701 } 03702 03703 StringValue(output); 03704 if (!NIL_P(input)) 03705 StringValue(input); 03706 rb_str_modify(output); 03707 03708 if (NIL_P(output_bytesize_v)) { 03709 output_bytesize = RSTRING_EMBED_LEN_MAX; 03710 if (!NIL_P(input) && output_bytesize < RSTRING_LEN(input)) 03711 output_bytesize = RSTRING_LEN(input); 03712 } 03713 03714 retry: 03715 03716 if (NIL_P(output_byteoffset_v)) 03717 output_byteoffset = RSTRING_LEN(output); 03718 03719 if (output_byteoffset < 0) 03720 rb_raise(rb_eArgError, "negative output_byteoffset"); 03721 03722 if (RSTRING_LEN(output) < output_byteoffset) 03723 rb_raise(rb_eArgError, "output_byteoffset too big"); 03724 03725 if (output_bytesize < 0) 03726 rb_raise(rb_eArgError, "negative output_bytesize"); 03727 03728 output_byteend = (unsigned long)output_byteoffset + 03729 (unsigned long)output_bytesize; 03730 03731 if (output_byteend < (unsigned long)output_byteoffset || 03732 LONG_MAX < output_byteend) 03733 rb_raise(rb_eArgError, "output_byteoffset+output_bytesize too big"); 03734 03735 if (rb_str_capacity(output) < output_byteend) 03736 rb_str_resize(output, output_byteend); 03737 03738 if (NIL_P(input)) { 03739 ip = is = NULL; 03740 } 03741 else { 03742 ip = (const unsigned char *)RSTRING_PTR(input); 03743 is = ip + RSTRING_LEN(input); 03744 } 03745 03746 op = (unsigned 
char *)RSTRING_PTR(output) + output_byteoffset; 03747 os = op + output_bytesize; 03748 03749 res = rb_econv_convert(ec, &ip, is, &op, os, flags); 03750 rb_str_set_len(output, op-(unsigned char *)RSTRING_PTR(output)); 03751 if (!NIL_P(input)) 03752 rb_str_drop_bytes(input, ip - (unsigned char *)RSTRING_PTR(input)); 03753 03754 if (NIL_P(output_bytesize_v) && res == econv_destination_buffer_full) { 03755 if (LONG_MAX / 2 < output_bytesize) 03756 rb_raise(rb_eArgError, "too long conversion result"); 03757 output_bytesize *= 2; 03758 output_byteoffset_v = Qnil; 03759 goto retry; 03760 } 03761 03762 if (ec->destination_encoding) { 03763 rb_enc_associate(output, ec->destination_encoding); 03764 } 03765 03766 return econv_result_to_symbol(res); 03767 } 03768 03769 /* 03770 * call-seq: 03771 * ec.convert(source_string) -> destination_string 03772 * 03773 * Convert source_string and return destination_string. 03774 * 03775 * source_string is assumed as a part of source. 03776 * i.e. :partial_input=>true is specified internally. 03777 * finish method should be used last. 03778 * 03779 * ec = Encoding::Converter.new("utf-8", "euc-jp") 03780 * puts ec.convert("\u3042").dump #=> "\xA4\xA2" 03781 * puts ec.finish.dump #=> "" 03782 * 03783 * ec = Encoding::Converter.new("euc-jp", "utf-8") 03784 * puts ec.convert("\xA4").dump #=> "" 03785 * puts ec.convert("\xA2").dump #=> "\xE3\x81\x82" 03786 * puts ec.finish.dump #=> "" 03787 * 03788 * ec = Encoding::Converter.new("utf-8", "iso-2022-jp") 03789 * puts ec.convert("\xE3").dump #=> "".force_encoding("ISO-2022-JP") 03790 * puts ec.convert("\x81").dump #=> "".force_encoding("ISO-2022-JP") 03791 * puts ec.convert("\x82").dump #=> "\e$B$\"".force_encoding("ISO-2022-JP") 03792 * puts ec.finish.dump #=> "\e(B".force_encoding("ISO-2022-JP") 03793 * 03794 * If a conversion error occur, 03795 * Encoding::UndefinedConversionError or 03796 * Encoding::InvalidByteSequenceError is raised. 
03797 * Encoding::Converter#convert doesn't supply methods to recover or restart 03798 * from these exceptions. 03799 * When you want to handle these conversion errors, 03800 * use Encoding::Converter#primitive_convert. 03801 * 03802 */ 03803 static VALUE 03804 econv_convert(VALUE self, VALUE source_string) 03805 { 03806 VALUE ret, dst; 03807 VALUE av[5]; 03808 int ac; 03809 rb_econv_t *ec = check_econv(self); 03810 03811 StringValue(source_string); 03812 03813 dst = rb_str_new(NULL, 0); 03814 03815 av[0] = rb_str_dup(source_string); 03816 av[1] = dst; 03817 av[2] = Qnil; 03818 av[3] = Qnil; 03819 av[4] = INT2NUM(ECONV_PARTIAL_INPUT); 03820 ac = 5; 03821 03822 ret = econv_primitive_convert(ac, av, self); 03823 03824 if (ret == sym_invalid_byte_sequence || 03825 ret == sym_undefined_conversion || 03826 ret == sym_incomplete_input) { 03827 VALUE exc = make_econv_exception(ec); 03828 rb_exc_raise(exc); 03829 } 03830 03831 if (ret == sym_finished) { 03832 rb_raise(rb_eArgError, "converter already finished"); 03833 } 03834 03835 if (ret != sym_source_buffer_empty) { 03836 rb_bug("unexpected result of econv_primitive_convert"); 03837 } 03838 03839 return dst; 03840 } 03841 03842 /* 03843 * call-seq: 03844 * ec.finish -> string 03845 * 03846 * Finishes the converter. 03847 * It returns the last part of the converted string. 
03848 * 03849 * ec = Encoding::Converter.new("utf-8", "iso-2022-jp") 03850 * p ec.convert("\u3042") #=> "\e$B$\"" 03851 * p ec.finish #=> "\e(B" 03852 */ 03853 static VALUE 03854 econv_finish(VALUE self) 03855 { 03856 VALUE ret, dst; 03857 VALUE av[5]; 03858 int ac; 03859 rb_econv_t *ec = check_econv(self); 03860 03861 dst = rb_str_new(NULL, 0); 03862 03863 av[0] = Qnil; 03864 av[1] = dst; 03865 av[2] = Qnil; 03866 av[3] = Qnil; 03867 av[4] = INT2NUM(0); 03868 ac = 5; 03869 03870 ret = econv_primitive_convert(ac, av, self); 03871 03872 if (ret == sym_invalid_byte_sequence || 03873 ret == sym_undefined_conversion || 03874 ret == sym_incomplete_input) { 03875 VALUE exc = make_econv_exception(ec); 03876 rb_exc_raise(exc); 03877 } 03878 03879 if (ret != sym_finished) { 03880 rb_bug("unexpected result of econv_primitive_convert"); 03881 } 03882 03883 return dst; 03884 } 03885 03886 /* 03887 * call-seq: 03888 * ec.primitive_errinfo -> array 03889 * 03890 * primitive_errinfo returns important information regarding the last error 03891 * as a 5-element array: 03892 * 03893 * [result, enc1, enc2, error_bytes, readagain_bytes] 03894 * 03895 * result is the last result of primitive_convert. 03896 * 03897 * Other elements are only meaningful when result is 03898 * :invalid_byte_sequence, :incomplete_input or :undefined_conversion. 03899 * 03900 * enc1 and enc2 indicate a conversion step as a pair of strings. 03901 * For example, a converter from EUC-JP to ISO-8859-1 converts 03902 * a string as follows: EUC-JP -> UTF-8 -> ISO-8859-1. 03903 * So [enc1, enc2] is either ["EUC-JP", "UTF-8"] or ["UTF-8", "ISO-8859-1"]. 03904 * 03905 * error_bytes and readagain_bytes indicate the byte sequences which caused the error. 03906 * error_bytes is discarded portion. 03907 * readagain_bytes is buffered portion which is read again on next conversion. 03908 * 03909 * Example: 03910 * 03911 * # \xff is invalid as EUC-JP. 
03912 * ec = Encoding::Converter.new("EUC-JP", "Shift_JIS") 03913 * ec.primitive_convert(src="\xff", dst="", nil, 10) 03914 * p ec.primitive_errinfo 03915 * #=> [:invalid_byte_sequence, "EUC-JP", "UTF-8", "\xFF", ""] 03916 * 03917 * # HIRAGANA LETTER A (\xa4\xa2 in EUC-JP) is not representable in ISO-8859-1. 03918 * # Since this error is occur in UTF-8 to ISO-8859-1 conversion, 03919 * # error_bytes is HIRAGANA LETTER A in UTF-8 (\xE3\x81\x82). 03920 * ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1") 03921 * ec.primitive_convert(src="\xa4\xa2", dst="", nil, 10) 03922 * p ec.primitive_errinfo 03923 * #=> [:undefined_conversion, "UTF-8", "ISO-8859-1", "\xE3\x81\x82", ""] 03924 * 03925 * # partial character is invalid 03926 * ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1") 03927 * ec.primitive_convert(src="\xa4", dst="", nil, 10) 03928 * p ec.primitive_errinfo 03929 * #=> [:incomplete_input, "EUC-JP", "UTF-8", "\xA4", ""] 03930 * 03931 * # Encoding::Converter::PARTIAL_INPUT prevents invalid errors by 03932 * # partial characters. 03933 * ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1") 03934 * ec.primitive_convert(src="\xa4", dst="", nil, 10, Encoding::Converter::PARTIAL_INPUT) 03935 * p ec.primitive_errinfo 03936 * #=> [:source_buffer_empty, nil, nil, nil, nil] 03937 * 03938 * # \xd8\x00\x00@ is invalid as UTF-16BE because 03939 * # no low surrogate after high surrogate (\xd8\x00). 03940 * # It is detected by 3rd byte (\00) which is part of next character. 03941 * # So the high surrogate (\xd8\x00) is discarded and 03942 * # the 3rd byte is read again later. 03943 * # Since the byte is buffered in ec, it is dropped from src. 
03944 * ec = Encoding::Converter.new("UTF-16BE", "UTF-8") 03945 * ec.primitive_convert(src="\xd8\x00\x00@", dst="", nil, 10) 03946 * p ec.primitive_errinfo 03947 * #=> [:invalid_byte_sequence, "UTF-16BE", "UTF-8", "\xD8\x00", "\x00"] 03948 * p src 03949 * #=> "@" 03950 * 03951 * # Similar to UTF-16BE, \x00\xd8@\x00 is invalid as UTF-16LE. 03952 * # The problem is detected by 4th byte. 03953 * ec = Encoding::Converter.new("UTF-16LE", "UTF-8") 03954 * ec.primitive_convert(src="\x00\xd8@\x00", dst="", nil, 10) 03955 * p ec.primitive_errinfo 03956 * #=> [:invalid_byte_sequence, "UTF-16LE", "UTF-8", "\x00\xD8", "@\x00"] 03957 * p src 03958 * #=> "" 03959 * 03960 */ 03961 static VALUE 03962 econv_primitive_errinfo(VALUE self) 03963 { 03964 rb_econv_t *ec = check_econv(self); 03965 03966 VALUE ary; 03967 03968 ary = rb_ary_new2(5); 03969 03970 rb_ary_store(ary, 0, econv_result_to_symbol(ec->last_error.result)); 03971 rb_ary_store(ary, 4, Qnil); 03972 03973 if (ec->last_error.source_encoding) 03974 rb_ary_store(ary, 1, rb_str_new2(ec->last_error.source_encoding)); 03975 03976 if (ec->last_error.destination_encoding) 03977 rb_ary_store(ary, 2, rb_str_new2(ec->last_error.destination_encoding)); 03978 03979 if (ec->last_error.error_bytes_start) { 03980 rb_ary_store(ary, 3, rb_str_new((const char *)ec->last_error.error_bytes_start, ec->last_error.error_bytes_len)); 03981 rb_ary_store(ary, 4, rb_str_new((const char *)ec->last_error.error_bytes_start + ec->last_error.error_bytes_len, ec->last_error.readagain_len)); 03982 } 03983 03984 return ary; 03985 } 03986 03987 /* 03988 * call-seq: 03989 * ec.insert_output(string) -> nil 03990 * 03991 * Inserts string into the encoding converter. 03992 * The string will be converted to the destination encoding and 03993 * output on later conversions. 03994 * 03995 * If the destination encoding is stateful, 03996 * string is converted according to the state and the state is updated. 
03997 * 03998 * This method should be used only when a conversion error occurs. 03999 * 04000 * ec = Encoding::Converter.new("utf-8", "iso-8859-1") 04001 * src = "HIRAGANA LETTER A is \u{3042}." 04002 * dst = "" 04003 * p ec.primitive_convert(src, dst) #=> :undefined_conversion 04004 * puts "[#{dst.dump}, #{src.dump}]" #=> ["HIRAGANA LETTER A is ", "."] 04005 * ec.insert_output("<err>") 04006 * p ec.primitive_convert(src, dst) #=> :finished 04007 * puts "[#{dst.dump}, #{src.dump}]" #=> ["HIRAGANA LETTER A is <err>.", ""] 04008 * 04009 * ec = Encoding::Converter.new("utf-8", "iso-2022-jp") 04010 * src = "\u{306F 3041 3068 2661 3002}" # U+2661 is not representable in iso-2022-jp 04011 * dst = "" 04012 * p ec.primitive_convert(src, dst) #=> :undefined_conversion 04013 * puts "[#{dst.dump}, #{src.dump}]" #=> ["\e$B$O$!$H".force_encoding("ISO-2022-JP"), "\xE3\x80\x82"] 04014 * ec.insert_output "?" # state change required to output "?". 04015 * p ec.primitive_convert(src, dst) #=> :finished 04016 * puts "[#{dst.dump}, #{src.dump}]" #=> ["\e$B$O$!$H\e(B?\e$B!#\e(B".force_encoding("ISO-2022-JP"), ""] 04017 * 04018 */ 04019 static VALUE 04020 econv_insert_output(VALUE self, VALUE string) 04021 { 04022 const char *insert_enc; 04023 04024 int ret; 04025 04026 rb_econv_t *ec = check_econv(self); 04027 04028 StringValue(string); 04029 insert_enc = rb_econv_encoding_to_insert_output(ec); 04030 string = rb_str_encode(string, rb_enc_from_encoding(rb_enc_find(insert_enc)), 0, Qnil); 04031 04032 ret = rb_econv_insert_output(ec, (const unsigned char *)RSTRING_PTR(string), RSTRING_LEN(string), insert_enc); 04033 if (ret == -1) { 04034 rb_raise(rb_eArgError, "too big string"); 04035 } 04036 04037 return Qnil; 04038 } 04039 04040 /* 04041 * call-seq 04042 * ec.putback -> string 04043 * ec.putback(max_numbytes) -> string 04044 * 04045 * Put back the bytes which will be converted. 04046 * 04047 * The bytes are caused by invalid_byte_sequence error. 
04048 * When invalid_byte_sequence error, some bytes are discarded and 04049 * some bytes are buffered to be converted later. 04050 * The latter bytes can be put back. 04051 * It can be observed by 04052 * Encoding::InvalidByteSequenceError#readagain_bytes and 04053 * Encoding::Converter#primitive_errinfo. 04054 * 04055 * ec = Encoding::Converter.new("utf-16le", "iso-8859-1") 04056 * src = "\x00\xd8\x61\x00" 04057 * dst = "" 04058 * p ec.primitive_convert(src, dst) #=> :invalid_byte_sequence 04059 * p ec.primitive_errinfo #=> [:invalid_byte_sequence, "UTF-16LE", "UTF-8", "\x00\xD8", "a\x00"] 04060 * p ec.putback #=> "a\x00" 04061 * p ec.putback #=> "" # no more bytes to put back 04062 * 04063 */ 04064 static VALUE 04065 econv_putback(int argc, VALUE *argv, VALUE self) 04066 { 04067 rb_econv_t *ec = check_econv(self); 04068 int n; 04069 int putbackable; 04070 VALUE str, max; 04071 04072 rb_scan_args(argc, argv, "01", &max); 04073 04074 if (NIL_P(max)) 04075 n = rb_econv_putbackable(ec); 04076 else { 04077 n = NUM2INT(max); 04078 putbackable = rb_econv_putbackable(ec); 04079 if (putbackable < n) 04080 n = putbackable; 04081 } 04082 04083 str = rb_str_new(NULL, n); 04084 rb_econv_putback(ec, (unsigned char *)RSTRING_PTR(str), n); 04085 04086 if (ec->source_encoding) { 04087 rb_enc_associate(str, ec->source_encoding); 04088 } 04089 04090 return str; 04091 } 04092 04093 /* 04094 * call-seq: 04095 * ec.last_error -> exception or nil 04096 * 04097 * Returns an exception object for the last conversion. 04098 * Returns nil if the last conversion did not produce an error. 04099 * 04100 * "error" means that 04101 * Encoding::InvalidByteSequenceError and Encoding::UndefinedConversionError for 04102 * Encoding::Converter#convert and 04103 * :invalid_byte_sequence, :incomplete_input and :undefined_conversion for 04104 * Encoding::Converter#primitive_convert. 
04105 * 04106 * ec = Encoding::Converter.new("utf-8", "iso-8859-1") 04107 * p ec.primitive_convert(src="\xf1abcd", dst="") #=> :invalid_byte_sequence 04108 * p ec.last_error #=> #<Encoding::InvalidByteSequenceError: "\xF1" followed by "a" on UTF-8> 04109 * p ec.primitive_convert(src, dst, nil, 1) #=> :destination_buffer_full 04110 * p ec.last_error #=> nil 04111 * 04112 */ 04113 static VALUE 04114 econv_last_error(VALUE self) 04115 { 04116 rb_econv_t *ec = check_econv(self); 04117 VALUE exc; 04118 04119 exc = make_econv_exception(ec); 04120 if (NIL_P(exc)) 04121 return Qnil; 04122 return exc; 04123 } 04124 04125 /* 04126 * call-seq: 04127 * ec.replacement -> string 04128 * 04129 * Returns the replacement string. 04130 * 04131 * ec = Encoding::Converter.new("euc-jp", "us-ascii") 04132 * p ec.replacement #=> "?" 04133 * 04134 * ec = Encoding::Converter.new("euc-jp", "utf-8") 04135 * p ec.replacement #=> "\uFFFD" 04136 */ 04137 static VALUE 04138 econv_get_replacement(VALUE self) 04139 { 04140 rb_econv_t *ec = check_econv(self); 04141 int ret; 04142 rb_encoding *enc; 04143 04144 ret = make_replacement(ec); 04145 if (ret == -1) { 04146 rb_raise(rb_eUndefinedConversionError, "replacement character setup failed"); 04147 } 04148 04149 enc = rb_enc_find(ec->replacement_enc); 04150 return rb_enc_str_new((const char *)ec->replacement_str, (long)ec->replacement_len, enc); 04151 } 04152 04153 /* 04154 * call-seq: 04155 * ec.replacement = string 04156 * 04157 * Sets the replacement string. 
04158 * 04159 * ec = Encoding::Converter.new("utf-8", "us-ascii", :undef => :replace) 04160 * ec.replacement = "<undef>" 04161 * p ec.convert("a \u3042 b") #=> "a <undef> b" 04162 */ 04163 static VALUE 04164 econv_set_replacement(VALUE self, VALUE arg) 04165 { 04166 rb_econv_t *ec = check_econv(self); 04167 VALUE string = arg; 04168 int ret; 04169 rb_encoding *enc; 04170 04171 StringValue(string); 04172 enc = rb_enc_get(string); 04173 04174 ret = rb_econv_set_replacement(ec, 04175 (const unsigned char *)RSTRING_PTR(string), 04176 RSTRING_LEN(string), 04177 rb_enc_name(enc)); 04178 04179 if (ret == -1) { 04180 /* xxx: rb_eInvalidByteSequenceError? */ 04181 rb_raise(rb_eUndefinedConversionError, "replacement character setup failed"); 04182 } 04183 04184 return arg; 04185 } 04186 04187 VALUE 04188 rb_econv_make_exception(rb_econv_t *ec) 04189 { 04190 return make_econv_exception(ec); 04191 } 04192 04193 void 04194 rb_econv_check_error(rb_econv_t *ec) 04195 { 04196 VALUE exc; 04197 04198 exc = make_econv_exception(ec); 04199 if (NIL_P(exc)) 04200 return; 04201 rb_exc_raise(exc); 04202 } 04203 04204 /* 04205 * call-seq: 04206 * ecerr.source_encoding_name -> string 04207 * 04208 * Returns the source encoding name as a string. 04209 */ 04210 static VALUE 04211 ecerr_source_encoding_name(VALUE self) 04212 { 04213 return rb_attr_get(self, rb_intern("source_encoding_name")); 04214 } 04215 04216 /* 04217 * call-seq: 04218 * ecerr.source_encoding -> encoding 04219 * 04220 * Returns the source encoding as an encoding object. 04221 * 04222 * Note that the result may not be equal to the source encoding of 04223 * the encoding converter if the conversion has multiple steps. 04224 * 04225 * ec = Encoding::Converter.new("ISO-8859-1", "EUC-JP") # ISO-8859-1 -> UTF-8 -> EUC-JP 04226 * begin 04227 * ec.convert("\xa0") # NO-BREAK SPACE, which is available in UTF-8 but not in EUC-JP. 
04228 * rescue Encoding::UndefinedConversionError 04229 * p $!.source_encoding #=> #<Encoding:UTF-8> 04230 * p $!.destination_encoding #=> #<Encoding:EUC-JP> 04231 * p $!.source_encoding_name #=> "UTF-8" 04232 * p $!.destination_encoding_name #=> "EUC-JP" 04233 * end 04234 * 04235 */ 04236 static VALUE 04237 ecerr_source_encoding(VALUE self) 04238 { 04239 return rb_attr_get(self, rb_intern("source_encoding")); 04240 } 04241 04242 /* 04243 * call-seq: 04244 * ecerr.destination_encoding_name -> string 04245 * 04246 * Returns the destination encoding name as a string. 04247 */ 04248 static VALUE 04249 ecerr_destination_encoding_name(VALUE self) 04250 { 04251 return rb_attr_get(self, rb_intern("destination_encoding_name")); 04252 } 04253 04254 /* 04255 * call-seq: 04256 * ecerr.destination_encoding -> string 04257 * 04258 * Returns the destination encoding as an encoding object. 04259 */ 04260 static VALUE 04261 ecerr_destination_encoding(VALUE self) 04262 { 04263 return rb_attr_get(self, rb_intern("destination_encoding")); 04264 } 04265 04266 /* 04267 * call-seq: 04268 * ecerr.error_char -> string 04269 * 04270 * Returns the one-character string which cause Encoding::UndefinedConversionError. 04271 * 04272 * ec = Encoding::Converter.new("ISO-8859-1", "EUC-JP") 04273 * begin 04274 * ec.convert("\xa0") 04275 * rescue Encoding::UndefinedConversionError 04276 * puts $!.error_char.dump #=> "\xC2\xA0" 04277 * p $!.error_char.encoding #=> #<Encoding:UTF-8> 04278 * end 04279 * 04280 */ 04281 static VALUE 04282 ecerr_error_char(VALUE self) 04283 { 04284 return rb_attr_get(self, rb_intern("error_char")); 04285 } 04286 04287 /* 04288 * call-seq: 04289 * ecerr.error_bytes -> string 04290 * 04291 * Returns the discarded bytes when Encoding::InvalidByteSequenceError occurs. 04292 * 04293 * ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1") 04294 * begin 04295 * ec.convert("abc\xA1\xFFdef") 04296 * rescue Encoding::InvalidByteSequenceError 04297 * p $! 
#=> #<Encoding::InvalidByteSequenceError: "\xA1" followed by "\xFF" on EUC-JP> 04298 * puts $!.error_bytes.dump #=> "\xA1" 04299 * puts $!.readagain_bytes.dump #=> "\xFF" 04300 * end 04301 */ 04302 static VALUE 04303 ecerr_error_bytes(VALUE self) 04304 { 04305 return rb_attr_get(self, rb_intern("error_bytes")); 04306 } 04307 04308 /* 04309 * call-seq: 04310 * ecerr.readagain_bytes -> string 04311 * 04312 * Returns the bytes to be read again when Encoding::InvalidByteSequenceError occurs. 04313 */ 04314 static VALUE 04315 ecerr_readagain_bytes(VALUE self) 04316 { 04317 return rb_attr_get(self, rb_intern("readagain_bytes")); 04318 } 04319 04320 /* 04321 * call-seq: 04322 * ecerr.incomplete_input? -> true or false 04323 * 04324 * Returns true if the invalid byte sequence error is caused by 04325 * premature end of string. 04326 * 04327 * ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1") 04328 * 04329 * begin 04330 * ec.convert("abc\xA1z") 04331 * rescue Encoding::InvalidByteSequenceError 04332 * p $! #=> #<Encoding::InvalidByteSequenceError: "\xA1" followed by "z" on EUC-JP> 04333 * p $!.incomplete_input? #=> false 04334 * end 04335 * 04336 * begin 04337 * ec.convert("abc\xA1") 04338 * ec.finish 04339 * rescue Encoding::InvalidByteSequenceError 04340 * p $! #=> #<Encoding::InvalidByteSequenceError: incomplete "\xA1" on EUC-JP> 04341 * p $!.incomplete_input? #=> true 04342 * end 04343 */ 04344 static VALUE 04345 ecerr_incomplete_input(VALUE self) 04346 { 04347 return rb_attr_get(self, rb_intern("incomplete_input")); 04348 } 04349 04350 /* 04351 * Document-class: Encoding::UndefinedConversionError 04352 * 04353 * Raised by Encoding and String methods when a transcoding operation 04354 * fails. 04355 */ 04356 04357 /* 04358 * Document-class: Encoding::InvalidByteSequenceError 04359 * 04360 * Raised by Encoding and String methods when the string being 04361 * transcoded contains a byte invalid for the either the source or 04362 * target encoding. 
04363 */ 04364 04365 /* 04366 * Document-class: Encoding::ConverterNotFoundError 04367 * 04368 * Raised by transcoding methods when a named encoding does not 04369 * correspond with a known converter. 04370 */ 04371 04372 void 04373 Init_transcode(void) 04374 { 04375 rb_eUndefinedConversionError = rb_define_class_under(rb_cEncoding, "UndefinedConversionError", rb_eEncodingError); 04376 rb_eInvalidByteSequenceError = rb_define_class_under(rb_cEncoding, "InvalidByteSequenceError", rb_eEncodingError); 04377 rb_eConverterNotFoundError = rb_define_class_under(rb_cEncoding, "ConverterNotFoundError", rb_eEncodingError); 04378 04379 transcoder_table = st_init_strcasetable(); 04380 04381 sym_invalid = ID2SYM(rb_intern("invalid")); 04382 sym_undef = ID2SYM(rb_intern("undef")); 04383 sym_replace = ID2SYM(rb_intern("replace")); 04384 sym_fallback = ID2SYM(rb_intern("fallback")); 04385 sym_aref = ID2SYM(rb_intern("[]")); 04386 sym_xml = ID2SYM(rb_intern("xml")); 04387 sym_text = ID2SYM(rb_intern("text")); 04388 sym_attr = ID2SYM(rb_intern("attr")); 04389 04390 sym_invalid_byte_sequence = ID2SYM(rb_intern("invalid_byte_sequence")); 04391 sym_undefined_conversion = ID2SYM(rb_intern("undefined_conversion")); 04392 sym_destination_buffer_full = ID2SYM(rb_intern("destination_buffer_full")); 04393 sym_source_buffer_empty = ID2SYM(rb_intern("source_buffer_empty")); 04394 sym_finished = ID2SYM(rb_intern("finished")); 04395 sym_after_output = ID2SYM(rb_intern("after_output")); 04396 sym_incomplete_input = ID2SYM(rb_intern("incomplete_input")); 04397 sym_universal_newline = ID2SYM(rb_intern("universal_newline")); 04398 sym_crlf_newline = ID2SYM(rb_intern("crlf_newline")); 04399 sym_cr_newline = ID2SYM(rb_intern("cr_newline")); 04400 sym_partial_input = ID2SYM(rb_intern("partial_input")); 04401 04402 #ifdef ENABLE_ECONV_NEWLINE_OPTION 04403 sym_newline = ID2SYM(rb_intern("newline")); 04404 sym_universal = ID2SYM(rb_intern("universal")); 04405 sym_crlf = ID2SYM(rb_intern("crlf")); 04406 
sym_cr = ID2SYM(rb_intern("cr")); 04407 sym_lf = ID2SYM(rb_intern("lf")); 04408 #endif 04409 04410 rb_define_method(rb_cString, "encode", str_encode, -1); 04411 rb_define_method(rb_cString, "encode!", str_encode_bang, -1); 04412 04413 rb_cEncodingConverter = rb_define_class_under(rb_cEncoding, "Converter", rb_cData); 04414 rb_define_alloc_func(rb_cEncodingConverter, econv_s_allocate); 04415 rb_define_singleton_method(rb_cEncodingConverter, "asciicompat_encoding", econv_s_asciicompat_encoding, 1); 04416 rb_define_singleton_method(rb_cEncodingConverter, "search_convpath", econv_s_search_convpath, -1); 04417 rb_define_method(rb_cEncodingConverter, "initialize", econv_init, -1); 04418 rb_define_method(rb_cEncodingConverter, "inspect", econv_inspect, 0); 04419 rb_define_method(rb_cEncodingConverter, "convpath", econv_convpath, 0); 04420 rb_define_method(rb_cEncodingConverter, "source_encoding", econv_source_encoding, 0); 04421 rb_define_method(rb_cEncodingConverter, "destination_encoding", econv_destination_encoding, 0); 04422 rb_define_method(rb_cEncodingConverter, "primitive_convert", econv_primitive_convert, -1); 04423 rb_define_method(rb_cEncodingConverter, "convert", econv_convert, 1); 04424 rb_define_method(rb_cEncodingConverter, "finish", econv_finish, 0); 04425 rb_define_method(rb_cEncodingConverter, "primitive_errinfo", econv_primitive_errinfo, 0); 04426 rb_define_method(rb_cEncodingConverter, "insert_output", econv_insert_output, 1); 04427 rb_define_method(rb_cEncodingConverter, "putback", econv_putback, -1); 04428 rb_define_method(rb_cEncodingConverter, "last_error", econv_last_error, 0); 04429 rb_define_method(rb_cEncodingConverter, "replacement", econv_get_replacement, 0); 04430 rb_define_method(rb_cEncodingConverter, "replacement=", econv_set_replacement, 1); 04431 rb_define_method(rb_cEncodingConverter, "==", econv_equal, 1); 04432 04433 /* Document-const: INVALID_MASK 04434 * 04435 * Mask for invalid byte sequences 04436 */ 04437 
rb_define_const(rb_cEncodingConverter, "INVALID_MASK", INT2FIX(ECONV_INVALID_MASK)); 04438 04439 /* Document-const: INVALID_REPLACE 04440 * 04441 * Replace invalid byte sequences 04442 */ 04443 rb_define_const(rb_cEncodingConverter, "INVALID_REPLACE", INT2FIX(ECONV_INVALID_REPLACE)); 04444 04445 /* Document-const: UNDEF_MASK 04446 * 04447 * Mask for a valid character in the source encoding but no related 04448 * character(s) in destination encoding. 04449 */ 04450 rb_define_const(rb_cEncodingConverter, "UNDEF_MASK", INT2FIX(ECONV_UNDEF_MASK)); 04451 04452 /* Document-const: UNDEF_REPLACE 04453 * 04454 * Replace byte sequences that are undefined in the destination encoding. 04455 */ 04456 rb_define_const(rb_cEncodingConverter, "UNDEF_REPLACE", INT2FIX(ECONV_UNDEF_REPLACE)); 04457 04458 /* Document-const: UNDEF_HEX_CHARREF 04459 * 04460 * Replace byte sequences that are undefined in the destination encoding 04461 * with an XML hexadecimal character reference. This is valid for XML 04462 * conversion. 04463 */ 04464 rb_define_const(rb_cEncodingConverter, "UNDEF_HEX_CHARREF", INT2FIX(ECONV_UNDEF_HEX_CHARREF)); 04465 04466 /* Document-const: PARTIAL_INPUT 04467 * 04468 * Indicates the source may be part of a larger string. See 04469 * primitive_convert for an example. 04470 */ 04471 rb_define_const(rb_cEncodingConverter, "PARTIAL_INPUT", INT2FIX(ECONV_PARTIAL_INPUT)); 04472 04473 /* Document-const: AFTER_OUTPUT 04474 * 04475 * Stop converting after some output is complete but before all of the 04476 * input was consumed. See primitive_convert for an example. 
04477 */ 04478 rb_define_const(rb_cEncodingConverter, "AFTER_OUTPUT", INT2FIX(ECONV_AFTER_OUTPUT)); 04479 04480 /* Document-const: UNIVERSAL_NEWLINE_DECORATOR 04481 * 04482 * Decorator for converting CRLF and CR to LF 04483 */ 04484 rb_define_const(rb_cEncodingConverter, "UNIVERSAL_NEWLINE_DECORATOR", INT2FIX(ECONV_UNIVERSAL_NEWLINE_DECORATOR)); 04485 04486 /* Document-const: CRLF_NEWLINE_DECORATOR 04487 * 04488 * Decorator for converting LF to CRLF 04489 */ 04490 rb_define_const(rb_cEncodingConverter, "CRLF_NEWLINE_DECORATOR", INT2FIX(ECONV_CRLF_NEWLINE_DECORATOR)); 04491 04492 /* Document-const: CR_NEWLINE_DECORATOR 04493 * 04494 * Decorator for converting LF to CR 04495 */ 04496 rb_define_const(rb_cEncodingConverter, "CR_NEWLINE_DECORATOR", INT2FIX(ECONV_CR_NEWLINE_DECORATOR)); 04497 04498 /* Document-const: XML_TEXT_DECORATOR 04499 * 04500 * Escape as XML CharData 04501 */ 04502 rb_define_const(rb_cEncodingConverter, "XML_TEXT_DECORATOR", INT2FIX(ECONV_XML_TEXT_DECORATOR)); 04503 04504 /* Document-const: XML_ATTR_CONTENT_DECORATOR 04505 * 04506 * Escape as XML AttValue 04507 */ 04508 rb_define_const(rb_cEncodingConverter, "XML_ATTR_CONTENT_DECORATOR", INT2FIX(ECONV_XML_ATTR_CONTENT_DECORATOR)); 04509 04510 /* Document-const: XML_ATTR_QUOTE_DECORATOR 04511 * 04512 * Escape as XML AttValue 04513 */ 04514 rb_define_const(rb_cEncodingConverter, "XML_ATTR_QUOTE_DECORATOR", INT2FIX(ECONV_XML_ATTR_QUOTE_DECORATOR)); 04515 04516 rb_define_method(rb_eUndefinedConversionError, "source_encoding_name", ecerr_source_encoding_name, 0); 04517 rb_define_method(rb_eUndefinedConversionError, "destination_encoding_name", ecerr_destination_encoding_name, 0); 04518 rb_define_method(rb_eUndefinedConversionError, "source_encoding", ecerr_source_encoding, 0); 04519 rb_define_method(rb_eUndefinedConversionError, "destination_encoding", ecerr_destination_encoding, 0); 04520 rb_define_method(rb_eUndefinedConversionError, "error_char", ecerr_error_char, 0); 04521 04522 
rb_define_method(rb_eInvalidByteSequenceError, "source_encoding_name", ecerr_source_encoding_name, 0); 04523 rb_define_method(rb_eInvalidByteSequenceError, "destination_encoding_name", ecerr_destination_encoding_name, 0); 04524 rb_define_method(rb_eInvalidByteSequenceError, "source_encoding", ecerr_source_encoding, 0); 04525 rb_define_method(rb_eInvalidByteSequenceError, "destination_encoding", ecerr_destination_encoding, 0); 04526 rb_define_method(rb_eInvalidByteSequenceError, "error_bytes", ecerr_error_bytes, 0); 04527 rb_define_method(rb_eInvalidByteSequenceError, "readagain_bytes", ecerr_readagain_bytes, 0); 04528 rb_define_method(rb_eInvalidByteSequenceError, "incomplete_input?", ecerr_incomplete_input, 0); 04529 04530 Init_newline(); 04531 } 04532