Ruby 2.0.0p247 (2013-06-27 revision 41674)
transcode.c
Go to the documentation of this file.
00001 /**********************************************************************
00002 
00003   transcode.c -
00004 
00005   $Author: drbrain $
00006   created at: Tue Oct 30 16:10:22 JST 2007
00007 
00008   Copyright (C) 2007 Martin Duerst
00009 
00010 **********************************************************************/
00011 
00012 #include "ruby/ruby.h"
00013 #include "ruby/encoding.h"
00014 #include "internal.h"
00015 #include "transcode_data.h"
00016 #include <ctype.h>
00017 
/* Compile-time switch: while this macro is defined, the newline-option
 * symbols (sym_newline, sym_universal, ...) are declared further down. */
00018 #define ENABLE_ECONV_NEWLINE_OPTION 1
00019 
00020 /* VALUE rb_cEncoding = rb_define_class("Encoding", rb_cObject); */
/* Exception classes with external linkage.
 * NOTE(review): they are only declared here; presumably they are created
 * by this file's initialization code, which is outside this view. */
00021 VALUE rb_eUndefinedConversionError;
00022 VALUE rb_eInvalidByteSequenceError;
00023 VALUE rb_eConverterNotFoundError;
00024 
/* NOTE(review): presumably the Encoding::Converter class object -- confirm. */
00025 VALUE rb_cEncodingConverter;
00026 
/* File-local VALUE slots named after option keys and option values.
 * NOTE(review): presumably filled with Symbols at initialization time;
 * the assignments are not visible in this part of the file. */
00027 static VALUE sym_invalid, sym_undef, sym_replace, sym_fallback, sym_aref;
00028 static VALUE sym_xml, sym_text, sym_attr;
00029 static VALUE sym_universal_newline;
00030 static VALUE sym_crlf_newline;
00031 static VALUE sym_cr_newline;
00032 #ifdef ENABLE_ECONV_NEWLINE_OPTION
00033 static VALUE sym_newline, sym_universal, sym_crlf, sym_cr, sym_lf;
00034 #endif
00035 static VALUE sym_partial_input;
00036 
/* The names below mirror the econv_* result codes returned by the
 * conversion engine (econv_invalid_byte_sequence, econv_finished, ...).
 * NOTE(review): presumably their Symbol counterparts -- confirm. */
00037 static VALUE sym_invalid_byte_sequence;
00038 static VALUE sym_undefined_conversion;
00039 static VALUE sym_destination_buffer_full;
00040 static VALUE sym_source_buffer_empty;
00041 static VALUE sym_finished;
00042 static VALUE sym_after_output;
00043 static VALUE sym_incomplete_input;
00044 
/* Forward declaration; the definition lies outside this view.
 * NOTE(review): judging by the parameter names only, it converts `len`
 * bytes at `str` from `sname` to `dname`, may use the caller-supplied
 * buffer, and reports the result length through dst_len_ptr -- confirm
 * against the definition. */
00045 static unsigned char *
00046 allocate_converted_string(const char *sname, const char *dname,
00047         const unsigned char *str, size_t len,
00048         unsigned char *caller_dst_buf, size_t caller_dst_bufsize,
00049         size_t *dst_len_ptr);
00050 
00051 /* dynamic structure, one per conversion (similar to iconv_t) */
00052 /* may carry conversion state (e.g. for iso-2022-jp) */
00053 typedef struct rb_transcoding {
/* static description (tables, callbacks, size limits) this instance runs */
00054     const rb_transcoder *transcoder;
00055 
/* stored as given at open time by rb_transcoding_open_by_transcoder() */
00056     int flags;
00057 
/* Saved position of the resumable converter loop.  resume_position holds
 * the number of the suspend point to jump back to on the next call
 * (0 means start from the top).  next_table, next_info, next_byte and
 * output_index are the table-walk variables of that loop; they live here
 * so that they survive a suspend/resume cycle. */
00058     int resume_position;
00059     unsigned int next_table;
00060     VALUE next_info;
00061     unsigned char next_byte;
00062     unsigned int output_index;
00063 
00064     ssize_t recognized_len; /* already interpreted */
00065     ssize_t readagain_len; /* not yet interpreted */
/* Which member is live is decided by TRANSCODING_READBUF(): the embedded
 * array when the transcoder's max_input fits in it, else the heap pointer. */
00066     union {
00067         unsigned char ary[8]; /* max_input <= sizeof(ary) */
00068         unsigned char *ptr; /* length: max_input */
00069     } readbuf; /* recognized_len + readagain_len used */
00070 
/* Staged output: writebuf holds writebuf_len bytes, writebuf_off is the
 * index of the next byte to hand to the caller's output buffer. */
00071     ssize_t writebuf_off;
00072     ssize_t writebuf_len;
00073     union {
00074         unsigned char ary[8]; /* max_output <= sizeof(ary) */
00075         unsigned char *ptr; /* length: max_output */
00076     } writebuf;
00077 
00078     union rb_transcoding_state_t { /* opaque data for stateful encoding */
00079         void *ptr;
00080         char ary[sizeof(double) > sizeof(void*) ? sizeof(double) : sizeof(void*)];
00081         double dummy_for_alignment;
00082     } state;
00083 } rb_transcoding;
/* Accessors for the embedded-or-heap unions of rb_transcoding.  Each one
 * compares a size limit of the transcoder with the size of the embedded
 * array and yields the array when the data fits, the heap pointer
 * otherwise.  Note that `tc` is evaluated more than once. */

/* Read buffer: embedded when max_input fits in readbuf.ary. */
00084 #define TRANSCODING_READBUF(tc) \
00085     ((tc)->transcoder->max_input <= (int)sizeof((tc)->readbuf.ary) ? \
00086      (tc)->readbuf.ary : \
00087      (tc)->readbuf.ptr)
/* Write buffer: embedded when max_output fits in writebuf.ary. */
00088 #define TRANSCODING_WRITEBUF(tc) \
00089     ((tc)->transcoder->max_output <= (int)sizeof((tc)->writebuf.ary) ? \
00090      (tc)->writebuf.ary : \
00091      (tc)->writebuf.ptr)
/* Capacity of the write buffer: the full embedded array size, or
 * max_output when the heap buffer is in use. */
00092 #define TRANSCODING_WRITEBUF_SIZE(tc) \
00093     ((tc)->transcoder->max_output <= (int)sizeof((tc)->writebuf.ary) ? \
00094      sizeof((tc)->writebuf.ary) : \
00095      (size_t)(tc)->transcoder->max_output)
/* Largest state_size that still fits inside the embedded state union. */
00096 #define TRANSCODING_STATE_EMBED_MAX ((int)sizeof(union rb_transcoding_state_t))
/* State area handed to the transcoder callbacks: embedded bytes when
 * state_size fits in the union, else the separately allocated block. */
00097 #define TRANSCODING_STATE(tc) \
00098     ((tc)->transcoder->state_size <= (int)sizeof((tc)->state) ? \
00099      (tc)->state.ary : \
00100      (tc)->state.ptr)
00101 
/* Element type of the rb_econv_t::elems array: one transcoding instance
 * together with four output-buffer pointers and the result of its last run.
 * NOTE(review): going by the names, out_buf_start/out_buf_end bound the
 * allocated buffer and out_data_start/out_data_end bound the bytes
 * currently held in it; the code that maintains them is outside this
 * view -- confirm. */
00102 typedef struct {
00103     struct rb_transcoding *tc;
00104     unsigned char *out_buf_start;
00105     unsigned char *out_data_start;
00106     unsigned char *out_data_end;
00107     unsigned char *out_buf_end;
00108     rb_econv_result_t last_result;
00109 } rb_econv_elem_t;
00110 
/* Converter object.  rb_econv_alloc() creates it with every pointer NULL,
 * every counter 0 and last_error.result set to econv_source_buffer_empty. */
00111 struct rb_econv_t {
00112     int flags;
00113     const char *source_encoding_name;
00114     const char *destination_encoding_name;
00115 
00116     int started;
00117 
/* NOTE(review): presumably the substitute bytes used for unconvertible
 * input, their encoding name, and whether replacement_str is owned by
 * this object -- the code using these fields is outside this view. */
00118     const unsigned char *replacement_str;
00119     size_t replacement_len;
00120     const char *replacement_enc;
00121     int replacement_allocated;
00122 
/* NOTE(review): by analogy with the per-element out_* pointers, these
 * look like buffer bounds and pending-data bounds on the input side. */
00123     unsigned char *in_buf_start;
00124     unsigned char *in_data_start;
00125     unsigned char *in_data_end;
00126     unsigned char *in_buf_end;
/* elems has room for num_allocated entries, num_trans of which are in
 * use; rb_econv_add_transcoder_at() grows the array when the two are equal. */
00127     rb_econv_elem_t *elems;
00128     int num_allocated;
00129     int num_trans;
00130     int num_finished;
00131     struct rb_transcoding *last_tc;
00132 
00133     /* last error */
00134     struct {
00135         rb_econv_result_t result;
00136         struct rb_transcoding *error_tc;
00137         const char *source_encoding;
00138         const char *destination_encoding;
00139         const unsigned char *error_bytes_start;
00140         size_t error_bytes_len;
00141         size_t readagain_len;
00142     } last_error;
00143 
00144     /* The following fields are only for Encoding::Converter.
00145      * rb_econv_open set them NULL. */
00146     rb_encoding *source_encoding;
00147     rb_encoding *destination_encoding;
00148 };
00149 
00150 /*
00151  *  Dispatch data and logic
00152  */
00153 
/* True when the source name is the empty string; `dname` is accepted but
 * not examined. */
00154 #define DECORATOR_P(sname, dname) (*(sname) == '\0')
00155 
/* Registry record for one (source name, destination name) pair.  Either
 * field of the second half may be NULL: `lib` until a library is declared,
 * `transcoder` until one is registered. */
00156 typedef struct {
00157     const char *sname;
00158     const char *dname;
00159     const char *lib; /* null means no need to load a library */
00160     const rb_transcoder *transcoder;
00161 } transcoder_entry_t;
00162 
/* Two-level registry: source name -> st_table keyed by destination name,
 * whose values are transcoder_entry_t pointers. */
00163 static st_table *transcoder_table;
00164 
00165 static transcoder_entry_t *
00166 make_transcoder_entry(const char *sname, const char *dname)
00167 {
00168     st_data_t val;
00169     st_table *table2;
00170 
00171     if (!st_lookup(transcoder_table, (st_data_t)sname, &val)) {
00172         val = (st_data_t)st_init_strcasetable();
00173         st_add_direct(transcoder_table, (st_data_t)sname, val);
00174     }
00175     table2 = (st_table *)val;
00176     if (!st_lookup(table2, (st_data_t)dname, &val)) {
00177         transcoder_entry_t *entry = ALLOC(transcoder_entry_t);
00178         entry->sname = sname;
00179         entry->dname = dname;
00180         entry->lib = NULL;
00181         entry->transcoder = NULL;
00182         val = (st_data_t)entry;
00183         st_add_direct(table2, (st_data_t)dname, val);
00184     }
00185     return (transcoder_entry_t *)val;
00186 }
00187 
00188 static transcoder_entry_t *
00189 get_transcoder_entry(const char *sname, const char *dname)
00190 {
00191     st_data_t val;
00192     st_table *table2;
00193 
00194     if (!st_lookup(transcoder_table, (st_data_t)sname, &val)) {
00195         return NULL;
00196     }
00197     table2 = (st_table *)val;
00198     if (!st_lookup(table2, (st_data_t)dname, &val)) {
00199         return NULL;
00200     }
00201     return (transcoder_entry_t *)val;
00202 }
00203 
00204 void
00205 rb_register_transcoder(const rb_transcoder *tr)
00206 {
00207     const char *const sname = tr->src_encoding;
00208     const char *const dname = tr->dst_encoding;
00209 
00210     transcoder_entry_t *entry;
00211 
00212     entry = make_transcoder_entry(sname, dname);
00213     if (entry->transcoder) {
00214         rb_raise(rb_eArgError, "transcoder from %s to %s has been already registered",
00215                  sname, dname);
00216     }
00217 
00218     entry->transcoder = tr;
00219 }
00220 
00221 static void
00222 declare_transcoder(const char *sname, const char *dname, const char *lib)
00223 {
00224     transcoder_entry_t *entry;
00225 
00226     entry = make_transcoder_entry(sname, dname);
00227     entry->lib = lib;
00228 }
00229 
/* Path prefix that load_transcoder_entry() puts in front of an entry's
 * library name to build the feature passed to rb_require_safe(). */
00230 static const char transcoder_lib_prefix[] = "enc/trans/";
00231 
00232 void
00233 rb_declare_transcoder(const char *enc1, const char *enc2, const char *lib)
00234 {
00235     if (!lib) {
00236         rb_raise(rb_eArgError, "invalid library name - (null)");
00237     }
00238     declare_transcoder(enc1, enc2, lib);
00239 }
00240 
/* Two encoding names are considered equal when STRCASECMP() reports no
 * difference between them. */
00241 #define encoding_equal(enc1, enc2) (STRCASECMP((enc1), (enc2)) == 0)
00242 
/* Node of the singly linked work queue used by transcode_search_path();
 * `enc` is the name of an encoding whose outgoing conversions are still
 * to be examined. */
00243 typedef struct search_path_queue_tag {
00244     struct search_path_queue_tag *next;
00245     const char *enc;
00246 } search_path_queue_t;
00247 
/* State shared between transcode_search_path() and its st_foreach
 * callback:
 *   visited        - maps an encoding name to the name it was reached
 *                    from (NULL for the starting encoding)
 *   queue          - head of the work queue
 *   queue_last_ptr - address of the link where the next node is appended
 *   base_enc       - encoding whose outgoing conversions are currently
 *                    being enumerated */
00248 typedef struct {
00249     st_table *visited;
00250     search_path_queue_t *queue;
00251     search_path_queue_t **queue_last_ptr;
00252     const char *base_enc;
00253 } search_path_bfs_t;
00254 
00255 static int
00256 transcode_search_path_i(st_data_t key, st_data_t val, st_data_t arg)
00257 {
00258     const char *dname = (const char *)key;
00259     search_path_bfs_t *bfs = (search_path_bfs_t *)arg;
00260     search_path_queue_t *q;
00261 
00262     if (st_lookup(bfs->visited, (st_data_t)dname, &val)) {
00263         return ST_CONTINUE;
00264     }
00265 
00266     q = ALLOC(search_path_queue_t);
00267     q->enc = dname;
00268     q->next = NULL;
00269     *bfs->queue_last_ptr = q;
00270     bfs->queue_last_ptr = &q->next;
00271 
00272     st_add_direct(bfs->visited, (st_data_t)dname, (st_data_t)bfs->base_enc);
00273     return ST_CONTINUE;
00274 }
00275 
00276 static int
00277 transcode_search_path(const char *sname, const char *dname,
00278     void (*callback)(const char *sname, const char *dname, int depth, void *arg),
00279     void *arg)
00280 {
00281     search_path_bfs_t bfs;
00282     search_path_queue_t *q;
00283     st_data_t val;
00284     st_table *table2;
00285     int found;
00286     int pathlen = -1;
00287 
00288     if (encoding_equal(sname, dname))
00289         return -1;
00290 
00291     q = ALLOC(search_path_queue_t);
00292     q->enc = sname;
00293     q->next = NULL;
00294     bfs.queue_last_ptr = &q->next;
00295     bfs.queue = q;
00296 
00297     bfs.visited = st_init_strcasetable();
00298     st_add_direct(bfs.visited, (st_data_t)sname, (st_data_t)NULL);
00299 
00300     while (bfs.queue) {
00301         q = bfs.queue;
00302         bfs.queue = q->next;
00303         if (!bfs.queue)
00304             bfs.queue_last_ptr = &bfs.queue;
00305 
00306         if (!st_lookup(transcoder_table, (st_data_t)q->enc, &val)) {
00307             xfree(q);
00308             continue;
00309         }
00310         table2 = (st_table *)val;
00311 
00312         if (st_lookup(table2, (st_data_t)dname, &val)) {
00313             st_add_direct(bfs.visited, (st_data_t)dname, (st_data_t)q->enc);
00314             xfree(q);
00315             found = 1;
00316             goto cleanup;
00317         }
00318 
00319         bfs.base_enc = q->enc;
00320         st_foreach(table2, transcode_search_path_i, (st_data_t)&bfs);
00321         bfs.base_enc = NULL;
00322 
00323         xfree(q);
00324     }
00325     found = 0;
00326 
00327   cleanup:
00328     while (bfs.queue) {
00329         q = bfs.queue;
00330         bfs.queue = q->next;
00331         xfree(q);
00332     }
00333 
00334     if (found) {
00335         const char *enc = dname;
00336         int depth;
00337         pathlen = 0;
00338         while (1) {
00339             st_lookup(bfs.visited, (st_data_t)enc, &val);
00340             if (!val)
00341                 break;
00342             pathlen++;
00343             enc = (const char *)val;
00344         }
00345         depth = pathlen;
00346         enc = dname;
00347         while (1) {
00348             st_lookup(bfs.visited, (st_data_t)enc, &val);
00349             if (!val)
00350                 break;
00351             callback((const char *)val, enc, --depth, arg);
00352             enc = (const char *)val;
00353         }
00354     }
00355 
00356     st_free_table(bfs.visited);
00357 
00358     return pathlen; /* is -1 if not found */
00359 }
00360 
00361 static const rb_transcoder *
00362 load_transcoder_entry(transcoder_entry_t *entry)
00363 {
00364     if (entry->transcoder)
00365         return entry->transcoder;
00366 
00367     if (entry->lib) {
00368         const char *const lib = entry->lib;
00369         const size_t len = strlen(lib);
00370         const size_t total_len = sizeof(transcoder_lib_prefix) - 1 + len;
00371         const VALUE fn = rb_str_new(0, total_len);
00372         char *const path = RSTRING_PTR(fn);
00373         const int safe = rb_safe_level();
00374 
00375         entry->lib = NULL;
00376 
00377         memcpy(path, transcoder_lib_prefix, sizeof(transcoder_lib_prefix) - 1);
00378         memcpy(path + sizeof(transcoder_lib_prefix) - 1, lib, len);
00379         rb_str_set_len(fn, total_len);
00380         FL_UNSET(fn, FL_TAINT|FL_UNTRUSTED);
00381         OBJ_FREEZE(fn);
00382         if (!rb_require_safe(fn, safe > 3 ? 3 : safe))
00383             return NULL;
00384     }
00385 
00386     if (entry->transcoder)
00387         return entry->transcoder;
00388 
00389     return NULL;
00390 }
00391 
/*
 * Pick the default replacement character for the encoding `encname`.
 *
 * For "UTF-8" the result is the three bytes EF BF BD, reported as UTF-8;
 * for any other name it is "?", reported as US-ASCII.  The byte length is
 * stored in *len_ret and the name of the replacement's own encoding in
 * *repl_encname_ptr.  The returned pointer refers to a string literal.
 */
static const char*
get_replacement_character(const char *encname, size_t *len_ret, const char **repl_encname_ptr)
{
    const int is_utf8 = encoding_equal(encname, "UTF-8");

    *len_ret = is_utf8 ? 3 : 1;
    *repl_encname_ptr = is_utf8 ? "UTF-8" : "US-ASCII";
    return is_utf8 ? "\xEF\xBF\xBD" : "?";
}
00406 
00407 /*
00408  *  Transcoding engine logic
00409  */
00410 
00411 static const unsigned char *
00412 transcode_char_start(rb_transcoding *tc,
00413                          const unsigned char *in_start,
00414                          const unsigned char *inchar_start,
00415                          const unsigned char *in_p,
00416                          size_t *char_len_ptr)
00417 {
00418     const unsigned char *ptr;
00419     if (inchar_start - in_start < tc->recognized_len) {
00420         MEMCPY(TRANSCODING_READBUF(tc) + tc->recognized_len,
00421                inchar_start, unsigned char, in_p - inchar_start);
00422         ptr = TRANSCODING_READBUF(tc);
00423     }
00424     else {
00425         ptr = inchar_start - tc->recognized_len;
00426     }
00427     *char_len_ptr = tc->recognized_len + (in_p - inchar_start);
00428     return ptr;
00429 }
00430 
/*
 * Core conversion loop, written as a resumable state machine.
 *
 * Consumes bytes from *in_pos up to in_stop, walks the lookup tables of
 * tc->transcoder, and writes converted bytes from *out_pos up to out_stop.
 * Whenever it has to stop (output full, input exhausted, error, finished)
 * it stores a resume point number in tc->resume_position, publishes the
 * new positions through in_pos/out_pos and returns an rb_econv_result_t.
 * The next call jumps straight back to the statement after that suspend
 * point, so all state that must survive lives in *tc.
 *
 * opt is tested for ECONV_PARTIAL_INPUT (running out of input is a
 * suspension, not the end of the data) and ECONV_AFTER_OUTPUT (suspend
 * once some output has been produced in this call).
 */
00431 static rb_econv_result_t
00432 transcode_restartable0(const unsigned char **in_pos, unsigned char **out_pos,
00433                       const unsigned char *in_stop, unsigned char *out_stop,
00434                       rb_transcoding *tc,
00435                       const int opt)
00436 {
00437     const rb_transcoder *tr = tc->transcoder;
00438     int unitlen = tr->input_unit_length;
00439     ssize_t readagain_len = 0;
00440 
00441     const unsigned char *inchar_start;
00442     const unsigned char *in_p;
00443 
00444     unsigned char *out_p;
00445 
00446     in_p = inchar_start = *in_pos;
00447 
00448     out_p = *out_pos;
00449 
/* SUSPEND(ret, num): leave the function with result `ret` and arrange for
 * the next call to continue at resume_label<num>.  Before returning it
 * saves the bytes of the current character consumed in this call
 * (inchar_start..in_p) into the read buffer behind the recognized_len
 * bytes already there, publishes in_p/out_p to the caller and updates
 * recognized_len.  When readagain_len is non-zero, that many trailing
 * bytes are taken back out of recognized_len and recorded in
 * tc->readagain_len instead. */
00450 #define SUSPEND(ret, num) \
00451     do { \
00452         tc->resume_position = (num); \
00453         if (0 < in_p - inchar_start) \
00454             MEMMOVE(TRANSCODING_READBUF(tc)+tc->recognized_len, \
00455                    inchar_start, unsigned char, in_p - inchar_start); \
00456         *in_pos = in_p; \
00457         *out_pos = out_p; \
00458         tc->recognized_len += in_p - inchar_start; \
00459         if (readagain_len) { \
00460             tc->recognized_len -= readagain_len; \
00461             tc->readagain_len = readagain_len; \
00462         } \
00463         return (ret); \
00464         resume_label ## num:; \
00465     } while (0)
/* SUSPEND_OBUF(num): keep suspending with econv_destination_buffer_full
 * until at least one byte of output space is available. */
00466 #define SUSPEND_OBUF(num) \
00467     do { \
00468         while (out_stop - out_p < 1) { SUSPEND(econv_destination_buffer_full, num); } \
00469     } while (0)
00470 
/* SUSPEND_AFTER_OUTPUT(num): when ECONV_AFTER_OUTPUT is requested and this
 * call has already written something, suspend with econv_after_output. */
00471 #define SUSPEND_AFTER_OUTPUT(num) \
00472     if ((opt & ECONV_AFTER_OUTPUT) && *out_pos != out_p) { \
00473         SUSPEND(econv_after_output, num); \
00474     }
00475 
/* The loop variables below are aliases for fields of *tc so that their
 * values survive a suspend/resume cycle. */
00476 #define next_table (tc->next_table)
00477 #define next_info (tc->next_info)
00478 #define next_byte (tc->next_byte)
00479 #define writebuf_len (tc->writebuf_len)
00480 #define writebuf_off (tc->writebuf_off)
00481 
    /* Re-entry dispatch: 0 starts from the top, any other value jumps to
     * the label planted by the SUSPEND with the same number. */
00482     switch (tc->resume_position) {
00483       case 0: break;
00484       case 1: goto resume_label1;
00485       case 2: goto resume_label2;
00486       case 3: goto resume_label3;
00487       case 4: goto resume_label4;
00488       case 5: goto resume_label5;
00489       case 6: goto resume_label6;
00490       case 7: goto resume_label7;
00491       case 8: goto resume_label8;
00492       case 9: goto resume_label9;
00493       case 10: goto resume_label10;
00494       case 11: goto resume_label11;
00495       case 12: goto resume_label12;
00496       case 13: goto resume_label13;
00497       case 14: goto resume_label14;
00498       case 15: goto resume_label15;
00499       case 16: goto resume_label16;
00500       case 17: goto resume_label17;
00501       case 18: goto resume_label18;
00502       case 19: goto resume_label19;
00503       case 20: goto resume_label20;
00504       case 21: goto resume_label21;
00505       case 22: goto resume_label22;
00506       case 23: goto resume_label23;
00507       case 24: goto resume_label24;
00508       case 25: goto resume_label25;
00509       case 26: goto resume_label26;
00510       case 27: goto resume_label27;
00511       case 28: goto resume_label28;
00512       case 29: goto resume_label29;
00513       case 30: goto resume_label30;
00514       case 31: goto resume_label31;
00515       case 32: goto resume_label32;
00516       case 33: goto resume_label33;
00517       case 34: goto resume_label34;
00518     }
00519 
    /* One iteration per input character: start at the root table with
     * nothing recognized yet. */
00520     while (1) {
00521         inchar_start = in_p;
00522         tc->recognized_len = 0;
00523         next_table = tr->conv_tree_start;
00524 
00525         SUSPEND_AFTER_OUTPUT(24);
00526 
        /* Input exhausted at a character boundary: end of data unless the
         * caller said more input may follow. */
00527         if (in_stop <= in_p) {
00528             if (!(opt & ECONV_PARTIAL_INPUT))
00529                 break;
00530             SUSPEND(econv_source_buffer_empty, 7);
00531             continue;
00532         }
00533 
/* Table access helpers.  The node selected by next_table names a byte
 * table (BL_BASE: min byte, max byte, then one offset per byte in that
 * range) and an info table (BL_INFO) indexed by that offset. */
00534 #define BYTE_ADDR(index) (tr->byte_array + (index))
00535 #define WORD_ADDR(index) (tr->word_array + INFO2WORDINDEX(index))
00536 #define BL_BASE BYTE_ADDR(BYTE_LOOKUP_BASE(WORD_ADDR(next_table)))
00537 #define BL_INFO WORD_ADDR(BYTE_LOOKUP_INFO(WORD_ADDR(next_table)))
00538 #define BL_MIN_BYTE     (BL_BASE[0])
00539 #define BL_MAX_BYTE     (BL_BASE[1])
00540 #define BL_OFFSET(byte) (BL_BASE[2+(byte)-BL_MIN_BYTE])
00541 #define BL_ACTION(byte) (BL_INFO[BL_OFFSET((byte))])
00542 
00543         next_byte = (unsigned char)*in_p++;
        /* Look the byte up in the current table; a byte outside the
         * table's [min, max] range is invalid. */
00544       follow_byte:
00545         if (next_byte < BL_MIN_BYTE || BL_MAX_BYTE < next_byte)
00546             next_info = INVALID;
00547         else {
00548             next_info = (VALUE)BL_ACTION(next_byte);
00549         }
        /* Dispatch on the low five bits of the info word. */
00550       follow_info:
00551         switch (next_info & 0x1F) {
          /* copy the input bytes of this character to the output as-is,
           * staged through the write buffer */
00552           case NOMAP:
00553             {
00554                 const unsigned char *p = inchar_start;
00555                 writebuf_off = 0;
00556                 while (p < in_p) {
00557                     TRANSCODING_WRITEBUF(tc)[writebuf_off++] = (unsigned char)*p++;
00558                 }
00559                 writebuf_len = writebuf_off;
00560                 writebuf_off = 0;
00561                 while (writebuf_off < writebuf_len) {
00562                     SUSPEND_OBUF(3);
00563                     *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
00564                 }
00565             }
00566             continue;
          /* the info word selects a follow-up table: fetch one more byte
           * (suspending or reporting incomplete input if there is none)
           * and look it up there */
00567           case 0x00: case 0x04: case 0x08: case 0x0C:
00568           case 0x10: case 0x14: case 0x18: case 0x1C:
00569             SUSPEND_AFTER_OUTPUT(25);
00570             while (in_p >= in_stop) {
00571                 if (!(opt & ECONV_PARTIAL_INPUT))
00572                     goto incomplete;
00573                 SUSPEND(econv_source_buffer_empty, 5);
00574             }
00575             next_byte = (unsigned char)*in_p++;
00576             next_table = (unsigned int)next_info;
00577             goto follow_byte;
00578           case ZERObt: /* drop input */
00579             continue;
          /* one to four output bytes packed into the info word; every byte
           * has its own resume point in case the output fills up */
00580           case ONEbt:
00581             SUSPEND_OBUF(9); *out_p++ = getBT1(next_info);
00582             continue;
00583           case TWObt:
00584             SUSPEND_OBUF(10); *out_p++ = getBT1(next_info);
00585             SUSPEND_OBUF(21); *out_p++ = getBT2(next_info);
00586             continue;
00587           case THREEbt:
00588             SUSPEND_OBUF(11); *out_p++ = getBT1(next_info);
00589             SUSPEND_OBUF(15); *out_p++ = getBT2(next_info);
00590             SUSPEND_OBUF(16); *out_p++ = getBT3(next_info);
00591             continue;
00592           case FOURbt:
00593             SUSPEND_OBUF(12); *out_p++ = getBT0(next_info);
00594             SUSPEND_OBUF(17); *out_p++ = getBT1(next_info);
00595             SUSPEND_OBUF(18); *out_p++ = getBT2(next_info);
00596             SUSPEND_OBUF(19); *out_p++ = getBT3(next_info);
00597             continue;
00598           case GB4bt:
00599             SUSPEND_OBUF(29); *out_p++ = getGB4bt0(next_info);
00600             SUSPEND_OBUF(30); *out_p++ = getGB4bt1(next_info);
00601             SUSPEND_OBUF(31); *out_p++ = getGB4bt2(next_info);
00602             SUSPEND_OBUF(32); *out_p++ = getGB4bt3(next_info);
00603             continue;
          /* output string stored in the byte array; tc->output_index
           * tracks progress across suspensions */
00604           case STR1:
00605             tc->output_index = 0;
00606             while (tc->output_index < STR1_LENGTH(BYTE_ADDR(STR1_BYTEINDEX(next_info)))) {
00607                 SUSPEND_OBUF(28); *out_p++ = BYTE_ADDR(STR1_BYTEINDEX(next_info))[1+tc->output_index];
00608                 tc->output_index++;
00609             }
00610             continue;
          /* callback maps the info word to a new info word, which is then
           * dispatched again */
00611           case FUNii:
00612             next_info = (VALUE)(*tr->func_ii)(TRANSCODING_STATE(tc), next_info);
00613             goto follow_info;
          /* callback maps the character's input bytes to an info word */
00614           case FUNsi:
00615             {
00616                 const unsigned char *char_start;
00617                 size_t char_len;
00618                 char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
00619                 next_info = (VALUE)(*tr->func_si)(TRANSCODING_STATE(tc), char_start, (size_t)char_len);
00620                 goto follow_info;
00621             }
          /* The three cases below call a callback that produces output
           * bytes (from the info word, the input bytes, or both).  If the
           * output buffer has room for max_output bytes the callback
           * writes there directly; otherwise it writes into the write
           * buffer, which is then drained byte by byte. */
00622           case FUNio:
00623             SUSPEND_OBUF(13);
00624             if (tr->max_output <= out_stop - out_p)
00625                 out_p += tr->func_io(TRANSCODING_STATE(tc),
00626                     next_info, out_p, out_stop - out_p);
00627             else {
00628                 writebuf_len = tr->func_io(TRANSCODING_STATE(tc),
00629                     next_info,
00630                     TRANSCODING_WRITEBUF(tc), TRANSCODING_WRITEBUF_SIZE(tc));
00631                 writebuf_off = 0;
00632                 while (writebuf_off < writebuf_len) {
00633                     SUSPEND_OBUF(20);
00634                     *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
00635                 }
00636             }
00637             break;
00638           case FUNso:
00639             {
00640                 const unsigned char *char_start;
00641                 size_t char_len;
00642                 SUSPEND_OBUF(14);
00643                 if (tr->max_output <= out_stop - out_p) {
00644                     char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
00645                     out_p += tr->func_so(TRANSCODING_STATE(tc),
00646                         char_start, (size_t)char_len,
00647                         out_p, out_stop - out_p);
00648                 }
00649                 else {
00650                     char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
00651                     writebuf_len = tr->func_so(TRANSCODING_STATE(tc),
00652                         char_start, (size_t)char_len,
00653                         TRANSCODING_WRITEBUF(tc), TRANSCODING_WRITEBUF_SIZE(tc));
00654                     writebuf_off = 0;
00655                     while (writebuf_off < writebuf_len) {
00656                         SUSPEND_OBUF(22);
00657                         *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
00658                     }
00659                 }
00660                 break;
00661             }
00662       case FUNsio:
00663             {
00664                 const unsigned char *char_start;
00665                 size_t char_len;
00666                 SUSPEND_OBUF(33);
00667                 if (tr->max_output <= out_stop - out_p) {
00668                     char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
00669                     out_p += tr->func_sio(TRANSCODING_STATE(tc),
00670                         char_start, (size_t)char_len, next_info,
00671                         out_p, out_stop - out_p);
00672                 }
00673                 else {
00674                     char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
00675                     writebuf_len = tr->func_sio(TRANSCODING_STATE(tc),
00676                         char_start, (size_t)char_len, next_info,
00677                         TRANSCODING_WRITEBUF(tc), TRANSCODING_WRITEBUF_SIZE(tc));
00678                     writebuf_off = 0;
00679                     while (writebuf_off < writebuf_len) {
00680                         SUSPEND_OBUF(34);
00681                         *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
00682                     }
00683                 }
00684                 break;
00685             }
          /* Invalid byte sequence.  If no more than one input unit has
           * been seen, the reported sequence is extended to a whole unit
           * (or to the end of the input).  Otherwise the sequence is cut
           * at the last unit boundary before the offending byte and the
           * remaining bytes (readagain_len) are scheduled to be read
           * again. */
00686           case INVALID:
00687             if (tc->recognized_len + (in_p - inchar_start) <= unitlen) {
00688                 if (tc->recognized_len + (in_p - inchar_start) < unitlen)
00689                     SUSPEND_AFTER_OUTPUT(26);
00690                 while ((opt & ECONV_PARTIAL_INPUT) && tc->recognized_len + (in_stop - inchar_start) < unitlen) {
00691                     in_p = in_stop;
00692                     SUSPEND(econv_source_buffer_empty, 8);
00693                 }
00694                 if (tc->recognized_len + (in_stop - inchar_start) <= unitlen) {
00695                     in_p = in_stop;
00696                 }
00697                 else {
00698                     in_p = inchar_start + (unitlen - tc->recognized_len);
00699                 }
00700             }
00701             else {
00702                 ssize_t invalid_len; /* including the last byte which causes invalid */
00703                 ssize_t discard_len;
00704                 invalid_len = tc->recognized_len + (in_p - inchar_start);
00705                 discard_len = ((invalid_len - 1) / unitlen) * unitlen;
00706                 readagain_len = invalid_len - discard_len;
00707             }
00708             goto invalid;
00709           case UNDEF:
00710             goto undef;
00711           default:
00712             rb_raise(rb_eRuntimeError, "unknown transcoding instruction");
00713         }
00714         continue;
00715 
        /* Error exits: report the condition; a later call resumes here
         * and goes on with the next character. */
00716       invalid:
00717         SUSPEND(econv_invalid_byte_sequence, 1);
00718         continue;
00719 
00720       incomplete:
00721         SUSPEND(econv_incomplete_input, 27);
00722         continue;
00723 
00724       undef:
00725         SUSPEND(econv_undefined_conversion, 2);
00726         continue;
00727     }
00728 
00729     /* cleanup */
    /* Let the transcoder emit trailing bytes, using the same
     * direct-or-staged output scheme as the FUN*o cases above. */
00730     if (tr->finish_func) {
00731         SUSPEND_OBUF(4);
00732         if (tr->max_output <= out_stop - out_p) {
00733             out_p += tr->finish_func(TRANSCODING_STATE(tc),
00734                 out_p, out_stop - out_p);
00735         }
00736         else {
00737             writebuf_len = tr->finish_func(TRANSCODING_STATE(tc),
00738                 TRANSCODING_WRITEBUF(tc), TRANSCODING_WRITEBUF_SIZE(tc));
00739             writebuf_off = 0;
00740             while (writebuf_off < writebuf_len) {
00741                 SUSPEND_OBUF(23);
00742                 *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
00743             }
00744         }
00745     }
    /* Once finished, every further call resumes inside this loop and
     * reports econv_finished again. */
00746     while (1)
00747         SUSPEND(econv_finished, 6);
00748 #undef SUSPEND
00749 #undef next_table
00750 #undef next_info
00751 #undef next_byte
00752 #undef writebuf_len
00753 #undef writebuf_off
00754 }
00755 
00756 static rb_econv_result_t
00757 transcode_restartable(const unsigned char **in_pos, unsigned char **out_pos,
00758                       const unsigned char *in_stop, unsigned char *out_stop,
00759                       rb_transcoding *tc,
00760                       const int opt)
00761 {
00762     if (tc->readagain_len) {
00763         unsigned char *readagain_buf = ALLOCA_N(unsigned char, tc->readagain_len);
00764         const unsigned char *readagain_pos = readagain_buf;
00765         const unsigned char *readagain_stop = readagain_buf + tc->readagain_len;
00766         rb_econv_result_t res;
00767 
00768         MEMCPY(readagain_buf, TRANSCODING_READBUF(tc) + tc->recognized_len,
00769                unsigned char, tc->readagain_len);
00770         tc->readagain_len = 0;
00771         res = transcode_restartable0(&readagain_pos, out_pos, readagain_stop, out_stop, tc, opt|ECONV_PARTIAL_INPUT);
00772         if (res != econv_source_buffer_empty) {
00773             MEMCPY(TRANSCODING_READBUF(tc) + tc->recognized_len + tc->readagain_len,
00774                    readagain_pos, unsigned char, readagain_stop - readagain_pos);
00775             tc->readagain_len += readagain_stop - readagain_pos;
00776             return res;
00777         }
00778     }
00779     return transcode_restartable0(in_pos, out_pos, in_stop, out_stop, tc, opt);
00780 }
00781 
00782 static rb_transcoding *
00783 rb_transcoding_open_by_transcoder(const rb_transcoder *tr, int flags)
00784 {
00785     rb_transcoding *tc;
00786 
00787     tc = ALLOC(rb_transcoding);
00788     tc->transcoder = tr;
00789     tc->flags = flags;
00790     if (TRANSCODING_STATE_EMBED_MAX < tr->state_size)
00791         tc->state.ptr = xmalloc(tr->state_size);
00792     if (tr->state_init_func) {
00793         (tr->state_init_func)(TRANSCODING_STATE(tc)); /* xxx: check return value */
00794     }
00795     tc->resume_position = 0;
00796     tc->recognized_len = 0;
00797     tc->readagain_len = 0;
00798     tc->writebuf_len = 0;
00799     tc->writebuf_off = 0;
00800     if ((int)sizeof(tc->readbuf.ary) < tr->max_input) {
00801         tc->readbuf.ptr = xmalloc(tr->max_input);
00802     }
00803     if ((int)sizeof(tc->writebuf.ary) < tr->max_output) {
00804         tc->writebuf.ptr = xmalloc(tr->max_output);
00805     }
00806     return tc;
00807 }
00808 
00809 static rb_econv_result_t
00810 rb_transcoding_convert(rb_transcoding *tc,
00811   const unsigned char **input_ptr, const unsigned char *input_stop,
00812   unsigned char **output_ptr, unsigned char *output_stop,
00813   int flags)
00814 {
00815     return transcode_restartable(
00816                 input_ptr, output_ptr,
00817                 input_stop, output_stop,
00818                 tc, flags);
00819 }
00820 
00821 static void
00822 rb_transcoding_close(rb_transcoding *tc)
00823 {
00824     const rb_transcoder *tr = tc->transcoder;
00825     if (tr->state_fini_func) {
00826         (tr->state_fini_func)(TRANSCODING_STATE(tc)); /* check return value? */
00827     }
00828     if (TRANSCODING_STATE_EMBED_MAX < tr->state_size)
00829         xfree(tc->state.ptr);
00830     if ((int)sizeof(tc->readbuf.ary) < tr->max_input)
00831         xfree(tc->readbuf.ptr);
00832     if ((int)sizeof(tc->writebuf.ary) < tr->max_output)
00833         xfree(tc->writebuf.ptr);
00834     xfree(tc);
00835 }
00836 
00837 static size_t
00838 rb_transcoding_memsize(rb_transcoding *tc)
00839 {
00840     size_t size = sizeof(rb_transcoding);
00841     const rb_transcoder *tr = tc->transcoder;
00842 
00843     if (TRANSCODING_STATE_EMBED_MAX < tr->state_size) {
00844         size += tr->state_size;
00845     }
00846     if ((int)sizeof(tc->readbuf.ary) < tr->max_input) {
00847         size += tr->max_input;
00848     }
00849     if ((int)sizeof(tc->writebuf.ary) < tr->max_output) {
00850         size += tr->max_output;
00851     }
00852     return size;
00853 }
00854 
00855 static rb_econv_t *
00856 rb_econv_alloc(int n_hint)
00857 {
00858     rb_econv_t *ec;
00859 
00860     if (n_hint <= 0)
00861         n_hint = 1;
00862 
00863     ec = ALLOC(rb_econv_t);
00864     ec->flags = 0;
00865     ec->source_encoding_name = NULL;
00866     ec->destination_encoding_name = NULL;
00867     ec->started = 0;
00868     ec->replacement_str = NULL;
00869     ec->replacement_len = 0;
00870     ec->replacement_enc = NULL;
00871     ec->replacement_allocated = 0;
00872     ec->in_buf_start = NULL;
00873     ec->in_data_start = NULL;
00874     ec->in_data_end = NULL;
00875     ec->in_buf_end = NULL;
00876     ec->num_allocated = n_hint;
00877     ec->num_trans = 0;
00878     ec->elems = ALLOC_N(rb_econv_elem_t, ec->num_allocated);
00879     ec->num_finished = 0;
00880     ec->last_tc = NULL;
00881     ec->last_error.result = econv_source_buffer_empty;
00882     ec->last_error.error_tc = NULL;
00883     ec->last_error.source_encoding = NULL;
00884     ec->last_error.destination_encoding = NULL;
00885     ec->last_error.error_bytes_start = NULL;
00886     ec->last_error.error_bytes_len = 0;
00887     ec->last_error.readagain_len = 0;
00888     ec->source_encoding = NULL;
00889     ec->destination_encoding = NULL;
00890     return ec;
00891 }
00892 
00893 static int
00894 rb_econv_add_transcoder_at(rb_econv_t *ec, const rb_transcoder *tr, int i)
00895 {
00896     int n, j;
00897     int bufsize = 4096;
00898     unsigned char *p;
00899 
00900     if (ec->num_trans == ec->num_allocated) {
00901         n = ec->num_allocated * 2;
00902         REALLOC_N(ec->elems, rb_econv_elem_t, n);
00903         ec->num_allocated = n;
00904     }
00905 
00906     p = xmalloc(bufsize);
00907 
00908     MEMMOVE(ec->elems+i+1, ec->elems+i, rb_econv_elem_t, ec->num_trans-i);
00909 
00910     ec->elems[i].tc = rb_transcoding_open_by_transcoder(tr, 0);
00911     ec->elems[i].out_buf_start = p;
00912     ec->elems[i].out_buf_end = p + bufsize;
00913     ec->elems[i].out_data_start = p;
00914     ec->elems[i].out_data_end = p;
00915     ec->elems[i].last_result = econv_source_buffer_empty;
00916 
00917     ec->num_trans++;
00918 
00919     if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding))
00920         for (j = ec->num_trans-1; i <= j; j--) {
00921             rb_transcoding *tc = ec->elems[j].tc;
00922             const rb_transcoder *tr2 = tc->transcoder;
00923             if (!DECORATOR_P(tr2->src_encoding, tr2->dst_encoding)) {
00924                 ec->last_tc = tc;
00925                 break;
00926             }
00927         }
00928 
00929     return 0;
00930 }
00931 
00932 static rb_econv_t *
00933 rb_econv_open_by_transcoder_entries(int n, transcoder_entry_t **entries)
00934 {
00935     rb_econv_t *ec;
00936     int i, ret;
00937 
00938     for (i = 0; i < n; i++) {
00939         const rb_transcoder *tr;
00940         tr = load_transcoder_entry(entries[i]);
00941         if (!tr)
00942             return NULL;
00943     }
00944 
00945     ec = rb_econv_alloc(n);
00946 
00947     for (i = 0; i < n; i++) {
00948         const rb_transcoder *tr = load_transcoder_entry(entries[i]);
00949         ret = rb_econv_add_transcoder_at(ec, tr, ec->num_trans);
00950         if (ret == -1) {
00951             rb_econv_close(ec);
00952             return NULL;
00953         }
00954     }
00955 
00956     return ec;
00957 }
00958 
/*
 * Accumulator handed (as the opaque callback argument) to trans_open_i()
 * while transcode_search_path() reports the steps of a conversion path.
 *
 *   entries        - array of transcoder entries indexed by path depth;
 *                    NULL until the first callback allocates it.
 *   num_additional - number of extra slots reserved in that array beyond
 *                    depth+1 at allocation time.
 */
00959 struct trans_open_t {
00960     transcoder_entry_t **entries;
00961     int num_additional;
00962 };
00963 
00964 static void
00965 trans_open_i(const char *sname, const char *dname, int depth, void *arg)
00966 {
00967     struct trans_open_t *toarg = arg;
00968 
00969     if (!toarg->entries) {
00970         toarg->entries = ALLOC_N(transcoder_entry_t *, depth+1+toarg->num_additional);
00971     }
00972     toarg->entries[depth] = get_transcoder_entry(sname, dname);
00973 }
00974 
00975 static rb_econv_t *
00976 rb_econv_open0(const char *sname, const char *dname, int ecflags)
00977 {
00978     transcoder_entry_t **entries = NULL;
00979     int num_trans;
00980     rb_econv_t *ec;
00981 
00982     int sidx, didx;
00983 
00984     if (*sname) {
00985         sidx = rb_enc_find_index(sname);
00986         if (0 <= sidx) {
00987             rb_enc_from_index(sidx);
00988         }
00989     }
00990 
00991     if (*dname) {
00992         didx = rb_enc_find_index(dname);
00993         if (0 <= didx) {
00994             rb_enc_from_index(didx);
00995         }
00996     }
00997 
00998     if (*sname == '\0' && *dname == '\0') {
00999         num_trans = 0;
01000         entries = NULL;
01001     }
01002     else {
01003         struct trans_open_t toarg;
01004         toarg.entries = NULL;
01005         toarg.num_additional = 0;
01006         num_trans = transcode_search_path(sname, dname, trans_open_i, (void *)&toarg);
01007         entries = toarg.entries;
01008         if (num_trans < 0) {
01009             xfree(entries);
01010             return NULL;
01011         }
01012     }
01013 
01014     ec = rb_econv_open_by_transcoder_entries(num_trans, entries);
01015     xfree(entries);
01016     if (!ec)
01017         return NULL;
01018 
01019     ec->flags = ecflags;
01020     ec->source_encoding_name = sname;
01021     ec->destination_encoding_name = dname;
01022 
01023     return ec;
01024 }
01025 
01026 #define MAX_ECFLAGS_DECORATORS 32
01027 
01028 static int
01029 decorator_names(int ecflags, const char **decorators_ret)
01030 {
01031     int num_decorators;
01032 
01033     switch (ecflags & ECONV_NEWLINE_DECORATOR_MASK) {
01034       case ECONV_UNIVERSAL_NEWLINE_DECORATOR:
01035       case ECONV_CRLF_NEWLINE_DECORATOR:
01036       case ECONV_CR_NEWLINE_DECORATOR:
01037       case 0:
01038         break;
01039       default:
01040         return -1;
01041     }
01042 
01043     if ((ecflags & ECONV_XML_TEXT_DECORATOR) &&
01044         (ecflags & ECONV_XML_ATTR_CONTENT_DECORATOR))
01045         return -1;
01046 
01047     num_decorators = 0;
01048 
01049     if (ecflags & ECONV_XML_TEXT_DECORATOR)
01050         decorators_ret[num_decorators++] = "xml_text_escape";
01051     if (ecflags & ECONV_XML_ATTR_CONTENT_DECORATOR)
01052         decorators_ret[num_decorators++] = "xml_attr_content_escape";
01053     if (ecflags & ECONV_XML_ATTR_QUOTE_DECORATOR)
01054         decorators_ret[num_decorators++] = "xml_attr_quote";
01055 
01056     if (ecflags & ECONV_CRLF_NEWLINE_DECORATOR)
01057         decorators_ret[num_decorators++] = "crlf_newline";
01058     if (ecflags & ECONV_CR_NEWLINE_DECORATOR)
01059         decorators_ret[num_decorators++] = "cr_newline";
01060     if (ecflags & ECONV_UNIVERSAL_NEWLINE_DECORATOR)
01061         decorators_ret[num_decorators++] = "universal_newline";
01062 
01063     return num_decorators;
01064 }
01065 
01066 rb_econv_t *
01067 rb_econv_open(const char *sname, const char *dname, int ecflags)
01068 {
01069     rb_econv_t *ec;
01070     int num_decorators;
01071     const char *decorators[MAX_ECFLAGS_DECORATORS];
01072     int i;
01073 
01074     num_decorators = decorator_names(ecflags, decorators);
01075     if (num_decorators == -1)
01076         return NULL;
01077 
01078     ec = rb_econv_open0(sname, dname, ecflags & ECONV_ERROR_HANDLER_MASK);
01079     if (!ec)
01080         return NULL;
01081 
01082     for (i = 0; i < num_decorators; i++)
01083         if (rb_econv_decorate_at_last(ec, decorators[i]) == -1) {
01084             rb_econv_close(ec);
01085             return NULL;
01086         }
01087 
01088     ec->flags |= ecflags & ~ECONV_ERROR_HANDLER_MASK;
01089 
01090     return ec;
01091 }
01092 
/*
 * Push data through the transcoder chain ec->elems[start .. num_trans-1],
 * repeating whole passes for as long as any element consumed input or
 * produced output during the previous pass.
 *
 * Element 0 reads from the caller's input range; every other element reads
 * from the previous element's out_data_start..out_data_end.  The last
 * element writes to the caller's output range; every other element writes
 * into its own out buffer.
 *
 * Returns the index of the element that reported invalid byte sequence,
 * incomplete input, undefined conversion or after-output, or -1 when a
 * complete pass made no progress.  Each element's last_result is updated.
 */
01093 static int
01094 trans_sweep(rb_econv_t *ec,
01095     const unsigned char **input_ptr, const unsigned char *input_stop,
01096     unsigned char **output_ptr, unsigned char *output_stop,
01097     int flags,
01098     int start)
01099 {
01100     int try;
01101     int i, f;
01102 
01103     const unsigned char **ipp, *is, *iold;
01104     unsigned char **opp, *os, *oold;
01105     rb_econv_result_t res;
01106 
01107     try = 1;
01108     while (try) {
01109         try = 0;
01110         for (i = start; i < ec->num_trans; i++) {
01111             rb_econv_elem_t *te = &ec->elems[i];
01112 
                  /* Select this element's input range. */
01113             if (i == 0) {
01114                 ipp = input_ptr;
01115                 is = input_stop;
01116             }
01117             else {
01118                 rb_econv_elem_t *prev_te = &ec->elems[i-1];
01119                 ipp = (const unsigned char **)&prev_te->out_data_start;
01120                 is = prev_te->out_data_end;
01121             }
01122 
                  /* Select this element's output range. */
01123             if (i == ec->num_trans-1) {
01124                 opp = output_ptr;
01125                 os = output_stop;
01126             }
01127             else {
                      /* Slide pending bytes to the front of the buffer so
                       * the free space is contiguous at the end. */
01128                 if (te->out_buf_start != te->out_data_start) {
01129                     ssize_t len = te->out_data_end - te->out_data_start;
01130                     ssize_t off = te->out_data_start - te->out_buf_start;
01131                     MEMMOVE(te->out_buf_start, te->out_data_start, unsigned char, len);
01132                     te->out_data_start = te->out_buf_start;
01133                     te->out_data_end -= off;
01134                 }
01135                 opp = &te->out_data_end;
01136                 os = te->out_buf_end;
01137             }
01138 
                  /* Per-element flags: partial input is forced unless every
                   * earlier element has already finished. */
01139             f = flags;
01140             if (ec->num_finished != i)
01141                 f |= ECONV_PARTIAL_INPUT;
                  /* ECONV_AFTER_OUTPUT is honoured once, by element 0 only:
                   * later passes start at element 1 with the flag cleared. */
01142             if (i == 0 && (flags & ECONV_AFTER_OUTPUT)) {
01143                 start = 1;
01144                 flags &= ~ECONV_AFTER_OUTPUT;
01145             }
01146             if (i != 0)
01147                 f &= ~ECONV_AFTER_OUTPUT;
01148             iold = *ipp;
01149             oold = *opp;
01150             te->last_result = res = rb_transcoding_convert(te->tc, ipp, is, opp, os, f);
                  /* Any cursor movement means another pass may make progress. */
01151             if (iold != *ipp || oold != *opp)
01152                 try = 1;
01153 
01154             switch (res) {
01155               case econv_invalid_byte_sequence:
01156               case econv_incomplete_input:
01157               case econv_undefined_conversion:
01158               case econv_after_output:
                      /* Must be reported to the caller: stop here. */
01159                 return i;
01160 
01161               case econv_destination_buffer_full:
01162               case econv_source_buffer_empty:
01163                 break;
01164 
01165               case econv_finished:
01166                 ec->num_finished = i+1;
01167                 break;
01168             }
01169         }
01170     }
01171     return -1;
01172 }
01173 
/*
 * Drive the whole transcoder chain of `ec` once and report a single result.
 *
 * A NULL input_ptr or output_ptr is treated as an empty range.  If
 * result_position_ptr is non-NULL it receives the index of the element
 * whose result is returned, or -1 when nothing needs reporting.
 *
 * NOTE(review): ec->elems[0] is read unconditionally, so this assumes at
 * least one transcoder; the caller visible in this file
 * (rb_econv_convert0) handles num_trans == 0 before calling.
 */
01174 static rb_econv_result_t
01175 rb_trans_conv(rb_econv_t *ec,
01176     const unsigned char **input_ptr, const unsigned char *input_stop,
01177     unsigned char **output_ptr, unsigned char *output_stop,
01178     int flags,
01179     int *result_position_ptr)
01180 {
01181     int i;
01182     int needreport_index;
01183     int sweep_start;
01184 
01185     unsigned char empty_buf;
01186     unsigned char *empty_ptr = &empty_buf;
01187 
01188     if (!input_ptr) {
01189         input_ptr = (const unsigned char **)&empty_ptr;
01190         input_stop = empty_ptr;
01191     }
01192 
01193     if (!output_ptr) {
01194         output_ptr = &empty_ptr;
01195         output_stop = empty_ptr;
01196     }
01197 
          /* A stale after-output state on the first element is discarded. */
01198     if (ec->elems[0].last_result == econv_after_output)
01199         ec->elems[0].last_result = econv_source_buffer_empty;
01200 
          /* Scan from the last element backwards for one whose previous
           * result is neither "buffer full" nor "source empty"; sweeping
           * then resumes with the element right after it. */
01201     needreport_index = -1;
01202     for (i = ec->num_trans-1; 0 <= i; i--) {
01203         switch (ec->elems[i].last_result) {
01204           case econv_invalid_byte_sequence:
01205           case econv_incomplete_input:
01206           case econv_undefined_conversion:
01207           case econv_after_output:
01208           case econv_finished:
01209             sweep_start = i+1;
01210             needreport_index = i;
01211             goto found_needreport;
01212 
01213           case econv_destination_buffer_full:
01214           case econv_source_buffer_empty:
01215             break;
01216 
01217           default:
01218             rb_bug("unexpected transcode last result");
01219         }
01220     }
01221 
01222     /* /^[sd]+$/ is confirmed.  but actually /^s*d*$/. */
01223 
          /* The last element was blocked on a full destination and the
           * caller wants after-output semantics: first try again without
           * new input (and without ECONV_AFTER_OUTPUT); a "source empty"
           * outcome of that attempt is reported as after-output. */
01224     if (ec->elems[ec->num_trans-1].last_result == econv_destination_buffer_full &&
01225         (flags & ECONV_AFTER_OUTPUT)) {
01226         rb_econv_result_t res;
01227 
01228         res = rb_trans_conv(ec, NULL, NULL, output_ptr, output_stop,
01229                 (flags & ~ECONV_AFTER_OUTPUT)|ECONV_PARTIAL_INPUT,
01230                 result_position_ptr);
01231 
01232         if (res == econv_source_buffer_empty)
01233             return econv_after_output;
01234         return res;
01235     }
01236 
01237     sweep_start = 0;
01238 
01239   found_needreport:
01240 
          /* Keep sweeping, each time starting after the element that
           * stopped the previous sweep, until a sweep makes no progress
           * (-1) or the stopping element is the last one. */
01241     do {
01242         needreport_index = trans_sweep(ec, input_ptr, input_stop, output_ptr, output_stop, flags, sweep_start);
01243         sweep_start = needreport_index + 1;
01244     } while (needreport_index != -1 && needreport_index != ec->num_trans-1);
01245 
          /* Report the result of the element closest to the output whose
           * state is not "source empty".  Reportable states are reset to
           * "source empty" so they are delivered only once. */
01246     for (i = ec->num_trans-1; 0 <= i; i--) {
01247         if (ec->elems[i].last_result != econv_source_buffer_empty) {
01248             rb_econv_result_t res = ec->elems[i].last_result;
01249             if (res == econv_invalid_byte_sequence ||
01250                 res == econv_incomplete_input ||
01251                 res == econv_undefined_conversion ||
01252                 res == econv_after_output) {
01253                 ec->elems[i].last_result = econv_source_buffer_empty;
01254             }
01255             if (result_position_ptr)
01256                 *result_position_ptr = i;
01257             return res;
01258         }
01259     }
01260     if (result_position_ptr)
01261         *result_position_ptr = -1;
01262     return econv_source_buffer_empty;
01263 }
01264 
/*
 * Perform one conversion call without any error-replacement handling.
 *
 * Clears ec->last_error, converts (or, with no transcoders, copies) bytes
 * from the input range to the output range, and records the result in
 * ec->last_error.  For invalid byte sequence, incomplete input and
 * undefined conversion the failing transcoding, its encodings and the
 * offending bytes are recorded as well.
 *
 * Unlike rb_econv_convert(), input_ptr and output_ptr are dereferenced
 * unconditionally here and so must not be NULL.
 */
01265 static rb_econv_result_t
01266 rb_econv_convert0(rb_econv_t *ec,
01267     const unsigned char **input_ptr, const unsigned char *input_stop,
01268     unsigned char **output_ptr, unsigned char *output_stop,
01269     int flags)
01270 {
01271     rb_econv_result_t res;
          /* Only read at gotresult for error results, which are produced
           * solely by the rb_trans_conv() calls that set it. */
01272     int result_position;
01273     int has_output = 0;
01274 
01275     memset(&ec->last_error, 0, sizeof(ec->last_error));
01276 
          /* No transcoders: plain byte copy from input to output. */
01277     if (ec->num_trans == 0) {
01278         size_t len;
              /* First drain bytes queued earlier in ec's own input buffer. */
01279         if (ec->in_buf_start && ec->in_data_start != ec->in_data_end) {
01280             if (output_stop - *output_ptr < ec->in_data_end - ec->in_data_start) {
01281                 len = output_stop - *output_ptr;
01282                 memcpy(*output_ptr, ec->in_data_start, len);
01283                 *output_ptr = output_stop;
01284                 ec->in_data_start += len;
01285                 res = econv_destination_buffer_full;
01286                 goto gotresult;
01287             }
01288             len = ec->in_data_end - ec->in_data_start;
01289             memcpy(*output_ptr, ec->in_data_start, len);
01290             *output_ptr += len;
01291             ec->in_data_start = ec->in_data_end = ec->in_buf_start;
01292             if (flags & ECONV_AFTER_OUTPUT) {
01293                 res = econv_after_output;
01294                 goto gotresult;
01295             }
01296         }
              /* len = min(room in output, bytes of input). */
01297         if (output_stop - *output_ptr < input_stop - *input_ptr) {
01298             len = output_stop - *output_ptr;
01299         }
01300         else {
01301             len = input_stop - *input_ptr;
01302         }
              /* After-output mode hands over exactly one byte per call. */
01303         if (0 < len && (flags & ECONV_AFTER_OUTPUT)) {
01304             *(*output_ptr)++ = *(*input_ptr)++;
01305             res = econv_after_output;
01306             goto gotresult;
01307         }
01308         memcpy(*output_ptr, *input_ptr, len);
01309         *output_ptr += len;
01310         *input_ptr += len;
01311         if (*input_ptr != input_stop)
01312             res = econv_destination_buffer_full;
01313         else if (flags & ECONV_PARTIAL_INPUT)
01314             res = econv_source_buffer_empty;
01315         else
01316             res = econv_finished;
01317         goto gotresult;
01318     }
01319 
          /* Drain bytes still sitting in the last element's output buffer
           * into the caller's output before converting anything new. */
01320     if (ec->elems[ec->num_trans-1].out_data_start) {
01321         unsigned char *data_start = ec->elems[ec->num_trans-1].out_data_start;
01322         unsigned char *data_end = ec->elems[ec->num_trans-1].out_data_end;
01323         if (data_start != data_end) {
01324             size_t len;
01325             if (output_stop - *output_ptr < data_end - data_start) {
01326                 len = output_stop - *output_ptr;
01327                 memcpy(*output_ptr, data_start, len);
01328                 *output_ptr = output_stop;
01329                 ec->elems[ec->num_trans-1].out_data_start += len;
01330                 res = econv_destination_buffer_full;
01331                 goto gotresult;
01332             }
01333             len = data_end - data_start;
01334             memcpy(*output_ptr, data_start, len);
01335             *output_ptr += len;
01336             ec->elems[ec->num_trans-1].out_data_start =
01337                 ec->elems[ec->num_trans-1].out_data_end =
01338                 ec->elems[ec->num_trans-1].out_buf_start;
01339             has_output = 1;
01340         }
01341     }
01342 
          /* Bytes queued in ec's input buffer are converted ahead of the
           * caller's input, as partial input and without after-output. */
01343     if (ec->in_buf_start &&
01344         ec->in_data_start != ec->in_data_end) {
01345         res = rb_trans_conv(ec, (const unsigned char **)&ec->in_data_start, ec->in_data_end, output_ptr, output_stop,
01346                 (flags&~ECONV_AFTER_OUTPUT)|ECONV_PARTIAL_INPUT, &result_position);
01347         if (res != econv_source_buffer_empty)
01348             goto gotresult;
01349     }
01350 
          /* Output was already produced above and the caller asked for
           * after-output: run the chain on an empty input range and turn
           * "source empty" into after-output. */
01351     if (has_output &&
01352         (flags & ECONV_AFTER_OUTPUT) &&
01353         *input_ptr != input_stop) {
01354         input_stop = *input_ptr;
01355         res = rb_trans_conv(ec, input_ptr, input_stop, output_ptr, output_stop, flags, &result_position);
01356         if (res == econv_source_buffer_empty)
01357             res = econv_after_output;
01358     }
          /* After-output requested, or a single transcoder: one call. */
01359     else if ((flags & ECONV_AFTER_OUTPUT) ||
01360         ec->num_trans == 1) {
01361         res = rb_trans_conv(ec, input_ptr, input_stop, output_ptr, output_stop, flags, &result_position);
01362     }
          /* Several transcoders, no after-output requested: drive the chain
           * in after-output steps internally and loop until some other
           * result is produced. */
01363     else {
01364         flags |= ECONV_AFTER_OUTPUT;
01365         do {
01366             res = rb_trans_conv(ec, input_ptr, input_stop, output_ptr, output_stop, flags, &result_position);
01367         } while (res == econv_after_output);
01368     }
01369 
01370   gotresult:
01371     ec->last_error.result = res;
01372     if (res == econv_invalid_byte_sequence ||
01373         res == econv_incomplete_input ||
01374         res == econv_undefined_conversion) {
              /* Capture where and on which bytes the conversion failed. */
01375         rb_transcoding *error_tc = ec->elems[result_position].tc;
01376         ec->last_error.error_tc = error_tc;
01377         ec->last_error.source_encoding = error_tc->transcoder->src_encoding;
01378         ec->last_error.destination_encoding = error_tc->transcoder->dst_encoding;
01379         ec->last_error.error_bytes_start = TRANSCODING_READBUF(error_tc);
01380         ec->last_error.error_bytes_len = error_tc->recognized_len;
01381         ec->last_error.readagain_len = error_tc->readagain_len;
01382     }
01383 
01384     return res;
01385 }
01386 
01387 static int output_replacement_character(rb_econv_t *ec);
01388 
01389 static int
01390 output_hex_charref(rb_econv_t *ec)
01391 {
01392     int ret;
01393     unsigned char utfbuf[1024];
01394     const unsigned char *utf;
01395     size_t utf_len;
01396     int utf_allocated = 0;
01397     char charef_buf[16];
01398     const unsigned char *p;
01399 
01400     if (encoding_equal(ec->last_error.source_encoding, "UTF-32BE")) {
01401         utf = ec->last_error.error_bytes_start;
01402         utf_len = ec->last_error.error_bytes_len;
01403     }
01404     else {
01405         utf = allocate_converted_string(ec->last_error.source_encoding, "UTF-32BE",
01406                 ec->last_error.error_bytes_start, ec->last_error.error_bytes_len,
01407                 utfbuf, sizeof(utfbuf),
01408                 &utf_len);
01409         if (!utf)
01410             return -1;
01411         if (utf != utfbuf && utf != ec->last_error.error_bytes_start)
01412             utf_allocated = 1;
01413     }
01414 
01415     if (utf_len % 4 != 0)
01416         goto fail;
01417 
01418     p = utf;
01419     while (4 <= utf_len) {
01420         unsigned int u = 0;
01421         u += p[0] << 24;
01422         u += p[1] << 16;
01423         u += p[2] << 8;
01424         u += p[3];
01425         snprintf(charef_buf, sizeof(charef_buf), "&#x%X;", u);
01426 
01427         ret = rb_econv_insert_output(ec, (unsigned char *)charef_buf, strlen(charef_buf), "US-ASCII");
01428         if (ret == -1)
01429             goto fail;
01430 
01431         p += 4;
01432         utf_len -= 4;
01433     }
01434 
01435     if (utf_allocated)
01436         xfree((void *)utf);
01437     return 0;
01438 
01439   fail:
01440     if (utf_allocated)
01441         xfree((void *)utf);
01442     return -1;
01443 }
01444 
01445 rb_econv_result_t
01446 rb_econv_convert(rb_econv_t *ec,
01447     const unsigned char **input_ptr, const unsigned char *input_stop,
01448     unsigned char **output_ptr, unsigned char *output_stop,
01449     int flags)
01450 {
01451     rb_econv_result_t ret;
01452 
01453     unsigned char empty_buf;
01454     unsigned char *empty_ptr = &empty_buf;
01455 
01456     ec->started = 1;
01457 
01458     if (!input_ptr) {
01459         input_ptr = (const unsigned char **)&empty_ptr;
01460         input_stop = empty_ptr;
01461     }
01462 
01463     if (!output_ptr) {
01464         output_ptr = &empty_ptr;
01465         output_stop = empty_ptr;
01466     }
01467 
01468   resume:
01469     ret = rb_econv_convert0(ec, input_ptr, input_stop, output_ptr, output_stop, flags);
01470 
01471     if (ret == econv_invalid_byte_sequence ||
01472         ret == econv_incomplete_input) {
01473         /* deal with invalid byte sequence */
01474         /* todo: add more alternative behaviors */
01475         switch (ec->flags & ECONV_INVALID_MASK) {
01476           case ECONV_INVALID_REPLACE:
01477             if (output_replacement_character(ec) == 0)
01478                 goto resume;
01479         }
01480     }
01481 
01482     if (ret == econv_undefined_conversion) {
01483         /* valid character in source encoding
01484          * but no related character(s) in destination encoding */
01485         /* todo: add more alternative behaviors */
01486         switch (ec->flags & ECONV_UNDEF_MASK) {
01487           case ECONV_UNDEF_REPLACE:
01488             if (output_replacement_character(ec) == 0)
01489                 goto resume;
01490             break;
01491 
01492           case ECONV_UNDEF_HEX_CHARREF:
01493             if (output_hex_charref(ec) == 0)
01494                 goto resume;
01495             break;
01496         }
01497     }
01498 
01499     return ret;
01500 }
01501 
01502 const char *
01503 rb_econv_encoding_to_insert_output(rb_econv_t *ec)
01504 {
01505     rb_transcoding *tc = ec->last_tc;
01506     const rb_transcoder *tr;
01507 
01508     if (tc == NULL)
01509         return "";
01510 
01511     tr = tc->transcoder;
01512 
01513     if (tr->asciicompat_type == asciicompat_encoder)
01514         return tr->src_encoding;
01515     return tr->dst_encoding;
01516 }
01517 
01518 static unsigned char *
01519 allocate_converted_string(const char *sname, const char *dname,
01520         const unsigned char *str, size_t len,
01521         unsigned char *caller_dst_buf, size_t caller_dst_bufsize,
01522         size_t *dst_len_ptr)
01523 {
01524     unsigned char *dst_str;
01525     size_t dst_len;
01526     size_t dst_bufsize;
01527 
01528     rb_econv_t *ec;
01529     rb_econv_result_t res;
01530 
01531     const unsigned char *sp;
01532     unsigned char *dp;
01533 
01534     if (caller_dst_buf)
01535         dst_bufsize = caller_dst_bufsize;
01536     else if (len == 0)
01537         dst_bufsize = 1;
01538     else
01539         dst_bufsize = len;
01540 
01541     ec = rb_econv_open(sname, dname, 0);
01542     if (ec == NULL)
01543         return NULL;
01544     if (caller_dst_buf)
01545         dst_str = caller_dst_buf;
01546     else
01547         dst_str = xmalloc(dst_bufsize);
01548     dst_len = 0;
01549     sp = str;
01550     dp = dst_str+dst_len;
01551     res = rb_econv_convert(ec, &sp, str+len, &dp, dst_str+dst_bufsize, 0);
01552     dst_len = dp - dst_str;
01553     while (res == econv_destination_buffer_full) {
01554         if (SIZE_MAX/2 < dst_bufsize) {
01555             goto fail;
01556         }
01557         dst_bufsize *= 2;
01558         if (dst_str == caller_dst_buf) {
01559             unsigned char *tmp;
01560             tmp = xmalloc(dst_bufsize);
01561             memcpy(tmp, dst_str, dst_bufsize/2);
01562             dst_str = tmp;
01563         }
01564         else {
01565             dst_str = xrealloc(dst_str, dst_bufsize);
01566         }
01567         dp = dst_str+dst_len;
01568         res = rb_econv_convert(ec, &sp, str+len, &dp, dst_str+dst_bufsize, 0);
01569         dst_len = dp - dst_str;
01570     }
01571     if (res != econv_finished) {
01572         goto fail;
01573     }
01574     rb_econv_close(ec);
01575     *dst_len_ptr = dst_len;
01576     return dst_str;
01577 
01578   fail:
01579     if (dst_str != caller_dst_buf)
01580         xfree(dst_str);
01581     rb_econv_close(ec);
01582     return NULL;
01583 }
01584 
01585 /* result: 0:success -1:failure */
/*
 * Insert `len` bytes at `str` (encoded in `str_encoding`) into the
 * converter's data stream.
 *
 * The bytes are first converted to the encoding named by
 * rb_econv_encoding_to_insert_output() when that differs from
 * str_encoding.  They are then appended to one of the converter's buffers:
 *   - ec's own input buffer when there are no transcoders;
 *   - the buffer feeding the last transcoder when that transcoder is an
 *     ASCII-compatible encoder (ec's input buffer if it is the only
 *     transcoder, otherwise the previous element's output buffer); the
 *     transcoder's read-again bytes are appended after the inserted bytes
 *     and its readagain_len is reset to 0;
 *   - the last element's output buffer otherwise.
 * The chosen buffer is allocated, compacted or grown as needed.
 *
 * Marks the converter as started.  A zero `len` succeeds immediately.
 */
01586 int
01587 rb_econv_insert_output(rb_econv_t *ec,
01588     const unsigned char *str, size_t len, const char *str_encoding)
01589 {
01590     const char *insert_encoding = rb_econv_encoding_to_insert_output(ec);
01591     unsigned char insert_buf[4096];
01592     const unsigned char *insert_str = NULL;
01593     size_t insert_len;
01594 
01595     int last_trans_index;
01596     rb_transcoding *tc;
01597 
          /* Pointers to the four fields describing the target buffer. */
01598     unsigned char **buf_start_p;
01599     unsigned char **data_start_p;
01600     unsigned char **data_end_p;
01601     unsigned char **buf_end_p;
01602 
01603     size_t need;
01604 
01605     ec->started = 1;
01606 
01607     if (len == 0)
01608         return 0;
01609 
01610     if (encoding_equal(insert_encoding, str_encoding)) {
01611         insert_str = str;
01612         insert_len = len;
01613     }
01614     else {
              /* The result is insert_buf itself or a heap buffer; the heap
               * case is released before returning (see the xfree below). */
01615         insert_str = allocate_converted_string(str_encoding, insert_encoding,
01616                 str, len, insert_buf, sizeof(insert_buf), &insert_len);
01617         if (insert_str == NULL)
01618             return -1;
01619     }
01620 
01621     need = insert_len;
01622 
01623     last_trans_index = ec->num_trans-1;
01624     if (ec->num_trans == 0) {
01625         tc = NULL;
01626         buf_start_p = &ec->in_buf_start;
01627         data_start_p = &ec->in_data_start;
01628         data_end_p = &ec->in_data_end;
01629         buf_end_p = &ec->in_buf_end;
01630     }
01631     else if (ec->elems[last_trans_index].tc->transcoder->asciicompat_type == asciicompat_encoder) {
01632         tc = ec->elems[last_trans_index].tc;
              /* Room is also needed for the read-again bytes; the check
               * below detects wrap-around of the addition. */
01633         need += tc->readagain_len;
01634         if (need < insert_len)
01635             goto fail;
01636         if (last_trans_index == 0) {
01637             buf_start_p = &ec->in_buf_start;
01638             data_start_p = &ec->in_data_start;
01639             data_end_p = &ec->in_data_end;
01640             buf_end_p = &ec->in_buf_end;
01641         }
01642         else {
01643             rb_econv_elem_t *ee = &ec->elems[last_trans_index-1];
01644             buf_start_p = &ee->out_buf_start;
01645             data_start_p = &ee->out_data_start;
01646             data_end_p = &ee->out_data_end;
01647             buf_end_p = &ee->out_buf_end;
01648         }
01649     }
01650     else {
01651         rb_econv_elem_t *ee = &ec->elems[last_trans_index];
01652         buf_start_p = &ee->out_buf_start;
01653         data_start_p = &ee->out_data_start;
01654         data_end_p = &ee->out_data_end;
01655         buf_end_p = &ee->out_buf_end;
01656         tc = ec->elems[last_trans_index].tc;
01657     }
01658 
          /* Make sure the target buffer has `need` free bytes at its end. */
01659     if (*buf_start_p == NULL) {
01660         unsigned char *buf = xmalloc(need);
01661         *buf_start_p = buf;
01662         *data_start_p = buf;
01663         *data_end_p = buf;
01664         *buf_end_p = buf+need;
01665     }
01666     else if ((size_t)(*buf_end_p - *data_end_p) < need) {
              /* Compact pending data to the front first ... */
01667         MEMMOVE(*buf_start_p, *data_start_p, unsigned char, *data_end_p - *data_start_p);
01668         *data_end_p = *buf_start_p + (*data_end_p - *data_start_p);
01669         *data_start_p = *buf_start_p;
              /* ... and grow the buffer if that is still not enough. */
01670         if ((size_t)(*buf_end_p - *data_end_p) < need) {
01671             unsigned char *buf;
01672             size_t s = (*data_end_p - *buf_start_p) + need;
01673             if (s < need)
01674                 goto fail;
01675             buf = xrealloc(*buf_start_p, s);
01676             *data_start_p = buf;
01677             *data_end_p = buf + (*data_end_p - *buf_start_p);
01678             *buf_start_p = buf;
01679             *buf_end_p = buf + s;
01680         }
01681     }
01682 
01683     memcpy(*data_end_p, insert_str, insert_len);
01684     *data_end_p += insert_len;
01685     if (tc && tc->transcoder->asciicompat_type == asciicompat_encoder) {
              /* Re-queue the transcoder's read-again bytes after the
               * inserted bytes and clear them from the transcoder. */
01686         memcpy(*data_end_p, TRANSCODING_READBUF(tc)+tc->recognized_len, tc->readagain_len);
01687         *data_end_p += tc->readagain_len;
01688         tc->readagain_len = 0;
01689     }
01690 
          /* Free the converted string only if it was heap-allocated. */
01691     if (insert_str != str && insert_str != insert_buf)
01692         xfree((void*)insert_str);
01693     return 0;
01694 
01695   fail:
01696     if (insert_str != str && insert_str != insert_buf)
01697         xfree((void*)insert_str);
01698     return -1;
01699 }
01700 
01701 void
01702 rb_econv_close(rb_econv_t *ec)
01703 {
01704     int i;
01705 
01706     if (ec->replacement_allocated) {
01707         xfree((void *)ec->replacement_str);
01708     }
01709     for (i = 0; i < ec->num_trans; i++) {
01710         rb_transcoding_close(ec->elems[i].tc);
01711         if (ec->elems[i].out_buf_start)
01712             xfree(ec->elems[i].out_buf_start);
01713     }
01714     xfree(ec->in_buf_start);
01715     xfree(ec->elems);
01716     xfree(ec);
01717 }
01718 
01719 size_t
01720 rb_econv_memsize(rb_econv_t *ec)
01721 {
01722     size_t size = sizeof(rb_econv_t);
01723     int i;
01724 
01725     if (ec->replacement_allocated) {
01726         size += ec->replacement_len;
01727     }
01728     for (i = 0; i < ec->num_trans; i++) {
01729         size += rb_transcoding_memsize(ec->elems[i].tc);
01730 
01731         if (ec->elems[i].out_buf_start) {
01732             size += ec->elems[i].out_buf_end - ec->elems[i].out_buf_start;
01733         }
01734     }
01735     size += ec->in_buf_end - ec->in_buf_start;
01736     size += sizeof(rb_econv_elem_t) * ec->num_allocated;
01737 
01738     return size;
01739 }
01740 
01741 int
01742 rb_econv_putbackable(rb_econv_t *ec)
01743 {
01744     if (ec->num_trans == 0)
01745         return 0;
01746 #if SIZEOF_SIZE_T > SIZEOF_INT
01747     if (ec->elems[0].tc->readagain_len > INT_MAX) return INT_MAX;
01748 #endif
01749     return (int)ec->elems[0].tc->readagain_len;
01750 }
01751 
01752 void
01753 rb_econv_putback(rb_econv_t *ec, unsigned char *p, int n)
01754 {
01755     rb_transcoding *tc;
01756     if (ec->num_trans == 0 || n == 0)
01757         return;
01758     tc = ec->elems[0].tc;
01759     memcpy(p, TRANSCODING_READBUF(tc) + tc->recognized_len + tc->readagain_len - n, n);
01760     tc->readagain_len -= n;
01761 }
01762 
01763 struct asciicompat_encoding_t {
01764     const char *ascii_compat_name;
01765     const char *ascii_incompat_name;
01766 };
01767 
01768 static int
01769 asciicompat_encoding_i(st_data_t key, st_data_t val, st_data_t arg)
01770 {
01771     struct asciicompat_encoding_t *data = (struct asciicompat_encoding_t *)arg;
01772     transcoder_entry_t *entry = (transcoder_entry_t *)val;
01773     const rb_transcoder *tr;
01774 
01775     if (DECORATOR_P(entry->sname, entry->dname))
01776         return ST_CONTINUE;
01777     tr = load_transcoder_entry(entry);
01778     if (tr && tr->asciicompat_type == asciicompat_decoder) {
01779         data->ascii_compat_name = tr->dst_encoding;
01780         return ST_STOP;
01781     }
01782     return ST_CONTINUE;
01783 }
01784 
01785 const char *
01786 rb_econv_asciicompat_encoding(const char *ascii_incompat_name)
01787 {
01788     st_data_t v;
01789     st_table *table2;
01790     struct asciicompat_encoding_t data;
01791 
01792     if (!st_lookup(transcoder_table, (st_data_t)ascii_incompat_name, &v))
01793         return NULL;
01794     table2 = (st_table *)v;
01795 
01796     /*
01797      * Assumption:
01798      * There is at most one transcoder for
01799      * converting from ASCII incompatible encoding.
01800      *
01801      * For ISO-2022-JP, there is ISO-2022-JP -> stateless-ISO-2022-JP and no others.
01802      */
01803     if (table2->num_entries != 1)
01804         return NULL;
01805 
01806     data.ascii_incompat_name = ascii_incompat_name;
01807     data.ascii_compat_name = NULL;
01808     st_foreach(table2, asciicompat_encoding_i, (st_data_t)&data);
01809     return data.ascii_compat_name;
01810 }
01811 
01812 VALUE
01813 rb_econv_substr_append(rb_econv_t *ec, VALUE src, long off, long len, VALUE dst, int flags)
01814 {
01815     unsigned const char *ss, *sp, *se;
01816     unsigned char *ds, *dp, *de;
01817     rb_econv_result_t res;
01818     int max_output;
01819 
01820     if (NIL_P(dst)) {
01821         dst = rb_str_buf_new(len);
01822         if (ec->destination_encoding)
01823             rb_enc_associate(dst, ec->destination_encoding);
01824     }
01825 
01826     if (ec->last_tc)
01827         max_output = ec->last_tc->transcoder->max_output;
01828     else
01829         max_output = 1;
01830 
01831     res = econv_destination_buffer_full;
01832     while (res == econv_destination_buffer_full) {
01833         long dlen = RSTRING_LEN(dst);
01834         if (rb_str_capacity(dst) - dlen < (size_t)len + max_output) {
01835             unsigned long new_capa = (unsigned long)dlen + len + max_output;
01836             if (LONG_MAX < new_capa)
01837                 rb_raise(rb_eArgError, "too long string");
01838             rb_str_resize(dst, new_capa);
01839             rb_str_set_len(dst, dlen);
01840         }
01841         ss = sp = (const unsigned char *)RSTRING_PTR(src) + off;
01842         se = ss + len;
01843         ds = (unsigned char *)RSTRING_PTR(dst);
01844         de = ds + rb_str_capacity(dst);
01845         dp = ds += dlen;
01846         res = rb_econv_convert(ec, &sp, se, &dp, de, flags);
01847         off += sp - ss;
01848         len -= sp - ss;
01849         rb_str_set_len(dst, dlen + (dp - ds));
01850         rb_econv_check_error(ec);
01851     }
01852 
01853     return dst;
01854 }
01855 
01856 VALUE
01857 rb_econv_str_append(rb_econv_t *ec, VALUE src, VALUE dst, int flags)
01858 {
01859     return rb_econv_substr_append(ec, src, 0, RSTRING_LEN(src), dst, flags);
01860 }
01861 
01862 VALUE
01863 rb_econv_substr_convert(rb_econv_t *ec, VALUE src, long byteoff, long bytesize, int flags)
01864 {
01865     return rb_econv_substr_append(ec, src, byteoff, bytesize, Qnil, flags);
01866 }
01867 
01868 VALUE
01869 rb_econv_str_convert(rb_econv_t *ec, VALUE src, int flags)
01870 {
01871     return rb_econv_substr_append(ec, src, 0, RSTRING_LEN(src), Qnil, flags);
01872 }
01873 
01874 static int
01875 rb_econv_add_converter(rb_econv_t *ec, const char *sname, const char *dname, int n)
01876 {
01877     transcoder_entry_t *entry;
01878     const rb_transcoder *tr;
01879 
01880     if (ec->started != 0)
01881         return -1;
01882 
01883     entry = get_transcoder_entry(sname, dname);
01884     if (!entry)
01885         return -1;
01886 
01887     tr = load_transcoder_entry(entry);
01888     if (!tr) return -1;
01889 
01890     return rb_econv_add_transcoder_at(ec, tr, n);
01891 }
01892 
01893 static int
01894 rb_econv_decorate_at(rb_econv_t *ec, const char *decorator_name, int n)
01895 {
01896     return rb_econv_add_converter(ec, "", decorator_name, n);
01897 }
01898 
01899 int
01900 rb_econv_decorate_at_first(rb_econv_t *ec, const char *decorator_name)
01901 {
01902     const rb_transcoder *tr;
01903 
01904     if (ec->num_trans == 0)
01905         return rb_econv_decorate_at(ec, decorator_name, 0);
01906 
01907     tr = ec->elems[0].tc->transcoder;
01908 
01909     if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding) &&
01910         tr->asciicompat_type == asciicompat_decoder)
01911         return rb_econv_decorate_at(ec, decorator_name, 1);
01912 
01913     return rb_econv_decorate_at(ec, decorator_name, 0);
01914 }
01915 
01916 int
01917 rb_econv_decorate_at_last(rb_econv_t *ec, const char *decorator_name)
01918 {
01919     const rb_transcoder *tr;
01920 
01921     if (ec->num_trans == 0)
01922         return rb_econv_decorate_at(ec, decorator_name, 0);
01923 
01924     tr = ec->elems[ec->num_trans-1].tc->transcoder;
01925 
01926     if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding) &&
01927         tr->asciicompat_type == asciicompat_encoder)
01928         return rb_econv_decorate_at(ec, decorator_name, ec->num_trans-1);
01929 
01930     return rb_econv_decorate_at(ec, decorator_name, ec->num_trans);
01931 }
01932 
01933 void
01934 rb_econv_binmode(rb_econv_t *ec)
01935 {
01936     const char *dname = 0;
01937 
01938     switch (ec->flags & ECONV_NEWLINE_DECORATOR_MASK) {
01939       case ECONV_UNIVERSAL_NEWLINE_DECORATOR:
01940         dname = "universal_newline";
01941         break;
01942       case ECONV_CRLF_NEWLINE_DECORATOR:
01943         dname = "crlf_newline";
01944         break;
01945       case ECONV_CR_NEWLINE_DECORATOR:
01946         dname = "cr_newline";
01947         break;
01948     }
01949 
01950     if (dname) {
01951         const rb_transcoder *transcoder = get_transcoder_entry("", dname)->transcoder;
01952         int num_trans = ec->num_trans;
01953         int i, j = 0;
01954 
01955         for (i=0; i < num_trans; i++) {
01956             if (transcoder == ec->elems[i].tc->transcoder) {
01957                 rb_transcoding_close(ec->elems[i].tc);
01958                 xfree(ec->elems[i].out_buf_start);
01959                 ec->num_trans--;
01960             }
01961             else
01962                 ec->elems[j++] = ec->elems[i];
01963         }
01964     }
01965 
01966     ec->flags &= ~ECONV_NEWLINE_DECORATOR_MASK;
01967 }
01968 
01969 static VALUE
01970 econv_description(const char *sname, const char *dname, int ecflags, VALUE mesg)
01971 {
01972     int has_description = 0;
01973 
01974     if (NIL_P(mesg))
01975         mesg = rb_str_new(NULL, 0);
01976 
01977     if (*sname != '\0' || *dname != '\0') {
01978         if (*sname == '\0')
01979             rb_str_cat2(mesg, dname);
01980         else if (*dname == '\0')
01981             rb_str_cat2(mesg, sname);
01982         else
01983             rb_str_catf(mesg, "%s to %s", sname, dname);
01984         has_description = 1;
01985     }
01986 
01987     if (ecflags & (ECONV_NEWLINE_DECORATOR_MASK|
01988                    ECONV_XML_TEXT_DECORATOR|
01989                    ECONV_XML_ATTR_CONTENT_DECORATOR|
01990                    ECONV_XML_ATTR_QUOTE_DECORATOR)) {
01991         const char *pre = "";
01992         if (has_description)
01993             rb_str_cat2(mesg, " with ");
01994         if (ecflags & ECONV_UNIVERSAL_NEWLINE_DECORATOR)  {
01995             rb_str_cat2(mesg, pre); pre = ",";
01996             rb_str_cat2(mesg, "universal_newline");
01997         }
01998         if (ecflags & ECONV_CRLF_NEWLINE_DECORATOR) {
01999             rb_str_cat2(mesg, pre); pre = ",";
02000             rb_str_cat2(mesg, "crlf_newline");
02001         }
02002         if (ecflags & ECONV_CR_NEWLINE_DECORATOR) {
02003             rb_str_cat2(mesg, pre); pre = ",";
02004             rb_str_cat2(mesg, "cr_newline");
02005         }
02006         if (ecflags & ECONV_XML_TEXT_DECORATOR) {
02007             rb_str_cat2(mesg, pre); pre = ",";
02008             rb_str_cat2(mesg, "xml_text");
02009         }
02010         if (ecflags & ECONV_XML_ATTR_CONTENT_DECORATOR) {
02011             rb_str_cat2(mesg, pre); pre = ",";
02012             rb_str_cat2(mesg, "xml_attr_content");
02013         }
02014         if (ecflags & ECONV_XML_ATTR_QUOTE_DECORATOR) {
02015             rb_str_cat2(mesg, pre); pre = ",";
02016             rb_str_cat2(mesg, "xml_attr_quote");
02017         }
02018         has_description = 1;
02019     }
02020     if (!has_description) {
02021         rb_str_cat2(mesg, "no-conversion");
02022     }
02023 
02024     return mesg;
02025 }
02026 
02027 VALUE
02028 rb_econv_open_exc(const char *sname, const char *dname, int ecflags)
02029 {
02030     VALUE mesg, exc;
02031     mesg = rb_str_new_cstr("code converter not found (");
02032     econv_description(sname, dname, ecflags, mesg);
02033     rb_str_cat2(mesg, ")");
02034     exc = rb_exc_new3(rb_eConverterNotFoundError, mesg);
02035     return exc;
02036 }
02037 
02038 static VALUE
02039 make_econv_exception(rb_econv_t *ec)
02040 {
02041     VALUE mesg, exc;
02042     if (ec->last_error.result == econv_invalid_byte_sequence ||
02043         ec->last_error.result == econv_incomplete_input) {
02044         const char *err = (const char *)ec->last_error.error_bytes_start;
02045         size_t error_len = ec->last_error.error_bytes_len;
02046         VALUE bytes = rb_str_new(err, error_len);
02047         VALUE dumped = rb_str_dump(bytes);
02048         size_t readagain_len = ec->last_error.readagain_len;
02049         VALUE bytes2 = Qnil;
02050         VALUE dumped2;
02051         int idx;
02052         if (ec->last_error.result == econv_incomplete_input) {
02053             mesg = rb_sprintf("incomplete %s on %s",
02054                     StringValueCStr(dumped),
02055                     ec->last_error.source_encoding);
02056         }
02057         else if (readagain_len) {
02058             bytes2 = rb_str_new(err+error_len, readagain_len);
02059             dumped2 = rb_str_dump(bytes2);
02060             mesg = rb_sprintf("%s followed by %s on %s",
02061                     StringValueCStr(dumped),
02062                     StringValueCStr(dumped2),
02063                     ec->last_error.source_encoding);
02064         }
02065         else {
02066             mesg = rb_sprintf("%s on %s",
02067                     StringValueCStr(dumped),
02068                     ec->last_error.source_encoding);
02069         }
02070 
02071         exc = rb_exc_new3(rb_eInvalidByteSequenceError, mesg);
02072         rb_ivar_set(exc, rb_intern("error_bytes"), bytes);
02073         rb_ivar_set(exc, rb_intern("readagain_bytes"), bytes2);
02074         rb_ivar_set(exc, rb_intern("incomplete_input"), ec->last_error.result == econv_incomplete_input ? Qtrue : Qfalse);
02075 
02076       set_encs:
02077         rb_ivar_set(exc, rb_intern("source_encoding_name"), rb_str_new2(ec->last_error.source_encoding));
02078         rb_ivar_set(exc, rb_intern("destination_encoding_name"), rb_str_new2(ec->last_error.destination_encoding));
02079         idx = rb_enc_find_index(ec->last_error.source_encoding);
02080         if (0 <= idx)
02081             rb_ivar_set(exc, rb_intern("source_encoding"), rb_enc_from_encoding(rb_enc_from_index(idx)));
02082         idx = rb_enc_find_index(ec->last_error.destination_encoding);
02083         if (0 <= idx)
02084             rb_ivar_set(exc, rb_intern("destination_encoding"), rb_enc_from_encoding(rb_enc_from_index(idx)));
02085         return exc;
02086     }
02087     if (ec->last_error.result == econv_undefined_conversion) {
02088         VALUE bytes = rb_str_new((const char *)ec->last_error.error_bytes_start,
02089                                  ec->last_error.error_bytes_len);
02090         VALUE dumped = Qnil;
02091         int idx;
02092         if (strcmp(ec->last_error.source_encoding, "UTF-8") == 0) {
02093             rb_encoding *utf8 = rb_utf8_encoding();
02094             const char *start, *end;
02095             int n;
02096             start = (const char *)ec->last_error.error_bytes_start;
02097             end = start + ec->last_error.error_bytes_len;
02098             n = rb_enc_precise_mbclen(start, end, utf8);
02099             if (MBCLEN_CHARFOUND_P(n) &&
02100                 (size_t)MBCLEN_CHARFOUND_LEN(n) == ec->last_error.error_bytes_len) {
02101                 unsigned int cc = rb_enc_mbc_to_codepoint(start, end, utf8);
02102                 dumped = rb_sprintf("U+%04X", cc);
02103             }
02104         }
02105         if (dumped == Qnil)
02106             dumped = rb_str_dump(bytes);
02107         if (strcmp(ec->last_error.source_encoding,
02108                    ec->source_encoding_name) == 0 &&
02109             strcmp(ec->last_error.destination_encoding,
02110                    ec->destination_encoding_name) == 0) {
02111             mesg = rb_sprintf("%s from %s to %s",
02112                     StringValueCStr(dumped),
02113                     ec->last_error.source_encoding,
02114                     ec->last_error.destination_encoding);
02115         }
02116         else {
02117             int i;
02118             mesg = rb_sprintf("%s to %s in conversion from %s",
02119                     StringValueCStr(dumped),
02120                     ec->last_error.destination_encoding,
02121                     ec->source_encoding_name);
02122             for (i = 0; i < ec->num_trans; i++) {
02123                 const rb_transcoder *tr = ec->elems[i].tc->transcoder;
02124                 if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding))
02125                     rb_str_catf(mesg, " to %s",
02126                                 ec->elems[i].tc->transcoder->dst_encoding);
02127             }
02128         }
02129         exc = rb_exc_new3(rb_eUndefinedConversionError, mesg);
02130         idx = rb_enc_find_index(ec->last_error.source_encoding);
02131         if (0 <= idx)
02132             rb_enc_associate_index(bytes, idx);
02133         rb_ivar_set(exc, rb_intern("error_char"), bytes);
02134         goto set_encs;
02135     }
02136     return Qnil;
02137 }
02138 
02139 static void
02140 more_output_buffer(
02141         VALUE destination,
02142         unsigned char *(*resize_destination)(VALUE, size_t, size_t),
02143         int max_output,
02144         unsigned char **out_start_ptr,
02145         unsigned char **out_pos,
02146         unsigned char **out_stop_ptr)
02147 {
02148     size_t len = (*out_pos - *out_start_ptr);
02149     size_t new_len = (len + max_output) * 2;
02150     *out_start_ptr = resize_destination(destination, len, new_len);
02151     *out_pos = *out_start_ptr + len;
02152     *out_stop_ptr = *out_start_ptr + new_len;
02153 }
02154 
02155 static int
02156 make_replacement(rb_econv_t *ec)
02157 {
02158     rb_transcoding *tc;
02159     const rb_transcoder *tr;
02160     const unsigned char *replacement;
02161     const char *repl_enc;
02162     const char *ins_enc;
02163     size_t len;
02164 
02165     if (ec->replacement_str)
02166         return 0;
02167 
02168     ins_enc = rb_econv_encoding_to_insert_output(ec);
02169 
02170     tc = ec->last_tc;
02171     if (*ins_enc) {
02172         tr = tc->transcoder;
02173         rb_enc_find(tr->dst_encoding);
02174         replacement = (const unsigned char *)get_replacement_character(ins_enc, &len, &repl_enc);
02175     }
02176     else {
02177         replacement = (unsigned char *)"?";
02178         len = 1;
02179         repl_enc = "";
02180     }
02181 
02182     ec->replacement_str = replacement;
02183     ec->replacement_len = len;
02184     ec->replacement_enc = repl_enc;
02185     ec->replacement_allocated = 0;
02186     return 0;
02187 }
02188 
02189 int
02190 rb_econv_set_replacement(rb_econv_t *ec,
02191     const unsigned char *str, size_t len, const char *encname)
02192 {
02193     unsigned char *str2;
02194     size_t len2;
02195     const char *encname2;
02196 
02197     encname2 = rb_econv_encoding_to_insert_output(ec);
02198 
02199     if (encoding_equal(encname, encname2)) {
02200         str2 = xmalloc(len);
02201         MEMCPY(str2, str, unsigned char, len); /* xxx: str may be invalid */
02202         len2 = len;
02203         encname2 = encname;
02204     }
02205     else {
02206         str2 = allocate_converted_string(encname, encname2, str, len, NULL, 0, &len2);
02207         if (!str2)
02208             return -1;
02209     }
02210 
02211     if (ec->replacement_allocated) {
02212         xfree((void *)ec->replacement_str);
02213     }
02214     ec->replacement_allocated = 1;
02215     ec->replacement_str = str2;
02216     ec->replacement_len = len2;
02217     ec->replacement_enc = encname2;
02218     return 0;
02219 }
02220 
02221 static int
02222 output_replacement_character(rb_econv_t *ec)
02223 {
02224     int ret;
02225 
02226     if (make_replacement(ec) == -1)
02227         return -1;
02228 
02229     ret = rb_econv_insert_output(ec, ec->replacement_str, ec->replacement_len, ec->replacement_enc);
02230     if (ret == -1)
02231         return -1;
02232 
02233     return 0;
02234 }
02235 
02236 #if 1
02237 #define hash_fallback rb_hash_aref
02238 
02239 static VALUE
02240 proc_fallback(VALUE fallback, VALUE c)
02241 {
02242     return rb_proc_call(fallback, rb_ary_new4(1, &c));
02243 }
02244 
02245 static VALUE
02246 method_fallback(VALUE fallback, VALUE c)
02247 {
02248     return rb_method_call(1, &c, fallback);
02249 }
02250 
02251 static VALUE
02252 aref_fallback(VALUE fallback, VALUE c)
02253 {
02254     return rb_funcall3(fallback, sym_aref, 1, &c);
02255 }
02256 
02257 static void
02258 transcode_loop(const unsigned char **in_pos, unsigned char **out_pos,
02259                const unsigned char *in_stop, unsigned char *out_stop,
02260                VALUE destination,
02261                unsigned char *(*resize_destination)(VALUE, size_t, size_t),
02262                const char *src_encoding,
02263                const char *dst_encoding,
02264                int ecflags,
02265                VALUE ecopts)
02266 {
02267     rb_econv_t *ec;
02268     rb_transcoding *last_tc;
02269     rb_econv_result_t ret;
02270     unsigned char *out_start = *out_pos;
02271     int max_output;
02272     VALUE exc;
02273     VALUE fallback = Qnil;
02274     VALUE (*fallback_func)(VALUE, VALUE) = 0;
02275 
02276     ec = rb_econv_open_opts(src_encoding, dst_encoding, ecflags, ecopts);
02277     if (!ec)
02278         rb_exc_raise(rb_econv_open_exc(src_encoding, dst_encoding, ecflags));
02279 
02280     if (!NIL_P(ecopts) && RB_TYPE_P(ecopts, T_HASH)) {
02281         fallback = rb_hash_aref(ecopts, sym_fallback);
02282         if (RB_TYPE_P(fallback, T_HASH)) {
02283             fallback_func = hash_fallback;
02284         }
02285         else if (rb_obj_is_proc(fallback)) {
02286             fallback_func = proc_fallback;
02287         }
02288         else if (rb_obj_is_method(fallback)) {
02289             fallback_func = method_fallback;
02290         }
02291         else {
02292             fallback_func = aref_fallback;
02293         }
02294     }
02295     last_tc = ec->last_tc;
02296     max_output = last_tc ? last_tc->transcoder->max_output : 1;
02297 
02298   resume:
02299     ret = rb_econv_convert(ec, in_pos, in_stop, out_pos, out_stop, 0);
02300 
02301     if (!NIL_P(fallback) && ret == econv_undefined_conversion) {
02302         VALUE rep = rb_enc_str_new(
02303                 (const char *)ec->last_error.error_bytes_start,
02304                 ec->last_error.error_bytes_len,
02305                 rb_enc_find(ec->last_error.source_encoding));
02306         rep = (*fallback_func)(fallback, rep);
02307         if (rep != Qundef && !NIL_P(rep)) {
02308             StringValue(rep);
02309             ret = rb_econv_insert_output(ec, (const unsigned char *)RSTRING_PTR(rep),
02310                     RSTRING_LEN(rep), rb_enc_name(rb_enc_get(rep)));
02311             if ((int)ret == -1) {
02312                 rb_raise(rb_eArgError, "too big fallback string");
02313             }
02314             goto resume;
02315         }
02316     }
02317 
02318     if (ret == econv_invalid_byte_sequence ||
02319         ret == econv_incomplete_input ||
02320         ret == econv_undefined_conversion) {
02321         exc = make_econv_exception(ec);
02322         rb_econv_close(ec);
02323         rb_exc_raise(exc);
02324     }
02325 
02326     if (ret == econv_destination_buffer_full) {
02327         more_output_buffer(destination, resize_destination, max_output, &out_start, out_pos, &out_stop);
02328         goto resume;
02329     }
02330 
02331     rb_econv_close(ec);
02332     return;
02333 }
02334 #else
02335 /* sample transcode_loop implementation in byte-by-byte stream style */
02336 static void
02337 transcode_loop(const unsigned char **in_pos, unsigned char **out_pos,
02338                const unsigned char *in_stop, unsigned char *out_stop,
02339                VALUE destination,
02340                unsigned char *(*resize_destination)(VALUE, size_t, size_t),
02341                const char *src_encoding,
02342                const char *dst_encoding,
02343                int ecflags,
02344                VALUE ecopts)
02345 {
02346     rb_econv_t *ec;
02347     rb_transcoding *last_tc;
02348     rb_econv_result_t ret;
02349     unsigned char *out_start = *out_pos;
02350     const unsigned char *ptr;
02351     int max_output;
02352     VALUE exc;
02353 
02354     ec = rb_econv_open_opts(src_encoding, dst_encoding, ecflags, ecopts);
02355     if (!ec)
02356         rb_exc_raise(rb_econv_open_exc(src_encoding, dst_encoding, ecflags));
02357 
02358     last_tc = ec->last_tc;
02359     max_output = last_tc ? last_tc->transcoder->max_output : 1;
02360 
02361     ret = econv_source_buffer_empty;
02362     ptr = *in_pos;
02363     while (ret != econv_finished) {
02364         unsigned char input_byte;
02365         const unsigned char *p = &input_byte;
02366 
02367         if (ret == econv_source_buffer_empty) {
02368             if (ptr < in_stop) {
02369                 input_byte = *ptr;
02370                 ret = rb_econv_convert(ec, &p, p+1, out_pos, out_stop, ECONV_PARTIAL_INPUT);
02371             }
02372             else {
02373                 ret = rb_econv_convert(ec, NULL, NULL, out_pos, out_stop, 0);
02374             }
02375         }
02376         else {
02377             ret = rb_econv_convert(ec, NULL, NULL, out_pos, out_stop, ECONV_PARTIAL_INPUT);
02378         }
02379         if (&input_byte != p)
02380             ptr += p - &input_byte;
02381         switch (ret) {
02382           case econv_invalid_byte_sequence:
02383           case econv_incomplete_input:
02384           case econv_undefined_conversion:
02385             exc = make_econv_exception(ec);
02386             rb_econv_close(ec);
02387             rb_exc_raise(exc);
02388             break;
02389 
02390           case econv_destination_buffer_full:
02391             more_output_buffer(destination, resize_destination, max_output, &out_start, out_pos, &out_stop);
02392             break;
02393 
02394           case econv_source_buffer_empty:
02395             break;
02396 
02397           case econv_finished:
02398             break;
02399         }
02400     }
02401     rb_econv_close(ec);
02402     *in_pos = in_stop;
02403     return;
02404 }
02405 #endif
02406 
02407 
02408 /*
02409  *  String-specific code
02410  */
02411 
02412 static unsigned char *
02413 str_transcoding_resize(VALUE destination, size_t len, size_t new_len)
02414 {
02415     rb_str_resize(destination, new_len);
02416     return (unsigned char *)RSTRING_PTR(destination);
02417 }
02418 
02419 static int
02420 econv_opts(VALUE opt, int ecflags)
02421 {
02422     VALUE v;
02423 
02424     v = rb_hash_aref(opt, sym_invalid);
02425     if (NIL_P(v)) {
02426     }
02427     else if (v==sym_replace) {
02428         ecflags |= ECONV_INVALID_REPLACE;
02429     }
02430     else {
02431         rb_raise(rb_eArgError, "unknown value for invalid character option");
02432     }
02433 
02434     v = rb_hash_aref(opt, sym_undef);
02435     if (NIL_P(v)) {
02436     }
02437     else if (v==sym_replace) {
02438         ecflags |= ECONV_UNDEF_REPLACE;
02439     }
02440     else {
02441         rb_raise(rb_eArgError, "unknown value for undefined character option");
02442     }
02443 
02444     v = rb_hash_aref(opt, sym_replace);
02445     if (!NIL_P(v) && !(ecflags & ECONV_INVALID_REPLACE)) {
02446         ecflags |= ECONV_UNDEF_REPLACE;
02447     }
02448 
02449     v = rb_hash_aref(opt, sym_xml);
02450     if (!NIL_P(v)) {
02451         if (v==sym_text) {
02452             ecflags |= ECONV_XML_TEXT_DECORATOR|ECONV_UNDEF_HEX_CHARREF;
02453         }
02454         else if (v==sym_attr) {
02455             ecflags |= ECONV_XML_ATTR_CONTENT_DECORATOR|ECONV_XML_ATTR_QUOTE_DECORATOR|ECONV_UNDEF_HEX_CHARREF;
02456         }
02457         else if (RB_TYPE_P(v, T_SYMBOL)) {
02458             rb_raise(rb_eArgError, "unexpected value for xml option: %s", rb_id2name(SYM2ID(v)));
02459         }
02460         else {
02461             rb_raise(rb_eArgError, "unexpected value for xml option");
02462         }
02463     }
02464 
02465 #ifdef ENABLE_ECONV_NEWLINE_OPTION
02466     v = rb_hash_aref(opt, sym_newline);
02467     if (!NIL_P(v)) {
02468         ecflags &= ~ECONV_NEWLINE_DECORATOR_MASK;
02469         if (v == sym_universal) {
02470             ecflags |= ECONV_UNIVERSAL_NEWLINE_DECORATOR;
02471         }
02472         else if (v == sym_crlf) {
02473             ecflags |= ECONV_CRLF_NEWLINE_DECORATOR;
02474         }
02475         else if (v == sym_cr) {
02476             ecflags |= ECONV_CR_NEWLINE_DECORATOR;
02477         }
02478         else if (v == sym_lf) {
02479             /* ecflags |= ECONV_LF_NEWLINE_DECORATOR; */
02480         }
02481         else if (SYMBOL_P(v)) {
02482             rb_raise(rb_eArgError, "unexpected value for newline option: %s",
02483                      rb_id2name(SYM2ID(v)));
02484         }
02485         else {
02486             rb_raise(rb_eArgError, "unexpected value for newline option");
02487         }
02488     }
02489     else
02490 #endif
02491     {
02492         int setflags = 0, newlineflag = 0;
02493 
02494         v = rb_hash_aref(opt, sym_universal_newline);
02495         if (RTEST(v))
02496             setflags |= ECONV_UNIVERSAL_NEWLINE_DECORATOR;
02497         newlineflag |= !NIL_P(v);
02498 
02499         v = rb_hash_aref(opt, sym_crlf_newline);
02500         if (RTEST(v))
02501             setflags |= ECONV_CRLF_NEWLINE_DECORATOR;
02502         newlineflag |= !NIL_P(v);
02503 
02504         v = rb_hash_aref(opt, sym_cr_newline);
02505         if (RTEST(v))
02506             setflags |= ECONV_CR_NEWLINE_DECORATOR;
02507         newlineflag |= !NIL_P(v);
02508 
02509         if (newlineflag) {
02510             ecflags &= ~ECONV_NEWLINE_DECORATOR_MASK;
02511             ecflags |= setflags;
02512         }
02513     }
02514 
02515     return ecflags;
02516 }
02517 
02518 int
02519 rb_econv_prepare_options(VALUE opthash, VALUE *opts, int ecflags)
02520 {
02521     VALUE newhash = Qnil;
02522     VALUE v;
02523 
02524     if (NIL_P(opthash)) {
02525         *opts = Qnil;
02526         return ecflags;
02527     }
02528     ecflags = econv_opts(opthash, ecflags);
02529 
02530     v = rb_hash_aref(opthash, sym_replace);
02531     if (!NIL_P(v)) {
02532         StringValue(v);
02533         if (rb_enc_str_coderange(v) == ENC_CODERANGE_BROKEN) {
02534             VALUE dumped = rb_str_dump(v);
02535             rb_raise(rb_eArgError, "replacement string is broken: %s as %s",
02536                      StringValueCStr(dumped),
02537                      rb_enc_name(rb_enc_get(v)));
02538         }
02539         v = rb_str_new_frozen(v);
02540         newhash = rb_hash_new();
02541         rb_hash_aset(newhash, sym_replace, v);
02542     }
02543 
02544     v = rb_hash_aref(opthash, sym_fallback);
02545     if (!NIL_P(v)) {
02546         VALUE h = rb_check_hash_type(v);
02547         if (NIL_P(h)
02548             ? (rb_obj_is_proc(v) || rb_obj_is_method(v) || rb_respond_to(v, sym_aref))
02549             : (v = h, 1)) {
02550             if (NIL_P(newhash))
02551                 newhash = rb_hash_new();
02552             rb_hash_aset(newhash, sym_fallback, v);
02553         }
02554     }
02555 
02556     if (!NIL_P(newhash))
02557         rb_hash_freeze(newhash);
02558     *opts = newhash;
02559 
02560     return ecflags;
02561 }
02562 
02563 int
02564 rb_econv_prepare_opts(VALUE opthash, VALUE *opts)
02565 {
02566     return rb_econv_prepare_options(opthash, opts, 0);
02567 }
02568 
02569 rb_econv_t *
02570 rb_econv_open_opts(const char *source_encoding, const char *destination_encoding, int ecflags, VALUE opthash)
02571 {
02572     rb_econv_t *ec;
02573     VALUE replacement;
02574 
02575     if (NIL_P(opthash)) {
02576         replacement = Qnil;
02577     }
02578     else {
02579         if (!RB_TYPE_P(opthash, T_HASH) || !OBJ_FROZEN(opthash))
02580             rb_bug("rb_econv_open_opts called with invalid opthash");
02581         replacement = rb_hash_aref(opthash, sym_replace);
02582     }
02583 
02584     ec = rb_econv_open(source_encoding, destination_encoding, ecflags);
02585     if (!ec)
02586         return ec;
02587 
02588     if (!NIL_P(replacement)) {
02589         int ret;
02590         rb_encoding *enc = rb_enc_get(replacement);
02591 
02592         ret = rb_econv_set_replacement(ec,
02593                 (const unsigned char *)RSTRING_PTR(replacement),
02594                 RSTRING_LEN(replacement),
02595                 rb_enc_name(enc));
02596         if (ret == -1) {
02597             rb_econv_close(ec);
02598             return NULL;
02599         }
02600     }
02601     return ec;
02602 }
02603 
02604 static int
02605 enc_arg(volatile VALUE *arg, const char **name_p, rb_encoding **enc_p)
02606 {
02607     rb_encoding *enc;
02608     const char *n;
02609     int encidx;
02610     VALUE encval;
02611 
02612     if (((encidx = rb_to_encoding_index(encval = *arg)) < 0) ||
02613         !(enc = rb_enc_from_index(encidx))) {
02614         enc = NULL;
02615         encidx = 0;
02616         n = StringValueCStr(*arg);
02617     }
02618     else {
02619         n = rb_enc_name(enc);
02620     }
02621 
02622     *name_p = n;
02623     *enc_p = enc;
02624 
02625     return encidx;
02626 }
02627 
02628 static int
02629 str_transcode_enc_args(VALUE str, volatile VALUE *arg1, volatile VALUE *arg2,
02630         const char **sname_p, rb_encoding **senc_p,
02631         const char **dname_p, rb_encoding **denc_p)
02632 {
02633     rb_encoding *senc, *denc;
02634     const char *sname, *dname;
02635     int sencidx, dencidx;
02636 
02637     dencidx = enc_arg(arg1, &dname, &denc);
02638 
02639     if (NIL_P(*arg2)) {
02640         sencidx = rb_enc_get_index(str);
02641         senc = rb_enc_from_index(sencidx);
02642         sname = rb_enc_name(senc);
02643     }
02644     else {
02645         sencidx = enc_arg(arg2, &sname, &senc);
02646     }
02647 
02648     *sname_p = sname;
02649     *senc_p = senc;
02650     *dname_p = dname;
02651     *denc_p = denc;
02652     return dencidx;
02653 }
02654 
02655 static int
02656 str_transcode0(int argc, VALUE *argv, VALUE *self, int ecflags, VALUE ecopts)
02657 {
02658     VALUE dest;
02659     VALUE str = *self;
02660     volatile VALUE arg1, arg2;
02661     long blen, slen;
02662     unsigned char *buf, *bp, *sp;
02663     const unsigned char *fromp;
02664     rb_encoding *senc, *denc;
02665     const char *sname, *dname;
02666     int dencidx;
02667 
02668     rb_check_arity(argc, 0, 2);
02669 
02670     if (argc == 0) {
02671         arg1 = rb_enc_default_internal();
02672         if (NIL_P(arg1)) {
02673             if (!ecflags) return -1;
02674             arg1 = rb_obj_encoding(str);
02675         }
02676         ecflags |= ECONV_INVALID_REPLACE | ECONV_UNDEF_REPLACE;
02677     }
02678     else {
02679         arg1 = argv[0];
02680     }
02681     arg2 = argc<=1 ? Qnil : argv[1];
02682     dencidx = str_transcode_enc_args(str, &arg1, &arg2, &sname, &senc, &dname, &denc);
02683 
02684     if ((ecflags & (ECONV_NEWLINE_DECORATOR_MASK|
02685                     ECONV_XML_TEXT_DECORATOR|
02686                     ECONV_XML_ATTR_CONTENT_DECORATOR|
02687                     ECONV_XML_ATTR_QUOTE_DECORATOR)) == 0) {
02688         if (senc && senc == denc) {
02689             return NIL_P(arg2) ? -1 : dencidx;
02690         }
02691         if (senc && denc && rb_enc_asciicompat(senc) && rb_enc_asciicompat(denc)) {
02692             if (rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT) {
02693                 return dencidx;
02694             }
02695         }
02696         if (encoding_equal(sname, dname)) {
02697             return NIL_P(arg2) ? -1 : dencidx;
02698         }
02699     }
02700     else {
02701         if (encoding_equal(sname, dname)) {
02702             sname = "";
02703             dname = "";
02704         }
02705     }
02706 
02707     fromp = sp = (unsigned char *)RSTRING_PTR(str);
02708     slen = RSTRING_LEN(str);
02709     blen = slen + 30; /* len + margin */
02710     dest = rb_str_tmp_new(blen);
02711     bp = (unsigned char *)RSTRING_PTR(dest);
02712 
02713     transcode_loop(&fromp, &bp, (sp+slen), (bp+blen), dest, str_transcoding_resize, sname, dname, ecflags, ecopts);
02714     if (fromp != sp+slen) {
02715         rb_raise(rb_eArgError, "not fully converted, %"PRIdPTRDIFF" bytes left", sp+slen-fromp);
02716     }
02717     buf = (unsigned char *)RSTRING_PTR(dest);
02718     *bp = '\0';
02719     rb_str_set_len(dest, bp - buf);
02720 
02721     /* set encoding */
02722     if (!denc) {
02723         dencidx = rb_define_dummy_encoding(dname);
02724     }
02725     *self = dest;
02726 
02727     return dencidx;
02728 }
02729 
02730 static int
02731 str_transcode(int argc, VALUE *argv, VALUE *self)
02732 {
02733     VALUE opt;
02734     int ecflags = 0;
02735     VALUE ecopts = Qnil;
02736 
02737     argc = rb_scan_args(argc, argv, "02:", NULL, NULL, &opt);
02738     if (!NIL_P(opt)) {
02739         ecflags = rb_econv_prepare_opts(opt, &ecopts);
02740     }
02741     return str_transcode0(argc, argv, self, ecflags, ecopts);
02742 }
02743 
02744 static inline VALUE
02745 str_encode_associate(VALUE str, int encidx)
02746 {
02747     int cr = 0;
02748 
02749     rb_enc_associate_index(str, encidx);
02750 
02751     /* transcoded string never be broken. */
02752     if (rb_enc_asciicompat(rb_enc_from_index(encidx))) {
02753         rb_str_coderange_scan_restartable(RSTRING_PTR(str), RSTRING_END(str), 0, &cr);
02754     }
02755     else {
02756         cr = ENC_CODERANGE_VALID;
02757     }
02758     ENC_CODERANGE_SET(str, cr);
02759     return str;
02760 }
02761 
02762 /*
02763  *  call-seq:
02764  *     str.encode!(encoding [, options] )   -> str
02765  *     str.encode!(dst_encoding, src_encoding [, options] )   -> str
02766  *
02767  *  The first form transcodes the contents of <i>str</i> from
02768  *  str.encoding to +encoding+.
02769  *  The second form transcodes the contents of <i>str</i> from
02770  *  src_encoding to dst_encoding.
02771  *  The options Hash gives details for conversion. See String#encode
02772  *  for details.
02773  *  Returns the string even if no changes were made.
02774  */
02775 
02776 static VALUE
02777 str_encode_bang(int argc, VALUE *argv, VALUE str)
02778 {
02779     VALUE newstr;
02780     int encidx;
02781 
02782     rb_check_frozen(str);
02783 
02784     newstr = str;
02785     encidx = str_transcode(argc, argv, &newstr);
02786 
02787     if (encidx < 0) return str;
02788     if (newstr == str) {
02789         rb_enc_associate_index(str, encidx);
02790         return str;
02791     }
02792     rb_str_shared_replace(str, newstr);
02793     return str_encode_associate(str, encidx);
02794 }
02795 
02796 static VALUE encoded_dup(VALUE newstr, VALUE str, int encidx);
02797 
02798 /*
02799  *  call-seq:
02800  *     str.encode(encoding [, options] )   -> str
02801  *     str.encode(dst_encoding, src_encoding [, options] )   -> str
02802  *     str.encode([options])   -> str
02803  *
02804  *  The first form returns a copy of +str+ transcoded
02805  *  to encoding +encoding+.
02806  *  The second form returns a copy of +str+ transcoded
02807  *  from src_encoding to dst_encoding.
02808  *  The last form returns a copy of +str+ transcoded to
02809  *  <tt>Encoding.default_internal</tt>.
02810  *
02811  *  By default, the first and second form raise
02812  *  Encoding::UndefinedConversionError for characters that are
02813  *  undefined in the destination encoding, and
02814  *  Encoding::InvalidByteSequenceError for invalid byte sequences
02815  *  in the source encoding. The last form by default does not raise
02816  *  exceptions but uses replacement strings.
02817  *
02818  *  Please note that conversion from an encoding +enc+ to the
02819  *  same encoding +enc+ is a no-op, i.e. the receiver is returned without
02820  *  any changes, and no exceptions are raised, even if there are invalid bytes.
02821  *
02822  *  The +options+ Hash gives details for conversion and can have the following
02823  *  keys:
02824  *
02825  *  :invalid ::
02826  *    If the value is +:replace+, #encode replaces invalid byte sequences in
02827  *    +str+ with the replacement character.  The default is to raise the
02828  *    Encoding::InvalidByteSequenceError exception
02829  *  :undef ::
02830  *    If the value is +:replace+, #encode replaces characters which are
02831  *    undefined in the destination encoding with the replacement character.
02832  *    The default is to raise the Encoding::UndefinedConversionError.
02833  *  :replace ::
02834  *    Sets the replacement string to the given value. The default replacement
02835  *    string is "\uFFFD" for Unicode encoding forms, and "?" otherwise.
02836  *  :fallback ::
02837  *    Sets the replacement string by the given object for undefined
02838  *    character.  The object should be a Hash, a Proc, a Method, or an
02839  *    object which has [] method.
 *    The key is the undefined character, encoded in the source encoding
 *    of the current transcoder. The value may be in any encoding, as long as
 *    it can be converted into the destination encoding of the transcoder.
02843  *  :xml ::
02844  *    The value must be +:text+ or +:attr+.
02845  *    If the value is +:text+ #encode replaces undefined characters with their
02846  *    (upper-case hexadecimal) numeric character references. '&', '<', and '>'
02847  *    are converted to "&amp;", "&lt;", and "&gt;", respectively.
02848  *    If the value is +:attr+, #encode also quotes the replacement result
02849  *    (using '"'), and replaces '"' with "&quot;".
02850  *  :cr_newline ::
02851  *    Replaces LF ("\n") with CR ("\r") if value is true.
02852  *  :crlf_newline ::
02853  *    Replaces LF ("\n") with CRLF ("\r\n") if value is true.
02854  *  :universal_newline ::
02855  *    Replaces CRLF ("\r\n") and CR ("\r") with LF ("\n") if value is true.
02856  */
02857 
02858 static VALUE
02859 str_encode(int argc, VALUE *argv, VALUE str)
02860 {
02861     VALUE newstr = str;
02862     int encidx = str_transcode(argc, argv, &newstr);
02863     return encoded_dup(newstr, str, encidx);
02864 }
02865 
02866 VALUE
02867 rb_str_encode(VALUE str, VALUE to, int ecflags, VALUE ecopts)
02868 {
02869     int argc = 1;
02870     VALUE *argv = &to;
02871     VALUE newstr = str;
02872     int encidx = str_transcode0(argc, argv, &newstr, ecflags, ecopts);
02873     return encoded_dup(newstr, str, encidx);
02874 }
02875 
02876 static VALUE
02877 encoded_dup(VALUE newstr, VALUE str, int encidx)
02878 {
02879     if (encidx < 0) return rb_str_dup(str);
02880     if (newstr == str) {
02881         newstr = rb_str_dup(str);
02882         rb_enc_associate_index(newstr, encidx);
02883         return newstr;
02884     }
02885     else {
02886         RBASIC(newstr)->klass = rb_obj_class(str);
02887     }
02888     return str_encode_associate(newstr, encidx);
02889 }
02890 
02891 static void
02892 econv_free(void *ptr)
02893 {
02894     rb_econv_t *ec = ptr;
02895     rb_econv_close(ec);
02896 }
02897 
02898 static size_t
02899 econv_memsize(const void *ptr)
02900 {
02901     return ptr ? sizeof(rb_econv_t) : 0;
02902 }
02903 
02904 static const rb_data_type_t econv_data_type = {
02905     "econv",
02906     {NULL, econv_free, econv_memsize,},
02907 };
02908 
02909 static VALUE
02910 econv_s_allocate(VALUE klass)
02911 {
02912     return TypedData_Wrap_Struct(klass, &econv_data_type, NULL);
02913 }
02914 
02915 static rb_encoding *
02916 make_dummy_encoding(const char *name)
02917 {
02918     rb_encoding *enc;
02919     int idx;
02920     idx = rb_define_dummy_encoding(name);
02921     enc = rb_enc_from_index(idx);
02922     return enc;
02923 }
02924 
02925 static rb_encoding *
02926 make_encoding(const char *name)
02927 {
02928     rb_encoding *enc;
02929     enc = rb_enc_find(name);
02930     if (!enc)
02931         enc = make_dummy_encoding(name);
02932     return enc;
02933 }
02934 
02935 static VALUE
02936 make_encobj(const char *name)
02937 {
02938     return rb_enc_from_encoding(make_encoding(name));
02939 }
02940 
02941 /*
02942  * call-seq:
02943  *   Encoding::Converter.asciicompat_encoding(string) -> encoding or nil
02944  *   Encoding::Converter.asciicompat_encoding(encoding) -> encoding or nil
02945  *
02946  * Returns the corresponding ASCII compatible encoding.
02947  *
02948  * Returns nil if the argument is an ASCII compatible encoding.
02949  *
 * The "corresponding ASCII compatible encoding" is an ASCII compatible encoding which
 * can represent exactly the same characters as the given ASCII incompatible encoding.
 * So, no undefined conversion error occurs when converting between the two encodings.
02953  *
02954  *   Encoding::Converter.asciicompat_encoding("ISO-2022-JP") #=> #<Encoding:stateless-ISO-2022-JP>
02955  *   Encoding::Converter.asciicompat_encoding("UTF-16BE") #=> #<Encoding:UTF-8>
02956  *   Encoding::Converter.asciicompat_encoding("UTF-8") #=> nil
02957  *
02958  */
02959 static VALUE
02960 econv_s_asciicompat_encoding(VALUE klass, VALUE arg)
02961 {
02962     const char *arg_name, *result_name;
02963     rb_encoding *arg_enc, *result_enc;
02964 
02965     enc_arg(&arg, &arg_name, &arg_enc);
02966 
02967     result_name = rb_econv_asciicompat_encoding(arg_name);
02968 
02969     if (result_name == NULL)
02970         return Qnil;
02971 
02972     result_enc = make_encoding(result_name);
02973 
02974     return rb_enc_from_encoding(result_enc);
02975 }
02976 
02977 static void
02978 econv_args(int argc, VALUE *argv,
02979     volatile VALUE *snamev_p, volatile VALUE *dnamev_p,
02980     const char **sname_p, const char **dname_p,
02981     rb_encoding **senc_p, rb_encoding **denc_p,
02982     int *ecflags_p,
02983     VALUE *ecopts_p)
02984 {
02985     VALUE opt, flags_v, ecopts;
02986     int sidx, didx;
02987     const char *sname, *dname;
02988     rb_encoding *senc, *denc;
02989     int ecflags;
02990 
02991     argc = rb_scan_args(argc, argv, "21:", snamev_p, dnamev_p, &flags_v, &opt);
02992 
02993     if (!NIL_P(flags_v)) {
02994         if (!NIL_P(opt)) {
02995             rb_error_arity(argc + 1, 2, 3);
02996         }
02997         ecflags = NUM2INT(rb_to_int(flags_v));
02998         ecopts = Qnil;
02999     }
03000     else if (!NIL_P(opt)) {
03001         ecflags = rb_econv_prepare_opts(opt, &ecopts);
03002     }
03003     else {
03004         ecflags = 0;
03005         ecopts = Qnil;
03006     }
03007 
03008     senc = NULL;
03009     sidx = rb_to_encoding_index(*snamev_p);
03010     if (0 <= sidx) {
03011         senc = rb_enc_from_index(sidx);
03012     }
03013     else {
03014         StringValue(*snamev_p);
03015     }
03016 
03017     denc = NULL;
03018     didx = rb_to_encoding_index(*dnamev_p);
03019     if (0 <= didx) {
03020         denc = rb_enc_from_index(didx);
03021     }
03022     else {
03023         StringValue(*dnamev_p);
03024     }
03025 
03026     sname = senc ? rb_enc_name(senc) : StringValueCStr(*snamev_p);
03027     dname = denc ? rb_enc_name(denc) : StringValueCStr(*dnamev_p);
03028 
03029     *sname_p = sname;
03030     *dname_p = dname;
03031     *senc_p = senc;
03032     *denc_p = denc;
03033     *ecflags_p = ecflags;
03034     *ecopts_p = ecopts;
03035 }
03036 
03037 static int
03038 decorate_convpath(VALUE convpath, int ecflags)
03039 {
03040     int num_decorators;
03041     const char *decorators[MAX_ECFLAGS_DECORATORS];
03042     int i;
03043     int n, len;
03044 
03045     num_decorators = decorator_names(ecflags, decorators);
03046     if (num_decorators == -1)
03047         return -1;
03048 
03049     len = n = RARRAY_LENINT(convpath);
03050     if (n != 0) {
03051         VALUE pair = RARRAY_PTR(convpath)[n-1];
03052         if (RB_TYPE_P(pair, T_ARRAY)) {
03053             const char *sname = rb_enc_name(rb_to_encoding(RARRAY_PTR(pair)[0]));
03054             const char *dname = rb_enc_name(rb_to_encoding(RARRAY_PTR(pair)[1]));
03055             transcoder_entry_t *entry = get_transcoder_entry(sname, dname);
03056             const rb_transcoder *tr = load_transcoder_entry(entry);
03057             if (!tr)
03058                 return -1;
03059             if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding) &&
03060                     tr->asciicompat_type == asciicompat_encoder) {
03061                 n--;
03062                 rb_ary_store(convpath, len + num_decorators - 1, pair);
03063             }
03064         }
03065         else {
03066             rb_ary_store(convpath, len + num_decorators - 1, pair);
03067         }
03068     }
03069 
03070     for (i = 0; i < num_decorators; i++)
03071         rb_ary_store(convpath, n + i, rb_str_new_cstr(decorators[i]));
03072 
03073     return 0;
03074 }
03075 
03076 static void
03077 search_convpath_i(const char *sname, const char *dname, int depth, void *arg)
03078 {
03079     VALUE *ary_p = arg;
03080     VALUE v;
03081 
03082     if (*ary_p == Qnil) {
03083         *ary_p = rb_ary_new();
03084     }
03085 
03086     if (DECORATOR_P(sname, dname)) {
03087         v = rb_str_new_cstr(dname);
03088     }
03089     else {
03090         v = rb_assoc_new(make_encobj(sname), make_encobj(dname));
03091     }
03092     rb_ary_store(*ary_p, depth, v);
03093 }
03094 
03095 /*
03096  * call-seq:
03097  *   Encoding::Converter.search_convpath(source_encoding, destination_encoding)         -> ary
03098  *   Encoding::Converter.search_convpath(source_encoding, destination_encoding, opt)    -> ary
03099  *
03100  *  Returns a conversion path.
03101  *
03102  *   p Encoding::Converter.search_convpath("ISO-8859-1", "EUC-JP")
03103  *   #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>],
03104  *   #    [#<Encoding:UTF-8>, #<Encoding:EUC-JP>]]
03105  *
03106  *   p Encoding::Converter.search_convpath("ISO-8859-1", "EUC-JP", universal_newline: true)
03107  *   or
03108  *   p Encoding::Converter.search_convpath("ISO-8859-1", "EUC-JP", newline: :universal)
03109  *   #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>],
03110  *   #    [#<Encoding:UTF-8>, #<Encoding:EUC-JP>],
03111  *   #    "universal_newline"]
03112  *
03113  *   p Encoding::Converter.search_convpath("ISO-8859-1", "UTF-32BE", universal_newline: true)
03114  *   or
03115  *   p Encoding::Converter.search_convpath("ISO-8859-1", "UTF-32BE", newline: :universal)
03116  *   #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>],
03117  *   #    "universal_newline",
03118  *   #    [#<Encoding:UTF-8>, #<Encoding:UTF-32BE>]]
03119  */
03120 static VALUE
03121 econv_s_search_convpath(int argc, VALUE *argv, VALUE klass)
03122 {
03123     volatile VALUE snamev, dnamev;
03124     const char *sname, *dname;
03125     rb_encoding *senc, *denc;
03126     int ecflags;
03127     VALUE ecopts;
03128     VALUE convpath;
03129 
03130     econv_args(argc, argv, &snamev, &dnamev, &sname, &dname, &senc, &denc, &ecflags, &ecopts);
03131 
03132     convpath = Qnil;
03133     transcode_search_path(sname, dname, search_convpath_i, &convpath);
03134 
03135     if (NIL_P(convpath))
03136         rb_exc_raise(rb_econv_open_exc(sname, dname, ecflags));
03137 
03138     if (decorate_convpath(convpath, ecflags) == -1)
03139         rb_exc_raise(rb_econv_open_exc(sname, dname, ecflags));
03140 
03141     return convpath;
03142 }
03143 
03144 /*
03145  * Check the existence of a conversion path.
03146  * Returns the number of converters in the conversion path.
03147  * result: >=0:success -1:failure
03148  */
03149 int
03150 rb_econv_has_convpath_p(const char* from_encoding, const char* to_encoding)
03151 {
03152     VALUE convpath = Qnil;
03153     transcode_search_path(from_encoding, to_encoding, search_convpath_i,
03154                           &convpath);
03155     return RTEST(convpath);
03156 }
03157 
03158 struct rb_econv_init_by_convpath_t {
03159     rb_econv_t *ec;
03160     int index;
03161     int ret;
03162 };
03163 
03164 static void
03165 rb_econv_init_by_convpath_i(const char *sname, const char *dname, int depth, void *arg)
03166 {
03167     struct rb_econv_init_by_convpath_t *a = (struct rb_econv_init_by_convpath_t *)arg;
03168     int ret;
03169 
03170     if (a->ret == -1)
03171         return;
03172 
03173     ret = rb_econv_add_converter(a->ec, sname, dname, a->index);
03174 
03175     a->ret = ret;
03176     return;
03177 }
03178 
03179 static rb_econv_t *
03180 rb_econv_init_by_convpath(VALUE self, VALUE convpath,
03181     const char **sname_p, const char **dname_p,
03182     rb_encoding **senc_p, rb_encoding**denc_p)
03183 {
03184     rb_econv_t *ec;
03185     long i;
03186     int ret, first=1;
03187     VALUE elt;
03188     rb_encoding *senc = 0, *denc = 0;
03189     const char *sname, *dname;
03190 
03191     ec = rb_econv_alloc(RARRAY_LENINT(convpath));
03192     DATA_PTR(self) = ec;
03193 
03194     for (i = 0; i < RARRAY_LEN(convpath); i++) {
03195         volatile VALUE snamev, dnamev;
03196         VALUE pair;
03197         elt = rb_ary_entry(convpath, i);
03198         if (!NIL_P(pair = rb_check_array_type(elt))) {
03199             if (RARRAY_LEN(pair) != 2)
03200                 rb_raise(rb_eArgError, "not a 2-element array in convpath");
03201             snamev = rb_ary_entry(pair, 0);
03202             enc_arg(&snamev, &sname, &senc);
03203             dnamev = rb_ary_entry(pair, 1);
03204             enc_arg(&dnamev, &dname, &denc);
03205         }
03206         else {
03207             sname = "";
03208             dname = StringValueCStr(elt);
03209         }
03210         if (DECORATOR_P(sname, dname)) {
03211             ret = rb_econv_add_converter(ec, sname, dname, ec->num_trans);
03212             if (ret == -1)
03213                 rb_raise(rb_eArgError, "decoration failed: %s", dname);
03214         }
03215         else {
03216             int j = ec->num_trans;
03217             struct rb_econv_init_by_convpath_t arg;
03218             arg.ec = ec;
03219             arg.index = ec->num_trans;
03220             arg.ret = 0;
03221             ret = transcode_search_path(sname, dname, rb_econv_init_by_convpath_i, &arg);
03222             if (ret == -1 || arg.ret == -1)
03223                 rb_raise(rb_eArgError, "adding conversion failed: %s to %s", sname, dname);
03224             if (first) {
03225                 first = 0;
03226                 *senc_p = senc;
03227                 *sname_p = ec->elems[j].tc->transcoder->src_encoding;
03228             }
03229             *denc_p = denc;
03230             *dname_p = ec->elems[ec->num_trans-1].tc->transcoder->dst_encoding;
03231         }
03232     }
03233 
03234     if (first) {
03235       *senc_p = NULL;
03236       *denc_p = NULL;
03237       *sname_p = "";
03238       *dname_p = "";
03239     }
03240 
03241     ec->source_encoding_name = *sname_p;
03242     ec->destination_encoding_name = *dname_p;
03243 
03244     return ec;
03245 }
03246 
03247 /*
03248  * call-seq:
03249  *   Encoding::Converter.new(source_encoding, destination_encoding)
03250  *   Encoding::Converter.new(source_encoding, destination_encoding, opt)
03251  *   Encoding::Converter.new(convpath)
03252  *
03253  * possible options elements:
03254  *   hash form:
03255  *     :invalid => nil            # raise error on invalid byte sequence (default)
03256  *     :invalid => :replace       # replace invalid byte sequence
03257  *     :undef => nil              # raise error on undefined conversion (default)
03258  *     :undef => :replace         # replace undefined conversion
03259  *     :replace => string         # replacement string ("?" or "\uFFFD" if not specified)
03260  *     :newline => :universal     # decorator for converting CRLF and CR to LF
03261  *     :newline => :crlf          # decorator for converting LF to CRLF
03262  *     :newline => :cr            # decorator for converting LF to CR
03263  *     :universal_newline => true # decorator for converting CRLF and CR to LF
03264  *     :crlf_newline => true      # decorator for converting LF to CRLF
03265  *     :cr_newline => true        # decorator for converting LF to CR
03266  *     :xml => :text              # escape as XML CharData.
03267  *     :xml => :attr              # escape as XML AttValue
03268  *   integer form:
03269  *     Encoding::Converter::INVALID_REPLACE
03270  *     Encoding::Converter::UNDEF_REPLACE
03271  *     Encoding::Converter::UNDEF_HEX_CHARREF
03272  *     Encoding::Converter::UNIVERSAL_NEWLINE_DECORATOR
03273  *     Encoding::Converter::CRLF_NEWLINE_DECORATOR
03274  *     Encoding::Converter::CR_NEWLINE_DECORATOR
03275  *     Encoding::Converter::XML_TEXT_DECORATOR
03276  *     Encoding::Converter::XML_ATTR_CONTENT_DECORATOR
03277  *     Encoding::Converter::XML_ATTR_QUOTE_DECORATOR
03278  *
03279  * Encoding::Converter.new creates an instance of Encoding::Converter.
03280  *
03281  * Source_encoding and destination_encoding should be a string or
03282  * Encoding object.
03283  *
03284  * opt should be nil, a hash or an integer.
03285  *
03286  * convpath should be an array.
03287  * convpath may contain
03288  * - two-element arrays which contain encodings or encoding names, or
03289  * - strings representing decorator names.
03290  *
03291  * Encoding::Converter.new optionally takes an option.
03292  * The option should be a hash or an integer.
03293  * The option hash can contain :invalid => nil, etc.
03294  * The option integer should be logical-or of constants such as
03295  * Encoding::Converter::INVALID_REPLACE, etc.
03296  *
03297  * [:invalid => nil]
03298  *   Raise error on invalid byte sequence.  This is a default behavior.
03299  * [:invalid => :replace]
03300  *   Replace invalid byte sequence by replacement string.
03301  * [:undef => nil]
03302  *   Raise an error if a character in source_encoding is not defined in destination_encoding.
03303  *   This is a default behavior.
03304  * [:undef => :replace]
03305  *   Replace undefined character in destination_encoding with replacement string.
03306  * [:replace => string]
03307  *   Specify the replacement string.
03308  *   If not specified, "\uFFFD" is used for Unicode encodings and "?" for others.
03309  * [:universal_newline => true]
03310  *   Convert CRLF and CR to LF.
03311  * [:crlf_newline => true]
03312  *   Convert LF to CRLF.
03313  * [:cr_newline => true]
03314  *   Convert LF to CR.
03315  * [:xml => :text]
03316  *   Escape as XML CharData.
03317  *   This form can be used as a HTML 4.0 #PCDATA.
03318  *   - '&' -> '&amp;'
03319  *   - '<' -> '&lt;'
03320  *   - '>' -> '&gt;'
03321  *   - undefined characters in destination_encoding -> hexadecimal CharRef such as &#xHH;
03322  * [:xml => :attr]
03323  *   Escape as XML AttValue.
03324  *   The converted result is quoted as "...".
03325  *   This form can be used as a HTML 4.0 attribute value.
03326  *   - '&' -> '&amp;'
03327  *   - '<' -> '&lt;'
03328  *   - '>' -> '&gt;'
03329  *   - '"' -> '&quot;'
03330  *   - undefined characters in destination_encoding -> hexadecimal CharRef such as &#xHH;
03331  *
03332  * Examples:
03333  *   # UTF-16BE to UTF-8
03334  *   ec = Encoding::Converter.new("UTF-16BE", "UTF-8")
03335  *
03336  *   # Usually, decorators such as newline conversion are inserted last.
03337  *   ec = Encoding::Converter.new("UTF-16BE", "UTF-8", :universal_newline => true)
03338  *   p ec.convpath #=> [[#<Encoding:UTF-16BE>, #<Encoding:UTF-8>],
03339  *                 #    "universal_newline"]
03340  *
03341  *   # But, if the last encoding is ASCII incompatible,
03342  *   # decorators are inserted before the last conversion.
03343  *   ec = Encoding::Converter.new("UTF-8", "UTF-16BE", :crlf_newline => true)
03344  *   p ec.convpath #=> ["crlf_newline",
03345  *                 #    [#<Encoding:UTF-8>, #<Encoding:UTF-16BE>]]
03346  *
03347  *   # Conversion path can be specified directly.
03348  *   ec = Encoding::Converter.new(["universal_newline", ["EUC-JP", "UTF-8"], ["UTF-8", "UTF-16BE"]])
03349  *   p ec.convpath #=> ["universal_newline",
03350  *                 #    [#<Encoding:EUC-JP>, #<Encoding:UTF-8>],
03351  *                 #    [#<Encoding:UTF-8>, #<Encoding:UTF-16BE>]]
03352  */
03353 static VALUE
03354 econv_init(int argc, VALUE *argv, VALUE self)
03355 {
03356     VALUE ecopts;
03357     volatile VALUE snamev, dnamev;
03358     const char *sname, *dname;
03359     rb_encoding *senc, *denc;
03360     rb_econv_t *ec;
03361     int ecflags;
03362     VALUE convpath;
03363 
03364     if (rb_check_typeddata(self, &econv_data_type)) {
03365         rb_raise(rb_eTypeError, "already initialized");
03366     }
03367 
03368     if (argc == 1 && !NIL_P(convpath = rb_check_array_type(argv[0]))) {
03369         ec = rb_econv_init_by_convpath(self, convpath, &sname, &dname, &senc, &denc);
03370         ecflags = 0;
03371         ecopts = Qnil;
03372     }
03373     else {
03374         econv_args(argc, argv, &snamev, &dnamev, &sname, &dname, &senc, &denc, &ecflags, &ecopts);
03375         ec = rb_econv_open_opts(sname, dname, ecflags, ecopts);
03376     }
03377 
03378     if (!ec) {
03379         rb_exc_raise(rb_econv_open_exc(sname, dname, ecflags));
03380     }
03381 
03382     if (!DECORATOR_P(sname, dname)) {
03383         if (!senc)
03384             senc = make_dummy_encoding(sname);
03385         if (!denc)
03386             denc = make_dummy_encoding(dname);
03387     }
03388 
03389     ec->source_encoding = senc;
03390     ec->destination_encoding = denc;
03391 
03392     DATA_PTR(self) = ec;
03393 
03394     return self;
03395 }
03396 
03397 /*
03398  * call-seq:
03399  *   ec.inspect         -> string
03400  *
03401  * Returns a printable version of <i>ec</i>
03402  *
03403  *   ec = Encoding::Converter.new("iso-8859-1", "utf-8")
03404  *   puts ec.inspect    #=> #<Encoding::Converter: ISO-8859-1 to UTF-8>
03405  *
03406  */
03407 static VALUE
03408 econv_inspect(VALUE self)
03409 {
03410     const char *cname = rb_obj_classname(self);
03411     rb_econv_t *ec;
03412 
03413     TypedData_Get_Struct(self, rb_econv_t, &econv_data_type, ec);
03414     if (!ec)
03415         return rb_sprintf("#<%s: uninitialized>", cname);
03416     else {
03417         const char *sname = ec->source_encoding_name;
03418         const char *dname = ec->destination_encoding_name;
03419         VALUE str;
03420         str = rb_sprintf("#<%s: ", cname);
03421         econv_description(sname, dname, ec->flags, str);
03422         rb_str_cat2(str, ">");
03423         return str;
03424     }
03425 }
03426 
03427 static rb_econv_t *
03428 check_econv(VALUE self)
03429 {
03430     rb_econv_t *ec;
03431 
03432     TypedData_Get_Struct(self, rb_econv_t, &econv_data_type, ec);
03433     if (!ec) {
03434         rb_raise(rb_eTypeError, "uninitialized encoding converter");
03435     }
03436     return ec;
03437 }
03438 
03439 /*
03440  * call-seq:
03441  *   ec.source_encoding -> encoding
03442  *
03443  * Returns the source encoding as an Encoding object.
03444  */
03445 static VALUE
03446 econv_source_encoding(VALUE self)
03447 {
03448     rb_econv_t *ec = check_econv(self);
03449     if (!ec->source_encoding)
03450         return Qnil;
03451     return rb_enc_from_encoding(ec->source_encoding);
03452 }
03453 
03454 /*
03455  * call-seq:
03456  *   ec.destination_encoding -> encoding
03457  *
03458  * Returns the destination encoding as an Encoding object.
03459  */
03460 static VALUE
03461 econv_destination_encoding(VALUE self)
03462 {
03463     rb_econv_t *ec = check_econv(self);
03464     if (!ec->destination_encoding)
03465         return Qnil;
03466     return rb_enc_from_encoding(ec->destination_encoding);
03467 }
03468 
03469 /*
03470  * call-seq:
03471  *   ec.convpath        -> ary
03472  *
03473  * Returns the conversion path of ec.
03474  *
03475  * The result is an array of conversions.
03476  *
03477  *   ec = Encoding::Converter.new("ISO-8859-1", "EUC-JP", crlf_newline: true)
03478  *   p ec.convpath
03479  *   #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>],
03480  *   #    [#<Encoding:UTF-8>, #<Encoding:EUC-JP>],
03481  *   #    "crlf_newline"]
03482  *
03483  * Each element of the array is a pair of encodings or a string.
03484  * A pair means an encoding conversion.
03485  * A string means a decorator.
03486  *
03487  * In the above example, [#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>] means
03488  * a converter from ISO-8859-1 to UTF-8.
03489  * "crlf_newline" means newline converter from LF to CRLF.
03490  */
03491 static VALUE
03492 econv_convpath(VALUE self)
03493 {
03494     rb_econv_t *ec = check_econv(self);
03495     VALUE result;
03496     int i;
03497 
03498     result = rb_ary_new();
03499     for (i = 0; i < ec->num_trans; i++) {
03500         const rb_transcoder *tr = ec->elems[i].tc->transcoder;
03501         VALUE v;
03502         if (DECORATOR_P(tr->src_encoding, tr->dst_encoding))
03503             v = rb_str_new_cstr(tr->dst_encoding);
03504         else
03505             v = rb_assoc_new(make_encobj(tr->src_encoding), make_encobj(tr->dst_encoding));
03506         rb_ary_push(result, v);
03507     }
03508     return result;
03509 }
03510 
03511 /*
03512  * call-seq:
03513  *   ec == other        -> true or false
03514  */
03515 static VALUE
03516 econv_equal(VALUE self, VALUE other)
03517 {
03518     rb_econv_t *ec1 = check_econv(self);
03519     rb_econv_t *ec2;
03520     int i;
03521 
03522     if (!rb_typeddata_is_kind_of(other, &econv_data_type)) {
03523         return Qnil;
03524     }
03525     ec2 = DATA_PTR(other);
03526     if (!ec2) return Qfalse;
03527     if (ec1->source_encoding_name != ec2->source_encoding_name &&
03528         strcmp(ec1->source_encoding_name, ec2->source_encoding_name))
03529         return Qfalse;
03530     if (ec1->destination_encoding_name != ec2->destination_encoding_name &&
03531         strcmp(ec1->destination_encoding_name, ec2->destination_encoding_name))
03532         return Qfalse;
03533     if (ec1->flags != ec2->flags) return Qfalse;
03534     if (ec1->replacement_enc != ec2->replacement_enc &&
03535         strcmp(ec1->replacement_enc, ec2->replacement_enc))
03536         return Qfalse;
03537     if (ec1->replacement_len != ec2->replacement_len) return Qfalse;
03538     if (ec1->replacement_str != ec2->replacement_str &&
03539         memcmp(ec1->replacement_str, ec2->replacement_str, ec2->replacement_len))
03540         return Qfalse;
03541 
03542     if (ec1->num_trans != ec2->num_trans) return Qfalse;
03543     for (i = 0; i < ec1->num_trans; i++) {
03544         if (ec1->elems[i].tc->transcoder != ec2->elems[i].tc->transcoder)
03545             return Qfalse;
03546     }
03547     return Qtrue;
03548 }
03549 
03550 static VALUE
03551 econv_result_to_symbol(rb_econv_result_t res)
03552 {
03553     switch (res) {
03554       case econv_invalid_byte_sequence: return sym_invalid_byte_sequence;
03555       case econv_incomplete_input: return sym_incomplete_input;
03556       case econv_undefined_conversion: return sym_undefined_conversion;
03557       case econv_destination_buffer_full: return sym_destination_buffer_full;
03558       case econv_source_buffer_empty: return sym_source_buffer_empty;
03559       case econv_finished: return sym_finished;
03560       case econv_after_output: return sym_after_output;
03561       default: return INT2NUM(res); /* should not be reached */
03562     }
03563 }
03564 
03565 /*
03566  * call-seq:
03567  *   ec.primitive_convert(source_buffer, destination_buffer) -> symbol
03568  *   ec.primitive_convert(source_buffer, destination_buffer, destination_byteoffset) -> symbol
03569  *   ec.primitive_convert(source_buffer, destination_buffer, destination_byteoffset, destination_bytesize) -> symbol
03570  *   ec.primitive_convert(source_buffer, destination_buffer, destination_byteoffset, destination_bytesize, opt) -> symbol
03571  *
03572  * possible opt elements:
03573  *   hash form:
03574  *     :partial_input => true           # source buffer may be part of larger source
03575  *     :after_output => true            # stop conversion after output before input
03576  *   integer form:
03577  *     Encoding::Converter::PARTIAL_INPUT
03578  *     Encoding::Converter::AFTER_OUTPUT
03579  *
03580  * possible results:
03581  *    :invalid_byte_sequence
03582  *    :incomplete_input
03583  *    :undefined_conversion
03584  *    :after_output
03585  *    :destination_buffer_full
03586  *    :source_buffer_empty
03587  *    :finished
03588  *
03589  * primitive_convert converts source_buffer into destination_buffer.
03590  *
03591  * source_buffer should be a string or nil.
03592  * nil means an empty string.
03593  *
03594  * destination_buffer should be a string.
03595  *
03596  * destination_byteoffset should be an integer or nil.
03597  * nil means the end of destination_buffer.
03598  * If it is omitted, nil is assumed.
03599  *
03600  * destination_bytesize should be an integer or nil.
03601  * nil means unlimited.
03602  * If it is omitted, nil is assumed.
03603  *
03604  * opt should be nil, a hash or an integer.
03605  * nil means no flags.
03606  * If it is omitted, nil is assumed.
03607  *
03608  * primitive_convert converts the content of source_buffer from the beginning
03609  * and stores the result into destination_buffer.
03610  *
03611  * destination_byteoffset and destination_bytesize specify the region in which
03612  * the converted result is stored.
03613  * destination_byteoffset specifies the start position in destination_buffer in bytes.
03614  * If destination_byteoffset is nil,
03615  * destination_buffer.bytesize is used for appending the result.
03616  * destination_bytesize specifies maximum number of bytes.
03617  * If destination_bytesize is nil,
03618  * destination size is unlimited.
03619  * After conversion, destination_buffer is resized to
03620  * destination_byteoffset + actually produced number of bytes.
03621  * Also destination_buffer's encoding is set to destination_encoding.
03622  *
03623  * primitive_convert drops the converted part of source_buffer.
03624  * the dropped part is converted in destination_buffer or
03625  * buffered in Encoding::Converter object.
03626  *
03627  * primitive_convert stops conversion when one of the following conditions is met.
03628  * - invalid byte sequence found in source buffer (:invalid_byte_sequence)
03629  *   +primitive_errinfo+ and +last_error+ methods return the details of the error.
03630  * - unexpected end of source buffer (:incomplete_input)
03631  *   this occurs only when :partial_input is not specified.
03632  *   +primitive_errinfo+ and +last_error+ methods return the details of the error.
03633  * - character not representable in output encoding (:undefined_conversion)
03634  *   +primitive_errinfo+ and +last_error+ methods return the details of the error.
03635  * - after some output is generated, before input is done (:after_output)
03636  *   this occurs only when :after_output is specified.
03637  * - destination buffer is full (:destination_buffer_full)
03638  *   this occurs only when destination_bytesize is non-nil.
03639  * - source buffer is empty (:source_buffer_empty)
03640  *   this occurs only when :partial_input is specified.
03641  * - conversion is finished (:finished)
03642  *
03643  * example:
03644  *   ec = Encoding::Converter.new("UTF-8", "UTF-16BE")
03645  *   ret = ec.primitive_convert(src="pi", dst="", nil, 100)
03646  *   p [ret, src, dst] #=> [:finished, "", "\x00p\x00i"]
03647  *
03648  *   ec = Encoding::Converter.new("UTF-8", "UTF-16BE")
03649  *   ret = ec.primitive_convert(src="pi", dst="", nil, 1)
03650  *   p [ret, src, dst] #=> [:destination_buffer_full, "i", "\x00"]
03651  *   ret = ec.primitive_convert(src, dst="", nil, 1)
03652  *   p [ret, src, dst] #=> [:destination_buffer_full, "", "p"]
03653  *   ret = ec.primitive_convert(src, dst="", nil, 1)
03654  *   p [ret, src, dst] #=> [:destination_buffer_full, "", "\x00"]
03655  *   ret = ec.primitive_convert(src, dst="", nil, 1)
03656  *   p [ret, src, dst] #=> [:finished, "", "i"]
03657  *
03658  */
03659 static VALUE
03660 econv_primitive_convert(int argc, VALUE *argv, VALUE self)
03661 {
03662     VALUE input, output, output_byteoffset_v, output_bytesize_v, opt, flags_v;
03663     rb_econv_t *ec = check_econv(self);
03664     rb_econv_result_t res;
03665     const unsigned char *ip, *is;
03666     unsigned char *op, *os;
03667     long output_byteoffset, output_bytesize;
03668     unsigned long output_byteend;
03669     int flags;
03670 
03671     argc = rb_scan_args(argc, argv, "23:", &input, &output, &output_byteoffset_v, &output_bytesize_v, &flags_v, &opt);
03672 
03673     if (NIL_P(output_byteoffset_v))
03674         output_byteoffset = 0; /* dummy */
03675     else
03676         output_byteoffset = NUM2LONG(output_byteoffset_v);
03677 
03678     if (NIL_P(output_bytesize_v))
03679         output_bytesize = 0; /* dummy */
03680     else
03681         output_bytesize = NUM2LONG(output_bytesize_v);
03682 
03683     if (!NIL_P(flags_v)) {
03684         if (!NIL_P(opt)) {
03685             rb_error_arity(argc + 1, 2, 5);
03686         }
03687         flags = NUM2INT(rb_to_int(flags_v));
03688     }
03689     else if (!NIL_P(opt)) {
03690         VALUE v;
03691         flags = 0;
03692         v = rb_hash_aref(opt, sym_partial_input);
03693         if (RTEST(v))
03694             flags |= ECONV_PARTIAL_INPUT;
03695         v = rb_hash_aref(opt, sym_after_output);
03696         if (RTEST(v))
03697             flags |= ECONV_AFTER_OUTPUT;
03698     }
03699     else {
03700         flags = 0;
03701     }
03702 
03703     StringValue(output);
03704     if (!NIL_P(input))
03705         StringValue(input);
03706     rb_str_modify(output);
03707 
03708     if (NIL_P(output_bytesize_v)) {
03709         output_bytesize = RSTRING_EMBED_LEN_MAX;
03710         if (!NIL_P(input) && output_bytesize < RSTRING_LEN(input))
03711             output_bytesize = RSTRING_LEN(input);
03712     }
03713 
03714   retry:
03715 
03716     if (NIL_P(output_byteoffset_v))
03717         output_byteoffset = RSTRING_LEN(output);
03718 
03719     if (output_byteoffset < 0)
03720         rb_raise(rb_eArgError, "negative output_byteoffset");
03721 
03722     if (RSTRING_LEN(output) < output_byteoffset)
03723         rb_raise(rb_eArgError, "output_byteoffset too big");
03724 
03725     if (output_bytesize < 0)
03726         rb_raise(rb_eArgError, "negative output_bytesize");
03727 
03728     output_byteend = (unsigned long)output_byteoffset +
03729                      (unsigned long)output_bytesize;
03730 
03731     if (output_byteend < (unsigned long)output_byteoffset ||
03732         LONG_MAX < output_byteend)
03733         rb_raise(rb_eArgError, "output_byteoffset+output_bytesize too big");
03734 
03735     if (rb_str_capacity(output) < output_byteend)
03736         rb_str_resize(output, output_byteend);
03737 
03738     if (NIL_P(input)) {
03739         ip = is = NULL;
03740     }
03741     else {
03742         ip = (const unsigned char *)RSTRING_PTR(input);
03743         is = ip + RSTRING_LEN(input);
03744     }
03745 
03746     op = (unsigned char *)RSTRING_PTR(output) + output_byteoffset;
03747     os = op + output_bytesize;
03748 
03749     res = rb_econv_convert(ec, &ip, is, &op, os, flags);
03750     rb_str_set_len(output, op-(unsigned char *)RSTRING_PTR(output));
03751     if (!NIL_P(input))
03752         rb_str_drop_bytes(input, ip - (unsigned char *)RSTRING_PTR(input));
03753 
03754     if (NIL_P(output_bytesize_v) && res == econv_destination_buffer_full) {
03755         if (LONG_MAX / 2 < output_bytesize)
03756             rb_raise(rb_eArgError, "too long conversion result");
03757         output_bytesize *= 2;
03758         output_byteoffset_v = Qnil;
03759         goto retry;
03760     }
03761 
03762     if (ec->destination_encoding) {
03763         rb_enc_associate(output, ec->destination_encoding);
03764     }
03765 
03766     return econv_result_to_symbol(res);
03767 }
03768 
03769 /*
03770  * call-seq:
03771  *   ec.convert(source_string) -> destination_string
03772  *
03773  * Convert source_string and return destination_string.
03774  *
03775  * source_string is assumed as a part of source.
03776  * i.e.  :partial_input=>true is specified internally.
03777  * finish method should be used last.
03778  *
03779  *   ec = Encoding::Converter.new("utf-8", "euc-jp")
03780  *   puts ec.convert("\u3042").dump     #=> "\xA4\xA2"
03781  *   puts ec.finish.dump                #=> ""
03782  *
03783  *   ec = Encoding::Converter.new("euc-jp", "utf-8")
03784  *   puts ec.convert("\xA4").dump       #=> ""
03785  *   puts ec.convert("\xA2").dump       #=> "\xE3\x81\x82"
03786  *   puts ec.finish.dump                #=> ""
03787  *
03788  *   ec = Encoding::Converter.new("utf-8", "iso-2022-jp")
03789  *   puts ec.convert("\xE3").dump       #=> "".force_encoding("ISO-2022-JP")
03790  *   puts ec.convert("\x81").dump       #=> "".force_encoding("ISO-2022-JP")
03791  *   puts ec.convert("\x82").dump       #=> "\e$B$\"".force_encoding("ISO-2022-JP")
03792  *   puts ec.finish.dump                #=> "\e(B".force_encoding("ISO-2022-JP")
03793  *
03794  * If a conversion error occurs,
03795  * Encoding::UndefinedConversionError or
03796  * Encoding::InvalidByteSequenceError is raised.
03797  * Encoding::Converter#convert doesn't supply methods to recover or restart
03798  * from these exceptions.
03799  * When you want to handle these conversion errors,
03800  * use Encoding::Converter#primitive_convert.
03801  *
03802  */
03803 static VALUE
03804 econv_convert(VALUE self, VALUE source_string)
03805 {
03806     VALUE ret, dst;
03807     VALUE av[5];
03808     int ac;
03809     rb_econv_t *ec = check_econv(self);
03810 
03811     StringValue(source_string);
03812 
03813     dst = rb_str_new(NULL, 0);
03814 
03815     av[0] = rb_str_dup(source_string);
03816     av[1] = dst;
03817     av[2] = Qnil;
03818     av[3] = Qnil;
03819     av[4] = INT2NUM(ECONV_PARTIAL_INPUT);
03820     ac = 5;
03821 
03822     ret = econv_primitive_convert(ac, av, self);
03823 
03824     if (ret == sym_invalid_byte_sequence ||
03825         ret == sym_undefined_conversion ||
03826         ret == sym_incomplete_input) {
03827         VALUE exc = make_econv_exception(ec);
03828         rb_exc_raise(exc);
03829     }
03830 
03831     if (ret == sym_finished) {
03832         rb_raise(rb_eArgError, "converter already finished");
03833     }
03834 
03835     if (ret != sym_source_buffer_empty) {
03836         rb_bug("unexpected result of econv_primitive_convert");
03837     }
03838 
03839     return dst;
03840 }
03841 
03842 /*
03843  * call-seq:
03844  *   ec.finish -> string
03845  *
03846  * Finishes the converter.
03847  * It returns the last part of the converted string.
03848  *
03849  *   ec = Encoding::Converter.new("utf-8", "iso-2022-jp")
03850  *   p ec.convert("\u3042")     #=> "\e$B$\""
03851  *   p ec.finish                #=> "\e(B"
03852  */
03853 static VALUE
03854 econv_finish(VALUE self)
03855 {
03856     VALUE ret, dst;
03857     VALUE av[5];
03858     int ac;
03859     rb_econv_t *ec = check_econv(self);
03860 
03861     dst = rb_str_new(NULL, 0);
03862 
03863     av[0] = Qnil;
03864     av[1] = dst;
03865     av[2] = Qnil;
03866     av[3] = Qnil;
03867     av[4] = INT2NUM(0);
03868     ac = 5;
03869 
03870     ret = econv_primitive_convert(ac, av, self);
03871 
03872     if (ret == sym_invalid_byte_sequence ||
03873         ret == sym_undefined_conversion ||
03874         ret == sym_incomplete_input) {
03875         VALUE exc = make_econv_exception(ec);
03876         rb_exc_raise(exc);
03877     }
03878 
03879     if (ret != sym_finished) {
03880         rb_bug("unexpected result of econv_primitive_convert");
03881     }
03882 
03883     return dst;
03884 }
03885 
03886 /*
03887  * call-seq:
03888  *   ec.primitive_errinfo -> array
03889  *
03890  * primitive_errinfo returns important information regarding the last error
03891  * as a 5-element array:
03892  *
03893  *   [result, enc1, enc2, error_bytes, readagain_bytes]
03894  *
03895  * result is the last result of primitive_convert.
03896  *
03897  * Other elements are only meaningful when result is
03898  * :invalid_byte_sequence, :incomplete_input or :undefined_conversion.
03899  *
03900  * enc1 and enc2 indicate a conversion step as a pair of strings.
03901  * For example, a converter from EUC-JP to ISO-8859-1 converts
03902  * a string as follows: EUC-JP -> UTF-8 -> ISO-8859-1.
03903  * So [enc1, enc2] is either ["EUC-JP", "UTF-8"] or ["UTF-8", "ISO-8859-1"].
03904  *
03905  * error_bytes and readagain_bytes indicate the byte sequences which caused the error.
03906  * error_bytes is the discarded portion.
03907  * readagain_bytes is the buffered portion which is read again on the next conversion.
03908  *
03909  * Example:
03910  *
03911  *   # \xff is invalid as EUC-JP.
03912  *   ec = Encoding::Converter.new("EUC-JP", "Shift_JIS")
03913  *   ec.primitive_convert(src="\xff", dst="", nil, 10)
03914  *   p ec.primitive_errinfo
03915  *   #=> [:invalid_byte_sequence, "EUC-JP", "UTF-8", "\xFF", ""]
03916  *
03917  *   # HIRAGANA LETTER A (\xa4\xa2 in EUC-JP) is not representable in ISO-8859-1.
03918  *   # Since this error occurs in the UTF-8 to ISO-8859-1 conversion,
03919  *   # error_bytes is HIRAGANA LETTER A in UTF-8 (\xE3\x81\x82).
03920  *   ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
03921  *   ec.primitive_convert(src="\xa4\xa2", dst="", nil, 10)
03922  *   p ec.primitive_errinfo
03923  *   #=> [:undefined_conversion, "UTF-8", "ISO-8859-1", "\xE3\x81\x82", ""]
03924  *
03925  *   # partial character is invalid
03926  *   ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
03927  *   ec.primitive_convert(src="\xa4", dst="", nil, 10)
03928  *   p ec.primitive_errinfo
03929  *   #=> [:incomplete_input, "EUC-JP", "UTF-8", "\xA4", ""]
03930  *
03931  *   # Encoding::Converter::PARTIAL_INPUT prevents invalid errors by
03932  *   # partial characters.
03933  *   ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
03934  *   ec.primitive_convert(src="\xa4", dst="", nil, 10, Encoding::Converter::PARTIAL_INPUT)
03935  *   p ec.primitive_errinfo
03936  *   #=> [:source_buffer_empty, nil, nil, nil, nil]
03937  *
03938  *   # \xd8\x00\x00@ is invalid as UTF-16BE because
03939  *   # no low surrogate after high surrogate (\xd8\x00).
03940  *   # It is detected by 3rd byte (\00) which is part of next character.
03941  *   # So the high surrogate (\xd8\x00) is discarded and
03942  *   # the 3rd byte is read again later.
03943  *   # Since the byte is buffered in ec, it is dropped from src.
03944  *   ec = Encoding::Converter.new("UTF-16BE", "UTF-8")
03945  *   ec.primitive_convert(src="\xd8\x00\x00@", dst="", nil, 10)
03946  *   p ec.primitive_errinfo
03947  *   #=> [:invalid_byte_sequence, "UTF-16BE", "UTF-8", "\xD8\x00", "\x00"]
03948  *   p src
03949  *   #=> "@"
03950  *
03951  *   # Similar to UTF-16BE, \x00\xd8@\x00 is invalid as UTF-16LE.
03952  *   # The problem is detected by 4th byte.
03953  *   ec = Encoding::Converter.new("UTF-16LE", "UTF-8")
03954  *   ec.primitive_convert(src="\x00\xd8@\x00", dst="", nil, 10)
03955  *   p ec.primitive_errinfo
03956  *   #=> [:invalid_byte_sequence, "UTF-16LE", "UTF-8", "\x00\xD8", "@\x00"]
03957  *   p src
03958  *   #=> ""
03959  *
03960  */
03961 static VALUE
03962 econv_primitive_errinfo(VALUE self)
03963 {
03964     rb_econv_t *ec = check_econv(self);
03965 
03966     VALUE ary;
03967 
03968     ary = rb_ary_new2(5);
03969 
03970     rb_ary_store(ary, 0, econv_result_to_symbol(ec->last_error.result));
03971     rb_ary_store(ary, 4, Qnil);
03972 
03973     if (ec->last_error.source_encoding)
03974         rb_ary_store(ary, 1, rb_str_new2(ec->last_error.source_encoding));
03975 
03976     if (ec->last_error.destination_encoding)
03977         rb_ary_store(ary, 2, rb_str_new2(ec->last_error.destination_encoding));
03978 
03979     if (ec->last_error.error_bytes_start) {
03980         rb_ary_store(ary, 3, rb_str_new((const char *)ec->last_error.error_bytes_start, ec->last_error.error_bytes_len));
03981         rb_ary_store(ary, 4, rb_str_new((const char *)ec->last_error.error_bytes_start + ec->last_error.error_bytes_len, ec->last_error.readagain_len));
03982     }
03983 
03984     return ary;
03985 }
03986 
03987 /*
03988  * call-seq:
03989  *   ec.insert_output(string) -> nil
03990  *
03991  * Inserts string into the encoding converter.
03992  * The string will be converted to the destination encoding and
03993  * output on later conversions.
03994  *
03995  * If the destination encoding is stateful,
03996  * string is converted according to the state and the state is updated.
03997  *
03998  * This method should be used only when a conversion error occurs.
03999  *
04000  *  ec = Encoding::Converter.new("utf-8", "iso-8859-1")
04001  *  src = "HIRAGANA LETTER A is \u{3042}."
04002  *  dst = ""
04003  *  p ec.primitive_convert(src, dst)    #=> :undefined_conversion
04004  *  puts "[#{dst.dump}, #{src.dump}]"   #=> ["HIRAGANA LETTER A is ", "."]
04005  *  ec.insert_output("<err>")
04006  *  p ec.primitive_convert(src, dst)    #=> :finished
04007  *  puts "[#{dst.dump}, #{src.dump}]"   #=> ["HIRAGANA LETTER A is <err>.", ""]
04008  *
04009  *  ec = Encoding::Converter.new("utf-8", "iso-2022-jp")
04010  *  src = "\u{306F 3041 3068 2661 3002}" # U+2661 is not representable in iso-2022-jp
04011  *  dst = ""
04012  *  p ec.primitive_convert(src, dst)    #=> :undefined_conversion
04013  *  puts "[#{dst.dump}, #{src.dump}]"   #=> ["\e$B$O$!$H".force_encoding("ISO-2022-JP"), "\xE3\x80\x82"]
04014  *  ec.insert_output "?"                # state change required to output "?".
04015  *  p ec.primitive_convert(src, dst)    #=> :finished
04016  *  puts "[#{dst.dump}, #{src.dump}]"   #=> ["\e$B$O$!$H\e(B?\e$B!#\e(B".force_encoding("ISO-2022-JP"), ""]
04017  *
04018  */
04019 static VALUE
04020 econv_insert_output(VALUE self, VALUE string)
04021 {
04022     const char *insert_enc;
04023 
04024     int ret;
04025 
04026     rb_econv_t *ec = check_econv(self);
04027 
04028     StringValue(string);
04029     insert_enc = rb_econv_encoding_to_insert_output(ec);
04030     string = rb_str_encode(string, rb_enc_from_encoding(rb_enc_find(insert_enc)), 0, Qnil);
04031 
04032     ret = rb_econv_insert_output(ec, (const unsigned char *)RSTRING_PTR(string), RSTRING_LEN(string), insert_enc);
04033     if (ret == -1) {
04034         rb_raise(rb_eArgError, "too big string");
04035     }
04036 
04037     return Qnil;
04038 }
04039 
04040 /*
04041  * call-seq:
04042  *   ec.putback                    -> string
04043  *   ec.putback(max_numbytes)      -> string
04044  *
04045  * Put back the bytes which will be converted.
04046  *
04047  * These bytes result from an invalid_byte_sequence error.
04048  * When an invalid_byte_sequence error occurs, some bytes are discarded and
04049  * some bytes are buffered to be converted later.
04050  * The latter bytes can be put back.
04051  * It can be observed by
04052  * Encoding::InvalidByteSequenceError#readagain_bytes and
04053  * Encoding::Converter#primitive_errinfo.
04054  *
04055  *   ec = Encoding::Converter.new("utf-16le", "iso-8859-1")
04056  *   src = "\x00\xd8\x61\x00"
04057  *   dst = ""
04058  *   p ec.primitive_convert(src, dst)   #=> :invalid_byte_sequence
04059  *   p ec.primitive_errinfo     #=> [:invalid_byte_sequence, "UTF-16LE", "UTF-8", "\x00\xD8", "a\x00"]
04060  *   p ec.putback               #=> "a\x00"
04061  *   p ec.putback               #=> ""          # no more bytes to put back
04062  *
04063  */
04064 static VALUE
04065 econv_putback(int argc, VALUE *argv, VALUE self)
04066 {
04067     rb_econv_t *ec = check_econv(self);
04068     int n;
04069     int putbackable;
04070     VALUE str, max;
04071 
04072     rb_scan_args(argc, argv, "01", &max);
04073 
04074     if (NIL_P(max))
04075         n = rb_econv_putbackable(ec);
04076     else {
04077         n = NUM2INT(max);
04078         putbackable = rb_econv_putbackable(ec);
04079         if (putbackable < n)
04080             n = putbackable;
04081     }
04082 
04083     str = rb_str_new(NULL, n);
04084     rb_econv_putback(ec, (unsigned char *)RSTRING_PTR(str), n);
04085 
04086     if (ec->source_encoding) {
04087         rb_enc_associate(str, ec->source_encoding);
04088     }
04089 
04090     return str;
04091 }
04092 
04093 /*
04094  * call-seq:
04095  *   ec.last_error -> exception or nil
04096  *
04097  * Returns an exception object for the last conversion.
04098  * Returns nil if the last conversion did not produce an error.
04099  *
04100  * "error" means that
04101  * Encoding::InvalidByteSequenceError and Encoding::UndefinedConversionError for
04102  * Encoding::Converter#convert and
04103  * :invalid_byte_sequence, :incomplete_input and :undefined_conversion for
04104  * Encoding::Converter#primitive_convert.
04105  *
04106  *  ec = Encoding::Converter.new("utf-8", "iso-8859-1")
04107  *  p ec.primitive_convert(src="\xf1abcd", dst="")       #=> :invalid_byte_sequence
04108  *  p ec.last_error      #=> #<Encoding::InvalidByteSequenceError: "\xF1" followed by "a" on UTF-8>
04109  *  p ec.primitive_convert(src, dst, nil, 1)             #=> :destination_buffer_full
04110  *  p ec.last_error      #=> nil
04111  *
04112  */
04113 static VALUE
04114 econv_last_error(VALUE self)
04115 {
04116     rb_econv_t *ec = check_econv(self);
04117     VALUE exc;
04118 
04119     exc = make_econv_exception(ec);
04120     if (NIL_P(exc))
04121         return Qnil;
04122     return exc;
04123 }
04124 
04125 /*
04126  * call-seq:
04127  *   ec.replacement -> string
04128  *
04129  * Returns the replacement string.
04130  *
04131  *  ec = Encoding::Converter.new("euc-jp", "us-ascii")
04132  *  p ec.replacement    #=> "?"
04133  *
04134  *  ec = Encoding::Converter.new("euc-jp", "utf-8")
04135  *  p ec.replacement    #=> "\uFFFD"
04136  */
04137 static VALUE
04138 econv_get_replacement(VALUE self)
04139 {
04140     rb_econv_t *ec = check_econv(self);
04141     int ret;
04142     rb_encoding *enc;
04143 
04144     ret = make_replacement(ec);
04145     if (ret == -1) {
04146         rb_raise(rb_eUndefinedConversionError, "replacement character setup failed");
04147     }
04148 
04149     enc = rb_enc_find(ec->replacement_enc);
04150     return rb_enc_str_new((const char *)ec->replacement_str, (long)ec->replacement_len, enc);
04151 }
04152 
04153 /*
04154  * call-seq:
04155  *   ec.replacement = string
04156  *
04157  * Sets the replacement string.
04158  *
04159  *  ec = Encoding::Converter.new("utf-8", "us-ascii", :undef => :replace)
04160  *  ec.replacement = "<undef>"
04161  *  p ec.convert("a \u3042 b")      #=> "a <undef> b"
04162  */
04163 static VALUE
04164 econv_set_replacement(VALUE self, VALUE arg)
04165 {
04166     rb_econv_t *ec = check_econv(self);
04167     VALUE string = arg;
04168     int ret;
04169     rb_encoding *enc;
04170 
04171     StringValue(string);
04172     enc = rb_enc_get(string);
04173 
04174     ret = rb_econv_set_replacement(ec,
04175             (const unsigned char *)RSTRING_PTR(string),
04176             RSTRING_LEN(string),
04177             rb_enc_name(enc));
04178 
04179     if (ret == -1) {
04180         /* xxx: rb_eInvalidByteSequenceError? */
04181         rb_raise(rb_eUndefinedConversionError, "replacement character setup failed");
04182     }
04183 
04184     return arg;
04185 }
04186 
04187 VALUE
04188 rb_econv_make_exception(rb_econv_t *ec)
04189 {
04190     return make_econv_exception(ec);
04191 }
04192 
04193 void
04194 rb_econv_check_error(rb_econv_t *ec)
04195 {
04196     VALUE exc;
04197 
04198     exc = make_econv_exception(ec);
04199     if (NIL_P(exc))
04200         return;
04201     rb_exc_raise(exc);
04202 }
04203 
04204 /*
04205  * call-seq:
04206  *   ecerr.source_encoding_name         -> string
04207  *
04208  * Returns the source encoding name as a string.
04209  */
04210 static VALUE
04211 ecerr_source_encoding_name(VALUE self)
04212 {
04213     return rb_attr_get(self, rb_intern("source_encoding_name"));
04214 }
04215 
04216 /*
04217  * call-seq:
04218  *   ecerr.source_encoding              -> encoding
04219  *
04220  * Returns the source encoding as an encoding object.
04221  *
04222  * Note that the result may not be equal to the source encoding of
04223  * the encoding converter if the conversion has multiple steps.
04224  *
04225  *  ec = Encoding::Converter.new("ISO-8859-1", "EUC-JP") # ISO-8859-1 -> UTF-8 -> EUC-JP
04226  *  begin
04227  *    ec.convert("\xa0") # NO-BREAK SPACE, which is available in UTF-8 but not in EUC-JP.
04228  *  rescue Encoding::UndefinedConversionError
04229  *    p $!.source_encoding              #=> #<Encoding:UTF-8>
04230  *    p $!.destination_encoding         #=> #<Encoding:EUC-JP>
04231  *    p $!.source_encoding_name         #=> "UTF-8"
04232  *    p $!.destination_encoding_name    #=> "EUC-JP"
04233  *  end
04234  *
04235  */
04236 static VALUE
04237 ecerr_source_encoding(VALUE self)
04238 {
04239     return rb_attr_get(self, rb_intern("source_encoding"));
04240 }
04241 
04242 /*
04243  * call-seq:
04244  *   ecerr.destination_encoding_name         -> string
04245  *
04246  * Returns the destination encoding name as a string.
04247  */
04248 static VALUE
04249 ecerr_destination_encoding_name(VALUE self)
04250 {
04251     return rb_attr_get(self, rb_intern("destination_encoding_name"));
04252 }
04253 
04254 /*
04255  * call-seq:
04256  *   ecerr.destination_encoding         -> encoding
04257  *
04258  * Returns the destination encoding as an encoding object.
04259  */
04260 static VALUE
04261 ecerr_destination_encoding(VALUE self)
04262 {
04263     return rb_attr_get(self, rb_intern("destination_encoding"));
04264 }
04265 
04266 /*
04267  * call-seq:
04268  *   ecerr.error_char         -> string
04269  *
04270  * Returns the one-character string which caused Encoding::UndefinedConversionError.
04271  *
04272  *  ec = Encoding::Converter.new("ISO-8859-1", "EUC-JP")
04273  *  begin
04274  *    ec.convert("\xa0")
04275  *  rescue Encoding::UndefinedConversionError
04276  *    puts $!.error_char.dump   #=> "\xC2\xA0"
04277  *    p $!.error_char.encoding  #=> #<Encoding:UTF-8>
04278  *  end
04279  *
04280  */
04281 static VALUE
04282 ecerr_error_char(VALUE self)
04283 {
04284     return rb_attr_get(self, rb_intern("error_char"));
04285 }
04286 
04287 /*
04288  * call-seq:
04289  *   ecerr.error_bytes         -> string
04290  *
04291  * Returns the discarded bytes when Encoding::InvalidByteSequenceError occurs.
04292  *
04293  *  ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
04294  *  begin
04295  *    ec.convert("abc\xA1\xFFdef")
04296  *  rescue Encoding::InvalidByteSequenceError
04297  *    p $!      #=> #<Encoding::InvalidByteSequenceError: "\xA1" followed by "\xFF" on EUC-JP>
04298  *    puts $!.error_bytes.dump          #=> "\xA1"
04299  *    puts $!.readagain_bytes.dump      #=> "\xFF"
04300  *  end
04301  */
04302 static VALUE
04303 ecerr_error_bytes(VALUE self)
04304 {
04305     return rb_attr_get(self, rb_intern("error_bytes"));
04306 }
04307 
04308 /*
04309  * call-seq:
04310  *   ecerr.readagain_bytes         -> string
04311  *
04312  * Returns the bytes to be read again when Encoding::InvalidByteSequenceError occurs.
04313  */
04314 static VALUE
04315 ecerr_readagain_bytes(VALUE self)
04316 {
04317     return rb_attr_get(self, rb_intern("readagain_bytes"));
04318 }
04319 
04320 /*
04321  * call-seq:
04322  *   ecerr.incomplete_input?         -> true or false
04323  *
04324  * Returns true if the invalid byte sequence error is caused by
04325  * premature end of string.
04326  *
04327  *  ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
04328  *
04329  *  begin
04330  *    ec.convert("abc\xA1z")
04331  *  rescue Encoding::InvalidByteSequenceError
04332  *    p $!      #=> #<Encoding::InvalidByteSequenceError: "\xA1" followed by "z" on EUC-JP>
04333  *    p $!.incomplete_input?    #=> false
04334  *  end
04335  *
04336  *  begin
04337  *    ec.convert("abc\xA1")
04338  *    ec.finish
04339  *  rescue Encoding::InvalidByteSequenceError
04340  *    p $!      #=> #<Encoding::InvalidByteSequenceError: incomplete "\xA1" on EUC-JP>
04341  *    p $!.incomplete_input?    #=> true
04342  *  end
04343  */
04344 static VALUE
04345 ecerr_incomplete_input(VALUE self)
04346 {
04347     return rb_attr_get(self, rb_intern("incomplete_input"));
04348 }
04349 
04350 /*
04351  *  Document-class: Encoding::UndefinedConversionError
04352  *
04353  *  Raised by Encoding and String methods when a transcoding operation
04354  *  fails.
04355  */
04356 
04357 /*
04358  *  Document-class: Encoding::InvalidByteSequenceError
04359  *
04360  *  Raised by Encoding and String methods when the string being
04361  *  transcoded contains a byte invalid for either the source or
04362  *  target encoding.
04363  */
04364 
04365 /*
04366  *  Document-class: Encoding::ConverterNotFoundError
04367  *
04368  *  Raised by transcoding methods when a named encoding does not
04369  *  correspond with a known converter.
04370  */
04371 
04372 void
04373 Init_transcode(void)
04374 {
04375     rb_eUndefinedConversionError = rb_define_class_under(rb_cEncoding, "UndefinedConversionError", rb_eEncodingError);
04376     rb_eInvalidByteSequenceError = rb_define_class_under(rb_cEncoding, "InvalidByteSequenceError", rb_eEncodingError);
04377     rb_eConverterNotFoundError = rb_define_class_under(rb_cEncoding, "ConverterNotFoundError", rb_eEncodingError);
04378 
04379     transcoder_table = st_init_strcasetable();
04380 
04381     sym_invalid = ID2SYM(rb_intern("invalid"));
04382     sym_undef = ID2SYM(rb_intern("undef"));
04383     sym_replace = ID2SYM(rb_intern("replace"));
04384     sym_fallback = ID2SYM(rb_intern("fallback"));
04385     sym_aref = ID2SYM(rb_intern("[]"));
04386     sym_xml = ID2SYM(rb_intern("xml"));
04387     sym_text = ID2SYM(rb_intern("text"));
04388     sym_attr = ID2SYM(rb_intern("attr"));
04389 
04390     sym_invalid_byte_sequence = ID2SYM(rb_intern("invalid_byte_sequence"));
04391     sym_undefined_conversion = ID2SYM(rb_intern("undefined_conversion"));
04392     sym_destination_buffer_full = ID2SYM(rb_intern("destination_buffer_full"));
04393     sym_source_buffer_empty = ID2SYM(rb_intern("source_buffer_empty"));
04394     sym_finished = ID2SYM(rb_intern("finished"));
04395     sym_after_output = ID2SYM(rb_intern("after_output"));
04396     sym_incomplete_input = ID2SYM(rb_intern("incomplete_input"));
04397     sym_universal_newline = ID2SYM(rb_intern("universal_newline"));
04398     sym_crlf_newline = ID2SYM(rb_intern("crlf_newline"));
04399     sym_cr_newline = ID2SYM(rb_intern("cr_newline"));
04400     sym_partial_input = ID2SYM(rb_intern("partial_input"));
04401 
04402 #ifdef ENABLE_ECONV_NEWLINE_OPTION
04403     sym_newline = ID2SYM(rb_intern("newline"));
04404     sym_universal = ID2SYM(rb_intern("universal"));
04405     sym_crlf = ID2SYM(rb_intern("crlf"));
04406     sym_cr = ID2SYM(rb_intern("cr"));
04407     sym_lf = ID2SYM(rb_intern("lf"));
04408 #endif
04409 
04410     rb_define_method(rb_cString, "encode", str_encode, -1);
04411     rb_define_method(rb_cString, "encode!", str_encode_bang, -1);
04412 
04413     rb_cEncodingConverter = rb_define_class_under(rb_cEncoding, "Converter", rb_cData);
04414     rb_define_alloc_func(rb_cEncodingConverter, econv_s_allocate);
04415     rb_define_singleton_method(rb_cEncodingConverter, "asciicompat_encoding", econv_s_asciicompat_encoding, 1);
04416     rb_define_singleton_method(rb_cEncodingConverter, "search_convpath", econv_s_search_convpath, -1);
04417     rb_define_method(rb_cEncodingConverter, "initialize", econv_init, -1);
04418     rb_define_method(rb_cEncodingConverter, "inspect", econv_inspect, 0);
04419     rb_define_method(rb_cEncodingConverter, "convpath", econv_convpath, 0);
04420     rb_define_method(rb_cEncodingConverter, "source_encoding", econv_source_encoding, 0);
04421     rb_define_method(rb_cEncodingConverter, "destination_encoding", econv_destination_encoding, 0);
04422     rb_define_method(rb_cEncodingConverter, "primitive_convert", econv_primitive_convert, -1);
04423     rb_define_method(rb_cEncodingConverter, "convert", econv_convert, 1);
04424     rb_define_method(rb_cEncodingConverter, "finish", econv_finish, 0);
04425     rb_define_method(rb_cEncodingConverter, "primitive_errinfo", econv_primitive_errinfo, 0);
04426     rb_define_method(rb_cEncodingConverter, "insert_output", econv_insert_output, 1);
04427     rb_define_method(rb_cEncodingConverter, "putback", econv_putback, -1);
04428     rb_define_method(rb_cEncodingConverter, "last_error", econv_last_error, 0);
04429     rb_define_method(rb_cEncodingConverter, "replacement", econv_get_replacement, 0);
04430     rb_define_method(rb_cEncodingConverter, "replacement=", econv_set_replacement, 1);
04431     rb_define_method(rb_cEncodingConverter, "==", econv_equal, 1);
04432 
04433     /* Document-const: INVALID_MASK
04434      *
04435      * Mask for invalid byte sequences
04436      */
04437     rb_define_const(rb_cEncodingConverter, "INVALID_MASK", INT2FIX(ECONV_INVALID_MASK));
04438 
04439     /* Document-const: INVALID_REPLACE
04440      *
04441      * Replace invalid byte sequences
04442      */
04443     rb_define_const(rb_cEncodingConverter, "INVALID_REPLACE", INT2FIX(ECONV_INVALID_REPLACE));
04444 
04445     /* Document-const: UNDEF_MASK
04446      *
04447      * Mask for a valid character in the source encoding but no related
04448      * character(s) in destination encoding.
04449      */
04450     rb_define_const(rb_cEncodingConverter, "UNDEF_MASK", INT2FIX(ECONV_UNDEF_MASK));
04451 
04452     /* Document-const: UNDEF_REPLACE
04453      *
04454      * Replace byte sequences that are undefined in the destination encoding.
04455      */
04456     rb_define_const(rb_cEncodingConverter, "UNDEF_REPLACE", INT2FIX(ECONV_UNDEF_REPLACE));
04457 
04458     /* Document-const: UNDEF_HEX_CHARREF
04459      *
04460      * Replace byte sequences that are undefined in the destination encoding
04461      * with an XML hexadecimal character reference.  This is valid for XML
04462      * conversion.
04463      */
04464     rb_define_const(rb_cEncodingConverter, "UNDEF_HEX_CHARREF", INT2FIX(ECONV_UNDEF_HEX_CHARREF));
04465 
04466     /* Document-const: PARTIAL_INPUT
04467      *
04468      * Indicates the source may be part of a larger string.  See
04469      * primitive_convert for an example.
04470      */
04471     rb_define_const(rb_cEncodingConverter, "PARTIAL_INPUT", INT2FIX(ECONV_PARTIAL_INPUT));
04472 
04473     /* Document-const: AFTER_OUTPUT
04474      *
04475      * Stop converting after some output is complete but before all of the
04476      * input was consumed.  See primitive_convert for an example.
04477      */
04478     rb_define_const(rb_cEncodingConverter, "AFTER_OUTPUT", INT2FIX(ECONV_AFTER_OUTPUT));
04479 
04480     /* Document-const: UNIVERSAL_NEWLINE_DECORATOR
04481      *
04482      * Decorator for converting CRLF and CR to LF
04483      */
04484     rb_define_const(rb_cEncodingConverter, "UNIVERSAL_NEWLINE_DECORATOR", INT2FIX(ECONV_UNIVERSAL_NEWLINE_DECORATOR));
04485 
04486     /* Document-const: CRLF_NEWLINE_DECORATOR
04487      *
04488      * Decorator for converting LF to CRLF
04489      */
04490     rb_define_const(rb_cEncodingConverter, "CRLF_NEWLINE_DECORATOR", INT2FIX(ECONV_CRLF_NEWLINE_DECORATOR));
04491 
04492     /* Document-const: CR_NEWLINE_DECORATOR
04493      *
04494      * Decorator for converting LF to CR
04495      */
04496     rb_define_const(rb_cEncodingConverter, "CR_NEWLINE_DECORATOR", INT2FIX(ECONV_CR_NEWLINE_DECORATOR));
04497 
04498     /* Document-const: XML_TEXT_DECORATOR
04499      *
04500      * Escape as XML CharData
04501      */
04502     rb_define_const(rb_cEncodingConverter, "XML_TEXT_DECORATOR", INT2FIX(ECONV_XML_TEXT_DECORATOR));
04503 
04504     /* Document-const: XML_ATTR_CONTENT_DECORATOR
04505      *
04506      * Escape as XML AttValue
04507      */
04508     rb_define_const(rb_cEncodingConverter, "XML_ATTR_CONTENT_DECORATOR", INT2FIX(ECONV_XML_ATTR_CONTENT_DECORATOR));
04509 
04510     /* Document-const: XML_ATTR_QUOTE_DECORATOR
04511      *
04512      * Escape as XML AttValue
04513      */
04514     rb_define_const(rb_cEncodingConverter, "XML_ATTR_QUOTE_DECORATOR", INT2FIX(ECONV_XML_ATTR_QUOTE_DECORATOR));
04515 
04516     rb_define_method(rb_eUndefinedConversionError, "source_encoding_name", ecerr_source_encoding_name, 0);
04517     rb_define_method(rb_eUndefinedConversionError, "destination_encoding_name", ecerr_destination_encoding_name, 0);
04518     rb_define_method(rb_eUndefinedConversionError, "source_encoding", ecerr_source_encoding, 0);
04519     rb_define_method(rb_eUndefinedConversionError, "destination_encoding", ecerr_destination_encoding, 0);
04520     rb_define_method(rb_eUndefinedConversionError, "error_char", ecerr_error_char, 0);
04521 
04522     rb_define_method(rb_eInvalidByteSequenceError, "source_encoding_name", ecerr_source_encoding_name, 0);
04523     rb_define_method(rb_eInvalidByteSequenceError, "destination_encoding_name", ecerr_destination_encoding_name, 0);
04524     rb_define_method(rb_eInvalidByteSequenceError, "source_encoding", ecerr_source_encoding, 0);
04525     rb_define_method(rb_eInvalidByteSequenceError, "destination_encoding", ecerr_destination_encoding, 0);
04526     rb_define_method(rb_eInvalidByteSequenceError, "error_bytes", ecerr_error_bytes, 0);
04527     rb_define_method(rb_eInvalidByteSequenceError, "readagain_bytes", ecerr_readagain_bytes, 0);
04528     rb_define_method(rb_eInvalidByteSequenceError, "incomplete_input?", ecerr_incomplete_input, 0);
04529 
04530     Init_newline();
04531 }
04532