Ruby
2.0.0p247(2013-06-27revision41674)
|
00001 /* 00002 * Copyright (c) 1987, Fujitsu LTD. (Itaru ICHIKAWA). 00003 * Copyright (c) 1996-2010, The nkf Project. 00004 * 00005 * This software is provided 'as-is', without any express or implied 00006 * warranty. In no event will the authors be held liable for any damages 00007 * arising from the use of this software. 00008 * 00009 * Permission is granted to anyone to use this software for any purpose, 00010 * including commercial applications, and to alter it and redistribute it 00011 * freely, subject to the following restrictions: 00012 * 00013 * 1. The origin of this software must not be misrepresented; you must not 00014 * claim that you wrote the original software. If you use this software 00015 * in a product, an acknowledgment in the product documentation would be 00016 * appreciated but is not required. 00017 * 00018 * 2. Altered source versions must be plainly marked as such, and must not be 00019 * misrepresented as being the original software. 00020 * 00021 * 3. This notice may not be removed or altered from any source distribution. 00022 */ 00023 #define NKF_VERSION "2.1.3" 00024 #define NKF_RELEASE_DATE "2012-11-22" 00025 #define COPY_RIGHT \ 00026 "Copyright (C) 1987, FUJITSU LTD. (I.Ichikawa).\n" \ 00027 "Copyright (C) 1996-2012, The nkf Project." 
00028 00029 #include "config.h" 00030 #include "nkf.h" 00031 #include "utf8tbl.h" 00032 #ifdef __WIN32__ 00033 #include <windows.h> 00034 #include <locale.h> 00035 #endif 00036 #if defined(__OS2__) 00037 # define INCL_DOS 00038 # define INCL_DOSERRORS 00039 # include <os2.h> 00040 #endif 00041 #include <assert.h> 00042 00043 00044 /* state of output_mode and input_mode 00045 00046 c2 0 means ASCII 00047 JIS_X_0201_1976_K 00048 ISO_8859_1 00049 JIS_X_0208 00050 EOF all termination 00051 c1 32bit data 00052 00053 */ 00054 00055 /* MIME ENCODE */ 00056 00057 #define FIXED_MIME 7 00058 #define STRICT_MIME 8 00059 00060 /* byte order */ 00061 enum byte_order { 00062 ENDIAN_BIG = 1, 00063 ENDIAN_LITTLE = 2, 00064 ENDIAN_2143 = 3, 00065 ENDIAN_3412 = 4 00066 }; 00067 00068 /* ASCII CODE */ 00069 00070 #define BS 0x08 00071 #define TAB 0x09 00072 #define LF 0x0a 00073 #define CR 0x0d 00074 #define ESC 0x1b 00075 #define SP 0x20 00076 #define DEL 0x7f 00077 #define SI 0x0f 00078 #define SO 0x0e 00079 #define SS2 0x8e 00080 #define SS3 0x8f 00081 #define CRLF 0x0D0A 00082 00083 00084 /* encodings */ 00085 00086 enum nkf_encodings { 00087 ASCII, 00088 ISO_8859_1, 00089 ISO_2022_JP, 00090 CP50220, 00091 CP50221, 00092 CP50222, 00093 ISO_2022_JP_1, 00094 ISO_2022_JP_3, 00095 ISO_2022_JP_2004, 00096 SHIFT_JIS, 00097 WINDOWS_31J, 00098 CP10001, 00099 EUC_JP, 00100 EUCJP_NKF, 00101 CP51932, 00102 EUCJP_MS, 00103 EUCJP_ASCII, 00104 SHIFT_JISX0213, 00105 SHIFT_JIS_2004, 00106 EUC_JISX0213, 00107 EUC_JIS_2004, 00108 UTF_8, 00109 UTF_8N, 00110 UTF_8_BOM, 00111 UTF8_MAC, 00112 UTF_16, 00113 UTF_16BE, 00114 UTF_16BE_BOM, 00115 UTF_16LE, 00116 UTF_16LE_BOM, 00117 UTF_32, 00118 UTF_32BE, 00119 UTF_32BE_BOM, 00120 UTF_32LE, 00121 UTF_32LE_BOM, 00122 BINARY, 00123 NKF_ENCODING_TABLE_SIZE, 00124 JIS_X_0201_1976_K = 0x1013, /* I */ /* JIS C 6220-1969 */ 00125 /* JIS_X_0201_1976_R = 0x1014, */ /* J */ /* JIS C 6220-1969 */ 00126 /* JIS_X_0208_1978 = 0x1040, */ /* @ */ /* JIS C 6226-1978 */ 
00127 /* JIS_X_0208_1983 = 0x1087, */ /* B */ /* JIS C 6226-1983 */ 00128 JIS_X_0208 = 0x1168, /* @B */ 00129 JIS_X_0212 = 0x1159, /* D */ 00130 /* JIS_X_0213_2000_1 = 0x1228, */ /* O */ 00131 JIS_X_0213_2 = 0x1229, /* P */ 00132 JIS_X_0213_1 = 0x1233 /* Q */ 00133 }; 00134 00135 static nkf_char s_iconv(nkf_char c2, nkf_char c1, nkf_char c0); 00136 static nkf_char e_iconv(nkf_char c2, nkf_char c1, nkf_char c0); 00137 static nkf_char w_iconv(nkf_char c2, nkf_char c1, nkf_char c0); 00138 static nkf_char w_iconv16(nkf_char c2, nkf_char c1, nkf_char c0); 00139 static nkf_char w_iconv32(nkf_char c2, nkf_char c1, nkf_char c0); 00140 static void j_oconv(nkf_char c2, nkf_char c1); 00141 static void s_oconv(nkf_char c2, nkf_char c1); 00142 static void e_oconv(nkf_char c2, nkf_char c1); 00143 static void w_oconv(nkf_char c2, nkf_char c1); 00144 static void w_oconv16(nkf_char c2, nkf_char c1); 00145 static void w_oconv32(nkf_char c2, nkf_char c1); 00146 00147 typedef struct { 00148 const char *name; 00149 nkf_char (*iconv)(nkf_char c2, nkf_char c1, nkf_char c0); 00150 void (*oconv)(nkf_char c2, nkf_char c1); 00151 } nkf_native_encoding; 00152 00153 nkf_native_encoding NkfEncodingASCII = { "ASCII", e_iconv, e_oconv }; 00154 nkf_native_encoding NkfEncodingISO_2022_JP = { "ISO-2022-JP", e_iconv, j_oconv }; 00155 nkf_native_encoding NkfEncodingShift_JIS = { "Shift_JIS", s_iconv, s_oconv }; 00156 nkf_native_encoding NkfEncodingEUC_JP = { "EUC-JP", e_iconv, e_oconv }; 00157 nkf_native_encoding NkfEncodingUTF_8 = { "UTF-8", w_iconv, w_oconv }; 00158 nkf_native_encoding NkfEncodingUTF_16 = { "UTF-16", w_iconv16, w_oconv16 }; 00159 nkf_native_encoding NkfEncodingUTF_32 = { "UTF-32", w_iconv32, w_oconv32 }; 00160 00161 typedef struct { 00162 const int id; 00163 const char *name; 00164 const nkf_native_encoding *base_encoding; 00165 } nkf_encoding; 00166 00167 nkf_encoding nkf_encoding_table[] = { 00168 {ASCII, "US-ASCII", &NkfEncodingASCII}, 00169 {ISO_8859_1, "ISO-8859-1", 
&NkfEncodingASCII}, 00170 {ISO_2022_JP, "ISO-2022-JP", &NkfEncodingISO_2022_JP}, 00171 {CP50220, "CP50220", &NkfEncodingISO_2022_JP}, 00172 {CP50221, "CP50221", &NkfEncodingISO_2022_JP}, 00173 {CP50222, "CP50222", &NkfEncodingISO_2022_JP}, 00174 {ISO_2022_JP_1, "ISO-2022-JP-1", &NkfEncodingISO_2022_JP}, 00175 {ISO_2022_JP_3, "ISO-2022-JP-3", &NkfEncodingISO_2022_JP}, 00176 {ISO_2022_JP_2004, "ISO-2022-JP-2004", &NkfEncodingISO_2022_JP}, 00177 {SHIFT_JIS, "Shift_JIS", &NkfEncodingShift_JIS}, 00178 {WINDOWS_31J, "Windows-31J", &NkfEncodingShift_JIS}, 00179 {CP10001, "CP10001", &NkfEncodingShift_JIS}, 00180 {EUC_JP, "EUC-JP", &NkfEncodingEUC_JP}, 00181 {EUCJP_NKF, "eucJP-nkf", &NkfEncodingEUC_JP}, 00182 {CP51932, "CP51932", &NkfEncodingEUC_JP}, 00183 {EUCJP_MS, "eucJP-MS", &NkfEncodingEUC_JP}, 00184 {EUCJP_ASCII, "eucJP-ASCII", &NkfEncodingEUC_JP}, 00185 {SHIFT_JISX0213, "Shift_JISX0213", &NkfEncodingShift_JIS}, 00186 {SHIFT_JIS_2004, "Shift_JIS-2004", &NkfEncodingShift_JIS}, 00187 {EUC_JISX0213, "EUC-JISX0213", &NkfEncodingEUC_JP}, 00188 {EUC_JIS_2004, "EUC-JIS-2004", &NkfEncodingEUC_JP}, 00189 {UTF_8, "UTF-8", &NkfEncodingUTF_8}, 00190 {UTF_8N, "UTF-8N", &NkfEncodingUTF_8}, 00191 {UTF_8_BOM, "UTF-8-BOM", &NkfEncodingUTF_8}, 00192 {UTF8_MAC, "UTF8-MAC", &NkfEncodingUTF_8}, 00193 {UTF_16, "UTF-16", &NkfEncodingUTF_16}, 00194 {UTF_16BE, "UTF-16BE", &NkfEncodingUTF_16}, 00195 {UTF_16BE_BOM, "UTF-16BE-BOM", &NkfEncodingUTF_16}, 00196 {UTF_16LE, "UTF-16LE", &NkfEncodingUTF_16}, 00197 {UTF_16LE_BOM, "UTF-16LE-BOM", &NkfEncodingUTF_16}, 00198 {UTF_32, "UTF-32", &NkfEncodingUTF_32}, 00199 {UTF_32BE, "UTF-32BE", &NkfEncodingUTF_32}, 00200 {UTF_32BE_BOM, "UTF-32BE-BOM", &NkfEncodingUTF_32}, 00201 {UTF_32LE, "UTF-32LE", &NkfEncodingUTF_32}, 00202 {UTF_32LE_BOM, "UTF-32LE-BOM", &NkfEncodingUTF_32}, 00203 {BINARY, "BINARY", &NkfEncodingASCII}, 00204 {-1, NULL, NULL} 00205 }; 00206 00207 struct { 00208 const char *name; 00209 const int id; 00210 } encoding_name_to_id_table[] = { 
00211 {"US-ASCII", ASCII}, 00212 {"ASCII", ASCII}, 00213 {"646", ASCII}, 00214 {"ROMAN8", ASCII}, 00215 {"ISO-2022-JP", ISO_2022_JP}, 00216 {"ISO2022JP-CP932", CP50220}, 00217 {"CP50220", CP50220}, 00218 {"CP50221", CP50221}, 00219 {"CSISO2022JP", CP50221}, 00220 {"CP50222", CP50222}, 00221 {"ISO-2022-JP-1", ISO_2022_JP_1}, 00222 {"ISO-2022-JP-3", ISO_2022_JP_3}, 00223 {"ISO-2022-JP-2004", ISO_2022_JP_2004}, 00224 {"SHIFT_JIS", SHIFT_JIS}, 00225 {"SJIS", SHIFT_JIS}, 00226 {"MS_Kanji", SHIFT_JIS}, 00227 {"PCK", SHIFT_JIS}, 00228 {"WINDOWS-31J", WINDOWS_31J}, 00229 {"CSWINDOWS31J", WINDOWS_31J}, 00230 {"CP932", WINDOWS_31J}, 00231 {"MS932", WINDOWS_31J}, 00232 {"CP10001", CP10001}, 00233 {"EUCJP", EUC_JP}, 00234 {"EUC-JP", EUC_JP}, 00235 {"EUCJP-NKF", EUCJP_NKF}, 00236 {"CP51932", CP51932}, 00237 {"EUC-JP-MS", EUCJP_MS}, 00238 {"EUCJP-MS", EUCJP_MS}, 00239 {"EUCJPMS", EUCJP_MS}, 00240 {"EUC-JP-ASCII", EUCJP_ASCII}, 00241 {"EUCJP-ASCII", EUCJP_ASCII}, 00242 {"SHIFT_JISX0213", SHIFT_JISX0213}, 00243 {"SHIFT_JIS-2004", SHIFT_JIS_2004}, 00244 {"EUC-JISX0213", EUC_JISX0213}, 00245 {"EUC-JIS-2004", EUC_JIS_2004}, 00246 {"UTF-8", UTF_8}, 00247 {"UTF-8N", UTF_8N}, 00248 {"UTF-8-BOM", UTF_8_BOM}, 00249 {"UTF8-MAC", UTF8_MAC}, 00250 {"UTF-8-MAC", UTF8_MAC}, 00251 {"UTF-16", UTF_16}, 00252 {"UTF-16BE", UTF_16BE}, 00253 {"UTF-16BE-BOM", UTF_16BE_BOM}, 00254 {"UTF-16LE", UTF_16LE}, 00255 {"UTF-16LE-BOM", UTF_16LE_BOM}, 00256 {"UTF-32", UTF_32}, 00257 {"UTF-32BE", UTF_32BE}, 00258 {"UTF-32BE-BOM", UTF_32BE_BOM}, 00259 {"UTF-32LE", UTF_32LE}, 00260 {"UTF-32LE-BOM", UTF_32LE_BOM}, 00261 {"BINARY", BINARY}, 00262 {NULL, -1} 00263 }; 00264 00265 #if defined(DEFAULT_CODE_JIS) 00266 #define DEFAULT_ENCIDX ISO_2022_JP 00267 #elif defined(DEFAULT_CODE_SJIS) 00268 #define DEFAULT_ENCIDX SHIFT_JIS 00269 #elif defined(DEFAULT_CODE_WINDOWS_31J) 00270 #define DEFAULT_ENCIDX WINDOWS_31J 00271 #elif defined(DEFAULT_CODE_EUC) 00272 #define DEFAULT_ENCIDX EUC_JP 00273 #elif 
defined(DEFAULT_CODE_UTF8) 00274 #define DEFAULT_ENCIDX UTF_8 00275 #endif 00276 00277 00278 #define is_alnum(c) \ 00279 (('a'<=c && c<='z')||('A'<= c && c<='Z')||('0'<=c && c<='9')) 00280 00281 /* I don't trust portablity of toupper */ 00282 #define nkf_toupper(c) (('a'<=c && c<='z')?(c-('a'-'A')):c) 00283 #define nkf_isoctal(c) ('0'<=c && c<='7') 00284 #define nkf_isdigit(c) ('0'<=c && c<='9') 00285 #define nkf_isxdigit(c) (nkf_isdigit(c) || ('a'<=c && c<='f') || ('A'<=c && c <= 'F')) 00286 #define nkf_isblank(c) (c == SP || c == TAB) 00287 #define nkf_isspace(c) (nkf_isblank(c) || c == CR || c == LF) 00288 #define nkf_isalpha(c) (('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z')) 00289 #define nkf_isalnum(c) (nkf_isdigit(c) || nkf_isalpha(c)) 00290 #define nkf_isprint(c) (SP<=c && c<='~') 00291 #define nkf_isgraph(c) ('!'<=c && c<='~') 00292 #define hex2bin(c) (('0'<=c&&c<='9') ? (c-'0') : \ 00293 ('A'<=c&&c<='F') ? (c-'A'+10) : \ 00294 ('a'<=c&&c<='f') ? (c-'a'+10) : 0) 00295 #define bin2hex(c) ("0123456789ABCDEF"[c&15]) 00296 #define is_eucg3(c2) (((unsigned short)c2 >> 8) == SS3) 00297 #define nkf_noescape_mime(c) ((c == CR) || (c == LF) || \ 00298 ((c > SP) && (c < DEL) && (c != '?') && (c != '=') && (c != '_') \ 00299 && (c != '(') && (c != ')') && (c != '.') && (c != 0x22))) 00300 00301 #define is_ibmext_in_sjis(c2) (CP932_TABLE_BEGIN <= c2 && c2 <= CP932_TABLE_END) 00302 #define nkf_byte_jisx0201_katakana_p(c) (SP <= c && c <= 0x5F) 00303 00304 #define HOLD_SIZE 1024 00305 #if defined(INT_IS_SHORT) 00306 #define IOBUF_SIZE 2048 00307 #else 00308 #define IOBUF_SIZE 16384 00309 #endif 00310 00311 #define DEFAULT_J 'B' 00312 #define DEFAULT_R 'B' 00313 00314 00315 #define GETA1 0x22 00316 #define GETA2 0x2e 00317 00318 00319 /* MIME preprocessor */ 00320 00321 #ifdef EASYWIN /*Easy Win */ 00322 extern POINT _BufferSize; 00323 #endif 00324 00325 struct input_code{ 00326 const char *name; 00327 nkf_char stat; 00328 nkf_char score; 00329 nkf_char index; 00330 
nkf_char buf[3]; 00331 void (*status_func)(struct input_code *, nkf_char); 00332 nkf_char (*iconv_func)(nkf_char c2, nkf_char c1, nkf_char c0); 00333 int _file_stat; 00334 }; 00335 00336 static const char *input_codename = NULL; /* NULL: unestablished, "": BINARY */ 00337 static nkf_encoding *input_encoding = NULL; 00338 static nkf_encoding *output_encoding = NULL; 00339 00340 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE) 00341 /* UCS Mapping 00342 * 0: Shift_JIS, eucJP-ascii 00343 * 1: eucJP-ms 00344 * 2: CP932, CP51932 00345 * 3: CP10001 00346 */ 00347 #define UCS_MAP_ASCII 0 00348 #define UCS_MAP_MS 1 00349 #define UCS_MAP_CP932 2 00350 #define UCS_MAP_CP10001 3 00351 static int ms_ucs_map_f = UCS_MAP_ASCII; 00352 #endif 00353 #ifdef UTF8_INPUT_ENABLE 00354 /* no NEC special, NEC-selected IBM extended and IBM extended characters */ 00355 static int no_cp932ext_f = FALSE; 00356 /* ignore ZERO WIDTH NO-BREAK SPACE */ 00357 static int no_best_fit_chars_f = FALSE; 00358 static int input_endian = ENDIAN_BIG; 00359 static int input_bom_f = FALSE; 00360 static nkf_char unicode_subchar = '?'; /* the regular substitution character */ 00361 static void (*encode_fallback)(nkf_char c) = NULL; 00362 static void w_status(struct input_code *, nkf_char); 00363 #endif 00364 #ifdef UTF8_OUTPUT_ENABLE 00365 static int output_bom_f = FALSE; 00366 static int output_endian = ENDIAN_BIG; 00367 #endif 00368 00369 static void std_putc(nkf_char c); 00370 static nkf_char std_getc(FILE *f); 00371 static nkf_char std_ungetc(nkf_char c,FILE *f); 00372 00373 static nkf_char broken_getc(FILE *f); 00374 static nkf_char broken_ungetc(nkf_char c,FILE *f); 00375 00376 static nkf_char mime_getc(FILE *f); 00377 00378 static void mime_putc(nkf_char c); 00379 00380 /* buffers */ 00381 00382 #if !defined(PERL_XS) && !defined(WIN32DLL) 00383 static unsigned char stdibuf[IOBUF_SIZE]; 00384 static unsigned char stdobuf[IOBUF_SIZE]; 00385 #endif 00386 00387 #define NKF_UNSPECIFIED (-TRUE) 
00388 00389 /* flags */ 00390 static int unbuf_f = FALSE; 00391 static int estab_f = FALSE; 00392 static int nop_f = FALSE; 00393 static int binmode_f = TRUE; /* binary mode */ 00394 static int rot_f = FALSE; /* rot14/43 mode */ 00395 static int hira_f = FALSE; /* hira/kata henkan */ 00396 static int alpha_f = FALSE; /* convert JIx0208 alphbet to ASCII */ 00397 static int mime_f = MIME_DECODE_DEFAULT; /* convert MIME B base64 or Q */ 00398 static int mime_decode_f = FALSE; /* mime decode is explicitly on */ 00399 static int mimebuf_f = FALSE; /* MIME buffered input */ 00400 static int broken_f = FALSE; /* convert ESC-less broken JIS */ 00401 static int iso8859_f = FALSE; /* ISO8859 through */ 00402 static int mimeout_f = FALSE; /* base64 mode */ 00403 static int x0201_f = NKF_UNSPECIFIED; /* convert JIS X 0201 */ 00404 static int iso2022jp_f = FALSE; /* replace non ISO-2022-JP with GETA */ 00405 00406 #ifdef UNICODE_NORMALIZATION 00407 static int nfc_f = FALSE; 00408 static nkf_char (*i_nfc_getc)(FILE *) = std_getc; /* input of ugetc */ 00409 static nkf_char (*i_nfc_ungetc)(nkf_char c ,FILE *f) = std_ungetc; 00410 #endif 00411 00412 #ifdef INPUT_OPTION 00413 static int cap_f = FALSE; 00414 static nkf_char (*i_cgetc)(FILE *) = std_getc; /* input of cgetc */ 00415 static nkf_char (*i_cungetc)(nkf_char c ,FILE *f) = std_ungetc; 00416 00417 static int url_f = FALSE; 00418 static nkf_char (*i_ugetc)(FILE *) = std_getc; /* input of ugetc */ 00419 static nkf_char (*i_uungetc)(nkf_char c ,FILE *f) = std_ungetc; 00420 #endif 00421 00422 #define PREFIX_EUCG3 NKF_INT32_C(0x8F00) 00423 #define CLASS_MASK NKF_INT32_C(0xFF000000) 00424 #define CLASS_UNICODE NKF_INT32_C(0x01000000) 00425 #define VALUE_MASK NKF_INT32_C(0x00FFFFFF) 00426 #define UNICODE_BMP_MAX NKF_INT32_C(0x0000FFFF) 00427 #define UNICODE_MAX NKF_INT32_C(0x0010FFFF) 00428 #define nkf_char_euc3_new(c) ((c) | PREFIX_EUCG3) 00429 #define nkf_char_unicode_new(c) ((c) | CLASS_UNICODE) 00430 #define 
nkf_char_unicode_p(c) ((c & CLASS_MASK) == CLASS_UNICODE) 00431 #define nkf_char_unicode_bmp_p(c) ((c & VALUE_MASK) <= UNICODE_BMP_MAX) 00432 #define nkf_char_unicode_value_p(c) ((c & VALUE_MASK) <= UNICODE_MAX) 00433 00434 #define UTF16_TO_UTF32(lead, trail) (((lead) << 10) + (trail) - NKF_INT32_C(0x35FDC00)) 00435 00436 #ifdef NUMCHAR_OPTION 00437 static int numchar_f = FALSE; 00438 static nkf_char (*i_ngetc)(FILE *) = std_getc; /* input of ugetc */ 00439 static nkf_char (*i_nungetc)(nkf_char c ,FILE *f) = std_ungetc; 00440 #endif 00441 00442 #ifdef CHECK_OPTION 00443 static int noout_f = FALSE; 00444 static void no_putc(nkf_char c); 00445 static int debug_f = FALSE; 00446 static void debug(const char *str); 00447 static nkf_char (*iconv_for_check)(nkf_char c2,nkf_char c1,nkf_char c0) = 0; 00448 #endif 00449 00450 static int guess_f = 0; /* 0: OFF, 1: ON, 2: VERBOSE */ 00451 static void set_input_codename(const char *codename); 00452 00453 #ifdef EXEC_IO 00454 static int exec_f = 0; 00455 #endif 00456 00457 #ifdef SHIFTJIS_CP932 00458 /* invert IBM extended characters to others */ 00459 static int cp51932_f = FALSE; 00460 00461 /* invert NEC-selected IBM extended characters to IBM extended characters */ 00462 static int cp932inv_f = TRUE; 00463 00464 /* static nkf_char cp932_conv(nkf_char c2, nkf_char c1); */ 00465 #endif /* SHIFTJIS_CP932 */ 00466 00467 static int x0212_f = FALSE; 00468 static int x0213_f = FALSE; 00469 00470 static unsigned char prefix_table[256]; 00471 00472 static void e_status(struct input_code *, nkf_char); 00473 static void s_status(struct input_code *, nkf_char); 00474 00475 struct input_code input_code_list[] = { 00476 {"EUC-JP", 0, 0, 0, {0, 0, 0}, e_status, e_iconv, 0}, 00477 {"Shift_JIS", 0, 0, 0, {0, 0, 0}, s_status, s_iconv, 0}, 00478 #ifdef UTF8_INPUT_ENABLE 00479 {"UTF-8", 0, 0, 0, {0, 0, 0}, w_status, w_iconv, 0}, 00480 {"UTF-16", 0, 0, 0, {0, 0, 0}, NULL, w_iconv16, 0}, 00481 {"UTF-32", 0, 0, 0, {0, 0, 0}, NULL, w_iconv32, 0}, 
00482 #endif 00483 {NULL, 0, 0, 0, {0, 0, 0}, NULL, NULL, 0} 00484 }; 00485 00486 static int mimeout_mode = 0; /* 0, -1, 'Q', 'B', 1, 2 */ 00487 static int base64_count = 0; 00488 00489 /* X0208 -> ASCII converter */ 00490 00491 /* fold parameter */ 00492 static int f_line = 0; /* chars in line */ 00493 static int f_prev = 0; 00494 static int fold_preserve_f = FALSE; /* preserve new lines */ 00495 static int fold_f = FALSE; 00496 static int fold_len = 0; 00497 00498 /* options */ 00499 static unsigned char kanji_intro = DEFAULT_J; 00500 static unsigned char ascii_intro = DEFAULT_R; 00501 00502 /* Folding */ 00503 00504 #define FOLD_MARGIN 10 00505 #define DEFAULT_FOLD 60 00506 00507 static int fold_margin = FOLD_MARGIN; 00508 00509 /* process default */ 00510 00511 static nkf_char 00512 no_connection2(ARG_UNUSED nkf_char c2, ARG_UNUSED nkf_char c1, ARG_UNUSED nkf_char c0) 00513 { 00514 fprintf(stderr,"nkf internal module connection failure.\n"); 00515 exit(EXIT_FAILURE); 00516 return 0; /* LINT */ 00517 } 00518 00519 static void 00520 no_connection(nkf_char c2, nkf_char c1) 00521 { 00522 no_connection2(c2,c1,0); 00523 } 00524 00525 static nkf_char (*iconv)(nkf_char c2,nkf_char c1,nkf_char c0) = no_connection2; 00526 static void (*oconv)(nkf_char c2,nkf_char c1) = no_connection; 00527 00528 static void (*o_zconv)(nkf_char c2,nkf_char c1) = no_connection; 00529 static void (*o_fconv)(nkf_char c2,nkf_char c1) = no_connection; 00530 static void (*o_eol_conv)(nkf_char c2,nkf_char c1) = no_connection; 00531 static void (*o_rot_conv)(nkf_char c2,nkf_char c1) = no_connection; 00532 static void (*o_hira_conv)(nkf_char c2,nkf_char c1) = no_connection; 00533 static void (*o_base64conv)(nkf_char c2,nkf_char c1) = no_connection; 00534 static void (*o_iso2022jp_check_conv)(nkf_char c2,nkf_char c1) = no_connection; 00535 00536 /* static redirections */ 00537 00538 static void (*o_putc)(nkf_char c) = std_putc; 00539 00540 static nkf_char (*i_getc)(FILE *f) = std_getc; /* general 
input */ 00541 static nkf_char (*i_ungetc)(nkf_char c,FILE *f) =std_ungetc; 00542 00543 static nkf_char (*i_bgetc)(FILE *) = std_getc; /* input of mgetc */ 00544 static nkf_char (*i_bungetc)(nkf_char c ,FILE *f) = std_ungetc; 00545 00546 static void (*o_mputc)(nkf_char c) = std_putc ; /* output of mputc */ 00547 00548 static nkf_char (*i_mgetc)(FILE *) = std_getc; /* input of mgetc */ 00549 static nkf_char (*i_mungetc)(nkf_char c ,FILE *f) = std_ungetc; 00550 00551 /* for strict mime */ 00552 static nkf_char (*i_mgetc_buf)(FILE *) = std_getc; /* input of mgetc_buf */ 00553 static nkf_char (*i_mungetc_buf)(nkf_char c,FILE *f) = std_ungetc; 00554 00555 /* Global states */ 00556 static int output_mode = ASCII; /* output kanji mode */ 00557 static int input_mode = ASCII; /* input kanji mode */ 00558 static int mime_decode_mode = FALSE; /* MIME mode B base64, Q hex */ 00559 00560 /* X0201 / X0208 conversion tables */ 00561 00562 /* X0201 kana conversion table */ 00563 /* 90-9F A0-DF */ 00564 static const unsigned char cv[]= { 00565 0x21,0x21,0x21,0x23,0x21,0x56,0x21,0x57, 00566 0x21,0x22,0x21,0x26,0x25,0x72,0x25,0x21, 00567 0x25,0x23,0x25,0x25,0x25,0x27,0x25,0x29, 00568 0x25,0x63,0x25,0x65,0x25,0x67,0x25,0x43, 00569 0x21,0x3c,0x25,0x22,0x25,0x24,0x25,0x26, 00570 0x25,0x28,0x25,0x2a,0x25,0x2b,0x25,0x2d, 00571 0x25,0x2f,0x25,0x31,0x25,0x33,0x25,0x35, 00572 0x25,0x37,0x25,0x39,0x25,0x3b,0x25,0x3d, 00573 0x25,0x3f,0x25,0x41,0x25,0x44,0x25,0x46, 00574 0x25,0x48,0x25,0x4a,0x25,0x4b,0x25,0x4c, 00575 0x25,0x4d,0x25,0x4e,0x25,0x4f,0x25,0x52, 00576 0x25,0x55,0x25,0x58,0x25,0x5b,0x25,0x5e, 00577 0x25,0x5f,0x25,0x60,0x25,0x61,0x25,0x62, 00578 0x25,0x64,0x25,0x66,0x25,0x68,0x25,0x69, 00579 0x25,0x6a,0x25,0x6b,0x25,0x6c,0x25,0x6d, 00580 0x25,0x6f,0x25,0x73,0x21,0x2b,0x21,0x2c, 00581 0x00,0x00}; 00582 00583 00584 /* X0201 kana conversion table for daguten */ 00585 /* 90-9F A0-DF */ 00586 static const unsigned char dv[]= { 00587 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 00588 
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 00589 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 00590 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 00591 0x00,0x00,0x00,0x00,0x00,0x00,0x25,0x74, 00592 0x00,0x00,0x00,0x00,0x25,0x2c,0x25,0x2e, 00593 0x25,0x30,0x25,0x32,0x25,0x34,0x25,0x36, 00594 0x25,0x38,0x25,0x3a,0x25,0x3c,0x25,0x3e, 00595 0x25,0x40,0x25,0x42,0x25,0x45,0x25,0x47, 00596 0x25,0x49,0x00,0x00,0x00,0x00,0x00,0x00, 00597 0x00,0x00,0x00,0x00,0x25,0x50,0x25,0x53, 00598 0x25,0x56,0x25,0x59,0x25,0x5c,0x00,0x00, 00599 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 00600 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 00601 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 00602 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 00603 0x00,0x00}; 00604 00605 /* X0201 kana conversion table for han-daguten */ 00606 /* 90-9F A0-DF */ 00607 static const unsigned char ev[]= { 00608 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 00609 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 00610 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 00611 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 00612 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 00613 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 00614 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 00615 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 00616 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 00617 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 00618 0x00,0x00,0x00,0x00,0x25,0x51,0x25,0x54, 00619 0x25,0x57,0x25,0x5a,0x25,0x5d,0x00,0x00, 00620 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 00621 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 00622 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 00623 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 00624 0x00,0x00}; 00625 00626 /* X0201 kana to X0213 conversion table for han-daguten */ 00627 /* 90-9F A0-DF */ 00628 static const unsigned char ev_x0213[]= { 00629 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 00630 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 00631 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 00632 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 00633 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 00634 
0x00,0x00,0x00,0x00,0x25,0x77,0x25,0x78, 00635 0x25,0x79,0x25,0x7a,0x25,0x7b,0x00,0x00, 00636 0x00,0x00,0x00,0x00,0x25,0x7c,0x00,0x00, 00637 0x00,0x00,0x00,0x00,0x25,0x7d,0x00,0x00, 00638 0x25,0x7e,0x00,0x00,0x00,0x00,0x00,0x00, 00639 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 00640 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 00641 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 00642 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 00643 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 00644 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 00645 0x00,0x00}; 00646 00647 00648 /* X0208 kigou conversion table */ 00649 /* 0x8140 - 0x819e */ 00650 static const unsigned char fv[] = { 00651 00652 0x00,0x00,0x00,0x00,0x2c,0x2e,0x00,0x3a, 00653 0x3b,0x3f,0x21,0x00,0x00,0x27,0x60,0x00, 00654 0x5e,0x00,0x5f,0x00,0x00,0x00,0x00,0x00, 00655 0x00,0x00,0x00,0x00,0x00,0x2d,0x00,0x2f, 00656 0x5c,0x00,0x00,0x7c,0x00,0x00,0x60,0x27, 00657 0x22,0x22,0x28,0x29,0x00,0x00,0x5b,0x5d, 00658 0x7b,0x7d,0x3c,0x3e,0x00,0x00,0x00,0x00, 00659 0x00,0x00,0x00,0x00,0x2b,0x2d,0x00,0x00, 00660 0x00,0x3d,0x00,0x3c,0x3e,0x00,0x00,0x00, 00661 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 00662 0x24,0x00,0x00,0x25,0x23,0x26,0x2a,0x40, 00663 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 00664 } ; 00665 00666 00667 00668 static int option_mode = 0; 00669 static int file_out_f = FALSE; 00670 #ifdef OVERWRITE 00671 static int overwrite_f = FALSE; 00672 static int preserve_time_f = FALSE; 00673 static int backup_f = FALSE; 00674 static char *backup_suffix = ""; 00675 #endif 00676 00677 static int eolmode_f = 0; /* CR, LF, CRLF */ 00678 static int input_eol = 0; /* 0: unestablished, EOF: MIXED */ 00679 static nkf_char prev_cr = 0; /* CR or 0 */ 00680 #ifdef EASYWIN /*Easy Win */ 00681 static int end_check; 00682 #endif /*Easy Win */ 00683 00684 static void * 00685 nkf_xmalloc(size_t size) 00686 { 00687 void *ptr; 00688 00689 if (size == 0) size = 1; 00690 00691 ptr = malloc(size); 00692 if (ptr == NULL) { 00693 perror("can't malloc"); 00694 
exit(EXIT_FAILURE); 00695 } 00696 00697 return ptr; 00698 } 00699 00700 static void * 00701 nkf_xrealloc(void *ptr, size_t size) 00702 { 00703 if (size == 0) size = 1; 00704 00705 ptr = realloc(ptr, size); 00706 if (ptr == NULL) { 00707 perror("can't realloc"); 00708 exit(EXIT_FAILURE); 00709 } 00710 00711 return ptr; 00712 } 00713 00714 #define nkf_xfree(ptr) free(ptr) 00715 00716 static int 00717 nkf_str_caseeql(const char *src, const char *target) 00718 { 00719 int i; 00720 for (i = 0; src[i] && target[i]; i++) { 00721 if (nkf_toupper(src[i]) != nkf_toupper(target[i])) return FALSE; 00722 } 00723 if (src[i] || target[i]) return FALSE; 00724 else return TRUE; 00725 } 00726 00727 static nkf_encoding* 00728 nkf_enc_from_index(int idx) 00729 { 00730 if (idx < 0 || NKF_ENCODING_TABLE_SIZE <= idx) { 00731 return 0; 00732 } 00733 return &nkf_encoding_table[idx]; 00734 } 00735 00736 static int 00737 nkf_enc_find_index(const char *name) 00738 { 00739 int i; 00740 if (name[0] == 'X' && *(name+1) == '-') name += 2; 00741 for (i = 0; encoding_name_to_id_table[i].id >= 0; i++) { 00742 if (nkf_str_caseeql(encoding_name_to_id_table[i].name, name)) { 00743 return encoding_name_to_id_table[i].id; 00744 } 00745 } 00746 return -1; 00747 } 00748 00749 static nkf_encoding* 00750 nkf_enc_find(const char *name) 00751 { 00752 int idx = -1; 00753 idx = nkf_enc_find_index(name); 00754 if (idx < 0) return 0; 00755 return nkf_enc_from_index(idx); 00756 } 00757 00758 #define nkf_enc_name(enc) (enc)->name 00759 #define nkf_enc_to_index(enc) (enc)->id 00760 #define nkf_enc_to_base_encoding(enc) (enc)->base_encoding 00761 #define nkf_enc_to_iconv(enc) nkf_enc_to_base_encoding(enc)->iconv 00762 #define nkf_enc_to_oconv(enc) nkf_enc_to_base_encoding(enc)->oconv 00763 #define nkf_enc_asciicompat(enc) (\ 00764 nkf_enc_to_base_encoding(enc) == &NkfEncodingASCII ||\ 00765 nkf_enc_to_base_encoding(enc) == &NkfEncodingISO_2022_JP) 00766 #define nkf_enc_unicode_p(enc) (\ 00767 
nkf_enc_to_base_encoding(enc) == &NkfEncodingUTF_8 ||\ 00768 nkf_enc_to_base_encoding(enc) == &NkfEncodingUTF_16 ||\ 00769 nkf_enc_to_base_encoding(enc) == &NkfEncodingUTF_32) 00770 #define nkf_enc_cp5022x_p(enc) (\ 00771 nkf_enc_to_index(enc) == CP50220 ||\ 00772 nkf_enc_to_index(enc) == CP50221 ||\ 00773 nkf_enc_to_index(enc) == CP50222) 00774 00775 #ifdef DEFAULT_CODE_LOCALE 00776 static const char* 00777 nkf_locale_charmap() 00778 { 00779 #ifdef HAVE_LANGINFO_H 00780 return nl_langinfo(CODESET); 00781 #elif defined(__WIN32__) 00782 static char buf[16]; 00783 sprintf(buf, "CP%d", GetACP()); 00784 return buf; 00785 #elif defined(__OS2__) 00786 # if defined(INT_IS_SHORT) 00787 /* OS/2 1.x */ 00788 return NULL; 00789 # else 00790 /* OS/2 32bit */ 00791 static char buf[16]; 00792 ULONG ulCP[1], ulncp; 00793 DosQueryCp(sizeof(ulCP), ulCP, &ulncp); 00794 if (ulCP[0] == 932 || ulCP[0] == 943) 00795 strcpy(buf, "Shift_JIS"); 00796 else 00797 sprintf(buf, "CP%lu", ulCP[0]); 00798 return buf; 00799 # endif 00800 #endif 00801 return NULL; 00802 } 00803 00804 static nkf_encoding* 00805 nkf_locale_encoding() 00806 { 00807 nkf_encoding *enc = 0; 00808 const char *encname = nkf_locale_charmap(); 00809 if (encname) 00810 enc = nkf_enc_find(encname); 00811 return enc; 00812 } 00813 #endif /* DEFAULT_CODE_LOCALE */ 00814 00815 static nkf_encoding* 00816 nkf_utf8_encoding() 00817 { 00818 return &nkf_encoding_table[UTF_8]; 00819 } 00820 00821 static nkf_encoding* 00822 nkf_default_encoding() 00823 { 00824 nkf_encoding *enc = 0; 00825 #ifdef DEFAULT_CODE_LOCALE 00826 enc = nkf_locale_encoding(); 00827 #elif defined(DEFAULT_ENCIDX) 00828 enc = nkf_enc_from_index(DEFAULT_ENCIDX); 00829 #endif 00830 if (!enc) enc = nkf_utf8_encoding(); 00831 return enc; 00832 } 00833 00834 typedef struct { 00835 long capa; 00836 long len; 00837 nkf_char *ptr; 00838 } nkf_buf_t; 00839 00840 static nkf_buf_t * 00841 nkf_buf_new(int length) 00842 { 00843 nkf_buf_t *buf = nkf_xmalloc(sizeof(nkf_buf_t)); 
00844 buf->ptr = nkf_xmalloc(sizeof(nkf_char) * length); 00845 buf->capa = length; 00846 buf->len = 0; 00847 return buf; 00848 } 00849 00850 #if 0 00851 static void 00852 nkf_buf_dispose(nkf_buf_t *buf) 00853 { 00854 nkf_xfree(buf->ptr); 00855 nkf_xfree(buf); 00856 } 00857 #endif 00858 00859 #define nkf_buf_length(buf) ((buf)->len) 00860 #define nkf_buf_empty_p(buf) ((buf)->len == 0) 00861 00862 static nkf_char 00863 nkf_buf_at(nkf_buf_t *buf, int index) 00864 { 00865 assert(index <= buf->len); 00866 return buf->ptr[index]; 00867 } 00868 00869 static void 00870 nkf_buf_clear(nkf_buf_t *buf) 00871 { 00872 buf->len = 0; 00873 } 00874 00875 static void 00876 nkf_buf_push(nkf_buf_t *buf, nkf_char c) 00877 { 00878 if (buf->capa <= buf->len) { 00879 exit(EXIT_FAILURE); 00880 } 00881 buf->ptr[buf->len++] = c; 00882 } 00883 00884 static nkf_char 00885 nkf_buf_pop(nkf_buf_t *buf) 00886 { 00887 assert(!nkf_buf_empty_p(buf)); 00888 return buf->ptr[--buf->len]; 00889 } 00890 00891 /* Normalization Form C */ 00892 #ifndef PERL_XS 00893 #ifdef WIN32DLL 00894 #define fprintf dllprintf 00895 #endif 00896 00897 static void 00898 version(void) 00899 { 00900 fprintf(HELP_OUTPUT,"Network Kanji Filter Version " NKF_VERSION " (" NKF_RELEASE_DATE ") \n" COPY_RIGHT "\n"); 00901 } 00902 00903 static void 00904 usage(void) 00905 { 00906 fprintf(HELP_OUTPUT, 00907 "Usage: nkf -[flags] [--] [in file] .. 
[out file for -O flag]\n" 00908 #ifdef UTF8_OUTPUT_ENABLE 00909 " j/s/e/w Specify output encoding ISO-2022-JP, Shift_JIS, EUC-JP\n" 00910 " UTF options is -w[8[0],{16,32}[{B,L}[0]]]\n" 00911 #else 00912 #endif 00913 #ifdef UTF8_INPUT_ENABLE 00914 " J/S/E/W Specify input encoding ISO-2022-JP, Shift_JIS, EUC-JP\n" 00915 " UTF option is -W[8,[16,32][B,L]]\n" 00916 #else 00917 " J/S/E Specify output encoding ISO-2022-JP, Shift_JIS, EUC-JP\n" 00918 #endif 00919 ); 00920 fprintf(HELP_OUTPUT, 00921 " m[BQSN0] MIME decode [B:base64,Q:quoted,S:strict,N:nonstrict,0:no decode]\n" 00922 " M[BQ] MIME encode [B:base64 Q:quoted]\n" 00923 " f/F Folding: -f60 or -f or -f60-10 (fold margin 10) F preserve nl\n" 00924 ); 00925 fprintf(HELP_OUTPUT, 00926 " Z[0-4] Default/0: Convert JISX0208 Alphabet to ASCII\n" 00927 " 1: Kankaku to one space 2: to two spaces 3: HTML Entity\n" 00928 " 4: JISX0208 Katakana to JISX0201 Katakana\n" 00929 " X,x Convert Halfwidth Katakana to Fullwidth or preserve it\n" 00930 ); 00931 fprintf(HELP_OUTPUT, 00932 " O Output to File (DEFAULT 'nkf.out')\n" 00933 " L[uwm] Line mode u:LF w:CRLF m:CR (DEFAULT noconversion)\n" 00934 ); 00935 fprintf(HELP_OUTPUT, 00936 " --ic=<encoding> Specify the input encoding\n" 00937 " --oc=<encoding> Specify the output encoding\n" 00938 " --hiragana --katakana Hiragana/Katakana Conversion\n" 00939 " --katakana-hiragana Converts each other\n" 00940 ); 00941 fprintf(HELP_OUTPUT, 00942 #ifdef INPUT_OPTION 00943 " --{cap, url}-input Convert hex after ':' or '%%'\n" 00944 #endif 00945 #ifdef NUMCHAR_OPTION 00946 " --numchar-input Convert Unicode Character Reference\n" 00947 #endif 00948 #ifdef UTF8_INPUT_ENABLE 00949 " --fb-{skip, html, xml, perl, java, subchar}\n" 00950 " Specify unassigned character's replacement\n" 00951 #endif 00952 ); 00953 fprintf(HELP_OUTPUT, 00954 #ifdef OVERWRITE 00955 " --in-place[=SUF] Overwrite original files\n" 00956 " --overwrite[=SUF] Preserve timestamp of original files\n" 00957 #endif 00958 " -g 
--guess Guess the input code\n" 00959 " -v --version Print the version\n" 00960 " --help/-V Print this help / configuration\n" 00961 ); 00962 version(); 00963 } 00964 00965 static void 00966 show_configuration(void) 00967 { 00968 fprintf(HELP_OUTPUT, 00969 "Summary of my nkf " NKF_VERSION " (" NKF_RELEASE_DATE ") configuration:\n" 00970 " Compile-time options:\n" 00971 " Compiled at: " __DATE__ " " __TIME__ "\n" 00972 ); 00973 fprintf(HELP_OUTPUT, 00974 " Default output encoding: " 00975 #ifdef DEFAULT_CODE_LOCALE 00976 "LOCALE (%s)\n", nkf_enc_name(nkf_default_encoding()) 00977 #elif defined(DEFAULT_ENCIDX) 00978 "CONFIG (%s)\n", nkf_enc_name(nkf_default_encoding()) 00979 #else 00980 "NONE\n" 00981 #endif 00982 ); 00983 fprintf(HELP_OUTPUT, 00984 " Default output end of line: " 00985 #if DEFAULT_NEWLINE == CR 00986 "CR" 00987 #elif DEFAULT_NEWLINE == CRLF 00988 "CRLF" 00989 #else 00990 "LF" 00991 #endif 00992 "\n" 00993 " Decode MIME encoded string: " 00994 #if MIME_DECODE_DEFAULT 00995 "ON" 00996 #else 00997 "OFF" 00998 #endif 00999 "\n" 01000 " Convert JIS X 0201 Katakana: " 01001 #if X0201_DEFAULT 01002 "ON" 01003 #else 01004 "OFF" 01005 #endif 01006 "\n" 01007 " --help, --version output: " 01008 #if HELP_OUTPUT_HELP_OUTPUT 01009 "HELP_OUTPUT" 01010 #else 01011 "STDOUT" 01012 #endif 01013 "\n"); 01014 } 01015 #endif /*PERL_XS*/ 01016 01017 #ifdef OVERWRITE 01018 static char* 01019 get_backup_filename(const char *suffix, const char *filename) 01020 { 01021 char *backup_filename; 01022 int asterisk_count = 0; 01023 int i, j; 01024 int filename_length = strlen(filename); 01025 01026 for(i = 0; suffix[i]; i++){ 01027 if(suffix[i] == '*') asterisk_count++; 01028 } 01029 01030 if(asterisk_count){ 01031 backup_filename = nkf_xmalloc(strlen(suffix) + (asterisk_count * (filename_length - 1)) + 1); 01032 for(i = 0, j = 0; suffix[i];){ 01033 if(suffix[i] == '*'){ 01034 backup_filename[j] = '\0'; 01035 strncat(backup_filename, filename, filename_length); 01036 i++; 01037 j 
+= filename_length; 01038 }else{ 01039 backup_filename[j++] = suffix[i++]; 01040 } 01041 } 01042 backup_filename[j] = '\0'; 01043 }else{ 01044 j = filename_length + strlen(suffix); 01045 backup_filename = nkf_xmalloc(j + 1); 01046 strcpy(backup_filename, filename); 01047 strcat(backup_filename, suffix); 01048 backup_filename[j] = '\0'; 01049 } 01050 return backup_filename; 01051 } 01052 #endif 01053 01054 #ifdef UTF8_INPUT_ENABLE 01055 static void 01056 nkf_each_char_to_hex(void (*f)(nkf_char c2,nkf_char c1), nkf_char c) 01057 { 01058 int shift = 20; 01059 c &= VALUE_MASK; 01060 while(shift >= 0){ 01061 if(c >= NKF_INT32_C(1)<<shift){ 01062 while(shift >= 0){ 01063 (*f)(0, bin2hex(c>>shift)); 01064 shift -= 4; 01065 } 01066 }else{ 01067 shift -= 4; 01068 } 01069 } 01070 return; 01071 } 01072 01073 static void 01074 encode_fallback_html(nkf_char c) 01075 { 01076 (*oconv)(0, '&'); 01077 (*oconv)(0, '#'); 01078 c &= VALUE_MASK; 01079 if(c >= NKF_INT32_C(1000000)) 01080 (*oconv)(0, 0x30+(c/NKF_INT32_C(1000000))%10); 01081 if(c >= NKF_INT32_C(100000)) 01082 (*oconv)(0, 0x30+(c/NKF_INT32_C(100000) )%10); 01083 if(c >= 10000) 01084 (*oconv)(0, 0x30+(c/10000 )%10); 01085 if(c >= 1000) 01086 (*oconv)(0, 0x30+(c/1000 )%10); 01087 if(c >= 100) 01088 (*oconv)(0, 0x30+(c/100 )%10); 01089 if(c >= 10) 01090 (*oconv)(0, 0x30+(c/10 )%10); 01091 if(c >= 0) 01092 (*oconv)(0, 0x30+ c %10); 01093 (*oconv)(0, ';'); 01094 return; 01095 } 01096 01097 static void 01098 encode_fallback_xml(nkf_char c) 01099 { 01100 (*oconv)(0, '&'); 01101 (*oconv)(0, '#'); 01102 (*oconv)(0, 'x'); 01103 nkf_each_char_to_hex(oconv, c); 01104 (*oconv)(0, ';'); 01105 return; 01106 } 01107 01108 static void 01109 encode_fallback_java(nkf_char c) 01110 { 01111 (*oconv)(0, '\\'); 01112 c &= VALUE_MASK; 01113 if(!nkf_char_unicode_bmp_p(c)){ 01114 (*oconv)(0, 'U'); 01115 (*oconv)(0, '0'); 01116 (*oconv)(0, '0'); 01117 (*oconv)(0, bin2hex(c>>20)); 01118 (*oconv)(0, bin2hex(c>>16)); 01119 }else{ 01120 (*oconv)(0, 
'u'); 01121 } 01122 (*oconv)(0, bin2hex(c>>12)); 01123 (*oconv)(0, bin2hex(c>> 8)); 01124 (*oconv)(0, bin2hex(c>> 4)); 01125 (*oconv)(0, bin2hex(c )); 01126 return; 01127 } 01128 01129 static void 01130 encode_fallback_perl(nkf_char c) 01131 { 01132 (*oconv)(0, '\\'); 01133 (*oconv)(0, 'x'); 01134 (*oconv)(0, '{'); 01135 nkf_each_char_to_hex(oconv, c); 01136 (*oconv)(0, '}'); 01137 return; 01138 } 01139 01140 static void 01141 encode_fallback_subchar(nkf_char c) 01142 { 01143 c = unicode_subchar; 01144 (*oconv)((c>>8)&0xFF, c&0xFF); 01145 return; 01146 } 01147 #endif 01148 01149 static const struct { 01150 const char *name; 01151 const char *alias; 01152 } long_option[] = { 01153 {"ic=", ""}, 01154 {"oc=", ""}, 01155 {"base64","jMB"}, 01156 {"euc","e"}, 01157 {"euc-input","E"}, 01158 {"fj","jm"}, 01159 {"help",""}, 01160 {"jis","j"}, 01161 {"jis-input","J"}, 01162 {"mac","sLm"}, 01163 {"mime","jM"}, 01164 {"mime-input","m"}, 01165 {"msdos","sLw"}, 01166 {"sjis","s"}, 01167 {"sjis-input","S"}, 01168 {"unix","eLu"}, 01169 {"version","v"}, 01170 {"windows","sLw"}, 01171 {"hiragana","h1"}, 01172 {"katakana","h2"}, 01173 {"katakana-hiragana","h3"}, 01174 {"guess=", ""}, 01175 {"guess", "g2"}, 01176 {"cp932", ""}, 01177 {"no-cp932", ""}, 01178 #ifdef X0212_ENABLE 01179 {"x0212", ""}, 01180 #endif 01181 #ifdef UTF8_OUTPUT_ENABLE 01182 {"utf8", "w"}, 01183 {"utf16", "w16"}, 01184 {"ms-ucs-map", ""}, 01185 {"fb-skip", ""}, 01186 {"fb-html", ""}, 01187 {"fb-xml", ""}, 01188 {"fb-perl", ""}, 01189 {"fb-java", ""}, 01190 {"fb-subchar", ""}, 01191 {"fb-subchar=", ""}, 01192 #endif 01193 #ifdef UTF8_INPUT_ENABLE 01194 {"utf8-input", "W"}, 01195 {"utf16-input", "W16"}, 01196 {"no-cp932ext", ""}, 01197 {"no-best-fit-chars",""}, 01198 #endif 01199 #ifdef UNICODE_NORMALIZATION 01200 {"utf8mac-input", ""}, 01201 #endif 01202 #ifdef OVERWRITE 01203 {"overwrite", ""}, 01204 {"overwrite=", ""}, 01205 {"in-place", ""}, 01206 {"in-place=", ""}, 01207 #endif 01208 #ifdef INPUT_OPTION 01209 
{"cap-input", ""}, 01210 {"url-input", ""}, 01211 #endif 01212 #ifdef NUMCHAR_OPTION 01213 {"numchar-input", ""}, 01214 #endif 01215 #ifdef CHECK_OPTION 01216 {"no-output", ""}, 01217 {"debug", ""}, 01218 #endif 01219 #ifdef SHIFTJIS_CP932 01220 {"cp932inv", ""}, 01221 #endif 01222 #ifdef EXEC_IO 01223 {"exec-in", ""}, 01224 {"exec-out", ""}, 01225 #endif 01226 {"prefix=", ""}, 01227 }; 01228 01229 static void 01230 set_input_encoding(nkf_encoding *enc) 01231 { 01232 switch (nkf_enc_to_index(enc)) { 01233 case ISO_8859_1: 01234 iso8859_f = TRUE; 01235 break; 01236 case CP50221: 01237 case CP50222: 01238 if (x0201_f == NKF_UNSPECIFIED) x0201_f = FALSE; /* -x specified implicitly */ 01239 case CP50220: 01240 #ifdef SHIFTJIS_CP932 01241 cp51932_f = TRUE; 01242 #endif 01243 #ifdef UTF8_OUTPUT_ENABLE 01244 ms_ucs_map_f = UCS_MAP_CP932; 01245 #endif 01246 break; 01247 case ISO_2022_JP_1: 01248 x0212_f = TRUE; 01249 break; 01250 case ISO_2022_JP_3: 01251 x0212_f = TRUE; 01252 x0213_f = TRUE; 01253 break; 01254 case ISO_2022_JP_2004: 01255 x0212_f = TRUE; 01256 x0213_f = TRUE; 01257 break; 01258 case SHIFT_JIS: 01259 break; 01260 case WINDOWS_31J: 01261 if (x0201_f == NKF_UNSPECIFIED) x0201_f = FALSE; /* -x specified implicitly */ 01262 #ifdef SHIFTJIS_CP932 01263 cp51932_f = TRUE; 01264 #endif 01265 #ifdef UTF8_OUTPUT_ENABLE 01266 ms_ucs_map_f = UCS_MAP_CP932; 01267 #endif 01268 break; 01269 break; 01270 case CP10001: 01271 #ifdef SHIFTJIS_CP932 01272 cp51932_f = TRUE; 01273 #endif 01274 #ifdef UTF8_OUTPUT_ENABLE 01275 ms_ucs_map_f = UCS_MAP_CP10001; 01276 #endif 01277 break; 01278 case EUC_JP: 01279 break; 01280 case EUCJP_NKF: 01281 break; 01282 case CP51932: 01283 if (x0201_f == NKF_UNSPECIFIED) x0201_f = FALSE; /* -x specified implicitly */ 01284 #ifdef SHIFTJIS_CP932 01285 cp51932_f = TRUE; 01286 #endif 01287 #ifdef UTF8_OUTPUT_ENABLE 01288 ms_ucs_map_f = UCS_MAP_CP932; 01289 #endif 01290 break; 01291 case EUCJP_MS: 01292 if (x0201_f == NKF_UNSPECIFIED) x0201_f = 
FALSE; /* -x specified implicitly */ 01293 #ifdef SHIFTJIS_CP932 01294 cp51932_f = FALSE; 01295 #endif 01296 #ifdef UTF8_OUTPUT_ENABLE 01297 ms_ucs_map_f = UCS_MAP_MS; 01298 #endif 01299 break; 01300 case EUCJP_ASCII: 01301 if (x0201_f == NKF_UNSPECIFIED) x0201_f = FALSE; /* -x specified implicitly */ 01302 #ifdef SHIFTJIS_CP932 01303 cp51932_f = FALSE; 01304 #endif 01305 #ifdef UTF8_OUTPUT_ENABLE 01306 ms_ucs_map_f = UCS_MAP_ASCII; 01307 #endif 01308 break; 01309 case SHIFT_JISX0213: 01310 case SHIFT_JIS_2004: 01311 x0213_f = TRUE; 01312 #ifdef SHIFTJIS_CP932 01313 cp51932_f = FALSE; 01314 if (cp932inv_f == TRUE) cp932inv_f = FALSE; 01315 #endif 01316 break; 01317 case EUC_JISX0213: 01318 case EUC_JIS_2004: 01319 x0213_f = TRUE; 01320 #ifdef SHIFTJIS_CP932 01321 cp51932_f = FALSE; 01322 #endif 01323 break; 01324 #ifdef UTF8_INPUT_ENABLE 01325 #ifdef UNICODE_NORMALIZATION 01326 case UTF8_MAC: 01327 nfc_f = TRUE; 01328 break; 01329 #endif 01330 case UTF_16: 01331 case UTF_16BE: 01332 case UTF_16BE_BOM: 01333 input_endian = ENDIAN_BIG; 01334 break; 01335 case UTF_16LE: 01336 case UTF_16LE_BOM: 01337 input_endian = ENDIAN_LITTLE; 01338 break; 01339 case UTF_32: 01340 case UTF_32BE: 01341 case UTF_32BE_BOM: 01342 input_endian = ENDIAN_BIG; 01343 break; 01344 case UTF_32LE: 01345 case UTF_32LE_BOM: 01346 input_endian = ENDIAN_LITTLE; 01347 break; 01348 #endif 01349 } 01350 } 01351 01352 static void 01353 set_output_encoding(nkf_encoding *enc) 01354 { 01355 switch (nkf_enc_to_index(enc)) { 01356 case CP50220: 01357 #ifdef SHIFTJIS_CP932 01358 if (cp932inv_f == TRUE) cp932inv_f = FALSE; 01359 #endif 01360 #ifdef UTF8_OUTPUT_ENABLE 01361 ms_ucs_map_f = UCS_MAP_CP932; 01362 #endif 01363 break; 01364 case CP50221: 01365 if (x0201_f == NKF_UNSPECIFIED) x0201_f = FALSE; /* -x specified implicitly */ 01366 #ifdef SHIFTJIS_CP932 01367 if (cp932inv_f == TRUE) cp932inv_f = FALSE; 01368 #endif 01369 #ifdef UTF8_OUTPUT_ENABLE 01370 ms_ucs_map_f = UCS_MAP_CP932; 01371 #endif 01372 
break; 01373 case ISO_2022_JP: 01374 #ifdef SHIFTJIS_CP932 01375 if (cp932inv_f == TRUE) cp932inv_f = FALSE; 01376 #endif 01377 break; 01378 case ISO_2022_JP_1: 01379 x0212_f = TRUE; 01380 #ifdef SHIFTJIS_CP932 01381 if (cp932inv_f == TRUE) cp932inv_f = FALSE; 01382 #endif 01383 break; 01384 case ISO_2022_JP_3: 01385 case ISO_2022_JP_2004: 01386 x0212_f = TRUE; 01387 x0213_f = TRUE; 01388 #ifdef SHIFTJIS_CP932 01389 if (cp932inv_f == TRUE) cp932inv_f = FALSE; 01390 #endif 01391 break; 01392 case SHIFT_JIS: 01393 break; 01394 case WINDOWS_31J: 01395 if (x0201_f == NKF_UNSPECIFIED) x0201_f = FALSE; /* -x specified implicitly */ 01396 #ifdef UTF8_OUTPUT_ENABLE 01397 ms_ucs_map_f = UCS_MAP_CP932; 01398 #endif 01399 break; 01400 case CP10001: 01401 #ifdef UTF8_OUTPUT_ENABLE 01402 ms_ucs_map_f = UCS_MAP_CP10001; 01403 #endif 01404 break; 01405 case EUC_JP: 01406 x0212_f = TRUE; 01407 #ifdef SHIFTJIS_CP932 01408 if (cp932inv_f == TRUE) cp932inv_f = FALSE; 01409 #endif 01410 #ifdef UTF8_OUTPUT_ENABLE 01411 ms_ucs_map_f = UCS_MAP_ASCII; 01412 #endif 01413 break; 01414 case EUCJP_NKF: 01415 x0212_f = FALSE; 01416 #ifdef SHIFTJIS_CP932 01417 if (cp932inv_f == TRUE) cp932inv_f = FALSE; 01418 #endif 01419 #ifdef UTF8_OUTPUT_ENABLE 01420 ms_ucs_map_f = UCS_MAP_ASCII; 01421 #endif 01422 break; 01423 case CP51932: 01424 if (x0201_f == NKF_UNSPECIFIED) x0201_f = FALSE; /* -x specified implicitly */ 01425 #ifdef SHIFTJIS_CP932 01426 if (cp932inv_f == TRUE) cp932inv_f = FALSE; 01427 #endif 01428 #ifdef UTF8_OUTPUT_ENABLE 01429 ms_ucs_map_f = UCS_MAP_CP932; 01430 #endif 01431 break; 01432 case EUCJP_MS: 01433 if (x0201_f == NKF_UNSPECIFIED) x0201_f = FALSE; /* -x specified implicitly */ 01434 x0212_f = TRUE; 01435 #ifdef UTF8_OUTPUT_ENABLE 01436 ms_ucs_map_f = UCS_MAP_MS; 01437 #endif 01438 break; 01439 case EUCJP_ASCII: 01440 if (x0201_f == NKF_UNSPECIFIED) x0201_f = FALSE; /* -x specified implicitly */ 01441 x0212_f = TRUE; 01442 #ifdef UTF8_OUTPUT_ENABLE 01443 ms_ucs_map_f = 
UCS_MAP_ASCII; 01444 #endif 01445 break; 01446 case SHIFT_JISX0213: 01447 case SHIFT_JIS_2004: 01448 x0213_f = TRUE; 01449 #ifdef SHIFTJIS_CP932 01450 if (cp932inv_f == TRUE) cp932inv_f = FALSE; 01451 #endif 01452 break; 01453 case EUC_JISX0213: 01454 case EUC_JIS_2004: 01455 x0212_f = TRUE; 01456 x0213_f = TRUE; 01457 #ifdef SHIFTJIS_CP932 01458 if (cp932inv_f == TRUE) cp932inv_f = FALSE; 01459 #endif 01460 break; 01461 #ifdef UTF8_OUTPUT_ENABLE 01462 case UTF_8_BOM: 01463 output_bom_f = TRUE; 01464 break; 01465 case UTF_16: 01466 case UTF_16BE_BOM: 01467 output_bom_f = TRUE; 01468 break; 01469 case UTF_16LE: 01470 output_endian = ENDIAN_LITTLE; 01471 output_bom_f = FALSE; 01472 break; 01473 case UTF_16LE_BOM: 01474 output_endian = ENDIAN_LITTLE; 01475 output_bom_f = TRUE; 01476 break; 01477 case UTF_32: 01478 case UTF_32BE_BOM: 01479 output_bom_f = TRUE; 01480 break; 01481 case UTF_32LE: 01482 output_endian = ENDIAN_LITTLE; 01483 output_bom_f = FALSE; 01484 break; 01485 case UTF_32LE_BOM: 01486 output_endian = ENDIAN_LITTLE; 01487 output_bom_f = TRUE; 01488 break; 01489 #endif 01490 } 01491 } 01492 01493 static struct input_code* 01494 find_inputcode_byfunc(nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0)) 01495 { 01496 if (iconv_func){ 01497 struct input_code *p = input_code_list; 01498 while (p->name){ 01499 if (iconv_func == p->iconv_func){ 01500 return p; 01501 } 01502 p++; 01503 } 01504 } 01505 return 0; 01506 } 01507 01508 static void 01509 set_iconv(nkf_char f, nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0)) 01510 { 01511 #ifdef INPUT_CODE_FIX 01512 if (f || !input_encoding) 01513 #endif 01514 if (estab_f != f){ 01515 estab_f = f; 01516 } 01517 01518 if (iconv_func 01519 #ifdef INPUT_CODE_FIX 01520 && (f == -TRUE || !input_encoding) /* -TRUE means "FORCE" */ 01521 #endif 01522 ){ 01523 iconv = iconv_func; 01524 } 01525 #ifdef CHECK_OPTION 01526 if (estab_f && iconv_for_check != iconv){ 01527 struct input_code *p = 
find_inputcode_byfunc(iconv); 01528 if (p){ 01529 set_input_codename(p->name); 01530 debug(p->name); 01531 } 01532 iconv_for_check = iconv; 01533 } 01534 #endif 01535 } 01536 01537 #ifdef X0212_ENABLE 01538 static nkf_char 01539 x0212_shift(nkf_char c) 01540 { 01541 nkf_char ret = c; 01542 c &= 0x7f; 01543 if (is_eucg3(ret)){ 01544 if (0x75 <= c && c <= 0x7f){ 01545 ret = c + (0x109 - 0x75); 01546 } 01547 }else{ 01548 if (0x75 <= c && c <= 0x7f){ 01549 ret = c + (0x113 - 0x75); 01550 } 01551 } 01552 return ret; 01553 } 01554 01555 01556 static nkf_char 01557 x0212_unshift(nkf_char c) 01558 { 01559 nkf_char ret = c; 01560 if (0x7f <= c && c <= 0x88){ 01561 ret = c + (0x75 - 0x7f); 01562 }else if (0x89 <= c && c <= 0x92){ 01563 ret = PREFIX_EUCG3 | 0x80 | (c + (0x75 - 0x89)); 01564 } 01565 return ret; 01566 } 01567 #endif /* X0212_ENABLE */ 01568 01569 static int 01570 is_x0213_2_in_x0212(nkf_char c1) 01571 { 01572 static const char x0213_2_table[] = 01573 {0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1}; 01574 int ku = c1 - 0x20; 01575 if (ku <= 15) 01576 return x0213_2_table[ku]; /* 1, 3-5, 8, 12-15 */ 01577 if (78 <= ku && ku <= 94) 01578 return 1; 01579 return 0; 01580 } 01581 01582 static nkf_char 01583 e2s_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1) 01584 { 01585 nkf_char ndx; 01586 if (is_eucg3(c2)){ 01587 ndx = c2 & 0x7f; 01588 if (x0213_f && is_x0213_2_in_x0212(ndx)){ 01589 if((0x21 <= ndx && ndx <= 0x2F)){ 01590 if (p2) *p2 = ((ndx - 1) >> 1) + 0xec - ndx / 8 * 3; 01591 if (p1) *p1 = c1 + ((ndx & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e); 01592 return 0; 01593 }else if(0x6E <= ndx && ndx <= 0x7E){ 01594 if (p2) *p2 = ((ndx - 1) >> 1) + 0xbe; 01595 if (p1) *p1 = c1 + ((ndx & 1) ? ((c1 < 0x60) ? 
0x1f : 0x20) : 0x7e); 01596 return 0; 01597 } 01598 return 1; 01599 } 01600 #ifdef X0212_ENABLE 01601 else if(nkf_isgraph(ndx)){ 01602 nkf_char val = 0; 01603 const unsigned short *ptr; 01604 ptr = x0212_shiftjis[ndx - 0x21]; 01605 if (ptr){ 01606 val = ptr[(c1 & 0x7f) - 0x21]; 01607 } 01608 if (val){ 01609 c2 = val >> 8; 01610 c1 = val & 0xff; 01611 if (p2) *p2 = c2; 01612 if (p1) *p1 = c1; 01613 return 0; 01614 } 01615 c2 = x0212_shift(c2); 01616 } 01617 #endif /* X0212_ENABLE */ 01618 } 01619 if(0x7F < c2) return 1; 01620 if (p2) *p2 = ((c2 - 1) >> 1) + ((c2 <= 0x5e) ? 0x71 : 0xb1); 01621 if (p1) *p1 = c1 + ((c2 & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e); 01622 return 0; 01623 } 01624 01625 static nkf_char 01626 s2e_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1) 01627 { 01628 #if defined(SHIFTJIS_CP932) || defined(X0212_ENABLE) 01629 nkf_char val; 01630 #endif 01631 static const char shift_jisx0213_s1a3_table[5][2] ={ { 1, 8}, { 3, 4}, { 5,12}, {13,14}, {15, 0} }; 01632 if (0xFC < c1) return 1; 01633 #ifdef SHIFTJIS_CP932 01634 if (!cp932inv_f && !x0213_f && is_ibmext_in_sjis(c2)){ 01635 val = shiftjis_cp932[c2 - CP932_TABLE_BEGIN][c1 - 0x40]; 01636 if (val){ 01637 c2 = val >> 8; 01638 c1 = val & 0xff; 01639 } 01640 } 01641 if (cp932inv_f 01642 && CP932INV_TABLE_BEGIN <= c2 && c2 <= CP932INV_TABLE_END){ 01643 val = cp932inv[c2 - CP932INV_TABLE_BEGIN][c1 - 0x40]; 01644 if (val){ 01645 c2 = val >> 8; 01646 c1 = val & 0xff; 01647 } 01648 } 01649 #endif /* SHIFTJIS_CP932 */ 01650 #ifdef X0212_ENABLE 01651 if (!x0213_f && is_ibmext_in_sjis(c2)){ 01652 val = shiftjis_x0212[c2 - 0xfa][c1 - 0x40]; 01653 if (val){ 01654 if (val > 0x7FFF){ 01655 c2 = PREFIX_EUCG3 | ((val >> 8) & 0x7f); 01656 c1 = val & 0xff; 01657 }else{ 01658 c2 = val >> 8; 01659 c1 = val & 0xff; 01660 } 01661 if (p2) *p2 = c2; 01662 if (p1) *p1 = c1; 01663 return 0; 01664 } 01665 } 01666 #endif 01667 if(c2 >= 0x80){ 01668 if(x0213_f && c2 >= 0xF0){ 01669 if(c2 <= 0xF3 || (c2 == 0xF4 && c1 < 
0x9F)){ /* k=1, 3<=k<=5, k=8, 12<=k<=15 */ 01670 c2 = PREFIX_EUCG3 | 0x20 | shift_jisx0213_s1a3_table[c2 - 0xF0][0x9E < c1]; 01671 }else{ /* 78<=k<=94 */ 01672 c2 = PREFIX_EUCG3 | (c2 * 2 - 0x17B); 01673 if (0x9E < c1) c2++; 01674 } 01675 }else{ 01676 #define SJ0162 0x00e1 /* 01 - 62 ku offset */ 01677 #define SJ6394 0x0161 /* 63 - 94 ku offset */ 01678 c2 = c2 + c2 - ((c2 <= 0x9F) ? SJ0162 : SJ6394); 01679 if (0x9E < c1) c2++; 01680 } 01681 if (c1 < 0x9F) 01682 c1 = c1 - ((c1 > DEL) ? SP : 0x1F); 01683 else { 01684 c1 = c1 - 0x7E; 01685 } 01686 } 01687 01688 #ifdef X0212_ENABLE 01689 c2 = x0212_unshift(c2); 01690 #endif 01691 if (p2) *p2 = c2; 01692 if (p1) *p1 = c1; 01693 return 0; 01694 } 01695 01696 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE) 01697 static void 01698 nkf_unicode_to_utf8(nkf_char val, nkf_char *p1, nkf_char *p2, nkf_char *p3, nkf_char *p4) 01699 { 01700 val &= VALUE_MASK; 01701 if (val < 0x80){ 01702 *p1 = val; 01703 *p2 = 0; 01704 *p3 = 0; 01705 *p4 = 0; 01706 }else if (val < 0x800){ 01707 *p1 = 0xc0 | (val >> 6); 01708 *p2 = 0x80 | (val & 0x3f); 01709 *p3 = 0; 01710 *p4 = 0; 01711 } else if (nkf_char_unicode_bmp_p(val)) { 01712 *p1 = 0xe0 | (val >> 12); 01713 *p2 = 0x80 | ((val >> 6) & 0x3f); 01714 *p3 = 0x80 | ( val & 0x3f); 01715 *p4 = 0; 01716 } else if (nkf_char_unicode_value_p(val)) { 01717 *p1 = 0xf0 | (val >> 18); 01718 *p2 = 0x80 | ((val >> 12) & 0x3f); 01719 *p3 = 0x80 | ((val >> 6) & 0x3f); 01720 *p4 = 0x80 | ( val & 0x3f); 01721 } else { 01722 *p1 = 0; 01723 *p2 = 0; 01724 *p3 = 0; 01725 *p4 = 0; 01726 } 01727 } 01728 01729 static nkf_char 01730 nkf_utf8_to_unicode(nkf_char c1, nkf_char c2, nkf_char c3, nkf_char c4) 01731 { 01732 nkf_char wc; 01733 if (c1 <= 0x7F) { 01734 /* single byte */ 01735 wc = c1; 01736 } 01737 else if (c1 <= 0xC1) { 01738 /* trail byte or invalid */ 01739 return -1; 01740 } 01741 else if (c1 <= 0xDF) { 01742 /* 2 bytes */ 01743 wc = (c1 & 0x1F) << 6; 01744 wc |= (c2 & 0x3F); 01745 } 01746 
else if (c1 <= 0xEF) { 01747 /* 3 bytes */ 01748 wc = (c1 & 0x0F) << 12; 01749 wc |= (c2 & 0x3F) << 6; 01750 wc |= (c3 & 0x3F); 01751 } 01752 else if (c2 <= 0xF4) { 01753 /* 4 bytes */ 01754 wc = (c1 & 0x0F) << 18; 01755 wc |= (c2 & 0x3F) << 12; 01756 wc |= (c3 & 0x3F) << 6; 01757 wc |= (c4 & 0x3F); 01758 } 01759 else { 01760 return -1; 01761 } 01762 return wc; 01763 } 01764 #endif 01765 01766 #ifdef UTF8_INPUT_ENABLE 01767 static int 01768 unicode_to_jis_common2(nkf_char c1, nkf_char c0, 01769 const unsigned short *const *pp, nkf_char psize, 01770 nkf_char *p2, nkf_char *p1) 01771 { 01772 nkf_char c2; 01773 const unsigned short *p; 01774 unsigned short val; 01775 01776 if (pp == 0) return 1; 01777 01778 c1 -= 0x80; 01779 if (c1 < 0 || psize <= c1) return 1; 01780 p = pp[c1]; 01781 if (p == 0) return 1; 01782 01783 c0 -= 0x80; 01784 if (c0 < 0 || sizeof_utf8_to_euc_C2 <= c0) return 1; 01785 val = p[c0]; 01786 if (val == 0) return 1; 01787 if (no_cp932ext_f && ( 01788 (val>>8) == 0x2D || /* NEC special characters */ 01789 val > NKF_INT32_C(0xF300) /* IBM extended characters */ 01790 )) return 1; 01791 01792 c2 = val >> 8; 01793 if (val > 0x7FFF){ 01794 c2 &= 0x7f; 01795 c2 |= PREFIX_EUCG3; 01796 } 01797 if (c2 == SO) c2 = JIS_X_0201_1976_K; 01798 c1 = val & 0xFF; 01799 if (p2) *p2 = c2; 01800 if (p1) *p1 = c1; 01801 return 0; 01802 } 01803 01804 static int 01805 unicode_to_jis_common(nkf_char c2, nkf_char c1, nkf_char c0, nkf_char *p2, nkf_char *p1) 01806 { 01807 const unsigned short *const *pp; 01808 const unsigned short *const *const *ppp; 01809 static const char no_best_fit_chars_table_C2[] = 01810 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 01811 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 01812 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 2, 1, 1, 2, 01813 0, 0, 1, 1, 0, 1, 0, 1, 2, 1, 1, 1, 1, 1, 1, 1}; 01814 static const char no_best_fit_chars_table_C2_ms[] = 01815 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 01816 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 01817 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 01818 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0}; 01819 static const char no_best_fit_chars_table_932_C2[] = 01820 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 01821 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 01822 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 01823 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0}; 01824 static const char no_best_fit_chars_table_932_C3[] = 01825 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 01826 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 01827 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 01828 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1}; 01829 nkf_char ret = 0; 01830 01831 if(c2 < 0x80){ 01832 *p2 = 0; 01833 *p1 = c2; 01834 }else if(c2 < 0xe0){ 01835 if(no_best_fit_chars_f){ 01836 if(ms_ucs_map_f == UCS_MAP_CP932){ 01837 switch(c2){ 01838 case 0xC2: 01839 if(no_best_fit_chars_table_932_C2[c1&0x3F]) return 1; 01840 break; 01841 case 0xC3: 01842 if(no_best_fit_chars_table_932_C3[c1&0x3F]) return 1; 01843 break; 01844 } 01845 }else if(!cp932inv_f){ 01846 switch(c2){ 01847 case 0xC2: 01848 if(no_best_fit_chars_table_C2[c1&0x3F]) return 1; 01849 break; 01850 case 0xC3: 01851 if(no_best_fit_chars_table_932_C3[c1&0x3F]) return 1; 01852 break; 01853 } 01854 }else if(ms_ucs_map_f == UCS_MAP_MS){ 01855 if(c2 == 0xC2 && no_best_fit_chars_table_C2_ms[c1&0x3F]) return 1; 01856 }else if(ms_ucs_map_f == UCS_MAP_CP10001){ 01857 switch(c2){ 01858 case 0xC2: 01859 switch(c1){ 01860 case 0xA2: 01861 case 0xA3: 01862 case 0xA5: 01863 case 0xA6: 01864 case 0xAC: 01865 case 0xAF: 01866 case 0xB8: 01867 return 1; 01868 } 01869 break; 01870 } 01871 } 01872 } 01873 pp = 01874 ms_ucs_map_f == UCS_MAP_CP932 ? utf8_to_euc_2bytes_932 : 01875 ms_ucs_map_f == UCS_MAP_MS ? utf8_to_euc_2bytes_ms : 01876 ms_ucs_map_f == UCS_MAP_CP10001 ? utf8_to_euc_2bytes_mac : 01877 x0213_f ? 
utf8_to_euc_2bytes_x0213 : 01878 utf8_to_euc_2bytes; 01879 ret = unicode_to_jis_common2(c2, c1, pp, sizeof_utf8_to_euc_2bytes, p2, p1); 01880 }else if(c0 < 0xF0){ 01881 if(no_best_fit_chars_f){ 01882 if(ms_ucs_map_f == UCS_MAP_CP932){ 01883 if(c2 == 0xE3 && c1 == 0x82 && c0 == 0x94) return 1; 01884 }else if(ms_ucs_map_f == UCS_MAP_MS){ 01885 switch(c2){ 01886 case 0xE2: 01887 switch(c1){ 01888 case 0x80: 01889 if(c0 == 0x94 || c0 == 0x96 || c0 == 0xBE) return 1; 01890 break; 01891 case 0x88: 01892 if(c0 == 0x92) return 1; 01893 break; 01894 } 01895 break; 01896 case 0xE3: 01897 if(c1 == 0x80 || c0 == 0x9C) return 1; 01898 break; 01899 } 01900 }else if(ms_ucs_map_f == UCS_MAP_CP10001){ 01901 switch(c2){ 01902 case 0xE3: 01903 switch(c1){ 01904 case 0x82: 01905 if(c0 == 0x94) return 1; 01906 break; 01907 case 0x83: 01908 if(c0 == 0xBB) return 1; 01909 break; 01910 } 01911 break; 01912 } 01913 }else{ 01914 switch(c2){ 01915 case 0xE2: 01916 switch(c1){ 01917 case 0x80: 01918 if(c0 == 0x95) return 1; 01919 break; 01920 case 0x88: 01921 if(c0 == 0xA5) return 1; 01922 break; 01923 } 01924 break; 01925 case 0xEF: 01926 switch(c1){ 01927 case 0xBC: 01928 if(c0 == 0x8D) return 1; 01929 break; 01930 case 0xBD: 01931 if(c0 == 0x9E && !cp932inv_f) return 1; 01932 break; 01933 case 0xBF: 01934 if(0xA0 <= c0 && c0 <= 0xA5) return 1; 01935 break; 01936 } 01937 break; 01938 } 01939 } 01940 } 01941 ppp = 01942 ms_ucs_map_f == UCS_MAP_CP932 ? utf8_to_euc_3bytes_932 : 01943 ms_ucs_map_f == UCS_MAP_MS ? utf8_to_euc_3bytes_ms : 01944 ms_ucs_map_f == UCS_MAP_CP10001 ? utf8_to_euc_3bytes_mac : 01945 x0213_f ? 
utf8_to_euc_3bytes_x0213 : 01946 utf8_to_euc_3bytes; 01947 ret = unicode_to_jis_common2(c1, c0, ppp[c2 - 0xE0], sizeof_utf8_to_euc_C2, p2, p1); 01948 }else return -1; 01949 #ifdef SHIFTJIS_CP932 01950 if (!ret && !cp932inv_f && is_eucg3(*p2)) { 01951 nkf_char s2, s1; 01952 if (e2s_conv(*p2, *p1, &s2, &s1) == 0) { 01953 s2e_conv(s2, s1, p2, p1); 01954 }else{ 01955 ret = 1; 01956 } 01957 } 01958 #endif 01959 return ret; 01960 } 01961 01962 #ifdef UTF8_OUTPUT_ENABLE 01963 #define X0213_SURROGATE_FIND(tbl, size, euc) do { \ 01964 int i; \ 01965 for (i = 0; i < size; i++) \ 01966 if (tbl[i][0] == euc) { \ 01967 low = tbl[i][2]; \ 01968 break; \ 01969 } \ 01970 } while (0) 01971 01972 static nkf_char 01973 e2w_conv(nkf_char c2, nkf_char c1) 01974 { 01975 const unsigned short *p; 01976 01977 if (c2 == JIS_X_0201_1976_K) { 01978 if (ms_ucs_map_f == UCS_MAP_CP10001) { 01979 switch (c1) { 01980 case 0x20: 01981 return 0xA0; 01982 case 0x7D: 01983 return 0xA9; 01984 } 01985 } 01986 p = euc_to_utf8_1byte; 01987 #ifdef X0212_ENABLE 01988 } else if (is_eucg3(c2)){ 01989 if(ms_ucs_map_f == UCS_MAP_ASCII&& c2 == NKF_INT32_C(0x8F22) && c1 == 0x43){ 01990 return 0xA6; 01991 } 01992 c2 = (c2&0x7f) - 0x21; 01993 if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes) 01994 p = 01995 x0213_f ? x0212_to_utf8_2bytes_x0213[c2] : 01996 x0212_to_utf8_2bytes[c2]; 01997 else 01998 return 0; 01999 #endif 02000 } else { 02001 c2 &= 0x7f; 02002 c2 = (c2&0x7f) - 0x21; 02003 if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes) 02004 p = 02005 x0213_f ? euc_to_utf8_2bytes_x0213[c2] : 02006 ms_ucs_map_f == UCS_MAP_ASCII ? euc_to_utf8_2bytes[c2] : 02007 ms_ucs_map_f == UCS_MAP_CP10001 ? 
euc_to_utf8_2bytes_mac[c2] : 02008 euc_to_utf8_2bytes_ms[c2]; 02009 else 02010 return 0; 02011 } 02012 if (!p) return 0; 02013 c1 = (c1 & 0x7f) - 0x21; 02014 if (0<=c1 && c1<sizeof_euc_to_utf8_1byte) { 02015 nkf_char val = p[c1]; 02016 if (x0213_f && 0xD800<=val && val<=0xDBFF) { 02017 nkf_char euc = (c2+0x21)<<8 | (c1+0x21); 02018 nkf_char low = 0; 02019 if (p==x0212_to_utf8_2bytes_x0213[c2]) { 02020 X0213_SURROGATE_FIND(x0213_2_surrogate_table, sizeof_x0213_2_surrogate_table, euc); 02021 } else { 02022 X0213_SURROGATE_FIND(x0213_1_surrogate_table, sizeof_x0213_1_surrogate_table, euc); 02023 } 02024 if (!low) return 0; 02025 return UTF16_TO_UTF32(val, low); 02026 } else { 02027 return val; 02028 } 02029 } 02030 return 0; 02031 } 02032 02033 static nkf_char 02034 e2w_combining(nkf_char comb, nkf_char c2, nkf_char c1) 02035 { 02036 nkf_char euc; 02037 int i; 02038 for (i = 0; i < sizeof_x0213_combining_chars; i++) 02039 if (x0213_combining_chars[i] == comb) 02040 break; 02041 if (i >= sizeof_x0213_combining_chars) 02042 return 0; 02043 euc = (c2&0x7f)<<8 | (c1&0x7f); 02044 for (i = 0; i < sizeof_x0213_combining_table; i++) 02045 if (x0213_combining_table[i][0] == euc) 02046 return x0213_combining_table[i][1]; 02047 return 0; 02048 } 02049 #endif 02050 02051 static nkf_char 02052 w2e_conv(nkf_char c2, nkf_char c1, nkf_char c0, nkf_char *p2, nkf_char *p1) 02053 { 02054 nkf_char ret = 0; 02055 02056 if (!c1){ 02057 *p2 = 0; 02058 *p1 = c2; 02059 }else if (0xc0 <= c2 && c2 <= 0xef) { 02060 ret = unicode_to_jis_common(c2, c1, c0, p2, p1); 02061 #ifdef NUMCHAR_OPTION 02062 if (ret > 0){ 02063 if (p2) *p2 = 0; 02064 if (p1) *p1 = nkf_char_unicode_new(nkf_utf8_to_unicode(c2, c1, c0, 0)); 02065 ret = 0; 02066 } 02067 #endif 02068 } 02069 return ret; 02070 } 02071 02072 #ifdef UTF8_INPUT_ENABLE 02073 static nkf_char 02074 w16e_conv(nkf_char val, nkf_char *p2, nkf_char *p1) 02075 { 02076 nkf_char c1, c2, c3, c4; 02077 nkf_char ret = 0; 02078 val &= VALUE_MASK; 02079 if (val < 
0x80) { 02080 *p2 = 0; 02081 *p1 = val; 02082 } 02083 else if (nkf_char_unicode_bmp_p(val)){ 02084 nkf_unicode_to_utf8(val, &c1, &c2, &c3, &c4); 02085 ret = unicode_to_jis_common(c1, c2, c3, p2, p1); 02086 if (ret > 0){ 02087 *p2 = 0; 02088 *p1 = nkf_char_unicode_new(val); 02089 ret = 0; 02090 } 02091 } 02092 else { 02093 int i; 02094 if (x0213_f) { 02095 c1 = (val >> 10) + NKF_INT32_C(0xD7C0); /* high surrogate */ 02096 c2 = (val & 0x3FF) + NKF_INT32_C(0xDC00); /* low surrogate */ 02097 for (i = 0; i < sizeof_x0213_1_surrogate_table; i++) 02098 if (x0213_1_surrogate_table[i][1] == c1 && x0213_1_surrogate_table[i][2] == c2) { 02099 val = x0213_1_surrogate_table[i][0]; 02100 *p2 = val >> 8; 02101 *p1 = val & 0xFF; 02102 return 0; 02103 } 02104 for (i = 0; i < sizeof_x0213_2_surrogate_table; i++) 02105 if (x0213_2_surrogate_table[i][1] == c1 && x0213_2_surrogate_table[i][2] == c2) { 02106 val = x0213_2_surrogate_table[i][0]; 02107 *p2 = PREFIX_EUCG3 | (val >> 8); 02108 *p1 = val & 0xFF; 02109 return 0; 02110 } 02111 } 02112 *p2 = 0; 02113 *p1 = nkf_char_unicode_new(val); 02114 } 02115 return ret; 02116 } 02117 #endif 02118 02119 static nkf_char 02120 e_iconv(nkf_char c2, nkf_char c1, nkf_char c0) 02121 { 02122 if (c2 == JIS_X_0201_1976_K || c2 == SS2){ 02123 if (iso2022jp_f && !x0201_f) { 02124 c2 = GETA1; c1 = GETA2; 02125 } else { 02126 c2 = JIS_X_0201_1976_K; 02127 c1 &= 0x7f; 02128 } 02129 #ifdef X0212_ENABLE 02130 }else if (c2 == 0x8f){ 02131 if (c0 == 0){ 02132 return -1; 02133 } 02134 if (!cp51932_f && !x0213_f && 0xF5 <= c1 && c1 <= 0xFE && 0xA1 <= c0 && c0 <= 0xFE) { 02135 /* encoding is eucJP-ms, so invert to Unicode Private User Area */ 02136 c1 = nkf_char_unicode_new((c1 - 0xF5) * 94 + c0 - 0xA1 + 0xE3AC); 02137 c2 = 0; 02138 } else { 02139 c2 = (c2 << 8) | (c1 & 0x7f); 02140 c1 = c0 & 0x7f; 02141 #ifdef SHIFTJIS_CP932 02142 if (cp51932_f){ 02143 nkf_char s2, s1; 02144 if (e2s_conv(c2, c1, &s2, &s1) == 0){ 02145 s2e_conv(s2, s1, &c2, &c1); 02146 if (c2 < 
0x100){ 02147 c1 &= 0x7f; 02148 c2 &= 0x7f; 02149 } 02150 } 02151 } 02152 #endif /* SHIFTJIS_CP932 */ 02153 } 02154 #endif /* X0212_ENABLE */ 02155 } else if ((c2 == EOF) || (c2 == 0) || c2 < SP || c2 == ISO_8859_1) { 02156 /* NOP */ 02157 } else { 02158 if (!cp51932_f && ms_ucs_map_f && 0xF5 <= c2 && c2 <= 0xFE && 0xA1 <= c1 && c1 <= 0xFE) { 02159 /* encoding is eucJP-ms, so invert to Unicode Private User Area */ 02160 c1 = nkf_char_unicode_new((c2 - 0xF5) * 94 + c1 - 0xA1 + 0xE000); 02161 c2 = 0; 02162 } else { 02163 c1 &= 0x7f; 02164 c2 &= 0x7f; 02165 #ifdef SHIFTJIS_CP932 02166 if (cp51932_f && 0x79 <= c2 && c2 <= 0x7c){ 02167 nkf_char s2, s1; 02168 if (e2s_conv(c2, c1, &s2, &s1) == 0){ 02169 s2e_conv(s2, s1, &c2, &c1); 02170 if (c2 < 0x100){ 02171 c1 &= 0x7f; 02172 c2 &= 0x7f; 02173 } 02174 } 02175 } 02176 #endif /* SHIFTJIS_CP932 */ 02177 } 02178 } 02179 (*oconv)(c2, c1); 02180 return 0; 02181 } 02182 02183 static nkf_char 02184 s_iconv(ARG_UNUSED nkf_char c2, nkf_char c1, ARG_UNUSED nkf_char c0) 02185 { 02186 if (c2 == JIS_X_0201_1976_K || (0xA1 <= c2 && c2 <= 0xDF)) { 02187 if (iso2022jp_f && !x0201_f) { 02188 c2 = GETA1; c1 = GETA2; 02189 } else { 02190 c1 &= 0x7f; 02191 } 02192 } else if ((c2 == EOF) || (c2 == 0) || c2 < SP) { 02193 /* NOP */ 02194 } else if (!x0213_f && 0xF0 <= c2 && c2 <= 0xF9 && 0x40 <= c1 && c1 <= 0xFC) { 02195 /* CP932 UDC */ 02196 if(c1 == 0x7F) return 0; 02197 c1 = nkf_char_unicode_new((c2 - 0xF0) * 188 + (c1 - 0x40 - (0x7E < c1)) + 0xE000); 02198 c2 = 0; 02199 } else { 02200 nkf_char ret = s2e_conv(c2, c1, &c2, &c1); 02201 if (ret) return ret; 02202 } 02203 (*oconv)(c2, c1); 02204 return 0; 02205 } 02206 02207 static int 02208 x0213_wait_combining_p(nkf_char wc) 02209 { 02210 int i; 02211 for (i = 0; i < sizeof_x0213_combining_table; i++) { 02212 if (x0213_combining_table[i][1] == wc) { 02213 return TRUE; 02214 } 02215 } 02216 return FALSE; 02217 } 02218 02219 static int 02220 x0213_combining_p(nkf_char wc) 02221 { 02222 int i; 
02223 for (i = 0; i < sizeof_x0213_combining_chars; i++) { 02224 if (x0213_combining_chars[i] == wc) { 02225 return TRUE; 02226 } 02227 } 02228 return FALSE; 02229 } 02230 02231 static nkf_char 02232 w_iconv(nkf_char c1, nkf_char c2, nkf_char c3) 02233 { 02234 nkf_char ret = 0, c4 = 0; 02235 static const char w_iconv_utf8_1st_byte[] = 02236 { /* 0xC0 - 0xFF */ 02237 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 02238 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 02239 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 33, 02240 40, 41, 41, 41, 42, 43, 43, 43, 50, 50, 50, 50, 60, 60, 70, 70}; 02241 02242 if (c3 > 0xFF) { 02243 c4 = c3 & 0xFF; 02244 c3 >>= 8; 02245 } 02246 02247 if (c1 < 0 || 0xff < c1) { 02248 }else if (c1 == 0) { /* 0 : 1 byte*/ 02249 c3 = 0; 02250 } else if ((c1 & 0xC0) == 0x80) { /* 0x80-0xbf : trail byte */ 02251 return 0; 02252 } else{ 02253 switch (w_iconv_utf8_1st_byte[c1 - 0xC0]) { 02254 case 21: 02255 if (c2 < 0x80 || 0xBF < c2) return 0; 02256 break; 02257 case 30: 02258 if (c3 == 0) return -1; 02259 if (c2 < 0xA0 || 0xBF < c2 || (c3 & 0xC0) != 0x80) 02260 return 0; 02261 break; 02262 case 31: 02263 case 33: 02264 if (c3 == 0) return -1; 02265 if ((c2 & 0xC0) != 0x80 || (c3 & 0xC0) != 0x80) 02266 return 0; 02267 break; 02268 case 32: 02269 if (c3 == 0) return -1; 02270 if (c2 < 0x80 || 0x9F < c2 || (c3 & 0xC0) != 0x80) 02271 return 0; 02272 break; 02273 case 40: 02274 if (c3 == 0) return -2; 02275 if (c2 < 0x90 || 0xBF < c2 || (c3 & 0xC0) != 0x80 || (c4 & 0xC0) != 0x80) 02276 return 0; 02277 break; 02278 case 41: 02279 if (c3 == 0) return -2; 02280 if (c2 < 0x80 || 0xBF < c2 || (c3 & 0xC0) != 0x80 || (c4 & 0xC0) != 0x80) 02281 return 0; 02282 break; 02283 case 42: 02284 if (c3 == 0) return -2; 02285 if (c2 < 0x80 || 0x8F < c2 || (c3 & 0xC0) != 0x80 || (c4 & 0xC0) != 0x80) 02286 return 0; 02287 break; 02288 default: 02289 return 0; 02290 break; 02291 } 02292 } 02293 if (c1 == 0 || c1 == EOF){ 
02294 } else if ((c1 & 0xf8) == 0xf0) { /* 4 bytes */ 02295 c2 = nkf_char_unicode_new(nkf_utf8_to_unicode(c1, c2, c3, c4)); 02296 c1 = 0; 02297 } else { 02298 if (x0213_f && x0213_wait_combining_p(nkf_utf8_to_unicode(c1, c2, c3, c4))) 02299 return -3; 02300 ret = w2e_conv(c1, c2, c3, &c1, &c2); 02301 } 02302 if (ret == 0){ 02303 (*oconv)(c1, c2); 02304 } 02305 return ret; 02306 } 02307 02308 static nkf_char 02309 w_iconv_nocombine(nkf_char c1, nkf_char c2, nkf_char c3) 02310 { 02311 /* continue from the line below 'return -3;' in w_iconv() */ 02312 nkf_char ret = w2e_conv(c1, c2, c3, &c1, &c2); 02313 if (ret == 0){ 02314 (*oconv)(c1, c2); 02315 } 02316 return ret; 02317 } 02318 02319 #define NKF_ICONV_INVALID_CODE_RANGE -13 02320 #define NKF_ICONV_WAIT_COMBINING_CHAR -14 02321 #define NKF_ICONV_NOT_COMBINED -15 02322 static size_t 02323 unicode_iconv(nkf_char wc, int nocombine) 02324 { 02325 nkf_char c1, c2; 02326 int ret = 0; 02327 02328 if (wc < 0x80) { 02329 c2 = 0; 02330 c1 = wc; 02331 }else if ((wc>>11) == 27) { 02332 /* unpaired surrogate */ 02333 return NKF_ICONV_INVALID_CODE_RANGE; 02334 }else if (wc < 0xFFFF) { 02335 if (!nocombine && x0213_f && x0213_wait_combining_p(wc)) 02336 return NKF_ICONV_WAIT_COMBINING_CHAR; 02337 ret = w16e_conv(wc, &c2, &c1); 02338 if (ret) return ret; 02339 }else if (wc < 0x10FFFF) { 02340 c2 = 0; 02341 c1 = nkf_char_unicode_new(wc); 02342 } else { 02343 return NKF_ICONV_INVALID_CODE_RANGE; 02344 } 02345 (*oconv)(c2, c1); 02346 return 0; 02347 } 02348 02349 static nkf_char 02350 unicode_iconv_combine(nkf_char wc, nkf_char wc2) 02351 { 02352 nkf_char c1, c2; 02353 int i; 02354 02355 if (wc2 < 0x80) { 02356 return NKF_ICONV_NOT_COMBINED; 02357 }else if ((wc2>>11) == 27) { 02358 /* unpaired surrogate */ 02359 return NKF_ICONV_INVALID_CODE_RANGE; 02360 }else if (wc2 < 0xFFFF) { 02361 if (!x0213_combining_p(wc2)) 02362 return NKF_ICONV_NOT_COMBINED; 02363 for (i = 0; i < sizeof_x0213_combining_table; i++) { 02364 if 
(x0213_combining_table[i][1] == wc && 02365 x0213_combining_table[i][2] == wc2) { 02366 c2 = x0213_combining_table[i][0] >> 8; 02367 c1 = x0213_combining_table[i][0] & 0x7f; 02368 (*oconv)(c2, c1); 02369 return 0; 02370 } 02371 } 02372 }else if (wc2 < 0x10FFFF) { 02373 return NKF_ICONV_NOT_COMBINED; 02374 } else { 02375 return NKF_ICONV_INVALID_CODE_RANGE; 02376 } 02377 return NKF_ICONV_NOT_COMBINED; 02378 } 02379 02380 static nkf_char 02381 w_iconv_combine(nkf_char c1, nkf_char c2, nkf_char c3, nkf_char c4, nkf_char c5, nkf_char c6) 02382 { 02383 nkf_char wc, wc2; 02384 wc = nkf_utf8_to_unicode(c1, c2, c3, 0); 02385 wc2 = nkf_utf8_to_unicode(c4, c5, c6, 0); 02386 if (wc2 < 0) 02387 return wc2; 02388 return unicode_iconv_combine(wc, wc2); 02389 } 02390 02391 #define NKF_ICONV_NEED_ONE_MORE_BYTE (size_t)-1 02392 #define NKF_ICONV_NEED_TWO_MORE_BYTES (size_t)-2 02393 static size_t 02394 nkf_iconv_utf_16(nkf_char c1, nkf_char c2, nkf_char c3, nkf_char c4) 02395 { 02396 nkf_char wc; 02397 02398 if (c1 == EOF) { 02399 (*oconv)(EOF, 0); 02400 return 0; 02401 } 02402 02403 if (input_endian == ENDIAN_BIG) { 02404 if (0xD8 <= c1 && c1 <= 0xDB) { 02405 if (0xDC <= c3 && c3 <= 0xDF) { 02406 wc = UTF16_TO_UTF32(c1 << 8 | c2, c3 << 8 | c4); 02407 } else return NKF_ICONV_NEED_TWO_MORE_BYTES; 02408 } else { 02409 wc = c1 << 8 | c2; 02410 } 02411 } else { 02412 if (0xD8 <= c2 && c2 <= 0xDB) { 02413 if (0xDC <= c4 && c4 <= 0xDF) { 02414 wc = UTF16_TO_UTF32(c2 << 8 | c1, c4 << 8 | c3); 02415 } else return NKF_ICONV_NEED_TWO_MORE_BYTES; 02416 } else { 02417 wc = c2 << 8 | c1; 02418 } 02419 } 02420 02421 return (*unicode_iconv)(wc, FALSE); 02422 } 02423 02424 static size_t 02425 nkf_iconv_utf_16_combine(nkf_char c1, nkf_char c2, nkf_char c3, nkf_char c4) 02426 { 02427 nkf_char wc, wc2; 02428 02429 if (input_endian == ENDIAN_BIG) { 02430 if (0xD8 <= c3 && c3 <= 0xDB) { 02431 return NKF_ICONV_NOT_COMBINED; 02432 } else { 02433 wc = c1 << 8 | c2; 02434 wc2 = c3 << 8 | c4; 02435 } 02436 } 
else { 02437 if (0xD8 <= c2 && c2 <= 0xDB) { 02438 return NKF_ICONV_NOT_COMBINED; 02439 } else { 02440 wc = c2 << 8 | c1; 02441 wc2 = c4 << 8 | c3; 02442 } 02443 } 02444 02445 return unicode_iconv_combine(wc, wc2); 02446 } 02447 02448 static size_t 02449 nkf_iconv_utf_16_nocombine(nkf_char c1, nkf_char c2) 02450 { 02451 nkf_char wc; 02452 if (input_endian == ENDIAN_BIG) 02453 wc = c1 << 8 | c2; 02454 else 02455 wc = c2 << 8 | c1; 02456 return (*unicode_iconv)(wc, TRUE); 02457 } 02458 02459 static nkf_char 02460 w_iconv16(nkf_char c2, nkf_char c1, ARG_UNUSED nkf_char c0) 02461 { 02462 (*oconv)(c2, c1); 02463 return 16; /* different from w_iconv32 */ 02464 } 02465 02466 static nkf_char 02467 w_iconv32(nkf_char c2, nkf_char c1, ARG_UNUSED nkf_char c0) 02468 { 02469 (*oconv)(c2, c1); 02470 return 32; /* different from w_iconv16 */ 02471 } 02472 02473 static nkf_char 02474 utf32_to_nkf_char(nkf_char c1, nkf_char c2, nkf_char c3, nkf_char c4) 02475 { 02476 nkf_char wc; 02477 02478 switch(input_endian){ 02479 case ENDIAN_BIG: 02480 wc = c2 << 16 | c3 << 8 | c4; 02481 break; 02482 case ENDIAN_LITTLE: 02483 wc = c3 << 16 | c2 << 8 | c1; 02484 break; 02485 case ENDIAN_2143: 02486 wc = c1 << 16 | c4 << 8 | c3; 02487 break; 02488 case ENDIAN_3412: 02489 wc = c4 << 16 | c1 << 8 | c2; 02490 break; 02491 default: 02492 return NKF_ICONV_INVALID_CODE_RANGE; 02493 } 02494 return wc; 02495 } 02496 02497 static size_t 02498 nkf_iconv_utf_32(nkf_char c1, nkf_char c2, nkf_char c3, nkf_char c4) 02499 { 02500 nkf_char wc; 02501 02502 if (c1 == EOF) { 02503 (*oconv)(EOF, 0); 02504 return 0; 02505 } 02506 02507 wc = utf32_to_nkf_char(c1, c2, c3, c4); 02508 if (wc < 0) 02509 return wc; 02510 02511 return (*unicode_iconv)(wc, FALSE); 02512 } 02513 02514 static nkf_char 02515 nkf_iconv_utf_32_combine(nkf_char c1, nkf_char c2, nkf_char c3, nkf_char c4, nkf_char c5, nkf_char c6, nkf_char c7, nkf_char c8) 02516 { 02517 nkf_char wc, wc2; 02518 02519 wc = utf32_to_nkf_char(c1, c2, c3, c4); 02520 if 
(wc < 0) 02521 return wc; 02522 wc2 = utf32_to_nkf_char(c5, c6, c7, c8); 02523 if (wc2 < 0) 02524 return wc2; 02525 02526 return unicode_iconv_combine(wc, wc2); 02527 } 02528 02529 static size_t 02530 nkf_iconv_utf_32_nocombine(nkf_char c1, nkf_char c2, nkf_char c3, nkf_char c4) 02531 { 02532 nkf_char wc; 02533 02534 wc = utf32_to_nkf_char(c1, c2, c3, c4); 02535 return (*unicode_iconv)(wc, TRUE); 02536 } 02537 #endif 02538 02539 #define output_ascii_escape_sequence(mode) do { \ 02540 if (output_mode != ASCII && output_mode != ISO_8859_1) { \ 02541 (*o_putc)(ESC); \ 02542 (*o_putc)('('); \ 02543 (*o_putc)(ascii_intro); \ 02544 output_mode = mode; \ 02545 } \ 02546 } while (0) 02547 02548 static void 02549 output_escape_sequence(int mode) 02550 { 02551 if (output_mode == mode) 02552 return; 02553 switch(mode) { 02554 case ISO_8859_1: 02555 (*o_putc)(ESC); 02556 (*o_putc)('.'); 02557 (*o_putc)('A'); 02558 break; 02559 case JIS_X_0201_1976_K: 02560 (*o_putc)(ESC); 02561 (*o_putc)('('); 02562 (*o_putc)('I'); 02563 break; 02564 case JIS_X_0208: 02565 (*o_putc)(ESC); 02566 (*o_putc)('$'); 02567 (*o_putc)(kanji_intro); 02568 break; 02569 case JIS_X_0212: 02570 (*o_putc)(ESC); 02571 (*o_putc)('$'); 02572 (*o_putc)('('); 02573 (*o_putc)('D'); 02574 break; 02575 case JIS_X_0213_1: 02576 (*o_putc)(ESC); 02577 (*o_putc)('$'); 02578 (*o_putc)('('); 02579 (*o_putc)('Q'); 02580 break; 02581 case JIS_X_0213_2: 02582 (*o_putc)(ESC); 02583 (*o_putc)('$'); 02584 (*o_putc)('('); 02585 (*o_putc)('P'); 02586 break; 02587 } 02588 output_mode = mode; 02589 } 02590 02591 static void 02592 j_oconv(nkf_char c2, nkf_char c1) 02593 { 02594 #ifdef NUMCHAR_OPTION 02595 if (c2 == 0 && nkf_char_unicode_p(c1)){ 02596 w16e_conv(c1, &c2, &c1); 02597 if (c2 == 0 && nkf_char_unicode_p(c1)){ 02598 c2 = c1 & VALUE_MASK; 02599 if (ms_ucs_map_f && 0xE000 <= c2 && c2 <= 0xE757) { 02600 /* CP5022x UDC */ 02601 c1 &= 0xFFF; 02602 c2 = 0x7F + c1 / 94; 02603 c1 = 0x21 + c1 % 94; 02604 } else { 02605 if 
(encode_fallback) (*encode_fallback)(c1); 02606 return; 02607 } 02608 } 02609 } 02610 #endif 02611 if (c2 == 0) { 02612 output_ascii_escape_sequence(ASCII); 02613 (*o_putc)(c1); 02614 } 02615 else if (c2 == EOF) { 02616 output_ascii_escape_sequence(ASCII); 02617 (*o_putc)(EOF); 02618 } 02619 else if (c2 == ISO_8859_1) { 02620 output_ascii_escape_sequence(ISO_8859_1); 02621 (*o_putc)(c1|0x80); 02622 } 02623 else if (c2 == JIS_X_0201_1976_K) { 02624 output_escape_sequence(JIS_X_0201_1976_K); 02625 (*o_putc)(c1); 02626 #ifdef X0212_ENABLE 02627 } else if (is_eucg3(c2)){ 02628 output_escape_sequence(x0213_f ? JIS_X_0213_2 : JIS_X_0212); 02629 (*o_putc)(c2 & 0x7f); 02630 (*o_putc)(c1); 02631 #endif 02632 } else { 02633 if(ms_ucs_map_f 02634 ? c2<0x20 || 0x92<c2 || c1<0x20 || 0x7e<c1 02635 : c2<0x20 || 0x7e<c2 || c1<0x20 || 0x7e<c1) return; 02636 output_escape_sequence(x0213_f ? JIS_X_0213_1 : JIS_X_0208); 02637 (*o_putc)(c2); 02638 (*o_putc)(c1); 02639 } 02640 } 02641 02642 static void 02643 e_oconv(nkf_char c2, nkf_char c1) 02644 { 02645 if (c2 == 0 && nkf_char_unicode_p(c1)){ 02646 w16e_conv(c1, &c2, &c1); 02647 if (c2 == 0 && nkf_char_unicode_p(c1)){ 02648 c2 = c1 & VALUE_MASK; 02649 if (x0212_f && 0xE000 <= c2 && c2 <= 0xE757) { 02650 /* eucJP-ms UDC */ 02651 c1 &= 0xFFF; 02652 c2 = c1 / 94; 02653 c2 += c2 < 10 ? 
0x75 : 0x8FEB; 02654 c1 = 0x21 + c1 % 94; 02655 if (is_eucg3(c2)){ 02656 (*o_putc)(0x8f); 02657 (*o_putc)((c2 & 0x7f) | 0x080); 02658 (*o_putc)(c1 | 0x080); 02659 }else{ 02660 (*o_putc)((c2 & 0x7f) | 0x080); 02661 (*o_putc)(c1 | 0x080); 02662 } 02663 return; 02664 } else { 02665 if (encode_fallback) (*encode_fallback)(c1); 02666 return; 02667 } 02668 } 02669 } 02670 02671 if (c2 == EOF) { 02672 (*o_putc)(EOF); 02673 } else if (c2 == 0) { 02674 output_mode = ASCII; 02675 (*o_putc)(c1); 02676 } else if (c2 == JIS_X_0201_1976_K) { 02677 output_mode = EUC_JP; 02678 (*o_putc)(SS2); (*o_putc)(c1|0x80); 02679 } else if (c2 == ISO_8859_1) { 02680 output_mode = ISO_8859_1; 02681 (*o_putc)(c1 | 0x080); 02682 #ifdef X0212_ENABLE 02683 } else if (is_eucg3(c2)){ 02684 output_mode = EUC_JP; 02685 #ifdef SHIFTJIS_CP932 02686 if (!cp932inv_f){ 02687 nkf_char s2, s1; 02688 if (e2s_conv(c2, c1, &s2, &s1) == 0){ 02689 s2e_conv(s2, s1, &c2, &c1); 02690 } 02691 } 02692 #endif 02693 if (c2 == 0) { 02694 output_mode = ASCII; 02695 (*o_putc)(c1); 02696 }else if (is_eucg3(c2)){ 02697 if (x0212_f){ 02698 (*o_putc)(0x8f); 02699 (*o_putc)((c2 & 0x7f) | 0x080); 02700 (*o_putc)(c1 | 0x080); 02701 } 02702 }else{ 02703 (*o_putc)((c2 & 0x7f) | 0x080); 02704 (*o_putc)(c1 | 0x080); 02705 } 02706 #endif 02707 } else { 02708 if (!nkf_isgraph(c1) || !nkf_isgraph(c2)) { 02709 set_iconv(FALSE, 0); 02710 return; /* too late to rescue this char */ 02711 } 02712 output_mode = EUC_JP; 02713 (*o_putc)(c2 | 0x080); 02714 (*o_putc)(c1 | 0x080); 02715 } 02716 } 02717 02718 static void 02719 s_oconv(nkf_char c2, nkf_char c1) 02720 { 02721 #ifdef NUMCHAR_OPTION 02722 if (c2 == 0 && nkf_char_unicode_p(c1)){ 02723 w16e_conv(c1, &c2, &c1); 02724 if (c2 == 0 && nkf_char_unicode_p(c1)){ 02725 c2 = c1 & VALUE_MASK; 02726 if (!x0213_f && 0xE000 <= c2 && c2 <= 0xE757) { 02727 /* CP932 UDC */ 02728 c1 &= 0xFFF; 02729 c2 = c1 / 188 + (cp932inv_f ? 
0xF0 : 0xEB); 02730 c1 = c1 % 188; 02731 c1 += 0x40 + (c1 > 0x3e); 02732 (*o_putc)(c2); 02733 (*o_putc)(c1); 02734 return; 02735 } else { 02736 if(encode_fallback)(*encode_fallback)(c1); 02737 return; 02738 } 02739 } 02740 } 02741 #endif 02742 if (c2 == EOF) { 02743 (*o_putc)(EOF); 02744 return; 02745 } else if (c2 == 0) { 02746 output_mode = ASCII; 02747 (*o_putc)(c1); 02748 } else if (c2 == JIS_X_0201_1976_K) { 02749 output_mode = SHIFT_JIS; 02750 (*o_putc)(c1|0x80); 02751 } else if (c2 == ISO_8859_1) { 02752 output_mode = ISO_8859_1; 02753 (*o_putc)(c1 | 0x080); 02754 #ifdef X0212_ENABLE 02755 } else if (is_eucg3(c2)){ 02756 output_mode = SHIFT_JIS; 02757 if (e2s_conv(c2, c1, &c2, &c1) == 0){ 02758 (*o_putc)(c2); 02759 (*o_putc)(c1); 02760 } 02761 #endif 02762 } else { 02763 if (!nkf_isprint(c1) || !nkf_isprint(c2)) { 02764 set_iconv(FALSE, 0); 02765 return; /* too late to rescue this char */ 02766 } 02767 output_mode = SHIFT_JIS; 02768 e2s_conv(c2, c1, &c2, &c1); 02769 02770 #ifdef SHIFTJIS_CP932 02771 if (cp932inv_f 02772 && CP932INV_TABLE_BEGIN <= c2 && c2 <= CP932INV_TABLE_END){ 02773 nkf_char c = cp932inv[c2 - CP932INV_TABLE_BEGIN][c1 - 0x40]; 02774 if (c){ 02775 c2 = c >> 8; 02776 c1 = c & 0xff; 02777 } 02778 } 02779 #endif /* SHIFTJIS_CP932 */ 02780 02781 (*o_putc)(c2); 02782 if (prefix_table[(unsigned char)c1]){ 02783 (*o_putc)(prefix_table[(unsigned char)c1]); 02784 } 02785 (*o_putc)(c1); 02786 } 02787 } 02788 02789 #ifdef UTF8_OUTPUT_ENABLE 02790 #define OUTPUT_UTF8(val) do { \ 02791 nkf_unicode_to_utf8(val, &c1, &c2, &c3, &c4); \ 02792 (*o_putc)(c1); \ 02793 if (c2) (*o_putc)(c2); \ 02794 if (c3) (*o_putc)(c3); \ 02795 if (c4) (*o_putc)(c4); \ 02796 } while (0) 02797 02798 static void 02799 w_oconv(nkf_char c2, nkf_char c1) 02800 { 02801 nkf_char c3, c4; 02802 nkf_char val, val2; 02803 02804 if (output_bom_f) { 02805 output_bom_f = FALSE; 02806 (*o_putc)('\357'); 02807 (*o_putc)('\273'); 02808 (*o_putc)('\277'); 02809 } 02810 02811 if (c2 == EOF) { 
02812 (*o_putc)(EOF); 02813 return; 02814 } 02815 02816 if (c2 == 0 && nkf_char_unicode_p(c1)){ 02817 val = c1 & VALUE_MASK; 02818 OUTPUT_UTF8(val); 02819 return; 02820 } 02821 02822 if (c2 == 0) { 02823 (*o_putc)(c1); 02824 } else { 02825 val = e2w_conv(c2, c1); 02826 if (val){ 02827 val2 = e2w_combining(val, c2, c1); 02828 if (val2) 02829 OUTPUT_UTF8(val2); 02830 OUTPUT_UTF8(val); 02831 } 02832 } 02833 } 02834 02835 #define OUTPUT_UTF16_BYTES(c1, c2) do { \ 02836 if (output_endian == ENDIAN_LITTLE){ \ 02837 (*o_putc)(c1); \ 02838 (*o_putc)(c2); \ 02839 }else{ \ 02840 (*o_putc)(c2); \ 02841 (*o_putc)(c1); \ 02842 } \ 02843 } while (0) 02844 02845 #define OUTPUT_UTF16(val) do { \ 02846 if (nkf_char_unicode_bmp_p(val)) { \ 02847 c2 = (val >> 8) & 0xff; \ 02848 c1 = val & 0xff; \ 02849 OUTPUT_UTF16_BYTES(c1, c2); \ 02850 } else { \ 02851 val &= VALUE_MASK; \ 02852 if (val <= UNICODE_MAX) { \ 02853 c2 = (val >> 10) + NKF_INT32_C(0xD7C0); /* high surrogate */ \ 02854 c1 = (val & 0x3FF) + NKF_INT32_C(0xDC00); /* low surrogate */ \ 02855 OUTPUT_UTF16_BYTES(c2 & 0xff, (c2 >> 8) & 0xff); \ 02856 OUTPUT_UTF16_BYTES(c1 & 0xff, (c1 >> 8) & 0xff); \ 02857 } \ 02858 } \ 02859 } while (0) 02860 02861 static void 02862 w_oconv16(nkf_char c2, nkf_char c1) 02863 { 02864 if (output_bom_f) { 02865 output_bom_f = FALSE; 02866 OUTPUT_UTF16_BYTES(0xFF, 0xFE); 02867 } 02868 02869 if (c2 == EOF) { 02870 (*o_putc)(EOF); 02871 return; 02872 } 02873 02874 if (c2 == 0 && nkf_char_unicode_p(c1)) { 02875 OUTPUT_UTF16(c1); 02876 } else if (c2) { 02877 nkf_char val, val2; 02878 val = e2w_conv(c2, c1); 02879 if (!val) return; 02880 val2 = e2w_combining(val, c2, c1); 02881 if (val2) 02882 OUTPUT_UTF16(val2); 02883 OUTPUT_UTF16(val); 02884 } else { 02885 OUTPUT_UTF16_BYTES(c1, c2); 02886 } 02887 } 02888 02889 #define OUTPUT_UTF32(c) do { \ 02890 if (output_endian == ENDIAN_LITTLE){ \ 02891 (*o_putc)( (c) & 0xFF); \ 02892 (*o_putc)(((c) >> 8) & 0xFF); \ 02893 (*o_putc)(((c) >> 16) & 0xFF); \ 02894 
(*o_putc)(0); \ 02895 }else{ \ 02896 (*o_putc)(0); \ 02897 (*o_putc)(((c) >> 16) & 0xFF); \ 02898 (*o_putc)(((c) >> 8) & 0xFF); \ 02899 (*o_putc)( (c) & 0xFF); \ 02900 } \ 02901 } while (0) 02902 02903 static void 02904 w_oconv32(nkf_char c2, nkf_char c1) 02905 { 02906 if (output_bom_f) { 02907 output_bom_f = FALSE; 02908 if (output_endian == ENDIAN_LITTLE){ 02909 (*o_putc)(0xFF); 02910 (*o_putc)(0xFE); 02911 (*o_putc)(0); 02912 (*o_putc)(0); 02913 }else{ 02914 (*o_putc)(0); 02915 (*o_putc)(0); 02916 (*o_putc)(0xFE); 02917 (*o_putc)(0xFF); 02918 } 02919 } 02920 02921 if (c2 == EOF) { 02922 (*o_putc)(EOF); 02923 return; 02924 } 02925 02926 if (c2 == ISO_8859_1) { 02927 c1 |= 0x80; 02928 } else if (c2 == 0 && nkf_char_unicode_p(c1)) { 02929 c1 &= VALUE_MASK; 02930 } else if (c2) { 02931 nkf_char val, val2; 02932 val = e2w_conv(c2, c1); 02933 if (!val) return; 02934 val2 = e2w_combining(val, c2, c1); 02935 if (val2) 02936 OUTPUT_UTF32(val2); 02937 c1 = val; 02938 } 02939 OUTPUT_UTF32(c1); 02940 } 02941 #endif 02942 02943 #define SCORE_L2 (1) /* Kanji Level 2 */ 02944 #define SCORE_KANA (SCORE_L2 << 1) /* Halfwidth Katakana */ 02945 #define SCORE_DEPEND (SCORE_KANA << 1) /* MD Characters */ 02946 #define SCORE_CP932 (SCORE_DEPEND << 1) /* IBM extended characters */ 02947 #define SCORE_X0212 (SCORE_CP932 << 1) /* JIS X 0212 */ 02948 #define SCORE_X0213 (SCORE_X0212 << 1) /* JIS X 0213 */ 02949 #define SCORE_NO_EXIST (SCORE_X0213 << 1) /* Undefined Characters */ 02950 #define SCORE_iMIME (SCORE_NO_EXIST << 1) /* MIME selected */ 02951 #define SCORE_ERROR (SCORE_iMIME << 1) /* Error */ 02952 02953 #define SCORE_INIT (SCORE_iMIME) 02954 02955 static const nkf_char score_table_A0[] = { 02956 0, 0, 0, 0, 02957 0, 0, 0, 0, 02958 0, SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND, 02959 SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND, SCORE_X0213, 02960 }; 02961 02962 static const nkf_char score_table_F0[] = { 02963 SCORE_L2, SCORE_L2, SCORE_L2, SCORE_L2, 02964 SCORE_L2, SCORE_DEPEND, 
SCORE_X0213, SCORE_X0213, 02965 SCORE_DEPEND, SCORE_DEPEND, SCORE_CP932, SCORE_CP932, 02966 SCORE_CP932, SCORE_X0213, SCORE_X0213, SCORE_ERROR, 02967 }; 02968 02969 static const nkf_char score_table_8FA0[] = { 02970 0, SCORE_X0213, SCORE_X0212, SCORE_X0213, 02971 SCORE_X0213, SCORE_X0213, SCORE_X0212, SCORE_X0212, 02972 SCORE_X0213, SCORE_X0212, SCORE_X0212, SCORE_X0212, 02973 SCORE_X0213, SCORE_X0213, SCORE_X0213, SCORE_X0213, 02974 }; 02975 02976 static const nkf_char score_table_8FE0[] = { 02977 SCORE_X0212, SCORE_X0212, SCORE_X0212, SCORE_X0212, 02978 SCORE_X0212, SCORE_X0212, SCORE_X0212, SCORE_X0212, 02979 SCORE_X0212, SCORE_X0212, SCORE_X0212, SCORE_X0212, 02980 SCORE_X0212, SCORE_X0212, SCORE_X0213, SCORE_X0213, 02981 }; 02982 02983 static const nkf_char score_table_8FF0[] = { 02984 SCORE_X0213, SCORE_X0213, SCORE_X0213, SCORE_X0212, 02985 SCORE_X0212, SCORE_X0213, SCORE_X0213, SCORE_X0213, 02986 SCORE_X0213, SCORE_X0213, SCORE_X0213, SCORE_X0213, 02987 SCORE_X0213, SCORE_X0213, SCORE_X0213, SCORE_X0213, 02988 }; 02989 02990 static void 02991 set_code_score(struct input_code *ptr, nkf_char score) 02992 { 02993 if (ptr){ 02994 ptr->score |= score; 02995 } 02996 } 02997 02998 static void 02999 clr_code_score(struct input_code *ptr, nkf_char score) 03000 { 03001 if (ptr){ 03002 ptr->score &= ~score; 03003 } 03004 } 03005 03006 static void 03007 code_score(struct input_code *ptr) 03008 { 03009 nkf_char c2 = ptr->buf[0]; 03010 nkf_char c1 = ptr->buf[1]; 03011 if (c2 < 0){ 03012 set_code_score(ptr, SCORE_ERROR); 03013 }else if (c2 == SS2){ 03014 set_code_score(ptr, SCORE_KANA); 03015 }else if (c2 == 0x8f){ 03016 if ((c1 & 0x70) == 0x20){ 03017 set_code_score(ptr, score_table_8FA0[c1 & 0x0f]); 03018 }else if ((c1 & 0x70) == 0x60){ 03019 set_code_score(ptr, score_table_8FE0[c1 & 0x0f]); 03020 }else if ((c1 & 0x70) == 0x70){ 03021 set_code_score(ptr, score_table_8FF0[c1 & 0x0f]); 03022 }else{ 03023 set_code_score(ptr, SCORE_X0212); 03024 } 03025 #ifdef 
UTF8_OUTPUT_ENABLE 03026 }else if (!e2w_conv(c2, c1)){ 03027 set_code_score(ptr, SCORE_NO_EXIST); 03028 #endif 03029 }else if ((c2 & 0x70) == 0x20){ 03030 set_code_score(ptr, score_table_A0[c2 & 0x0f]); 03031 }else if ((c2 & 0x70) == 0x70){ 03032 set_code_score(ptr, score_table_F0[c2 & 0x0f]); 03033 }else if ((c2 & 0x70) >= 0x50){ 03034 set_code_score(ptr, SCORE_L2); 03035 } 03036 } 03037 03038 static void 03039 status_disable(struct input_code *ptr) 03040 { 03041 ptr->stat = -1; 03042 ptr->buf[0] = -1; 03043 code_score(ptr); 03044 if (iconv == ptr->iconv_func) set_iconv(FALSE, 0); 03045 } 03046 03047 static void 03048 status_push_ch(struct input_code *ptr, nkf_char c) 03049 { 03050 ptr->buf[ptr->index++] = c; 03051 } 03052 03053 static void 03054 status_clear(struct input_code *ptr) 03055 { 03056 ptr->stat = 0; 03057 ptr->index = 0; 03058 } 03059 03060 static void 03061 status_reset(struct input_code *ptr) 03062 { 03063 status_clear(ptr); 03064 ptr->score = SCORE_INIT; 03065 } 03066 03067 static void 03068 status_reinit(struct input_code *ptr) 03069 { 03070 status_reset(ptr); 03071 ptr->_file_stat = 0; 03072 } 03073 03074 static void 03075 status_check(struct input_code *ptr, nkf_char c) 03076 { 03077 if (c <= DEL && estab_f){ 03078 status_reset(ptr); 03079 } 03080 } 03081 03082 static void 03083 s_status(struct input_code *ptr, nkf_char c) 03084 { 03085 switch(ptr->stat){ 03086 case -1: 03087 status_check(ptr, c); 03088 break; 03089 case 0: 03090 if (c <= DEL){ 03091 break; 03092 }else if (nkf_char_unicode_p(c)){ 03093 break; 03094 }else if (0xa1 <= c && c <= 0xdf){ 03095 status_push_ch(ptr, SS2); 03096 status_push_ch(ptr, c); 03097 code_score(ptr); 03098 status_clear(ptr); 03099 }else if ((0x81 <= c && c < 0xa0) || (0xe0 <= c && c <= 0xea)){ 03100 ptr->stat = 1; 03101 status_push_ch(ptr, c); 03102 }else if (0xed <= c && c <= 0xee){ 03103 ptr->stat = 3; 03104 status_push_ch(ptr, c); 03105 #ifdef SHIFTJIS_CP932 03106 }else if (is_ibmext_in_sjis(c)){ 03107 
ptr->stat = 2; 03108 status_push_ch(ptr, c); 03109 #endif /* SHIFTJIS_CP932 */ 03110 #ifdef X0212_ENABLE 03111 }else if (0xf0 <= c && c <= 0xfc){ 03112 ptr->stat = 1; 03113 status_push_ch(ptr, c); 03114 #endif /* X0212_ENABLE */ 03115 }else{ 03116 status_disable(ptr); 03117 } 03118 break; 03119 case 1: 03120 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){ 03121 status_push_ch(ptr, c); 03122 s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]); 03123 code_score(ptr); 03124 status_clear(ptr); 03125 }else{ 03126 status_disable(ptr); 03127 } 03128 break; 03129 case 2: 03130 #ifdef SHIFTJIS_CP932 03131 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)) { 03132 status_push_ch(ptr, c); 03133 if (s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]) == 0) { 03134 set_code_score(ptr, SCORE_CP932); 03135 status_clear(ptr); 03136 break; 03137 } 03138 } 03139 #endif /* SHIFTJIS_CP932 */ 03140 status_disable(ptr); 03141 break; 03142 case 3: 03143 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){ 03144 status_push_ch(ptr, c); 03145 s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]); 03146 set_code_score(ptr, SCORE_CP932); 03147 status_clear(ptr); 03148 }else{ 03149 status_disable(ptr); 03150 } 03151 break; 03152 } 03153 } 03154 03155 static void 03156 e_status(struct input_code *ptr, nkf_char c) 03157 { 03158 switch (ptr->stat){ 03159 case -1: 03160 status_check(ptr, c); 03161 break; 03162 case 0: 03163 if (c <= DEL){ 03164 break; 03165 }else if (nkf_char_unicode_p(c)){ 03166 break; 03167 }else if (SS2 == c || (0xa1 <= c && c <= 0xfe)){ 03168 ptr->stat = 1; 03169 status_push_ch(ptr, c); 03170 #ifdef X0212_ENABLE 03171 }else if (0x8f == c){ 03172 ptr->stat = 2; 03173 status_push_ch(ptr, c); 03174 #endif /* X0212_ENABLE */ 03175 }else{ 03176 status_disable(ptr); 03177 } 03178 break; 03179 case 1: 03180 if (0xa1 <= c && c <= 0xfe){ 03181 status_push_ch(ptr, c); 03182 code_score(ptr); 03183 status_clear(ptr); 03184 }else{ 
03185 status_disable(ptr); 03186 } 03187 break; 03188 #ifdef X0212_ENABLE 03189 case 2: 03190 if (0xa1 <= c && c <= 0xfe){ 03191 ptr->stat = 1; 03192 status_push_ch(ptr, c); 03193 }else{ 03194 status_disable(ptr); 03195 } 03196 #endif /* X0212_ENABLE */ 03197 } 03198 } 03199 03200 #ifdef UTF8_INPUT_ENABLE 03201 static void 03202 w_status(struct input_code *ptr, nkf_char c) 03203 { 03204 switch (ptr->stat){ 03205 case -1: 03206 status_check(ptr, c); 03207 break; 03208 case 0: 03209 if (c <= DEL){ 03210 break; 03211 }else if (nkf_char_unicode_p(c)){ 03212 break; 03213 }else if (0xc0 <= c && c <= 0xdf){ 03214 ptr->stat = 1; 03215 status_push_ch(ptr, c); 03216 }else if (0xe0 <= c && c <= 0xef){ 03217 ptr->stat = 2; 03218 status_push_ch(ptr, c); 03219 }else if (0xf0 <= c && c <= 0xf4){ 03220 ptr->stat = 3; 03221 status_push_ch(ptr, c); 03222 }else{ 03223 status_disable(ptr); 03224 } 03225 break; 03226 case 1: 03227 case 2: 03228 if (0x80 <= c && c <= 0xbf){ 03229 status_push_ch(ptr, c); 03230 if (ptr->index > ptr->stat){ 03231 int bom = (ptr->buf[0] == 0xef && ptr->buf[1] == 0xbb 03232 && ptr->buf[2] == 0xbf); 03233 w2e_conv(ptr->buf[0], ptr->buf[1], ptr->buf[2], 03234 &ptr->buf[0], &ptr->buf[1]); 03235 if (!bom){ 03236 code_score(ptr); 03237 } 03238 status_clear(ptr); 03239 } 03240 }else{ 03241 status_disable(ptr); 03242 } 03243 break; 03244 case 3: 03245 if (0x80 <= c && c <= 0xbf){ 03246 if (ptr->index < ptr->stat){ 03247 status_push_ch(ptr, c); 03248 } else { 03249 status_clear(ptr); 03250 } 03251 }else{ 03252 status_disable(ptr); 03253 } 03254 break; 03255 } 03256 } 03257 #endif 03258 03259 static void 03260 code_status(nkf_char c) 03261 { 03262 int action_flag = 1; 03263 struct input_code *result = 0; 03264 struct input_code *p = input_code_list; 03265 while (p->name){ 03266 if (!p->status_func) { 03267 ++p; 03268 continue; 03269 } 03270 if (!p->status_func) 03271 continue; 03272 (p->status_func)(p, c); 03273 if (p->stat > 0){ 03274 action_flag = 0; 03275 }else 
if(p->stat == 0){ 03276 if (result){ 03277 action_flag = 0; 03278 }else{ 03279 result = p; 03280 } 03281 } 03282 ++p; 03283 } 03284 03285 if (action_flag){ 03286 if (result && !estab_f){ 03287 set_iconv(TRUE, result->iconv_func); 03288 }else if (c <= DEL){ 03289 struct input_code *ptr = input_code_list; 03290 while (ptr->name){ 03291 status_reset(ptr); 03292 ++ptr; 03293 } 03294 } 03295 } 03296 } 03297 03298 typedef struct { 03299 nkf_buf_t *std_gc_buf; 03300 nkf_char broken_state; 03301 nkf_buf_t *broken_buf; 03302 nkf_char mimeout_state; 03303 nkf_buf_t *nfc_buf; 03304 } nkf_state_t; 03305 03306 static nkf_state_t *nkf_state = NULL; 03307 03308 #define STD_GC_BUFSIZE (256) 03309 03310 static void 03311 nkf_state_init(void) 03312 { 03313 if (nkf_state) { 03314 nkf_buf_clear(nkf_state->std_gc_buf); 03315 nkf_buf_clear(nkf_state->broken_buf); 03316 nkf_buf_clear(nkf_state->nfc_buf); 03317 } 03318 else { 03319 nkf_state = nkf_xmalloc(sizeof(nkf_state_t)); 03320 nkf_state->std_gc_buf = nkf_buf_new(STD_GC_BUFSIZE); 03321 nkf_state->broken_buf = nkf_buf_new(3); 03322 nkf_state->nfc_buf = nkf_buf_new(9); 03323 } 03324 nkf_state->broken_state = 0; 03325 nkf_state->mimeout_state = 0; 03326 } 03327 03328 #ifndef WIN32DLL 03329 static nkf_char 03330 std_getc(FILE *f) 03331 { 03332 if (!nkf_buf_empty_p(nkf_state->std_gc_buf)){ 03333 return nkf_buf_pop(nkf_state->std_gc_buf); 03334 } 03335 return getc(f); 03336 } 03337 #endif /*WIN32DLL*/ 03338 03339 static nkf_char 03340 std_ungetc(nkf_char c, ARG_UNUSED FILE *f) 03341 { 03342 nkf_buf_push(nkf_state->std_gc_buf, c); 03343 return c; 03344 } 03345 03346 #ifndef WIN32DLL 03347 static void 03348 std_putc(nkf_char c) 03349 { 03350 if(c!=EOF) 03351 putchar(c); 03352 } 03353 #endif /*WIN32DLL*/ 03354 03355 static nkf_char hold_buf[HOLD_SIZE*2]; 03356 static int hold_count = 0; 03357 static nkf_char 03358 push_hold_buf(nkf_char c2) 03359 { 03360 if (hold_count >= HOLD_SIZE*2) 03361 return (EOF); 03362 hold_buf[hold_count++] = c2; 
03363 return ((hold_count >= HOLD_SIZE*2) ? EOF : hold_count); 03364 } 03365 03366 static int 03367 h_conv(FILE *f, nkf_char c1, nkf_char c2) 03368 { 03369 int ret; 03370 int hold_index; 03371 int fromhold_count; 03372 nkf_char c3, c4; 03373 03378 hold_count = 0; 03379 push_hold_buf(c1); 03380 push_hold_buf(c2); 03381 03382 while ((c2 = (*i_getc)(f)) != EOF) { 03383 if (c2 == ESC){ 03384 (*i_ungetc)(c2,f); 03385 break; 03386 } 03387 code_status(c2); 03388 if (push_hold_buf(c2) == EOF || estab_f) { 03389 break; 03390 } 03391 } 03392 03393 if (!estab_f) { 03394 struct input_code *p = input_code_list; 03395 struct input_code *result = p; 03396 if (c2 == EOF) { 03397 code_status(c2); 03398 } 03399 while (p->name) { 03400 if (p->status_func && p->score < result->score) { 03401 result = p; 03402 } 03403 p++; 03404 } 03405 set_iconv(TRUE, result->iconv_func); 03406 } 03407 03408 03418 ret = c2; 03419 hold_index = 0; 03420 while (hold_index < hold_count){ 03421 c1 = hold_buf[hold_index++]; 03422 if (nkf_char_unicode_p(c1)) { 03423 (*oconv)(0, c1); 03424 continue; 03425 } 03426 else if (c1 <= DEL){ 03427 (*iconv)(0, c1, 0); 03428 continue; 03429 }else if (iconv == s_iconv && 0xa1 <= c1 && c1 <= 0xdf){ 03430 (*iconv)(JIS_X_0201_1976_K, c1, 0); 03431 continue; 03432 } 03433 fromhold_count = 1; 03434 if (hold_index < hold_count){ 03435 c2 = hold_buf[hold_index++]; 03436 fromhold_count++; 03437 }else{ 03438 c2 = (*i_getc)(f); 03439 if (c2 == EOF){ 03440 c4 = EOF; 03441 break; 03442 } 03443 code_status(c2); 03444 } 03445 c3 = 0; 03446 switch ((*iconv)(c1, c2, 0)) { /* can be EUC/SJIS/UTF-8 */ 03447 case -2: 03448 /* 4 bytes UTF-8 */ 03449 if (hold_index < hold_count){ 03450 c3 = hold_buf[hold_index++]; 03451 } else if ((c3 = (*i_getc)(f)) == EOF) { 03452 ret = EOF; 03453 break; 03454 } 03455 code_status(c3); 03456 if (hold_index < hold_count){ 03457 c4 = hold_buf[hold_index++]; 03458 } else if ((c4 = (*i_getc)(f)) == EOF) { 03459 c3 = ret = EOF; 03460 break; 03461 } 03462 
code_status(c4); 03463 (*iconv)(c1, c2, (c3<<8)|c4); 03464 break; 03465 case -3: 03466 /* 4 bytes UTF-8 (check combining character) */ 03467 if (hold_index < hold_count){ 03468 c3 = hold_buf[hold_index++]; 03469 fromhold_count++; 03470 } else if ((c3 = (*i_getc)(f)) == EOF) { 03471 w_iconv_nocombine(c1, c2, 0); 03472 break; 03473 } 03474 if (hold_index < hold_count){ 03475 c4 = hold_buf[hold_index++]; 03476 fromhold_count++; 03477 } else if ((c4 = (*i_getc)(f)) == EOF) { 03478 w_iconv_nocombine(c1, c2, 0); 03479 if (fromhold_count <= 2) 03480 (*i_ungetc)(c3,f); 03481 else 03482 hold_index--; 03483 continue; 03484 } 03485 if (w_iconv_combine(c1, c2, 0, c3, c4, 0)) { 03486 w_iconv_nocombine(c1, c2, 0); 03487 if (fromhold_count <= 2) { 03488 (*i_ungetc)(c4,f); 03489 (*i_ungetc)(c3,f); 03490 } else if (fromhold_count == 3) { 03491 (*i_ungetc)(c4,f); 03492 hold_index--; 03493 } else { 03494 hold_index -= 2; 03495 } 03496 } 03497 break; 03498 case -1: 03499 /* 3 bytes EUC or UTF-8 */ 03500 if (hold_index < hold_count){ 03501 c3 = hold_buf[hold_index++]; 03502 fromhold_count++; 03503 } else if ((c3 = (*i_getc)(f)) == EOF) { 03504 ret = EOF; 03505 break; 03506 } else { 03507 code_status(c3); 03508 } 03509 if ((*iconv)(c1, c2, c3) == -3) { 03510 /* 6 bytes UTF-8 (check combining character) */ 03511 nkf_char c5, c6; 03512 if (hold_index < hold_count){ 03513 c4 = hold_buf[hold_index++]; 03514 fromhold_count++; 03515 } else if ((c4 = (*i_getc)(f)) == EOF) { 03516 w_iconv_nocombine(c1, c2, c3); 03517 continue; 03518 } 03519 if (hold_index < hold_count){ 03520 c5 = hold_buf[hold_index++]; 03521 fromhold_count++; 03522 } else if ((c5 = (*i_getc)(f)) == EOF) { 03523 w_iconv_nocombine(c1, c2, c3); 03524 if (fromhold_count == 4) 03525 hold_index--; 03526 else 03527 (*i_ungetc)(c4,f); 03528 continue; 03529 } 03530 if (hold_index < hold_count){ 03531 c6 = hold_buf[hold_index++]; 03532 fromhold_count++; 03533 } else if ((c6 = (*i_getc)(f)) == EOF) { 03534 w_iconv_nocombine(c1, c2, c3); 
03535 if (fromhold_count == 5) { 03536 hold_index -= 2; 03537 } else if (fromhold_count == 4) { 03538 hold_index--; 03539 (*i_ungetc)(c5,f); 03540 } else { 03541 (*i_ungetc)(c5,f); 03542 (*i_ungetc)(c4,f); 03543 } 03544 continue; 03545 } 03546 if (w_iconv_combine(c1, c2, c3, c4, c5, c6)) { 03547 w_iconv_nocombine(c1, c2, c3); 03548 if (fromhold_count == 6) { 03549 hold_index -= 3; 03550 } else if (fromhold_count == 5) { 03551 hold_index -= 2; 03552 (*i_ungetc)(c6,f); 03553 } else if (fromhold_count == 4) { 03554 hold_index--; 03555 (*i_ungetc)(c6,f); 03556 (*i_ungetc)(c5,f); 03557 } else { 03558 (*i_ungetc)(c6,f); 03559 (*i_ungetc)(c5,f); 03560 (*i_ungetc)(c4,f); 03561 } 03562 } 03563 } 03564 break; 03565 } 03566 if (c3 == EOF) break; 03567 } 03568 return ret; 03569 } 03570 03571 /* 03572 * Check and Ignore BOM 03573 */ 03574 static void 03575 check_bom(FILE *f) 03576 { 03577 int c2; 03578 switch(c2 = (*i_getc)(f)){ 03579 case 0x00: 03580 if((c2 = (*i_getc)(f)) == 0x00){ 03581 if((c2 = (*i_getc)(f)) == 0xFE){ 03582 if((c2 = (*i_getc)(f)) == 0xFF){ 03583 if(!input_encoding){ 03584 set_iconv(TRUE, w_iconv32); 03585 } 03586 if (iconv == w_iconv32) { 03587 input_bom_f = TRUE; 03588 input_endian = ENDIAN_BIG; 03589 return; 03590 } 03591 (*i_ungetc)(0xFF,f); 03592 }else (*i_ungetc)(c2,f); 03593 (*i_ungetc)(0xFE,f); 03594 }else if(c2 == 0xFF){ 03595 if((c2 = (*i_getc)(f)) == 0xFE){ 03596 if(!input_encoding){ 03597 set_iconv(TRUE, w_iconv32); 03598 } 03599 if (iconv == w_iconv32) { 03600 input_endian = ENDIAN_2143; 03601 return; 03602 } 03603 (*i_ungetc)(0xFF,f); 03604 }else (*i_ungetc)(c2,f); 03605 (*i_ungetc)(0xFF,f); 03606 }else (*i_ungetc)(c2,f); 03607 (*i_ungetc)(0x00,f); 03608 }else (*i_ungetc)(c2,f); 03609 (*i_ungetc)(0x00,f); 03610 break; 03611 case 0xEF: 03612 if((c2 = (*i_getc)(f)) == 0xBB){ 03613 if((c2 = (*i_getc)(f)) == 0xBF){ 03614 if(!input_encoding){ 03615 set_iconv(TRUE, w_iconv); 03616 } 03617 if (iconv == w_iconv) { 03618 input_bom_f = TRUE; 03619 
return; 03620 } 03621 (*i_ungetc)(0xBF,f); 03622 }else (*i_ungetc)(c2,f); 03623 (*i_ungetc)(0xBB,f); 03624 }else (*i_ungetc)(c2,f); 03625 (*i_ungetc)(0xEF,f); 03626 break; 03627 case 0xFE: 03628 if((c2 = (*i_getc)(f)) == 0xFF){ 03629 if((c2 = (*i_getc)(f)) == 0x00){ 03630 if((c2 = (*i_getc)(f)) == 0x00){ 03631 if(!input_encoding){ 03632 set_iconv(TRUE, w_iconv32); 03633 } 03634 if (iconv == w_iconv32) { 03635 input_endian = ENDIAN_3412; 03636 return; 03637 } 03638 (*i_ungetc)(0x00,f); 03639 }else (*i_ungetc)(c2,f); 03640 (*i_ungetc)(0x00,f); 03641 }else (*i_ungetc)(c2,f); 03642 if(!input_encoding){ 03643 set_iconv(TRUE, w_iconv16); 03644 } 03645 if (iconv == w_iconv16) { 03646 input_endian = ENDIAN_BIG; 03647 input_bom_f = TRUE; 03648 return; 03649 } 03650 (*i_ungetc)(0xFF,f); 03651 }else (*i_ungetc)(c2,f); 03652 (*i_ungetc)(0xFE,f); 03653 break; 03654 case 0xFF: 03655 if((c2 = (*i_getc)(f)) == 0xFE){ 03656 if((c2 = (*i_getc)(f)) == 0x00){ 03657 if((c2 = (*i_getc)(f)) == 0x00){ 03658 if(!input_encoding){ 03659 set_iconv(TRUE, w_iconv32); 03660 } 03661 if (iconv == w_iconv32) { 03662 input_endian = ENDIAN_LITTLE; 03663 input_bom_f = TRUE; 03664 return; 03665 } 03666 (*i_ungetc)(0x00,f); 03667 }else (*i_ungetc)(c2,f); 03668 (*i_ungetc)(0x00,f); 03669 }else (*i_ungetc)(c2,f); 03670 if(!input_encoding){ 03671 set_iconv(TRUE, w_iconv16); 03672 } 03673 if (iconv == w_iconv16) { 03674 input_endian = ENDIAN_LITTLE; 03675 input_bom_f = TRUE; 03676 return; 03677 } 03678 (*i_ungetc)(0xFE,f); 03679 }else (*i_ungetc)(c2,f); 03680 (*i_ungetc)(0xFF,f); 03681 break; 03682 default: 03683 (*i_ungetc)(c2,f); 03684 break; 03685 } 03686 } 03687 03688 static nkf_char 03689 broken_getc(FILE *f) 03690 { 03691 nkf_char c, c1; 03692 03693 if (!nkf_buf_empty_p(nkf_state->broken_buf)) { 03694 return nkf_buf_pop(nkf_state->broken_buf); 03695 } 03696 c = (*i_bgetc)(f); 03697 if (c=='$' && nkf_state->broken_state != ESC 03698 && (input_mode == ASCII || input_mode == JIS_X_0201_1976_K)) { 03699 
c1= (*i_bgetc)(f); 03700 nkf_state->broken_state = 0; 03701 if (c1=='@'|| c1=='B') { 03702 nkf_buf_push(nkf_state->broken_buf, c1); 03703 nkf_buf_push(nkf_state->broken_buf, c); 03704 return ESC; 03705 } else { 03706 (*i_bungetc)(c1,f); 03707 return c; 03708 } 03709 } else if (c=='(' && nkf_state->broken_state != ESC 03710 && (input_mode == JIS_X_0208 || input_mode == JIS_X_0201_1976_K)) { 03711 c1= (*i_bgetc)(f); 03712 nkf_state->broken_state = 0; 03713 if (c1=='J'|| c1=='B') { 03714 nkf_buf_push(nkf_state->broken_buf, c1); 03715 nkf_buf_push(nkf_state->broken_buf, c); 03716 return ESC; 03717 } else { 03718 (*i_bungetc)(c1,f); 03719 return c; 03720 } 03721 } else { 03722 nkf_state->broken_state = c; 03723 return c; 03724 } 03725 } 03726 03727 static nkf_char 03728 broken_ungetc(nkf_char c, ARG_UNUSED FILE *f) 03729 { 03730 if (nkf_buf_length(nkf_state->broken_buf) < 2) 03731 nkf_buf_push(nkf_state->broken_buf, c); 03732 return c; 03733 } 03734 03735 static void 03736 eol_conv(nkf_char c2, nkf_char c1) 03737 { 03738 if (guess_f && input_eol != EOF) { 03739 if (c2 == 0 && c1 == LF) { 03740 if (!input_eol) input_eol = prev_cr ? CRLF : LF; 03741 else if (input_eol != (prev_cr ? CRLF : LF)) input_eol = EOF; 03742 } else if (c2 == 0 && c1 == CR && input_eol == LF) input_eol = EOF; 03743 else if (!prev_cr); 03744 else if (!input_eol) input_eol = CR; 03745 else if (input_eol != CR) input_eol = EOF; 03746 } 03747 if (prev_cr || (c2 == 0 && c1 == LF)) { 03748 prev_cr = 0; 03749 if (eolmode_f != LF) (*o_eol_conv)(0, CR); 03750 if (eolmode_f != CR) (*o_eol_conv)(0, LF); 03751 } 03752 if (c2 == 0 && c1 == CR) prev_cr = CR; 03753 else if (c2 != 0 || c1 != LF) (*o_eol_conv)(c2, c1); 03754 } 03755 03756 static void 03757 put_newline(void (*func)(nkf_char)) 03758 { 03759 switch (eolmode_f ? 
eolmode_f : DEFAULT_NEWLINE) { 03760 case CRLF: 03761 (*func)(0x0D); 03762 (*func)(0x0A); 03763 break; 03764 case CR: 03765 (*func)(0x0D); 03766 break; 03767 case LF: 03768 (*func)(0x0A); 03769 break; 03770 } 03771 } 03772 03773 static void 03774 oconv_newline(void (*func)(nkf_char, nkf_char)) 03775 { 03776 switch (eolmode_f ? eolmode_f : DEFAULT_NEWLINE) { 03777 case CRLF: 03778 (*func)(0, 0x0D); 03779 (*func)(0, 0x0A); 03780 break; 03781 case CR: 03782 (*func)(0, 0x0D); 03783 break; 03784 case LF: 03785 (*func)(0, 0x0A); 03786 break; 03787 } 03788 } 03789 03790 /* 03791 Return value of fold_conv() 03792 03793 LF add newline and output char 03794 CR add newline and output nothing 03795 SP space 03796 0 skip 03797 1 (or else) normal output 03798 03799 fold state in prev (previous character) 03800 03801 >0x80 Japanese (X0208/X0201) 03802 <0x80 ASCII 03803 LF new line 03804 SP space 03805 03806 This fold algorthm does not preserve heading space in a line. 03807 This is the main difference from fmt. 
03808 */ 03809 03810 #define char_size(c2,c1) (c2?2:1) 03811 03812 static void 03813 fold_conv(nkf_char c2, nkf_char c1) 03814 { 03815 nkf_char prev0; 03816 nkf_char fold_state; 03817 03818 if (c1== CR && !fold_preserve_f) { 03819 fold_state=0; /* ignore cr */ 03820 }else if (c1== LF&&f_prev==CR && fold_preserve_f) { 03821 f_prev = LF; 03822 fold_state=0; /* ignore cr */ 03823 } else if (c1== BS) { 03824 if (f_line>0) f_line--; 03825 fold_state = 1; 03826 } else if (c2==EOF && f_line != 0) { /* close open last line */ 03827 fold_state = LF; 03828 } else if ((c1==LF && !fold_preserve_f) 03829 || ((c1==CR||(c1==LF&&f_prev!=CR)) 03830 && fold_preserve_f)) { 03831 /* new line */ 03832 if (fold_preserve_f) { 03833 f_prev = c1; 03834 f_line = 0; 03835 fold_state = CR; 03836 } else if ((f_prev == c1 && !fold_preserve_f) 03837 || (f_prev == LF && fold_preserve_f) 03838 ) { /* duplicate newline */ 03839 if (f_line) { 03840 f_line = 0; 03841 fold_state = LF; /* output two newline */ 03842 } else { 03843 f_line = 0; 03844 fold_state = 1; 03845 } 03846 } else { 03847 if (f_prev&0x80) { /* Japanese? */ 03848 f_prev = c1; 03849 fold_state = 0; /* ignore given single newline */ 03850 } else if (f_prev==SP) { 03851 fold_state = 0; 03852 } else { 03853 f_prev = c1; 03854 if (++f_line<=fold_len) 03855 fold_state = SP; 03856 else { 03857 f_line = 0; 03858 fold_state = CR; /* fold and output nothing */ 03859 } 03860 } 03861 } 03862 } else if (c1=='\f') { 03863 f_prev = LF; 03864 f_line = 0; 03865 fold_state = LF; /* output newline and clear */ 03866 } else if ((c2==0 && nkf_isblank(c1)) || (c2 == '!' 
&& c1 == '!')) { 03867 /* X0208 kankaku or ascii space */ 03868 if (f_prev == SP) { 03869 fold_state = 0; /* remove duplicate spaces */ 03870 } else { 03871 f_prev = SP; 03872 if (++f_line<=fold_len) 03873 fold_state = SP; /* output ASCII space only */ 03874 else { 03875 f_prev = SP; f_line = 0; 03876 fold_state = CR; /* fold and output nothing */ 03877 } 03878 } 03879 } else { 03880 prev0 = f_prev; /* we still need this one... , but almost done */ 03881 f_prev = c1; 03882 if (c2 || c2 == JIS_X_0201_1976_K) 03883 f_prev |= 0x80; /* this is Japanese */ 03884 f_line += c2 == JIS_X_0201_1976_K ? 1: char_size(c2,c1); 03885 if (f_line<=fold_len) { /* normal case */ 03886 fold_state = 1; 03887 } else { 03888 if (f_line>fold_len+fold_margin) { /* too many kinsoku suspension */ 03889 f_line = char_size(c2,c1); 03890 fold_state = LF; /* We can't wait, do fold now */ 03891 } else if (c2 == JIS_X_0201_1976_K) { 03892 /* simple kinsoku rules return 1 means no folding */ 03893 if (c1==(0xde&0x7f)) fold_state = 1; /* $B!+(B*/ 03894 else if (c1==(0xdf&0x7f)) fold_state = 1; /* $B!,(B*/ 03895 else if (c1==(0xa4&0x7f)) fold_state = 1; /* $B!#(B*/ 03896 else if (c1==(0xa3&0x7f)) fold_state = 1; /* $B!$(B*/ 03897 else if (c1==(0xa1&0x7f)) fold_state = 1; /* $B!W(B*/ 03898 else if (c1==(0xb0&0x7f)) fold_state = 1; /* - */ 03899 else if (SP<=c1 && c1<=(0xdf&0x7f)) { /* X0201 */ 03900 f_line = 1; 03901 fold_state = LF;/* add one new f_line before this character */ 03902 } else { 03903 f_line = 1; 03904 fold_state = LF;/* add one new f_line before this character */ 03905 } 03906 } else if (c2==0) { 03907 /* kinsoku point in ASCII */ 03908 if ( c1==')'|| /* { [ ( */ 03909 c1==']'|| 03910 c1=='}'|| 03911 c1=='.'|| 03912 c1==','|| 03913 c1=='!'|| 03914 c1=='?'|| 03915 c1=='/'|| 03916 c1==':'|| 03917 c1==';') { 03918 fold_state = 1; 03919 /* just after special */ 03920 } else if (!is_alnum(prev0)) { 03921 f_line = char_size(c2,c1); 03922 fold_state = LF; 03923 } else if ((prev0==SP) || /* 
ignored new f_line */ 03924 (prev0==LF)|| /* ignored new f_line */ 03925 (prev0&0x80)) { /* X0208 - ASCII */ 03926 f_line = char_size(c2,c1); 03927 fold_state = LF;/* add one new f_line before this character */ 03928 } else { 03929 fold_state = 1; /* default no fold in ASCII */ 03930 } 03931 } else { 03932 if (c2=='!') { 03933 if (c1=='"') fold_state = 1; /* $B!"(B */ 03934 else if (c1=='#') fold_state = 1; /* $B!#(B */ 03935 else if (c1=='W') fold_state = 1; /* $B!W(B */ 03936 else if (c1=='K') fold_state = 1; /* $B!K(B */ 03937 else if (c1=='$') fold_state = 1; /* $B!$(B */ 03938 else if (c1=='%') fold_state = 1; /* $B!%(B */ 03939 else if (c1=='\'') fold_state = 1; /* $B!\(B */ 03940 else if (c1=='(') fold_state = 1; /* $B!((B */ 03941 else if (c1==')') fold_state = 1; /* $B!)(B */ 03942 else if (c1=='*') fold_state = 1; /* $B!*(B */ 03943 else if (c1=='+') fold_state = 1; /* $B!+(B */ 03944 else if (c1==',') fold_state = 1; /* $B!,(B */ 03945 /* default no fold in kinsoku */ 03946 else { 03947 fold_state = LF; 03948 f_line = char_size(c2,c1); 03949 /* add one new f_line before this character */ 03950 } 03951 } else { 03952 f_line = char_size(c2,c1); 03953 fold_state = LF; 03954 /* add one new f_line before this character */ 03955 } 03956 } 03957 } 03958 } 03959 /* terminator process */ 03960 switch(fold_state) { 03961 case LF: 03962 oconv_newline(o_fconv); 03963 (*o_fconv)(c2,c1); 03964 break; 03965 case 0: 03966 return; 03967 case CR: 03968 oconv_newline(o_fconv); 03969 break; 03970 case TAB: 03971 case SP: 03972 (*o_fconv)(0,SP); 03973 break; 03974 default: 03975 (*o_fconv)(c2,c1); 03976 } 03977 } 03978 03979 static nkf_char z_prev2=0,z_prev1=0; 03980 03981 static void 03982 z_conv(nkf_char c2, nkf_char c1) 03983 { 03984 03985 /* if (c2) c1 &= 0x7f; assertion */ 03986 03987 if (c2 == JIS_X_0201_1976_K && (c1 == 0x20 || c1 == 0x7D || c1 == 0x7E)) { 03988 (*o_zconv)(c2,c1); 03989 return; 03990 } 03991 03992 if (x0201_f) { 03993 if (z_prev2 == JIS_X_0201_1976_K) 
{ 03994 if (c2 == JIS_X_0201_1976_K) { 03995 if (c1 == (0xde&0x7f)) { /* $BByE@(B */ 03996 z_prev2 = 0; 03997 (*o_zconv)(dv[(z_prev1-SP)*2], dv[(z_prev1-SP)*2+1]); 03998 return; 03999 } else if (c1 == (0xdf&0x7f) && ev[(z_prev1-SP)*2]) { /* $BH>ByE@(B */ 04000 z_prev2 = 0; 04001 (*o_zconv)(ev[(z_prev1-SP)*2], ev[(z_prev1-SP)*2+1]); 04002 return; 04003 } else if (x0213_f && c1 == (0xdf&0x7f) && ev_x0213[(z_prev1-SP)*2]) { /* $BH>ByE@(B */ 04004 z_prev2 = 0; 04005 (*o_zconv)(ev_x0213[(z_prev1-SP)*2], ev_x0213[(z_prev1-SP)*2+1]); 04006 return; 04007 } 04008 } 04009 z_prev2 = 0; 04010 (*o_zconv)(cv[(z_prev1-SP)*2], cv[(z_prev1-SP)*2+1]); 04011 } 04012 if (c2 == JIS_X_0201_1976_K) { 04013 if (dv[(c1-SP)*2] || ev[(c1-SP)*2] || (x0213_f && ev_x0213[(c1-SP)*2])) { 04014 /* wait for $BByE@(B or $BH>ByE@(B */ 04015 z_prev1 = c1; 04016 z_prev2 = c2; 04017 return; 04018 } else { 04019 (*o_zconv)(cv[(c1-SP)*2], cv[(c1-SP)*2+1]); 04020 return; 04021 } 04022 } 04023 } 04024 04025 if (c2 == EOF) { 04026 (*o_zconv)(c2, c1); 04027 return; 04028 } 04029 04030 if (alpha_f&1 && c2 == 0x23) { 04031 /* JISX0208 Alphabet */ 04032 c2 = 0; 04033 } else if (c2 == 0x21) { 04034 /* JISX0208 Kigou */ 04035 if (0x21==c1) { 04036 if (alpha_f&2) { 04037 c2 = 0; 04038 c1 = SP; 04039 } else if (alpha_f&4) { 04040 (*o_zconv)(0, SP); 04041 (*o_zconv)(0, SP); 04042 return; 04043 } 04044 } else if (alpha_f&1 && 0x20<c1 && c1<0x7f && fv[c1-0x20]) { 04045 c2 = 0; 04046 c1 = fv[c1-0x20]; 04047 } 04048 } 04049 04050 if (alpha_f&8 && c2 == 0) { 04051 /* HTML Entity */ 04052 const char *entity = 0; 04053 switch (c1){ 04054 case '>': entity = ">"; break; 04055 case '<': entity = "<"; break; 04056 case '\"': entity = """; break; 04057 case '&': entity = "&"; break; 04058 } 04059 if (entity){ 04060 while (*entity) (*o_zconv)(0, *entity++); 04061 return; 04062 } 04063 } 04064 04065 if (alpha_f & 16) { 04066 /* JIS X 0208 Katakana to JIS X 0201 Katakana */ 04067 if (c2 == 0x21) { 04068 nkf_char c = 0; 04069 switch 
(c1) { 04070 case 0x23: 04071 /* U+3002 (0x8142) Ideographic Full Stop -> U+FF61 (0xA1) Halfwidth Ideographic Full Stop */ 04072 c = 0xA1; 04073 break; 04074 case 0x56: 04075 /* U+300C (0x8175) Left Corner Bracket -> U+FF62 (0xA2) Halfwidth Left Corner Bracket */ 04076 c = 0xA2; 04077 break; 04078 case 0x57: 04079 /* U+300D (0x8176) Right Corner Bracket -> U+FF63 (0xA3) Halfwidth Right Corner Bracket */ 04080 c = 0xA3; 04081 break; 04082 case 0x22: 04083 /* U+3001 (0x8141) Ideographic Comma -> U+FF64 (0xA4) Halfwidth Ideographic Comma */ 04084 c = 0xA4; 04085 break; 04086 case 0x26: 04087 /* U+30FB (0x8145) Katakana Middle Dot -> U+FF65 (0xA5) Halfwidth Katakana Middle Dot */ 04088 c = 0xA5; 04089 break; 04090 case 0x3C: 04091 /* U+30FC (0x815B) Katakana-Hiragana Prolonged Sound Mark -> U+FF70 (0xB0) Halfwidth Katakana-Hiragana Prolonged Sound Mark */ 04092 c = 0xB0; 04093 break; 04094 case 0x2B: 04095 /* U+309B (0x814A) Katakana-Hiragana Voiced Sound Mark -> U+FF9E (0xDE) Halfwidth Katakana Voiced Sound Mark */ 04096 c = 0xDE; 04097 break; 04098 case 0x2C: 04099 /* U+309C (0x814B) Katakana-Hiragana Semi-Voiced Sound Mark -> U+FF9F (0xDF) Halfwidth Katakana Semi-Voiced Sound Mark */ 04100 c = 0xDF; 04101 break; 04102 } 04103 if (c) { 04104 (*o_zconv)(JIS_X_0201_1976_K, c); 04105 return; 04106 } 04107 } else if (c2 == 0x25) { 04108 /* JISX0208 Katakana */ 04109 static const int fullwidth_to_halfwidth[] = 04110 { 04111 0x0000, 0x2700, 0x3100, 0x2800, 0x3200, 0x2900, 0x3300, 0x2A00, 04112 0x3400, 0x2B00, 0x3500, 0x3600, 0x365E, 0x3700, 0x375E, 0x3800, 04113 0x385E, 0x3900, 0x395E, 0x3A00, 0x3A5E, 0x3B00, 0x3B5E, 0x3C00, 04114 0x3C5E, 0x3D00, 0x3D5E, 0x3E00, 0x3E5E, 0x3F00, 0x3F5E, 0x4000, 04115 0x405E, 0x4100, 0x415E, 0x2F00, 0x4200, 0x425E, 0x4300, 0x435E, 04116 0x4400, 0x445E, 0x4500, 0x4600, 0x4700, 0x4800, 0x4900, 0x4A00, 04117 0x4A5E, 0x4A5F, 0x4B00, 0x4B5E, 0x4B5F, 0x4C00, 0x4C5E, 0x4C5F, 04118 0x4D00, 0x4D5E, 0x4D5F, 0x4E00, 0x4E5E, 0x4E5F, 0x4F00, 0x5000, 
04119 0x5100, 0x5200, 0x5300, 0x2C00, 0x5400, 0x2D00, 0x5500, 0x2E00, 04120 0x5600, 0x5700, 0x5800, 0x5900, 0x5A00, 0x5B00, 0x0000, 0x5C00, 04121 0x0000, 0x0000, 0x2600, 0x5D00, 0x335E, 0x0000, 0x0000, 0x365F, 04122 0x375F, 0x385F, 0x395F, 0x3A5F, 0x3E5F, 0x425F, 0x445F, 0x0000 04123 }; 04124 if (fullwidth_to_halfwidth[c1-0x20]){ 04125 c2 = fullwidth_to_halfwidth[c1-0x20]; 04126 (*o_zconv)(JIS_X_0201_1976_K, c2>>8); 04127 if (c2 & 0xFF) { 04128 (*o_zconv)(JIS_X_0201_1976_K, c2&0xFF); 04129 } 04130 return; 04131 } 04132 } else if (c2 == 0 && nkf_char_unicode_p(c1) && 04133 ((c1&VALUE_MASK) == 0x3099 || (c1&VALUE_MASK) == 0x309A)) { /* $B9g@.MQByE@!&H>ByE@(B */ 04134 (*o_zconv)(JIS_X_0201_1976_K, 0x5E + (c1&VALUE_MASK) - 0x3099); 04135 return; 04136 } 04137 } 04138 (*o_zconv)(c2,c1); 04139 } 04140 04141 04142 #define rot13(c) ( \ 04143 ( c < 'A') ? c: \ 04144 (c <= 'M') ? (c + 13): \ 04145 (c <= 'Z') ? (c - 13): \ 04146 (c < 'a') ? (c): \ 04147 (c <= 'm') ? (c + 13): \ 04148 (c <= 'z') ? (c - 13): \ 04149 (c) \ 04150 ) 04151 04152 #define rot47(c) ( \ 04153 ( c < '!') ? c: \ 04154 ( c <= 'O') ? (c + 47) : \ 04155 ( c <= '~') ? 
(c - 47) : \ 04156 c \ 04157 ) 04158 04159 static void 04160 rot_conv(nkf_char c2, nkf_char c1) 04161 { 04162 if (c2 == 0 || c2 == JIS_X_0201_1976_K || c2 == ISO_8859_1) { 04163 c1 = rot13(c1); 04164 } else if (c2) { 04165 c1 = rot47(c1); 04166 c2 = rot47(c2); 04167 } 04168 (*o_rot_conv)(c2,c1); 04169 } 04170 04171 static void 04172 hira_conv(nkf_char c2, nkf_char c1) 04173 { 04174 if (hira_f & 1) { 04175 if (c2 == 0x25) { 04176 if (0x20 < c1 && c1 < 0x74) { 04177 c2 = 0x24; 04178 (*o_hira_conv)(c2,c1); 04179 return; 04180 } else if (c1 == 0x74 && nkf_enc_unicode_p(output_encoding)) { 04181 c2 = 0; 04182 c1 = nkf_char_unicode_new(0x3094); 04183 (*o_hira_conv)(c2,c1); 04184 return; 04185 } 04186 } else if (c2 == 0x21 && (c1 == 0x33 || c1 == 0x34)) { 04187 c1 += 2; 04188 (*o_hira_conv)(c2,c1); 04189 return; 04190 } 04191 } 04192 if (hira_f & 2) { 04193 if (c2 == 0 && c1 == nkf_char_unicode_new(0x3094)) { 04194 c2 = 0x25; 04195 c1 = 0x74; 04196 } else if (c2 == 0x24 && 0x20 < c1 && c1 < 0x74) { 04197 c2 = 0x25; 04198 } else if (c2 == 0x21 && (c1 == 0x35 || c1 == 0x36)) { 04199 c1 -= 2; 04200 } 04201 } 04202 (*o_hira_conv)(c2,c1); 04203 } 04204 04205 04206 static void 04207 iso2022jp_check_conv(nkf_char c2, nkf_char c1) 04208 { 04209 #define RANGE_NUM_MAX 18 04210 static const nkf_char range[RANGE_NUM_MAX][2] = { 04211 {0x222f, 0x2239,}, 04212 {0x2242, 0x2249,}, 04213 {0x2251, 0x225b,}, 04214 {0x226b, 0x2271,}, 04215 {0x227a, 0x227d,}, 04216 {0x2321, 0x232f,}, 04217 {0x233a, 0x2340,}, 04218 {0x235b, 0x2360,}, 04219 {0x237b, 0x237e,}, 04220 {0x2474, 0x247e,}, 04221 {0x2577, 0x257e,}, 04222 {0x2639, 0x2640,}, 04223 {0x2659, 0x267e,}, 04224 {0x2742, 0x2750,}, 04225 {0x2772, 0x277e,}, 04226 {0x2841, 0x287e,}, 04227 {0x4f54, 0x4f7e,}, 04228 {0x7425, 0x747e}, 04229 }; 04230 nkf_char i; 04231 nkf_char start, end, c; 04232 04233 if(c2 >= 0x00 && c2 <= 0x20 && c1 >= 0x7f && c1 <= 0xff) { 04234 c2 = GETA1; 04235 c1 = GETA2; 04236 } 04237 if((c2 >= 0x29 && c2 <= 0x2f) || (c2 >= 
0x75 && c2 <= 0x7e)) { 04238 c2 = GETA1; 04239 c1 = GETA2; 04240 } 04241 04242 for (i = 0; i < RANGE_NUM_MAX; i++) { 04243 start = range[i][0]; 04244 end = range[i][1]; 04245 c = (c2 << 8) + c1; 04246 if (c >= start && c <= end) { 04247 c2 = GETA1; 04248 c1 = GETA2; 04249 } 04250 } 04251 (*o_iso2022jp_check_conv)(c2,c1); 04252 } 04253 04254 04255 /* This converts =?ISO-2022-JP?B?HOGE HOGE?= */ 04256 04257 static const unsigned char *mime_pattern[] = { 04258 (const unsigned char *)"\075?EUC-JP?B?", 04259 (const unsigned char *)"\075?SHIFT_JIS?B?", 04260 (const unsigned char *)"\075?ISO-8859-1?Q?", 04261 (const unsigned char *)"\075?ISO-8859-1?B?", 04262 (const unsigned char *)"\075?ISO-2022-JP?B?", 04263 (const unsigned char *)"\075?ISO-2022-JP?B?", 04264 (const unsigned char *)"\075?ISO-2022-JP?Q?", 04265 #if defined(UTF8_INPUT_ENABLE) 04266 (const unsigned char *)"\075?UTF-8?B?", 04267 (const unsigned char *)"\075?UTF-8?Q?", 04268 #endif 04269 (const unsigned char *)"\075?US-ASCII?Q?", 04270 NULL 04271 }; 04272 04273 04274 /* $B3:Ev$9$k%3!<%I$NM%@hEY$r>e$2$k$?$a$NL\0u(B */ 04275 nkf_char (*mime_priority_func[])(nkf_char c2, nkf_char c1, nkf_char c0) = { 04276 e_iconv, s_iconv, 0, 0, 0, 0, 0, 04277 #if defined(UTF8_INPUT_ENABLE) 04278 w_iconv, w_iconv, 04279 #endif 04280 0, 04281 }; 04282 04283 static const nkf_char mime_encode[] = { 04284 EUC_JP, SHIFT_JIS, ISO_8859_1, ISO_8859_1, JIS_X_0208, JIS_X_0201_1976_K, JIS_X_0201_1976_K, 04285 #if defined(UTF8_INPUT_ENABLE) 04286 UTF_8, UTF_8, 04287 #endif 04288 ASCII, 04289 0 04290 }; 04291 04292 static const nkf_char mime_encode_method[] = { 04293 'B', 'B','Q', 'B', 'B', 'B', 'Q', 04294 #if defined(UTF8_INPUT_ENABLE) 04295 'B', 'Q', 04296 #endif 04297 'Q', 04298 0 04299 }; 04300 04301 04302 /* MIME preprocessor fifo */ 04303 04304 #define MIME_BUF_SIZE (1024) /* 2^n ring buffer */ 04305 #define MIME_BUF_MASK (MIME_BUF_SIZE-1) 04306 #define mime_input_buf(n) mime_input_state.buf[(n)&MIME_BUF_MASK] 04307 static struct { 
04308 unsigned char buf[MIME_BUF_SIZE]; 04309 unsigned int top; 04310 unsigned int last; /* decoded */ 04311 unsigned int input; /* undecoded */ 04312 } mime_input_state; 04313 static nkf_char (*mime_iconv_back)(nkf_char c2,nkf_char c1,nkf_char c0) = NULL; 04314 04315 #define MAXRECOVER 20 04316 04317 static void 04318 mime_input_buf_unshift(nkf_char c) 04319 { 04320 mime_input_buf(--mime_input_state.top) = (unsigned char)c; 04321 } 04322 04323 static nkf_char 04324 mime_ungetc(nkf_char c, ARG_UNUSED FILE *f) 04325 { 04326 mime_input_buf_unshift(c); 04327 return c; 04328 } 04329 04330 static nkf_char 04331 mime_ungetc_buf(nkf_char c, FILE *f) 04332 { 04333 if (mimebuf_f) 04334 (*i_mungetc_buf)(c,f); 04335 else 04336 mime_input_buf(--mime_input_state.input) = (unsigned char)c; 04337 return c; 04338 } 04339 04340 static nkf_char 04341 mime_getc_buf(FILE *f) 04342 { 04343 /* we don't keep eof of mime_input_buf, becase it contains ?= as 04344 a terminator. It was checked in mime_integrity. */ 04345 return ((mimebuf_f)? 04346 (*i_mgetc_buf)(f):mime_input_buf(mime_input_state.input++)); 04347 } 04348 04349 static void 04350 switch_mime_getc(void) 04351 { 04352 if (i_getc!=mime_getc) { 04353 i_mgetc = i_getc; i_getc = mime_getc; 04354 i_mungetc = i_ungetc; i_ungetc = mime_ungetc; 04355 if(mime_f==STRICT_MIME) { 04356 i_mgetc_buf = i_mgetc; i_mgetc = mime_getc_buf; 04357 i_mungetc_buf = i_mungetc; i_mungetc = mime_ungetc_buf; 04358 } 04359 } 04360 } 04361 04362 static void 04363 unswitch_mime_getc(void) 04364 { 04365 if(mime_f==STRICT_MIME) { 04366 i_mgetc = i_mgetc_buf; 04367 i_mungetc = i_mungetc_buf; 04368 } 04369 i_getc = i_mgetc; 04370 i_ungetc = i_mungetc; 04371 if(mime_iconv_back)set_iconv(FALSE, mime_iconv_back); 04372 mime_iconv_back = NULL; 04373 } 04374 04375 static nkf_char 04376 mime_integrity(FILE *f, const unsigned char *p) 04377 { 04378 nkf_char c,d; 04379 unsigned int q; 04380 /* In buffered mode, read until =? 
or NL or buffer full 04381 */ 04382 mime_input_state.input = mime_input_state.top; 04383 mime_input_state.last = mime_input_state.top; 04384 04385 while(*p) mime_input_buf(mime_input_state.input++) = *p++; 04386 d = 0; 04387 q = mime_input_state.input; 04388 while((c=(*i_getc)(f))!=EOF) { 04389 if (((mime_input_state.input-mime_input_state.top)&MIME_BUF_MASK)==0) { 04390 break; /* buffer full */ 04391 } 04392 if (c=='=' && d=='?') { 04393 /* checked. skip header, start decode */ 04394 mime_input_buf(mime_input_state.input++) = (unsigned char)c; 04395 /* mime_last_input = mime_input_state.input; */ 04396 mime_input_state.input = q; 04397 switch_mime_getc(); 04398 return 1; 04399 } 04400 if (!( (c=='+'||c=='/'|| c=='=' || c=='?' || is_alnum(c)))) 04401 break; 04402 /* Should we check length mod 4? */ 04403 mime_input_buf(mime_input_state.input++) = (unsigned char)c; 04404 d=c; 04405 } 04406 /* In case of Incomplete MIME, no MIME decode */ 04407 mime_input_buf(mime_input_state.input++) = (unsigned char)c; 04408 mime_input_state.last = mime_input_state.input; /* point undecoded buffer */ 04409 mime_decode_mode = 1; /* no decode on mime_input_buf last in mime_getc */ 04410 switch_mime_getc(); /* anyway we need buffered getc */ 04411 return 1; 04412 } 04413 04414 static nkf_char 04415 mime_begin_strict(FILE *f) 04416 { 04417 nkf_char c1 = 0; 04418 int i,j,k; 04419 const unsigned char *p,*q; 04420 nkf_char r[MAXRECOVER]; /* recovery buffer, max mime pattern length */ 04421 04422 mime_decode_mode = FALSE; 04423 /* =? has been checked */ 04424 j = 0; 04425 p = mime_pattern[j]; 04426 r[0]='='; r[1]='?'; 04427 04428 for(i=2;p[i]>SP;i++) { /* start at =? 
*/ 04429 if (((r[i] = c1 = (*i_getc)(f))==EOF) || nkf_toupper(c1) != p[i]) { 04430 /* pattern fails, try next one */ 04431 q = p; 04432 while (mime_pattern[++j]) { 04433 p = mime_pattern[j]; 04434 for(k=2;k<i;k++) /* assume length(p) > i */ 04435 if (p[k]!=q[k]) break; 04436 if (k==i && nkf_toupper(c1)==p[k]) break; 04437 } 04438 p = mime_pattern[j]; 04439 if (p) continue; /* found next one, continue */ 04440 /* all fails, output from recovery buffer */ 04441 (*i_ungetc)(c1,f); 04442 for(j=0;j<i;j++) { 04443 (*oconv)(0,r[j]); 04444 } 04445 return c1; 04446 } 04447 } 04448 mime_decode_mode = p[i-2]; 04449 04450 mime_iconv_back = iconv; 04451 set_iconv(FALSE, mime_priority_func[j]); 04452 clr_code_score(find_inputcode_byfunc(mime_priority_func[j]), SCORE_iMIME); 04453 04454 if (mime_decode_mode=='B') { 04455 mimebuf_f = unbuf_f; 04456 if (!unbuf_f) { 04457 /* do MIME integrity check */ 04458 return mime_integrity(f,mime_pattern[j]); 04459 } 04460 } 04461 switch_mime_getc(); 04462 mimebuf_f = TRUE; 04463 return c1; 04464 } 04465 04466 static nkf_char 04467 mime_begin(FILE *f) 04468 { 04469 nkf_char c1 = 0; 04470 int i,k; 04471 04472 /* In NONSTRICT mode, only =? is checked. In case of failure, we */ 04473 /* re-read and convert again from mime_buffer. */ 04474 04475 /* =? has been checked */ 04476 k = mime_input_state.last; 04477 mime_input_buf(mime_input_state.last++)='='; mime_input_buf(mime_input_state.last++)='?'; 04478 for(i=2;i<MAXRECOVER;i++) { /* start at =? */ 04479 /* We accept any character type even if it is breaked by new lines */ 04480 c1 = (*i_getc)(f); mime_input_buf(mime_input_state.last++) = (unsigned char)c1; 04481 if (c1==LF||c1==SP||c1==CR|| 04482 c1=='-'||c1=='_'||is_alnum(c1)) continue; 04483 if (c1=='=') { 04484 /* Failed. But this could be another MIME preemble */ 04485 (*i_ungetc)(c1,f); 04486 mime_input_state.last--; 04487 break; 04488 } 04489 if (c1!='?') break; 04490 else { 04491 /* c1=='?' 
*/ 04492 c1 = (*i_getc)(f); mime_input_buf(mime_input_state.last++) = (unsigned char)c1; 04493 if (!(++i<MAXRECOVER) || c1==EOF) break; 04494 if (c1=='b'||c1=='B') { 04495 mime_decode_mode = 'B'; 04496 } else if (c1=='q'||c1=='Q') { 04497 mime_decode_mode = 'Q'; 04498 } else { 04499 break; 04500 } 04501 c1 = (*i_getc)(f); mime_input_buf(mime_input_state.last++) = (unsigned char)c1; 04502 if (!(++i<MAXRECOVER) || c1==EOF) break; 04503 if (c1!='?') { 04504 mime_decode_mode = FALSE; 04505 } 04506 break; 04507 } 04508 } 04509 switch_mime_getc(); 04510 if (!mime_decode_mode) { 04511 /* false MIME premble, restart from mime_buffer */ 04512 mime_decode_mode = 1; /* no decode, but read from the mime_buffer */ 04513 /* Since we are in MIME mode until buffer becomes empty, */ 04514 /* we never go into mime_begin again for a while. */ 04515 return c1; 04516 } 04517 /* discard mime preemble, and goto MIME mode */ 04518 mime_input_state.last = k; 04519 /* do no MIME integrity check */ 04520 return c1; /* used only for checking EOF */ 04521 } 04522 04523 #ifdef CHECK_OPTION 04524 static void 04525 no_putc(ARG_UNUSED nkf_char c) 04526 { 04527 ; 04528 } 04529 04530 static void 04531 debug(const char *str) 04532 { 04533 if (debug_f){ 04534 fprintf(stderr, "%s\n", str ? 
str : "NULL"); 04535 } 04536 } 04537 #endif 04538 04539 static void 04540 set_input_codename(const char *codename) 04541 { 04542 if (!input_codename) { 04543 input_codename = codename; 04544 } else if (strcmp(codename, input_codename) != 0) { 04545 input_codename = ""; 04546 } 04547 } 04548 04549 static const char* 04550 get_guessed_code(void) 04551 { 04552 if (input_codename && !*input_codename) { 04553 input_codename = "BINARY"; 04554 } else { 04555 struct input_code *p = find_inputcode_byfunc(iconv); 04556 if (!input_codename) { 04557 input_codename = "ASCII"; 04558 } else if (strcmp(input_codename, "Shift_JIS") == 0) { 04559 if (p->score & (SCORE_DEPEND|SCORE_CP932)) 04560 input_codename = "CP932"; 04561 } else if (strcmp(input_codename, "EUC-JP") == 0) { 04562 if (p->score & SCORE_X0213) 04563 input_codename = "EUC-JIS-2004"; 04564 else if (p->score & (SCORE_X0212)) 04565 input_codename = "EUCJP-MS"; 04566 else if (p->score & (SCORE_DEPEND|SCORE_CP932)) 04567 input_codename = "CP51932"; 04568 } else if (strcmp(input_codename, "ISO-2022-JP") == 0) { 04569 if (p->score & (SCORE_KANA)) 04570 input_codename = "CP50221"; 04571 else if (p->score & (SCORE_DEPEND|SCORE_CP932)) 04572 input_codename = "CP50220"; 04573 } 04574 } 04575 return input_codename; 04576 } 04577 04578 #if !defined(PERL_XS) && !defined(WIN32DLL) 04579 static void 04580 print_guessed_code(char *filename) 04581 { 04582 if (filename != NULL) printf("%s: ", filename); 04583 if (input_codename && !*input_codename) { 04584 printf("BINARY\n"); 04585 } else { 04586 input_codename = get_guessed_code(); 04587 if (guess_f == 1) { 04588 printf("%s\n", input_codename); 04589 } else { 04590 printf("%s%s%s%s\n", 04591 input_codename, 04592 iconv != w_iconv16 && iconv != w_iconv32 ? "" : 04593 input_endian == ENDIAN_LITTLE ? " LE" : 04594 input_endian == ENDIAN_BIG ? " BE" : 04595 "[BUG]", 04596 input_bom_f ? " (BOM)" : "", 04597 input_eol == CR ? " (CR)" : 04598 input_eol == LF ? 
" (LF)" : 04599 input_eol == CRLF ? " (CRLF)" : 04600 input_eol == EOF ? " (MIXED NL)" : 04601 ""); 04602 } 04603 } 04604 } 04605 #endif /*WIN32DLL*/ 04606 04607 #ifdef INPUT_OPTION 04608 04609 static nkf_char 04610 hex_getc(nkf_char ch, FILE *f, nkf_char (*g)(FILE *f), nkf_char (*u)(nkf_char c, FILE *f)) 04611 { 04612 nkf_char c1, c2, c3; 04613 c1 = (*g)(f); 04614 if (c1 != ch){ 04615 return c1; 04616 } 04617 c2 = (*g)(f); 04618 if (!nkf_isxdigit(c2)){ 04619 (*u)(c2, f); 04620 return c1; 04621 } 04622 c3 = (*g)(f); 04623 if (!nkf_isxdigit(c3)){ 04624 (*u)(c2, f); 04625 (*u)(c3, f); 04626 return c1; 04627 } 04628 return (hex2bin(c2) << 4) | hex2bin(c3); 04629 } 04630 04631 static nkf_char 04632 cap_getc(FILE *f) 04633 { 04634 return hex_getc(':', f, i_cgetc, i_cungetc); 04635 } 04636 04637 static nkf_char 04638 cap_ungetc(nkf_char c, FILE *f) 04639 { 04640 return (*i_cungetc)(c, f); 04641 } 04642 04643 static nkf_char 04644 url_getc(FILE *f) 04645 { 04646 return hex_getc('%', f, i_ugetc, i_uungetc); 04647 } 04648 04649 static nkf_char 04650 url_ungetc(nkf_char c, FILE *f) 04651 { 04652 return (*i_uungetc)(c, f); 04653 } 04654 #endif 04655 04656 #ifdef NUMCHAR_OPTION 04657 static nkf_char 04658 numchar_getc(FILE *f) 04659 { 04660 nkf_char (*g)(FILE *) = i_ngetc; 04661 nkf_char (*u)(nkf_char c ,FILE *f) = i_nungetc; 04662 int i = 0, j; 04663 nkf_char buf[12]; 04664 nkf_char c = -1; 04665 04666 buf[i] = (*g)(f); 04667 if (buf[i] == '&'){ 04668 buf[++i] = (*g)(f); 04669 if (buf[i] == '#'){ 04670 c = 0; 04671 buf[++i] = (*g)(f); 04672 if (buf[i] == 'x' || buf[i] == 'X'){ 04673 for (j = 0; j < 7; j++){ 04674 buf[++i] = (*g)(f); 04675 if (!nkf_isxdigit(buf[i])){ 04676 if (buf[i] != ';'){ 04677 c = -1; 04678 } 04679 break; 04680 } 04681 c <<= 4; 04682 c |= hex2bin(buf[i]); 04683 } 04684 }else{ 04685 for (j = 0; j < 8; j++){ 04686 if (j){ 04687 buf[++i] = (*g)(f); 04688 } 04689 if (!nkf_isdigit(buf[i])){ 04690 if (buf[i] != ';'){ 04691 c = -1; 04692 } 04693 break; 04694 } 
/*
 * Input filter that replaces byte sequences found in normalization_table.
 *
 * Reads through i_nfc_getc.  When the upcoming bytes equal the `nfd` bytes
 * of a table entry, they are replaced by that entry's `nfc` bytes
 * (presumably UTF-8 NFD -> NFC composition, going by the field names).
 * Returns the first byte of the resulting sequence; all following bytes are
 * pushed back through i_nfc_ungetc so later calls deliver them.
 */
static nkf_char
nfc_getc(FILE *f)
{
    nkf_char (*g)(FILE *f) = i_nfc_getc;
    nkf_char (*u)(nkf_char c ,FILE *f) = i_nfc_ungetc;
    nkf_buf_t *buf = nkf_state->nfc_buf;
    const unsigned char *array;
    /* binary-search bounds over normalization_table */
    int lower=0, upper=NORMALIZATION_TABLE_LENGTH-1;
    nkf_char c = (*g)(f);

    /*
     * Pass through EOF, values above one byte, and bytes of the form
     * 10xxxxxx (which cannot start a sequence -- presumably UTF-8
     * continuation bytes).
     */
    if (c == EOF || c > 0xFF || (c & 0xc0) == 0x80) return c;

    nkf_buf_push(buf, c);
    do {
        while (lower <= upper) {
            int mid = (lower+upper) / 2;
            int len;
            array = normalization_table[mid].nfd;
            /* compare the candidate's nfd bytes with the buffered input */
            for (len=0; len < NORMALIZATION_TABLE_NFD_LENGTH && array[len]; len++) {
                if (len >= nkf_buf_length(buf)) {
                    /* need one more input byte for the comparison */
                    c = (*g)(f);
                    if (c == EOF) {
                        /* empty the search range so both loops stop */
                        len = 0;
                        lower = 1, upper = 0;
                        break;
                    }
                    nkf_buf_push(buf, c);
                }
                if (array[len] != nkf_buf_at(buf, len)) {
                    /* mismatch: narrow the range; len = 0 marks "no match" */
                    if (array[len] < nkf_buf_at(buf, len)) lower = mid + 1;
                    else upper = mid - 1;
                    len = 0;
                    break;
                }
            }
            if (len > 0) {
                /* full match: swap the buffered bytes for the nfc bytes */
                int i;
                array = normalization_table[mid].nfc;
                nkf_buf_clear(buf);
                for (i=0; i < NORMALIZATION_TABLE_NFC_LENGTH && array[i]; i++)
                    nkf_buf_push(buf, array[i]);
                break;
            }
        }
        /*
         * After a match the bounds are left as they were, so the search
         * runs again on the replaced bytes within the current range.
         */
    } while (lower <= upper);

    /* push back everything but the first byte, last byte first */
    while (nkf_buf_length(buf) > 1) (*u)(nkf_buf_pop(buf), f);
    c = nkf_buf_pop(buf);

    return c;
}
/*
 * getc replacement used while a MIME encoded section is being decoded.
 *
 * Returns the next decoded character.  Depending on mime_decode_mode it
 *   - hands out bytes already waiting in the mime_input_state FIFO,
 *   - decodes 'Q' input ("=XX" hex escapes, '_' as space, soft line breaks),
 *   - decodes 'B' input (Base64, four characters into up to three bytes), or
 *   - leaves MIME decoding and reads from the plain input layer.
 * Returns EOF when the underlying input ends.
 */
static nkf_char
mime_getc(FILE *f)
{
    nkf_char c1, c2, c3, c4, cc;
    nkf_char t1, t2, t3, t4, mode, exit_mode;
    nkf_char lwsp_count;
    char *lwsp_buf;
    char *lwsp_buf_new;
    nkf_char lwsp_size = 128;

    if (mime_input_state.top != mime_input_state.last) { /* Something is in FIFO */
        return mime_input_buf(mime_input_state.top++);
    }
    if (mime_decode_mode==1 ||mime_decode_mode==FALSE) {
        /* decoding is over: unhook this function and read plain input */
        mime_decode_mode=FALSE;
        unswitch_mime_getc();
        return (*i_getc)(f);
    }

    /* mode to fall back to once the encoded text ends */
    if (mimebuf_f == FIXED_MIME)
        exit_mode = mime_decode_mode;
    else
        exit_mode = FALSE;
    if (mime_decode_mode == 'Q') {
        if ((c1 = (*i_mgetc)(f)) == EOF) return (EOF);
      restart_mime_q:
        if (c1=='_' && mimebuf_f != FIXED_MIME) return SP;
        if (c1<=SP || DEL<=c1) {
            mime_decode_mode = exit_mode; /* prepare for quit */
            return c1;
        }
        /* anything but '=' (and '?' outside FIXED_MIME) is literal */
        if (c1!='=' && (c1!='?' || mimebuf_f == FIXED_MIME)) {
            return c1;
        }

        mime_decode_mode = exit_mode; /* prepare for quit */
        if ((c2 = (*i_mgetc)(f)) == EOF) return (EOF);
        if (c1=='?'&&c2=='=' && mimebuf_f != FIXED_MIME) {
            /* end Q encoding */
            input_mode = exit_mode;
            /*
             * Collect the SP/TAB run that follows the "?=" terminator so it
             * can be either replayed or dropped (see below).
             */
            lwsp_count = 0;
            lwsp_buf = nkf_xmalloc((lwsp_size+5)*sizeof(char));
            while ((c1=(*i_getc)(f))!=EOF) {
                switch (c1) {
                case LF:
                case CR:
                    if (c1==LF) {
                        /* LF followed by a blank: treat as a single SP */
                        if ((c1=(*i_getc)(f))!=EOF && nkf_isblank(c1)) {
                            i_ungetc(SP,f);
                            continue;
                        } else {
                            i_ungetc(c1,f);
                        }
                        c1 = LF;
                    } else {
                        /* CR LF followed by a blank: treat as a single SP */
                        if ((c1=(*i_getc)(f))!=EOF && c1 == LF) {
                            if ((c1=(*i_getc)(f))!=EOF && nkf_isblank(c1)) {
                                i_ungetc(SP,f);
                                continue;
                            } else {
                                i_ungetc(c1,f);
                            }
                            i_ungetc(LF,f);
                        } else {
                            i_ungetc(c1,f);
                        }
                        c1 = CR;
                    }
                    break;
                case SP:
                case TAB:
                    /*
                     * The byte is stored before the capacity test; the 5
                     * spare bytes in each allocation keep that in bounds.
                     */
                    lwsp_buf[lwsp_count] = (unsigned char)c1;
                    if (lwsp_count++>lwsp_size){
                        lwsp_size <<= 1;
                        lwsp_buf_new = nkf_xrealloc(lwsp_buf, (lwsp_size+5)*sizeof(char));
                        lwsp_buf = lwsp_buf_new;
                    }
                    continue;
                }
                /* reached from the switch for anything but SP/TAB: stop */
                break;
            }
            /*
             * Whitespace directly followed by '=' is dropped and '=' is
             * returned (presumably the start of another encoded word).
             * Otherwise the stopping character and the whitespace are pushed
             * back in reverse order and the first whitespace is returned.
             */
            if (lwsp_count > 0 && (c1 != '=' || (lwsp_buf[lwsp_count-1] != SP && lwsp_buf[lwsp_count-1] != TAB))) {
                i_ungetc(c1,f);
                for(lwsp_count--;lwsp_count>0;lwsp_count--)
                    i_ungetc(lwsp_buf[lwsp_count],f);
                c1 = lwsp_buf[0];
            }
            nkf_xfree(lwsp_buf);
            return c1;
        }
        if (c1=='='&&c2<SP) { /* this is soft wrap */
            /* skip the line break and any further control/space characters */
            while((c1 = (*i_mgetc)(f)) <=SP) {
                if (c1 == EOF) return (EOF);
            }
            mime_decode_mode = 'Q'; /* still in MIME */
            goto restart_mime_q;
        }
        if (c1=='?') {
            /* a '?' that does not start "?=" is literal */
            mime_decode_mode = 'Q'; /* still in MIME */
            (*i_mungetc)(c2,f);
            return c1;
        }
        if ((c3 = (*i_mgetc)(f)) == EOF) return (EOF);
        if (c2<=SP) return c2;
        mime_decode_mode = 'Q'; /* still in MIME */
        /* "=XX": combine the two hex digits */
        return ((hex2bin(c2)<<4) + hex2bin(c3));
    }

    if (mime_decode_mode != 'B') {
        mime_decode_mode = FALSE;
        return (*i_mgetc)(f);
    }


    /* Base64 encoding */
    /*
       MIME allows a line break in the middle of Base64, but we are very
       pessimistic when decoding in unbuffered mode, because MIME encoded
       text may be broken by control sequences from less or an editor
       (such as ESC-[-K).  Incomplete MIME is ignored.
     */
    mode = mime_decode_mode;
    mime_decode_mode = exit_mode; /* prepare for quit */

    /* first character of the quartet: always skip control/space characters */
    while ((c1 = (*i_mgetc)(f))<=SP) {
        if (c1==EOF)
            return (EOF);
    }
  mime_c2_retry:
    /* characters 2..4: skipped only when not in STRICT_MIME mode */
    if ((c2 = (*i_mgetc)(f))<=SP) {
        if (c2==EOF)
            return (EOF);
        if (mime_f != STRICT_MIME) goto mime_c2_retry;
        if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
        return c2;
    }
    if ((c1 == '?') && (c2 == '=')) {
        /* "?=" terminator: same trailing-whitespace handling as in Q mode */
        input_mode = ASCII;
        lwsp_count = 0;
        lwsp_buf = nkf_xmalloc((lwsp_size+5)*sizeof(char));
        while ((c1=(*i_getc)(f))!=EOF) {
            switch (c1) {
            case LF:
            case CR:
                if (c1==LF) {
                    if ((c1=(*i_getc)(f))!=EOF && nkf_isblank(c1)) {
                        i_ungetc(SP,f);
                        continue;
                    } else {
                        i_ungetc(c1,f);
                    }
                    c1 = LF;
                } else {
                    /*
                     * NOTE(review): unlike the Q-mode path, the character
                     * after CR is not compared with LF here; it is tested
                     * for SP and otherwise one more character is read.
                     * Confirm whether this difference is intended.
                     */
                    if ((c1=(*i_getc)(f))!=EOF) {
                        if (c1==SP) {
                            i_ungetc(SP,f);
                            continue;
                        } else if ((c1=(*i_getc)(f))!=EOF && nkf_isblank(c1)) {
                            i_ungetc(SP,f);
                            continue;
                        } else {
                            i_ungetc(c1,f);
                        }
                        i_ungetc(LF,f);
                    } else {
                        i_ungetc(c1,f);
                    }
                    c1 = CR;
                }
                break;
            case SP:
            case TAB:
                /* stored before the capacity test; see the 5 spare bytes */
                lwsp_buf[lwsp_count] = (unsigned char)c1;
                if (lwsp_count++>lwsp_size){
                    lwsp_size <<= 1;
                    lwsp_buf_new = nkf_xrealloc(lwsp_buf, (lwsp_size+5)*sizeof(char));
                    lwsp_buf = lwsp_buf_new;
                }
                continue;
            }
            break;
        }
        if (lwsp_count > 0 && (c1 != '=' || (lwsp_buf[lwsp_count-1] != SP && lwsp_buf[lwsp_count-1] != TAB))) {
            i_ungetc(c1,f);
            for(lwsp_count--;lwsp_count>0;lwsp_count--)
                i_ungetc(lwsp_buf[lwsp_count],f);
            c1 = lwsp_buf[0];
        }
        nkf_xfree(lwsp_buf);
        return c1;
    }
  mime_c3_retry:
    if ((c3 = (*i_mgetc)(f))<=SP) {
        if (c3==EOF)
            return (EOF);
        if (mime_f != STRICT_MIME) goto mime_c3_retry;
        if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
        return c3;
    }
  mime_c4_retry:
    if ((c4 = (*i_mgetc)(f))<=SP) {
        if (c4==EOF)
            return (EOF);
        if (mime_f != STRICT_MIME) goto mime_c4_retry;
        if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
        return c4;
    }

    mime_decode_mode = mode; /* still in MIME sigh... */

    /* BASE 64 decoding */

    /* four 6-bit values -> up to three bytes, queued in the FIFO */
    t1 = 0x3f & base64decode(c1);
    t2 = 0x3f & base64decode(c2);
    t3 = 0x3f & base64decode(c3);
    t4 = 0x3f & base64decode(c4);
    cc = ((t1 << 2) & 0x0fc) | ((t2 >> 4) & 0x03);
    if (c2 != '=') {
        mime_input_buf(mime_input_state.last++) = (unsigned char)cc;
        cc = ((t2 << 4) & 0x0f0) | ((t3 >> 2) & 0x0f);
        if (c3 != '=') {
            mime_input_buf(mime_input_state.last++) = (unsigned char)cc;
            cc = ((t3 << 6) & 0x0c0) | (t4 & 0x3f);
            if (c4 != '=')
                mime_input_buf(mime_input_state.last++) = (unsigned char)cc;
        }
    } else {
        /* '=' in second position: nothing decodable, hand back c1 as is */
        return c1;
    }
    return mime_input_buf(mime_input_state.top++);
}
for(i=0;mime_pattern[i];i++) { 05054 if (mode == mime_encode[i]) { 05055 p = mime_pattern[i]; 05056 break; 05057 } 05058 } 05059 mimeout_mode = mime_encode_method[i]; 05060 i = 0; 05061 if (base64_count>45) { 05062 if (mimeout_state.count>0 && nkf_isblank(mimeout_state.buf[i])){ 05063 (*o_mputc)(mimeout_state.buf[i]); 05064 i++; 05065 } 05066 put_newline(o_mputc); 05067 (*o_mputc)(SP); 05068 base64_count = 1; 05069 if (mimeout_state.count>0 && nkf_isspace(mimeout_state.buf[i])) { 05070 i++; 05071 } 05072 } 05073 for (;i<mimeout_state.count;i++) { 05074 if (nkf_isspace(mimeout_state.buf[i])) { 05075 (*o_mputc)(mimeout_state.buf[i]); 05076 base64_count ++; 05077 } else { 05078 break; 05079 } 05080 } 05081 while(*p) { 05082 (*o_mputc)(*p++); 05083 base64_count ++; 05084 } 05085 j = mimeout_state.count; 05086 mimeout_state.count = 0; 05087 for (;i<j;i++) { 05088 mime_putc(mimeout_state.buf[i]); 05089 } 05090 } 05091 05092 static void 05093 mime_prechar(nkf_char c2, nkf_char c1) 05094 { 05095 if (mimeout_mode > 0){ 05096 if (c2 == EOF){ 05097 if (base64_count + mimeout_state.count/3*4> 73){ 05098 (*o_base64conv)(EOF,0); 05099 oconv_newline(o_base64conv); 05100 (*o_base64conv)(0,SP); 05101 base64_count = 1; 05102 } 05103 } else { 05104 if ((c2 != 0 || c1 > DEL) && base64_count + mimeout_state.count/3*4> 66) { 05105 (*o_base64conv)(EOF,0); 05106 oconv_newline(o_base64conv); 05107 (*o_base64conv)(0,SP); 05108 base64_count = 1; 05109 mimeout_mode = -1; 05110 } 05111 } 05112 } else if (c2) { 05113 if (c2 != EOF && base64_count + mimeout_state.count/3*4> 60) { 05114 mimeout_mode = (output_mode==ASCII ||output_mode == ISO_8859_1) ? 
'Q' : 'B'; 05115 open_mime(output_mode); 05116 (*o_base64conv)(EOF,0); 05117 oconv_newline(o_base64conv); 05118 (*o_base64conv)(0,SP); 05119 base64_count = 1; 05120 mimeout_mode = -1; 05121 } 05122 } 05123 } 05124 05125 static void 05126 close_mime(void) 05127 { 05128 (*o_mputc)('?'); 05129 (*o_mputc)('='); 05130 base64_count += 2; 05131 mimeout_mode = 0; 05132 } 05133 05134 static void 05135 eof_mime(void) 05136 { 05137 switch(mimeout_mode) { 05138 case 'Q': 05139 case 'B': 05140 break; 05141 case 2: 05142 (*o_mputc)(basis_64[((nkf_state->mimeout_state & 0x3)<< 4)]); 05143 (*o_mputc)('='); 05144 (*o_mputc)('='); 05145 base64_count += 3; 05146 break; 05147 case 1: 05148 (*o_mputc)(basis_64[((nkf_state->mimeout_state & 0xF) << 2)]); 05149 (*o_mputc)('='); 05150 base64_count += 2; 05151 break; 05152 } 05153 if (mimeout_mode > 0) { 05154 if (mimeout_f!=FIXED_MIME) { 05155 close_mime(); 05156 } else if (mimeout_mode != 'Q') 05157 mimeout_mode = 'B'; 05158 } 05159 } 05160 05161 static void 05162 mimeout_addchar(nkf_char c) 05163 { 05164 switch(mimeout_mode) { 05165 case 'Q': 05166 if (c==CR||c==LF) { 05167 (*o_mputc)(c); 05168 base64_count = 0; 05169 } else if(!nkf_isalnum(c)) { 05170 (*o_mputc)('='); 05171 (*o_mputc)(bin2hex(((c>>4)&0xf))); 05172 (*o_mputc)(bin2hex((c&0xf))); 05173 base64_count += 3; 05174 } else { 05175 (*o_mputc)(c); 05176 base64_count++; 05177 } 05178 break; 05179 case 'B': 05180 nkf_state->mimeout_state=c; 05181 (*o_mputc)(basis_64[c>>2]); 05182 mimeout_mode=2; 05183 base64_count ++; 05184 break; 05185 case 2: 05186 (*o_mputc)(basis_64[((nkf_state->mimeout_state & 0x3)<< 4) | ((c & 0xF0) >> 4)]); 05187 nkf_state->mimeout_state=c; 05188 mimeout_mode=1; 05189 base64_count ++; 05190 break; 05191 case 1: 05192 (*o_mputc)(basis_64[((nkf_state->mimeout_state & 0xF) << 2) | ((c & 0xC0) >>6)]); 05193 (*o_mputc)(basis_64[c & 0x3F]); 05194 mimeout_mode='B'; 05195 base64_count += 2; 05196 break; 05197 default: 05198 (*o_mputc)(c); 05199 base64_count++; 05200 
break; 05201 } 05202 } 05203 05204 static void 05205 mime_putc(nkf_char c) 05206 { 05207 int i, j; 05208 nkf_char lastchar; 05209 05210 if (mimeout_f == FIXED_MIME){ 05211 if (mimeout_mode == 'Q'){ 05212 if (base64_count > 71){ 05213 if (c!=CR && c!=LF) { 05214 (*o_mputc)('='); 05215 put_newline(o_mputc); 05216 } 05217 base64_count = 0; 05218 } 05219 }else{ 05220 if (base64_count > 71){ 05221 eof_mime(); 05222 put_newline(o_mputc); 05223 base64_count = 0; 05224 } 05225 if (c == EOF) { /* c==EOF */ 05226 eof_mime(); 05227 } 05228 } 05229 if (c != EOF) { /* c==EOF */ 05230 mimeout_addchar(c); 05231 } 05232 return; 05233 } 05234 05235 /* mimeout_f != FIXED_MIME */ 05236 05237 if (c == EOF) { /* c==EOF */ 05238 if (mimeout_mode == -1 && mimeout_state.count > 1) open_mime(output_mode); 05239 j = mimeout_state.count; 05240 mimeout_state.count = 0; 05241 i = 0; 05242 if (mimeout_mode > 0) { 05243 if (!nkf_isblank(mimeout_state.buf[j-1])) { 05244 for (;i<j;i++) { 05245 if (nkf_isspace(mimeout_state.buf[i]) && base64_count < 71){ 05246 break; 05247 } 05248 mimeout_addchar(mimeout_state.buf[i]); 05249 } 05250 eof_mime(); 05251 for (;i<j;i++) { 05252 mimeout_addchar(mimeout_state.buf[i]); 05253 } 05254 } else { 05255 for (;i<j;i++) { 05256 mimeout_addchar(mimeout_state.buf[i]); 05257 } 05258 eof_mime(); 05259 } 05260 } else { 05261 for (;i<j;i++) { 05262 mimeout_addchar(mimeout_state.buf[i]); 05263 } 05264 } 05265 return; 05266 } 05267 05268 if (mimeout_state.count > 0){ 05269 lastchar = mimeout_state.buf[mimeout_state.count - 1]; 05270 }else{ 05271 lastchar = -1; 05272 } 05273 05274 if (mimeout_mode=='Q') { 05275 if (c <= DEL && (output_mode==ASCII ||output_mode == ISO_8859_1)) { 05276 if (c == CR || c == LF) { 05277 close_mime(); 05278 (*o_mputc)(c); 05279 base64_count = 0; 05280 return; 05281 } else if (c <= SP) { 05282 close_mime(); 05283 if (base64_count > 70) { 05284 put_newline(o_mputc); 05285 base64_count = 0; 05286 } 05287 if (!nkf_isblank(c)) { 05288 (*o_mputc)(SP); 
05289 base64_count++; 05290 } 05291 } else { 05292 if (base64_count > 70) { 05293 close_mime(); 05294 put_newline(o_mputc); 05295 (*o_mputc)(SP); 05296 base64_count = 1; 05297 open_mime(output_mode); 05298 } 05299 if (!nkf_noescape_mime(c)) { 05300 mimeout_addchar(c); 05301 return; 05302 } 05303 } 05304 if (c != 0x1B) { 05305 (*o_mputc)(c); 05306 base64_count++; 05307 return; 05308 } 05309 } 05310 } 05311 05312 if (mimeout_mode <= 0) { 05313 if (c <= DEL && (output_mode==ASCII || output_mode == ISO_8859_1 || 05314 output_mode == UTF_8)) { 05315 if (nkf_isspace(c)) { 05316 int flag = 0; 05317 if (mimeout_mode == -1) { 05318 flag = 1; 05319 } 05320 if (c==CR || c==LF) { 05321 if (flag) { 05322 open_mime(output_mode); 05323 output_mode = 0; 05324 } else { 05325 base64_count = 0; 05326 } 05327 } 05328 for (i=0;i<mimeout_state.count;i++) { 05329 (*o_mputc)(mimeout_state.buf[i]); 05330 if (mimeout_state.buf[i] == CR || mimeout_state.buf[i] == LF){ 05331 base64_count = 0; 05332 }else{ 05333 base64_count++; 05334 } 05335 } 05336 if (flag) { 05337 eof_mime(); 05338 base64_count = 0; 05339 mimeout_mode = 0; 05340 } 05341 mimeout_state.buf[0] = (char)c; 05342 mimeout_state.count = 1; 05343 }else{ 05344 if (base64_count > 1 05345 && base64_count + mimeout_state.count > 76 05346 && mimeout_state.buf[0] != CR && mimeout_state.buf[0] != LF){ 05347 static const char *str = "boundary=\""; 05348 static int len = 10; 05349 i = 0; 05350 05351 for (; i < mimeout_state.count - len; ++i) { 05352 if (!strncmp((char *)(mimeout_state.buf+i), str, len)) { 05353 i += len - 2; 05354 break; 05355 } 05356 } 05357 05358 if (i == 0 || i == mimeout_state.count - len) { 05359 put_newline(o_mputc); 05360 base64_count = 0; 05361 if (!nkf_isspace(mimeout_state.buf[0])){ 05362 (*o_mputc)(SP); 05363 base64_count++; 05364 } 05365 } 05366 else { 05367 int j; 05368 for (j = 0; j <= i; ++j) { 05369 (*o_mputc)(mimeout_state.buf[j]); 05370 } 05371 put_newline(o_mputc); 05372 base64_count = 1; 05373 for (; j <= 
mimeout_state.count; ++j) { 05374 mimeout_state.buf[j - i] = mimeout_state.buf[j]; 05375 } 05376 mimeout_state.count -= i; 05377 } 05378 } 05379 mimeout_state.buf[mimeout_state.count++] = (char)c; 05380 if (mimeout_state.count>MIMEOUT_BUF_LENGTH) { 05381 open_mime(output_mode); 05382 } 05383 } 05384 return; 05385 }else{ 05386 if (lastchar==CR || lastchar == LF){ 05387 for (i=0;i<mimeout_state.count;i++) { 05388 (*o_mputc)(mimeout_state.buf[i]); 05389 } 05390 base64_count = 0; 05391 mimeout_state.count = 0; 05392 } 05393 if (lastchar==SP) { 05394 for (i=0;i<mimeout_state.count-1;i++) { 05395 (*o_mputc)(mimeout_state.buf[i]); 05396 base64_count++; 05397 } 05398 mimeout_state.buf[0] = SP; 05399 mimeout_state.count = 1; 05400 } 05401 open_mime(output_mode); 05402 } 05403 }else{ 05404 /* mimeout_mode == 'B', 1, 2 */ 05405 if (c <= DEL && (output_mode==ASCII || output_mode == ISO_8859_1 || 05406 output_mode == UTF_8)) { 05407 if (lastchar == CR || lastchar == LF){ 05408 if (nkf_isblank(c)) { 05409 for (i=0;i<mimeout_state.count;i++) { 05410 mimeout_addchar(mimeout_state.buf[i]); 05411 } 05412 mimeout_state.count = 0; 05413 } else { 05414 eof_mime(); 05415 for (i=0;i<mimeout_state.count;i++) { 05416 (*o_mputc)(mimeout_state.buf[i]); 05417 } 05418 base64_count = 0; 05419 mimeout_state.count = 0; 05420 } 05421 mimeout_state.buf[mimeout_state.count++] = (char)c; 05422 return; 05423 } 05424 if (nkf_isspace(c)) { 05425 for (i=0;i<mimeout_state.count;i++) { 05426 if (SP<mimeout_state.buf[i] && mimeout_state.buf[i]<DEL) { 05427 eof_mime(); 05428 for (i=0;i<mimeout_state.count;i++) { 05429 (*o_mputc)(mimeout_state.buf[i]); 05430 base64_count++; 05431 } 05432 mimeout_state.count = 0; 05433 } 05434 } 05435 mimeout_state.buf[mimeout_state.count++] = (char)c; 05436 if (mimeout_state.count>MIMEOUT_BUF_LENGTH) { 05437 eof_mime(); 05438 for (i=0;i<mimeout_state.count;i++) { 05439 (*o_mputc)(mimeout_state.buf[i]); 05440 base64_count++; 05441 } 05442 mimeout_state.count = 0; 05443 } 05444 
return; 05445 } 05446 if (mimeout_state.count>0 && SP<c && c!='=') { 05447 mimeout_state.buf[mimeout_state.count++] = (char)c; 05448 if (mimeout_state.count>MIMEOUT_BUF_LENGTH) { 05449 j = mimeout_state.count; 05450 mimeout_state.count = 0; 05451 for (i=0;i<j;i++) { 05452 mimeout_addchar(mimeout_state.buf[i]); 05453 } 05454 } 05455 return; 05456 } 05457 } 05458 } 05459 if (mimeout_state.count>0) { 05460 j = mimeout_state.count; 05461 mimeout_state.count = 0; 05462 for (i=0;i<j;i++) { 05463 if (mimeout_state.buf[i]==CR || mimeout_state.buf[i]==LF) 05464 break; 05465 mimeout_addchar(mimeout_state.buf[i]); 05466 } 05467 if (i<j) { 05468 eof_mime(); 05469 base64_count=0; 05470 for (;i<j;i++) { 05471 (*o_mputc)(mimeout_state.buf[i]); 05472 } 05473 open_mime(output_mode); 05474 } 05475 } 05476 mimeout_addchar(c); 05477 } 05478 05479 static void 05480 base64_conv(nkf_char c2, nkf_char c1) 05481 { 05482 mime_prechar(c2, c1); 05483 (*o_base64conv)(c2,c1); 05484 } 05485 05486 #ifdef HAVE_ICONV_H 05487 typedef struct nkf_iconv_t { 05488 iconv_t cd; 05489 char *input_buffer; 05490 size_t input_buffer_size; 05491 char *output_buffer; 05492 size_t output_buffer_size; 05493 } 05494 05495 static nkf_iconv_t 05496 nkf_iconv_new(char *tocode, char *fromcode) 05497 { 05498 nkf_iconv_t converter; 05499 05500 converter->input_buffer_size = IOBUF_SIZE; 05501 converter->input_buffer = nkf_xmalloc(converter->input_buffer_size); 05502 converter->output_buffer_size = IOBUF_SIZE * 2; 05503 converter->output_buffer = nkf_xmalloc(converter->output_buffer_size); 05504 converter->cd = iconv_open(tocode, fromcode); 05505 if (converter->cd == (iconv_t)-1) 05506 { 05507 switch (errno) { 05508 case EINVAL: 05509 perror(fprintf("iconv doesn't support %s to %s conversion.", fromcode, tocode)); 05510 return -1; 05511 default: 05512 perror("can't iconv_open"); 05513 } 05514 } 05515 } 05516 05517 static size_t 05518 nkf_iconv_convert(nkf_iconv_t *converter, FILE *input) 05519 { 05520 size_t invalid = 
(size_t)0; 05521 char *input_buffer = converter->input_buffer; 05522 size_t input_length = (size_t)0; 05523 char *output_buffer = converter->output_buffer; 05524 size_t output_length = converter->output_buffer_size; 05525 int c; 05526 05527 do { 05528 if (c != EOF) { 05529 while ((c = (*i_getc)(f)) != EOF) { 05530 input_buffer[input_length++] = c; 05531 if (input_length < converter->input_buffer_size) break; 05532 } 05533 } 05534 05535 size_t ret = iconv(converter->cd, &input_buffer, &input_length, &output_buffer, &output_length); 05536 while (output_length-- > 0) { 05537 (*o_putc)(output_buffer[converter->output_buffer_size-output_length]); 05538 } 05539 if (ret == (size_t) - 1) { 05540 switch (errno) { 05541 case EINVAL: 05542 if (input_buffer != converter->input_buffer) 05543 memmove(converter->input_buffer, input_buffer, input_length); 05544 break; 05545 case E2BIG: 05546 converter->output_buffer_size *= 2; 05547 output_buffer = realloc(converter->outbuf, converter->output_buffer_size); 05548 if (output_buffer == NULL) { 05549 perror("can't realloc"); 05550 return -1; 05551 } 05552 converter->output_buffer = output_buffer; 05553 break; 05554 default: 05555 perror("can't iconv"); 05556 return -1; 05557 } 05558 } else { 05559 invalid += ret; 05560 } 05561 } while (1); 05562 05563 return invalid; 05564 } 05565 05566 05567 static void 05568 nkf_iconv_close(nkf_iconv_t *convert) 05569 { 05570 nkf_xfree(converter->inbuf); 05571 nkf_xfree(converter->outbuf); 05572 iconv_close(converter->cd); 05573 } 05574 #endif 05575 05576 05577 static void 05578 reinit(void) 05579 { 05580 { 05581 struct input_code *p = input_code_list; 05582 while (p->name){ 05583 status_reinit(p++); 05584 } 05585 } 05586 unbuf_f = FALSE; 05587 estab_f = FALSE; 05588 nop_f = FALSE; 05589 binmode_f = TRUE; 05590 rot_f = FALSE; 05591 hira_f = FALSE; 05592 alpha_f = FALSE; 05593 mime_f = MIME_DECODE_DEFAULT; 05594 mime_decode_f = FALSE; 05595 mimebuf_f = FALSE; 05596 broken_f = FALSE; 05597 iso8859_f = 
FALSE; 05598 mimeout_f = FALSE; 05599 x0201_f = NKF_UNSPECIFIED; 05600 iso2022jp_f = FALSE; 05601 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE) 05602 ms_ucs_map_f = UCS_MAP_ASCII; 05603 #endif 05604 #ifdef UTF8_INPUT_ENABLE 05605 no_cp932ext_f = FALSE; 05606 no_best_fit_chars_f = FALSE; 05607 encode_fallback = NULL; 05608 unicode_subchar = '?'; 05609 input_endian = ENDIAN_BIG; 05610 #endif 05611 #ifdef UTF8_OUTPUT_ENABLE 05612 output_bom_f = FALSE; 05613 output_endian = ENDIAN_BIG; 05614 #endif 05615 #ifdef UNICODE_NORMALIZATION 05616 nfc_f = FALSE; 05617 #endif 05618 #ifdef INPUT_OPTION 05619 cap_f = FALSE; 05620 url_f = FALSE; 05621 numchar_f = FALSE; 05622 #endif 05623 #ifdef CHECK_OPTION 05624 noout_f = FALSE; 05625 debug_f = FALSE; 05626 #endif 05627 guess_f = 0; 05628 #ifdef EXEC_IO 05629 exec_f = 0; 05630 #endif 05631 #ifdef SHIFTJIS_CP932 05632 cp51932_f = TRUE; 05633 cp932inv_f = TRUE; 05634 #endif 05635 #ifdef X0212_ENABLE 05636 x0212_f = FALSE; 05637 x0213_f = FALSE; 05638 #endif 05639 { 05640 int i; 05641 for (i = 0; i < 256; i++){ 05642 prefix_table[i] = 0; 05643 } 05644 } 05645 hold_count = 0; 05646 mimeout_state.count = 0; 05647 mimeout_mode = 0; 05648 base64_count = 0; 05649 f_line = 0; 05650 f_prev = 0; 05651 fold_preserve_f = FALSE; 05652 fold_f = FALSE; 05653 fold_len = 0; 05654 kanji_intro = DEFAULT_J; 05655 ascii_intro = DEFAULT_R; 05656 fold_margin = FOLD_MARGIN; 05657 o_zconv = no_connection; 05658 o_fconv = no_connection; 05659 o_eol_conv = no_connection; 05660 o_rot_conv = no_connection; 05661 o_hira_conv = no_connection; 05662 o_base64conv = no_connection; 05663 o_iso2022jp_check_conv = no_connection; 05664 o_putc = std_putc; 05665 i_getc = std_getc; 05666 i_ungetc = std_ungetc; 05667 i_bgetc = std_getc; 05668 i_bungetc = std_ungetc; 05669 o_mputc = std_putc; 05670 i_mgetc = std_getc; 05671 i_mungetc = std_ungetc; 05672 i_mgetc_buf = std_getc; 05673 i_mungetc_buf = std_ungetc; 05674 output_mode = ASCII; 05675 input_mode = 
/*
 * Wire up the conversion pipeline according to the current option flags.
 *
 * Selects the output encoding (falling back to the default encoding, or to
 * ISO-2022-JP when only guessing / producing no output), then chains the
 * optional output converters in front of `oconv` and the optional input
 * filters in front of `i_getc` / `i_ungetc`.  Each stage saves the previous
 * function pointer in its own o_xxx / i_xxx variable before replacing it, so
 * a stage installed later runs before the ones installed earlier.
 *
 * Returns 0 on success, -1 when no output encoding could be determined.
 */
static int
module_connection(void)
{
    if (input_encoding) set_input_encoding(input_encoding);
    if (!output_encoding) {
        output_encoding = nkf_default_encoding();
    }
    if (!output_encoding) {
        if (noout_f || guess_f) output_encoding = nkf_enc_from_index(ISO_2022_JP);
        else return -1;
    }
    set_output_encoding(output_encoding);
    oconv = nkf_enc_to_oconv(output_encoding);
    o_putc = std_putc;
    if (nkf_enc_unicode_p(output_encoding))
        output_mode = UTF_8;

    if (x0201_f == NKF_UNSPECIFIED) {
        x0201_f = X0201_DEFAULT;
    }

    /* replace continuation module, from output side */

    /* output redirection */
#ifdef CHECK_OPTION
    if (noout_f || guess_f){
        /* discard all output */
        o_putc = no_putc;
    }
#endif
    if (mimeout_f) {
        /* mime_putc writes through o_mputc */
        o_mputc = o_putc;
        o_putc = mime_putc;
        if (mimeout_f == TRUE) {
            o_base64conv = oconv; oconv = base64_conv;
        }
        /* base64_count = 0; */
    }

    if (eolmode_f || guess_f) {
        o_eol_conv = oconv; oconv = eol_conv;
    }
    if (rot_f) {
        o_rot_conv = oconv; oconv = rot_conv;
    }
    if (iso2022jp_f) {
        o_iso2022jp_check_conv = oconv; oconv = iso2022jp_check_conv;
    }
    if (hira_f) {
        o_hira_conv = oconv; oconv = hira_conv;
    }
    if (fold_f) {
        o_fconv = oconv; oconv = fold_conv;
        f_line = 0;
    }
    if (alpha_f || x0201_f) {
        o_zconv = oconv; oconv = z_conv;
    }

    i_getc = std_getc;
    i_ungetc = std_ungetc;
    /* input redirection */
#ifdef INPUT_OPTION
    if (cap_f){
        i_cgetc = i_getc; i_getc = cap_getc;
        i_cungetc = i_ungetc; i_ungetc= cap_ungetc;
    }
    if (url_f){
        i_ugetc = i_getc; i_getc = url_getc;
        i_uungetc = i_ungetc; i_ungetc= url_ungetc;
    }
#endif
#ifdef NUMCHAR_OPTION
    if (numchar_f){
        i_ngetc = i_getc; i_getc = numchar_getc;
        i_nungetc = i_ungetc; i_ungetc= numchar_ungetc;
    }
#endif
#ifdef UNICODE_NORMALIZATION
    if (nfc_f){
        i_nfc_getc = i_getc; i_getc = nfc_getc;
        i_nfc_ungetc = i_ungetc; i_ungetc= nfc_ungetc;
    }
#endif
    if (mime_f && mimebuf_f==FIXED_MIME) {
        i_mgetc = i_getc; i_getc = mime_getc;
        i_mungetc = i_ungetc; i_ungetc = mime_ungetc;
    }
    if (broken_f & 1) {
        i_bgetc = i_getc; i_getc = broken_getc;
        i_bungetc = i_ungetc; i_ungetc = broken_ungetc;
    }
    /* explicit input encoding if given, otherwise start from e_iconv */
    if (input_encoding) {
        set_iconv(-TRUE, nkf_enc_to_iconv(input_encoding));
    } else {
        set_iconv(FALSE, e_iconv);
    }

    /* reset the status of every entry in input_code_list */
    {
        struct input_code *p = input_code_list;
        while (p->name){
            status_reinit(p++);
        }
    }
    return 0;
}
05803 */ 05804 05805 #if !defined(PERL_XS) && !defined(WIN32DLL) 05806 static nkf_char 05807 noconvert(FILE *f) 05808 { 05809 nkf_char c; 05810 05811 if (nop_f == 2) 05812 module_connection(); 05813 while ((c = (*i_getc)(f)) != EOF) 05814 (*o_putc)(c); 05815 (*o_putc)(EOF); 05816 return 1; 05817 } 05818 #endif 05819 05820 #define NEXT continue /* no output, get next */ 05821 #define SKIP c2=0;continue /* no output, get next */ 05822 #define MORE c2=c1;continue /* need one more byte */ 05823 #define SEND (void)0 /* output c1 and c2, get next */ 05824 #define LAST break /* end of loop, go closing */ 05825 #define set_input_mode(mode) do { \ 05826 input_mode = mode; \ 05827 shift_mode = 0; \ 05828 set_input_codename("ISO-2022-JP"); \ 05829 debug("ISO-2022-JP"); \ 05830 } while (0) 05831 05832 static int 05833 kanji_convert(FILE *f) 05834 { 05835 nkf_char c1=0, c2=0, c3=0, c4=0; 05836 int shift_mode = 0; /* 0, 1, 2, 3 */ 05837 int g2 = 0; 05838 int is_8bit = FALSE; 05839 05840 if (input_encoding && !nkf_enc_asciicompat(input_encoding)) { 05841 is_8bit = TRUE; 05842 } 05843 05844 input_mode = ASCII; 05845 output_mode = ASCII; 05846 05847 if (module_connection() < 0) { 05848 #if !defined(PERL_XS) && !defined(WIN32DLL) 05849 fprintf(stderr, "no output encoding given\n"); 05850 #endif 05851 return -1; 05852 } 05853 check_bom(f); 05854 05855 #ifdef UTF8_INPUT_ENABLE 05856 if(iconv == w_iconv32){ 05857 while ((c1 = (*i_getc)(f)) != EOF && 05858 (c2 = (*i_getc)(f)) != EOF && 05859 (c3 = (*i_getc)(f)) != EOF && 05860 (c4 = (*i_getc)(f)) != EOF) { 05861 nkf_char c5, c6, c7, c8; 05862 if (nkf_iconv_utf_32(c1, c2, c3, c4) == (size_t)NKF_ICONV_WAIT_COMBINING_CHAR) { 05863 if ((c5 = (*i_getc)(f)) != EOF && 05864 (c6 = (*i_getc)(f)) != EOF && 05865 (c7 = (*i_getc)(f)) != EOF && 05866 (c8 = (*i_getc)(f)) != EOF) { 05867 if (nkf_iconv_utf_32_combine(c1, c2, c3, c4, c5, c6, c7, c8)) { 05868 (*i_ungetc)(c8, f); 05869 (*i_ungetc)(c7, f); 05870 (*i_ungetc)(c6, f); 05871 (*i_ungetc)(c5, 
f); 05872 nkf_iconv_utf_32_nocombine(c1, c2, c3, c4); 05873 } 05874 } else { 05875 nkf_iconv_utf_32_nocombine(c1, c2, c3, c4); 05876 } 05877 } 05878 } 05879 goto finished; 05880 } 05881 else if (iconv == w_iconv16) { 05882 while ((c1 = (*i_getc)(f)) != EOF && 05883 (c2 = (*i_getc)(f)) != EOF) { 05884 size_t ret = nkf_iconv_utf_16(c1, c2, 0, 0); 05885 if (ret == NKF_ICONV_NEED_TWO_MORE_BYTES && 05886 (c3 = (*i_getc)(f)) != EOF && 05887 (c4 = (*i_getc)(f)) != EOF) { 05888 nkf_iconv_utf_16(c1, c2, c3, c4); 05889 } else if (ret == (size_t)NKF_ICONV_WAIT_COMBINING_CHAR) { 05890 if ((c3 = (*i_getc)(f)) != EOF && 05891 (c4 = (*i_getc)(f)) != EOF) { 05892 if (nkf_iconv_utf_16_combine(c1, c2, c3, c4)) { 05893 (*i_ungetc)(c4, f); 05894 (*i_ungetc)(c3, f); 05895 nkf_iconv_utf_16_nocombine(c1, c2); 05896 } 05897 } else { 05898 nkf_iconv_utf_16_nocombine(c1, c2); 05899 } 05900 } 05901 } 05902 goto finished; 05903 } 05904 #endif 05905 05906 while ((c1 = (*i_getc)(f)) != EOF) { 05907 #ifdef INPUT_CODE_FIX 05908 if (!input_encoding) 05909 #endif 05910 code_status(c1); 05911 if (c2) { 05912 /* second byte */ 05913 if (c2 > ((input_encoding && nkf_enc_cp5022x_p(input_encoding)) ? 
0x92 : DEL)) { 05914 /* in case of 8th bit is on */ 05915 if (!estab_f&&!mime_decode_mode) { 05916 /* in case of not established yet */ 05917 /* It is still ambiguious */ 05918 if (h_conv(f, c2, c1)==EOF) { 05919 LAST; 05920 } 05921 else { 05922 SKIP; 05923 } 05924 } 05925 else { 05926 /* in case of already established */ 05927 if (c1 < 0x40) { 05928 /* ignore bogus code */ 05929 SKIP; 05930 } else { 05931 SEND; 05932 } 05933 } 05934 } 05935 else { 05936 /* 2nd byte of 7 bit code or SJIS */ 05937 SEND; 05938 } 05939 } 05940 else if (nkf_char_unicode_p(c1)) { 05941 (*oconv)(0, c1); 05942 NEXT; 05943 } 05944 else { 05945 /* first byte */ 05946 if (input_mode == JIS_X_0208 && DEL <= c1 && c1 < 0x92) { 05947 /* CP5022x */ 05948 MORE; 05949 }else if (input_codename && input_codename[0] == 'I' && 05950 0xA1 <= c1 && c1 <= 0xDF) { 05951 /* JIS X 0201 Katakana in 8bit JIS */ 05952 c2 = JIS_X_0201_1976_K; 05953 c1 &= 0x7f; 05954 SEND; 05955 } else if (c1 > DEL) { 05956 /* 8 bit code */ 05957 if (!estab_f && !iso8859_f) { 05958 /* not established yet */ 05959 MORE; 05960 } else { /* estab_f==TRUE */ 05961 if (iso8859_f) { 05962 c2 = ISO_8859_1; 05963 c1 &= 0x7f; 05964 SEND; 05965 } 05966 else if ((iconv == s_iconv && 0xA0 <= c1 && c1 <= 0xDF) || 05967 (ms_ucs_map_f == UCS_MAP_CP10001 && (c1 == 0xFD || c1 == 0xFE))) { 05968 /* JIS X 0201 */ 05969 c2 = JIS_X_0201_1976_K; 05970 c1 &= 0x7f; 05971 SEND; 05972 } 05973 else { 05974 /* already established */ 05975 MORE; 05976 } 05977 } 05978 } else if (SP < c1 && c1 < DEL) { 05979 /* in case of Roman characters */ 05980 if (shift_mode) { 05981 /* output 1 shifted byte */ 05982 if (iso8859_f) { 05983 c2 = ISO_8859_1; 05984 SEND; 05985 } else if (nkf_byte_jisx0201_katakana_p(c1)){ 05986 /* output 1 shifted byte */ 05987 c2 = JIS_X_0201_1976_K; 05988 SEND; 05989 } else { 05990 /* look like bogus code */ 05991 SKIP; 05992 } 05993 } else if (input_mode == JIS_X_0208 || input_mode == JIS_X_0212 || 05994 input_mode == JIS_X_0213_1 || 
input_mode == JIS_X_0213_2) { 05995 /* in case of Kanji shifted */ 05996 MORE; 05997 } else if (c1 == '=' && mime_f && !mime_decode_mode) { 05998 /* Check MIME code */ 05999 if ((c1 = (*i_getc)(f)) == EOF) { 06000 (*oconv)(0, '='); 06001 LAST; 06002 } else if (c1 == '?') { 06003 /* =? is mime conversion start sequence */ 06004 if(mime_f == STRICT_MIME) { 06005 /* check in real detail */ 06006 if (mime_begin_strict(f) == EOF) 06007 LAST; 06008 SKIP; 06009 } else if (mime_begin(f) == EOF) 06010 LAST; 06011 SKIP; 06012 } else { 06013 (*oconv)(0, '='); 06014 (*i_ungetc)(c1,f); 06015 SKIP; 06016 } 06017 } else { 06018 /* normal ASCII code */ 06019 SEND; 06020 } 06021 } else if (c1 == SI && (!is_8bit || mime_decode_mode)) { 06022 shift_mode = 0; 06023 SKIP; 06024 } else if (c1 == SO && (!is_8bit || mime_decode_mode)) { 06025 shift_mode = 1; 06026 SKIP; 06027 } else if (c1 == ESC && (!is_8bit || mime_decode_mode)) { 06028 if ((c1 = (*i_getc)(f)) == EOF) { 06029 (*oconv)(0, ESC); 06030 LAST; 06031 } 06032 else if (c1 == '&') { 06033 /* IRR */ 06034 if ((c1 = (*i_getc)(f)) == EOF) { 06035 LAST; 06036 } else { 06037 SKIP; 06038 } 06039 } 06040 else if (c1 == '$') { 06041 /* GZDMx */ 06042 if ((c1 = (*i_getc)(f)) == EOF) { 06043 /* don't send bogus code 06044 (*oconv)(0, ESC); 06045 (*oconv)(0, '$'); */ 06046 LAST; 06047 } else if (c1 == '@' || c1 == 'B') { 06048 /* JIS X 0208 */ 06049 set_input_mode(JIS_X_0208); 06050 SKIP; 06051 } else if (c1 == '(') { 06052 /* GZDM4 */ 06053 if ((c1 = (*i_getc)(f)) == EOF) { 06054 /* don't send bogus code 06055 (*oconv)(0, ESC); 06056 (*oconv)(0, '$'); 06057 (*oconv)(0, '('); 06058 */ 06059 LAST; 06060 } else if (c1 == '@'|| c1 == 'B') { 06061 /* JIS X 0208 */ 06062 set_input_mode(JIS_X_0208); 06063 SKIP; 06064 #ifdef X0212_ENABLE 06065 } else if (c1 == 'D'){ 06066 set_input_mode(JIS_X_0212); 06067 SKIP; 06068 #endif /* X0212_ENABLE */ 06069 } else if (c1 == 'O' || c1 == 'Q'){ 06070 set_input_mode(JIS_X_0213_1); 06071 SKIP; 06072 } else if 
(c1 == 'P'){ 06073 set_input_mode(JIS_X_0213_2); 06074 SKIP; 06075 } else { 06076 /* could be some special code */ 06077 (*oconv)(0, ESC); 06078 (*oconv)(0, '$'); 06079 (*oconv)(0, '('); 06080 (*oconv)(0, c1); 06081 SKIP; 06082 } 06083 } else if (broken_f&0x2) { 06084 /* accept any ESC-(-x as broken code ... */ 06085 input_mode = JIS_X_0208; 06086 shift_mode = 0; 06087 SKIP; 06088 } else { 06089 (*oconv)(0, ESC); 06090 (*oconv)(0, '$'); 06091 (*oconv)(0, c1); 06092 SKIP; 06093 } 06094 } else if (c1 == '(') { 06095 /* GZD4 */ 06096 if ((c1 = (*i_getc)(f)) == EOF) { 06097 /* don't send bogus code 06098 (*oconv)(0, ESC); 06099 (*oconv)(0, '('); */ 06100 LAST; 06101 } 06102 else if (c1 == 'I') { 06103 /* JIS X 0201 Katakana */ 06104 set_input_mode(JIS_X_0201_1976_K); 06105 shift_mode = 1; 06106 SKIP; 06107 } 06108 else if (c1 == 'B' || c1 == 'J' || c1 == 'H') { 06109 /* ISO-646IRV:1983 or JIS X 0201 Roman or JUNET */ 06110 set_input_mode(ASCII); 06111 SKIP; 06112 } 06113 else if (broken_f&0x2) { 06114 set_input_mode(ASCII); 06115 SKIP; 06116 } 06117 else { 06118 (*oconv)(0, ESC); 06119 (*oconv)(0, '('); 06120 SEND; 06121 } 06122 } 06123 else if (c1 == '.') { 06124 /* G2D6 */ 06125 if ((c1 = (*i_getc)(f)) == EOF) { 06126 LAST; 06127 } 06128 else if (c1 == 'A') { 06129 /* ISO-8859-1 */ 06130 g2 = ISO_8859_1; 06131 SKIP; 06132 } 06133 else { 06134 (*oconv)(0, ESC); 06135 (*oconv)(0, '.'); 06136 SEND; 06137 } 06138 } 06139 else if (c1 == 'N') { 06140 /* SS2 */ 06141 c1 = (*i_getc)(f); 06142 if (g2 == ISO_8859_1) { 06143 c2 = ISO_8859_1; 06144 SEND; 06145 }else{ 06146 (*i_ungetc)(c1, f); 06147 /* lonely ESC */ 06148 (*oconv)(0, ESC); 06149 SEND; 06150 } 06151 } 06152 else { 06153 /* lonely ESC */ 06154 (*oconv)(0, ESC); 06155 SEND; 06156 } 06157 } else if (c1 == ESC && iconv == s_iconv) { 06158 /* ESC in Shift_JIS */ 06159 if ((c1 = (*i_getc)(f)) == EOF) { 06160 (*oconv)(0, ESC); 06161 LAST; 06162 } else if (c1 == '$') { 06163 /* J-PHONE emoji */ 06164 if ((c1 = 
(*i_getc)(f)) == EOF) { 06165 LAST; 06166 } else if (('E' <= c1 && c1 <= 'G') || 06167 ('O' <= c1 && c1 <= 'Q')) { 06168 /* 06169 NUM : 0 1 2 3 4 5 06170 BYTE: G E F O P Q 06171 C%7 : 1 6 0 2 3 4 06172 C%7 : 0 1 2 3 4 5 6 06173 NUM : 2 0 3 4 5 X 1 06174 */ 06175 static const nkf_char jphone_emoji_first_table[7] = 06176 {0xE1E0, 0xDFE0, 0xE2E0, 0xE3E0, 0xE4E0, 0xDFE0, 0xE0E0}; 06177 c3 = nkf_char_unicode_new(jphone_emoji_first_table[c1 % 7]); 06178 if ((c1 = (*i_getc)(f)) == EOF) LAST; 06179 while (SP <= c1 && c1 <= 'z') { 06180 (*oconv)(0, c1 + c3); 06181 if ((c1 = (*i_getc)(f)) == EOF) LAST; 06182 } 06183 SKIP; 06184 } 06185 else { 06186 (*oconv)(0, ESC); 06187 (*oconv)(0, '$'); 06188 SEND; 06189 } 06190 } 06191 else { 06192 /* lonely ESC */ 06193 (*oconv)(0, ESC); 06194 SEND; 06195 } 06196 } else if (c1 == LF || c1 == CR) { 06197 if (broken_f&4) { 06198 input_mode = ASCII; set_iconv(FALSE, 0); 06199 SEND; 06200 } else if (mime_decode_f && !mime_decode_mode){ 06201 if (c1 == LF) { 06202 if ((c1=(*i_getc)(f))!=EOF && c1 == SP) { 06203 i_ungetc(SP,f); 06204 continue; 06205 } else { 06206 i_ungetc(c1,f); 06207 } 06208 c1 = LF; 06209 SEND; 06210 } else { /* if (c1 == CR)*/ 06211 if ((c1=(*i_getc)(f))!=EOF) { 06212 if (c1==SP) { 06213 i_ungetc(SP,f); 06214 continue; 06215 } else if (c1 == LF && (c1=(*i_getc)(f))!=EOF && c1 == SP) { 06216 i_ungetc(SP,f); 06217 continue; 06218 } else { 06219 i_ungetc(c1,f); 06220 } 06221 i_ungetc(LF,f); 06222 } else { 06223 i_ungetc(c1,f); 06224 } 06225 c1 = CR; 06226 SEND; 06227 } 06228 } 06229 } else 06230 SEND; 06231 } 06232 /* send: */ 06233 switch(input_mode){ 06234 case ASCII: 06235 switch ((*iconv)(c2, c1, 0)) { /* can be EUC / SJIS / UTF-8 */ 06236 case -2: 06237 /* 4 bytes UTF-8 */ 06238 if ((c3 = (*i_getc)(f)) != EOF) { 06239 code_status(c3); 06240 c3 <<= 8; 06241 if ((c4 = (*i_getc)(f)) != EOF) { 06242 code_status(c4); 06243 (*iconv)(c2, c1, c3|c4); 06244 } 06245 } 06246 break; 06247 case -3: 06248 /* 4 bytes UTF-8 (check 
combining character) */ 06249 if ((c3 = (*i_getc)(f)) != EOF) { 06250 if ((c4 = (*i_getc)(f)) != EOF) { 06251 if (w_iconv_combine(c2, c1, 0, c3, c4, 0)) { 06252 (*i_ungetc)(c4, f); 06253 (*i_ungetc)(c3, f); 06254 w_iconv_nocombine(c2, c1, 0); 06255 } 06256 } else { 06257 (*i_ungetc)(c3, f); 06258 w_iconv_nocombine(c2, c1, 0); 06259 } 06260 } else { 06261 w_iconv_nocombine(c2, c1, 0); 06262 } 06263 break; 06264 case -1: 06265 /* 3 bytes EUC or UTF-8 */ 06266 if ((c3 = (*i_getc)(f)) != EOF) { 06267 code_status(c3); 06268 if ((*iconv)(c2, c1, c3) == -3) { 06269 /* 6 bytes UTF-8 (check combining character) */ 06270 nkf_char c5, c6; 06271 if ((c4 = (*i_getc)(f)) != EOF) { 06272 if ((c5 = (*i_getc)(f)) != EOF) { 06273 if ((c6 = (*i_getc)(f)) != EOF) { 06274 if (w_iconv_combine(c2, c1, c3, c4, c5, c6)) { 06275 (*i_ungetc)(c6, f); 06276 (*i_ungetc)(c5, f); 06277 (*i_ungetc)(c4, f); 06278 w_iconv_nocombine(c2, c1, c3); 06279 } 06280 } else { 06281 (*i_ungetc)(c5, f); 06282 (*i_ungetc)(c4, f); 06283 w_iconv_nocombine(c2, c1, c3); 06284 } 06285 } else { 06286 (*i_ungetc)(c4, f); 06287 w_iconv_nocombine(c2, c1, c3); 06288 } 06289 } else { 06290 w_iconv_nocombine(c2, c1, c3); 06291 } 06292 } 06293 } 06294 break; 06295 } 06296 break; 06297 case JIS_X_0208: 06298 case JIS_X_0213_1: 06299 if (ms_ucs_map_f && 06300 0x7F <= c2 && c2 <= 0x92 && 06301 0x21 <= c1 && c1 <= 0x7E) { 06302 /* CP932 UDC */ 06303 c1 = nkf_char_unicode_new((c2 - 0x7F) * 94 + c1 - 0x21 + 0xE000); 06304 c2 = 0; 06305 } 06306 (*oconv)(c2, c1); /* this is JIS, not SJIS/EUC case */ 06307 break; 06308 #ifdef X0212_ENABLE 06309 case JIS_X_0212: 06310 (*oconv)(PREFIX_EUCG3 | c2, c1); 06311 break; 06312 #endif /* X0212_ENABLE */ 06313 case JIS_X_0213_2: 06314 (*oconv)(PREFIX_EUCG3 | c2, c1); 06315 break; 06316 default: 06317 (*oconv)(input_mode, c1); /* other special case */ 06318 } 06319 06320 c2 = 0; 06321 c3 = 0; 06322 continue; 06323 /* goto next_word */ 06324 } 06325 06326 finished: 06327 /* epilogue */ 06328 
(*iconv)(EOF, 0, 0); 06329 if (!input_codename) 06330 { 06331 if (is_8bit) { 06332 struct input_code *p = input_code_list; 06333 struct input_code *result = p; 06334 while (p->name){ 06335 if (p->score < result->score) result = p; 06336 ++p; 06337 } 06338 set_input_codename(result->name); 06339 #ifdef CHECK_OPTION 06340 debug(result->name); 06341 #endif 06342 } 06343 } 06344 return 0; 06345 } 06346 06347 /* 06348 * int options(unsigned char *cp) 06349 * 06350 * return values: 06351 * 0: success 06352 * -1: ArgumentError 06353 */ 06354 static int 06355 options(unsigned char *cp) 06356 { 06357 nkf_char i, j; 06358 unsigned char *p; 06359 unsigned char *cp_back = NULL; 06360 nkf_encoding *enc; 06361 06362 if (option_mode==1) 06363 return 0; 06364 while(*cp && *cp++!='-'); 06365 while (*cp || cp_back) { 06366 if(!*cp){ 06367 cp = cp_back; 06368 cp_back = NULL; 06369 continue; 06370 } 06371 p = 0; 06372 switch (*cp++) { 06373 case '-': /* literal options */ 06374 if (!*cp || *cp == SP) { /* ignore the rest of arguments */ 06375 option_mode = 1; 06376 return 0; 06377 } 06378 for (i=0;i<(int)(sizeof(long_option)/sizeof(long_option[0]));i++) { 06379 p = (unsigned char *)long_option[i].name; 06380 for (j=0;*p && *p != '=' && *p == cp[j];p++, j++); 06381 if (*p == cp[j] || cp[j] == SP){ 06382 p = &cp[j] + 1; 06383 break; 06384 } 06385 p = 0; 06386 } 06387 if (p == 0) { 06388 #if !defined(PERL_XS) && !defined(WIN32DLL) 06389 fprintf(stderr, "unknown long option: --%s\n", cp); 06390 #endif 06391 return -1; 06392 } 06393 while(*cp && *cp != SP && cp++); 06394 if (long_option[i].alias[0]){ 06395 cp_back = cp; 06396 cp = (unsigned char *)long_option[i].alias; 06397 }else{ 06398 #ifndef PERL_XS 06399 if (strcmp(long_option[i].name, "help") == 0){ 06400 usage(); 06401 exit(EXIT_SUCCESS); 06402 } 06403 #endif 06404 if (strcmp(long_option[i].name, "ic=") == 0){ 06405 enc = nkf_enc_find((char *)p); 06406 if (!enc) continue; 06407 input_encoding = enc; 06408 continue; 06409 } 06410 if 
(strcmp(long_option[i].name, "oc=") == 0){ 06411 enc = nkf_enc_find((char *)p); 06412 /* if (enc <= 0) continue; */ 06413 if (!enc) continue; 06414 output_encoding = enc; 06415 continue; 06416 } 06417 if (strcmp(long_option[i].name, "guess=") == 0){ 06418 if (p[0] == '0' || p[0] == '1') { 06419 guess_f = 1; 06420 } else { 06421 guess_f = 2; 06422 } 06423 continue; 06424 } 06425 #ifdef OVERWRITE 06426 if (strcmp(long_option[i].name, "overwrite") == 0){ 06427 file_out_f = TRUE; 06428 overwrite_f = TRUE; 06429 preserve_time_f = TRUE; 06430 continue; 06431 } 06432 if (strcmp(long_option[i].name, "overwrite=") == 0){ 06433 file_out_f = TRUE; 06434 overwrite_f = TRUE; 06435 preserve_time_f = TRUE; 06436 backup_f = TRUE; 06437 backup_suffix = (char *)p; 06438 continue; 06439 } 06440 if (strcmp(long_option[i].name, "in-place") == 0){ 06441 file_out_f = TRUE; 06442 overwrite_f = TRUE; 06443 preserve_time_f = FALSE; 06444 continue; 06445 } 06446 if (strcmp(long_option[i].name, "in-place=") == 0){ 06447 file_out_f = TRUE; 06448 overwrite_f = TRUE; 06449 preserve_time_f = FALSE; 06450 backup_f = TRUE; 06451 backup_suffix = (char *)p; 06452 continue; 06453 } 06454 #endif 06455 #ifdef INPUT_OPTION 06456 if (strcmp(long_option[i].name, "cap-input") == 0){ 06457 cap_f = TRUE; 06458 continue; 06459 } 06460 if (strcmp(long_option[i].name, "url-input") == 0){ 06461 url_f = TRUE; 06462 continue; 06463 } 06464 #endif 06465 #ifdef NUMCHAR_OPTION 06466 if (strcmp(long_option[i].name, "numchar-input") == 0){ 06467 numchar_f = TRUE; 06468 continue; 06469 } 06470 #endif 06471 #ifdef CHECK_OPTION 06472 if (strcmp(long_option[i].name, "no-output") == 0){ 06473 noout_f = TRUE; 06474 continue; 06475 } 06476 if (strcmp(long_option[i].name, "debug") == 0){ 06477 debug_f = TRUE; 06478 continue; 06479 } 06480 #endif 06481 if (strcmp(long_option[i].name, "cp932") == 0){ 06482 #ifdef SHIFTJIS_CP932 06483 cp51932_f = TRUE; 06484 cp932inv_f = -TRUE; 06485 #endif 06486 #ifdef UTF8_OUTPUT_ENABLE 06487 
ms_ucs_map_f = UCS_MAP_CP932; 06488 #endif 06489 continue; 06490 } 06491 if (strcmp(long_option[i].name, "no-cp932") == 0){ 06492 #ifdef SHIFTJIS_CP932 06493 cp51932_f = FALSE; 06494 cp932inv_f = FALSE; 06495 #endif 06496 #ifdef UTF8_OUTPUT_ENABLE 06497 ms_ucs_map_f = UCS_MAP_ASCII; 06498 #endif 06499 continue; 06500 } 06501 #ifdef SHIFTJIS_CP932 06502 if (strcmp(long_option[i].name, "cp932inv") == 0){ 06503 cp932inv_f = -TRUE; 06504 continue; 06505 } 06506 #endif 06507 06508 #ifdef X0212_ENABLE 06509 if (strcmp(long_option[i].name, "x0212") == 0){ 06510 x0212_f = TRUE; 06511 continue; 06512 } 06513 #endif 06514 06515 #ifdef EXEC_IO 06516 if (strcmp(long_option[i].name, "exec-in") == 0){ 06517 exec_f = 1; 06518 return 0; 06519 } 06520 if (strcmp(long_option[i].name, "exec-out") == 0){ 06521 exec_f = -1; 06522 return 0; 06523 } 06524 #endif 06525 #if defined(UTF8_OUTPUT_ENABLE) && defined(UTF8_INPUT_ENABLE) 06526 if (strcmp(long_option[i].name, "no-cp932ext") == 0){ 06527 no_cp932ext_f = TRUE; 06528 continue; 06529 } 06530 if (strcmp(long_option[i].name, "no-best-fit-chars") == 0){ 06531 no_best_fit_chars_f = TRUE; 06532 continue; 06533 } 06534 if (strcmp(long_option[i].name, "fb-skip") == 0){ 06535 encode_fallback = NULL; 06536 continue; 06537 } 06538 if (strcmp(long_option[i].name, "fb-html") == 0){ 06539 encode_fallback = encode_fallback_html; 06540 continue; 06541 } 06542 if (strcmp(long_option[i].name, "fb-xml") == 0){ 06543 encode_fallback = encode_fallback_xml; 06544 continue; 06545 } 06546 if (strcmp(long_option[i].name, "fb-java") == 0){ 06547 encode_fallback = encode_fallback_java; 06548 continue; 06549 } 06550 if (strcmp(long_option[i].name, "fb-perl") == 0){ 06551 encode_fallback = encode_fallback_perl; 06552 continue; 06553 } 06554 if (strcmp(long_option[i].name, "fb-subchar") == 0){ 06555 encode_fallback = encode_fallback_subchar; 06556 continue; 06557 } 06558 if (strcmp(long_option[i].name, "fb-subchar=") == 0){ 06559 encode_fallback = 
encode_fallback_subchar; 06560 unicode_subchar = 0; 06561 if (p[0] != '0'){ 06562 /* decimal number */ 06563 for (i = 0; i < 7 && nkf_isdigit(p[i]); i++){ 06564 unicode_subchar *= 10; 06565 unicode_subchar += hex2bin(p[i]); 06566 } 06567 }else if(p[1] == 'x' || p[1] == 'X'){ 06568 /* hexadecimal number */ 06569 for (i = 2; i < 8 && nkf_isxdigit(p[i]); i++){ 06570 unicode_subchar <<= 4; 06571 unicode_subchar |= hex2bin(p[i]); 06572 } 06573 }else{ 06574 /* octal number */ 06575 for (i = 1; i < 8 && nkf_isoctal(p[i]); i++){ 06576 unicode_subchar *= 8; 06577 unicode_subchar += hex2bin(p[i]); 06578 } 06579 } 06580 w16e_conv(unicode_subchar, &i, &j); 06581 unicode_subchar = i<<8 | j; 06582 continue; 06583 } 06584 #endif 06585 #ifdef UTF8_OUTPUT_ENABLE 06586 if (strcmp(long_option[i].name, "ms-ucs-map") == 0){ 06587 ms_ucs_map_f = UCS_MAP_MS; 06588 continue; 06589 } 06590 #endif 06591 #ifdef UNICODE_NORMALIZATION 06592 if (strcmp(long_option[i].name, "utf8mac-input") == 0){ 06593 nfc_f = TRUE; 06594 continue; 06595 } 06596 #endif 06597 if (strcmp(long_option[i].name, "prefix=") == 0){ 06598 if (nkf_isgraph(p[0])){ 06599 for (i = 1; nkf_isgraph(p[i]); i++){ 06600 prefix_table[p[i]] = p[0]; 06601 } 06602 } 06603 continue; 06604 } 06605 #if !defined(PERL_XS) && !defined(WIN32DLL) 06606 fprintf(stderr, "unsupported long option: --%s\n", long_option[i].name); 06607 #endif 06608 return -1; 06609 } 06610 continue; 06611 case 'b': /* buffered mode */ 06612 unbuf_f = FALSE; 06613 continue; 06614 case 'u': /* non bufferd mode */ 06615 unbuf_f = TRUE; 06616 continue; 06617 case 't': /* transparent mode */ 06618 if (*cp=='1') { 06619 /* alias of -t */ 06620 cp++; 06621 nop_f = TRUE; 06622 } else if (*cp=='2') { 06623 /* 06624 * -t with put/get 06625 * 06626 * nkf -t2MB hoge.bin | nkf -t2mB | diff -s - hoge.bin 06627 * 06628 */ 06629 cp++; 06630 nop_f = 2; 06631 } else 06632 nop_f = TRUE; 06633 continue; 06634 case 'j': /* JIS output */ 06635 case 'n': 06636 output_encoding = 
nkf_enc_from_index(ISO_2022_JP); 06637 continue; 06638 case 'e': /* AT&T EUC output */ 06639 output_encoding = nkf_enc_from_index(EUCJP_NKF); 06640 continue; 06641 case 's': /* SJIS output */ 06642 output_encoding = nkf_enc_from_index(SHIFT_JIS); 06643 continue; 06644 case 'l': /* ISO8859 Latin-1 support, no conversion */ 06645 iso8859_f = TRUE; /* Only compatible with ISO-2022-JP */ 06646 input_encoding = nkf_enc_from_index(ISO_8859_1); 06647 continue; 06648 case 'i': /* Kanji IN ESC-$-@/B */ 06649 if (*cp=='@'||*cp=='B') 06650 kanji_intro = *cp++; 06651 continue; 06652 case 'o': /* ASCII IN ESC-(-J/B/H */ 06653 /* ESC ( H was used in initial JUNET messages */ 06654 if (*cp=='J'||*cp=='B'||*cp=='H') 06655 ascii_intro = *cp++; 06656 continue; 06657 case 'h': 06658 /* 06659 bit:1 katakana->hiragana 06660 bit:2 hiragana->katakana 06661 */ 06662 if ('9'>= *cp && *cp>='0') 06663 hira_f |= (*cp++ -'0'); 06664 else 06665 hira_f |= 1; 06666 continue; 06667 case 'r': 06668 rot_f = TRUE; 06669 continue; 06670 #if defined(MSDOS) || defined(__OS2__) 06671 case 'T': 06672 binmode_f = FALSE; 06673 continue; 06674 #endif 06675 #ifndef PERL_XS 06676 case 'V': 06677 show_configuration(); 06678 exit(EXIT_SUCCESS); 06679 break; 06680 case 'v': 06681 version(); 06682 exit(EXIT_SUCCESS); 06683 break; 06684 #endif 06685 #ifdef UTF8_OUTPUT_ENABLE 06686 case 'w': /* UTF-{8,16,32} output */ 06687 if (cp[0] == '8') { 06688 cp++; 06689 if (cp[0] == '0'){ 06690 cp++; 06691 output_encoding = nkf_enc_from_index(UTF_8N); 06692 } else { 06693 output_bom_f = TRUE; 06694 output_encoding = nkf_enc_from_index(UTF_8_BOM); 06695 } 06696 } else { 06697 int enc_idx; 06698 if ('1'== cp[0] && '6'==cp[1]) { 06699 cp += 2; 06700 enc_idx = UTF_16; 06701 } else if ('3'== cp[0] && '2'==cp[1]) { 06702 cp += 2; 06703 enc_idx = UTF_32; 06704 } else { 06705 output_encoding = nkf_enc_from_index(UTF_8); 06706 continue; 06707 } 06708 if (cp[0]=='L') { 06709 cp++; 06710 output_endian = ENDIAN_LITTLE; 06711 
output_bom_f = TRUE; 06712 } else if (cp[0] == 'B') { 06713 cp++; 06714 output_bom_f = TRUE; 06715 } 06716 if (cp[0] == '0'){ 06717 output_bom_f = FALSE; 06718 cp++; 06719 enc_idx = enc_idx == UTF_16 06720 ? (output_endian == ENDIAN_LITTLE ? UTF_16LE : UTF_16BE) 06721 : (output_endian == ENDIAN_LITTLE ? UTF_32LE : UTF_32BE); 06722 } else { 06723 enc_idx = enc_idx == UTF_16 06724 ? (output_endian == ENDIAN_LITTLE ? UTF_16LE_BOM : UTF_16BE_BOM) 06725 : (output_endian == ENDIAN_LITTLE ? UTF_32LE_BOM : UTF_32BE_BOM); 06726 } 06727 output_encoding = nkf_enc_from_index(enc_idx); 06728 } 06729 continue; 06730 #endif 06731 #ifdef UTF8_INPUT_ENABLE 06732 case 'W': /* UTF input */ 06733 if (cp[0] == '8') { 06734 cp++; 06735 input_encoding = nkf_enc_from_index(UTF_8); 06736 }else{ 06737 int enc_idx; 06738 if ('1'== cp[0] && '6'==cp[1]) { 06739 cp += 2; 06740 input_endian = ENDIAN_BIG; 06741 enc_idx = UTF_16; 06742 } else if ('3'== cp[0] && '2'==cp[1]) { 06743 cp += 2; 06744 input_endian = ENDIAN_BIG; 06745 enc_idx = UTF_32; 06746 } else { 06747 input_encoding = nkf_enc_from_index(UTF_8); 06748 continue; 06749 } 06750 if (cp[0]=='L') { 06751 cp++; 06752 input_endian = ENDIAN_LITTLE; 06753 } else if (cp[0] == 'B') { 06754 cp++; 06755 input_endian = ENDIAN_BIG; 06756 } 06757 enc_idx = (enc_idx == UTF_16 06758 ? (input_endian == ENDIAN_LITTLE ? UTF_16LE : UTF_16BE) 06759 : (input_endian == ENDIAN_LITTLE ? 
UTF_32LE : UTF_32BE)); 06760 input_encoding = nkf_enc_from_index(enc_idx); 06761 } 06762 continue; 06763 #endif 06764 /* Input code assumption */ 06765 case 'J': /* ISO-2022-JP input */ 06766 input_encoding = nkf_enc_from_index(ISO_2022_JP); 06767 continue; 06768 case 'E': /* EUC-JP input */ 06769 input_encoding = nkf_enc_from_index(EUCJP_NKF); 06770 continue; 06771 case 'S': /* Shift_JIS input */ 06772 input_encoding = nkf_enc_from_index(SHIFT_JIS); 06773 continue; 06774 case 'Z': /* Convert X0208 alphabet to asii */ 06775 /* alpha_f 06776 bit:0 Convert JIS X 0208 Alphabet to ASCII 06777 bit:1 Convert Kankaku to one space 06778 bit:2 Convert Kankaku to two spaces 06779 bit:3 Convert HTML Entity 06780 bit:4 Convert JIS X 0208 Katakana to JIS X 0201 Katakana 06781 */ 06782 while ('0'<= *cp && *cp <='4') { 06783 alpha_f |= 1 << (*cp++ - '0'); 06784 } 06785 alpha_f |= 1; 06786 continue; 06787 case 'x': /* Convert X0201 kana to X0208 or X0201 Conversion */ 06788 x0201_f = FALSE; /* No X0201->X0208 conversion */ 06789 /* accept X0201 06790 ESC-(-I in JIS, EUC, MS Kanji 06791 SI/SO in JIS, EUC, MS Kanji 06792 SS2 in EUC, JIS, not in MS Kanji 06793 MS Kanji (0xa0-0xdf) 06794 output X0201 06795 ESC-(-I in JIS (0x20-0x5f) 06796 SS2 in EUC (0xa0-0xdf) 06797 0xa0-0xd in MS Kanji (0xa0-0xdf) 06798 */ 06799 continue; 06800 case 'X': /* Convert X0201 kana to X0208 */ 06801 x0201_f = TRUE; 06802 continue; 06803 case 'F': /* prserve new lines */ 06804 fold_preserve_f = TRUE; 06805 case 'f': /* folding -f60 or -f */ 06806 fold_f = TRUE; 06807 fold_len = 0; 06808 while('0'<= *cp && *cp <='9') { /* we don't use atoi here */ 06809 fold_len *= 10; 06810 fold_len += *cp++ - '0'; 06811 } 06812 if (!(0<fold_len && fold_len<BUFSIZ)) 06813 fold_len = DEFAULT_FOLD; 06814 if (*cp=='-') { 06815 fold_margin = 0; 06816 cp++; 06817 while('0'<= *cp && *cp <='9') { /* we don't use atoi here */ 06818 fold_margin *= 10; 06819 fold_margin += *cp++ - '0'; 06820 } 06821 } 06822 continue; 06823 case 'm': 
/* MIME support */ 06824 /* mime_decode_f = TRUE; */ /* this has too large side effects... */ 06825 if (*cp=='B'||*cp=='Q') { 06826 mime_decode_mode = *cp++; 06827 mimebuf_f = FIXED_MIME; 06828 } else if (*cp=='N') { 06829 mime_f = TRUE; cp++; 06830 } else if (*cp=='S') { 06831 mime_f = STRICT_MIME; cp++; 06832 } else if (*cp=='0') { 06833 mime_decode_f = FALSE; 06834 mime_f = FALSE; cp++; 06835 } else { 06836 mime_f = STRICT_MIME; 06837 } 06838 continue; 06839 case 'M': /* MIME output */ 06840 if (*cp=='B') { 06841 mimeout_mode = 'B'; 06842 mimeout_f = FIXED_MIME; cp++; 06843 } else if (*cp=='Q') { 06844 mimeout_mode = 'Q'; 06845 mimeout_f = FIXED_MIME; cp++; 06846 } else { 06847 mimeout_f = TRUE; 06848 } 06849 continue; 06850 case 'B': /* Broken JIS support */ 06851 /* bit:0 no ESC JIS 06852 bit:1 allow any x on ESC-(-x or ESC-$-x 06853 bit:2 reset to ascii on NL 06854 */ 06855 if ('9'>= *cp && *cp>='0') 06856 broken_f |= 1<<(*cp++ -'0'); 06857 else 06858 broken_f |= TRUE; 06859 continue; 06860 #ifndef PERL_XS 06861 case 'O':/* for Output file */ 06862 file_out_f = TRUE; 06863 continue; 06864 #endif 06865 case 'c':/* add cr code */ 06866 eolmode_f = CRLF; 06867 continue; 06868 case 'd':/* delete cr code */ 06869 eolmode_f = LF; 06870 continue; 06871 case 'I': /* ISO-2022-JP output */ 06872 iso2022jp_f = TRUE; 06873 continue; 06874 case 'L': /* line mode */ 06875 if (*cp=='u') { /* unix */ 06876 eolmode_f = LF; cp++; 06877 } else if (*cp=='m') { /* mac */ 06878 eolmode_f = CR; cp++; 06879 } else if (*cp=='w') { /* windows */ 06880 eolmode_f = CRLF; cp++; 06881 } else if (*cp=='0') { /* no conversion */ 06882 eolmode_f = 0; cp++; 06883 } 06884 continue; 06885 #ifndef PERL_XS 06886 case 'g': 06887 if ('2' <= *cp && *cp <= '9') { 06888 guess_f = 2; 06889 cp++; 06890 } else if (*cp == '0' || *cp == '1') { 06891 guess_f = 1; 06892 cp++; 06893 } else { 06894 guess_f = 1; 06895 } 06896 continue; 06897 #endif 06898 case SP: 06899 /* module muliple options in a string are 
allowed for Perl moudle */ 06900 while(*cp && *cp++!='-'); 06901 continue; 06902 default: 06903 #if !defined(PERL_XS) && !defined(WIN32DLL) 06904 fprintf(stderr, "unknown option: -%c\n", *(cp-1)); 06905 #endif 06906 /* bogus option but ignored */ 06907 return -1; 06908 } 06909 } 06910 return 0; 06911 } 06912 06913 #ifdef WIN32DLL 06914 #include "nkf32dll.c" 06915 #elif defined(PERL_XS) 06916 #else /* WIN32DLL */ 06917 int 06918 main(int argc, char **argv) 06919 { 06920 FILE *fin; 06921 unsigned char *cp; 06922 06923 char *outfname = NULL; 06924 char *origfname; 06925 06926 #ifdef EASYWIN /*Easy Win */ 06927 _BufferSize.y = 400;/*Set Scroll Buffer Size*/ 06928 #endif 06929 #ifdef DEFAULT_CODE_LOCALE 06930 setlocale(LC_CTYPE, ""); 06931 #endif 06932 nkf_state_init(); 06933 06934 for (argc--,argv++; (argc > 0) && **argv == '-'; argc--, argv++) { 06935 cp = (unsigned char *)*argv; 06936 options(cp); 06937 #ifdef EXEC_IO 06938 if (exec_f){ 06939 int fds[2], pid; 06940 if (pipe(fds) < 0 || (pid = fork()) < 0){ 06941 abort(); 06942 } 06943 if (pid == 0){ 06944 if (exec_f > 0){ 06945 close(fds[0]); 06946 dup2(fds[1], 1); 06947 }else{ 06948 close(fds[1]); 06949 dup2(fds[0], 0); 06950 } 06951 execvp(argv[1], &argv[1]); 06952 } 06953 if (exec_f > 0){ 06954 close(fds[1]); 06955 dup2(fds[0], 0); 06956 }else{ 06957 close(fds[0]); 06958 dup2(fds[1], 1); 06959 } 06960 argc = 0; 06961 break; 06962 } 06963 #endif 06964 } 06965 06966 if (guess_f) { 06967 #ifdef CHECK_OPTION 06968 int debug_f_back = debug_f; 06969 #endif 06970 #ifdef EXEC_IO 06971 int exec_f_back = exec_f; 06972 #endif 06973 #ifdef X0212_ENABLE 06974 int x0212_f_back = x0212_f; 06975 #endif 06976 int x0213_f_back = x0213_f; 06977 int guess_f_back = guess_f; 06978 reinit(); 06979 guess_f = guess_f_back; 06980 mime_f = FALSE; 06981 #ifdef CHECK_OPTION 06982 debug_f = debug_f_back; 06983 #endif 06984 #ifdef EXEC_IO 06985 exec_f = exec_f_back; 06986 #endif 06987 x0212_f = x0212_f_back; 06988 x0213_f = x0213_f_back; 06989 
} 06990 06991 if (binmode_f == TRUE) 06992 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__)) 06993 if (freopen("","wb",stdout) == NULL) 06994 return (-1); 06995 #else 06996 setbinmode(stdout); 06997 #endif 06998 06999 if (unbuf_f) 07000 setbuf(stdout, (char *) NULL); 07001 else 07002 setvbuffer(stdout, (char *) stdobuf, IOBUF_SIZE); 07003 07004 if (argc == 0) { 07005 if (binmode_f == TRUE) 07006 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__)) 07007 if (freopen("","rb",stdin) == NULL) return (-1); 07008 #else 07009 setbinmode(stdin); 07010 #endif 07011 setvbuffer(stdin, (char *) stdibuf, IOBUF_SIZE); 07012 if (nop_f) 07013 noconvert(stdin); 07014 else { 07015 kanji_convert(stdin); 07016 if (guess_f) print_guessed_code(NULL); 07017 } 07018 } else { 07019 int nfiles = argc; 07020 int is_argument_error = FALSE; 07021 while (argc--) { 07022 input_codename = NULL; 07023 input_eol = 0; 07024 #ifdef CHECK_OPTION 07025 iconv_for_check = 0; 07026 #endif 07027 if ((fin = fopen((origfname = *argv++), "r")) == NULL) { 07028 perror(*(argv-1)); 07029 is_argument_error = TRUE; 07030 continue; 07031 } else { 07032 #ifdef OVERWRITE 07033 int fd = 0; 07034 int fd_backup = 0; 07035 #endif 07036 07037 /* reopen file for stdout */ 07038 if (file_out_f == TRUE) { 07039 #ifdef OVERWRITE 07040 if (overwrite_f){ 07041 outfname = nkf_xmalloc(strlen(origfname) 07042 + strlen(".nkftmpXXXXXX") 07043 + 1); 07044 strcpy(outfname, origfname); 07045 #ifdef MSDOS 07046 { 07047 int i; 07048 for (i = strlen(outfname); i; --i){ 07049 if (outfname[i - 1] == '/' 07050 || outfname[i - 1] == '\\'){ 07051 break; 07052 } 07053 } 07054 outfname[i] = '\0'; 07055 } 07056 strcat(outfname, "ntXXXXXX"); 07057 mktemp(outfname); 07058 fd = open(outfname, O_WRONLY | O_CREAT | O_TRUNC | O_EXCL, 07059 S_IREAD | S_IWRITE); 07060 #else 07061 strcat(outfname, ".nkftmpXXXXXX"); 07062 fd = mkstemp(outfname); 07063 #endif 07064 if (fd < 0 07065 || (fd_backup = dup(fileno(stdout))) < 0 
07066 || dup2(fd, fileno(stdout)) < 0 07067 ){ 07068 perror(origfname); 07069 return -1; 07070 } 07071 }else 07072 #endif 07073 if(argc == 1) { 07074 outfname = *argv++; 07075 argc--; 07076 } else { 07077 outfname = "nkf.out"; 07078 } 07079 07080 if(freopen(outfname, "w", stdout) == NULL) { 07081 perror (outfname); 07082 return (-1); 07083 } 07084 if (binmode_f == TRUE) { 07085 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__)) 07086 if (freopen("","wb",stdout) == NULL) 07087 return (-1); 07088 #else 07089 setbinmode(stdout); 07090 #endif 07091 } 07092 } 07093 if (binmode_f == TRUE) 07094 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__)) 07095 if (freopen("","rb",fin) == NULL) 07096 return (-1); 07097 #else 07098 setbinmode(fin); 07099 #endif 07100 setvbuffer(fin, (char *) stdibuf, IOBUF_SIZE); 07101 if (nop_f) 07102 noconvert(fin); 07103 else { 07104 char *filename = NULL; 07105 kanji_convert(fin); 07106 if (nfiles > 1) filename = origfname; 07107 if (guess_f) print_guessed_code(filename); 07108 } 07109 fclose(fin); 07110 #ifdef OVERWRITE 07111 if (overwrite_f) { 07112 struct stat sb; 07113 #if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__) && !defined(__WATCOMC__) && !defined(__EMX__) && !defined(__OS2__) && !defined(__DJGPP__) 07114 time_t tb[2]; 07115 #else 07116 struct utimbuf tb; 07117 #endif 07118 07119 fflush(stdout); 07120 close(fd); 07121 if (dup2(fd_backup, fileno(stdout)) < 0){ 07122 perror("dup2"); 07123 } 07124 if (stat(origfname, &sb)) { 07125 fprintf(stderr, "Can't stat %s\n", origfname); 07126 } 07127 /* $B%Q!<%_%C%7%g%s$rI|85(B */ 07128 if (chmod(outfname, sb.st_mode)) { 07129 fprintf(stderr, "Can't set permission %s\n", outfname); 07130 } 07131 07132 /* $B%?%$%`%9%?%s%W$rI|85(B */ 07133 if(preserve_time_f){ 07134 #if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__) && !defined(__WATCOMC__) && !defined(__EMX__) && !defined(__OS2__) && !defined(__DJGPP__) 07135 tb[0] = tb[1] = 
sb.st_mtime; 07136 if (utime(outfname, tb)) { 07137 fprintf(stderr, "Can't set timestamp %s\n", outfname); 07138 } 07139 #else 07140 tb.actime = sb.st_atime; 07141 tb.modtime = sb.st_mtime; 07142 if (utime(outfname, &tb)) { 07143 fprintf(stderr, "Can't set timestamp %s\n", outfname); 07144 } 07145 #endif 07146 } 07147 if(backup_f){ 07148 char *backup_filename = get_backup_filename(backup_suffix, origfname); 07149 #ifdef MSDOS 07150 unlink(backup_filename); 07151 #endif 07152 if (rename(origfname, backup_filename)) { 07153 perror(backup_filename); 07154 fprintf(stderr, "Can't rename %s to %s\n", 07155 origfname, backup_filename); 07156 } 07157 nkf_xfree(backup_filename); 07158 }else{ 07159 #ifdef MSDOS 07160 if (unlink(origfname)){ 07161 perror(origfname); 07162 } 07163 #endif 07164 } 07165 if (rename(outfname, origfname)) { 07166 perror(origfname); 07167 fprintf(stderr, "Can't rename %s to %s\n", 07168 outfname, origfname); 07169 } 07170 nkf_xfree(outfname); 07171 } 07172 #endif 07173 } 07174 } 07175 if (is_argument_error) 07176 return(-1); 07177 } 07178 #ifdef EASYWIN /*Easy Win */ 07179 if (file_out_f == FALSE) 07180 scanf("%d",&end_check); 07181 else 07182 fclose(stdout); 07183 #else /* for Other OS */ 07184 if (file_out_f == TRUE) 07185 fclose(stdout); 07186 #endif /*Easy Win */ 07187 return (0); 07188 } 07189 #endif /* WIN32DLL */ 07190