Ruby
2.0.0p247(2013-06-27revision41674)
|
00001 /* 00002 * NKF - Ruby extension for Network Kanji Filter 00003 * 00004 * original nkf2.x is maintained at http://sourceforge.jp/projects/nkf/ 00005 * 00006 * $Id: nkf.c 36966 2012-09-14 16:07:49Z naruse $ 00007 * 00008 */ 00009 00010 #define RUBY_NKF_REVISION "$Revision: 36966 $" 00011 #define RUBY_NKF_VERSION NKF_VERSION " (" NKF_RELEASE_DATE ")" 00012 00013 #include "ruby/ruby.h" 00014 #include "ruby/encoding.h" 00015 00016 /* Replace nkf's getchar/putchar for variable modification */ 00017 /* we never use getc, ungetc */ 00018 00019 #undef getc 00020 #undef ungetc 00021 #define getc(f) (input_ctr>=i_len?-1:input[input_ctr++]) 00022 #define ungetc(c,f) input_ctr-- 00023 00024 #define INCSIZE 32 00025 #undef putchar 00026 #undef TRUE 00027 #undef FALSE 00028 #define putchar(c) rb_nkf_putchar(c) 00029 00030 /* Input/Output pointers */ 00031 00032 static unsigned char *output; 00033 static unsigned char *input; 00034 static int input_ctr; 00035 static int i_len; 00036 static int output_ctr; 00037 static int o_len; 00038 static int incsize; 00039 00040 static VALUE result; 00041 00042 static int 00043 rb_nkf_putchar(unsigned int c) 00044 { 00045 if (output_ctr >= o_len) { 00046 o_len += incsize; 00047 rb_str_resize(result, o_len); 00048 incsize *= 2; 00049 output = (unsigned char *)RSTRING_PTR(result); 00050 } 00051 output[output_ctr++] = c; 00052 00053 return c; 00054 } 00055 00056 /* Include kanji filter main part */ 00057 /* getchar and putchar will be replaced during inclusion */ 00058 00059 #define PERL_XS 1 00060 #include "nkf-utf8/config.h" 00061 #include "nkf-utf8/utf8tbl.c" 00062 #include "nkf-utf8/nkf.c" 00063 00064 rb_encoding* rb_nkf_enc_get(const char *name) 00065 { 00066 int idx = rb_enc_find_index(name); 00067 if (idx < 0) { 00068 nkf_encoding *nkf_enc = nkf_enc_find(name); 00069 idx = rb_enc_find_index(nkf_enc_name(nkf_enc_to_base_encoding(nkf_enc))); 00070 if (idx < 0) { 00071 idx = rb_define_dummy_encoding(name); 00072 } 00073 } 00074 return 
rb_enc_from_index(idx); 00075 } 00076 00077 int nkf_split_options(const char *arg) 00078 { 00079 int count = 0; 00080 unsigned char option[256]; 00081 int i = 0, j = 0; 00082 int is_escaped = FALSE; 00083 int is_single_quoted = FALSE; 00084 int is_double_quoted = FALSE; 00085 for(i = 0; arg[i]; i++){ 00086 if(j == 255){ 00087 return -1; 00088 }else if(is_single_quoted){ 00089 if(arg[i] == '\''){ 00090 is_single_quoted = FALSE; 00091 }else{ 00092 option[j++] = arg[i]; 00093 } 00094 }else if(is_escaped){ 00095 is_escaped = FALSE; 00096 option[j++] = arg[i]; 00097 }else if(arg[i] == '\\'){ 00098 is_escaped = TRUE; 00099 }else if(is_double_quoted){ 00100 if(arg[i] == '"'){ 00101 is_double_quoted = FALSE; 00102 }else{ 00103 option[j++] = arg[i]; 00104 } 00105 }else if(arg[i] == '\''){ 00106 is_single_quoted = TRUE; 00107 }else if(arg[i] == '"'){ 00108 is_double_quoted = TRUE; 00109 }else if(arg[i] == ' '){ 00110 option[j] = '\0'; 00111 options(option); 00112 j = 0; 00113 }else{ 00114 option[j++] = arg[i]; 00115 } 00116 } 00117 if(j){ 00118 option[j] = '\0'; 00119 options(option); 00120 } 00121 return count; 00122 } 00123 00124 /* 00125 * call-seq: 00126 * NKF.nkf(opt, str) => string 00127 * 00128 * Convert _str_ and return converted result. 00129 * Conversion details are specified by _opt_ as String. 
00130 * 00131 * require 'nkf' 00132 * output = NKF.nkf("-s", input) 00133 */ 00134 00135 static VALUE 00136 rb_nkf_convert(VALUE obj, VALUE opt, VALUE src) 00137 { 00138 VALUE tmp; 00139 reinit(); 00140 StringValue(opt); 00141 nkf_split_options(RSTRING_PTR(opt)); 00142 if (!output_encoding) rb_raise(rb_eArgError, "no output encoding given"); 00143 00144 switch (nkf_enc_to_index(output_encoding)) { 00145 case UTF_8_BOM: output_encoding = nkf_enc_from_index(UTF_8); break; 00146 case UTF_16BE_BOM: output_encoding = nkf_enc_from_index(UTF_16BE); break; 00147 case UTF_16LE_BOM: output_encoding = nkf_enc_from_index(UTF_16LE); break; 00148 case UTF_32BE_BOM: output_encoding = nkf_enc_from_index(UTF_32BE); break; 00149 case UTF_32LE_BOM: output_encoding = nkf_enc_from_index(UTF_32LE); break; 00150 } 00151 output_bom_f = FALSE; 00152 00153 incsize = INCSIZE; 00154 00155 input_ctr = 0; 00156 StringValue(src); 00157 input = (unsigned char *)RSTRING_PTR(src); 00158 i_len = RSTRING_LENINT(src); 00159 tmp = rb_str_new(0, i_len*3 + 10); 00160 00161 output_ctr = 0; 00162 output = (unsigned char *)RSTRING_PTR(tmp); 00163 o_len = RSTRING_LENINT(tmp); 00164 *output = '\0'; 00165 00166 /* use _result_ begin*/ 00167 result = tmp; 00168 kanji_convert(NULL); 00169 result = Qnil; 00170 /* use _result_ end */ 00171 00172 rb_str_set_len(tmp, output_ctr); 00173 OBJ_INFECT(tmp, src); 00174 00175 if (mimeout_f) 00176 rb_enc_associate(tmp, rb_usascii_encoding()); 00177 else 00178 rb_enc_associate(tmp, rb_nkf_enc_get(nkf_enc_name(output_encoding))); 00179 00180 return tmp; 00181 } 00182 00183 00184 /* 00185 * call-seq: 00186 * NKF.guess(str) => encoding 00187 * 00188 * Returns guessed encoding of _str_ by nkf routine. 
00189 * 00190 */ 00191 00192 static VALUE 00193 rb_nkf_guess(VALUE obj, VALUE src) 00194 { 00195 reinit(); 00196 00197 input_ctr = 0; 00198 StringValue(src); 00199 input = (unsigned char *)RSTRING_PTR(src); 00200 i_len = RSTRING_LENINT(src); 00201 00202 guess_f = TRUE; 00203 kanji_convert( NULL ); 00204 guess_f = FALSE; 00205 00206 return rb_enc_from_encoding(rb_nkf_enc_get(get_guessed_code())); 00207 } 00208 00209 00210 /* 00211 * NKF - Ruby extension for Network Kanji Filter 00212 * 00213 * == Description 00214 * 00215 * This is a Ruby Extension version of nkf (Network Kanji Filter). 00216 * It converts the first argument and returns converted result. Conversion 00217 * details are specified by flags as the first argument. 00218 * 00219 * *Nkf* is a yet another kanji code converter among networks, hosts and terminals. 00220 * It converts input kanji code to designated kanji code 00221 * such as ISO-2022-JP, Shift_JIS, EUC-JP, UTF-8 or UTF-16. 00222 * 00223 * One of the most unique faculty of *nkf* is the guess of the input kanji encodings. 00224 * It currently recognizes ISO-2022-JP, Shift_JIS, EUC-JP, UTF-8 and UTF-16. 00225 * So users needn't set the input kanji code explicitly. 00226 * 00227 * By default, X0201 kana is converted into X0208 kana. 00228 * For X0201 kana, SO/SI, SSO and ESC-(-I methods are supported. 00229 * For automatic code detection, nkf assumes no X0201 kana in Shift_JIS. 00230 * To accept X0201 in Shift_JIS, use <b>-X</b>, <b>-x</b> or <b>-S</b>. 00231 * 00232 * == Flags 00233 * 00234 * === -b -u 00235 * 00236 * Output is buffered (DEFAULT), Output is unbuffered. 00237 * 00238 * === -j -s -e -w -w16 -w32 00239 * 00240 * Output code is ISO-2022-JP (7bit JIS), Shift_JIS, EUC-JP, 00241 * UTF-8N, UTF-16BE, UTF-32BE. 00242 * Without this option and compile option, ISO-2022-JP is assumed. 00243 * 00244 * === -J -S -E -W -W16 -W32 00245 * 00246 * Input assumption is JIS 7 bit, Shift_JIS, EUC-JP, 00247 * UTF-8, UTF-16, UTF-32. 
00248 * 00249 * ==== -J 00250 * 00251 * Assume JIS input. It also accepts EUC-JP. 00252 * This is the default. This flag does not exclude Shift_JIS. 00253 * 00254 * ==== -S 00255 * 00256 * Assume Shift_JIS and X0201 kana input. It also accepts JIS. 00257 * EUC-JP is recognized as X0201 kana. Without <b>-x</b> flag, 00258 * X0201 kana (halfwidth kana) is converted into X0208. 00259 * 00260 * ==== -E 00261 * 00262 * Assume EUC-JP input. It also accepts JIS. 00263 * Same as -J. 00264 * 00265 * === -t 00266 * 00267 * No conversion. 00268 * 00269 * === -i_ 00270 * 00271 * Output sequence to designate JIS-kanji. (DEFAULT B) 00272 * 00273 * === -o_ 00274 * 00275 * Output sequence to designate ASCII. (DEFAULT B) 00276 * 00277 * === -r 00278 * 00279 * {de/en}crypt ROT13/47 00280 * 00281 * === -h[123] --hiragana --katakana --katakana-hiragana 00282 * 00283 * [-h1 --hiragana] Katakana to Hiragana conversion. 00284 * 00285 * [-h2 --katakana] Hiragana to Katakana conversion. 00286 * 00287 * [-h3 --katakana-hiragana] Katakana to Hiragana and Hiragana to Katakana conversion. 00288 * 00289 * === -T 00290 * 00291 * Text mode output (MS-DOS) 00292 * 00293 * === -l 00294 * 00295 * ISO8859-1 (Latin-1) support 00296 * 00297 * === -f[<code>m</code> [- <code>n</code>]] 00298 * 00299 * Folding on <code>m</code> length with <code>n</code> margin in a line. 00300 * Without this option, fold length is 60 and fold margin is 10. 00301 * 00302 * === -F 00303 * 00304 * New line preserving line folding. 00305 * 00306 * === -Z[0-3] 00307 * 00308 * Convert X0208 alphabet (Fullwidth Alphabets) to ASCII. 00309 * 00310 * [-Z -Z0] Convert X0208 alphabet to ASCII. 00311 * 00312 * [-Z1] Converts X0208 kankaku to single ASCII space. 00313 * 00314 * [-Z2] Converts X0208 kankaku to double ASCII spaces. 00315 * 00316 * [-Z3] Replacing Fullwidth >, <, ", & into '>', '<', '"', '&' as in HTML. 00317 * 00318 * === -X -x 00319 * 00320 * Assume X0201 kana in MS-Kanji. 
00321 * With <b>-X</b> or without this option, X0201 is converted into X0208 Kana. 00322 * With <b>-x</b>, try to preserve X0201 kana and do not convert X0201 kana to X0208. 00323 * In JIS output, ESC-(-I is used. In EUC output, SSO is used. 00324 * 00325 * === -B[0-2] 00326 * 00327 * Assume broken JIS-Kanji input, which lost ESC. 00328 * Useful when your site is using old B-News Nihongo patch. 00329 * 00330 * [-B1] allows any char after ESC-( or ESC-$. 00331 * 00332 * [-B2] forces ASCII after NL. 00333 * 00334 * === -I 00335 * 00336 * Replacing non iso-2022-jp char into a geta character 00337 * (substitute character in Japanese). 00338 * 00339 * === -d -c 00340 * 00341 * Delete \r in line feed, Add \r in line feed. 00342 * 00343 * === -m[BQN0] 00344 * 00345 * MIME ISO-2022-JP/ISO8859-1 decode. (DEFAULT) 00346 * To see ISO8859-1 (Latin-1) -l is necessary. 00347 * 00348 * [-mB] Decode MIME base64 encoded stream. Remove header or other part before 00349 * conversion. 00350 * 00351 * [-mQ] Decode MIME quoted stream. '_' in quoted stream is converted to space. 00352 * 00353 * [-mN] Non-strict decoding. 00354 * It allows line break in the middle of the base64 encoding. 00355 * 00356 * [-m0] No MIME decode. 00357 * 00358 * === -M 00359 * 00360 * MIME encode. Header style. All ASCII code and control characters are intact. 00361 * Kanji conversion is performed before encoding, so this cannot be used as a picture encoder. 00362 * 00363 * [-MB] MIME encode Base64 stream. 00364 * 00365 * [-MQ] Perform quoted encoding. 00366 * 00367 * === -l 00368 * 00369 * Input and output code is ISO8859-1 (Latin-1) and ISO-2022-JP. 00370 * <b>-s</b>, <b>-e</b> and <b>-x</b> are not compatible with this option. 00371 * 00372 * === -L[uwm] 00373 * 00374 * new line mode 00375 * Without this option, nkf doesn't convert line breaks. 
00376 * 00377 * [-Lu] unix (LF) 00378 * 00379 * [-Lw] windows (CRLF) 00380 * 00381 * [-Lm] mac (CR) 00382 * 00383 * === --fj --unix --mac --msdos --windows 00384 * 00385 * convert for these systems 00386 * 00387 * === --jis --euc --sjis --mime --base64 00388 * 00389 * convert for named code 00390 * 00391 * === --jis-input --euc-input --sjis-input --mime-input --base64-input 00392 * 00393 * assume input system 00394 * 00395 * === --ic=<code>input codeset</code> --oc=<code>output codeset</code> 00396 * 00397 * Set the input or output codeset. 00398 * NKF supports following codesets and those codeset names are case insensitive. 00399 * 00400 * [ISO-2022-JP] a.k.a. RFC1468, 7bit JIS, JUNET 00401 * 00402 * [EUC-JP (eucJP-nkf)] a.k.a. AT&T JIS, Japanese EUC, UJIS 00403 * 00404 * [eucJP-ascii] a.k.a. x-eucjp-open-19970715-ascii 00405 * 00406 * [eucJP-ms] a.k.a. x-eucjp-open-19970715-ms 00407 * 00408 * [CP51932] Microsoft Version of EUC-JP. 00409 * 00410 * [Shift_JIS] SJIS, MS-Kanji 00411 * 00412 * [Windows-31J] a.k.a. CP932 00413 * 00414 * [UTF-8] same as UTF-8N 00415 * 00416 * [UTF-8N] UTF-8 without BOM 00417 * 00418 * [UTF-8-BOM] UTF-8 with BOM 00419 * 00420 * [UTF-16] same as UTF-16BE 00421 * 00422 * [UTF-16BE] UTF-16 Big Endian without BOM 00423 * 00424 * [UTF-16BE-BOM] UTF-16 Big Endian with BOM 00425 * 00426 * [UTF-16LE] UTF-16 Little Endian without BOM 00427 * 00428 * [UTF-16LE-BOM] UTF-16 Little Endian with BOM 00429 * 00430 * [UTF-32] same as UTF-32BE 00431 * 00432 * [UTF-32BE] UTF-32 Big Endian without BOM 00433 * 00434 * [UTF-32BE-BOM] UTF-32 Big Endian with BOM 00435 * 00436 * [UTF-32LE] UTF-32 Little Endian without BOM 00437 * 00438 * [UTF-32LE-BOM] UTF-32 Little Endian with BOM 00439 * 00440 * [UTF8-MAC] NFDed UTF-8, a.k.a. UTF8-NFD (input only) 00441 * 00442 * === --fb-{skip, html, xml, perl, java, subchar} 00443 * 00444 * Specify the way that nkf handles unassigned characters. 00445 * Without this option, --fb-skip is assumed. 
00446 * 00447 * === --prefix= <code>escape character</code> <code>target character</code> .. 00448 * 00449 * When nkf converts to Shift_JIS, 00450 * nkf adds a specified escape character to specified 2nd byte of Shift_JIS characters. 00451 * 1st byte of argument is the escape character and following bytes are target characters. 00452 * 00453 * === --no-cp932ext 00454 * 00455 * Handle the characters extended in CP932 as unassigned characters. 00456 * 00457 * === --no-best-fit-chars 00458 * 00459 * When Unicode to Encoded byte conversion, 00460 * don't convert characters which are not round trip safe. 00461 * When Unicode to Unicode conversion, 00462 * with this and -x option, nkf can be used as UTF converter. 00463 * (In other words, without this and -x option, nkf doesn't save some characters) 00464 * 00465 * When nkf converts a string which is related to a path, you should use this option. 00466 * 00467 * === --cap-input 00468 * 00469 * Decode hex encoded characters. 00470 * 00471 * === --url-input 00472 * 00473 * Unescape percent escaped characters. 00474 * 00475 * === -- 00476 * 00477 * Ignore rest of -option. 
00478 */ 00479 00480 void 00481 Init_nkf() 00482 { 00483 VALUE mNKF = rb_define_module("NKF"); 00484 00485 rb_define_module_function(mNKF, "nkf", rb_nkf_convert, 2); 00486 rb_define_module_function(mNKF, "guess", rb_nkf_guess, 1); 00487 rb_define_alias(rb_singleton_class(mNKF), "guess", "guess"); 00488 00489 rb_define_const(mNKF, "AUTO", Qnil); 00490 rb_define_const(mNKF, "NOCONV", Qnil); 00491 rb_define_const(mNKF, "UNKNOWN", Qnil); 00492 rb_define_const(mNKF, "BINARY", rb_enc_from_encoding(rb_nkf_enc_get("BINARY"))); 00493 rb_define_const(mNKF, "ASCII", rb_enc_from_encoding(rb_nkf_enc_get("US-ASCII"))); 00494 rb_define_const(mNKF, "JIS", rb_enc_from_encoding(rb_nkf_enc_get("ISO-2022-JP"))); 00495 rb_define_const(mNKF, "EUC", rb_enc_from_encoding(rb_nkf_enc_get("EUC-JP"))); 00496 rb_define_const(mNKF, "SJIS", rb_enc_from_encoding(rb_nkf_enc_get("Shift_JIS"))); 00497 rb_define_const(mNKF, "UTF8", rb_enc_from_encoding(rb_utf8_encoding())); 00498 rb_define_const(mNKF, "UTF16", rb_enc_from_encoding(rb_nkf_enc_get("UTF-16BE"))); 00499 rb_define_const(mNKF, "UTF32", rb_enc_from_encoding(rb_nkf_enc_get("UTF-32BE"))); 00500 00501 /* Full version string of nkf */ 00502 rb_define_const(mNKF, "VERSION", rb_str_new2(RUBY_NKF_VERSION)); 00503 /* Version of nkf */ 00504 rb_define_const(mNKF, "NKF_VERSION", rb_str_new2(NKF_VERSION)); 00505 /* Release date of nkf */ 00506 rb_define_const(mNKF, "NKF_RELEASE_DATE", rb_str_new2(NKF_RELEASE_DATE)); 00507 } 00508