/* Ruby 2.0.0p247 (2013-06-27 revision 41674) */
00001 00002 #include "yaml_private.h" 00003 00004 /* 00005 * Declarations. 00006 */ 00007 00008 static int 00009 yaml_parser_set_reader_error(yaml_parser_t *parser, const char *problem, 00010 size_t offset, int value); 00011 00012 static int 00013 yaml_parser_update_raw_buffer(yaml_parser_t *parser); 00014 00015 static int 00016 yaml_parser_determine_encoding(yaml_parser_t *parser); 00017 00018 YAML_DECLARE(int) 00019 yaml_parser_update_buffer(yaml_parser_t *parser, size_t length); 00020 00021 /* 00022 * Set the reader error and return 0. 00023 */ 00024 00025 static int 00026 yaml_parser_set_reader_error(yaml_parser_t *parser, const char *problem, 00027 size_t offset, int value) 00028 { 00029 parser->error = YAML_READER_ERROR; 00030 parser->problem = problem; 00031 parser->problem_offset = offset; 00032 parser->problem_value = value; 00033 00034 return 0; 00035 } 00036 00037 /* 00038 * Byte order marks. 00039 */ 00040 00041 #define BOM_UTF8 "\xef\xbb\xbf" 00042 #define BOM_UTF16LE "\xff\xfe" 00043 #define BOM_UTF16BE "\xfe\xff" 00044 00045 /* 00046 * Determine the input stream encoding by checking the BOM symbol. If no BOM is 00047 * found, the UTF-8 encoding is assumed. Return 1 on success, 0 on failure. 00048 */ 00049 00050 static int 00051 yaml_parser_determine_encoding(yaml_parser_t *parser) 00052 { 00053 /* Ensure that we had enough bytes in the raw buffer. */ 00054 00055 while (!parser->eof 00056 && parser->raw_buffer.last - parser->raw_buffer.pointer < 3) { 00057 if (!yaml_parser_update_raw_buffer(parser)) { 00058 return 0; 00059 } 00060 } 00061 00062 /* Determine the encoding. 
*/ 00063 00064 if (parser->raw_buffer.last - parser->raw_buffer.pointer >= 2 00065 && !memcmp(parser->raw_buffer.pointer, BOM_UTF16LE, 2)) { 00066 parser->encoding = YAML_UTF16LE_ENCODING; 00067 parser->raw_buffer.pointer += 2; 00068 parser->offset += 2; 00069 } 00070 else if (parser->raw_buffer.last - parser->raw_buffer.pointer >= 2 00071 && !memcmp(parser->raw_buffer.pointer, BOM_UTF16BE, 2)) { 00072 parser->encoding = YAML_UTF16BE_ENCODING; 00073 parser->raw_buffer.pointer += 2; 00074 parser->offset += 2; 00075 } 00076 else if (parser->raw_buffer.last - parser->raw_buffer.pointer >= 3 00077 && !memcmp(parser->raw_buffer.pointer, BOM_UTF8, 3)) { 00078 parser->encoding = YAML_UTF8_ENCODING; 00079 parser->raw_buffer.pointer += 3; 00080 parser->offset += 3; 00081 } 00082 else { 00083 parser->encoding = YAML_UTF8_ENCODING; 00084 } 00085 00086 return 1; 00087 } 00088 00089 /* 00090 * Update the raw buffer. 00091 */ 00092 00093 static int 00094 yaml_parser_update_raw_buffer(yaml_parser_t *parser) 00095 { 00096 size_t size_read = 0; 00097 00098 /* Return if the raw buffer is full. */ 00099 00100 if (parser->raw_buffer.start == parser->raw_buffer.pointer 00101 && parser->raw_buffer.last == parser->raw_buffer.end) 00102 return 1; 00103 00104 /* Return on EOF. */ 00105 00106 if (parser->eof) return 1; 00107 00108 /* Move the remaining bytes in the raw buffer to the beginning. */ 00109 00110 if (parser->raw_buffer.start < parser->raw_buffer.pointer 00111 && parser->raw_buffer.pointer < parser->raw_buffer.last) { 00112 memmove(parser->raw_buffer.start, parser->raw_buffer.pointer, 00113 parser->raw_buffer.last - parser->raw_buffer.pointer); 00114 } 00115 parser->raw_buffer.last -= 00116 parser->raw_buffer.pointer - parser->raw_buffer.start; 00117 parser->raw_buffer.pointer = parser->raw_buffer.start; 00118 00119 /* Call the read handler to fill the buffer. 
*/ 00120 00121 if (!parser->read_handler(parser->read_handler_data, parser->raw_buffer.last, 00122 parser->raw_buffer.end - parser->raw_buffer.last, &size_read)) { 00123 return yaml_parser_set_reader_error(parser, "input error", 00124 parser->offset, -1); 00125 } 00126 parser->raw_buffer.last += size_read; 00127 if (!size_read) { 00128 parser->eof = 1; 00129 } 00130 00131 return 1; 00132 } 00133 00134 /* 00135 * Ensure that the buffer contains at least `length` characters. 00136 * Return 1 on success, 0 on failure. 00137 * 00138 * The length is supposed to be significantly less that the buffer size. 00139 */ 00140 00141 YAML_DECLARE(int) 00142 yaml_parser_update_buffer(yaml_parser_t *parser, size_t length) 00143 { 00144 int first = 1; 00145 00146 assert(parser->read_handler); /* Read handler must be set. */ 00147 00148 /* If the EOF flag is set and the raw buffer is empty, do nothing. */ 00149 00150 if (parser->eof && parser->raw_buffer.pointer == parser->raw_buffer.last) 00151 return 1; 00152 00153 /* Return if the buffer contains enough characters. */ 00154 00155 if (parser->unread >= length) 00156 return 1; 00157 00158 /* Determine the input encoding if it is not known yet. */ 00159 00160 if (!parser->encoding) { 00161 if (!yaml_parser_determine_encoding(parser)) 00162 return 0; 00163 } 00164 00165 /* Move the unread characters to the beginning of the buffer. */ 00166 00167 if (parser->buffer.start < parser->buffer.pointer 00168 && parser->buffer.pointer < parser->buffer.last) { 00169 size_t size = parser->buffer.last - parser->buffer.pointer; 00170 memmove(parser->buffer.start, parser->buffer.pointer, size); 00171 parser->buffer.pointer = parser->buffer.start; 00172 parser->buffer.last = parser->buffer.start + size; 00173 } 00174 else if (parser->buffer.pointer == parser->buffer.last) { 00175 parser->buffer.pointer = parser->buffer.start; 00176 parser->buffer.last = parser->buffer.start; 00177 } 00178 00179 /* Fill the buffer until it has enough characters. 
*/ 00180 00181 while (parser->unread < length) 00182 { 00183 /* Fill the raw buffer if necessary. */ 00184 00185 if (!first || parser->raw_buffer.pointer == parser->raw_buffer.last) { 00186 if (!yaml_parser_update_raw_buffer(parser)) return 0; 00187 } 00188 first = 0; 00189 00190 /* Decode the raw buffer. */ 00191 00192 while (parser->raw_buffer.pointer != parser->raw_buffer.last) 00193 { 00194 unsigned int value = 0, value2 = 0; 00195 int incomplete = 0; 00196 unsigned char octet; 00197 unsigned int width = 0; 00198 int low, high; 00199 size_t k; 00200 size_t raw_unread = parser->raw_buffer.last - parser->raw_buffer.pointer; 00201 00202 /* Decode the next character. */ 00203 00204 switch (parser->encoding) 00205 { 00206 case YAML_UTF8_ENCODING: 00207 00208 /* 00209 * Decode a UTF-8 character. Check RFC 3629 00210 * (http://www.ietf.org/rfc/rfc3629.txt) for more details. 00211 * 00212 * The following table (taken from the RFC) is used for 00213 * decoding. 00214 * 00215 * Char. number range | UTF-8 octet sequence 00216 * (hexadecimal) | (binary) 00217 * --------------------+------------------------------------ 00218 * 0000 0000-0000 007F | 0xxxxxxx 00219 * 0000 0080-0000 07FF | 110xxxxx 10xxxxxx 00220 * 0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx 00221 * 0001 0000-0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 00222 * 00223 * Additionally, the characters in the range 0xD800-0xDFFF 00224 * are prohibited as they are reserved for use with UTF-16 00225 * surrogate pairs. 00226 */ 00227 00228 /* Determine the length of the UTF-8 sequence. */ 00229 00230 octet = parser->raw_buffer.pointer[0]; 00231 width = (octet & 0x80) == 0x00 ? 1 : 00232 (octet & 0xE0) == 0xC0 ? 2 : 00233 (octet & 0xF0) == 0xE0 ? 3 : 00234 (octet & 0xF8) == 0xF0 ? 4 : 0; 00235 00236 /* Check if the leading octet is valid. 
*/ 00237 00238 if (!width) 00239 return yaml_parser_set_reader_error(parser, 00240 "invalid leading UTF-8 octet", 00241 parser->offset, octet); 00242 00243 /* Check if the raw buffer contains an incomplete character. */ 00244 00245 if (width > raw_unread) { 00246 if (parser->eof) { 00247 return yaml_parser_set_reader_error(parser, 00248 "incomplete UTF-8 octet sequence", 00249 parser->offset, -1); 00250 } 00251 incomplete = 1; 00252 break; 00253 } 00254 00255 /* Decode the leading octet. */ 00256 00257 value = (octet & 0x80) == 0x00 ? octet & 0x7F : 00258 (octet & 0xE0) == 0xC0 ? octet & 0x1F : 00259 (octet & 0xF0) == 0xE0 ? octet & 0x0F : 00260 (octet & 0xF8) == 0xF0 ? octet & 0x07 : 0; 00261 00262 /* Check and decode the trailing octets. */ 00263 00264 for (k = 1; k < width; k ++) 00265 { 00266 octet = parser->raw_buffer.pointer[k]; 00267 00268 /* Check if the octet is valid. */ 00269 00270 if ((octet & 0xC0) != 0x80) 00271 return yaml_parser_set_reader_error(parser, 00272 "invalid trailing UTF-8 octet", 00273 parser->offset+k, octet); 00274 00275 /* Decode the octet. */ 00276 00277 value = (value << 6) + (octet & 0x3F); 00278 } 00279 00280 /* Check the length of the sequence against the value. */ 00281 00282 if (!((width == 1) || 00283 (width == 2 && value >= 0x80) || 00284 (width == 3 && value >= 0x800) || 00285 (width == 4 && value >= 0x10000))) 00286 return yaml_parser_set_reader_error(parser, 00287 "invalid length of a UTF-8 sequence", 00288 parser->offset, -1); 00289 00290 /* Check the range of the value. */ 00291 00292 if ((value >= 0xD800 && value <= 0xDFFF) || value > 0x10FFFF) 00293 return yaml_parser_set_reader_error(parser, 00294 "invalid Unicode character", 00295 parser->offset, value); 00296 00297 break; 00298 00299 case YAML_UTF16LE_ENCODING: 00300 case YAML_UTF16BE_ENCODING: 00301 00302 low = (parser->encoding == YAML_UTF16LE_ENCODING ? 0 : 1); 00303 high = (parser->encoding == YAML_UTF16LE_ENCODING ? 
1 : 0); 00304 00305 /* 00306 * The UTF-16 encoding is not as simple as one might 00307 * naively think. Check RFC 2781 00308 * (http://www.ietf.org/rfc/rfc2781.txt). 00309 * 00310 * Normally, two subsequent bytes describe a Unicode 00311 * character. However a special technique (called a 00312 * surrogate pair) is used for specifying character 00313 * values larger than 0xFFFF. 00314 * 00315 * A surrogate pair consists of two pseudo-characters: 00316 * high surrogate area (0xD800-0xDBFF) 00317 * low surrogate area (0xDC00-0xDFFF) 00318 * 00319 * The following formulas are used for decoding 00320 * and encoding characters using surrogate pairs: 00321 * 00322 * U = U' + 0x10000 (0x01 00 00 <= U <= 0x10 FF FF) 00323 * U' = yyyyyyyyyyxxxxxxxxxx (0 <= U' <= 0x0F FF FF) 00324 * W1 = 110110yyyyyyyyyy 00325 * W2 = 110111xxxxxxxxxx 00326 * 00327 * where U is the character value, W1 is the high surrogate 00328 * area, W2 is the low surrogate area. 00329 */ 00330 00331 /* Check for incomplete UTF-16 character. */ 00332 00333 if (raw_unread < 2) { 00334 if (parser->eof) { 00335 return yaml_parser_set_reader_error(parser, 00336 "incomplete UTF-16 character", 00337 parser->offset, -1); 00338 } 00339 incomplete = 1; 00340 break; 00341 } 00342 00343 /* Get the character. */ 00344 00345 value = parser->raw_buffer.pointer[low] 00346 + (parser->raw_buffer.pointer[high] << 8); 00347 00348 /* Check for unexpected low surrogate area. */ 00349 00350 if ((value & 0xFC00) == 0xDC00) 00351 return yaml_parser_set_reader_error(parser, 00352 "unexpected low surrogate area", 00353 parser->offset, value); 00354 00355 /* Check for a high surrogate area. */ 00356 00357 if ((value & 0xFC00) == 0xD800) { 00358 00359 width = 4; 00360 00361 /* Check for incomplete surrogate pair. 
*/ 00362 00363 if (raw_unread < 4) { 00364 if (parser->eof) { 00365 return yaml_parser_set_reader_error(parser, 00366 "incomplete UTF-16 surrogate pair", 00367 parser->offset, -1); 00368 } 00369 incomplete = 1; 00370 break; 00371 } 00372 00373 /* Get the next character. */ 00374 00375 value2 = parser->raw_buffer.pointer[low+2] 00376 + (parser->raw_buffer.pointer[high+2] << 8); 00377 00378 /* Check for a low surrogate area. */ 00379 00380 if ((value2 & 0xFC00) != 0xDC00) 00381 return yaml_parser_set_reader_error(parser, 00382 "expected low surrogate area", 00383 parser->offset+2, value2); 00384 00385 /* Generate the value of the surrogate pair. */ 00386 00387 value = 0x10000 + ((value & 0x3FF) << 10) + (value2 & 0x3FF); 00388 } 00389 00390 else { 00391 width = 2; 00392 } 00393 00394 break; 00395 00396 default: 00397 assert(1); /* Impossible. */ 00398 } 00399 00400 /* Check if the raw buffer contains enough bytes to form a character. */ 00401 00402 if (incomplete) break; 00403 00404 /* 00405 * Check if the character is in the allowed range: 00406 * #x9 | #xA | #xD | [#x20-#x7E] (8 bit) 00407 * | #x85 | [#xA0-#xD7FF] | [#xE000-#xFFFD] (16 bit) 00408 * | [#x10000-#x10FFFF] (32 bit) 00409 */ 00410 00411 if (! (value == 0x09 || value == 0x0A || value == 0x0D 00412 || (value >= 0x20 && value <= 0x7E) 00413 || (value == 0x85) || (value >= 0xA0 && value <= 0xD7FF) 00414 || (value >= 0xE000 && value <= 0xFFFD) 00415 || (value >= 0x10000 && value <= 0x10FFFF))) 00416 return yaml_parser_set_reader_error(parser, 00417 "control characters are not allowed", 00418 parser->offset, value); 00419 00420 /* Move the raw pointers. */ 00421 00422 parser->raw_buffer.pointer += width; 00423 parser->offset += width; 00424 00425 /* Finally put the character into the buffer. 
*/ 00426 00427 /* 0000 0000-0000 007F -> 0xxxxxxx */ 00428 if (value <= 0x7F) { 00429 *(parser->buffer.last++) = value; 00430 } 00431 /* 0000 0080-0000 07FF -> 110xxxxx 10xxxxxx */ 00432 else if (value <= 0x7FF) { 00433 *(parser->buffer.last++) = 0xC0 + (value >> 6); 00434 *(parser->buffer.last++) = 0x80 + (value & 0x3F); 00435 } 00436 /* 0000 0800-0000 FFFF -> 1110xxxx 10xxxxxx 10xxxxxx */ 00437 else if (value <= 0xFFFF) { 00438 *(parser->buffer.last++) = 0xE0 + (value >> 12); 00439 *(parser->buffer.last++) = 0x80 + ((value >> 6) & 0x3F); 00440 *(parser->buffer.last++) = 0x80 + (value & 0x3F); 00441 } 00442 /* 0001 0000-0010 FFFF -> 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */ 00443 else { 00444 *(parser->buffer.last++) = 0xF0 + (value >> 18); 00445 *(parser->buffer.last++) = 0x80 + ((value >> 12) & 0x3F); 00446 *(parser->buffer.last++) = 0x80 + ((value >> 6) & 0x3F); 00447 *(parser->buffer.last++) = 0x80 + (value & 0x3F); 00448 } 00449 00450 parser->unread ++; 00451 } 00452 00453 /* On EOF, put NUL into the buffer and return. */ 00454 00455 if (parser->eof) { 00456 *(parser->buffer.last++) = '\0'; 00457 parser->unread ++; 00458 return 1; 00459 } 00460 00461 } 00462 00463 return 1; 00464 } 00465 00466