Ruby 3.0.5p211 (2022-11-24 revision ba5cf0f7c52d4d35cc6a173c89eda98ceffa2dcf)
transcode.c
Go to the documentation of this file.
1/**********************************************************************
2
3 transcode.c -
4
5 $Author$
6 created at: Tue Oct 30 16:10:22 JST 2007
7
8 Copyright (C) 2007 Martin Duerst
9
10**********************************************************************/
11
13
14#include <ctype.h>
15
16#include "internal.h"
17#include "internal/array.h"
18#include "internal/inits.h"
19#include "internal/object.h"
20#include "internal/string.h"
21#include "internal/transcode.h"
22#include "ruby/encoding.h"
23
24#include "transcode_data.h"
25#include "id.h"
26
27#define ENABLE_ECONV_NEWLINE_OPTION 1
28
29/* VALUE rb_cEncoding = rb_define_class("Encoding", rb_cObject); */
30static VALUE rb_eUndefinedConversionError;
31static VALUE rb_eInvalidByteSequenceError;
32static VALUE rb_eConverterNotFoundError;
33
35
36static VALUE sym_invalid, sym_undef, sym_replace, sym_fallback;
37static VALUE sym_xml, sym_text, sym_attr;
38static VALUE sym_universal_newline;
39static VALUE sym_crlf_newline;
40static VALUE sym_cr_newline;
41#ifdef ENABLE_ECONV_NEWLINE_OPTION
42static VALUE sym_newline, sym_universal, sym_crlf, sym_cr, sym_lf;
43#endif
44static VALUE sym_partial_input;
45
46static VALUE sym_invalid_byte_sequence;
47static VALUE sym_undefined_conversion;
48static VALUE sym_destination_buffer_full;
49static VALUE sym_source_buffer_empty;
50static VALUE sym_finished;
51static VALUE sym_after_output;
52static VALUE sym_incomplete_input;
53
54static unsigned char *
55allocate_converted_string(const char *sname, const char *dname,
56 const unsigned char *str, size_t len,
57 unsigned char *caller_dst_buf, size_t caller_dst_bufsize,
58 size_t *dst_len_ptr);
59
60/* dynamic structure, one per conversion (similar to iconv_t) */
61/* may carry conversion state (e.g. for iso-2022-jp) */
62typedef struct rb_transcoding {
64
65 int flags;
66
68 unsigned int next_table;
70 unsigned char next_byte;
71 unsigned int output_index;
72
73 ssize_t recognized_len; /* already interpreted */
74 ssize_t readagain_len; /* not yet interpreted */
75 union {
76 unsigned char ary[8]; /* max_input <= sizeof(ary) */
77 unsigned char *ptr; /* length: max_input */
78 } readbuf; /* recognized_len + readagain_len used */
79
80 ssize_t writebuf_off;
81 ssize_t writebuf_len;
82 union {
83 unsigned char ary[8]; /* max_output <= sizeof(ary) */
84 unsigned char *ptr; /* length: max_output */
86
87 union rb_transcoding_state_t { /* opaque data for stateful encoding */
88 void *ptr;
89 char ary[sizeof(double) > sizeof(void*) ? sizeof(double) : sizeof(void*)];
/* Read buffer of tc: the embedded array when the transcoder's max_input
 * fits into it, otherwise the separately referenced readbuf.ptr. */
93#define TRANSCODING_READBUF(tc) \
94 ((tc)->transcoder->max_input <= (int)sizeof((tc)->readbuf.ary) ? \
95 (tc)->readbuf.ary : \
96 (tc)->readbuf.ptr)
/* Write buffer of tc, selected by max_output in the same way. */
97#define TRANSCODING_WRITEBUF(tc) \
98 ((tc)->transcoder->max_output <= (int)sizeof((tc)->writebuf.ary) ? \
99 (tc)->writebuf.ary : \
100 (tc)->writebuf.ptr)
/* Size in bytes of the buffer TRANSCODING_WRITEBUF(tc) selects: the size of
 * the embedded array, or max_output when the pointer member is in use. */
101#define TRANSCODING_WRITEBUF_SIZE(tc) \
102 ((tc)->transcoder->max_output <= (int)sizeof((tc)->writebuf.ary) ? \
103 sizeof((tc)->writebuf.ary) : \
104 (size_t)(tc)->transcoder->max_output)
/* Size of the state union; a transcoder whose state_size exceeds it gets
 * separately allocated state (see rb_transcoding_open_by_transcoder). */
105#define TRANSCODING_STATE_EMBED_MAX ((int)sizeof(union rb_transcoding_state_t))
/* State storage of tc: embedded bytes when state_size fits into the union,
 * otherwise the state.ptr member. */
106#define TRANSCODING_STATE(tc) \
107 ((tc)->transcoder->state_size <= (int)sizeof((tc)->state) ? \
108 (tc)->state.ary : \
109 (tc)->state.ptr)
110
111typedef struct {
113 unsigned char *out_buf_start;
114 unsigned char *out_data_start;
115 unsigned char *out_data_end;
116 unsigned char *out_buf_end;
119
121 int flags;
122 int started; /* bool */
123
126
127 const unsigned char *replacement_str;
129 const char *replacement_enc;
130
131 unsigned char *in_buf_start;
132 unsigned char *in_data_start;
133 unsigned char *in_data_end;
134 unsigned char *in_buf_end;
136 int replacement_allocated; /* bool */
141
142 /* last error */
143 struct {
146 const char *source_encoding;
148 const unsigned char *error_bytes_start;
152
153 /* The following fields are only for Encoding::Converter.
154 * rb_econv_open sets them to NULL. */
157};
158
159/*
160 * Dispatch data and logic
161 */
162
163#define DECORATOR_P(sname, dname) (*(sname) == '\0')
164
165typedef struct {
166 const char *sname;
167 const char *dname;
168 const char *lib; /* null means no need to load a library */
171
172static st_table *transcoder_table;
173
/*
 * Return the entry for the sname -> dname conversion, adding it when absent.
 * transcoder_table is used as a two-level map: source name -> second-level
 * table, and in that table destination name -> entry.  A newly added entry
 * keeps the sname/dname pointers as given and starts with no library name
 * and no transcoder.
 */
174static transcoder_entry_t *
175make_transcoder_entry(const char *sname, const char *dname)
176{
177 st_data_t val;
178 st_table *table2;
179
 /* First level: keyed by source encoding name. */
180 if (!st_lookup(transcoder_table, (st_data_t)sname, &val)) {
 /* NOTE(review): a line is missing from this listing here; presumably it
  * creates the second-level table and stores it in val -- confirm. */
182 st_add_direct(transcoder_table, (st_data_t)sname, val);
183 }
184 table2 = (st_table *)val;
 /* Second level: keyed by destination encoding name. */
185 if (!st_lookup(table2, (st_data_t)dname, &val)) {
 /* NOTE(review): the line that declares/allocates `entry` is missing from
  * this listing -- confirm against the real source. */
187 entry->sname = sname;
188 entry->dname = dname;
189 entry->lib = NULL;
190 entry->transcoder = NULL;
191 val = (st_data_t)entry;
192 st_add_direct(table2, (st_data_t)dname, val);
193 }
194 return (transcoder_entry_t *)val;
195}
196
197static transcoder_entry_t *
198get_transcoder_entry(const char *sname, const char *dname)
199{
200 st_data_t val;
201 st_table *table2;
202
203 if (!st_lookup(transcoder_table, (st_data_t)sname, &val)) {
204 return NULL;
205 }
206 table2 = (st_table *)val;
207 if (!st_lookup(table2, (st_data_t)dname, &val)) {
208 return NULL;
209 }
210 return (transcoder_entry_t *)val;
211}
212
213void
215{
216 const char *const sname = tr->src_encoding;
217 const char *const dname = tr->dst_encoding;
218
219 transcoder_entry_t *entry;
220
221 entry = make_transcoder_entry(sname, dname);
222 if (entry->transcoder) {
223 rb_raise(rb_eArgError, "transcoder from %s to %s has been already registered",
224 sname, dname);
225 }
226
227 entry->transcoder = tr;
228}
229
230static void
231declare_transcoder(const char *sname, const char *dname, const char *lib)
232{
233 transcoder_entry_t *entry;
234
235 entry = make_transcoder_entry(sname, dname);
236 entry->lib = lib;
237}
238
239static const char transcoder_lib_prefix[] = "enc/trans/";
240
241void
242rb_declare_transcoder(const char *enc1, const char *enc2, const char *lib)
243{
244 if (!lib) {
245 rb_raise(rb_eArgError, "invalid library name - (null)");
246 }
247 declare_transcoder(enc1, enc2, lib);
248}
249
250#define encoding_equal(enc1, enc2) (STRCASECMP((enc1), (enc2)) == 0)
251
252typedef struct search_path_queue_tag {
254 const char *enc;
256
257typedef struct {
261 const char *base_enc;
263
/*
 * st_foreach callback used by transcode_search_path() on a second-level
 * table: key is a destination encoding name reachable in one step from
 * bfs->base_enc.  Names already present in bfs->visited are ignored; any
 * other name is appended to the tail of the search queue and recorded in
 * bfs->visited with bfs->base_enc as its predecessor.  Always returns
 * ST_CONTINUE.
 */
264static int
265transcode_search_path_i(st_data_t key, st_data_t val, st_data_t arg)
266{
267 const char *dname = (const char *)key;
 /* NOTE(review): the declarations of `bfs` and `q` are missing from this
  * listing; bfs is presumably derived from arg -- confirm. */
270
 /* val is only used as scratch output of the lookup here. */
271 if (st_lookup(bfs->visited, (st_data_t)dname, &val)) {
272 return ST_CONTINUE;
273 }
274
 /* NOTE(review): the line that allocates q is missing from this listing. */
276 q->enc = dname;
277 q->next = NULL;
 /* Link q after the current tail, then make q's next pointer the new tail slot. */
278 *bfs->queue_last_ptr = q;
279 bfs->queue_last_ptr = &q->next;
280
281 st_add_direct(bfs->visited, (st_data_t)dname, (st_data_t)bfs->base_enc);
282 return ST_CONTINUE;
283}
284
/*
 * Search transcoder_table for a chain of conversions leading from sname to
 * dname, using a first-in first-out queue of encoding names still to expand.
 *
 * On success the callback is invoked once per step of the chain as
 * callback(step_source, step_destination, depth, arg), starting with the
 * step that ends at dname (depth == pathlen-1) and ending with the step
 * that starts the chain (depth == 0).
 *
 * Returns the number of steps, or -1 when no chain was found or when sname
 * and dname compare equal under encoding_equal().
 */
285static int
286transcode_search_path(const char *sname, const char *dname,
287 void (*callback)(const char *sname, const char *dname, int depth, void *arg),
288 void *arg)
289{
 /* NOTE(review): the declarations of `bfs` and `q` are missing from this listing. */
292 st_data_t val;
293 st_table *table2;
294 int found;
295 int pathlen = -1;
296
297 if (encoding_equal(sname, dname))
298 return -1;
299
 /* Seed the queue with the source encoding.
  * NOTE(review): the allocation of q is missing from this listing. */
301 q->enc = sname;
302 q->next = NULL;
303 bfs.queue_last_ptr = &q->next;
304 bfs.queue = q;
305
 /* NOTE(review): lines are missing here; presumably they create bfs.visited
  * and record sname in it with a null predecessor, which is what the
  * `!val` tests in the path walk below rely on -- confirm. */
308
309 while (bfs.queue) {
 /* Pop the head; when the queue becomes empty the tail slot is the head itself. */
310 q = bfs.queue;
311 bfs.queue = q->next;
312 if (!bfs.queue)
313 bfs.queue_last_ptr = &bfs.queue;
314
 /* Encodings with no outgoing conversions cannot extend the search. */
315 if (!st_lookup(transcoder_table, (st_data_t)q->enc, &val)) {
316 xfree(q);
317 continue;
318 }
319 table2 = (st_table *)val;
320
 /* A direct conversion from q->enc to dname completes the chain. */
321 if (st_lookup(table2, (st_data_t)dname, &val)) {
322 st_add_direct(bfs.visited, (st_data_t)dname, (st_data_t)q->enc);
323 xfree(q);
324 found = 1;
325 goto cleanup;
326 }
327
 /* Otherwise enqueue every not-yet-visited destination reachable from q->enc. */
328 bfs.base_enc = q->enc;
329 st_foreach(table2, transcode_search_path_i, (st_data_t)&bfs);
330 bfs.base_enc = NULL;
331
332 xfree(q);
333 }
334 found = 0;
335
336 cleanup:
 /* Release queue nodes that were never expanded. */
337 while (bfs.queue) {
338 q = bfs.queue;
339 bfs.queue = q->next;
340 xfree(q);
341 }
342
343 if (found) {
344 const char *enc = dname;
345 int depth;
346 pathlen = 0;
 /* First walk: follow predecessors from dname to count the steps. */
347 while (1) {
348 st_lookup(bfs.visited, (st_data_t)enc, &val);
349 if (!val)
350 break;
351 pathlen++;
352 enc = (const char *)val;
353 }
354 depth = pathlen;
355 enc = dname;
 /* Second walk: report each step, last step first, with decreasing depth. */
356 while (1) {
357 st_lookup(bfs.visited, (st_data_t)enc, &val);
358 if (!val)
359 break;
360 callback((const char *)val, enc, --depth, arg);
361 enc = (const char *)val;
362 }
363 }
364
 /* NOTE(review): a line is missing here; presumably it frees bfs.visited. */
366
367 return pathlen; /* is -1 if not found */
368}
369
/*
 * Return the transcoder of entry, or NULL when none is available.
 * If the entry already has a transcoder it is returned directly.  Otherwise,
 * when the entry names a library, a frozen Ruby string holding
 * transcoder_lib_prefix followed by that name is built before the entry is
 * checked again.
 */
370static const rb_transcoder *
371load_transcoder_entry(transcoder_entry_t *entry)
372{
373 if (entry->transcoder)
374 return entry->transcoder;
375
376 if (entry->lib) {
377 const char *const lib = entry->lib;
378 const size_t len = strlen(lib);
 /* sizeof() - 1 drops the prefix literal's terminating NUL. */
379 const size_t total_len = sizeof(transcoder_lib_prefix) - 1 + len;
380 const VALUE fn = rb_str_new(0, total_len);
381 char *const path = RSTRING_PTR(fn);
382
383 memcpy(path, transcoder_lib_prefix, sizeof(transcoder_lib_prefix) - 1);
384 memcpy(path + sizeof(transcoder_lib_prefix) - 1, lib, len);
385 rb_str_set_len(fn, total_len);
386 OBJ_FREEZE(fn);
 /* NOTE(review): a line is missing from this listing here; presumably it
  * loads the library named by fn, which would be expected to fill in
  * entry->transcoder -- confirm. */
388 }
389
390 if (entry->transcoder)
391 return entry->transcoder;
392
393 return NULL;
394}
395
/*
 * Pick the default replacement character for the encoding named encname.
 *
 * For "UTF-8" (compared with encoding_equal) the result is the three bytes
 * EF BF BD, reported as encoded in "UTF-8"; for every other name it is "?",
 * reported as "US-ASCII".  The byte length goes to *len_ret and the name of
 * the replacement's own encoding to *repl_encname_ptr.
 */
static const char*
get_replacement_character(const char *encname, size_t *len_ret, const char **repl_encname_ptr)
{
    const int is_utf8 = encoding_equal(encname, "UTF-8");

    *len_ret = is_utf8 ? 3 : 1;
    *repl_encname_ptr = is_utf8 ? "UTF-8" : "US-ASCII";
    return is_utf8 ? "\xEF\xBF\xBD" : "?";
}
410
411/*
412 * Transcoding engine logic
413 */
414
/*
 * Return a pointer to the first byte of the character currently being
 * converted and store its length so far in *char_len_ptr.  The length is
 * tc->recognized_len plus the bytes between inchar_start and in_p.
 */
415static const unsigned char *
416transcode_char_start(rb_transcoding *tc,
417 const unsigned char *in_start,
418 const unsigned char *inchar_start,
419 const unsigned char *in_p,
420 size_t *char_len_ptr)
421{
422 const unsigned char *ptr;
 /* Fewer bytes precede inchar_start in this input than were already recognized. */
423 if (inchar_start - in_start < tc->recognized_len) {
 /* NOTE(review): lines are missing from this listing in this branch;
  * presumably the new bytes are copied after the recognized ones in the
  * read buffer and ptr is set to that buffer -- confirm. */
425 inchar_start, unsigned char, in_p - inchar_start);
427 }
428 else {
 /* All recognized bytes lie directly before inchar_start in the input. */
429 ptr = inchar_start - tc->recognized_len;
430 }
431 *char_len_ptr = tc->recognized_len + (in_p - inchar_start);
432 return ptr;
433}
434
436transcode_restartable0(const unsigned char **in_pos, unsigned char **out_pos,
437 const unsigned char *in_stop, unsigned char *out_stop,
438 rb_transcoding *tc,
439 const int opt)
440{
441 const rb_transcoder *tr = tc->transcoder;
442 int unitlen = tr->input_unit_length;
443 ssize_t readagain_len = 0;
444
445 const unsigned char *inchar_start;
446 const unsigned char *in_p;
447
448 unsigned char *out_p;
449
450 in_p = inchar_start = *in_pos;
451
452 out_p = *out_pos;
453
454#define SUSPEND(ret, num) \
455 do { \
456 tc->resume_position = (num); \
457 if (0 < in_p - inchar_start) \
458 MEMMOVE(TRANSCODING_READBUF(tc)+tc->recognized_len, \
459 inchar_start, unsigned char, in_p - inchar_start); \
460 *in_pos = in_p; \
461 *out_pos = out_p; \
462 tc->recognized_len += in_p - inchar_start; \
463 if (readagain_len) { \
464 tc->recognized_len -= readagain_len; \
465 tc->readagain_len = readagain_len; \
466 } \
467 return (ret); \
468 resume_label ## num:; \
469 } while (0)
470#define SUSPEND_OBUF(num) \
471 do { \
472 while (out_stop - out_p < 1) { SUSPEND(econv_destination_buffer_full, num); } \
473 } while (0)
474
475#define SUSPEND_AFTER_OUTPUT(num) \
476 if ((opt & ECONV_AFTER_OUTPUT) && *out_pos != out_p) { \
477 SUSPEND(econv_after_output, num); \
478 }
479
480#define next_table (tc->next_table)
481#define next_info (tc->next_info)
482#define next_byte (tc->next_byte)
483#define writebuf_len (tc->writebuf_len)
484#define writebuf_off (tc->writebuf_off)
485
486 switch (tc->resume_position) {
487 case 0: break;
488 case 1: goto resume_label1;
489 case 2: goto resume_label2;
490 case 3: goto resume_label3;
491 case 4: goto resume_label4;
492 case 5: goto resume_label5;
493 case 6: goto resume_label6;
494 case 7: goto resume_label7;
495 case 8: goto resume_label8;
496 case 9: goto resume_label9;
497 case 10: goto resume_label10;
498 case 11: goto resume_label11;
499 case 12: goto resume_label12;
500 case 13: goto resume_label13;
501 case 14: goto resume_label14;
502 case 15: goto resume_label15;
503 case 16: goto resume_label16;
504 case 17: goto resume_label17;
505 case 18: goto resume_label18;
506 case 19: goto resume_label19;
507 case 20: goto resume_label20;
508 case 21: goto resume_label21;
509 case 22: goto resume_label22;
510 case 23: goto resume_label23;
511 case 24: goto resume_label24;
512 case 25: goto resume_label25;
513 case 26: goto resume_label26;
514 case 27: goto resume_label27;
515 case 28: goto resume_label28;
516 case 29: goto resume_label29;
517 case 30: goto resume_label30;
518 case 31: goto resume_label31;
519 case 32: goto resume_label32;
520 case 33: goto resume_label33;
521 case 34: goto resume_label34;
522 }
523
524 while (1) {
525 inchar_start = in_p;
526 tc->recognized_len = 0;
527 next_table = tr->conv_tree_start;
528
530
531 if (in_stop <= in_p) {
532 if (!(opt & ECONV_PARTIAL_INPUT))
533 break;
535 continue;
536 }
537
538#define BYTE_ADDR(index) (tr->byte_array + (index))
539#define WORD_ADDR(index) (tr->word_array + INFO2WORDINDEX(index))
540#define BL_BASE BYTE_ADDR(BYTE_LOOKUP_BASE(WORD_ADDR(next_table)))
541#define BL_INFO WORD_ADDR(BYTE_LOOKUP_INFO(WORD_ADDR(next_table)))
542#define BL_MIN_BYTE (BL_BASE[0])
543#define BL_MAX_BYTE (BL_BASE[1])
544#define BL_OFFSET(byte) (BL_BASE[2+(byte)-BL_MIN_BYTE])
545#define BL_ACTION(byte) (BL_INFO[BL_OFFSET((byte))])
546
547 next_byte = (unsigned char)*in_p++;
548 follow_byte:
551 else {
553 }
554 follow_info:
555 switch (next_info & 0x1F) {
556 case NOMAP:
557 {
558 const unsigned char *p = inchar_start;
559 writebuf_off = 0;
560 while (p < in_p) {
561 TRANSCODING_WRITEBUF(tc)[writebuf_off++] = (unsigned char)*p++;
562 }
564 writebuf_off = 0;
565 while (writebuf_off < writebuf_len) {
566 SUSPEND_OBUF(3);
567 *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
568 }
569 }
570 continue;
571 case 0x00: case 0x04: case 0x08: case 0x0C:
572 case 0x10: case 0x14: case 0x18: case 0x1C:
574 while (in_p >= in_stop) {
575 if (!(opt & ECONV_PARTIAL_INPUT))
576 goto incomplete;
578 }
579 next_byte = (unsigned char)*in_p++;
580 next_table = (unsigned int)next_info;
581 goto follow_byte;
582 case ZERObt: /* drop input */
583 continue;
584 case ONEbt:
585 SUSPEND_OBUF(9); *out_p++ = getBT1(next_info);
586 continue;
587 case TWObt:
588 SUSPEND_OBUF(10); *out_p++ = getBT1(next_info);
589 SUSPEND_OBUF(21); *out_p++ = getBT2(next_info);
590 continue;
591 case THREEbt:
592 SUSPEND_OBUF(11); *out_p++ = getBT1(next_info);
593 SUSPEND_OBUF(15); *out_p++ = getBT2(next_info);
594 SUSPEND_OBUF(16); *out_p++ = getBT3(next_info);
595 continue;
596 case FOURbt:
597 SUSPEND_OBUF(12); *out_p++ = getBT0(next_info);
598 SUSPEND_OBUF(17); *out_p++ = getBT1(next_info);
599 SUSPEND_OBUF(18); *out_p++ = getBT2(next_info);
600 SUSPEND_OBUF(19); *out_p++ = getBT3(next_info);
601 continue;
602 case GB4bt:
603 SUSPEND_OBUF(29); *out_p++ = getGB4bt0(next_info);
604 SUSPEND_OBUF(30); *out_p++ = getGB4bt1(next_info);
605 SUSPEND_OBUF(31); *out_p++ = getGB4bt2(next_info);
606 SUSPEND_OBUF(32); *out_p++ = getGB4bt3(next_info);
607 continue;
608 case STR1:
609 tc->output_index = 0;
612 tc->output_index++;
613 }
614 continue;
615 case FUNii:
616 next_info = (VALUE)(*tr->func_ii)(TRANSCODING_STATE(tc), next_info);
617 goto follow_info;
618 case FUNsi:
619 {
620 const unsigned char *char_start;
621 size_t char_len;
622 char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
623 next_info = (VALUE)(*tr->func_si)(TRANSCODING_STATE(tc), char_start, (size_t)char_len);
624 goto follow_info;
625 }
626 case FUNio:
627 SUSPEND_OBUF(13);
628 if (tr->max_output <= out_stop - out_p)
629 out_p += tr->func_io(TRANSCODING_STATE(tc),
630 next_info, out_p, out_stop - out_p);
631 else {
632 writebuf_len = tr->func_io(TRANSCODING_STATE(tc),
633 next_info,
635 writebuf_off = 0;
636 while (writebuf_off < writebuf_len) {
637 SUSPEND_OBUF(20);
638 *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
639 }
640 }
641 break;
642 case FUNso:
643 {
644 const unsigned char *char_start;
645 size_t char_len;
646 SUSPEND_OBUF(14);
647 if (tr->max_output <= out_stop - out_p) {
648 char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
649 out_p += tr->func_so(TRANSCODING_STATE(tc),
650 char_start, (size_t)char_len,
651 out_p, out_stop - out_p);
652 }
653 else {
654 char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
655 writebuf_len = tr->func_so(TRANSCODING_STATE(tc),
656 char_start, (size_t)char_len,
658 writebuf_off = 0;
659 while (writebuf_off < writebuf_len) {
660 SUSPEND_OBUF(22);
661 *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
662 }
663 }
664 break;
665 }
666 case FUNsio:
667 {
668 const unsigned char *char_start;
669 size_t char_len;
670 SUSPEND_OBUF(33);
671 if (tr->max_output <= out_stop - out_p) {
672 char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
673 out_p += tr->func_sio(TRANSCODING_STATE(tc),
674 char_start, (size_t)char_len, next_info,
675 out_p, out_stop - out_p);
676 }
677 else {
678 char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
679 writebuf_len = tr->func_sio(TRANSCODING_STATE(tc),
680 char_start, (size_t)char_len, next_info,
682 writebuf_off = 0;
683 while (writebuf_off < writebuf_len) {
684 SUSPEND_OBUF(34);
685 *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
686 }
687 }
688 break;
689 }
690 case INVALID:
691 if (tc->recognized_len + (in_p - inchar_start) <= unitlen) {
692 if (tc->recognized_len + (in_p - inchar_start) < unitlen)
694 while ((opt & ECONV_PARTIAL_INPUT) && tc->recognized_len + (in_stop - inchar_start) < unitlen) {
695 in_p = in_stop;
697 }
698 if (tc->recognized_len + (in_stop - inchar_start) <= unitlen) {
699 in_p = in_stop;
700 }
701 else {
702 in_p = inchar_start + (unitlen - tc->recognized_len);
703 }
704 }
705 else {
706 ssize_t invalid_len; /* including the last byte which causes invalid */
707 ssize_t discard_len;
708 invalid_len = tc->recognized_len + (in_p - inchar_start);
709 discard_len = ((invalid_len - 1) / unitlen) * unitlen;
710 readagain_len = invalid_len - discard_len;
711 }
712 goto invalid;
713 case UNDEF:
714 goto undef;
715 default:
716 rb_raise(rb_eRuntimeError, "unknown transcoding instruction");
717 }
718 continue;
719
720 invalid:
722 continue;
723
724 incomplete:
726 continue;
727
728 undef:
730 continue;
731 }
732
733 /* cleanup */
734 if (tr->finish_func) {
735 SUSPEND_OBUF(4);
736 if (tr->max_output <= out_stop - out_p) {
737 out_p += tr->finish_func(TRANSCODING_STATE(tc),
738 out_p, out_stop - out_p);
739 }
740 else {
741 writebuf_len = tr->finish_func(TRANSCODING_STATE(tc),
743 writebuf_off = 0;
744 while (writebuf_off < writebuf_len) {
745 SUSPEND_OBUF(23);
746 *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
747 }
748 }
749 }
750 while (1)
752#undef SUSPEND
753#undef next_table
754#undef next_info
755#undef next_byte
756#undef writebuf_len
757#undef writebuf_off
758}
759
761transcode_restartable(const unsigned char **in_pos, unsigned char **out_pos,
762 const unsigned char *in_stop, unsigned char *out_stop,
763 rb_transcoding *tc,
764 const int opt)
765{
766 if (tc->readagain_len) {
767 unsigned char *readagain_buf = ALLOCA_N(unsigned char, tc->readagain_len);
768 const unsigned char *readagain_pos = readagain_buf;
769 const unsigned char *readagain_stop = readagain_buf + tc->readagain_len;
771
772 MEMCPY(readagain_buf, TRANSCODING_READBUF(tc) + tc->recognized_len,
773 unsigned char, tc->readagain_len);
774 tc->readagain_len = 0;
775 res = transcode_restartable0(&readagain_pos, out_pos, readagain_stop, out_stop, tc, opt|ECONV_PARTIAL_INPUT);
776 if (res != econv_source_buffer_empty) {
778 readagain_pos, unsigned char, readagain_stop - readagain_pos);
779 tc->readagain_len += readagain_stop - readagain_pos;
780 return res;
781 }
782 }
783 return transcode_restartable0(in_pos, out_pos, in_stop, out_stop, tc, opt);
784}
785
786static rb_transcoding *
787rb_transcoding_open_by_transcoder(const rb_transcoder *tr, int flags)
788{
789 rb_transcoding *tc;
790
791 tc = ALLOC(rb_transcoding);
792 tc->transcoder = tr;
793 tc->flags = flags;
794 if (TRANSCODING_STATE_EMBED_MAX < tr->state_size)
795 tc->state.ptr = xmalloc(tr->state_size);
796 if (tr->state_init_func) {
797 (tr->state_init_func)(TRANSCODING_STATE(tc)); /* xxx: check return value */
798 }
799 tc->resume_position = 0;
800 tc->recognized_len = 0;
801 tc->readagain_len = 0;
802 tc->writebuf_len = 0;
803 tc->writebuf_off = 0;
804 if ((int)sizeof(tc->readbuf.ary) < tr->max_input) {
805 tc->readbuf.ptr = xmalloc(tr->max_input);
806 }
807 if ((int)sizeof(tc->writebuf.ary) < tr->max_output) {
808 tc->writebuf.ptr = xmalloc(tr->max_output);
809 }
810 return tc;
811}
812
814rb_transcoding_convert(rb_transcoding *tc,
815 const unsigned char **input_ptr, const unsigned char *input_stop,
816 unsigned char **output_ptr, unsigned char *output_stop,
817 int flags)
818{
819 return transcode_restartable(
820 input_ptr, output_ptr,
821 input_stop, output_stop,
822 tc, flags);
823}
824
825static void
826rb_transcoding_close(rb_transcoding *tc)
827{
828 const rb_transcoder *tr = tc->transcoder;
829 if (tr->state_fini_func) {
830 (tr->state_fini_func)(TRANSCODING_STATE(tc)); /* check return value? */
831 }
832 if (TRANSCODING_STATE_EMBED_MAX < tr->state_size)
833 xfree(tc->state.ptr);
834 if ((int)sizeof(tc->readbuf.ary) < tr->max_input)
835 xfree(tc->readbuf.ptr);
836 if ((int)sizeof(tc->writebuf.ary) < tr->max_output)
837 xfree(tc->writebuf.ptr);
838 xfree(tc);
839}
840
841static size_t
842rb_transcoding_memsize(rb_transcoding *tc)
843{
844 size_t size = sizeof(rb_transcoding);
845 const rb_transcoder *tr = tc->transcoder;
846
847 if (TRANSCODING_STATE_EMBED_MAX < tr->state_size) {
848 size += tr->state_size;
849 }
850 if ((int)sizeof(tc->readbuf.ary) < tr->max_input) {
851 size += tr->max_input;
852 }
853 if ((int)sizeof(tc->writebuf.ary) < tr->max_output) {
854 size += tr->max_output;
855 }
856 return size;
857}
858
/*
 * Allocate an rb_econv_t with all visible fields reset to zero/NULL.
 * n_hint is the initial element capacity recorded in num_allocated; values
 * of zero or less are treated as 1.
 * NOTE(review): several lines are missing from this listing (presumably the
 * allocation of ec->elems and the reset of further fields) -- confirm.
 */
859static rb_econv_t *
860rb_econv_alloc(int n_hint)
861{
862 rb_econv_t *ec;
863
864 if (n_hint <= 0)
865 n_hint = 1;
866
867 ec = ALLOC(rb_econv_t);
868 ec->flags = 0;
871 ec->started = 0;
872 ec->replacement_str = NULL;
873 ec->replacement_len = 0;
874 ec->replacement_enc = NULL;
875 ec->replacement_allocated = 0;
876 ec->in_buf_start = NULL;
877 ec->in_data_start = NULL;
878 ec->in_data_end = NULL;
879 ec->in_buf_end = NULL;
880 ec->num_allocated = n_hint;
881 ec->num_trans = 0;
883 ec->num_finished = 0;
884 ec->last_tc = NULL;
892 ec->source_encoding = NULL;
894 return ec;
895}
896
/*
 * Insert a new element for transcoder tr at index i of ec->elems, shifting
 * the elements from i onwards up by one.  The element gets a freshly opened
 * transcoding and an empty 4096-byte output buffer.  Every return visible
 * in this listing returns 0.
 */
897static int
898rb_econv_add_transcoder_at(rb_econv_t *ec, const rb_transcoder *tr, int i)
899{
900 int n, j;
901 int bufsize = 4096;
902 unsigned char *p;
903
 /* Capacity exhausted: double it.
  * NOTE(review): the line that resizes ec->elems is missing from this listing. */
904 if (ec->num_trans == ec->num_allocated) {
905 n = ec->num_allocated * 2;
907 ec->num_allocated = n;
908 }
909
910 p = xmalloc(bufsize);
911
 /* Open a slot at index i. */
912 MEMMOVE(ec->elems+i+1, ec->elems+i, rb_econv_elem_t, ec->num_trans-i);
913
914 ec->elems[i].tc = rb_transcoding_open_by_transcoder(tr, 0);
 /* Data start == data end: the output buffer starts out empty. */
915 ec->elems[i].out_buf_start = p;
916 ec->elems[i].out_buf_end = p + bufsize;
917 ec->elems[i].out_data_start = p;
918 ec->elems[i].out_data_end = p;
 /* NOTE(review): a line is missing from this listing here. */
920
921 ec->num_trans++;
922
 /* When a non-decorator was added, point last_tc at the non-decorator with
  * the highest index at or after i. */
923 if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding))
924 for (j = ec->num_trans-1; i <= j; j--) {
925 rb_transcoding *tc = ec->elems[j].tc;
926 const rb_transcoder *tr2 = tc->transcoder;
927 if (!DECORATOR_P(tr2->src_encoding, tr2->dst_encoding)) {
928 ec->last_tc = tc;
929 break;
930 }
931 }
932
933 return 0;
934}
935
936static rb_econv_t *
937rb_econv_open_by_transcoder_entries(int n, transcoder_entry_t **entries)
938{
939 rb_econv_t *ec;
940 int i, ret;
941
942 for (i = 0; i < n; i++) {
943 const rb_transcoder *tr;
944 tr = load_transcoder_entry(entries[i]);
945 if (!tr)
946 return NULL;
947 }
948
949 ec = rb_econv_alloc(n);
950
951 for (i = 0; i < n; i++) {
952 const rb_transcoder *tr = load_transcoder_entry(entries[i]);
953 ret = rb_econv_add_transcoder_at(ec, tr, ec->num_trans);
954 if (ret == -1) {
955 rb_econv_close(ec);
956 return NULL;
957 }
958 }
959
960 return ec;
961}
962
966};
967
968static void
969trans_open_i(const char *sname, const char *dname, int depth, void *arg)
970{
971 struct trans_open_t *toarg = arg;
972
973 if (!toarg->entries) {
974 toarg->entries = ALLOC_N(transcoder_entry_t *, depth+1+toarg->num_additional);
975 }
976 toarg->entries[depth] = get_transcoder_entry(sname, dname);
977}
978
/*
 * Open a converter from sname to dname without decorators.
 * When both names are empty the converter has no transcoders; otherwise the
 * chain found by transcode_search_path() is used.  Returns NULL when no
 * chain exists or the converter cannot be built.  On success ecflags and
 * the two name pointers are stored in the converter.
 */
979static rb_econv_t *
980rb_econv_open0(const char *sname, const char *dname, int ecflags)
981{
 /* NOTE(review): the declaration of `entries` is missing from this listing. */
983 int num_trans;
984 rb_econv_t *ec;
985
986 /* Just check if sname and dname are defined */
987 /* (This check is needed?) */
988 if (*sname) rb_enc_find_index(sname);
989 if (*dname) rb_enc_find_index(dname);
990
991 if (*sname == '\0' && *dname == '\0') {
992 num_trans = 0;
993 entries = NULL;
 /* Store string literals rather than the caller's pointers. */
994 sname = dname = "";
995 }
996 else {
997 struct trans_open_t toarg;
998 toarg.entries = NULL;
999 toarg.num_additional = 0;
 /* trans_open_i allocates and fills toarg.entries, one entry per step. */
1000 num_trans = transcode_search_path(sname, dname, trans_open_i, (void *)&toarg);
1001 entries = toarg.entries;
1002 if (num_trans < 0) {
1003 xfree(entries);
1004 return NULL;
1005 }
1006 }
1007
 /* The entries array is only needed while the converter is being built. */
1008 ec = rb_econv_open_by_transcoder_entries(num_trans, entries);
1009 xfree(entries);
1010 if (!ec)
1011 return NULL;
1012
1013 ec->flags = ecflags;
1014 ec->source_encoding_name = sname;
1015 ec->destination_encoding_name = dname;
1016
1017 return ec;
1018}
1019
1020#define MAX_ECFLAGS_DECORATORS 32
1021
/*
 * Translate the decorator bits of ecflags into decorator names, written to
 * decorators_ret in the order they are appended below.  Returns the number
 * of names stored, or -1 for a flag combination that is rejected.
 * NOTE(review): several condition and case lines are missing from this
 * listing, so the exact set of accepted combinations cannot be read off
 * here -- confirm against the real source.
 */
1022static int
1023decorator_names(int ecflags, const char **decorators_ret)
1024{
1025 int num_decorators;
1026
 /* Validate the newline decorator bits; unlisted values are rejected.
  * NOTE(review): the other accepted case labels are missing from this listing. */
1027 switch (ecflags & ECONV_NEWLINE_DECORATOR_MASK) {
1031 case 0:
1032 break;
1033 default:
1034 return -1;
1035 }
1036
 /* NOTE(review): the second half of this condition is missing from this listing. */
1037 if ((ecflags & ECONV_XML_TEXT_DECORATOR) &&
1039 return -1;
1040
1041 num_decorators = 0;
1042
1043 if (ecflags & ECONV_XML_TEXT_DECORATOR)
1044 decorators_ret[num_decorators++] = "xml_text_escape";
 /* NOTE(review): the condition guarding the next line is missing from this listing. */
1046 decorators_ret[num_decorators++] = "xml_attr_content_escape";
1047 if (ecflags & ECONV_XML_ATTR_QUOTE_DECORATOR)
1048 decorators_ret[num_decorators++] = "xml_attr_quote";
1049
1050 if (ecflags & ECONV_CRLF_NEWLINE_DECORATOR)
1051 decorators_ret[num_decorators++] = "crlf_newline";
1052 if (ecflags & ECONV_CR_NEWLINE_DECORATOR)
1053 decorators_ret[num_decorators++] = "cr_newline";
 /* NOTE(review): the condition guarding the next line is missing from this listing. */
1055 decorators_ret[num_decorators++] = "universal_newline";
1056
1057 return num_decorators;
1058}
1059
1060rb_econv_t *
1061rb_econv_open(const char *sname, const char *dname, int ecflags)
1062{
1063 rb_econv_t *ec;
1064 int num_decorators;
1065 const char *decorators[MAX_ECFLAGS_DECORATORS];
1066 int i;
1067
1068 num_decorators = decorator_names(ecflags, decorators);
1069 if (num_decorators == -1)
1070 return NULL;
1071
1072 ec = rb_econv_open0(sname, dname, ecflags & ECONV_ERROR_HANDLER_MASK);
1073 if (!ec)
1074 return NULL;
1075
1076 for (i = 0; i < num_decorators; i++)
1077 if (rb_econv_decorate_at_last(ec, decorators[i]) == -1) {
1078 rb_econv_close(ec);
1079 return NULL;
1080 }
1081
1082 ec->flags |= ecflags & ~ECONV_ERROR_HANDLER_MASK;
1083
1084 return ec;
1085}
1086
/*
 * Run the elements of ec from index `start` onwards, repeating the pass for
 * as long as some element consumed input or produced output.  Element 0
 * reads the caller's input and the last element writes to the caller's
 * output; every other hand-over goes through the producing element's own
 * output buffer.
 *
 * Returns the index of an element whose result has to be reported to the
 * caller, or -1 when a whole pass made no progress.
 * NOTE(review): several lines are missing from this listing, including the
 * declaration of `res` and some case labels -- confirm details against the
 * real source.
 */
1087static int
1088trans_sweep(rb_econv_t *ec,
1089 const unsigned char **input_ptr, const unsigned char *input_stop,
1090 unsigned char **output_ptr, unsigned char *output_stop,
1091 int flags,
1092 int start)
1093{
1094 int try;
1095 int i, f;
1096
1097 const unsigned char **ipp, *is, *iold;
1098 unsigned char **opp, *os, *oold;
1100
1101 try = 1;
1102 while (try) {
1103 try = 0;
1104 for (i = start; i < ec->num_trans; i++) {
1105 rb_econv_elem_t *te = &ec->elems[i];
1106
 /* Input: the caller's buffer for the first element, otherwise the
  * pending output of the previous element. */
1107 if (i == 0) {
1108 ipp = input_ptr;
1109 is = input_stop;
1110 }
1111 else {
1112 rb_econv_elem_t *prev_te = &ec->elems[i-1];
1113 ipp = (const unsigned char **)&prev_te->out_data_start;
1114 is = prev_te->out_data_end;
1115 }
1116
 /* Output: the caller's buffer for the last element, otherwise this
  * element's own buffer. */
1117 if (i == ec->num_trans-1) {
1118 opp = output_ptr;
1119 os = output_stop;
1120 }
1121 else {
 /* Move pending data to the front of the buffer to free space at its end. */
1122 if (te->out_buf_start != te->out_data_start) {
1123 ssize_t len = te->out_data_end - te->out_data_start;
1124 ssize_t off = te->out_data_start - te->out_buf_start;
1125 MEMMOVE(te->out_buf_start, te->out_data_start, unsigned char, len);
1126 te->out_data_start = te->out_buf_start;
1127 te->out_data_end -= off;
1128 }
1129 opp = &te->out_data_end;
1130 os = te->out_buf_end;
1131 }
1132
1133 f = flags;
 /* NOTE(review): the statement controlled by this `if` is missing from this listing. */
1134 if (ec->num_finished != i)
1136 if (i == 0 && (flags & ECONV_AFTER_OUTPUT)) {
1137 start = 1;
1138 flags &= ~ECONV_AFTER_OUTPUT;
1139 }
 /* ECONV_AFTER_OUTPUT is only ever passed on to element 0. */
1140 if (i != 0)
1141 f &= ~ECONV_AFTER_OUTPUT;
1142 iold = *ipp;
1143 oold = *opp;
1144 te->last_result = res = rb_transcoding_convert(te->tc, ipp, is, opp, os, f);
 /* Any movement of the input or output position counts as progress. */
1145 if (iold != *ipp || oold != *opp)
1146 try = 1;
1147
1148 switch (res) {
1152 case econv_after_output:
1153 return i;
1154
1157 break;
1158
1159 case econv_finished:
1160 ec->num_finished = i+1;
1161 break;
1162 }
1163 }
1164 }
1165 return -1;
1166}
1167
/*
 * Drive the whole element chain of ec once and return the result to report.
 * input_ptr and/or output_ptr may be NULL, in which case an empty dummy
 * buffer is substituted.  When result_position_ptr is non-NULL it receives
 * the index of the element whose result is returned, or -1 when no element
 * has a result that needs reporting.
 * NOTE(review): many lines are missing from this listing (case labels, parts
 * of conditions, the final return) -- confirm details against the real source.
 */
1168static rb_econv_result_t
1169rb_trans_conv(rb_econv_t *ec,
1170 const unsigned char **input_ptr, const unsigned char *input_stop,
1171 unsigned char **output_ptr, unsigned char *output_stop,
1172 int flags,
1173 int *result_position_ptr)
1174{
1175 int i;
1176 int needreport_index;
1177 int sweep_start;
1178
 /* Start and stop both point at empty_buf, so the dummy buffer has zero length. */
1179 unsigned char empty_buf;
1180 unsigned char *empty_ptr = &empty_buf;
1181
1182 if (!input_ptr) {
1183 input_ptr = (const unsigned char **)&empty_ptr;
1184 input_stop = empty_ptr;
1185 }
1186
1187 if (!output_ptr) {
1188 output_ptr = &empty_ptr;
1189 output_stop = empty_ptr;
1190 }
1191
 /* NOTE(review): the statement controlled by this `if` is missing from this listing. */
1192 if (ec->elems[0].last_result == econv_after_output)
1194
 /* Scan from the last element backwards for a previous result that decides
  * where the next sweep has to start. */
1195 for (i = ec->num_trans-1; 0 <= i; i--) {
1196 switch (ec->elems[i].last_result) {
1200 case econv_after_output:
1201 case econv_finished:
1202 sweep_start = i+1;
1203 goto found_needreport;
1204
1207 break;
1208
1209 default:
1210 rb_bug("unexpected transcode last result");
1211 }
1212 }
1213
1214 /* /^[sd]+$/ is confirmed. but actually /^s*d*$/. */
1215
 /* NOTE(review): the start of this condition and the flags argument of the
  * recursive call are missing from this listing. */
1217 (flags & ECONV_AFTER_OUTPUT)) {
1219
 /* Recursive call with no input; an empty-source result is turned into
  * econv_after_output for the caller. */
1220 res = rb_trans_conv(ec, NULL, NULL, output_ptr, output_stop,
1222 result_position_ptr);
1223
1224 if (res == econv_source_buffer_empty)
1225 return econv_after_output;
1226 return res;
1227 }
1228
1229 sweep_start = 0;
1230
1231 found_needreport:
1232
 /* Keep sweeping, each time from just after the element that reported,
  * until nothing needs reporting or the last element is the one reporting. */
1233 do {
1234 needreport_index = trans_sweep(ec, input_ptr, input_stop, output_ptr, output_stop, flags, sweep_start);
1235 sweep_start = needreport_index + 1;
1236 } while (needreport_index != -1 && needreport_index != ec->num_trans-1);
1237
 /* Report the result of the element closest to the output end that has one. */
1238 for (i = ec->num_trans-1; 0 <= i; i--) {
1240 rb_econv_result_t res = ec->elems[i].last_result;
1241 if (res == econv_invalid_byte_sequence ||
1242 res == econv_incomplete_input ||
1244 res == econv_after_output) {
1246 }
1247 if (result_position_ptr)
1248 *result_position_ptr = i;
1249 return res;
1250 }
1251 }
1252 if (result_position_ptr)
1253 *result_position_ptr = -1;
1255}
1256
/*
 * Runs one conversion pass for `ec`.
 *
 * Clears ec->last_error, drains bytes that are still buffered inside the
 * converter, drives rb_trans_conv() and finally stores the result code in
 * ec->last_error.result.  When ec->num_trans is 0 the input bytes are
 * simply copied to the output.
 *
 * NOTE(review): the embedded listing numbers skip several lines in this
 * function (for example the declaration of `res`, some assignments to it
 * and part of the error bookkeeping after `gotresult`).  Restore them
 * from the upstream file before compiling.
 */
1257static rb_econv_result_t
1258rb_econv_convert0(rb_econv_t *ec,
1259 const unsigned char **input_ptr, const unsigned char *input_stop,
1260 unsigned char **output_ptr, unsigned char *output_stop,
1261 int flags)
1262{
1264 int result_position;
1265 int has_output = 0;
1266
1267 memset(&ec->last_error, 0, sizeof(ec->last_error));
1268
    /* No transcoders: act as a plain byte copier. */
1269 if (ec->num_trans == 0) {
1270 size_t len;
    /* Bytes parked in the converter's input buffer go out first. */
1271 if (ec->in_buf_start && ec->in_data_start != ec->in_data_end) {
1272 if (output_stop - *output_ptr < ec->in_data_end - ec->in_data_start) {
    /* Output too small: copy what fits and keep the rest buffered. */
1273 len = output_stop - *output_ptr;
1274 memcpy(*output_ptr, ec->in_data_start, len);
1275 *output_ptr = output_stop;
1276 ec->in_data_start += len;
    /* NOTE(review): a line is missing here (listing line 1277). */
1278 goto gotresult;
1279 }
1280 len = ec->in_data_end - ec->in_data_start;
1281 memcpy(*output_ptr, ec->in_data_start, len);
1282 *output_ptr += len;
1283 ec->in_data_start = ec->in_data_end = ec->in_buf_start;
1284 if (flags & ECONV_AFTER_OUTPUT) {
1285 res = econv_after_output;
1286 goto gotresult;
1287 }
1288 }
    /* len = min(free output space, remaining input). */
1289 if (output_stop - *output_ptr < input_stop - *input_ptr) {
1290 len = output_stop - *output_ptr;
1291 }
1292 else {
1293 len = input_stop - *input_ptr;
1294 }
    /* With ECONV_AFTER_OUTPUT only a single byte is moved per call. */
1295 if (0 < len && (flags & ECONV_AFTER_OUTPUT)) {
1296 *(*output_ptr)++ = *(*input_ptr)++;
1297 res = econv_after_output;
1298 goto gotresult;
1299 }
1300 memcpy(*output_ptr, *input_ptr, len);
1301 *output_ptr += len;
1302 *input_ptr += len;
    /* NOTE(review): the statements for the first two branches below are
     * missing from the listing (lines 1304 and 1306). */
1303 if (*input_ptr != input_stop)
1305 else if (flags & ECONV_PARTIAL_INPUT)
1307 else
1308 res = econv_finished;
1309 goto gotresult;
1310 }
1311
    /* Flush output still pending in the last transcoder's output buffer. */
1312 if (ec->elems[ec->num_trans-1].out_data_start) {
1313 unsigned char *data_start = ec->elems[ec->num_trans-1].out_data_start;
1314 unsigned char *data_end = ec->elems[ec->num_trans-1].out_data_end;
1315 if (data_start != data_end) {
1316 size_t len;
1317 if (output_stop - *output_ptr < data_end - data_start) {
1318 len = output_stop - *output_ptr;
1319 memcpy(*output_ptr, data_start, len);
1320 *output_ptr = output_stop;
1321 ec->elems[ec->num_trans-1].out_data_start += len;
    /* NOTE(review): a line is missing here (listing line 1322). */
1323 goto gotresult;
1324 }
1325 len = data_end - data_start;
1326 memcpy(*output_ptr, data_start, len);
1327 *output_ptr += len;
    /* Buffer fully drained: reset it to empty. */
1328 ec->elems[ec->num_trans-1].out_data_start =
1329 ec->elems[ec->num_trans-1].out_data_end =
1330 ec->elems[ec->num_trans-1].out_buf_start;
1331 has_output = 1;
1332 }
1333 }
1334
    /* Bytes buffered in ec->in_buf are converted before the caller's input;
     * ECONV_AFTER_OUTPUT is cleared and ECONV_PARTIAL_INPUT forced for
     * this internal pass. */
1335 if (ec->in_buf_start &&
1336 ec->in_data_start != ec->in_data_end) {
1337 res = rb_trans_conv(ec, (const unsigned char **)&ec->in_data_start, ec->in_data_end, output_ptr, output_stop,
1338 (flags&~ECONV_AFTER_OUTPUT)|ECONV_PARTIAL_INPUT, &result_position);
1339 if (res != econv_source_buffer_empty)
1340 goto gotresult;
1341 }
1342
    /* Output was already produced above and the caller asked for
     * ECONV_AFTER_OUTPUT: run with an empty input range and report
     * econv_after_output instead of econv_source_buffer_empty. */
1343 if (has_output &&
1344 (flags & ECONV_AFTER_OUTPUT) &&
1345 *input_ptr != input_stop) {
1346 input_stop = *input_ptr;
1347 res = rb_trans_conv(ec, input_ptr, input_stop, output_ptr, output_stop, flags, &result_position);
1348 if (res == econv_source_buffer_empty)
1349 res = econv_after_output;
1350 }
1351 else if ((flags & ECONV_AFTER_OUTPUT) ||
1352 ec->num_trans == 1) {
1353 res = rb_trans_conv(ec, input_ptr, input_stop, output_ptr, output_stop, flags, &result_position);
1354 }
1355 else {
    /* Several transcoders and no ECONV_AFTER_OUTPUT requested: set the
     * flag internally and keep calling while econv_after_output comes back. */
1356 flags |= ECONV_AFTER_OUTPUT;
1357 do {
1358 res = rb_trans_conv(ec, input_ptr, input_stop, output_ptr, output_stop, flags, &result_position);
1359 } while (res == econv_after_output);
1360 }
1361
1362 gotresult:
1363 ec->last_error.result = res;
    /* For error results, remember which transcoding failed.
     * NOTE(review): the end of this condition and several last_error
     * assignments are missing from the listing (lines 1366, 1369-1372). */
1364 if (res == econv_invalid_byte_sequence ||
1365 res == econv_incomplete_input ||
1367 rb_transcoding *error_tc = ec->elems[result_position].tc;
1368 ec->last_error.error_tc = error_tc;
1373 ec->last_error.readagain_len = error_tc->readagain_len;
1374 }
1375
1376 return res;
1377}
1378
1379static int output_replacement_character(rb_econv_t *ec);
1380
1381static int
1382output_hex_charref(rb_econv_t *ec)
1383{
1384 int ret;
1385 unsigned char utfbuf[1024];
1386 const unsigned char *utf;
1387 size_t utf_len;
1388 int utf_allocated = 0;
1389 char charef_buf[16];
1390 const unsigned char *p;
1391
1392 if (encoding_equal(ec->last_error.source_encoding, "UTF-32BE")) {
1393 utf = ec->last_error.error_bytes_start;
1394 utf_len = ec->last_error.error_bytes_len;
1395 }
1396 else {
1397 utf = allocate_converted_string(ec->last_error.source_encoding, "UTF-32BE",
1399 utfbuf, sizeof(utfbuf),
1400 &utf_len);
1401 if (!utf)
1402 return -1;
1403 if (utf != utfbuf && utf != ec->last_error.error_bytes_start)
1404 utf_allocated = 1;
1405 }
1406
1407 if (utf_len % 4 != 0)
1408 goto fail;
1409
1410 p = utf;
1411 while (4 <= utf_len) {
1412 unsigned int u = 0;
1413 u += p[0] << 24;
1414 u += p[1] << 16;
1415 u += p[2] << 8;
1416 u += p[3];
1417 snprintf(charef_buf, sizeof(charef_buf), "&#x%X;", u);
1418
1419 ret = rb_econv_insert_output(ec, (unsigned char *)charef_buf, strlen(charef_buf), "US-ASCII");
1420 if (ret == -1)
1421 goto fail;
1422
1423 p += 4;
1424 utf_len -= 4;
1425 }
1426
1427 if (utf_allocated)
1428 xfree((void *)utf);
1429 return 0;
1430
1431 fail:
1432 if (utf_allocated)
1433 xfree((void *)utf);
1434 return -1;
1435}
1436
/*
 * Conversion entry point that wraps rb_econv_convert0() with the
 * replacement policies selected in ec->flags.
 *
 * A NULL input_ptr or output_ptr is replaced by an empty range so the
 * inner routine always receives valid pointers.  After an invalid or
 * undefined sequence the configured substitute is inserted and the
 * conversion is resumed; otherwise the result code is returned as is.
 *
 * NOTE(review): the listing has lost the lines with the return type,
 * the function name and first parameter (listing lines 1437-1438), the
 * declaration of `ret` (1443) and the `case` labels inside both
 * switches (1468, 1479, 1484).  Call sites elsewhere in this file
 * suggest the function is rb_econv_convert(ec, ...) -- confirm against
 * the upstream file.
 */
1439 const unsigned char **input_ptr, const unsigned char *input_stop,
1440 unsigned char **output_ptr, unsigned char *output_stop,
1441 int flags)
1442{
1444
1445 unsigned char empty_buf;
1446 unsigned char *empty_ptr = &empty_buf;
1447
1448 ec->started = 1;
1449
    /* No input supplied: use a zero-length range. */
1450 if (!input_ptr) {
1451 input_ptr = (const unsigned char **)&empty_ptr;
1452 input_stop = empty_ptr;
1453 }
1454
    /* No output supplied: use a zero-length range. */
1455 if (!output_ptr) {
1456 output_ptr = &empty_ptr;
1457 output_stop = empty_ptr;
1458 }
1459
1460 resume:
1461 ret = rb_econv_convert0(ec, input_ptr, input_stop, output_ptr, output_stop, flags);
1462
1463 if (ret == econv_invalid_byte_sequence ||
1464 ret == econv_incomplete_input) {
1465 /* deal with invalid byte sequence */
1466 /* todo: add more alternative behaviors */
1467 switch (ec->flags & ECONV_INVALID_MASK) {
    /* NOTE(review): case label missing here (listing line 1468). */
1469 if (output_replacement_character(ec) == 0)
1470 goto resume;
1471 }
1472 }
1473
1474 if (ret == econv_undefined_conversion) {
1475 /* valid character in source encoding
1476 * but no related character(s) in destination encoding */
1477 /* todo: add more alternative behaviors */
1478 switch (ec->flags & ECONV_UNDEF_MASK) {
    /* NOTE(review): case label missing here (listing line 1479). */
1480 if (output_replacement_character(ec) == 0)
1481 goto resume;
1482 break;
1483
    /* NOTE(review): case label missing here (listing line 1484). */
1485 if (output_hex_charref(ec) == 0)
1486 goto resume;
1487 break;
1488 }
1489 }
1490
1491 return ret;
1492}
1493
/*
 * Returns an encoding name taken from the converter's last transcoding:
 *   - "" when ec->last_tc is NULL;
 *   - the transcoder's src_encoding when its asciicompat_type is
 *     asciicompat_encoder;
 *   - its dst_encoding otherwise.
 *
 * NOTE(review): the line with the function name and parameter list is
 * missing from the listing (line 1495).  A call site in this file,
 * rb_econv_encoding_to_insert_output(ec), matches this body -- confirm.
 */
1494const char *
1496{
1497 rb_transcoding *tc = ec->last_tc;
1498 const rb_transcoder *tr;
1499
1500 if (tc == NULL)
1501 return "";
1502
1503 tr = tc->transcoder;
1504
1505 if (tr->asciicompat_type == asciicompat_encoder)
1506 return tr->src_encoding;
1507 return tr->dst_encoding;
1508}
1509
1510static unsigned char *
1511allocate_converted_string(const char *sname, const char *dname,
1512 const unsigned char *str, size_t len,
1513 unsigned char *caller_dst_buf, size_t caller_dst_bufsize,
1514 size_t *dst_len_ptr)
1515{
1516 unsigned char *dst_str;
1517 size_t dst_len;
1518 size_t dst_bufsize;
1519
1520 rb_econv_t *ec;
1522
1523 const unsigned char *sp;
1524 unsigned char *dp;
1525
1526 if (caller_dst_buf)
1527 dst_bufsize = caller_dst_bufsize;
1528 else if (len == 0)
1529 dst_bufsize = 1;
1530 else
1531 dst_bufsize = len;
1532
1533 ec = rb_econv_open(sname, dname, 0);
1534 if (ec == NULL)
1535 return NULL;
1536 if (caller_dst_buf)
1537 dst_str = caller_dst_buf;
1538 else
1539 dst_str = xmalloc(dst_bufsize);
1540 dst_len = 0;
1541 sp = str;
1542 dp = dst_str+dst_len;
1543 res = rb_econv_convert(ec, &sp, str+len, &dp, dst_str+dst_bufsize, 0);
1544 dst_len = dp - dst_str;
1545 while (res == econv_destination_buffer_full) {
1546 if (SIZE_MAX/2 < dst_bufsize) {
1547 goto fail;
1548 }
1549 dst_bufsize *= 2;
1550 if (dst_str == caller_dst_buf) {
1551 unsigned char *tmp;
1552 tmp = xmalloc(dst_bufsize);
1553 memcpy(tmp, dst_str, dst_bufsize/2);
1554 dst_str = tmp;
1555 }
1556 else {
1557 dst_str = xrealloc(dst_str, dst_bufsize);
1558 }
1559 dp = dst_str+dst_len;
1560 res = rb_econv_convert(ec, &sp, str+len, &dp, dst_str+dst_bufsize, 0);
1561 dst_len = dp - dst_str;
1562 }
1563 if (res != econv_finished) {
1564 goto fail;
1565 }
1566 rb_econv_close(ec);
1567 *dst_len_ptr = dst_len;
1568 return dst_str;
1569
1570 fail:
1571 if (dst_str != caller_dst_buf)
1572 xfree(dst_str);
1573 rb_econv_close(ec);
1574 return NULL;
1575}
1576
/*
 * Queues `len` bytes of `str` (encoded in `str_encoding`) so that they
 * appear in the converter's output.
 *
 * The text is first converted to the encoding reported by
 * rb_econv_encoding_to_insert_output() when that differs from
 * str_encoding.  It is then appended to one of the converter's internal
 * buffers, which is allocated, compacted or enlarged as needed.
 *
 * NOTE(review): the listing has lost the line carrying the function
 * name and first parameter (line 1579) and the `if` that opens the
 * read-again block near the end (line 1677).  Callers in this file use
 * rb_econv_insert_output(ec, str, len, encoding) -- confirm.
 */
1577/* result: 0:success -1:failure */
1578int
1580 const unsigned char *str, size_t len, const char *str_encoding)
1581{
1582 const char *insert_encoding = rb_econv_encoding_to_insert_output(ec);
1583 unsigned char insert_buf[4096];
1584 const unsigned char *insert_str = NULL;
1585 size_t insert_len;
1586
1587 int last_trans_index;
1588 rb_transcoding *tc;
1589
1590 unsigned char **buf_start_p;
1591 unsigned char **data_start_p;
1592 unsigned char **data_end_p;
1593 unsigned char **buf_end_p;
1594
1595 size_t need;
1596
1597 ec->started = 1;
1598
1599 if (len == 0)
1600 return 0;
1601
    /* Bring the text into the encoding expected at the insertion point;
     * the stack buffer is used when the converted text fits. */
1602 if (encoding_equal(insert_encoding, str_encoding)) {
1603 insert_str = str;
1604 insert_len = len;
1605 }
1606 else {
1607 insert_str = allocate_converted_string(str_encoding, insert_encoding,
1608 str, len, insert_buf, sizeof(insert_buf), &insert_len);
1609 if (insert_str == NULL)
1610 return -1;
1611 }
1612
1613 need = insert_len;
1614
    /* Select the buffer that receives the text:
     *  - no transcoders: the converter's input buffer;
     *  - last transcoder is an asciicompat_encoder: the buffer feeding it
     *    (input buffer or previous element's output buffer), with room
     *    for its read-again bytes as well;
     *  - otherwise: the last element's output buffer. */
1615 last_trans_index = ec->num_trans-1;
1616 if (ec->num_trans == 0) {
1617 tc = NULL;
1618 buf_start_p = &ec->in_buf_start;
1619 data_start_p = &ec->in_data_start;
1620 data_end_p = &ec->in_data_end;
1621 buf_end_p = &ec->in_buf_end;
1622 }
1623 else if (ec->elems[last_trans_index].tc->transcoder->asciicompat_type == asciicompat_encoder) {
1624 tc = ec->elems[last_trans_index].tc;
1625 need += tc->readagain_len;
    /* Wrap-around check on the addition above. */
1626 if (need < insert_len)
1627 goto fail;
1628 if (last_trans_index == 0) {
1629 buf_start_p = &ec->in_buf_start;
1630 data_start_p = &ec->in_data_start;
1631 data_end_p = &ec->in_data_end;
1632 buf_end_p = &ec->in_buf_end;
1633 }
1634 else {
1635 rb_econv_elem_t *ee = &ec->elems[last_trans_index-1];
1636 buf_start_p = &ee->out_buf_start;
1637 data_start_p = &ee->out_data_start;
1638 data_end_p = &ee->out_data_end;
1639 buf_end_p = &ee->out_buf_end;
1640 }
1641 }
1642 else {
1643 rb_econv_elem_t *ee = &ec->elems[last_trans_index];
1644 buf_start_p = &ee->out_buf_start;
1645 data_start_p = &ee->out_data_start;
1646 data_end_p = &ee->out_data_end;
1647 buf_end_p = &ee->out_buf_end;
1648 tc = ec->elems[last_trans_index].tc;
1649 }
1650
    /* Make sure `need` bytes fit after the current data. */
1651 if (*buf_start_p == NULL) {
1652 unsigned char *buf = xmalloc(need);
1653 *buf_start_p = buf;
1654 *data_start_p = buf;
1655 *data_end_p = buf;
1656 *buf_end_p = buf+need;
1657 }
1658 else if ((size_t)(*buf_end_p - *data_end_p) < need) {
    /* First slide the live data to the start of the buffer ... */
1659 MEMMOVE(*buf_start_p, *data_start_p, unsigned char, *data_end_p - *data_start_p);
1660 *data_end_p = *buf_start_p + (*data_end_p - *data_start_p);
1661 *data_start_p = *buf_start_p;
    /* ... and enlarge the buffer if that is still not enough. */
1662 if ((size_t)(*buf_end_p - *data_end_p) < need) {
1663 unsigned char *buf;
1664 size_t s = (*data_end_p - *buf_start_p) + need;
1665 if (s < need)
1666 goto fail;
1667 buf = xrealloc(*buf_start_p, s);
1668 *data_start_p = buf;
1669 *data_end_p = buf + (*data_end_p - *buf_start_p);
1670 *buf_start_p = buf;
1671 *buf_end_p = buf + s;
1672 }
1673 }
1674
1675 memcpy(*data_end_p, insert_str, insert_len);
1676 *data_end_p += insert_len;
    /* NOTE(review): the opening `if (...) {` of this block is missing
     * (listing line 1677).  The block appends the transcoding's
     * read-again bytes after the inserted text and clears readagain_len. */
1678 memcpy(*data_end_p, TRANSCODING_READBUF(tc)+tc->recognized_len, tc->readagain_len);
1679 *data_end_p += tc->readagain_len;
1680 tc->readagain_len = 0;
1681 }
1682
    /* Free the converted copy only when it was heap-allocated. */
1683 if (insert_str != str && insert_str != insert_buf)
1684 xfree((void*)insert_str);
1685 return 0;
1686
1687 fail:
1688 if (insert_str != str && insert_str != insert_buf)
1689 xfree((void*)insert_str);
1690 return -1;
1691}
1692
/*
 * Releases a converter: the replacement string (only when
 * ec->replacement_allocated is set), every transcoding together with its
 * output buffer, the input buffer, the element array and finally `ec`.
 *
 * NOTE(review): the line with the function name and parameter list is
 * missing from the listing (line 1694).  Other functions in this file
 * call rb_econv_close(ec) to dispose of a converter -- confirm.
 */
1693void
1695{
1696 int i;
1697
1698 if (ec->replacement_allocated) {
1699 xfree((void *)ec->replacement_str);
1700 }
1701 for (i = 0; i < ec->num_trans; i++) {
1702 rb_transcoding_close(ec->elems[i].tc);
1703 if (ec->elems[i].out_buf_start)
1704 xfree(ec->elems[i].out_buf_start);
1705 }
1706 xfree(ec->in_buf_start);
1707 xfree(ec->elems);
1708 xfree(ec);
1709}
1710
/*
 * Computes a byte count for a converter: sizeof(rb_econv_t), the
 * replacement string when it is owned by the converter, each
 * transcoding (via rb_transcoding_memsize) and its output buffer, the
 * input buffer and the element array (num_allocated entries).
 *
 * NOTE(review): the line with the function name and parameter list is
 * missing from the listing (line 1712); the name used for this block is
 * presumed to be rb_econv_memsize -- confirm against the upstream file.
 */
1711size_t
1713{
1714 size_t size = sizeof(rb_econv_t);
1715 int i;
1716
1717 if (ec->replacement_allocated) {
1718 size += ec->replacement_len;
1719 }
1720 for (i = 0; i < ec->num_trans; i++) {
1721 size += rb_transcoding_memsize(ec->elems[i].tc);
1722
1723 if (ec->elems[i].out_buf_start) {
1724 size += ec->elems[i].out_buf_end - ec->elems[i].out_buf_start;
1725 }
1726 }
1727 size += ec->in_buf_end - ec->in_buf_start;
1728 size += sizeof(rb_econv_elem_t) * ec->num_allocated;
1729
1730 return size;
1731}
1732
/*
 * Returns the first transcoding's readagain_len as an int, or 0 when the
 * converter has no transcoders.  Where size_t is wider than int the value
 * is capped at INT_MAX.
 *
 * NOTE(review): the line with the function name and parameter list is
 * missing from the listing (line 1734); the name used for this block is
 * presumed to be rb_econv_putbackable -- confirm against the upstream file.
 */
1733int
1735{
1736 if (ec->num_trans == 0)
1737 return 0;
1738#if SIZEOF_SIZE_T > SIZEOF_INT
1739 if (ec->elems[0].tc->readagain_len > INT_MAX) return INT_MAX;
1740#endif
1741 return (int)ec->elems[0].tc->readagain_len;
1742}
1743
1744void
1745rb_econv_putback(rb_econv_t *ec, unsigned char *p, int n)
1746{
1747 rb_transcoding *tc;
1748 if (ec->num_trans == 0 || n == 0)
1749 return;
1750 tc = ec->elems[0].tc;
1751 memcpy(p, TRANSCODING_READBUF(tc) + tc->recognized_len + tc->readagain_len - n, n);
1752 tc->readagain_len -= n;
1753}
1754
1758}; /* NOTE(review): only the closing brace of a definition survives here; listing lines 1755-1757 are missing.  Code below uses `struct asciicompat_encoding_t` with an `ascii_compat_name` member, so this is presumably the end of that struct -- restore it from the upstream file. */
1759
1760static int
1761asciicompat_encoding_i(st_data_t key, st_data_t val, st_data_t arg)
1762{
1763 struct asciicompat_encoding_t *data = (struct asciicompat_encoding_t *)arg;
1764 transcoder_entry_t *entry = (transcoder_entry_t *)val;
1765 const rb_transcoder *tr;
1766
1767 if (DECORATOR_P(entry->sname, entry->dname))
1768 return ST_CONTINUE;
1769 tr = load_transcoder_entry(entry);
1770 if (tr && tr->asciicompat_type == asciicompat_decoder) {
1771 data->ascii_compat_name = tr->dst_encoding;
1772 return ST_STOP;
1773 }
1774 return ST_CONTINUE;
1775}
1776
/*
 * Looks up `ascii_incompat_name` in transcoder_table and, when exactly
 * one transcoder is registered for it, returns the name found by
 * asciicompat_encoding_i() (the destination encoding of an
 * asciicompat_decoder transcoder).  Returns NULL when the name is not
 * registered, when the number of entries is not 1, or when the single
 * entry does not qualify.
 *
 * NOTE(review): the listing has lost the line with the function name
 * and parameter list (line 1778) and one statement before
 * `data.ascii_compat_name = NULL` (line 1798).  The name used for this
 * block is presumed to be rb_econv_asciicompat_encoding -- confirm.
 */
1777const char *
1779{
1780 st_data_t v;
1781 st_table *table2;
1782 struct asciicompat_encoding_t data;
1783
1784 if (!st_lookup(transcoder_table, (st_data_t)ascii_incompat_name, &v))
1785 return NULL;
1786 table2 = (st_table *)v;
1787
1788 /*
1789 * Assumption:
1790 * There is at most one transcoder for
1791 * converting from ASCII incompatible encoding.
1792 *
1793 * For ISO-2022-JP, there is ISO-2022-JP -> stateless-ISO-2022-JP and no others.
1794 */
1795 if (table2->num_entries != 1)
1796 return NULL;
1797
1799 data.ascii_compat_name = NULL;
1800 st_foreach(table2, asciicompat_encoding_i, (st_data_t)&data);
1801 return data.ascii_compat_name;
1802}
1803
/*
 * Converts `len` bytes starting at `ss` with `ec` and appends the result
 * to the string `dst`, which is returned.  A nil `dst` is replaced by a
 * new string buffer created with capacity `len`.
 *
 * Each round makes sure that `dst` has room for at least
 * len + max_output more bytes, converts into the spare capacity and then
 * sets the string length; the loop repeats while the converter reports
 * econv_destination_buffer_full.  Raises ArgumentError ("too long
 * string") when the required capacity exceeds LONG_MAX.
 *
 * NOTE(review): the listing has lost the declaration of `res`
 * (line 1809), the statement guarded by `if (ec->destination_encoding)`
 * (line 1815) and one line at the end of the loop body (line 1841).
 */
1804VALUE
1805rb_econv_append(rb_econv_t *ec, const char *ss, long len, VALUE dst, int flags)
1806{
1807 unsigned const char *sp, *se;
1808 unsigned char *ds, *dp, *de;
1810 int max_output;
1811
1812 if (NIL_P(dst)) {
1813 dst = rb_str_buf_new(len);
1814 if (ec->destination_encoding)
1816 }
1817
    /* max_output comes from the last transcoder, or is 1 without one. */
1818 if (ec->last_tc)
1819 max_output = ec->last_tc->transcoder->max_output;
1820 else
1821 max_output = 1;
1822
1823 do {
1824 long dlen = RSTRING_LEN(dst);
1825 if (rb_str_capacity(dst) - dlen < (size_t)len + max_output) {
1826 unsigned long new_capa = (unsigned long)dlen + len + max_output;
1827 if (LONG_MAX < new_capa)
1828 rb_raise(rb_eArgError, "too long string");
    /* Resize to the new size, then restore the logical length. */
1829 rb_str_resize(dst, new_capa);
1830 rb_str_set_len(dst, dlen);
1831 }
1832 sp = (const unsigned char *)ss;
1833 se = sp + len;
1834 ds = (unsigned char *)RSTRING_PTR(dst);
1835 de = ds + rb_str_capacity(dst);
    /* ds and dp both point at the current end of the string data. */
1836 dp = ds += dlen;
1837 res = rb_econv_convert(ec, &sp, se, &dp, de, flags);
    /* Advance past the input consumed in this round. */
1838 len -= (const char *)sp - ss;
1839 ss = (const char *)sp;
1840 rb_str_set_len(dst, dlen + (dp - ds));
1842 } while (res == econv_destination_buffer_full);
1843
1844 return dst;
1845}
1846
1847VALUE
1848rb_econv_substr_append(rb_econv_t *ec, VALUE src, long off, long len, VALUE dst, int flags)
1849{
1850 src = rb_str_new_frozen(src);
1851 dst = rb_econv_append(ec, RSTRING_PTR(src) + off, len, dst, flags);
1852 RB_GC_GUARD(src);
1853 return dst;
1854}
1855
/*
 * Converts the whole of `src` (offset 0, RSTRING_LEN(src) bytes) and
 * appends the result to `dst` via rb_econv_substr_append().
 *
 * NOTE(review): the line with the function name and parameter list is
 * missing from the listing (line 1857); the name used for this block is
 * presumed to be rb_econv_str_append -- confirm against the upstream file.
 */
1856VALUE
1858{
1859 return rb_econv_substr_append(ec, src, 0, RSTRING_LEN(src), dst, flags);
1860}
1861
1862VALUE
1863rb_econv_substr_convert(rb_econv_t *ec, VALUE src, long byteoff, long bytesize, int flags)
1864{
1865 return rb_econv_substr_append(ec, src, byteoff, bytesize, Qnil, flags);
1866}
1867
/*
 * Converts the whole of `src` (offset 0, RSTRING_LEN(src) bytes) with a
 * nil destination, via rb_econv_substr_append().
 *
 * NOTE(review): the line with the function name and parameter list is
 * missing from the listing (line 1869); the name used for this block is
 * presumed to be rb_econv_str_convert -- confirm against the upstream file.
 */
1868VALUE
1870{
1871 return rb_econv_substr_append(ec, src, 0, RSTRING_LEN(src), Qnil, flags);
1872}
1873
1874static int
1875rb_econv_add_converter(rb_econv_t *ec, const char *sname, const char *dname, int n)
1876{
1877 transcoder_entry_t *entry;
1878 const rb_transcoder *tr;
1879
1880 if (ec->started != 0)
1881 return -1;
1882
1883 entry = get_transcoder_entry(sname, dname);
1884 if (!entry)
1885 return -1;
1886
1887 tr = load_transcoder_entry(entry);
1888 if (!tr) return -1;
1889
1890 return rb_econv_add_transcoder_at(ec, tr, n);
1891}
1892
1893static int
1894rb_econv_decorate_at(rb_econv_t *ec, const char *decorator_name, int n)
1895{
1896 return rb_econv_add_converter(ec, "", decorator_name, n);
1897}
1898
1899int
1900rb_econv_decorate_at_first(rb_econv_t *ec, const char *decorator_name)
1901{
1902 const rb_transcoder *tr;
1903
1904 if (ec->num_trans == 0)
1905 return rb_econv_decorate_at(ec, decorator_name, 0);
1906
1907 tr = ec->elems[0].tc->transcoder;
1908
1909 if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding) &&
1910 tr->asciicompat_type == asciicompat_decoder)
1911 return rb_econv_decorate_at(ec, decorator_name, 1);
1912
1913 return rb_econv_decorate_at(ec, decorator_name, 0);
1914}
1915
1916int
1917rb_econv_decorate_at_last(rb_econv_t *ec, const char *decorator_name)
1918{
1919 const rb_transcoder *tr;
1920
1921 if (ec->num_trans == 0)
1922 return rb_econv_decorate_at(ec, decorator_name, 0);
1923
1924 tr = ec->elems[ec->num_trans-1].tc->transcoder;
1925
1926 if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding) &&
1927 tr->asciicompat_type == asciicompat_encoder)
1928 return rb_econv_decorate_at(ec, decorator_name, ec->num_trans-1);
1929
1930 return rb_econv_decorate_at(ec, decorator_name, ec->num_trans);
1931}
1932
/*
 * Removes the newline decorator selected in ec->flags from the
 * conversion path: matching elements are closed and freed, the remaining
 * ones are compacted to the front of ec->elems, and the newline
 * decorator bits are cleared from ec->flags.
 *
 * NOTE(review): the listing has lost the line with the function name
 * and parameter list (line 1934) and the three `case` labels of the
 * switch (lines 1939, 1942, 1945).  The name used for this block is
 * presumed to be rb_econv_binmode -- confirm against the upstream file.
 */
1933void
1935{
1936 const char *dname = 0;
1937
1938 switch (ec->flags & ECONV_NEWLINE_DECORATOR_MASK) {
1940 dname = "universal_newline";
1941 break;
1943 dname = "crlf_newline";
1944 break;
1946 dname = "cr_newline";
1947 break;
1948 }
1949
1950 if (dname) {
    /* NOTE(review): the result of get_transcoder_entry() is dereferenced
     * without a NULL check, whereas rb_econv_add_converter() checks it;
     * presumably the decorator entries are always registered -- confirm. */
1951 const rb_transcoder *transcoder = get_transcoder_entry("", dname)->transcoder;
1952 int num_trans = ec->num_trans;
1953 int i, j = 0;
1954
    /* i scans the original elements, j is the write position for kept ones. */
1955 for (i=0; i < num_trans; i++) {
1956 if (transcoder == ec->elems[i].tc->transcoder) {
1957 rb_transcoding_close(ec->elems[i].tc);
1958 xfree(ec->elems[i].out_buf_start);
1959 ec->num_trans--;
1960 }
1961 else
1962 ec->elems[j++] = ec->elems[i];
1963 }
1964 }
1965
1966 ec->flags &= ~ECONV_NEWLINE_DECORATOR_MASK;
1967}
1968
/*
 * Appends a human-readable description of a conversion to `mesg` (a new
 * empty string is created when `mesg` is nil) and returns it.
 *
 * The text is "<sname> to <dname>", or just the one non-empty name,
 * followed by a comma-separated list of the decorators set in `ecflags`
 * (introduced by " with " when a name was already written).  When
 * nothing at all was written, "no-conversion" is appended.
 *
 * NOTE(review): the rest of the flag mask in the decorator `if`
 * condition is missing from the listing (lines 1988-1990).
 */
1969static VALUE
1970econv_description(const char *sname, const char *dname, int ecflags, VALUE mesg)
1971{
1972 int has_description = 0;
1973
1974 if (NIL_P(mesg))
1975 mesg = rb_str_new(NULL, 0);
1976
1977 if (*sname != '\0' || *dname != '\0') {
1978 if (*sname == '\0')
1979 rb_str_cat2(mesg, dname);
1980 else if (*dname == '\0')
1981 rb_str_cat2(mesg, sname);
1982 else
1983 rb_str_catf(mesg, "%s to %s", sname, dname);
1984 has_description = 1;
1985 }
1986
1987 if (ecflags & (ECONV_NEWLINE_DECORATOR_MASK|
    /* `pre` is empty before the first decorator and "," afterwards. */
1991 const char *pre = "";
1992 if (has_description)
1993 rb_str_cat2(mesg, " with ");
1994 if (ecflags & ECONV_UNIVERSAL_NEWLINE_DECORATOR) {
1995 rb_str_cat2(mesg, pre); pre = ",";
1996 rb_str_cat2(mesg, "universal_newline");
1997 }
1998 if (ecflags & ECONV_CRLF_NEWLINE_DECORATOR) {
1999 rb_str_cat2(mesg, pre); pre = ",";
2000 rb_str_cat2(mesg, "crlf_newline");
2001 }
2002 if (ecflags & ECONV_CR_NEWLINE_DECORATOR) {
2003 rb_str_cat2(mesg, pre); pre = ",";
2004 rb_str_cat2(mesg, "cr_newline");
2005 }
2006 if (ecflags & ECONV_XML_TEXT_DECORATOR) {
2007 rb_str_cat2(mesg, pre); pre = ",";
2008 rb_str_cat2(mesg, "xml_text");
2009 }
2010 if (ecflags & ECONV_XML_ATTR_CONTENT_DECORATOR) {
2011 rb_str_cat2(mesg, pre); pre = ",";
2012 rb_str_cat2(mesg, "xml_attr_content");
2013 }
2014 if (ecflags & ECONV_XML_ATTR_QUOTE_DECORATOR) {
2015 rb_str_cat2(mesg, pre); pre = ",";
2016 rb_str_cat2(mesg, "xml_attr_quote");
2017 }
2018 has_description = 1;
2019 }
2020 if (!has_description) {
2021 rb_str_cat2(mesg, "no-conversion");
2022 }
2023
2024 return mesg;
2025}
2026
2027VALUE
2028rb_econv_open_exc(const char *sname, const char *dname, int ecflags)
2029{
2030 VALUE mesg, exc;
2031 mesg = rb_str_new_cstr("code converter not found (");
2032 econv_description(sname, dname, ecflags, mesg);
2033 rb_str_cat2(mesg, ")");
2034 exc = rb_exc_new3(rb_eConverterNotFoundError, mesg);
2035 return exc;
2036}
2037
/*
 * Builds an exception object describing the error stored in
 * ec->last_error, or returns Qnil when the stored result is not one of
 * the handled error kinds.
 *
 * Two branches are visible:
 *   - one creating an InvalidByteSequenceError with the ivars
 *     error_bytes, readagain_bytes and incomplete_input;
 *   - one creating an UndefinedConversionError with the ivar error_char
 *     (for a UTF-8 source the message may show the character as U+XXXX).
 * Both branches then set the source/destination encoding names and, when
 * an encoding index is found, the encoding objects.
 *
 * NOTE(review): many lines are missing from the listing in this
 * function, including both branch conditions (lines 2042-2043, 2076),
 * trailing arguments of several rb_sprintf calls and the assignments to
 * `idx`.  Restore them from the upstream file before compiling.
 */
2038static VALUE
2039make_econv_exception(rb_econv_t *ec)
2040{
2041 VALUE mesg, exc;
    /* NOTE(review): the opening `if` of the invalid-byte-sequence branch is missing here. */
2044 const char *err = (const char *)ec->last_error.error_bytes_start;
2045 size_t error_len = ec->last_error.error_bytes_len;
2046 VALUE bytes = rb_str_new(err, error_len);
2047 VALUE dumped = rb_str_dump(bytes);
2048 size_t readagain_len = ec->last_error.readagain_len;
2049 VALUE bytes2 = Qnil;
2050 VALUE dumped2;
    /* NOTE(review): the `if` selecting the "incomplete" message is missing (line 2051). */
2052 mesg = rb_sprintf("incomplete %s on %s",
2053 StringValueCStr(dumped),
2055 }
2056 else if (readagain_len) {
    /* The read-again bytes directly follow the error bytes. */
2057 bytes2 = rb_str_new(err+error_len, readagain_len);
2058 dumped2 = rb_str_dump(bytes2);
2059 mesg = rb_sprintf("%s followed by %s on %s",
2060 StringValueCStr(dumped),
2061 StringValueCStr(dumped2),
2063 }
2064 else {
2065 mesg = rb_sprintf("%s on %s",
2066 StringValueCStr(dumped),
2068 }
2069
2070 exc = rb_exc_new3(rb_eInvalidByteSequenceError, mesg);
2071 rb_ivar_set(exc, rb_intern("error_bytes"), bytes);
2072 rb_ivar_set(exc, rb_intern("readagain_bytes"), bytes2);
2073 rb_ivar_set(exc, rb_intern("incomplete_input"), ec->last_error.result == econv_incomplete_input ? Qtrue : Qfalse);
2074 goto set_encs;
2075 }
    /* NOTE(review): the opening `if` of the undefined-conversion branch is missing here (line 2076). */
2077 VALUE bytes = rb_str_new((const char *)ec->last_error.error_bytes_start,
2079 VALUE dumped = Qnil;
2080 int idx;
    /* For a UTF-8 source, try to show the character as a code point. */
2081 if (strcmp(ec->last_error.source_encoding, "UTF-8") == 0) {
2082 rb_encoding *utf8 = rb_utf8_encoding();
2083 const char *start, *end;
2084 int n;
2085 start = (const char *)ec->last_error.error_bytes_start;
2086 end = start + ec->last_error.error_bytes_len;
2087 n = rb_enc_precise_mbclen(start, end, utf8);
2088 if (MBCLEN_CHARFOUND_P(n) &&
2090 unsigned int cc = rb_enc_mbc_to_codepoint(start, end, utf8);
2091 dumped = rb_sprintf("U+%04X", cc);
2092 }
2093 }
    /* Otherwise fall back to a dump of the raw bytes. */
2094 if (dumped == Qnil)
2095 dumped = rb_str_dump(bytes);
2096 if (strcmp(ec->last_error.source_encoding,
2097 ec->source_encoding_name) == 0 &&
2099 ec->destination_encoding_name) == 0) {
2100 mesg = rb_sprintf("%s from %s to %s",
2101 StringValueCStr(dumped),
2104 }
2105 else {
2106 int i;
2107 mesg = rb_sprintf("%s to %s in conversion from %s",
2108 StringValueCStr(dumped),
    /* Append the destination of every non-decorator step of the path. */
2111 for (i = 0; i < ec->num_trans; i++) {
2112 const rb_transcoder *tr = ec->elems[i].tc->transcoder;
2113 if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding))
2114 rb_str_catf(mesg, " to %s",
2115 ec->elems[i].tc->transcoder->dst_encoding);
2116 }
2117 }
2118 exc = rb_exc_new3(rb_eUndefinedConversionError, mesg);
2120 if (0 <= idx)
2121 rb_enc_associate_index(bytes, idx);
2122 rb_ivar_set(exc, rb_intern("error_char"), bytes);
2123 goto set_encs;
2124 }
2125 return Qnil;
2126
2127 set_encs:
2128 rb_ivar_set(exc, rb_intern("source_encoding_name"), rb_str_new2(ec->last_error.source_encoding));
2129 rb_ivar_set(exc, rb_intern("destination_encoding_name"), rb_str_new2(ec->last_error.destination_encoding));
    /* NOTE(review): the assignments to `idx` before each check below are missing (lines 2130, 2133). */
2131 if (0 <= idx)
2132 rb_ivar_set(exc, rb_intern("source_encoding"), rb_enc_from_encoding(rb_enc_from_index(idx)));
2134 if (0 <= idx)
2135 rb_ivar_set(exc, rb_intern("destination_encoding"), rb_enc_from_encoding(rb_enc_from_index(idx)));
2136 return exc;
2137}
2138
2139static void
2140more_output_buffer(
2141 VALUE destination,
2142 unsigned char *(*resize_destination)(VALUE, size_t, size_t),
2143 int max_output,
2144 unsigned char **out_start_ptr,
2145 unsigned char **out_pos,
2146 unsigned char **out_stop_ptr)
2147{
2148 size_t len = (*out_pos - *out_start_ptr);
2149 size_t new_len = (len + max_output) * 2;
2150 *out_start_ptr = resize_destination(destination, len, new_len);
2151 *out_pos = *out_start_ptr + len;
2152 *out_stop_ptr = *out_start_ptr + new_len;
2153}
2154
/*
 * Makes sure the converter has a replacement string.  Does nothing when
 * ec->replacement_str is already set.  Otherwise the default replacement
 * is stored in ec->replacement_{str,len,enc} with replacement_allocated
 * set to 0: either the value from get_replacement_character() for the
 * encoding `ins_enc`, or "?" when `ins_enc` is the empty string.
 * Always returns 0.
 *
 * NOTE(review): the statement that assigns `ins_enc` is missing from the
 * listing (line 2168).
 */
2155static int
2156make_replacement(rb_econv_t *ec)
2157{
2158 rb_transcoding *tc;
2159 const rb_transcoder *tr;
2160 const unsigned char *replacement;
2161 const char *repl_enc;
2162 const char *ins_enc;
2163 size_t len;
2164
2165 if (ec->replacement_str)
2166 return 0;
2167
2169
2170 tc = ec->last_tc;
2171 if (*ins_enc) {
2172 tr = tc->transcoder;
    /* NOTE(review): the result of rb_enc_find() is discarded; presumably it
     * is called for a side effect such as loading the encoding -- confirm. */
2173 rb_enc_find(tr->dst_encoding);
2174 replacement = (const unsigned char *)get_replacement_character(ins_enc, &len, &repl_enc);
2175 }
2176 else {
2177 replacement = (unsigned char *)"?";
2178 len = 1;
2179 repl_enc = "";
2180 }
2181
2182 ec->replacement_str = replacement;
2183 ec->replacement_len = len;
2184 ec->replacement_enc = repl_enc;
2185 ec->replacement_allocated = 0;
2186 return 0;
2187}
2188
/*
 * Installs a caller-supplied replacement string on the converter.
 *
 * The bytes are copied into a fresh allocation as they are when
 * `encname2` is empty or equal to `encname`; otherwise they are converted
 * from `encname` to `encname2` with allocate_converted_string().  Any
 * replacement string previously owned by the converter is freed.
 * Returns 0 on success and -1 when the conversion fails.
 *
 * NOTE(review): the listing has lost the line with the function name
 * and first parameter (line 2190) and the statement that assigns
 * `encname2` (line 2197).  A caller in this file uses
 * rb_econv_set_replacement(ec, str, len, encname) -- confirm.
 */
2189int
2191 const unsigned char *str, size_t len, const char *encname)
2192{
2193 unsigned char *str2;
2194 size_t len2;
2195 const char *encname2;
2196
2198
2199 if (!*encname2 || encoding_equal(encname, encname2)) {
2200 str2 = xmalloc(len);
2201 MEMCPY(str2, str, unsigned char, len); /* xxx: str may be invalid */
2202 len2 = len;
2203 encname2 = encname;
2204 }
2205 else {
2206 str2 = allocate_converted_string(encname, encname2, str, len, NULL, 0, &len2);
2207 if (!str2)
2208 return -1;
2209 }
2210
    /* Drop the previous string only if the converter owned it. */
2211 if (ec->replacement_allocated) {
2212 xfree((void *)ec->replacement_str);
2213 }
2214 ec->replacement_allocated = 1;
2215 ec->replacement_str = str2;
2216 ec->replacement_len = len2;
2217 ec->replacement_enc = encname2;
2218 return 0;
2219}
2220
/*
 * Ensures a replacement string exists (make_replacement) and then
 * performs a step whose result is stored in `ret`.  Returns 0 on success
 * and -1 when either step reports -1.
 *
 * NOTE(review): the statement that assigns `ret` is missing from the
 * listing (line 2229); presumably it inserts the replacement string into
 * the output -- confirm against the upstream file.
 */
2221static int
2222output_replacement_character(rb_econv_t *ec)
2223{
2224 int ret;
2225
2226 if (make_replacement(ec) == -1)
2227 return -1;
2228
2230 if (ret == -1)
2231 return -1;
2232
2233 return 0;
2234}
2235
2236#if 1
2237#define hash_fallback rb_hash_aref
2238
2239static VALUE
2240proc_fallback(VALUE fallback, VALUE c)
2241{
2242 return rb_proc_call(fallback, rb_ary_new4(1, &c));
2243}
2244
2245static VALUE
2246method_fallback(VALUE fallback, VALUE c)
2247{
2248 return rb_method_call(1, &c, fallback);
2249}
2250
2251static VALUE
2252aref_fallback(VALUE fallback, VALUE c)
2253{
2254 return rb_funcallv_public(fallback, idAREF, 1, &c);
2255}
2256
/*
 * Converts the bytes in [*in_pos, in_stop) from src_encoding to
 * dst_encoding, writing to [*out_pos, out_stop) and growing the
 * destination through resize_destination when it fills up.
 *
 * When `ecopts` is a hash with a :fallback entry, an undefined
 * conversion is first offered to the fallback (Hash, Proc, Method or any
 * object answering []); a non-nil answer is inserted into the output and
 * the conversion resumes.  Conversion errors are raised as exceptions
 * built by make_econv_exception() after the converter has been closed.
 * Raises the exception from rb_econv_open_exc() when no converter can be
 * opened.
 *
 * NOTE(review): the listing has lost the declaration of `ret`
 * (line 2269), the assignment of fallback_func for the Hash case
 * (line 2283), the trailing arguments of rb_enc_str_new (lines
 * 2304-2305) and the end of the error condition (line 2320).
 */
2257static void
2258transcode_loop(const unsigned char **in_pos, unsigned char **out_pos,
2259 const unsigned char *in_stop, unsigned char *out_stop,
2260 VALUE destination,
2261 unsigned char *(*resize_destination)(VALUE, size_t, size_t),
2262 const char *src_encoding,
2263 const char *dst_encoding,
2264 int ecflags,
2265 VALUE ecopts)
2266{
2267 rb_econv_t *ec;
2268 rb_transcoding *last_tc;
2270 unsigned char *out_start = *out_pos;
2271 int max_output;
2272 VALUE exc;
2273 VALUE fallback = Qnil;
2274 VALUE (*fallback_func)(VALUE, VALUE) = 0;
2275
2276 ec = rb_econv_open_opts(src_encoding, dst_encoding, ecflags, ecopts);
2277 if (!ec)
2278 rb_exc_raise(rb_econv_open_exc(src_encoding, dst_encoding, ecflags));
2279
    /* Pick the adapter matching the kind of :fallback object. */
2280 if (!NIL_P(ecopts) && RB_TYPE_P(ecopts, T_HASH)) {
2281 fallback = rb_hash_aref(ecopts, sym_fallback);
2282 if (RB_TYPE_P(fallback, T_HASH)) {
2284 }
2285 else if (rb_obj_is_proc(fallback)) {
2286 fallback_func = proc_fallback;
2287 }
2288 else if (rb_obj_is_method(fallback)) {
2289 fallback_func = method_fallback;
2290 }
2291 else {
2292 fallback_func = aref_fallback;
2293 }
2294 }
2295 last_tc = ec->last_tc;
2296 max_output = last_tc ? last_tc->transcoder->max_output : 1;
2297
2298 resume:
2299 ret = rb_econv_convert(ec, in_pos, in_stop, out_pos, out_stop, 0);
2300
2301 if (!NIL_P(fallback) && ret == econv_undefined_conversion) {
    /* Hand the offending character to the fallback as a string. */
2302 VALUE rep = rb_enc_str_new(
2303 (const char *)ec->last_error.error_bytes_start,
2306 rep = (*fallback_func)(fallback, rep);
2307 if (rep != Qundef && !NIL_P(rep)) {
2308 StringValue(rep);
2309 ret = rb_econv_insert_output(ec, (const unsigned char *)RSTRING_PTR(rep),
2310 RSTRING_LEN(rep), rb_enc_name(rb_enc_get(rep)));
2311 if ((int)ret == -1) {
    /* NOTE(review): this raise happens without rb_econv_close(ec), unlike
     * the error path below; `ec` looks leaked here -- confirm. */
2312 rb_raise(rb_eArgError, "too big fallback string");
2313 }
2314 goto resume;
2315 }
2316 }
2317
2318 if (ret == econv_invalid_byte_sequence ||
2319 ret == econv_incomplete_input ||
    /* Build the exception first: it reads ec->last_error. */
2321 exc = make_econv_exception(ec);
2322 rb_econv_close(ec);
2323 rb_exc_raise(exc);
2324 }
2325
2326 if (ret == econv_destination_buffer_full) {
2327 more_output_buffer(destination, resize_destination, max_output, &out_start, out_pos, &out_stop);
2328 goto resume;
2329 }
2330
2331 rb_econv_close(ec);
2332 return;
2333}
2334#else
/*
 * Alternative transcode_loop, compiled only when the `#if 1` above is
 * changed.  It feeds the converter one input byte at a time with
 * ECONV_PARTIAL_INPUT and finishes with a call without input.
 *
 * NOTE(review): the listing has lost the declaration of `ret`
 * (line 2348), its initial assignment (line 2361) and the `case` labels
 * of the switch (lines 2382-2384, 2390, 2394).
 */
2335/* sample transcode_loop implementation in byte-by-byte stream style */
2336static void
2337transcode_loop(const unsigned char **in_pos, unsigned char **out_pos,
2338 const unsigned char *in_stop, unsigned char *out_stop,
2339 VALUE destination,
2340 unsigned char *(*resize_destination)(VALUE, size_t, size_t),
2341 const char *src_encoding,
2342 const char *dst_encoding,
2343 int ecflags,
2344 VALUE ecopts)
2345{
2346 rb_econv_t *ec;
2347 rb_transcoding *last_tc;
2349 unsigned char *out_start = *out_pos;
2350 const unsigned char *ptr;
2351 int max_output;
2352 VALUE exc;
2353
2354 ec = rb_econv_open_opts(src_encoding, dst_encoding, ecflags, ecopts);
2355 if (!ec)
2356 rb_exc_raise(rb_econv_open_exc(src_encoding, dst_encoding, ecflags));
2357
2358 last_tc = ec->last_tc;
2359 max_output = last_tc ? last_tc->transcoder->max_output : 1;
2360
2362 ptr = *in_pos;
2363 while (ret != econv_finished) {
2364 unsigned char input_byte;
2365 const unsigned char *p = &input_byte;
2366
2367 if (ret == econv_source_buffer_empty) {
2368 if (ptr < in_stop) {
    /* Feed exactly one byte. */
2369 input_byte = *ptr;
2370 ret = rb_econv_convert(ec, &p, p+1, out_pos, out_stop, ECONV_PARTIAL_INPUT);
2371 }
2372 else {
    /* Input exhausted: call without input and without ECONV_PARTIAL_INPUT. */
2373 ret = rb_econv_convert(ec, NULL, NULL, out_pos, out_stop, 0);
2374 }
2375 }
2376 else {
2377 ret = rb_econv_convert(ec, NULL, NULL, out_pos, out_stop, ECONV_PARTIAL_INPUT);
2378 }
    /* Advance only if the converter consumed the byte. */
2379 if (&input_byte != p)
2380 ptr += p - &input_byte;
2381 switch (ret) {
2385 exc = make_econv_exception(ec);
2386 rb_econv_close(ec);
2387 rb_exc_raise(exc);
2388 break;
2389
2391 more_output_buffer(destination, resize_destination, max_output, &out_start, out_pos, &out_stop);
2392 break;
2393
2395 break;
2396
2397 case econv_finished:
2398 break;
2399 }
2400 }
2401 rb_econv_close(ec);
2402 *in_pos = in_stop;
2403 return;
2404}
2405#endif
2406
2407
2408/*
2409 * String-specific code
2410 */
2411
2412static unsigned char *
2413str_transcoding_resize(VALUE destination, size_t len, size_t new_len)
2414{
2415 rb_str_resize(destination, new_len);
2416 return (unsigned char *)RSTRING_PTR(destination);
2417}
2418
/*
 * Translates the conversion options found in the hash `opt` into
 * converter flag bits, starting from `ecflags`, and returns the result.
 *
 * Keys read: :invalid, :undef, :replace, :xml, :newline (when
 * ENABLE_ECONV_NEWLINE_OPTION is defined), :universal_newline,
 * :crlf_newline and :cr_newline.  Unsupported values raise ArgumentError.
 *
 * NOTE(review): the listing has lost the statements that set the flags
 * for :xml => :text / :attr (lines 2453, 2456), for :newline =>
 * :universal / :crlf (lines 2472, 2475) and for :universal_newline
 * (line 2497).
 */
2419static int
2420econv_opts(VALUE opt, int ecflags)
2421{
2422 VALUE v;
2423 int newlineflag = 0;
2424
2425 v = rb_hash_aref(opt, sym_invalid);
2426 if (NIL_P(v)) {
2427 }
2428 else if (v==sym_replace) {
2429 ecflags |= ECONV_INVALID_REPLACE;
2430 }
2431 else {
2432 rb_raise(rb_eArgError, "unknown value for invalid character option");
2433 }
2434
2435 v = rb_hash_aref(opt, sym_undef);
2436 if (NIL_P(v)) {
2437 }
2438 else if (v==sym_replace) {
2439 ecflags |= ECONV_UNDEF_REPLACE;
2440 }
2441 else {
2442 rb_raise(rb_eArgError, "unknown value for undefined character option");
2443 }
2444
    /* A :replace entry turns on undef replacement unless invalid
     * replacement is already set. */
2445 v = rb_hash_aref(opt, sym_replace);
2446 if (!NIL_P(v) && !(ecflags & ECONV_INVALID_REPLACE)) {
2447 ecflags |= ECONV_UNDEF_REPLACE;
2448 }
2449
2450 v = rb_hash_aref(opt, sym_xml);
2451 if (!NIL_P(v)) {
2452 if (v==sym_text) {
2454 }
2455 else if (v==sym_attr) {
2457 }
2458 else if (RB_TYPE_P(v, T_SYMBOL)) {
2459 rb_raise(rb_eArgError, "unexpected value for xml option: %"PRIsVALUE, rb_sym2str(v));
2460 }
2461 else {
2462 rb_raise(rb_eArgError, "unexpected value for xml option");
2463 }
2464 }
2465
2466#ifdef ENABLE_ECONV_NEWLINE_OPTION
    /* newlineflag bit 1 (value 2) records that :newline was given. */
2467 v = rb_hash_aref(opt, sym_newline);
2468 if (!NIL_P(v)) {
2469 newlineflag = 2;
2470 ecflags &= ~ECONV_NEWLINE_DECORATOR_MASK;
2471 if (v == sym_universal) {
2473 }
2474 else if (v == sym_crlf) {
2476 }
2477 else if (v == sym_cr) {
2478 ecflags |= ECONV_CR_NEWLINE_DECORATOR;
2479 }
2480 else if (v == sym_lf) {
2481 /* ecflags |= ECONV_LF_NEWLINE_DECORATOR; */
2482 }
2483 else if (SYMBOL_P(v)) {
2484 rb_raise(rb_eArgError, "unexpected value for newline option: %"PRIsVALUE,
2485 rb_sym2str(v));
2486 }
2487 else {
2488 rb_raise(rb_eArgError, "unexpected value for newline option");
2489 }
2490 }
2491#endif
2492 {
2493 int setflags = 0;
2494
    /* newlineflag bit 0 (value 1) records that any of the three
     * *_newline keys was present (non-nil), even with a false value. */
2495 v = rb_hash_aref(opt, sym_universal_newline);
2496 if (RTEST(v))
2498 newlineflag |= !NIL_P(v);
2499
2500 v = rb_hash_aref(opt, sym_crlf_newline);
2501 if (RTEST(v))
2502 setflags |= ECONV_CRLF_NEWLINE_DECORATOR;
2503 newlineflag |= !NIL_P(v);
2504
2505 v = rb_hash_aref(opt, sym_cr_newline);
2506 if (RTEST(v))
2507 setflags |= ECONV_CR_NEWLINE_DECORATOR;
2508 newlineflag |= !NIL_P(v);
2509
    /* 1: only *_newline keys given -> they replace the newline bits.
     * 3: both styles given -> :newline wins and a warning is issued. */
2510 switch (newlineflag) {
2511 case 1:
2512 ecflags &= ~ECONV_NEWLINE_DECORATOR_MASK;
2513 ecflags |= setflags;
2514 break;
2515
2516 case 3:
2517 rb_warning(":newline option preceds other newline options");
2518 break;
2519 }
2520 }
2521
2522 return ecflags;
2523}
2524
2525int
2526rb_econv_prepare_options(VALUE opthash, VALUE *opts, int ecflags)
2527{
2528 VALUE newhash = Qnil;
2529 VALUE v;
2530
2531 if (NIL_P(opthash)) {
2532 *opts = Qnil;
2533 return ecflags;
2534 }
2535 ecflags = econv_opts(opthash, ecflags);
2536
2537 v = rb_hash_aref(opthash, sym_replace);
2538 if (!NIL_P(v)) {
2539 StringValue(v);
2541 VALUE dumped = rb_str_dump(v);
2542 rb_raise(rb_eArgError, "replacement string is broken: %s as %s",
2543 StringValueCStr(dumped),
2545 }
2546 v = rb_str_new_frozen(v);
2547 newhash = rb_hash_new();
2548 rb_hash_aset(newhash, sym_replace, v);
2549 }
2550
2551 v = rb_hash_aref(opthash, sym_fallback);
2552 if (!NIL_P(v)) {
2554 if (NIL_P(h)
2556 : (v = h, 1)) {
2557 if (NIL_P(newhash))
2558 newhash = rb_hash_new();
2559 rb_hash_aset(newhash, sym_fallback, v);
2560 }
2561 }
2562
2563 if (!NIL_P(newhash))
2564 rb_hash_freeze(newhash);
2565 *opts = newhash;
2566
2567 return ecflags;
2568}
2569
2570int
2572{
2573 return rb_econv_prepare_options(opthash, opts, 0);
2574}
2575
2576rb_econv_t *
2577rb_econv_open_opts(const char *source_encoding, const char *destination_encoding, int ecflags, VALUE opthash)
2578{
2579 rb_econv_t *ec;
2580 VALUE replacement;
2581
2582 if (NIL_P(opthash)) {
2583 replacement = Qnil;
2584 }
2585 else {
2586 if (!RB_TYPE_P(opthash, T_HASH) || !OBJ_FROZEN(opthash))
2587 rb_bug("rb_econv_open_opts called with invalid opthash");
2588 replacement = rb_hash_aref(opthash, sym_replace);
2589 }
2590
2591 ec = rb_econv_open(source_encoding, destination_encoding, ecflags);
2592 if (!ec)
2593 return ec;
2594
2595 if (!NIL_P(replacement)) {
2596 int ret;
2597 rb_encoding *enc = rb_enc_get(replacement);
2598
2599 ret = rb_econv_set_replacement(ec,
2600 (const unsigned char *)RSTRING_PTR(replacement),
2601 RSTRING_LEN(replacement),
2602 rb_enc_name(enc));
2603 if (ret == -1) {
2604 rb_econv_close(ec);
2605 return NULL;
2606 }
2607 }
2608 return ec;
2609}
2610
2611static int
2612enc_arg(VALUE *arg, const char **name_p, rb_encoding **enc_p)
2613{
2614 rb_encoding *enc;
2615 const char *n;
2616 int encidx;
2617 VALUE encval;
2618
2619 if (((encidx = rb_to_encoding_index(encval = *arg)) < 0) ||
2620 !(enc = rb_enc_from_index(encidx))) {
2621 enc = NULL;
2622 encidx = 0;
2623 n = StringValueCStr(*arg);
2624 }
2625 else {
2626 n = rb_enc_name(enc);
2627 }
2628
2629 *name_p = n;
2630 *enc_p = enc;
2631
2632 return encidx;
2633}
2634
2635static int
2636str_transcode_enc_args(VALUE str, VALUE *arg1, VALUE *arg2,
2637 const char **sname_p, rb_encoding **senc_p,
2638 const char **dname_p, rb_encoding **denc_p)
2639{
2640 rb_encoding *senc, *denc;
2641 const char *sname, *dname;
2642 int sencidx, dencidx;
2643
2644 dencidx = enc_arg(arg1, &dname, &denc);
2645
2646 if (NIL_P(*arg2)) {
2647 sencidx = rb_enc_get_index(str);
2648 senc = rb_enc_from_index(sencidx);
2649 sname = rb_enc_name(senc);
2650 }
2651 else {
2652 sencidx = enc_arg(arg2, &sname, &senc);
2653 }
2654
2655 *sname_p = sname;
2656 *senc_p = senc;
2657 *dname_p = dname;
2658 *denc_p = denc;
2659 return dencidx;
2660}
2661
2662static int
2663str_transcode0(int argc, VALUE *argv, VALUE *self, int ecflags, VALUE ecopts)
2664{
2665 VALUE dest;
2666 VALUE str = *self;
2667 VALUE arg1, arg2;
2668 long blen, slen;
2669 unsigned char *buf, *bp, *sp;
2670 const unsigned char *fromp;
2671 rb_encoding *senc, *denc;
2672 const char *sname, *dname;
2673 int dencidx;
2674 int explicitly_invalid_replace = TRUE;
2675
2676 rb_check_arity(argc, 0, 2);
2677
2678 if (argc == 0) {
2679 arg1 = rb_enc_default_internal();
2680 if (NIL_P(arg1)) {
2681 if (!ecflags) return -1;
2682 arg1 = rb_obj_encoding(str);
2683 }
2684 if (!(ecflags & ECONV_INVALID_MASK)) {
2685 explicitly_invalid_replace = FALSE;
2686 }
2688 }
2689 else {
2690 arg1 = argv[0];
2691 }
2692 arg2 = argc<=1 ? Qnil : argv[1];
2693 dencidx = str_transcode_enc_args(str, &arg1, &arg2, &sname, &senc, &dname, &denc);
2694
2695 if ((ecflags & (ECONV_NEWLINE_DECORATOR_MASK|
2699 if (senc && senc == denc) {
2700 if ((ecflags & ECONV_INVALID_MASK) && explicitly_invalid_replace) {
2701 VALUE rep = Qnil;
2702 if (!NIL_P(ecopts)) {
2703 rep = rb_hash_aref(ecopts, sym_replace);
2704 }
2705 dest = rb_enc_str_scrub(senc, str, rep);
2706 if (NIL_P(dest)) dest = str;
2707 *self = dest;
2708 return dencidx;
2709 }
2710 return NIL_P(arg2) ? -1 : dencidx;
2711 }
2712 if (senc && denc && rb_enc_asciicompat(senc) && rb_enc_asciicompat(denc)) {
2714 return dencidx;
2715 }
2716 }
2717 if (encoding_equal(sname, dname)) {
2718 return NIL_P(arg2) ? -1 : dencidx;
2719 }
2720 }
2721 else {
2722 if (senc && denc && !rb_enc_asciicompat(senc) && !rb_enc_asciicompat(denc)) {
2723 rb_encoding *utf8 = rb_utf8_encoding();
2724 str = rb_str_conv_enc(str, senc, utf8);
2725 senc = utf8;
2726 sname = "UTF-8";
2727 }
2728 if (encoding_equal(sname, dname)) {
2729 sname = "";
2730 dname = "";
2731 }
2732 }
2733
2734 fromp = sp = (unsigned char *)RSTRING_PTR(str);
2735 slen = RSTRING_LEN(str);
2736 blen = slen + 30; /* len + margin */
2737 dest = rb_str_tmp_new(blen);
2738 bp = (unsigned char *)RSTRING_PTR(dest);
2739
2740 transcode_loop(&fromp, &bp, (sp+slen), (bp+blen), dest, str_transcoding_resize, sname, dname, ecflags, ecopts);
2741 if (fromp != sp+slen) {
2742 rb_raise(rb_eArgError, "not fully converted, %"PRIdPTRDIFF" bytes left", sp+slen-fromp);
2743 }
2744 buf = (unsigned char *)RSTRING_PTR(dest);
2745 *bp = '\0';
2746 rb_str_set_len(dest, bp - buf);
2747
2748 /* set encoding */
2749 if (!denc) {
2750 dencidx = rb_define_dummy_encoding(dname);
2751 RB_GC_GUARD(arg1);
2752 RB_GC_GUARD(arg2);
2753 }
2754 *self = dest;
2755
2756 return dencidx;
2757}
2758
2759static int
2760str_transcode(int argc, VALUE *argv, VALUE *self)
2761{
2762 VALUE opt;
2763 int ecflags = 0;
2764 VALUE ecopts = Qnil;
2765
2766 argc = rb_scan_args(argc, argv, "02:", NULL, NULL, &opt);
2767 if (!NIL_P(opt)) {
2768 ecflags = rb_econv_prepare_opts(opt, &ecopts);
2769 }
2770 return str_transcode0(argc, argv, self, ecflags, ecopts);
2771}
2772
2773static inline VALUE
2774str_encode_associate(VALUE str, int encidx)
2775{
2776 int cr = 0;
2777
2778 rb_enc_associate_index(str, encidx);
2779
2780 /* transcoded string never be broken. */
2782 rb_str_coderange_scan_restartable(RSTRING_PTR(str), RSTRING_END(str), 0, &cr);
2783 }
2784 else {
2786 }
2788 return str;
2789}
2790
2791/*
2792 * call-seq:
2793 * str.encode!(encoding, **options) -> str
2794 * str.encode!(dst_encoding, src_encoding, **options) -> str
2795 *
2796 * The first form transcodes the contents of <i>str</i> from
2797 * str.encoding to +encoding+.
2798 * The second form transcodes the contents of <i>str</i> from
2799 * src_encoding to dst_encoding.
2800 * The +options+ keyword arguments give details for conversion. See String#encode
2801 * for details.
2802 * Returns the string even if no changes were made.
2803 */
2804
2805static VALUE
2806str_encode_bang(int argc, VALUE *argv, VALUE str)
2807{
2808 VALUE newstr;
2809 int encidx;
2810
2812
2813 newstr = str;
2814 encidx = str_transcode(argc, argv, &newstr);
2815
2816 if (encidx < 0) return str;
2817 if (newstr == str) {
2818 rb_enc_associate_index(str, encidx);
2819 return str;
2820 }
2821 rb_str_shared_replace(str, newstr);
2822 return str_encode_associate(str, encidx);
2823}
2824
2825static VALUE encoded_dup(VALUE newstr, VALUE str, int encidx);
2826
2827/*
2828 * call-seq:
2829 * str.encode(encoding, **options) -> str
2830 * str.encode(dst_encoding, src_encoding, **options) -> str
2831 * str.encode(**options) -> str
2832 *
2833 * The first form returns a copy of +str+ transcoded
2834 * to encoding +encoding+.
2835 * The second form returns a copy of +str+ transcoded
2836 * from src_encoding to dst_encoding.
2837 * The last form returns a copy of +str+ transcoded to
2838 * <tt>Encoding.default_internal</tt>.
2839 *
2840 * By default, the first and second form raise
2841 * Encoding::UndefinedConversionError for characters that are
2842 * undefined in the destination encoding, and
2843 * Encoding::InvalidByteSequenceError for invalid byte sequences
2844 * in the source encoding. The last form by default does not raise
2845 * exceptions but uses replacement strings.
2846 *
2847 * The +options+ keyword arguments give details for conversion.
2848 * The arguments are:
2849 *
2850 * :invalid ::
2851 * If the value is +:replace+, #encode replaces invalid byte sequences in
2852 * +str+ with the replacement character. The default is to raise the
2853 * Encoding::InvalidByteSequenceError exception
2854 * :undef ::
2855 * If the value is +:replace+, #encode replaces characters which are
2856 * undefined in the destination encoding with the replacement character.
2857 * The default is to raise the Encoding::UndefinedConversionError.
2858 * :replace ::
2859 * Sets the replacement string to the given value. The default replacement
2860 * string is "\uFFFD" for Unicode encoding forms, and "?" otherwise.
2861 * :fallback ::
2862 * Sets the replacement string by the given object for undefined
2863 * character. The object should be a Hash, a Proc, a Method, or an
2864 * object which has [] method.
2865 * Its key is an undefined character encoded in the source encoding
2866 * of current transcoder. Its value can be any encoding until it
2867 * can be converted into the destination encoding of the transcoder.
2868 * :xml ::
2869 * The value must be +:text+ or +:attr+.
2870 * If the value is +:text+ #encode replaces undefined characters with their
2871 * (upper-case hexadecimal) numeric character references. '&', '<', and '>'
2872 * are converted to "&amp;", "&lt;", and "&gt;", respectively.
2873 * If the value is +:attr+, #encode also quotes the replacement result
2874 * (using '"'), and replaces '"' with "&quot;".
2875 * :cr_newline ::
2876 * Replaces LF ("\n") with CR ("\r") if value is true.
2877 * :crlf_newline ::
2878 * Replaces LF ("\n") with CRLF ("\r\n") if value is true.
2879 * :universal_newline ::
2880 * Replaces CRLF ("\r\n") and CR ("\r") with LF ("\n") if value is true.
2881 */
2882
2883static VALUE
2884str_encode(int argc, VALUE *argv, VALUE str)
2885{
2886 VALUE newstr = str;
2887 int encidx = str_transcode(argc, argv, &newstr);
2888 return encoded_dup(newstr, str, encidx);
2889}
2890
2891VALUE
2892rb_str_encode(VALUE str, VALUE to, int ecflags, VALUE ecopts)
2893{
2894 int argc = 1;
2895 VALUE *argv = &to;
2896 VALUE newstr = str;
2897 int encidx = str_transcode0(argc, argv, &newstr, ecflags, ecopts);
2898 return encoded_dup(newstr, str, encidx);
2899}
2900
2901static VALUE
2902encoded_dup(VALUE newstr, VALUE str, int encidx)
2903{
2904 if (encidx < 0) return rb_str_dup(str);
2905 if (newstr == str) {
2906 newstr = rb_str_dup(str);
2907 rb_enc_associate_index(newstr, encidx);
2908 return newstr;
2909 }
2910 else {
2911 RBASIC_SET_CLASS(newstr, rb_obj_class(str));
2912 }
2913 return str_encode_associate(newstr, encidx);
2914}
2915
2916/*
2917 * Document-class: Encoding::Converter
2918 *
2919 * Encoding conversion class.
2920 */
2921static void
2922econv_free(void *ptr)
2923{
2924 rb_econv_t *ec = ptr;
2925 rb_econv_close(ec);
2926}
2927
2928static size_t
2929econv_memsize(const void *ptr)
2930{
2931 return sizeof(rb_econv_t);
2932}
2933
2934static const rb_data_type_t econv_data_type = {
2935 "econv",
2936 {0, econv_free, econv_memsize,},
2938};
2939
2940static VALUE
2941econv_s_allocate(VALUE klass)
2942{
2943 return TypedData_Wrap_Struct(klass, &econv_data_type, NULL);
2944}
2945
2946static rb_encoding *
2947make_dummy_encoding(const char *name)
2948{
2949 rb_encoding *enc;
2950 int idx;
2952 enc = rb_enc_from_index(idx);
2953 return enc;
2954}
2955
2956static rb_encoding *
2957make_encoding(const char *name)
2958{
2959 rb_encoding *enc;
2960 enc = rb_enc_find(name);
2961 if (!enc)
2962 enc = make_dummy_encoding(name);
2963 return enc;
2964}
2965
2966static VALUE
2967make_encobj(const char *name)
2968{
2969 return rb_enc_from_encoding(make_encoding(name));
2970}
2971
2972/*
2973 * call-seq:
2974 * Encoding::Converter.asciicompat_encoding(string) -> encoding or nil
2975 * Encoding::Converter.asciicompat_encoding(encoding) -> encoding or nil
2976 *
2977 * Returns the corresponding ASCII compatible encoding.
2978 *
2979 * Returns nil if the argument is an ASCII compatible encoding.
2980 *
2981 * "corresponding ASCII compatible encoding" is an ASCII compatible encoding which
2982 * can represents exactly the same characters as the given ASCII incompatible encoding.
2983 * So, no conversion undefined error occurs when converting between the two encodings.
2984 *
2985 * Encoding::Converter.asciicompat_encoding("ISO-2022-JP") #=> #<Encoding:stateless-ISO-2022-JP>
2986 * Encoding::Converter.asciicompat_encoding("UTF-16BE") #=> #<Encoding:UTF-8>
2987 * Encoding::Converter.asciicompat_encoding("UTF-8") #=> nil
2988 *
2989 */
2990static VALUE
2991econv_s_asciicompat_encoding(VALUE klass, VALUE arg)
2992{
2993 const char *arg_name, *result_name;
2994 rb_encoding *arg_enc, *result_enc;
2995
2996 enc_arg(&arg, &arg_name, &arg_enc);
2997
2998 result_name = rb_econv_asciicompat_encoding(arg_name);
2999
3000 if (result_name == NULL)
3001 return Qnil;
3002
3003 result_enc = make_encoding(result_name);
3004
3005 return rb_enc_from_encoding(result_enc);
3006}
3007
3008static void
3009econv_args(int argc, VALUE *argv,
3010 VALUE *snamev_p, VALUE *dnamev_p,
3011 const char **sname_p, const char **dname_p,
3012 rb_encoding **senc_p, rb_encoding **denc_p,
3013 int *ecflags_p,
3014 VALUE *ecopts_p)
3015{
3016 VALUE opt, flags_v, ecopts;
3017 int sidx, didx;
3018 const char *sname, *dname;
3019 rb_encoding *senc, *denc;
3020 int ecflags;
3021
3022 argc = rb_scan_args(argc, argv, "21:", snamev_p, dnamev_p, &flags_v, &opt);
3023
3024 if (!NIL_P(flags_v)) {
3025 if (!NIL_P(opt)) {
3026 rb_error_arity(argc + 1, 2, 3);
3027 }
3028 ecflags = NUM2INT(rb_to_int(flags_v));
3029 ecopts = Qnil;
3030 }
3031 else if (!NIL_P(opt)) {
3032 ecflags = rb_econv_prepare_opts(opt, &ecopts);
3033 }
3034 else {
3035 ecflags = 0;
3036 ecopts = Qnil;
3037 }
3038
3039 senc = NULL;
3040 sidx = rb_to_encoding_index(*snamev_p);
3041 if (0 <= sidx) {
3042 senc = rb_enc_from_index(sidx);
3043 }
3044 else {
3045 StringValue(*snamev_p);
3046 }
3047
3048 denc = NULL;
3049 didx = rb_to_encoding_index(*dnamev_p);
3050 if (0 <= didx) {
3051 denc = rb_enc_from_index(didx);
3052 }
3053 else {
3054 StringValue(*dnamev_p);
3055 }
3056
3057 sname = senc ? rb_enc_name(senc) : StringValueCStr(*snamev_p);
3058 dname = denc ? rb_enc_name(denc) : StringValueCStr(*dnamev_p);
3059
3060 *sname_p = sname;
3061 *dname_p = dname;
3062 *senc_p = senc;
3063 *denc_p = denc;
3064 *ecflags_p = ecflags;
3065 *ecopts_p = ecopts;
3066}
3067
3068static int
3069decorate_convpath(VALUE convpath, int ecflags)
3070{
3071 int num_decorators;
3072 const char *decorators[MAX_ECFLAGS_DECORATORS];
3073 int i;
3074 int n, len;
3075
3076 num_decorators = decorator_names(ecflags, decorators);
3077 if (num_decorators == -1)
3078 return -1;
3079
3080 len = n = RARRAY_LENINT(convpath);
3081 if (n != 0) {
3082 VALUE pair = RARRAY_AREF(convpath, n-1);
3083 if (RB_TYPE_P(pair, T_ARRAY)) {
3084 const char *sname = rb_enc_name(rb_to_encoding(RARRAY_AREF(pair, 0)));
3085 const char *dname = rb_enc_name(rb_to_encoding(RARRAY_AREF(pair, 1)));
3086 transcoder_entry_t *entry = get_transcoder_entry(sname, dname);
3087 const rb_transcoder *tr = load_transcoder_entry(entry);
3088 if (!tr)
3089 return -1;
3090 if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding) &&
3091 tr->asciicompat_type == asciicompat_encoder) {
3092 n--;
3093 rb_ary_store(convpath, len + num_decorators - 1, pair);
3094 }
3095 }
3096 else {
3097 rb_ary_store(convpath, len + num_decorators - 1, pair);
3098 }
3099 }
3100
3101 for (i = 0; i < num_decorators; i++)
3102 rb_ary_store(convpath, n + i, rb_str_new_cstr(decorators[i]));
3103
3104 return 0;
3105}
3106
3107static void
3108search_convpath_i(const char *sname, const char *dname, int depth, void *arg)
3109{
3110 VALUE *ary_p = arg;
3111 VALUE v;
3112
3113 if (*ary_p == Qnil) {
3114 *ary_p = rb_ary_new();
3115 }
3116
3117 if (DECORATOR_P(sname, dname)) {
3118 v = rb_str_new_cstr(dname);
3119 }
3120 else {
3121 v = rb_assoc_new(make_encobj(sname), make_encobj(dname));
3122 }
3123 rb_ary_store(*ary_p, depth, v);
3124}
3125
3126/*
3127 * call-seq:
3128 * Encoding::Converter.search_convpath(source_encoding, destination_encoding) -> ary
3129 * Encoding::Converter.search_convpath(source_encoding, destination_encoding, opt) -> ary
3130 *
3131 * Returns a conversion path.
3132 *
3133 * p Encoding::Converter.search_convpath("ISO-8859-1", "EUC-JP")
3134 * #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>],
3135 * # [#<Encoding:UTF-8>, #<Encoding:EUC-JP>]]
3136 *
3137 * p Encoding::Converter.search_convpath("ISO-8859-1", "EUC-JP", universal_newline: true)
3138 * or
3139 * p Encoding::Converter.search_convpath("ISO-8859-1", "EUC-JP", newline: :universal)
3140 * #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>],
3141 * # [#<Encoding:UTF-8>, #<Encoding:EUC-JP>],
3142 * # "universal_newline"]
3143 *
3144 * p Encoding::Converter.search_convpath("ISO-8859-1", "UTF-32BE", universal_newline: true)
3145 * or
3146 * p Encoding::Converter.search_convpath("ISO-8859-1", "UTF-32BE", newline: :universal)
3147 * #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>],
3148 * # "universal_newline",
3149 * # [#<Encoding:UTF-8>, #<Encoding:UTF-32BE>]]
3150 */
3151static VALUE
3152econv_s_search_convpath(int argc, VALUE *argv, VALUE klass)
3153{
3154 VALUE snamev, dnamev;
3155 const char *sname, *dname;
3156 rb_encoding *senc, *denc;
3157 int ecflags;
3158 VALUE ecopts;
3159 VALUE convpath;
3160
3161 econv_args(argc, argv, &snamev, &dnamev, &sname, &dname, &senc, &denc, &ecflags, &ecopts);
3162
3163 convpath = Qnil;
3164 transcode_search_path(sname, dname, search_convpath_i, &convpath);
3165
3166 if (NIL_P(convpath)) {
3167 VALUE exc = rb_econv_open_exc(sname, dname, ecflags);
3168 RB_GC_GUARD(snamev);
3169 RB_GC_GUARD(dnamev);
3170 rb_exc_raise(exc);
3171 }
3172
3173 if (decorate_convpath(convpath, ecflags) == -1) {
3174 VALUE exc = rb_econv_open_exc(sname, dname, ecflags);
3175 RB_GC_GUARD(snamev);
3176 RB_GC_GUARD(dnamev);
3177 rb_exc_raise(exc);
3178 }
3179
3180 return convpath;
3181}
3182
3183/*
3184 * Check the existence of a conversion path.
3185 * Returns the number of converters in the conversion path.
3186 * result: >=0:success -1:failure
3187 */
3188int
3189rb_econv_has_convpath_p(const char* from_encoding, const char* to_encoding)
3190{
3191 VALUE convpath = Qnil;
3192 transcode_search_path(from_encoding, to_encoding, search_convpath_i,
3193 &convpath);
3194 return RTEST(convpath);
3195}
3196
3200 int ret;
3201};
3202
3203static void
3204rb_econv_init_by_convpath_i(const char *sname, const char *dname, int depth, void *arg)
3205{
3207 int ret;
3208
3209 if (a->ret == -1)
3210 return;
3211
3212 ret = rb_econv_add_converter(a->ec, sname, dname, a->index);
3213
3214 a->ret = ret;
3215 return;
3216}
3217
3218static rb_econv_t *
3219rb_econv_init_by_convpath(VALUE self, VALUE convpath,
3220 const char **sname_p, const char **dname_p,
3221 rb_encoding **senc_p, rb_encoding**denc_p)
3222{
3223 rb_econv_t *ec;
3224 long i;
3225 int ret, first=1;
3226 VALUE elt;
3227 rb_encoding *senc = 0, *denc = 0;
3228 const char *sname, *dname;
3229
3230 ec = rb_econv_alloc(RARRAY_LENINT(convpath));
3231 DATA_PTR(self) = ec;
3232
3233 for (i = 0; i < RARRAY_LEN(convpath); i++) {
3234 VALUE snamev, dnamev;
3235 VALUE pair;
3236 elt = rb_ary_entry(convpath, i);
3237 if (!NIL_P(pair = rb_check_array_type(elt))) {
3238 if (RARRAY_LEN(pair) != 2)
3239 rb_raise(rb_eArgError, "not a 2-element array in convpath");
3240 snamev = rb_ary_entry(pair, 0);
3241 enc_arg(&snamev, &sname, &senc);
3242 dnamev = rb_ary_entry(pair, 1);
3243 enc_arg(&dnamev, &dname, &denc);
3244 }
3245 else {
3246 sname = "";
3247 dname = StringValueCStr(elt);
3248 }
3249 if (DECORATOR_P(sname, dname)) {
3250 ret = rb_econv_add_converter(ec, sname, dname, ec->num_trans);
3251 if (ret == -1) {
3252 VALUE msg = rb_sprintf("decoration failed: %s", dname);
3253 RB_GC_GUARD(snamev);
3254 RB_GC_GUARD(dnamev);
3256 }
3257 }
3258 else {
3259 int j = ec->num_trans;
3260 struct rb_econv_init_by_convpath_t arg;
3261 arg.ec = ec;
3262 arg.index = ec->num_trans;
3263 arg.ret = 0;
3264 ret = transcode_search_path(sname, dname, rb_econv_init_by_convpath_i, &arg);
3265 if (ret == -1 || arg.ret == -1) {
3266 VALUE msg = rb_sprintf("adding conversion failed: %s to %s", sname, dname);
3267 RB_GC_GUARD(snamev);
3268 RB_GC_GUARD(dnamev);
3270 }
3271 if (first) {
3272 first = 0;
3273 *senc_p = senc;
3274 *sname_p = ec->elems[j].tc->transcoder->src_encoding;
3275 }
3276 *denc_p = denc;
3277 *dname_p = ec->elems[ec->num_trans-1].tc->transcoder->dst_encoding;
3278 }
3279 }
3280
3281 if (first) {
3282 *senc_p = NULL;
3283 *denc_p = NULL;
3284 *sname_p = "";
3285 *dname_p = "";
3286 }
3287
3288 ec->source_encoding_name = *sname_p;
3289 ec->destination_encoding_name = *dname_p;
3290
3291 return ec;
3292}
3293
3294/*
3295 * call-seq:
3296 * Encoding::Converter.new(source_encoding, destination_encoding)
3297 * Encoding::Converter.new(source_encoding, destination_encoding, opt)
3298 * Encoding::Converter.new(convpath)
3299 *
3300 * possible options elements:
3301 * hash form:
3302 * :invalid => nil # raise error on invalid byte sequence (default)
3303 * :invalid => :replace # replace invalid byte sequence
3304 * :undef => nil # raise error on undefined conversion (default)
3305 * :undef => :replace # replace undefined conversion
3306 * :replace => string # replacement string ("?" or "\uFFFD" if not specified)
3307 * :newline => :universal # decorator for converting CRLF and CR to LF
3308 * :newline => :crlf # decorator for converting LF to CRLF
3309 * :newline => :cr # decorator for converting LF to CR
3310 * :universal_newline => true # decorator for converting CRLF and CR to LF
3311 * :crlf_newline => true # decorator for converting LF to CRLF
3312 * :cr_newline => true # decorator for converting LF to CR
3313 * :xml => :text # escape as XML CharData.
3314 * :xml => :attr # escape as XML AttValue
3315 * integer form:
3316 * Encoding::Converter::INVALID_REPLACE
3317 * Encoding::Converter::UNDEF_REPLACE
3318 * Encoding::Converter::UNDEF_HEX_CHARREF
3319 * Encoding::Converter::UNIVERSAL_NEWLINE_DECORATOR
3320 * Encoding::Converter::CRLF_NEWLINE_DECORATOR
3321 * Encoding::Converter::CR_NEWLINE_DECORATOR
3322 * Encoding::Converter::XML_TEXT_DECORATOR
3323 * Encoding::Converter::XML_ATTR_CONTENT_DECORATOR
3324 * Encoding::Converter::XML_ATTR_QUOTE_DECORATOR
3325 *
3326 * Encoding::Converter.new creates an instance of Encoding::Converter.
3327 *
3328 * Source_encoding and destination_encoding should be a string or
3329 * Encoding object.
3330 *
3331 * opt should be nil, a hash or an integer.
3332 *
3333 * convpath should be an array.
3334 * convpath may contain
3335 * - two-element arrays which contain encodings or encoding names, or
3336 * - strings representing decorator names.
3337 *
3338 * Encoding::Converter.new optionally takes an option.
3339 * The option should be a hash or an integer.
3340 * The option hash can contain :invalid => nil, etc.
3341 * The option integer should be logical-or of constants such as
3342 * Encoding::Converter::INVALID_REPLACE, etc.
3343 *
3344 * [:invalid => nil]
3345 * Raise error on invalid byte sequence. This is a default behavior.
3346 * [:invalid => :replace]
3347 * Replace invalid byte sequence by replacement string.
3348 * [:undef => nil]
3349 * Raise an error if a character in source_encoding is not defined in destination_encoding.
3350 * This is a default behavior.
3351 * [:undef => :replace]
3352 * Replace undefined character in destination_encoding with replacement string.
3353 * [:replace => string]
3354 * Specify the replacement string.
3355 * If not specified, "\uFFFD" is used for Unicode encodings and "?" for others.
3356 * [:universal_newline => true]
3357 * Convert CRLF and CR to LF.
3358 * [:crlf_newline => true]
3359 * Convert LF to CRLF.
3360 * [:cr_newline => true]
3361 * Convert LF to CR.
3362 * [:xml => :text]
3363 * Escape as XML CharData.
3364 * This form can be used as an HTML 4.0 #PCDATA.
3365 * - '&' -> '&amp;'
3366 * - '<' -> '&lt;'
3367 * - '>' -> '&gt;'
3368 * - undefined characters in destination_encoding -> hexadecimal CharRef such as &#xHH;
3369 * [:xml => :attr]
3370 * Escape as XML AttValue.
3371 * The converted result is quoted as "...".
3372 * This form can be used as an HTML 4.0 attribute value.
3373 * - '&' -> '&amp;'
3374 * - '<' -> '&lt;'
3375 * - '>' -> '&gt;'
3376 * - '"' -> '&quot;'
3377 * - undefined characters in destination_encoding -> hexadecimal CharRef such as &#xHH;
3378 *
3379 * Examples:
3380 * # UTF-16BE to UTF-8
3381 * ec = Encoding::Converter.new("UTF-16BE", "UTF-8")
3382 *
3383 * # Usually, decorators such as newline conversion are inserted last.
3384 * ec = Encoding::Converter.new("UTF-16BE", "UTF-8", :universal_newline => true)
3385 * p ec.convpath #=> [[#<Encoding:UTF-16BE>, #<Encoding:UTF-8>],
3386 * # "universal_newline"]
3387 *
3388 * # But, if the last encoding is ASCII incompatible,
3389 * # decorators are inserted before the last conversion.
3390 * ec = Encoding::Converter.new("UTF-8", "UTF-16BE", :crlf_newline => true)
3391 * p ec.convpath #=> ["crlf_newline",
3392 * # [#<Encoding:UTF-8>, #<Encoding:UTF-16BE>]]
3393 *
3394 * # Conversion path can be specified directly.
3395 * ec = Encoding::Converter.new(["universal_newline", ["EUC-JP", "UTF-8"], ["UTF-8", "UTF-16BE"]])
3396 * p ec.convpath #=> ["universal_newline",
3397 * # [#<Encoding:EUC-JP>, #<Encoding:UTF-8>],
3398 * # [#<Encoding:UTF-8>, #<Encoding:UTF-16BE>]]
3399 */
3400static VALUE
3401econv_init(int argc, VALUE *argv, VALUE self)
3402{
3403 VALUE ecopts;
3404 VALUE snamev, dnamev;
3405 const char *sname, *dname;
3406 rb_encoding *senc, *denc;
3407 rb_econv_t *ec;
3408 int ecflags;
3409 VALUE convpath;
3410
3411 if (rb_check_typeddata(self, &econv_data_type)) {
3412 rb_raise(rb_eTypeError, "already initialized");
3413 }
3414
3415 if (argc == 1 && !NIL_P(convpath = rb_check_array_type(argv[0]))) {
3416 ec = rb_econv_init_by_convpath(self, convpath, &sname, &dname, &senc, &denc);
3417 ecflags = 0;
3418 ecopts = Qnil;
3419 }
3420 else {
3421 econv_args(argc, argv, &snamev, &dnamev, &sname, &dname, &senc, &denc, &ecflags, &ecopts);
3422 ec = rb_econv_open_opts(sname, dname, ecflags, ecopts);
3423 }
3424
3425 if (!ec) {
3426 VALUE exc = rb_econv_open_exc(sname, dname, ecflags);
3427 RB_GC_GUARD(snamev);
3428 RB_GC_GUARD(dnamev);
3429 rb_exc_raise(exc);
3430 }
3431
3432 if (!DECORATOR_P(sname, dname)) {
3433 if (!senc)
3434 senc = make_dummy_encoding(sname);
3435 if (!denc)
3436 denc = make_dummy_encoding(dname);
3437 RB_GC_GUARD(snamev);
3438 RB_GC_GUARD(dnamev);
3439 }
3440
3441 ec->source_encoding = senc;
3442 ec->destination_encoding = denc;
3443
3444 DATA_PTR(self) = ec;
3445
3446 return self;
3447}
3448
3449/*
3450 * call-seq:
3451 * ec.inspect -> string
3452 *
3453 * Returns a printable version of <i>ec</i>
3454 *
3455 * ec = Encoding::Converter.new("iso-8859-1", "utf-8")
3456 * puts ec.inspect #=> #<Encoding::Converter: ISO-8859-1 to UTF-8>
3457 *
3458 */
3459static VALUE
3460econv_inspect(VALUE self)
3461{
3462 const char *cname = rb_obj_classname(self);
3463 rb_econv_t *ec;
3464
3465 TypedData_Get_Struct(self, rb_econv_t, &econv_data_type, ec);
3466 if (!ec)
3467 return rb_sprintf("#<%s: uninitialized>", cname);
3468 else {
3469 const char *sname = ec->source_encoding_name;
3470 const char *dname = ec->destination_encoding_name;
3471 VALUE str;
3472 str = rb_sprintf("#<%s: ", cname);
3473 econv_description(sname, dname, ec->flags, str);
3474 rb_str_cat2(str, ">");
3475 return str;
3476 }
3477}
3478
3479static rb_econv_t *
3480check_econv(VALUE self)
3481{
3482 rb_econv_t *ec;
3483
3484 TypedData_Get_Struct(self, rb_econv_t, &econv_data_type, ec);
3485 if (!ec) {
3486 rb_raise(rb_eTypeError, "uninitialized encoding converter");
3487 }
3488 return ec;
3489}
3490
3491/*
3492 * call-seq:
3493 * ec.source_encoding -> encoding
3494 *
3495 * Returns the source encoding as an Encoding object.
3496 */
3497static VALUE
3498econv_source_encoding(VALUE self)
3499{
3500 rb_econv_t *ec = check_econv(self);
3501 if (!ec->source_encoding)
3502 return Qnil;
3504}
3505
3506/*
3507 * call-seq:
3508 * ec.destination_encoding -> encoding
3509 *
3510 * Returns the destination encoding as an Encoding object.
3511 */
3512static VALUE
3513econv_destination_encoding(VALUE self)
3514{
3515 rb_econv_t *ec = check_econv(self);
3517 return Qnil;
3519}
3520
3521/*
3522 * call-seq:
3523 * ec.convpath -> ary
3524 *
3525 * Returns the conversion path of ec.
3526 *
3527 * The result is an array of conversions.
3528 *
3529 * ec = Encoding::Converter.new("ISO-8859-1", "EUC-JP", crlf_newline: true)
3530 * p ec.convpath
3531 * #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>],
3532 * # [#<Encoding:UTF-8>, #<Encoding:EUC-JP>],
3533 * # "crlf_newline"]
3534 *
3535 * Each element of the array is a pair of encodings or a string.
3536 * A pair means an encoding conversion.
3537 * A string means a decorator.
3538 *
3539 * In the above example, [#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>] means
3540 * a converter from ISO-8859-1 to UTF-8.
3541 * "crlf_newline" means newline converter from LF to CRLF.
3542 */
3543static VALUE
3544econv_convpath(VALUE self)
3545{
3546 rb_econv_t *ec = check_econv(self);
3547 VALUE result;
3548 int i;
3549
3550 result = rb_ary_new();
3551 for (i = 0; i < ec->num_trans; i++) {
3552 const rb_transcoder *tr = ec->elems[i].tc->transcoder;
3553 VALUE v;
3554 if (DECORATOR_P(tr->src_encoding, tr->dst_encoding))
3555 v = rb_str_new_cstr(tr->dst_encoding);
3556 else
3557 v = rb_assoc_new(make_encobj(tr->src_encoding), make_encobj(tr->dst_encoding));
3558 rb_ary_push(result, v);
3559 }
3560 return result;
3561}
3562
3563/*
3564 * call-seq:
3565 * ec == other -> true or false
3566 */
3567static VALUE
3568econv_equal(VALUE self, VALUE other)
3569{
3570 rb_econv_t *ec1 = check_econv(self);
3571 rb_econv_t *ec2;
3572 int i;
3573
3574 if (!rb_typeddata_is_kind_of(other, &econv_data_type)) {
3575 return Qnil;
3576 }
3577 ec2 = DATA_PTR(other);
3578 if (!ec2) return Qfalse;
3579 if (ec1->source_encoding_name != ec2->source_encoding_name &&
3580 strcmp(ec1->source_encoding_name, ec2->source_encoding_name))
3581 return Qfalse;
3584 return Qfalse;
3585 if (ec1->flags != ec2->flags) return Qfalse;
3586 if (ec1->replacement_enc != ec2->replacement_enc &&
3587 strcmp(ec1->replacement_enc, ec2->replacement_enc))
3588 return Qfalse;
3589 if (ec1->replacement_len != ec2->replacement_len) return Qfalse;
3590 if (ec1->replacement_str != ec2->replacement_str &&
3592 return Qfalse;
3593
3594 if (ec1->num_trans != ec2->num_trans) return Qfalse;
3595 for (i = 0; i < ec1->num_trans; i++) {
3596 if (ec1->elems[i].tc->transcoder != ec2->elems[i].tc->transcoder)
3597 return Qfalse;
3598 }
3599 return Qtrue;
3600}
3601
3602static VALUE
3603econv_result_to_symbol(rb_econv_result_t res)
3604{
3605 switch (res) {
3606 case econv_invalid_byte_sequence: return sym_invalid_byte_sequence;
3607 case econv_incomplete_input: return sym_incomplete_input;
3608 case econv_undefined_conversion: return sym_undefined_conversion;
3609 case econv_destination_buffer_full: return sym_destination_buffer_full;
3610 case econv_source_buffer_empty: return sym_source_buffer_empty;
3611 case econv_finished: return sym_finished;
3612 case econv_after_output: return sym_after_output;
3613 default: return INT2NUM(res); /* should not be reached */
3614 }
3615}
3616
3617/*
3618 * call-seq:
3619 * ec.primitive_convert(source_buffer, destination_buffer) -> symbol
3620 * ec.primitive_convert(source_buffer, destination_buffer, destination_byteoffset) -> symbol
3621 * ec.primitive_convert(source_buffer, destination_buffer, destination_byteoffset, destination_bytesize) -> symbol
3622 * ec.primitive_convert(source_buffer, destination_buffer, destination_byteoffset, destination_bytesize, opt) -> symbol
3623 *
3624 * possible opt elements:
3625 * hash form:
3626 * :partial_input => true # source buffer may be part of larger source
3627 * :after_output => true # stop conversion after output before input
3628 * integer form:
3629 * Encoding::Converter::PARTIAL_INPUT
3630 * Encoding::Converter::AFTER_OUTPUT
3631 *
3632 * possible results:
3633 * :invalid_byte_sequence
3634 * :incomplete_input
3635 * :undefined_conversion
3636 * :after_output
3637 * :destination_buffer_full
3638 * :source_buffer_empty
3639 * :finished
3640 *
3641 * primitive_convert converts source_buffer into destination_buffer.
3642 *
3643 * source_buffer should be a string or nil.
3644 * nil means an empty string.
3645 *
3646 * destination_buffer should be a string.
3647 *
3648 * destination_byteoffset should be an integer or nil.
3649 * nil means the end of destination_buffer.
3650 * If it is omitted, nil is assumed.
3651 *
3652 * destination_bytesize should be an integer or nil.
3653 * nil means unlimited.
3654 * If it is omitted, nil is assumed.
3655 *
3656 * opt should be nil, a hash or an integer.
3657 * nil means no flags.
3658 * If it is omitted, nil is assumed.
3659 *
3660 * primitive_convert converts the content of source_buffer from the beginning
3661 * and stores the result into destination_buffer.
3662 *
3663 * destination_byteoffset and destination_bytesize specify the region in which
3664 * the converted result is stored.
3665 * destination_byteoffset specifies the start position in destination_buffer in bytes.
3666 * If destination_byteoffset is nil,
3667 * destination_buffer.bytesize is used for appending the result.
3668 * destination_bytesize specifies maximum number of bytes.
3669 * If destination_bytesize is nil,
3670 * destination size is unlimited.
3671 * After conversion, destination_buffer is resized to
3672 * destination_byteoffset + actually produced number of bytes.
3673 * Also destination_buffer's encoding is set to destination_encoding.
3674 *
3675 * primitive_convert drops the converted part of source_buffer.
3676 * the dropped part is converted in destination_buffer or
3677 * buffered in Encoding::Converter object.
3678 *
3679 * primitive_convert stops conversion when one of the following conditions is met.
3680 * - invalid byte sequence found in source buffer (:invalid_byte_sequence)
3681 * +primitive_errinfo+ and +last_error+ methods return the detail of the error.
3682 * - unexpected end of source buffer (:incomplete_input)
3683 * this occurs only when :partial_input is not specified.
3684 * +primitive_errinfo+ and +last_error+ methods return the detail of the error.
3685 * - character not representable in output encoding (:undefined_conversion)
3686 * +primitive_errinfo+ and +last_error+ methods return the detail of the error.
3687 * - after some output is generated, before input is done (:after_output)
3688 * this occurs only when :after_output is specified.
3689 * - destination buffer is full (:destination_buffer_full)
3690 * this occurs only when destination_bytesize is non-nil.
3691 * - source buffer is empty (:source_buffer_empty)
3692 * this occurs only when :partial_input is specified.
3693 * - conversion is finished (:finished)
3694 *
3695 * example:
3696 * ec = Encoding::Converter.new("UTF-8", "UTF-16BE")
3697 * ret = ec.primitive_convert(src="pi", dst="", nil, 100)
3698 * p [ret, src, dst] #=> [:finished, "", "\x00p\x00i"]
3699 *
3700 * ec = Encoding::Converter.new("UTF-8", "UTF-16BE")
3701 * ret = ec.primitive_convert(src="pi", dst="", nil, 1)
3702 * p [ret, src, dst] #=> [:destination_buffer_full, "i", "\x00"]
3703 * ret = ec.primitive_convert(src, dst="", nil, 1)
3704 * p [ret, src, dst] #=> [:destination_buffer_full, "", "p"]
3705 * ret = ec.primitive_convert(src, dst="", nil, 1)
3706 * p [ret, src, dst] #=> [:destination_buffer_full, "", "\x00"]
3707 * ret = ec.primitive_convert(src, dst="", nil, 1)
3708 * p [ret, src, dst] #=> [:finished, "", "i"]
3709 *
3710 */
3711static VALUE
3712econv_primitive_convert(int argc, VALUE *argv, VALUE self)
3713{
3714 VALUE input, output, output_byteoffset_v, output_bytesize_v, opt, flags_v;
3715 rb_econv_t *ec = check_econv(self);
3717 const unsigned char *ip, *is;
3718 unsigned char *op, *os;
3719 long output_byteoffset, output_bytesize;
3720 unsigned long output_byteend;
3721 int flags;
3722
3723 argc = rb_scan_args(argc, argv, "23:", &input, &output, &output_byteoffset_v, &output_bytesize_v, &flags_v, &opt);
3724
3725 if (NIL_P(output_byteoffset_v))
3726 output_byteoffset = 0; /* dummy */
3727 else
3728 output_byteoffset = NUM2LONG(output_byteoffset_v);
3729
3730 if (NIL_P(output_bytesize_v))
3731 output_bytesize = 0; /* dummy */
3732 else
3733 output_bytesize = NUM2LONG(output_bytesize_v);
3734
3735 if (!NIL_P(flags_v)) {
3736 if (!NIL_P(opt)) {
3737 rb_error_arity(argc + 1, 2, 5);
3738 }
3739 flags = NUM2INT(rb_to_int(flags_v));
3740 }
3741 else if (!NIL_P(opt)) {
3742 VALUE v;
3743 flags = 0;
3744 v = rb_hash_aref(opt, sym_partial_input);
3745 if (RTEST(v))
3746 flags |= ECONV_PARTIAL_INPUT;
3747 v = rb_hash_aref(opt, sym_after_output);
3748 if (RTEST(v))
3749 flags |= ECONV_AFTER_OUTPUT;
3750 }
3751 else {
3752 flags = 0;
3753 }
3754
3755 StringValue(output);
3756 if (!NIL_P(input))
3758 rb_str_modify(output);
3759
3760 if (NIL_P(output_bytesize_v)) {
3761 output_bytesize = RSTRING_EMBED_LEN_MAX;
3762 if (!NIL_P(input) && output_bytesize < RSTRING_LEN(input))
3763 output_bytesize = RSTRING_LEN(input);
3764 }
3765
3766 retry:
3767
3768 if (NIL_P(output_byteoffset_v))
3769 output_byteoffset = RSTRING_LEN(output);
3770
3771 if (output_byteoffset < 0)
3772 rb_raise(rb_eArgError, "negative output_byteoffset");
3773
3774 if (RSTRING_LEN(output) < output_byteoffset)
3775 rb_raise(rb_eArgError, "output_byteoffset too big");
3776
3777 if (output_bytesize < 0)
3778 rb_raise(rb_eArgError, "negative output_bytesize");
3779
3780 output_byteend = (unsigned long)output_byteoffset +
3781 (unsigned long)output_bytesize;
3782
3783 if (output_byteend < (unsigned long)output_byteoffset ||
3784 LONG_MAX < output_byteend)
3785 rb_raise(rb_eArgError, "output_byteoffset+output_bytesize too big");
3786
3787 if (rb_str_capacity(output) < output_byteend)
3788 rb_str_resize(output, output_byteend);
3789
3790 if (NIL_P(input)) {
3791 ip = is = NULL;
3792 }
3793 else {
3794 ip = (const unsigned char *)RSTRING_PTR(input);
3795 is = ip + RSTRING_LEN(input);
3796 }
3797
3798 op = (unsigned char *)RSTRING_PTR(output) + output_byteoffset;
3799 os = op + output_bytesize;
3800
3801 res = rb_econv_convert(ec, &ip, is, &op, os, flags);
3802 rb_str_set_len(output, op-(unsigned char *)RSTRING_PTR(output));
3803 if (!NIL_P(input)) {
3804 rb_str_drop_bytes(input, ip - (unsigned char *)RSTRING_PTR(input));
3805 }
3806
3807 if (NIL_P(output_bytesize_v) && res == econv_destination_buffer_full) {
3808 if (LONG_MAX / 2 < output_bytesize)
3809 rb_raise(rb_eArgError, "too long conversion result");
3810 output_bytesize *= 2;
3811 output_byteoffset_v = Qnil;
3812 goto retry;
3813 }
3814
3815 if (ec->destination_encoding) {
3817 }
3818
3819 return econv_result_to_symbol(res);
3820}
3821
3822/*
3823 * call-seq:
3824 * ec.convert(source_string) -> destination_string
3825 *
3826 * Convert source_string and return destination_string.
3827 *
3828 * source_string is assumed as a part of source.
3829 * i.e. :partial_input=>true is specified internally.
3830 * finish method should be used last.
3831 *
3832 * ec = Encoding::Converter.new("utf-8", "euc-jp")
3833 * puts ec.convert("\u3042").dump #=> "\xA4\xA2"
3834 * puts ec.finish.dump #=> ""
3835 *
3836 * ec = Encoding::Converter.new("euc-jp", "utf-8")
3837 * puts ec.convert("\xA4").dump #=> ""
3838 * puts ec.convert("\xA2").dump #=> "\xE3\x81\x82"
3839 * puts ec.finish.dump #=> ""
3840 *
3841 * ec = Encoding::Converter.new("utf-8", "iso-2022-jp")
3842 * puts ec.convert("\xE3").dump #=> "".force_encoding("ISO-2022-JP")
3843 * puts ec.convert("\x81").dump #=> "".force_encoding("ISO-2022-JP")
3844 * puts ec.convert("\x82").dump #=> "\e$B$\"".force_encoding("ISO-2022-JP")
3845 * puts ec.finish.dump #=> "\e(B".force_encoding("ISO-2022-JP")
3846 *
3847 * If a conversion error occurs,
3848 * Encoding::UndefinedConversionError or
3849 * Encoding::InvalidByteSequenceError is raised.
3850 * Encoding::Converter#convert doesn't supply methods to recover or restart
3851 * from these exceptions.
3852 * When you want to handle these conversion errors,
3853 * use Encoding::Converter#primitive_convert.
3854 *
3855 */
3856static VALUE
3857econv_convert(VALUE self, VALUE source_string)
3858{
3859 VALUE ret, dst;
3860 VALUE av[5];
3861 int ac;
3862 rb_econv_t *ec = check_econv(self);
3863
3864 StringValue(source_string);
3865
3866 dst = rb_str_new(NULL, 0);
3867
3868 av[0] = rb_str_dup(source_string);
3869 av[1] = dst;
3870 av[2] = Qnil;
3871 av[3] = Qnil;
3873 ac = 5;
3874
3875 ret = econv_primitive_convert(ac, av, self);
3876
3877 if (ret == sym_invalid_byte_sequence ||
3878 ret == sym_undefined_conversion ||
3879 ret == sym_incomplete_input) {
3880 VALUE exc = make_econv_exception(ec);
3881 rb_exc_raise(exc);
3882 }
3883
3884 if (ret == sym_finished) {
3885 rb_raise(rb_eArgError, "converter already finished");
3886 }
3887
3888 if (ret != sym_source_buffer_empty) {
3889 rb_bug("unexpected result of econv_primitive_convert");
3890 }
3891
3892 return dst;
3893}
3894
3895/*
3896 * call-seq:
3897 * ec.finish -> string
3898 *
3899 * Finishes the converter.
3900 * It returns the last part of the converted string.
3901 *
3902 * ec = Encoding::Converter.new("utf-8", "iso-2022-jp")
3903 * p ec.convert("\u3042") #=> "\e$B$\""
3904 * p ec.finish #=> "\e(B"
3905 */
3906static VALUE
3907econv_finish(VALUE self)
3908{
3909 VALUE ret, dst;
3910 VALUE av[5];
3911 int ac;
3912 rb_econv_t *ec = check_econv(self);
3913
3914 dst = rb_str_new(NULL, 0);
3915
3916 av[0] = Qnil;
3917 av[1] = dst;
3918 av[2] = Qnil;
3919 av[3] = Qnil;
3920 av[4] = INT2FIX(0);
3921 ac = 5;
3922
3923 ret = econv_primitive_convert(ac, av, self);
3924
3925 if (ret == sym_invalid_byte_sequence ||
3926 ret == sym_undefined_conversion ||
3927 ret == sym_incomplete_input) {
3928 VALUE exc = make_econv_exception(ec);
3929 rb_exc_raise(exc);
3930 }
3931
3932 if (ret != sym_finished) {
3933 rb_bug("unexpected result of econv_primitive_convert");
3934 }
3935
3936 return dst;
3937}
3938
3939/*
3940 * call-seq:
3941 * ec.primitive_errinfo -> array
3942 *
3943 * primitive_errinfo returns important information regarding the last error
3944 * as a 5-element array:
3945 *
3946 * [result, enc1, enc2, error_bytes, readagain_bytes]
3947 *
3948 * result is the last result of primitive_convert.
3949 *
3950 * Other elements are only meaningful when result is
3951 * :invalid_byte_sequence, :incomplete_input or :undefined_conversion.
3952 *
3953 * enc1 and enc2 indicate a conversion step as a pair of strings.
3954 * For example, a converter from EUC-JP to ISO-8859-1 converts
3955 * a string as follows: EUC-JP -> UTF-8 -> ISO-8859-1.
3956 * So [enc1, enc2] is either ["EUC-JP", "UTF-8"] or ["UTF-8", "ISO-8859-1"].
3957 *
3958 * error_bytes and readagain_bytes indicate the byte sequences which caused the error.
3959 * error_bytes is the discarded portion.
3960 * readagain_bytes is the buffered portion which is read again on the next conversion.
3961 *
3962 * Example:
3963 *
3964 * # \xff is invalid as EUC-JP.
3965 * ec = Encoding::Converter.new("EUC-JP", "Shift_JIS")
3966 * ec.primitive_convert(src="\xff", dst="", nil, 10)
3967 * p ec.primitive_errinfo
3968 * #=> [:invalid_byte_sequence, "EUC-JP", "UTF-8", "\xFF", ""]
3969 *
3970 * # HIRAGANA LETTER A (\xa4\xa2 in EUC-JP) is not representable in ISO-8859-1.
3971 * # Since this error occurs in UTF-8 to ISO-8859-1 conversion,
3972 * # error_bytes is HIRAGANA LETTER A in UTF-8 (\xE3\x81\x82).
3973 * ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
3974 * ec.primitive_convert(src="\xa4\xa2", dst="", nil, 10)
3975 * p ec.primitive_errinfo
3976 * #=> [:undefined_conversion, "UTF-8", "ISO-8859-1", "\xE3\x81\x82", ""]
3977 *
3978 * # partial character is invalid
3979 * ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
3980 * ec.primitive_convert(src="\xa4", dst="", nil, 10)
3981 * p ec.primitive_errinfo
3982 * #=> [:incomplete_input, "EUC-JP", "UTF-8", "\xA4", ""]
3983 *
3984 * # Encoding::Converter::PARTIAL_INPUT prevents invalid errors by
3985 * # partial characters.
3986 * ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
3987 * ec.primitive_convert(src="\xa4", dst="", nil, 10, Encoding::Converter::PARTIAL_INPUT)
3988 * p ec.primitive_errinfo
3989 * #=> [:source_buffer_empty, nil, nil, nil, nil]
3990 *
3991 * # \xd8\x00\x00@ is invalid as UTF-16BE because
3992 * # no low surrogate after high surrogate (\xd8\x00).
3993 * # It is detected by the 3rd byte (\x00) which is part of the next character.
3994 * # So the high surrogate (\xd8\x00) is discarded and
3995 * # the 3rd byte is read again later.
3996 * # Since the byte is buffered in ec, it is dropped from src.
3997 * ec = Encoding::Converter.new("UTF-16BE", "UTF-8")
3998 * ec.primitive_convert(src="\xd8\x00\x00@", dst="", nil, 10)
3999 * p ec.primitive_errinfo
4000 * #=> [:invalid_byte_sequence, "UTF-16BE", "UTF-8", "\xD8\x00", "\x00"]
4001 * p src
4002 * #=> "@"
4003 *
4004 * # Similar to UTF-16BE, \x00\xd8@\x00 is invalid as UTF-16LE.
4005 * # The problem is detected by 4th byte.
4006 * ec = Encoding::Converter.new("UTF-16LE", "UTF-8")
4007 * ec.primitive_convert(src="\x00\xd8@\x00", dst="", nil, 10)
4008 * p ec.primitive_errinfo
4009 * #=> [:invalid_byte_sequence, "UTF-16LE", "UTF-8", "\x00\xD8", "@\x00"]
4010 * p src
4011 * #=> ""
4012 *
4013 */
4014static VALUE
4015econv_primitive_errinfo(VALUE self)
4016{
4017 rb_econv_t *ec = check_econv(self);
4018
4019 VALUE ary;
4020
4021 ary = rb_ary_new2(5);
4022
4023 rb_ary_store(ary, 0, econv_result_to_symbol(ec->last_error.result));
4024 rb_ary_store(ary, 4, Qnil);
4025
4028
4031
4035 }
4036
4037 return ary;
4038}
4039
4040/*
4041 * call-seq:
4042 * ec.insert_output(string) -> nil
4043 *
4044 * Inserts string into the encoding converter.
4045 * The string will be converted to the destination encoding and
4046 * output on later conversions.
4047 *
4048 * If the destination encoding is stateful,
4049 * string is converted according to the state and the state is updated.
4050 *
4051 * This method should be used only when a conversion error occurs.
4052 *
4053 * ec = Encoding::Converter.new("utf-8", "iso-8859-1")
4054 * src = "HIRAGANA LETTER A is \u{3042}."
4055 * dst = ""
4056 * p ec.primitive_convert(src, dst) #=> :undefined_conversion
4057 * puts "[#{dst.dump}, #{src.dump}]" #=> ["HIRAGANA LETTER A is ", "."]
4058 * ec.insert_output("<err>")
4059 * p ec.primitive_convert(src, dst) #=> :finished
4060 * puts "[#{dst.dump}, #{src.dump}]" #=> ["HIRAGANA LETTER A is <err>.", ""]
4061 *
4062 * ec = Encoding::Converter.new("utf-8", "iso-2022-jp")
4063 * src = "\u{306F 3041 3068 2661 3002}" # U+2661 is not representable in iso-2022-jp
4064 * dst = ""
4065 * p ec.primitive_convert(src, dst) #=> :undefined_conversion
4066 * puts "[#{dst.dump}, #{src.dump}]" #=> ["\e$B$O$!$H".force_encoding("ISO-2022-JP"), "\xE3\x80\x82"]
4067 * ec.insert_output "?" # state change required to output "?".
4068 * p ec.primitive_convert(src, dst) #=> :finished
4069 * puts "[#{dst.dump}, #{src.dump}]" #=> ["\e$B$O$!$H\e(B?\e$B!#\e(B".force_encoding("ISO-2022-JP"), ""]
4070 *
4071 */
4072static VALUE
4073econv_insert_output(VALUE self, VALUE string)
4074{
4075 const char *insert_enc;
4076
4077 int ret;
4078
4079 rb_econv_t *ec = check_econv(self);
4080
4081 StringValue(string);
4083 string = rb_str_encode(string, rb_enc_from_encoding(rb_enc_find(insert_enc)), 0, Qnil);
4084
4085 ret = rb_econv_insert_output(ec, (const unsigned char *)RSTRING_PTR(string), RSTRING_LEN(string), insert_enc);
4086 if (ret == -1) {
4087 rb_raise(rb_eArgError, "too big string");
4088 }
4089
4090 return Qnil;
4091}
4092
4093/*
4094 * call-seq:
4095 * ec.putback -> string
4096 * ec.putback(max_numbytes) -> string
4097 *
4098 * Put back the bytes which will be converted.
4099 *
4100 * The bytes are caused by invalid_byte_sequence error.
4101 * When an invalid_byte_sequence error occurs, some bytes are discarded and
4102 * some bytes are buffered to be converted later.
4103 * The latter bytes can be put back.
4104 * It can be observed by
4105 * Encoding::InvalidByteSequenceError#readagain_bytes and
4106 * Encoding::Converter#primitive_errinfo.
4107 *
4108 * ec = Encoding::Converter.new("utf-16le", "iso-8859-1")
4109 * src = "\x00\xd8\x61\x00"
4110 * dst = ""
4111 * p ec.primitive_convert(src, dst) #=> :invalid_byte_sequence
4112 * p ec.primitive_errinfo #=> [:invalid_byte_sequence, "UTF-16LE", "UTF-8", "\x00\xD8", "a\x00"]
4113 * p ec.putback #=> "a\x00"
4114 * p ec.putback #=> "" # no more bytes to put back
4115 *
4116 */
4117static VALUE
4118econv_putback(int argc, VALUE *argv, VALUE self)
4119{
4120 rb_econv_t *ec = check_econv(self);
4121 int n;
4122 int putbackable;
4123 VALUE str, max;
4124
4125 if (!rb_check_arity(argc, 0, 1) || NIL_P(max = argv[0])) {
4127 }
4128 else {
4129 n = NUM2INT(max);
4130 putbackable = rb_econv_putbackable(ec);
4131 if (putbackable < n)
4132 n = putbackable;
4133 }
4134
4135 str = rb_str_new(NULL, n);
4136 rb_econv_putback(ec, (unsigned char *)RSTRING_PTR(str), n);
4137
4138 if (ec->source_encoding) {
4140 }
4141
4142 return str;
4143}
4144
4145/*
4146 * call-seq:
4147 * ec.last_error -> exception or nil
4148 *
4149 * Returns an exception object for the last conversion.
4150 * Returns nil if the last conversion did not produce an error.
4151 *
4152 * "error" means that
4153 * Encoding::InvalidByteSequenceError and Encoding::UndefinedConversionError for
4154 * Encoding::Converter#convert and
4155 * :invalid_byte_sequence, :incomplete_input and :undefined_conversion for
4156 * Encoding::Converter#primitive_convert.
4157 *
4158 * ec = Encoding::Converter.new("utf-8", "iso-8859-1")
4159 * p ec.primitive_convert(src="\xf1abcd", dst="") #=> :invalid_byte_sequence
4160 * p ec.last_error #=> #<Encoding::InvalidByteSequenceError: "\xF1" followed by "a" on UTF-8>
4161 * p ec.primitive_convert(src, dst, nil, 1) #=> :destination_buffer_full
4162 * p ec.last_error #=> nil
4163 *
4164 */
4165static VALUE
4166econv_last_error(VALUE self)
4167{
4168 rb_econv_t *ec = check_econv(self);
4169 VALUE exc;
4170
4171 exc = make_econv_exception(ec);
4172 if (NIL_P(exc))
4173 return Qnil;
4174 return exc;
4175}
4176
4177/*
4178 * call-seq:
4179 * ec.replacement -> string
4180 *
4181 * Returns the replacement string.
4182 *
4183 * ec = Encoding::Converter.new("euc-jp", "us-ascii")
4184 * p ec.replacement #=> "?"
4185 *
4186 * ec = Encoding::Converter.new("euc-jp", "utf-8")
4187 * p ec.replacement #=> "\uFFFD"
4188 */
4189static VALUE
4190econv_get_replacement(VALUE self)
4191{
4192 rb_econv_t *ec = check_econv(self);
4193 int ret;
4194 rb_encoding *enc;
4195
4196 ret = make_replacement(ec);
4197 if (ret == -1) {
4198 rb_raise(rb_eUndefinedConversionError, "replacement character setup failed");
4199 }
4200
4202 return rb_enc_str_new((const char *)ec->replacement_str, (long)ec->replacement_len, enc);
4203}
4204
4205/*
4206 * call-seq:
4207 * ec.replacement = string
4208 *
4209 * Sets the replacement string.
4210 *
4211 * ec = Encoding::Converter.new("utf-8", "us-ascii", :undef => :replace)
4212 * ec.replacement = "<undef>"
4213 * p ec.convert("a \u3042 b") #=> "a <undef> b"
4214 */
4215static VALUE
4216econv_set_replacement(VALUE self, VALUE arg)
4217{
4218 rb_econv_t *ec = check_econv(self);
4219 VALUE string = arg;
4220 int ret;
4221 rb_encoding *enc;
4222
4223 StringValue(string);
4224 enc = rb_enc_get(string);
4225
4227 (const unsigned char *)RSTRING_PTR(string),
4228 RSTRING_LEN(string),
4229 rb_enc_name(enc));
4230
4231 if (ret == -1) {
4232 /* xxx: rb_eInvalidByteSequenceError? */
4233 rb_raise(rb_eUndefinedConversionError, "replacement character setup failed");
4234 }
4235
4236 return arg;
4237}
4238
4239VALUE
4241{
4242 return make_econv_exception(ec);
4243}
4244
4245void
4247{
4248 VALUE exc;
4249
4250 exc = make_econv_exception(ec);
4251 if (NIL_P(exc))
4252 return;
4253 rb_exc_raise(exc);
4254}
4255
4256/*
4257 * call-seq:
4258 * ecerr.source_encoding_name -> string
4259 *
4260 * Returns the source encoding name as a string.
4261 */
4262static VALUE
4263ecerr_source_encoding_name(VALUE self)
4264{
4265 return rb_attr_get(self, rb_intern("source_encoding_name"));
4266}
4267
4268/*
4269 * call-seq:
4270 * ecerr.source_encoding -> encoding
4271 *
4272 * Returns the source encoding as an encoding object.
4273 *
4274 * Note that the result may not be equal to the source encoding of
4275 * the encoding converter if the conversion has multiple steps.
4276 *
4277 * ec = Encoding::Converter.new("ISO-8859-1", "EUC-JP") # ISO-8859-1 -> UTF-8 -> EUC-JP
4278 * begin
4279 * ec.convert("\xa0") # NO-BREAK SPACE, which is available in UTF-8 but not in EUC-JP.
4280 * rescue Encoding::UndefinedConversionError
4281 * p $!.source_encoding #=> #<Encoding:UTF-8>
4282 * p $!.destination_encoding #=> #<Encoding:EUC-JP>
4283 * p $!.source_encoding_name #=> "UTF-8"
4284 * p $!.destination_encoding_name #=> "EUC-JP"
4285 * end
4286 *
4287 */
4288static VALUE
4289ecerr_source_encoding(VALUE self)
4290{
4291 return rb_attr_get(self, rb_intern("source_encoding"));
4292}
4293
4294/*
4295 * call-seq:
4296 * ecerr.destination_encoding_name -> string
4297 *
4298 * Returns the destination encoding name as a string.
4299 */
4300static VALUE
4301ecerr_destination_encoding_name(VALUE self)
4302{
4303 return rb_attr_get(self, rb_intern("destination_encoding_name"));
4304}
4305
4306/*
4307 * call-seq:
4308 * ecerr.destination_encoding -> encoding
4309 *
4310 * Returns the destination encoding as an encoding object.
4311 */
4312static VALUE
4313ecerr_destination_encoding(VALUE self)
4314{
4315 return rb_attr_get(self, rb_intern("destination_encoding"));
4316}
4317
4318/*
4319 * call-seq:
4320 * ecerr.error_char -> string
4321 *
4322 * Returns the one-character string which caused Encoding::UndefinedConversionError.
4323 *
4324 * ec = Encoding::Converter.new("ISO-8859-1", "EUC-JP")
4325 * begin
4326 * ec.convert("\xa0")
4327 * rescue Encoding::UndefinedConversionError
4328 * puts $!.error_char.dump #=> "\xC2\xA0"
4329 * p $!.error_char.encoding #=> #<Encoding:UTF-8>
4330 * end
4331 *
4332 */
4333static VALUE
4334ecerr_error_char(VALUE self)
4335{
4336 return rb_attr_get(self, rb_intern("error_char"));
4337}
4338
4339/*
4340 * call-seq:
4341 * ecerr.error_bytes -> string
4342 *
4343 * Returns the discarded bytes when Encoding::InvalidByteSequenceError occurs.
4344 *
4345 * ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
4346 * begin
4347 * ec.convert("abc\xA1\xFFdef")
4348 * rescue Encoding::InvalidByteSequenceError
4349 * p $! #=> #<Encoding::InvalidByteSequenceError: "\xA1" followed by "\xFF" on EUC-JP>
4350 * puts $!.error_bytes.dump #=> "\xA1"
4351 * puts $!.readagain_bytes.dump #=> "\xFF"
4352 * end
4353 */
4354static VALUE
4355ecerr_error_bytes(VALUE self)
4356{
4357 return rb_attr_get(self, rb_intern("error_bytes"));
4358}
4359
4360/*
4361 * call-seq:
4362 * ecerr.readagain_bytes -> string
4363 *
4364 * Returns the bytes to be read again when Encoding::InvalidByteSequenceError occurs.
4365 */
4366static VALUE
4367ecerr_readagain_bytes(VALUE self)
4368{
4369 return rb_attr_get(self, rb_intern("readagain_bytes"));
4370}
4371
4372/*
4373 * call-seq:
4374 * ecerr.incomplete_input? -> true or false
4375 *
4376 * Returns true if the invalid byte sequence error is caused by
4377 * premature end of string.
4378 *
4379 * ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
4380 *
4381 * begin
4382 * ec.convert("abc\xA1z")
4383 * rescue Encoding::InvalidByteSequenceError
4384 * p $! #=> #<Encoding::InvalidByteSequenceError: "\xA1" followed by "z" on EUC-JP>
4385 * p $!.incomplete_input? #=> false
4386 * end
4387 *
4388 * begin
4389 * ec.convert("abc\xA1")
4390 * ec.finish
4391 * rescue Encoding::InvalidByteSequenceError
4392 * p $! #=> #<Encoding::InvalidByteSequenceError: incomplete "\xA1" on EUC-JP>
4393 * p $!.incomplete_input? #=> true
4394 * end
4395 */
4396static VALUE
4397ecerr_incomplete_input(VALUE self)
4398{
4399 return rb_attr_get(self, rb_intern("incomplete_input"));
4400}
4401
4402/*
4403 * Document-class: Encoding::UndefinedConversionError
4404 *
4405 * Raised by Encoding and String methods when a transcoding operation
4406 * fails.
4407 */
4408
4409/*
4410 * Document-class: Encoding::InvalidByteSequenceError
4411 *
4412 * Raised by Encoding and String methods when the string being
4413 * transcoded contains a byte invalid for either the source or
4414 * target encoding.
4415 */
4416
4417/*
4418 * Document-class: Encoding::ConverterNotFoundError
4419 *
4420 * Raised by transcoding methods when a named encoding does not
4421 * correspond with a known converter.
4422 */
4423
4424#undef rb_intern
4425void
4427{
4428 transcoder_table = st_init_strcasetable();
4429
4430 sym_invalid = ID2SYM(rb_intern("invalid"));
4431 sym_undef = ID2SYM(rb_intern("undef"));
4432 sym_replace = ID2SYM(rb_intern("replace"));
4433 sym_fallback = ID2SYM(rb_intern("fallback"));
4434 sym_xml = ID2SYM(rb_intern("xml"));
4435 sym_text = ID2SYM(rb_intern("text"));
4436 sym_attr = ID2SYM(rb_intern("attr"));
4437
4438 sym_invalid_byte_sequence = ID2SYM(rb_intern("invalid_byte_sequence"));
4439 sym_undefined_conversion = ID2SYM(rb_intern("undefined_conversion"));
4440 sym_destination_buffer_full = ID2SYM(rb_intern("destination_buffer_full"));
4441 sym_source_buffer_empty = ID2SYM(rb_intern("source_buffer_empty"));
4442 sym_finished = ID2SYM(rb_intern("finished"));
4443 sym_after_output = ID2SYM(rb_intern("after_output"));
4444 sym_incomplete_input = ID2SYM(rb_intern("incomplete_input"));
4445 sym_universal_newline = ID2SYM(rb_intern("universal_newline"));
4446 sym_crlf_newline = ID2SYM(rb_intern("crlf_newline"));
4447 sym_cr_newline = ID2SYM(rb_intern("cr_newline"));
4448 sym_partial_input = ID2SYM(rb_intern("partial_input"));
4449
4450#ifdef ENABLE_ECONV_NEWLINE_OPTION
4451 sym_newline = ID2SYM(rb_intern("newline"));
4452 sym_universal = ID2SYM(rb_intern("universal"));
4453 sym_crlf = ID2SYM(rb_intern("crlf"));
4454 sym_cr = ID2SYM(rb_intern("cr"));
4455 sym_lf = ID2SYM(rb_intern("lf"));
4456#endif
4457
4458 InitVM(transcode);
4459}
4460
4461void
4463{
4464 rb_eUndefinedConversionError = rb_define_class_under(rb_cEncoding, "UndefinedConversionError", rb_eEncodingError);
4465 rb_eInvalidByteSequenceError = rb_define_class_under(rb_cEncoding, "InvalidByteSequenceError", rb_eEncodingError);
4466 rb_eConverterNotFoundError = rb_define_class_under(rb_cEncoding, "ConverterNotFoundError", rb_eEncodingError);
4467
4468 rb_define_method(rb_cString, "encode", str_encode, -1);
4469 rb_define_method(rb_cString, "encode!", str_encode_bang, -1);
4470
4473 rb_define_singleton_method(rb_cEncodingConverter, "asciicompat_encoding", econv_s_asciicompat_encoding, 1);
4474 rb_define_singleton_method(rb_cEncodingConverter, "search_convpath", econv_s_search_convpath, -1);
4475 rb_define_method(rb_cEncodingConverter, "initialize", econv_init, -1);
4476 rb_define_method(rb_cEncodingConverter, "inspect", econv_inspect, 0);
4477 rb_define_method(rb_cEncodingConverter, "convpath", econv_convpath, 0);
4478 rb_define_method(rb_cEncodingConverter, "source_encoding", econv_source_encoding, 0);
4479 rb_define_method(rb_cEncodingConverter, "destination_encoding", econv_destination_encoding, 0);
4480 rb_define_method(rb_cEncodingConverter, "primitive_convert", econv_primitive_convert, -1);
4481 rb_define_method(rb_cEncodingConverter, "convert", econv_convert, 1);
4482 rb_define_method(rb_cEncodingConverter, "finish", econv_finish, 0);
4483 rb_define_method(rb_cEncodingConverter, "primitive_errinfo", econv_primitive_errinfo, 0);
4484 rb_define_method(rb_cEncodingConverter, "insert_output", econv_insert_output, 1);
4485 rb_define_method(rb_cEncodingConverter, "putback", econv_putback, -1);
4486 rb_define_method(rb_cEncodingConverter, "last_error", econv_last_error, 0);
4487 rb_define_method(rb_cEncodingConverter, "replacement", econv_get_replacement, 0);
4488 rb_define_method(rb_cEncodingConverter, "replacement=", econv_set_replacement, 1);
4489 rb_define_method(rb_cEncodingConverter, "==", econv_equal, 1);
4490
4491 /* Document-const: INVALID_MASK
4492 *
4493 * Mask for invalid byte sequences
4494 */
4496
4497 /* Document-const: INVALID_REPLACE
4498 *
4499 * Replace invalid byte sequences
4500 */
4502
4503 /* Document-const: UNDEF_MASK
4504 *
4505 * Mask for a valid character in the source encoding but no related
4506 * character(s) in destination encoding.
4507 */
4509
4510 /* Document-const: UNDEF_REPLACE
4511 *
4512 * Replace byte sequences that are undefined in the destination encoding.
4513 */
4515
4516 /* Document-const: UNDEF_HEX_CHARREF
4517 *
4518 * Replace byte sequences that are undefined in the destination encoding
4519 * with an XML hexadecimal character reference. This is valid for XML
4520 * conversion.
4521 */
4523
4524 /* Document-const: PARTIAL_INPUT
4525 *
4526 * Indicates the source may be part of a larger string. See
4527 * primitive_convert for an example.
4528 */
4530
4531 /* Document-const: AFTER_OUTPUT
4532 *
4533 * Stop converting after some output is complete but before all of the
4534 * input was consumed. See primitive_convert for an example.
4535 */
4537
4538 /* Document-const: UNIVERSAL_NEWLINE_DECORATOR
4539 *
4540 * Decorator for converting CRLF and CR to LF
4541 */
4543
4544 /* Document-const: CRLF_NEWLINE_DECORATOR
4545 *
4546 * Decorator for converting LF to CRLF
4547 */
4549
4550 /* Document-const: CR_NEWLINE_DECORATOR
4551 *
4552 * Decorator for converting LF to CR
4553 */
4555
4556 /* Document-const: XML_TEXT_DECORATOR
4557 *
4558 * Escape as XML CharData
4559 */
4561
4562 /* Document-const: XML_ATTR_CONTENT_DECORATOR
4563 *
4564 * Escape as XML AttValue
4565 */
4567
4568 /* Document-const: XML_ATTR_QUOTE_DECORATOR
4569 *
4570 * Escape as XML AttValue
4571 */
4573
4574 rb_define_method(rb_eUndefinedConversionError, "source_encoding_name", ecerr_source_encoding_name, 0);
4575 rb_define_method(rb_eUndefinedConversionError, "destination_encoding_name", ecerr_destination_encoding_name, 0);
4576 rb_define_method(rb_eUndefinedConversionError, "source_encoding", ecerr_source_encoding, 0);
4577 rb_define_method(rb_eUndefinedConversionError, "destination_encoding", ecerr_destination_encoding, 0);
4578 rb_define_method(rb_eUndefinedConversionError, "error_char", ecerr_error_char, 0);
4579
4580 rb_define_method(rb_eInvalidByteSequenceError, "source_encoding_name", ecerr_source_encoding_name, 0);
4581 rb_define_method(rb_eInvalidByteSequenceError, "destination_encoding_name", ecerr_destination_encoding_name, 0);
4582 rb_define_method(rb_eInvalidByteSequenceError, "source_encoding", ecerr_source_encoding, 0);
4583 rb_define_method(rb_eInvalidByteSequenceError, "destination_encoding", ecerr_destination_encoding, 0);
4584 rb_define_method(rb_eInvalidByteSequenceError, "error_bytes", ecerr_error_bytes, 0);
4585 rb_define_method(rb_eInvalidByteSequenceError, "readagain_bytes", ecerr_readagain_bytes, 0);
4586 rb_define_method(rb_eInvalidByteSequenceError, "incomplete_input?", ecerr_incomplete_input, 0);
4587
4588 Init_newline();
4589}
void rb_ary_store(VALUE ary, long idx, VALUE val)
Definition: array.c:1141
VALUE rb_ary_push(VALUE ary, VALUE item)
Definition: array.c:1301
VALUE rb_ary_new(void)
Definition: array.c:749
VALUE rb_check_array_type(VALUE ary)
Definition: array.c:988
VALUE rb_ary_entry(VALUE ary, long offset)
Definition: array.c:1672
VALUE rb_assoc_new(VALUE car, VALUE cdr)
Definition: array.c:975
Our own, locale independent, character handling routines.
#define rb_define_method(klass, mid, func, arity)
Defines klass#mid.
Definition: cxxanyargs.hpp:653
#define rb_define_singleton_method(klass, mid, func, arity)
Defines klass.mid.
Definition: cxxanyargs.hpp:668
#define fail()
struct RIMemo * ptr
Definition: debug.c:88
int rb_enc_precise_mbclen(const char *p, const char *e, rb_encoding *enc)
Definition: encoding.c:1230
int rb_enc_get_index(VALUE obj)
Definition: encoding.c:977
int rb_to_encoding_index(VALUE enc)
Definition: encoding.c:267
VALUE rb_enc_associate(VALUE obj, rb_encoding *enc)
Definition: encoding.c:1064
rb_encoding * rb_utf8_encoding(void)
Definition: encoding.c:1537
rb_encoding * rb_enc_from_index(int index)
Definition: encoding.c:414
rb_encoding * rb_enc_get(VALUE obj)
Definition: encoding.c:1070
rb_encoding * rb_enc_find(const char *name)
Definition: encoding.c:916
int rb_define_dummy_encoding(const char *name)
Definition: encoding.c:616
VALUE rb_cEncoding
Definition: encoding.c:57
VALUE rb_enc_default_internal(void)
Definition: encoding.c:1743
VALUE rb_obj_encoding(VALUE obj)
Definition: encoding.c:1202
rb_encoding * rb_to_encoding(VALUE enc)
Definition: encoding.c:329
VALUE rb_enc_from_encoding(rb_encoding *encoding)
Definition: encoding.c:188
VALUE rb_enc_associate_index(VALUE obj, int idx)
Definition: encoding.c:1036
int rb_enc_find_index(const char *name)
Definition: encoding.c:879
int max
Definition: enough.c:225
void cleanup(void)
Definition: enough.c:244
uint8_t len
Definition: escape.c:17
char str[HTML_ESCAPE_MAX_LEN+1]
Definition: escape.c:18
#define RSTRING_LEN(string)
Definition: fbuffer.h:22
#define RSTRING_PTR(string)
Definition: fbuffer.h:19
#define memcpy(d, s, n)
Definition: ffi_common.h:55
#define PRIsVALUE
Definition: function.c:10
VALUE rb_cString
Definition: string.c:80
VALUE rb_define_class_under(VALUE outer, const char *name, VALUE super)
Defines a class under the namespace of outer.
Definition: class.c:797
int rb_scan_args(int argc, const VALUE *argv, const char *fmt,...)
Definition: class.c:2296
#define OBJ_FROZEN
Definition: fl_type.h:136
#define OBJ_FREEZE
Definition: fl_type.h:134
void rb_raise(VALUE exc, const char *fmt,...)
Definition: error.c:2917
void rb_exc_raise(VALUE mesg)
Raises an exception in the current thread.
Definition: eval.c:712
int rb_typeddata_is_kind_of(VALUE obj, const rb_data_type_t *data_type)
Definition: error.c:1007
void rb_bug(const char *fmt,...)
Definition: error.c:768
VALUE rb_eTypeError
Definition: error.c:1057
VALUE rb_eRuntimeError
Definition: error.c:1055
void * rb_check_typeddata(VALUE obj, const rb_data_type_t *data_type)
Definition: error.c:1024
VALUE rb_exc_new_str(VALUE etype, VALUE str)
Definition: error.c:1107
VALUE rb_eArgError
Definition: error.c:1058
VALUE rb_eEncodingError
Definition: error.c:1063
void rb_warning(const char *fmt,...)
Definition: error.c:439
VALUE rb_cObject
Object class.
Definition: object.c:49
VALUE rb_obj_class(VALUE)
Definition: object.c:245
VALUE rb_to_int(VALUE)
Converts val into Integer.
Definition: object.c:3051
VALUE rb_check_hash_type(VALUE hash)
Definition: hash.c:1860
VALUE rb_hash_aref(VALUE hash, VALUE key)
Definition: hash.c:2046
VALUE rb_hash_freeze(VALUE hash)
Definition: hash.c:101
VALUE rb_hash_aset(VALUE hash, VALUE key, VALUE val)
Definition: hash.c:2901
VALUE rb_hash_new(void)
Definition: hash.c:1538
@ idAREF
Definition: id.h:105
#define ECONV_XML_ATTR_QUOTE_DECORATOR
Definition: encoding.h:399
#define ECONV_AFTER_OUTPUT
Definition: encoding.h:407
#define ENC_CODERANGE_7BIT
Definition: encoding.h:93
#define ENC_CODERANGE_VALID
Definition: encoding.h:94
#define ECONV_UNIVERSAL_NEWLINE_DECORATOR
Definition: encoding.h:393
VALUE rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to)
Definition: string.c:1100
#define ECONV_XML_ATTR_CONTENT_DECORATOR
Definition: encoding.h:397
#define ECONV_INVALID_MASK
Definition: encoding.h:384
#define ECONV_CRLF_NEWLINE_DECORATOR
Definition: encoding.h:394
rb_econv_result_t
Definition: encoding.h:288
@ econv_incomplete_input
Definition: encoding.h:295
@ econv_finished
Definition: encoding.h:293
@ econv_undefined_conversion
Definition: encoding.h:290
@ econv_after_output
Definition: encoding.h:294
@ econv_source_buffer_empty
Definition: encoding.h:292
@ econv_destination_buffer_full
Definition: encoding.h:291
@ econv_invalid_byte_sequence
Definition: encoding.h:289
#define ECONV_UNDEF_REPLACE
Definition: encoding.h:387
int rb_enc_str_coderange(VALUE)
Definition: string.c:725
#define ECONV_XML_TEXT_DECORATOR
Definition: encoding.h:396
#define ECONV_CR_NEWLINE_DECORATOR
Definition: encoding.h:395
VALUE rb_enc_str_new(const char *, long, rb_encoding *)
Definition: string.c:857
#define rb_enc_name(enc)
Definition: encoding.h:168
#define ECONV_INVALID_REPLACE
Definition: encoding.h:385
#define rb_enc_mbc_to_codepoint(p, e, enc)
Definition: encoding.h:199
#define MBCLEN_CHARFOUND_LEN(ret)
Definition: encoding.h:183
#define rb_enc_asciicompat(enc)
Definition: encoding.h:236
#define ECONV_UNDEF_MASK
Definition: encoding.h:386
#define ECONV_PARTIAL_INPUT
Definition: encoding.h:406
#define ECONV_ERROR_HANDLER_MASK
Definition: encoding.h:383
#define ENC_CODERANGE_BROKEN
Definition: encoding.h:95
long rb_str_coderange_scan_restartable(const char *, const char *, rb_encoding *, int *)
Definition: string.c:617
#define MBCLEN_CHARFOUND_P(ret)
Definition: encoding.h:182
#define ECONV_UNDEF_HEX_CHARREF
Definition: encoding.h:388
#define ECONV_NEWLINE_DECORATOR_MASK
Definition: encoding.h:390
#define ENC_CODERANGE_SET(obj, cr)
Definition: encoding.h:99
Thin wrapper to ruby/config.h.
VALUE rb_funcallv_public(VALUE, ID, int, const VALUE *)
Calls a method.
Definition: vm_eval.c:1137
#define rb_ary_new4
Definition: array.h:74
#define rb_ary_new2
Definition: array.h:72
#define rb_check_frozen
Definition: error.h:72
#define rb_exc_new3
Definition: error.h:31
void rb_error_arity(int, int, int)
#define rb_check_arity
Definition: error.h:34
VALUE rb_require_string(VALUE)
Definition: load.c:1183
VALUE rb_obj_is_method(VALUE)
Definition: proc.c:1590
VALUE rb_obj_is_proc(VALUE)
Definition: proc.c:152
VALUE rb_method_call(int, const VALUE *, VALUE)
Definition: proc.c:2398
VALUE rb_proc_call(VALUE, VALUE)
Definition: proc.c:981
#define rb_str_new2
Definition: string.h:276
VALUE rb_str_resize(VALUE, long)
Definition: string.c:2859
void rb_str_shared_replace(VALUE, VALUE)
Definition: string.c:1480
#define rb_str_cat2
Definition: string.h:285
#define rb_str_new(str, len)
Definition: string.h:213
void rb_str_set_len(VALUE, long)
Definition: string.c:2842
void rb_str_modify(VALUE)
Definition: string.c:2262
VALUE rb_str_buf_new(long)
Definition: string.c:1398
VALUE rb_str_tmp_new(long)
Definition: string.c:1427
VALUE rb_str_dump(VALUE)
Definition: string.c:6311
VALUE rb_str_new_frozen(VALUE)
Definition: string.c:1273
VALUE rb_str_drop_bytes(VALUE, long)
Definition: string.c:4795
VALUE rb_str_dup(VALUE)
Definition: string.c:1631
size_t rb_str_capacity(VALUE)
Definition: string.c:773
#define rb_str_new_cstr(str)
Definition: string.h:219
VALUE rb_attr_get(VALUE, ID)
Definition: variable.c:1242
VALUE rb_ivar_set(VALUE, ID, VALUE)
Definition: variable.c:1493
int rb_respond_to(VALUE, ID)
Definition: vm_method.c:2561
void rb_define_alloc_func(VALUE, rb_alloc_func_t)
#define ID2SYM
Definition: symbol.h:44
VALUE rb_sym2str(VALUE)
Definition: symbol.c:927
ID rb_intern(const char *)
Definition: symbol.c:785
void rb_define_const(VALUE, const char *, VALUE)
Definition: variable.c:3150
Internal header aggregating init functions.
void Init_newline(void)
#define NUM2INT
Definition: int.h:44
#define INT2NUM
Definition: int.h:43
Internal header for Array.
Internal header for Object.
Internal header for String.
VALUE rb_enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl)
Definition: string.c:10587
#define bp()
Definition: internal.h:105
#define PRIdPTRDIFF
Definition: inttypes.h:105
voidpf void uLong size
Definition: ioapi.h:138
typedef long(ZCALLBACK *tell_file_func) OF((voidpf opaque
typedef int(ZCALLBACK *close_file_func) OF((voidpf opaque
voidpf void * buf
Definition: ioapi.h:138
#define SIZE_MAX
Definition: limits.h:71
#define LONG_MAX
Definition: limits.h:36
#define INT2FIX
Definition: long.h:48
#define NUM2LONG
Definition: long.h:51
int memcmp(const void *s1, const void *s2, size_t len)
Definition: memcmp.c:7
#define MEMCPY(p1, p2, type, n)
Definition: memory.h:129
#define REALLOC_N
Definition: memory.h:137
#define ALLOCA_N(type, n)
Definition: memory.h:112
#define ALLOC_N
Definition: memory.h:133
#define RB_GC_GUARD(v)
Definition: memory.h:91
#define MEMMOVE(p1, p2, type, n)
Definition: memory.h:130
unsigned int input
Definition: nkf.c:4325
const char * name
Definition: nkf.c:208
#define TRUE
Definition: nkf.h:175
#define FALSE
Definition: nkf.h:174
#define RARRAY_AREF(a, i)
Definition: psych_emitter.c:7
#define RARRAY_LEN
Definition: rarray.h:52
#define DATA_PTR(obj)
Definition: rdata.h:56
#define NULL
Definition: regenc.h:69
#define StringValue(v)
Definition: rstring.h:50
#define RSTRING_EMBED_LEN_MAX
Definition: rstring.h:39
#define StringValueCStr(v)
Definition: rstring.h:52
#define TypedData_Get_Struct(obj, type, data_type, sval)
Definition: rtypeddata.h:130
#define TypedData_Wrap_Struct(klass, data_type, sval)
Definition: rtypeddata.h:101
@ RUBY_TYPED_FREE_IMMEDIATELY
Definition: rtypeddata.h:62
const char * rb_obj_classname(VALUE)
Definition: variable.c:308
#define InitVM(ext)
Definition: ruby.h:112
int argc
Definition: ruby.c:240
char ** argv
Definition: ruby.c:241
#define Qundef
#define Qtrue
#define RTEST
#define Qnil
#define Qfalse
#define NIL_P
#define f
VALUE rb_str_catf(VALUE, const char *,...)
Definition: sprintf.c:1243
VALUE rb_sprintf(const char *,...)
Definition: sprintf.c:1203
@ ST_STOP
Definition: st.h:99
@ ST_CONTINUE
Definition: st.h:99
unsigned long st_data_t
Definition: st.h:22
#define st_init_strcasetable
Definition: st.h:114
#define st_foreach
Definition: st.h:142
#define st_lookup
Definition: st.h:128
#define st_add_direct
Definition: st.h:154
#define st_free_table
Definition: st.h:156
size_t strlen(const char *)
const char * ascii_compat_name
Definition: transcode.c:1756
const char * ascii_incompat_name
Definition: transcode.c:1757
unsigned char * out_data_start
Definition: transcode.c:114
struct rb_transcoding * tc
Definition: transcode.c:112
unsigned char * out_buf_start
Definition: transcode.c:113
rb_econv_result_t last_result
Definition: transcode.c:117
unsigned char * out_buf_end
Definition: transcode.c:116
unsigned char * out_data_end
Definition: transcode.c:115
rb_encoding * destination_encoding
Definition: transcode.c:156
unsigned char * in_buf_start
Definition: transcode.c:131
struct rb_econv_t::@177 last_error
size_t error_bytes_len
Definition: transcode.c:149
const char * source_encoding_name
Definition: transcode.c:124
size_t readagain_len
Definition: transcode.c:150
unsigned char * in_buf_end
Definition: transcode.c:134
size_t replacement_len
Definition: transcode.c:128
struct rb_transcoding * error_tc
Definition: transcode.c:145
int num_trans
Definition: transcode.c:138
unsigned char * in_data_start
Definition: transcode.c:132
rb_encoding * source_encoding
Definition: transcode.c:155
rb_econv_elem_t * elems
Definition: transcode.c:135
int started
Definition: transcode.c:122
const char * replacement_enc
Definition: transcode.c:129
const char * source_encoding
Definition: transcode.c:146
int replacement_allocated
Definition: transcode.c:136
const char * destination_encoding
Definition: transcode.c:147
rb_econv_result_t result
Definition: transcode.c:144
const unsigned char * replacement_str
Definition: transcode.c:127
struct rb_transcoding * last_tc
Definition: transcode.c:140
unsigned char * in_data_end
Definition: transcode.c:133
int num_allocated
Definition: transcode.c:137
const unsigned char * error_bytes_start
Definition: transcode.c:148
const char * destination_encoding_name
Definition: transcode.c:125
int num_finished
Definition: transcode.c:139
const char * dst_encoding
const char * src_encoding
rb_transcoder_asciicompat_type_t asciicompat_type
unsigned int output_index
Definition: transcode.c:71
ssize_t recognized_len
Definition: transcode.c:73
unsigned char next_byte
Definition: transcode.c:70
union rb_transcoding::@176 writebuf
unsigned int next_table
Definition: transcode.c:68
int resume_position
Definition: transcode.c:67
unsigned char ary[8]
Definition: transcode.c:76
ssize_t writebuf_len
Definition: transcode.c:81
VALUE next_info
Definition: transcode.c:69
const rb_transcoder * transcoder
Definition: transcode.c:63
union rb_transcoding::@175 readbuf
unsigned char * ptr
Definition: transcode.c:77
ssize_t readagain_len
Definition: transcode.c:74
union rb_transcoding::rb_transcoding_state_t state
ssize_t writebuf_off
Definition: transcode.c:80
st_table * visited
Definition: transcode.c:258
search_path_queue_t * queue
Definition: transcode.c:259
const char * base_enc
Definition: transcode.c:261
search_path_queue_t ** queue_last_ptr
Definition: transcode.c:260
const char * enc
Definition: transcode.c:254
struct search_path_queue_tag * next
Definition: transcode.c:253
Definition: st.h:79
st_index_t num_entries
Definition: st.h:86
Definition: string.c:7257
transcoder_entry_t ** entries
Definition: transcode.c:964
int num_additional
Definition: transcode.c:965
Definition: transcode.c:165
const char * sname
Definition: transcode.c:166
const rb_transcoder * transcoder
Definition: transcode.c:169
const char * dname
Definition: transcode.c:167
const char * lib
Definition: transcode.c:168
#define snprintf
Definition: subst.h:14
#define TRANSCODING_WRITEBUF(tc)
Definition: transcode.c:97
#define BL_ACTION(byte)
#define writebuf_len
#define hash_fallback
Definition: transcode.c:2237
rb_econv_t * rb_econv_open(const char *sname, const char *dname, int ecflags)
Definition: transcode.c:1061
VALUE rb_econv_open_exc(const char *sname, const char *dname, int ecflags)
Definition: transcode.c:2028
#define BL_MIN_BYTE
const char * rb_econv_encoding_to_insert_output(rb_econv_t *ec)
Definition: transcode.c:1495
#define next_info
#define TRANSCODING_STATE(tc)
Definition: transcode.c:106
int rb_econv_putbackable(rb_econv_t *ec)
Definition: transcode.c:1734
int rb_econv_has_convpath_p(const char *from_encoding, const char *to_encoding)
Definition: transcode.c:3189
#define SUSPEND_AFTER_OUTPUT(num)
#define SUSPEND_OBUF(num)
VALUE rb_cEncodingConverter
Definition: transcode.c:34
VALUE rb_econv_str_append(rb_econv_t *ec, VALUE src, VALUE dst, int flags)
Definition: transcode.c:1857
#define next_table
void rb_declare_transcoder(const char *enc1, const char *enc2, const char *lib)
Definition: transcode.c:242
#define BYTE_ADDR(index)
rb_econv_t * rb_econv_open_opts(const char *source_encoding, const char *destination_encoding, int ecflags, VALUE opthash)
Definition: transcode.c:2577
#define TRANSCODING_WRITEBUF_SIZE(tc)
Definition: transcode.c:101
size_t rb_econv_memsize(rb_econv_t *ec)
Definition: transcode.c:1712
#define DECORATOR_P(sname, dname)
Definition: transcode.c:163
int rb_econv_insert_output(rb_econv_t *ec, const unsigned char *str, size_t len, const char *str_encoding)
Definition: transcode.c:1579
VALUE rb_econv_str_convert(rb_econv_t *ec, VALUE src, int flags)
Definition: transcode.c:1869
VALUE rb_econv_substr_append(rb_econv_t *ec, VALUE src, long off, long len, VALUE dst, int flags)
Definition: transcode.c:1848
int rb_econv_decorate_at_last(rb_econv_t *ec, const char *decorator_name)
Definition: transcode.c:1917
#define SUSPEND(ret, num)
void rb_econv_binmode(rb_econv_t *ec)
Definition: transcode.c:1934
int rb_econv_decorate_at_first(rb_econv_t *ec, const char *decorator_name)
Definition: transcode.c:1900
VALUE rb_str_encode(VALUE str, VALUE to, int ecflags, VALUE ecopts)
Definition: transcode.c:2892
VALUE rb_econv_make_exception(rb_econv_t *ec)
Definition: transcode.c:4240
void rb_econv_check_error(rb_econv_t *ec)
Definition: transcode.c:4246
int rb_econv_prepare_opts(VALUE opthash, VALUE *opts)
Definition: transcode.c:2571
VALUE rb_econv_substr_convert(rb_econv_t *ec, VALUE src, long byteoff, long bytesize, int flags)
Definition: transcode.c:1863
rb_econv_result_t rb_econv_convert(rb_econv_t *ec, const unsigned char **input_ptr, const unsigned char *input_stop, unsigned char **output_ptr, unsigned char *output_stop, int flags)
Definition: transcode.c:1438
#define TRANSCODING_READBUF(tc)
Definition: transcode.c:93
void Init_transcode(void)
Definition: transcode.c:4426
#define MAX_ECFLAGS_DECORATORS
Definition: transcode.c:1020
int rb_econv_prepare_options(VALUE opthash, VALUE *opts, int ecflags)
Definition: transcode.c:2526
void InitVM_transcode(void)
Definition: transcode.c:4462
void rb_econv_close(rb_econv_t *ec)
Definition: transcode.c:1694
struct search_path_queue_tag search_path_queue_t
#define encoding_equal(enc1, enc2)
Definition: transcode.c:250
void rb_register_transcoder(const rb_transcoder *tr)
Definition: transcode.c:214
#define next_byte
void rb_econv_putback(rb_econv_t *ec, unsigned char *p, int n)
Definition: transcode.c:1745
#define writebuf_off
VALUE rb_econv_append(rb_econv_t *ec, const char *ss, long len, VALUE dst, int flags)
Definition: transcode.c:1805
int rb_econv_set_replacement(rb_econv_t *ec, const unsigned char *str, size_t len, const char *encname)
Definition: transcode.c:2190
const char * rb_econv_asciicompat_encoding(const char *ascii_incompat_name)
Definition: transcode.c:1778
#define BL_MAX_BYTE
Internal header for Encoding::Converter.
#define FUNio
#define FOURbt
#define FUNso
#define FUNsio
#define STR1
#define getBT3(a)
#define getBT2(a)
#define ZERObt
#define getBT1(a)
#define getGB4bt2(a)
#define getBT0(a)
#define getGB4bt1(a)
#define TWObt
#define FUNsi
#define STR1_LENGTH(byte_addr)
#define ONEbt
#define UNDEF
#define THREEbt
#define NOMAP
#define getGB4bt0(a)
@ asciicompat_encoder
@ asciicompat_decoder
#define INVALID
#define FUNii
#define STR1_BYTEINDEX(w)
#define getGB4bt3(a)
#define GB4bt
char ary[sizeof(double) > sizeof(void *) ? sizeof(double) :sizeof(void *)]
Definition: transcode.c:89
#define ALLOC(size)
Definition: unzip.c:112
unsigned long VALUE
Definition: value.h:38
#define T_HASH
Definition: value_type.h:64
#define T_ARRAY
Definition: value_type.h:55
#define T_SYMBOL
Definition: value_type.h:79
#define SYMBOL_P
Definition: value_type.h:87
VALUE(* fallback_func)(VALUE obj, VALUE name)
Definition: variable.c:143
#define dp(v)
Definition: vm_debug.h:20
int err
Definition: win32.c:142
#define xfree
Definition: xmalloc.h:49
#define xrealloc
Definition: xmalloc.h:47
#define xmalloc
Definition: xmalloc.h:44