Ruby 3.0.5p211 (2022-11-24 revision ba5cf0f7c52d4d35cc6a173c89eda98ceffa2dcf)
encoding.c
Go to the documentation of this file.
1/**********************************************************************
2
3 encoding.c -
4
5 $Author$
6 created at: Thu May 24 17:23:27 JST 2007
7
8 Copyright (C) 2007 Yukihiro Matsumoto
9
10**********************************************************************/
11
13
14#include <ctype.h>
15
16#include "encindex.h"
17#include "internal.h"
18#include "internal/enc.h"
19#include "internal/encoding.h"
20#include "internal/inits.h"
21#include "internal/load.h"
22#include "internal/object.h"
23#include "internal/string.h"
24#include "internal/vm.h"
25#include "regenc.h"
26#include "ruby/encoding.h"
27#include "ruby/util.h"
28#include "ruby_assert.h"
29#include "vm_sync.h"
30
31#ifndef ENC_DEBUG
32#define ENC_DEBUG 0
33#endif
34#define ENC_ASSERT(expr) RUBY_ASSERT_WHEN(ENC_DEBUG, expr)
35#define MUST_STRING(str) (ENC_ASSERT(RB_TYPE_P(str, T_STRING)), str)
36
37#undef rb_ascii8bit_encindex
38#undef rb_utf8_encindex
39#undef rb_usascii_encindex
40
42
43#if defined __GNUC__ && __GNUC__ >= 4
44#pragma GCC visibility push(default)
45int rb_enc_register(const char *name, rb_encoding *encoding);
46void rb_enc_set_base(const char *name, const char *orig);
47int rb_enc_set_dummy(int index);
48void rb_encdb_declare(const char *name);
49int rb_encdb_replicate(const char *name, const char *orig);
50int rb_encdb_dummy(const char *name);
51int rb_encdb_alias(const char *alias, const char *orig);
52void rb_encdb_set_unicode(int index);
53#pragma GCC visibility pop
54#endif
55
56static ID id_encoding;
58
59#define DEFAULT_ENCODING_LIST_CAPA 128
60static VALUE rb_default_encoding_list;
61static VALUE rb_additional_encoding_list;
62
64 const char *name;
67};
68
69static struct enc_table {
70 struct rb_encoding_entry *list;
71 int count;
72 int size;
73 st_table *names;
74} global_enc_table;
75
76static rb_encoding *global_enc_ascii,
77 *global_enc_utf_8,
78 *global_enc_us_ascii;
79
80#define GLOBAL_ENC_TABLE_ENTER(enc_table) struct enc_table *enc_table = &global_enc_table; RB_VM_LOCK_ENTER()
81#define GLOBAL_ENC_TABLE_LEAVE() RB_VM_LOCK_LEAVE()
82#define GLOBAL_ENC_TABLE_EVAL(enc_table, expr) do { \
83 GLOBAL_ENC_TABLE_ENTER(enc_table); \
84 { \
85 expr; \
86 } \
87 GLOBAL_ENC_TABLE_LEAVE(); \
88} while (0)
89
90
91#define ENC_DUMMY_FLAG (1<<24)
92#define ENC_INDEX_MASK (~(~0U<<24))
93
94#define ENC_TO_ENCINDEX(enc) (int)((enc)->ruby_encoding_index & ENC_INDEX_MASK)
95#define ENC_DUMMY_P(enc) ((enc)->ruby_encoding_index & ENC_DUMMY_FLAG)
96#define ENC_SET_DUMMY(enc) ((enc)->ruby_encoding_index |= ENC_DUMMY_FLAG)
97
98#define ENCODING_COUNT ENCINDEX_BUILTIN_MAX
99#define UNSPECIFIED_ENCODING INT_MAX
100
101#define ENCODING_NAMELEN_MAX 63
102#define valid_encoding_name_p(name) ((name) && strlen(name) <= ENCODING_NAMELEN_MAX)
103
104static const rb_data_type_t encoding_data_type = {
105 "encoding",
106 {0, 0, 0,},
108};
109
110#define is_data_encoding(obj) (RTYPEDDATA_P(obj) && RTYPEDDATA_TYPE(obj) == &encoding_data_type)
111#define is_obj_encoding(obj) (RB_TYPE_P((obj), T_DATA) && is_data_encoding(obj))
112
113int
115{
116 return is_data_encoding(obj);
117}
118
119static VALUE
120enc_new(rb_encoding *encoding)
121{
122 VALUE enc = TypedData_Wrap_Struct(rb_cEncoding, &encoding_data_type, (void *)encoding);
125 return enc;
126}
127
128static void
129enc_list_update(int index, rb_raw_encoding *encoding)
130{
131 if (index < DEFAULT_ENCODING_LIST_CAPA) {
132 VALUE list = rb_default_encoding_list;
133 if (list && NIL_P(rb_ary_entry(list, index))) {
134 /* initialize encoding data */
135 rb_ary_store(list, index, enc_new(encoding));
136 }
137 }
138 else {
140 {
141 VALUE list = rb_additional_encoding_list;
142 if (list && NIL_P(rb_ary_entry(list, index))) {
143 /* initialize encoding data */
144 rb_ary_store(list, index - DEFAULT_ENCODING_LIST_CAPA, enc_new(encoding));
145 }
146 }
148 }
149}
150
151static VALUE
152enc_list_lookup(int idx)
153{
154 VALUE list, enc;
155
156 if (idx < DEFAULT_ENCODING_LIST_CAPA) {
157 if (!(list = rb_default_encoding_list)) {
158 rb_bug("rb_enc_from_encoding_index(%d): no rb_default_encoding_list", idx);
159 }
160 enc = rb_ary_entry(list, idx);
161 }
162 else {
164 {
165 if (!(list = rb_additional_encoding_list)) {
166 rb_bug("rb_enc_from_encoding_index(%d): no rb_additional_encoding_list", idx);
167 }
169 }
171 }
172
173 if (NIL_P(enc)) {
174 rb_bug("rb_enc_from_encoding_index(%d): not created yet", idx);
175 }
176 else {
177 return enc;
178 }
179}
180
181static VALUE
182rb_enc_from_encoding_index(int idx)
183{
184 return enc_list_lookup(idx);
185}
186
187VALUE
189{
190 int idx;
191 if (!encoding) return Qnil;
192 idx = ENC_TO_ENCINDEX(encoding);
193 return rb_enc_from_encoding_index(idx);
194}
195
196int
198{
199 return enc ? ENC_TO_ENCINDEX(enc) : 0;
200}
201
202int
204{
205 return ENC_DUMMY_P(enc) != 0;
206}
207
208static int
209check_encoding(rb_encoding *enc)
210{
211 int index = rb_enc_to_index(enc);
212 if (rb_enc_from_index(index) != enc)
213 return -1;
214 if (rb_enc_autoload_p(enc)) {
215 index = rb_enc_autoload(enc);
216 }
217 return index;
218}
219
220static int
221enc_check_encoding(VALUE obj)
222{
223 if (!is_obj_encoding(obj)) {
224 return -1;
225 }
226 return check_encoding(RDATA(obj)->data);
227}
228
229NORETURN(static void not_encoding(VALUE enc));
230static void
231not_encoding(VALUE enc)
232{
233 rb_raise(rb_eTypeError, "wrong argument type %"PRIsVALUE" (expected Encoding)",
235}
236
237static rb_encoding *
238must_encoding(VALUE enc)
239{
240 int index = enc_check_encoding(enc);
241 if (index < 0) {
242 not_encoding(enc);
243 }
244 return DATA_PTR(enc);
245}
246
247static rb_encoding *
248must_encindex(int index)
249{
251 if (!enc) {
252 rb_raise(rb_eEncodingError, "encoding index out of bound: %d",
253 index);
254 }
255 if (ENC_TO_ENCINDEX(enc) != (int)(index & ENC_INDEX_MASK)) {
256 rb_raise(rb_eEncodingError, "wrong encoding index %d for %s (expected %d)",
258 }
259 if (rb_enc_autoload_p(enc) && rb_enc_autoload(enc) == -1) {
260 rb_loaderror("failed to load encoding (%s)",
262 }
263 return enc;
264}
265
266int
268{
269 int idx;
270 const char *name;
271
272 idx = enc_check_encoding(enc);
273 if (idx >= 0) {
274 return idx;
275 }
276 else if (NIL_P(enc = rb_check_string_type(enc))) {
277 return -1;
278 }
280 return -1;
281 }
282 if (!(name = rb_str_to_cstr(enc))) {
283 return -1;
284 }
285 return rb_enc_find_index(name);
286}
287
288static const char *
289name_for_encoding(volatile VALUE *enc)
290{
292 const char *n;
293
295 rb_raise(rb_eArgError, "invalid encoding name (non ASCII)");
296 }
297 if (!(n = rb_str_to_cstr(name))) {
298 rb_raise(rb_eArgError, "invalid encoding name (NUL byte)");
299 }
300 return n;
301}
302
303/* Returns encoding index or UNSPECIFIED_ENCODING */
304static int
305str_find_encindex(VALUE enc)
306{
307 int idx = rb_enc_find_index(name_for_encoding(&enc));
309 return idx;
310}
311
312static int
313str_to_encindex(VALUE enc)
314{
315 int idx = str_find_encindex(enc);
316 if (idx < 0) {
317 rb_raise(rb_eArgError, "unknown encoding name - %"PRIsVALUE, enc);
318 }
319 return idx;
320}
321
322static rb_encoding *
323str_to_encoding(VALUE enc)
324{
325 return rb_enc_from_index(str_to_encindex(enc));
326}
327
330{
331 if (enc_check_encoding(enc) >= 0) return RDATA(enc)->data;
332 return str_to_encoding(enc);
333}
334
337{
338 int idx;
339 if (enc_check_encoding(enc) >= 0) return RDATA(enc)->data;
340 idx = str_find_encindex(enc);
341 if (idx < 0) return NULL;
342 return rb_enc_from_index(idx);
343}
344
345static int
346enc_table_expand(struct enc_table *enc_table, int newsize)
347{
348 struct rb_encoding_entry *ent;
349 int count = newsize;
350
351 if (enc_table->size >= newsize) return newsize;
352 newsize = (newsize + 7) / 8 * 8;
353 ent = REALLOC_N(enc_table->list, struct rb_encoding_entry, newsize);
354 memset(ent + enc_table->size, 0, sizeof(*ent)*(newsize - enc_table->size));
355 enc_table->list = ent;
356 enc_table->size = newsize;
357 return count;
358}
359
360static int
361enc_register_at(struct enc_table *enc_table, int index, const char *name, rb_encoding *base_encoding)
362{
363 struct rb_encoding_entry *ent = &enc_table->list[index];
364 rb_raw_encoding *encoding;
365
366 if (!valid_encoding_name_p(name)) return -1;
367 if (!ent->name) {
368 ent->name = name = strdup(name);
369 }
370 else if (STRCASECMP(name, ent->name)) {
371 return -1;
372 }
373 encoding = (rb_raw_encoding *)ent->enc;
374 if (!encoding) {
375 encoding = xmalloc(sizeof(rb_encoding));
376 }
377 if (base_encoding) {
378 *encoding = *base_encoding;
379 }
380 else {
381 memset(encoding, 0, sizeof(*ent->enc));
382 }
383 encoding->name = name;
384 encoding->ruby_encoding_index = index;
385 ent->enc = encoding;
386 st_insert(enc_table->names, (st_data_t)name, (st_data_t)index);
387
388 enc_list_update(index, encoding);
389 return index;
390}
391
392static int
393enc_register(struct enc_table *enc_table, const char *name, rb_encoding *encoding)
394{
395 int index = enc_table->count;
396
397 enc_table->count = enc_table_expand(enc_table, index + 1);
398 return enc_register_at(enc_table, index, name, encoding);
399}
400
401static void set_encoding_const(const char *, rb_encoding *);
402static int enc_registered(struct enc_table *enc_table, const char *name);
403
404static rb_encoding *
405enc_from_index(struct enc_table *enc_table, int index)
406{
407 if (UNLIKELY(index < 0 || enc_table->count <= (index &= ENC_INDEX_MASK))) {
408 return 0;
409 }
410 return enc_table->list[index].enc;
411}
412
415{
417
418 switch (index) {
419 case ENCINDEX_ASCII: return global_enc_ascii;
420 case ENCINDEX_UTF_8: return global_enc_utf_8;
421 case ENCINDEX_US_ASCII: return global_enc_us_ascii;
422 default:
423 GLOBAL_ENC_TABLE_EVAL(enc_table,
424 enc = enc_from_index(enc_table, index));
425 return enc;
426 }
427}
428
429int
430rb_enc_register(const char *name, rb_encoding *encoding)
431{
432 int index;
433
434 GLOBAL_ENC_TABLE_ENTER(enc_table);
435 {
436 index = enc_registered(enc_table, name);
437
438 if (index >= 0) {
439 rb_encoding *oldenc = enc_from_index(enc_table, index);
440 if (STRCASECMP(name, rb_enc_name(oldenc))) {
441 index = enc_register(enc_table, name, encoding);
442 }
443 else if (rb_enc_autoload_p(oldenc) || !ENC_DUMMY_P(oldenc)) {
444 enc_register_at(enc_table, index, name, encoding);
445 }
446 else {
447 rb_raise(rb_eArgError, "encoding %s is already registered", name);
448 }
449 }
450 else {
451 index = enc_register(enc_table, name, encoding);
452 set_encoding_const(name, rb_enc_from_index(index));
453 }
454 }
456 return index;
457}
458
459int
460enc_registered(struct enc_table *enc_table, const char *name)
461{
462 st_data_t idx = 0;
463
464 if (!name) return -1;
465 if (!enc_table->list) return -1;
466 if (st_lookup(enc_table->names, (st_data_t)name, &idx)) {
467 return (int)idx;
468 }
469 return -1;
470}
471
472void
474{
475 GLOBAL_ENC_TABLE_ENTER(enc_table);
476 {
477 int idx = enc_registered(enc_table, name);
478 if (idx < 0) {
479 idx = enc_register(enc_table, name, 0);
480 }
481 set_encoding_const(name, rb_enc_from_index(idx));
482 }
484}
485
486static void
487enc_check_duplication(struct enc_table *enc_table, const char *name)
488{
489 if (enc_registered(enc_table, name) >= 0) {
490 rb_raise(rb_eArgError, "encoding %s is already registered", name);
491 }
492}
493
494static rb_encoding*
495set_base_encoding(struct enc_table *enc_table, int index, rb_encoding *base)
496{
497 rb_encoding *enc = enc_table->list[index].enc;
498
499 enc_table->list[index].base = base;
501 return enc;
502}
503
504/* for encdb.h
505 * Set base encoding for encodings which are not replicas
506 * but not in their own files.
507 */
508void
509rb_enc_set_base(const char *name, const char *orig)
510{
511 GLOBAL_ENC_TABLE_ENTER(enc_table);
512 {
513 int idx = enc_registered(enc_table, name);
514 int origidx = enc_registered(enc_table, orig);
515 set_base_encoding(enc_table, idx, rb_enc_from_index(origidx));
516 }
518}
519
520/* for encdb.h
521 * Set encoding dummy.
522 */
523int
525{
527
528 GLOBAL_ENC_TABLE_EVAL(enc_table,
529 enc = enc_table->list[index].enc);
530
532 return index;
533}
534
535static int
536enc_replicate(struct enc_table *enc_table, const char *name, rb_encoding *encoding)
537{
538 int idx;
539
540 enc_check_duplication(enc_table, name);
541 idx = enc_register(enc_table, name, encoding);
542 if (idx < 0) rb_raise(rb_eArgError, "invalid encoding name: %s", name);
543 set_base_encoding(enc_table, idx, encoding);
544 set_encoding_const(name, rb_enc_from_index(idx));
545 return idx;
546}
547
548int
549rb_enc_replicate(const char *name, rb_encoding *encoding)
550{
551 int r;
552
553 GLOBAL_ENC_TABLE_EVAL(enc_table,
554 r = enc_replicate(enc_table, name, encoding));
555
556 return r;
557}
558
559/*
560 * call-seq:
561 * enc.replicate(name) -> encoding
562 *
563 * Returns a replicated encoding of _enc_ whose name is _name_.
564 * The new encoding should have the same byte structure of _enc_.
565 * If _name_ is used by another encoding, raise ArgumentError.
566 *
567 */
568static VALUE
569enc_replicate_m(VALUE encoding, VALUE name)
570{
571 int idx = rb_enc_replicate(name_for_encoding(&name), rb_to_encoding(encoding));
573 return rb_enc_from_encoding_index(idx);
574}
575
576static int
577enc_replicate_with_index(struct enc_table *enc_table, const char *name, rb_encoding *origenc, int idx)
578{
579 if (idx < 0) {
580 idx = enc_register(enc_table, name, origenc);
581 }
582 else {
583 idx = enc_register_at(enc_table, idx, name, origenc);
584 }
585 if (idx >= 0) {
586 set_base_encoding(enc_table, idx, origenc);
587 set_encoding_const(name, rb_enc_from_index(idx));
588 }
589 else {
590 rb_raise(rb_eArgError, "failed to replicate encoding");
591 }
592 return idx;
593}
594
595int
596rb_encdb_replicate(const char *name, const char *orig)
597{
598 int r;
599
600 GLOBAL_ENC_TABLE_ENTER(enc_table);
601 {
602 int origidx = enc_registered(enc_table, orig);
603 int idx = enc_registered(enc_table, name);
604
605 if (origidx < 0) {
606 origidx = enc_register(enc_table, orig, 0);
607 }
608 r = enc_replicate_with_index(enc_table, name, rb_enc_from_index(origidx), idx);
609 }
611
612 return r;
613}
614
615int
617{
618 int index;
619
620 GLOBAL_ENC_TABLE_ENTER(enc_table);
621 {
622 index = enc_replicate(enc_table, name, rb_ascii8bit_encoding());
623 rb_encoding *enc = enc_table->list[index].enc;
625 }
627
628 return index;
629}
630
631int
633{
634 int index;
635
636 GLOBAL_ENC_TABLE_ENTER(enc_table);
637 {
638 index = enc_replicate_with_index(enc_table, name,
640 enc_registered(enc_table, name));
641 rb_encoding *enc = enc_table->list[index].enc;
643 }
645
646 return index;
647}
648
649/*
650 * call-seq:
651 * enc.dummy? -> true or false
652 *
653 * Returns true for dummy encodings.
654 * A dummy encoding is an encoding for which character handling is not properly
655 * implemented.
656 * It is used for stateful encodings.
657 *
658 * Encoding::ISO_2022_JP.dummy? #=> true
659 * Encoding::UTF_8.dummy? #=> false
660 *
661 */
662static VALUE
663enc_dummy_p(VALUE enc)
664{
665 return ENC_DUMMY_P(must_encoding(enc)) ? Qtrue : Qfalse;
666}
667
668/*
669 * call-seq:
670 * enc.ascii_compatible? -> true or false
671 *
672 * Returns whether ASCII-compatible or not.
673 *
674 * Encoding::UTF_8.ascii_compatible? #=> true
675 * Encoding::UTF_16BE.ascii_compatible? #=> false
676 *
677 */
678static VALUE
679enc_ascii_compatible_p(VALUE enc)
680{
681 return rb_enc_asciicompat(must_encoding(enc)) ? Qtrue : Qfalse;
682}
683
684/*
685 * Returns non-zero when the encoding is Unicode series other than UTF-7 else 0.
686 */
687int
689{
690 return ONIGENC_IS_UNICODE(enc);
691}
692
693static st_data_t
694enc_dup_name(st_data_t name)
695{
696 return (st_data_t)strdup((const char *)name);
697}
698
699/*
700 * Returns copied alias name when the key is added for st_table,
701 * else returns NULL.
702 */
703static int
704enc_alias_internal(struct enc_table *enc_table, const char *alias, int idx)
705{
706 return st_insert2(enc_table->names, (st_data_t)alias, (st_data_t)idx,
707 enc_dup_name);
708}
709
/*
 * Add `alias` as another name for the encoding at `idx`.
 *
 * Returns -1 when `alias` fails valid_encoding_name_p(); otherwise returns
 * `idx`.  The constant for the alias is only defined when
 * enc_alias_internal() returns zero.
 */
static int
enc_alias(struct enc_table *enc_table, const char *alias, int idx)
{
    if (!valid_encoding_name_p(alias)) {
        return -1;
    }
    if (enc_alias_internal(enc_table, alias, idx) == 0) {
        set_encoding_const(alias, enc_from_index(enc_table, idx));
    }
    return idx;
}
718
719int
720rb_enc_alias(const char *alias, const char *orig)
721{
722 int idx, r;
723
724 GLOBAL_ENC_TABLE_ENTER(enc_table);
725 {
726 enc_check_duplication(enc_table, alias);
727 if ((idx = rb_enc_find_index(orig)) < 0) {
728 r = -1;
729 }
730 else {
731 r = enc_alias(enc_table, alias, idx);
732 }
733 }
735
736 return r;
737}
738
739int
740rb_encdb_alias(const char *alias, const char *orig)
741{
742 int r;
743
744 GLOBAL_ENC_TABLE_ENTER(enc_table);
745 {
746 int idx = enc_registered(enc_table, orig);
747
748 if (idx < 0) {
749 idx = enc_register(enc_table, orig, 0);
750 }
751 r = enc_alias(enc_table, alias, idx);
752 }
754
755 return r;
756}
757
758void
760{
762}
763
764static void
765rb_enc_init(struct enc_table *enc_table)
766{
767 enc_table_expand(enc_table, ENCODING_COUNT + 1);
768 if (!enc_table->names) {
769 enc_table->names = st_init_strcasetable();
770 }
771#define ENC_REGISTER(enc) enc_register_at(enc_table, ENCINDEX_##enc, rb_enc_name(&OnigEncoding##enc), &OnigEncoding##enc)
774 ENC_REGISTER(US_ASCII);
775 global_enc_ascii = enc_table->list[ENCINDEX_ASCII].enc;
776 global_enc_utf_8 = enc_table->list[ENCINDEX_UTF_8].enc;
777 global_enc_us_ascii = enc_table->list[ENCINDEX_US_ASCII].enc;
778#undef ENC_REGISTER
779#define ENCDB_REGISTER(name, enc) enc_register_at(enc_table, ENCINDEX_##enc, name, NULL)
780 ENCDB_REGISTER("UTF-16BE", UTF_16BE);
781 ENCDB_REGISTER("UTF-16LE", UTF_16LE);
782 ENCDB_REGISTER("UTF-32BE", UTF_32BE);
783 ENCDB_REGISTER("UTF-32LE", UTF_32LE);
784 ENCDB_REGISTER("UTF-16", UTF_16);
785 ENCDB_REGISTER("UTF-32", UTF_32);
786 ENCDB_REGISTER("UTF8-MAC", UTF8_MAC);
787
788 ENCDB_REGISTER("EUC-JP", EUC_JP);
789 ENCDB_REGISTER("Windows-31J", Windows_31J);
790#undef ENCDB_REGISTER
791 enc_table->count = ENCINDEX_BUILTIN_MAX;
792}
793
796{
797 return must_encindex(index);
798}
799
800static int
801load_encoding(const char *name)
802{
803 VALUE enclib = rb_sprintf("enc/%s.so", name);
804 VALUE verbose = ruby_verbose;
806 VALUE errinfo;
807 char *s = RSTRING_PTR(enclib) + 4, *e = RSTRING_END(enclib) - 3;
808 int loaded;
809 int idx;
810
811 while (s < e) {
812 if (!ISALNUM(*s)) *s = '_';
813 else if (ISUPPER(*s)) *s = (char)TOLOWER(*s);
814 ++s;
815 }
816 enclib = rb_fstring(enclib);
819 errinfo = rb_errinfo();
820 loaded = rb_require_internal(enclib);
821 ruby_verbose = verbose;
823 rb_set_errinfo(errinfo);
824
825 GLOBAL_ENC_TABLE_ENTER(enc_table);
826 {
827 if (loaded < 0 || 1 < loaded) {
828 idx = -1;
829 }
830 else if ((idx = enc_registered(enc_table, name)) < 0) {
831 idx = -1;
832 }
833 else if (rb_enc_autoload_p(enc_table->list[idx].enc)) {
834 idx = -1;
835 }
836 }
838
839 return idx;
840}
841
842static int
843enc_autoload_body(struct enc_table *enc_table, rb_encoding *enc)
844{
845 rb_encoding *base = enc_table->list[ENC_TO_ENCINDEX(enc)].base;
846
847 if (base) {
848 int i = 0;
849 do {
850 if (i >= enc_table->count) return -1;
851 } while (enc_table->list[i].enc != base && (++i, 1));
852 if (rb_enc_autoload_p(base)) {
853 if (rb_enc_autoload(base) < 0) return -1;
854 }
856 enc_register_at(enc_table, i & ENC_INDEX_MASK, rb_enc_name(enc), base);
857 ((rb_raw_encoding *)enc)->ruby_encoding_index = i;
858 i &= ENC_INDEX_MASK;
859 return i;
860 }
861 else {
862 return -2;
863 }
864}
865
866int
868{
869 int i;
870 GLOBAL_ENC_TABLE_EVAL(enc_table, i = enc_autoload_body(enc_table, enc));
871 if (i == -2) {
872 i = load_encoding(rb_enc_name(enc));
873 }
874 return i;
875}
876
877/* Return encoding index or UNSPECIFIED_ENCODING from encoding name */
878int
880{
881 int i;
883
884 GLOBAL_ENC_TABLE_EVAL(enc_table, i = enc_registered(enc_table, name));
885
886 if (i < 0) {
887 i = load_encoding(name);
888 }
889 else if (!(enc = rb_enc_from_index(i))) {
890 if (i != UNSPECIFIED_ENCODING) {
891 rb_raise(rb_eArgError, "encoding %s is not registered", name);
892 }
893 }
894 else if (rb_enc_autoload_p(enc)) {
895 if (rb_enc_autoload(enc) < 0) {
896 rb_warn("failed to load encoding (%s); use ASCII-8BIT instead",
897 name);
898 return 0;
899 }
900 }
901 return i;
902}
903
904int
905rb_enc_find_index2(const char *name, long len)
906{
908
909 if (len > ENCODING_NAMELEN_MAX) return -1;
910 memcpy(buf, name, len);
911 buf[len] = '\0';
912 return rb_enc_find_index(buf);
913}
914
916rb_enc_find(const char *name)
917{
918 int idx = rb_enc_find_index(name);
919 if (idx < 0) idx = 0;
920 return rb_enc_from_index(idx);
921}
922
923static inline int
924enc_capable(VALUE obj)
925{
926 if (SPECIAL_CONST_P(obj)) return SYMBOL_P(obj);
927 switch (BUILTIN_TYPE(obj)) {
928 case T_STRING:
929 case T_REGEXP:
930 case T_FILE:
931 case T_SYMBOL:
932 return TRUE;
933 case T_DATA:
934 if (is_data_encoding(obj)) return TRUE;
935 default:
936 return FALSE;
937 }
938}
939
940int
942{
943 return enc_capable(obj);
944}
945
946ID
948{
949 CONST_ID(id_encoding, "encoding");
950 return id_encoding;
951}
952
953static int
954enc_get_index_str(VALUE str)
955{
956 int i = ENCODING_GET_INLINED(str);
957 if (i == ENCODING_INLINE_MAX) {
958 VALUE iv;
959
960#if 0
962 i = NUM2INT(iv);
963#else
964 /*
965 * Tentatively, assume ASCII-8BIT, if encoding index instance
966 * variable is not found. This can happen when freeing after
967 * all instance variables are removed in `obj_free`.
968 */
970 i = NIL_P(iv) ? ENCINDEX_ASCII : NUM2INT(iv);
971#endif
972 }
973 return i;
974}
975
976int
978{
979 int i = -1;
980 VALUE tmp;
981
982 if (SPECIAL_CONST_P(obj)) {
983 if (!SYMBOL_P(obj)) return -1;
984 obj = rb_sym2str(obj);
985 }
986 switch (BUILTIN_TYPE(obj)) {
987 case T_STRING:
988 case T_SYMBOL:
989 case T_REGEXP:
990 i = enc_get_index_str(obj);
991 break;
992 case T_FILE:
993 tmp = rb_funcallv(obj, rb_intern("internal_encoding"), 0, 0);
994 if (NIL_P(tmp)) {
995 tmp = rb_funcallv(obj, rb_intern("external_encoding"), 0, 0);
996 }
997 if (is_obj_encoding(tmp)) {
998 i = enc_check_encoding(tmp);
999 }
1000 break;
1001 case T_DATA:
1002 if (is_data_encoding(obj)) {
1003 i = enc_check_encoding(obj);
1004 }
1005 break;
1006 default:
1007 break;
1008 }
1009 return i;
1010}
1011
1012static void
1013enc_set_index(VALUE obj, int idx)
1014{
1015 if (!enc_capable(obj)) {
1016 rb_raise(rb_eArgError, "cannot set encoding on non-encoding capable object");
1017 }
1018
1019 if (idx < ENCODING_INLINE_MAX) {
1020 ENCODING_SET_INLINED(obj, idx);
1021 return;
1022 }
1024 rb_ivar_set(obj, rb_id_encoding(), INT2NUM(idx));
1025}
1026
1027void
1029{
1030 rb_check_frozen(obj);
1031 must_encindex(idx);
1032 enc_set_index(obj, idx);
1033}
1034
1035VALUE
1037{
1039 int oldidx, oldtermlen, termlen;
1040
1041/* enc_check_capable(obj);*/
1042 rb_check_frozen(obj);
1043 oldidx = rb_enc_get_index(obj);
1044 if (oldidx == idx)
1045 return obj;
1046 if (SPECIAL_CONST_P(obj)) {
1047 rb_raise(rb_eArgError, "cannot set encoding");
1048 }
1049 enc = must_encindex(idx);
1050 if (!ENC_CODERANGE_ASCIIONLY(obj) ||
1053 }
1054 termlen = rb_enc_mbminlen(enc);
1055 oldtermlen = rb_enc_mbminlen(rb_enc_from_index(oldidx));
1056 if (oldtermlen != termlen && RB_TYPE_P(obj, T_STRING)) {
1057 rb_str_change_terminator_length(obj, oldtermlen, termlen);
1058 }
1059 enc_set_index(obj, idx);
1060 return obj;
1061}
1062
1063VALUE
1065{
1067}
1068
1071{
1073}
1074
1075static rb_encoding* enc_compatible_str(VALUE str1, VALUE str2);
1076
1079{
1080 rb_encoding *enc = enc_compatible_str(MUST_STRING(str1), MUST_STRING(str2));
1081 if (!enc)
1082 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
1083 rb_enc_name(rb_enc_get(str1)),
1084 rb_enc_name(rb_enc_get(str2)));
1085 return enc;
1086}
1087
1090{
1091 rb_encoding *enc = rb_enc_compatible(str1, str2);
1092 if (!enc)
1093 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
1094 rb_enc_name(rb_enc_get(str1)),
1095 rb_enc_name(rb_enc_get(str2)));
1096 return enc;
1097}
1098
1099static rb_encoding*
1100enc_compatible_latter(VALUE str1, VALUE str2, int idx1, int idx2)
1101{
1102 int isstr1, isstr2;
1103 rb_encoding *enc1 = rb_enc_from_index(idx1);
1104 rb_encoding *enc2 = rb_enc_from_index(idx2);
1105
1106 isstr2 = RB_TYPE_P(str2, T_STRING);
1107 if (isstr2 && RSTRING_LEN(str2) == 0)
1108 return enc1;
1109 isstr1 = RB_TYPE_P(str1, T_STRING);
1110 if (isstr1 && isstr2 && RSTRING_LEN(str1) == 0)
1111 return (rb_enc_asciicompat(enc1) && rb_enc_str_asciionly_p(str2)) ? enc1 : enc2;
1112 if (!rb_enc_asciicompat(enc1) || !rb_enc_asciicompat(enc2)) {
1113 return 0;
1114 }
1115
1116 /* objects whose encoding is the same of contents */
1117 if (!isstr2 && idx2 == ENCINDEX_US_ASCII)
1118 return enc1;
1119 if (!isstr1 && idx1 == ENCINDEX_US_ASCII)
1120 return enc2;
1121
1122 if (!isstr1) {
1123 VALUE tmp = str1;
1124 int idx0 = idx1;
1125 str1 = str2;
1126 str2 = tmp;
1127 idx1 = idx2;
1128 idx2 = idx0;
1129 idx0 = isstr1;
1130 isstr1 = isstr2;
1131 isstr2 = idx0;
1132 }
1133 if (isstr1) {
1134 int cr1, cr2;
1135
1136 cr1 = rb_enc_str_coderange(str1);
1137 if (isstr2) {
1138 cr2 = rb_enc_str_coderange(str2);
1139 if (cr1 != cr2) {
1140 /* may need to handle ENC_CODERANGE_BROKEN */
1141 if (cr1 == ENC_CODERANGE_7BIT) return enc2;
1142 if (cr2 == ENC_CODERANGE_7BIT) return enc1;
1143 }
1144 if (cr2 == ENC_CODERANGE_7BIT) {
1145 return enc1;
1146 }
1147 }
1148 if (cr1 == ENC_CODERANGE_7BIT)
1149 return enc2;
1150 }
1151 return 0;
1152}
1153
1154static rb_encoding*
1155enc_compatible_str(VALUE str1, VALUE str2)
1156{
1157 int idx1 = enc_get_index_str(str1);
1158 int idx2 = enc_get_index_str(str2);
1159
1160 if (idx1 < 0 || idx2 < 0)
1161 return 0;
1162
1163 if (idx1 == idx2) {
1164 return rb_enc_from_index(idx1);
1165 }
1166 else {
1167 return enc_compatible_latter(str1, str2, idx1, idx2);
1168 }
1169}
1170
1173{
1174 int idx1 = rb_enc_get_index(str1);
1175 int idx2 = rb_enc_get_index(str2);
1176
1177 if (idx1 < 0 || idx2 < 0)
1178 return 0;
1179
1180 if (idx1 == idx2) {
1181 return rb_enc_from_index(idx1);
1182 }
1183
1184 return enc_compatible_latter(str1, str2, idx1, idx2);
1185}
1186
1187void
1189{
1191}
1192
1193
1194/*
1195 * call-seq:
1196 * obj.encoding -> encoding
1197 *
1198 * Returns the Encoding object that represents the encoding of obj.
1199 */
1200
1201VALUE
1203{
1204 int idx = rb_enc_get_index(obj);
1205 if (idx < 0) {
1206 rb_raise(rb_eTypeError, "unknown encoding");
1207 }
1208 return rb_enc_from_encoding_index(idx & ENC_INDEX_MASK);
1209}
1210
1211int
1212rb_enc_fast_mbclen(const char *p, const char *e, rb_encoding *enc)
1213{
1214 return ONIGENC_MBC_ENC_LEN(enc, (UChar*)p, (UChar*)e);
1215}
1216
1217int
1218rb_enc_mbclen(const char *p, const char *e, rb_encoding *enc)
1219{
1220 int n = ONIGENC_PRECISE_MBC_ENC_LEN(enc, (UChar*)p, (UChar*)e);
1221 if (MBCLEN_CHARFOUND_P(n) && MBCLEN_CHARFOUND_LEN(n) <= e-p)
1222 return MBCLEN_CHARFOUND_LEN(n);
1223 else {
1224 int min = rb_enc_mbminlen(enc);
1225 return min <= e-p ? min : (int)(e-p);
1226 }
1227}
1228
1229int
1230rb_enc_precise_mbclen(const char *p, const char *e, rb_encoding *enc)
1231{
1232 int n;
1233 if (e <= p)
1236 if (e-p < n)
1237 return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(n-(int)(e-p));
1238 return n;
1239}
1240
1241int
1242rb_enc_ascget(const char *p, const char *e, int *len, rb_encoding *enc)
1243{
1244 unsigned int c;
1245 int l;
1246 if (e <= p)
1247 return -1;
1248 if (rb_enc_asciicompat(enc)) {
1249 c = (unsigned char)*p;
1250 if (!ISASCII(c))
1251 return -1;
1252 if (len) *len = 1;
1253 return c;
1254 }
1255 l = rb_enc_precise_mbclen(p, e, enc);
1256 if (!MBCLEN_CHARFOUND_P(l))
1257 return -1;
1258 c = rb_enc_mbc_to_codepoint(p, e, enc);
1259 if (!rb_enc_isascii(c, enc))
1260 return -1;
1261 if (len) *len = l;
1262 return c;
1263}
1264
1265unsigned int
1266rb_enc_codepoint_len(const char *p, const char *e, int *len_p, rb_encoding *enc)
1267{
1268 int r;
1269 if (e <= p)
1270 rb_raise(rb_eArgError, "empty string");
1271 r = rb_enc_precise_mbclen(p, e, enc);
1272 if (!MBCLEN_CHARFOUND_P(r)) {
1273 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(enc));
1274 }
1275 if (len_p) *len_p = MBCLEN_CHARFOUND_LEN(r);
1276 return rb_enc_mbc_to_codepoint(p, e, enc);
1277}
1278
1279#undef rb_enc_codepoint
1280unsigned int
1281rb_enc_codepoint(const char *p, const char *e, rb_encoding *enc)
1282{
1283 return rb_enc_codepoint_len(p, e, 0, enc);
1284}
1285
1286int
1288{
1289 int n = ONIGENC_CODE_TO_MBCLEN(enc,c);
1290 if (n == 0) {
1291 rb_raise(rb_eArgError, "invalid codepoint 0x%x in %s", c, rb_enc_name(enc));
1292 }
1293 return n;
1294}
1295
1296#undef rb_enc_code_to_mbclen
1297int
1299{
1301}
1302
1303int
1305{
1307}
1308
1309int
1311{
1313}
1314
1315/*
1316 * call-seq:
1317 * enc.inspect -> string
1318 *
1319 * Returns a string which represents the encoding for programmers.
1320 *
1321 * Encoding::UTF_8.inspect #=> "#<Encoding:UTF-8>"
1322 * Encoding::ISO_2022_JP.inspect #=> "#<Encoding:ISO-2022-JP (dummy)>"
1323 */
1324static VALUE
1325enc_inspect(VALUE self)
1326{
1328
1329 if (!is_data_encoding(self)) {
1330 not_encoding(self);
1331 }
1332 if (!(enc = DATA_PTR(self)) || rb_enc_from_index(rb_enc_to_index(enc)) != enc) {
1333 rb_raise(rb_eTypeError, "broken Encoding");
1334 }
1336 "#<%"PRIsVALUE":%s%s%s>", rb_obj_class(self),
1338 (ENC_DUMMY_P(enc) ? " (dummy)" : ""),
1339 rb_enc_autoload_p(enc) ? " (autoload)" : "");
1340}
1341
1342/*
1343 * call-seq:
1344 * enc.name -> string
1345 * enc.to_s -> string
1346 *
1347 * Returns the name of the encoding.
1348 *
1349 * Encoding::UTF_8.name #=> "UTF-8"
1350 */
1351static VALUE
1352enc_name(VALUE self)
1353{
1355}
1356
1357static int
1358enc_names_i(st_data_t name, st_data_t idx, st_data_t args)
1359{
1360 VALUE *arg = (VALUE *)args;
1361
1362 if ((int)idx == (int)arg[0]) {
1363 VALUE str = rb_fstring_cstr((char *)name);
1364 rb_ary_push(arg[1], str);
1365 }
1366 return ST_CONTINUE;
1367}
1368
1369/*
1370 * call-seq:
1371 * enc.names -> array
1372 *
1373 * Returns the list of name and aliases of the encoding.
1374 *
1375 * Encoding::WINDOWS_31J.names #=> ["Windows-31J", "CP932", "csWindows31J", "SJIS", "PCK"]
1376 */
1377static VALUE
1378enc_names(VALUE self)
1379{
1380 VALUE args[2];
1381
1382 args[0] = (VALUE)rb_to_encoding_index(self);
1383 args[1] = rb_ary_new2(0);
1384
1385 GLOBAL_ENC_TABLE_EVAL(enc_table,
1386 st_foreach(enc_table->names, enc_names_i, (st_data_t)args));
1387
1388 return args[1];
1389}
1390
1391/*
1392 * call-seq:
1393 * Encoding.list -> [enc1, enc2, ...]
1394 *
1395 * Returns the list of loaded encodings.
1396 *
1397 * Encoding.list
1398 * #=> [#<Encoding:ASCII-8BIT>, #<Encoding:UTF-8>,
1399 * #<Encoding:ISO-2022-JP (dummy)>]
1400 *
1401 * Encoding.find("US-ASCII")
1402 * #=> #<Encoding:US-ASCII>
1403 *
1404 * Encoding.list
1405 * #=> [#<Encoding:ASCII-8BIT>, #<Encoding:UTF-8>,
1406 * #<Encoding:US-ASCII>, #<Encoding:ISO-2022-JP (dummy)>]
1407 *
1408 */
1409static VALUE
1410enc_list(VALUE klass)
1411{
1412 VALUE ary = rb_ary_new2(0);
1413
1415 {
1416 rb_ary_replace(ary, rb_default_encoding_list);
1417 rb_ary_concat(ary, rb_additional_encoding_list);
1418 }
1420
1421 return ary;
1422}
1423
1424/*
1425 * call-seq:
1426 * Encoding.find(string) -> enc
1427 *
1428 * Search the encoding with specified <i>name</i>.
1429 * <i>name</i> should be a string.
1430 *
1431 * Encoding.find("US-ASCII") #=> #<Encoding:US-ASCII>
1432 *
1433 * Names which this method accept are encoding names and aliases
1434 * including following special aliases
1435 *
1436 * "external":: default external encoding
1437 * "internal":: default internal encoding
1438 * "locale":: locale encoding
1439 * "filesystem":: filesystem encoding
1440 *
1441 * An ArgumentError is raised when there is no encoding named <i>name</i>.
1442 * Only <code>Encoding.find("internal")</code> however returns nil
1443 * when no encoding named "internal", in other words, when Ruby has no
1444 * default internal encoding.
1445 */
1446static VALUE
1447enc_find(VALUE klass, VALUE enc)
1448{
1449 int idx;
1450 if (is_obj_encoding(enc))
1451 return enc;
1452 idx = str_to_encindex(enc);
1453 if (idx == UNSPECIFIED_ENCODING) return Qnil;
1454 return rb_enc_from_encoding_index(idx);
1455}
1456
1457/*
1458 * call-seq:
1459 * Encoding.compatible?(obj1, obj2) -> enc or nil
1460 *
1461 * Checks the compatibility of two objects.
1462 *
1463 * If the objects are both strings they are compatible when they are
1464 * concatenatable. The encoding of the concatenated string will be returned
1465 * if they are compatible, nil if they are not.
1466 *
1467 * Encoding.compatible?("\xa1".force_encoding("iso-8859-1"), "b")
1468 * #=> #<Encoding:ISO-8859-1>
1469 *
1470 * Encoding.compatible?(
1471 * "\xa1".force_encoding("iso-8859-1"),
1472 * "\xa1\xa1".force_encoding("euc-jp"))
1473 * #=> nil
1474 *
1475 * If the objects are non-strings their encodings are compatible when they
1476 * have an encoding and:
1477 * * Either encoding is US-ASCII compatible
1478 * * One of the encodings is a 7-bit encoding
1479 *
1480 */
1481static VALUE
1482enc_compatible_p(VALUE klass, VALUE str1, VALUE str2)
1483{
1485
1486 if (!enc_capable(str1)) return Qnil;
1487 if (!enc_capable(str2)) return Qnil;
1488 enc = rb_enc_compatible(str1, str2);
1489 if (!enc) return Qnil;
1490 return rb_enc_from_encoding(enc);
1491}
1492
1493NORETURN(static VALUE enc_s_alloc(VALUE klass));
1494/* :nodoc: */
1495static VALUE
1496enc_s_alloc(VALUE klass)
1497{
1498 rb_undefined_alloc(klass);
1500}
1501
1502/* :nodoc: */
1503static VALUE
1504enc_dump(int argc, VALUE *argv, VALUE self)
1505{
1506 rb_check_arity(argc, 0, 1);
1507 return enc_name(self);
1508}
1509
1510/* :nodoc: */
1511static VALUE
1512enc_load(VALUE klass, VALUE str)
1513{
1514 return str;
1515}
1516
1517/* :nodoc: */
1518static VALUE
1519enc_m_loader(VALUE klass, VALUE str)
1520{
1521 return enc_find(klass, str);
1522}
1523
1526{
1527 return global_enc_ascii;
1528}
1529
1530int
1532{
1533 return ENCINDEX_ASCII;
1534}
1535
1538{
1539 return global_enc_utf_8;
1540}
1541
1542int
1544{
1545 return ENCINDEX_UTF_8;
1546}
1547
1550{
1551 return global_enc_us_ascii;
1552}
1553
1554int
1556{
1557 return ENCINDEX_US_ASCII;
1558}
1559
1560int rb_locale_charmap_index(void);
1561
1562int
1564{
1565 int idx = rb_locale_charmap_index();
1566
1567 if (idx < 0) idx = ENCINDEX_UTF_8;
1568
1569 GLOBAL_ENC_TABLE_ENTER(enc_table);
1570 if (enc_registered(enc_table, "locale") < 0) {
1571# if defined _WIN32
1572 void Init_w32_codepage(void);
1574# endif
1575 enc_alias_internal(enc_table, "locale", idx);
1576 }
1578
1579 return idx;
1580}
1581
1584{
1586}
1587
1588int
1590{
1591 int idx;
1592
1593 GLOBAL_ENC_TABLE_EVAL(enc_table,
1594 idx = enc_registered(enc_table, "filesystem"));
1595
1596 if (idx < 0)
1597 idx = ENCINDEX_ASCII;
1598 return idx;
1599}
1600
1603{
1605}
1606
1608 int index; /* -2 => not yet set, -1 => nil */
1610};
1611
1612static struct default_encoding default_external = {0};
1613
1614static int
1615enc_set_default_encoding(struct default_encoding *def, VALUE encoding, const char *name)
1616{
1617 int overridden = FALSE;
1618
1619 if (def->index != -2)
1620 /* Already set */
1621 overridden = TRUE;
1622
1623 GLOBAL_ENC_TABLE_ENTER(enc_table);
1624 {
1625 if (NIL_P(encoding)) {
1626 def->index = -1;
1627 def->enc = 0;
1628 st_insert(enc_table->names, (st_data_t)strdup(name),
1630 }
1631 else {
1632 def->index = rb_enc_to_index(rb_to_encoding(encoding));
1633 def->enc = 0;
1634 enc_alias_internal(enc_table, name, def->index);
1635 }
1636
1637 if (def == &default_external) {
1638 enc_alias_internal(enc_table, "filesystem", Init_enc_set_filesystem_encoding());
1639 }
1640 }
1642
1643 return overridden;
1644}
1645
1648{
1649 if (default_external.enc) return default_external.enc;
1650
1651 if (default_external.index >= 0) {
1652 default_external.enc = rb_enc_from_index(default_external.index);
1653 return default_external.enc;
1654 }
1655 else {
1656 return rb_locale_encoding();
1657 }
1658}
1659
1660VALUE
1662{
1664}
1665
1666/*
1667 * call-seq:
1668 * Encoding.default_external -> enc
1669 *
1670 * Returns default external encoding.
1671 *
1672 * The default external encoding is used by default for strings created from
1673 * the following locations:
1674 *
1675 * * CSV
1676 * * File data read from disk
1677 * * SDBM
1678 * * StringIO
1679 * * Zlib::GzipReader
1680 * * Zlib::GzipWriter
1681 * * String#inspect
1682 * * Regexp#inspect
1683 *
1684 * While strings created from these locations will have this encoding, the
1685 * encoding may not be valid. Be sure to check String#valid_encoding?.
1686 *
1687 * File data written to disk will be transcoded to the default external
1688 * encoding when written, if default_internal is not nil.
1689 *
1690 * The default external encoding is initialized by the -E option.
1691 * If -E isn't set, it is initialized to UTF-8 on Windows and the locale on
1692 * other operating systems.
1693 */
1694static VALUE
1695get_default_external(VALUE klass)
1696{
1697 return rb_enc_default_external();
1698}
1699
1700void
1702{
1703 if (NIL_P(encoding)) {
1704 rb_raise(rb_eArgError, "default external can not be nil");
1705 }
1706 enc_set_default_encoding(&default_external, encoding,
1707 "external");
1708}
1709
1710/*
1711 * call-seq:
1712 * Encoding.default_external = enc
1713 *
1714 * Sets default external encoding. You should not set
1715 * Encoding::default_external in ruby code as strings created before changing
1716 * the value may have a different encoding from strings created after the value
1717 * was changed. Instead, you should use <tt>ruby -E</tt> to invoke ruby with
1718 * the correct default_external.
1719 *
1720 * See Encoding::default_external for information on how the default external
1721 * encoding is used.
1722 */
1723static VALUE
1724set_default_external(VALUE klass, VALUE encoding)
1725{
1726 rb_warning("setting Encoding.default_external");
1728 return encoding;
1729}
1730
1731static struct default_encoding default_internal = {-2};
1732
1735{
1736 if (!default_internal.enc && default_internal.index >= 0) {
1737 default_internal.enc = rb_enc_from_index(default_internal.index);
1738 }
1739 return default_internal.enc; /* can be NULL */
1740}
1741
1742VALUE
1744{
1745 /* Note: These functions cope with default_internal not being set */
1747}
1748
1749/*
1750 * call-seq:
1751 * Encoding.default_internal -> enc
1752 *
1753 * Returns default internal encoding. Strings will be transcoded to the
1754 * default internal encoding in the following places if the default internal
1755 * encoding is not nil:
1756 *
1757 * * CSV
1758 * * Etc.sysconfdir and Etc.systmpdir
1759 * * File data read from disk
1760 * * File names from Dir
1761 * * Integer#chr
1762 * * String#inspect and Regexp#inspect
1763 * * Strings returned from Readline
1764 * * Strings returned from SDBM
1765 * * Time#zone
1766 * * Values from ENV
1767 * * Values in ARGV including $PROGRAM_NAME
1768 *
1769 * Additionally String#encode and String#encode! use the default internal
1770 * encoding if no encoding is given.
1771 *
1772 * The script encoding (__ENCODING__), not default_internal, is used as the
1773 * encoding of created strings.
1774 *
1775 * Encoding::default_internal is initialized with -E option or nil otherwise.
1776 */
1777static VALUE
1778get_default_internal(VALUE klass)
1779{
1780 return rb_enc_default_internal();
1781}
1782
1783void
1785{
1786 enc_set_default_encoding(&default_internal, encoding,
1787 "internal");
1788}
1789
1790/*
1791 * call-seq:
1792 * Encoding.default_internal = enc or nil
1793 *
1794 * Sets default internal encoding or removes default internal encoding when
1795 * passed nil. You should not set Encoding::default_internal in ruby code as
1796 * strings created before changing the value may have a different encoding
1797 * from strings created after the change. Instead you should use
1798 * <tt>ruby -E</tt> to invoke ruby with the correct default_internal.
1799 *
1800 * See Encoding::default_internal for information on how the default internal
1801 * encoding is used.
1802 */
1803static VALUE
1804set_default_internal(VALUE klass, VALUE encoding)
1805{
1806 rb_warning("setting Encoding.default_internal");
1808 return encoding;
1809}
1810
1811static void
1812set_encoding_const(const char *name, rb_encoding *enc)
1813{
1814 VALUE encoding = rb_enc_from_encoding(enc);
1815 char *s = (char *)name;
1816 int haslower = 0, hasupper = 0, valid = 0;
1817
1818 if (ISDIGIT(*s)) return;
1819 if (ISUPPER(*s)) {
1820 hasupper = 1;
1821 while (*++s && (ISALNUM(*s) || *s == '_')) {
1822 if (ISLOWER(*s)) haslower = 1;
1823 }
1824 }
1825 if (!*s) {
1826 if (s - name > ENCODING_NAMELEN_MAX) return;
1827 valid = 1;
1828 rb_define_const(rb_cEncoding, name, encoding);
1829 }
1830 if (!valid || haslower) {
1831 size_t len = s - name;
1832 if (len > ENCODING_NAMELEN_MAX) return;
1833 if (!haslower || !hasupper) {
1834 do {
1835 if (ISLOWER(*s)) haslower = 1;
1836 if (ISUPPER(*s)) hasupper = 1;
1837 } while (*++s && (!haslower || !hasupper));
1838 len = s - name;
1839 }
1840 len += strlen(s);
1841 if (len++ > ENCODING_NAMELEN_MAX) return;
1842 MEMCPY(s = ALLOCA_N(char, len), name, char, len);
1843 name = s;
1844 if (!valid) {
1845 if (ISLOWER(*s)) *s = ONIGENC_ASCII_CODE_TO_UPPER_CASE((int)*s);
1846 for (; *s; ++s) {
1847 if (!ISALNUM(*s)) *s = '_';
1848 }
1849 if (hasupper) {
1850 rb_define_const(rb_cEncoding, name, encoding);
1851 }
1852 }
1853 if (haslower) {
1854 for (s = (char *)name; *s; ++s) {
1855 if (ISLOWER(*s)) *s = ONIGENC_ASCII_CODE_TO_UPPER_CASE((int)*s);
1856 }
1857 rb_define_const(rb_cEncoding, name, encoding);
1858 }
1859 }
1860}
1861
1862static int
1863rb_enc_name_list_i(st_data_t name, st_data_t idx, st_data_t arg)
1864{
1865 VALUE ary = (VALUE)arg;
1866 VALUE str = rb_fstring_cstr((char *)name);
1867 rb_ary_push(ary, str);
1868 return ST_CONTINUE;
1869}
1870
1871/*
1872 * call-seq:
1873 * Encoding.name_list -> ["enc1", "enc2", ...]
1874 *
1875 * Returns the list of available encoding names.
1876 *
1877 * Encoding.name_list
1878 * #=> ["US-ASCII", "ASCII-8BIT", "UTF-8",
1879 * "ISO-8859-1", "Shift_JIS", "EUC-JP",
1880 * "Windows-31J",
1881 * "BINARY", "CP932", "eucJP"]
1882 *
1883 */
1884
1885static VALUE
1886rb_enc_name_list(VALUE klass)
1887{
1888 VALUE ary;
1889
1890 GLOBAL_ENC_TABLE_ENTER(enc_table);
1891 {
1892 ary = rb_ary_new2(enc_table->names->num_entries);
1893 st_foreach(enc_table->names, rb_enc_name_list_i, (st_data_t)ary);
1894 }
1896
1897 return ary;
1898}
1899
1900static int
1901rb_enc_aliases_enc_i(st_data_t name, st_data_t orig, st_data_t arg)
1902{
1903 VALUE *p = (VALUE *)arg;
1904 VALUE aliases = p[0], ary = p[1];
1905 int idx = (int)orig;
1906 VALUE key, str = rb_ary_entry(ary, idx);
1907
1908 if (NIL_P(str)) {
1910
1911 if (!enc) return ST_CONTINUE;
1912 if (STRCASECMP((char*)name, rb_enc_name(enc)) == 0) {
1913 return ST_CONTINUE;
1914 }
1916 rb_ary_store(ary, idx, str);
1917 }
1918 key = rb_fstring_cstr((char *)name);
1919 rb_hash_aset(aliases, key, str);
1920 return ST_CONTINUE;
1921}
1922
1923/*
1924 * call-seq:
1925 * Encoding.aliases -> {"alias1" => "orig1", "alias2" => "orig2", ...}
1926 *
1927 * Returns the hash of available encoding alias and original encoding name.
1928 *
1929 * Encoding.aliases
1930 * #=> {"BINARY"=>"ASCII-8BIT", "ASCII"=>"US-ASCII", "ANSI_X3.4-1968"=>"US-ASCII",
1931 * "SJIS"=>"Windows-31J", "eucJP"=>"EUC-JP", "CP932"=>"Windows-31J"}
1932 *
1933 */
1934
1935static VALUE
1936rb_enc_aliases(VALUE klass)
1937{
1938 VALUE aliases[2];
1939 aliases[0] = rb_hash_new();
1940 aliases[1] = rb_ary_new();
1941
1942 GLOBAL_ENC_TABLE_EVAL(enc_table,
1943 st_foreach(enc_table->names, rb_enc_aliases_enc_i, (st_data_t)aliases));
1944
1945 return aliases[0];
1946}
1947
1948/*
1949 * An Encoding instance represents a character encoding usable in Ruby. It is
1950 * defined as a constant under the Encoding namespace. It has a name and
1951 * optionally, aliases:
1952 *
1953 * Encoding::ISO_8859_1.name
1954 * #=> "ISO-8859-1"
1955 *
1956 * Encoding::ISO_8859_1.names
1957 * #=> ["ISO-8859-1", "ISO8859-1"]
1958 *
1959 * Ruby methods dealing with encodings return or accept Encoding instances as
1960 * arguments (when a method accepts an Encoding instance as an argument, it
1961 * can be passed an Encoding name or alias instead).
1962 *
1963 * "some string".encoding
1964 * #=> #<Encoding:UTF-8>
1965 *
1966 * string = "some string".encode(Encoding::ISO_8859_1)
1967 * #=> "some string"
1968 * string.encoding
1969 * #=> #<Encoding:ISO-8859-1>
1970 *
1971 * "some string".encode "ISO-8859-1"
1972 * #=> "some string"
1973 *
1974 * Encoding::ASCII_8BIT is a special encoding that is usually used for
1975 * a byte string, not a character string. But as the name insists, its
1976 * characters in the range of ASCII are considered as ASCII
1977 * characters. This is useful when you use ASCII-8BIT characters with
1978 * other ASCII compatible characters.
1979 *
1980 * == Changing an encoding
1981 *
1982 * The associated Encoding of a String can be changed in two different ways.
1983 *
1984 * First, it is possible to set the Encoding of a string to a new Encoding
1985 * without changing the internal byte representation of the string, with
1986 * String#force_encoding. This is how you can tell Ruby the correct encoding
1987 * of a string.
1988 *
1989 * string
1990 * #=> "R\xC3\xA9sum\xC3\xA9"
1991 * string.encoding
1992 * #=> #<Encoding:ISO-8859-1>
1993 * string.force_encoding(Encoding::UTF_8)
1994 * #=> "R\u00E9sum\u00E9"
1995 *
1996 * Second, it is possible to transcode a string, i.e. translate its internal
1997 * byte representation to another encoding. Its associated encoding is also
1998 * set to the other encoding. See String#encode for the various forms of
1999 * transcoding, and the Encoding::Converter class for additional control over
2000 * the transcoding process.
2001 *
2002 * string
2003 * #=> "R\u00E9sum\u00E9"
2004 * string.encoding
2005 * #=> #<Encoding:UTF-8>
2006 * string = string.encode!(Encoding::ISO_8859_1)
2007 * #=> "R\xE9sum\xE9"
2008 * string.encoding
2009 * #=> #<Encoding:ISO-8859-1>
2010 *
2011 * == Script encoding
2012 *
2013 * All Ruby script code has an associated Encoding which any String literal
2014 * created in the source code will be associated to.
2015 *
2016 * The default script encoding is Encoding::UTF_8 after v2.0, but it
2017 * can be changed by a magic comment on the first line of the source
2018 * code file (or second line, if there is a shebang line on the
2019 * first). The comment must contain the word <code>coding</code> or
2020 * <code>encoding</code>, followed by a colon, space and the Encoding
2021 * name or alias:
2022 *
2023 * # encoding: UTF-8
2024 *
2025 * "some string".encoding
2026 * #=> #<Encoding:UTF-8>
2027 *
2028 * The <code>__ENCODING__</code> keyword returns the script encoding of the file
2029 * which the keyword is written:
2030 *
2031 * # encoding: ISO-8859-1
2032 *
2033 * __ENCODING__
2034 * #=> #<Encoding:ISO-8859-1>
2035 *
2036 * <code>ruby -K</code> will change the default locale encoding, but this is
2037 * not recommended. Ruby source files should declare their script encoding by a
2038 * magic comment even when they only depend on US-ASCII strings or regular
2039 * expressions.
2040 *
2041 * == Locale encoding
2042 *
2043 * The default encoding of the environment. Usually derived from locale.
2044 *
2045 * see Encoding.locale_charmap, Encoding.find('locale')
2046 *
2047 * == Filesystem encoding
2048 *
2049 * The default encoding of strings from the filesystem of the environment.
2050 * This is used for strings of file names or paths.
2051 *
2052 * see Encoding.find('filesystem')
2053 *
2054 * == External encoding
2055 *
2056 * Each IO object has an external encoding which indicates the encoding that
2057 * Ruby will use to read its data. By default Ruby sets the external encoding
2058 * of an IO object to the default external encoding. The default external
2059 * encoding is set by locale encoding or the interpreter <code>-E</code> option.
2060 * Encoding.default_external returns the current value of the external
2061 * encoding.
2062 *
2063 * ENV["LANG"]
2064 * #=> "UTF-8"
2065 * Encoding.default_external
2066 * #=> #<Encoding:UTF-8>
2067 *
2068 * $ ruby -E ISO-8859-1 -e "p Encoding.default_external"
2069 * #<Encoding:ISO-8859-1>
2070 *
2071 * $ LANG=C ruby -e 'p Encoding.default_external'
2072 * #<Encoding:US-ASCII>
2073 *
2074 * The default external encoding may also be set through
2075 * Encoding.default_external=, but you should not do this as strings created
2076 * before and after the change will have inconsistent encodings. Instead use
2077 * <code>ruby -E</code> to invoke ruby with the correct external encoding.
2078 *
2079 * When you know that the actual encoding of the data of an IO object is not
2080 * the default external encoding, you can reset its external encoding with
2081 * IO#set_encoding or set it at IO object creation (see IO.new options).
2082 *
2083 * == Internal encoding
2084 *
2085 * To process the data of an IO object which has an encoding different
2086 * from its external encoding, you can set its internal encoding. Ruby will use
2087 * this internal encoding to transcode the data when it is read from the IO
2088 * object.
2089 *
2090 * Conversely, when data is written to the IO object it is transcoded from the
2091 * internal encoding to the external encoding of the IO object.
2092 *
2093 * The internal encoding of an IO object can be set with
2094 * IO#set_encoding or at IO object creation (see IO.new options).
2095 *
2096 * The internal encoding is optional and when not set, the Ruby default
2097 * internal encoding is used. If not explicitly set this default internal
2098 * encoding is +nil+ meaning that by default, no transcoding occurs.
2099 *
2100 * The default internal encoding can be set with the interpreter option
2101 * <code>-E</code>. Encoding.default_internal returns the current internal
2102 * encoding.
2103 *
2104 * $ ruby -e 'p Encoding.default_internal'
2105 * nil
2106 *
2107 * $ ruby -E ISO-8859-1:UTF-8 -e "p [Encoding.default_external, \
2108 * Encoding.default_internal]"
2109 * [#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>]
2110 *
2111 * The default internal encoding may also be set through
2112 * Encoding.default_internal=, but you should not do this as strings created
2113 * before and after the change will have inconsistent encodings. Instead use
2114 * <code>ruby -E</code> to invoke ruby with the correct internal encoding.
2115 *
2116 * == IO encoding example
2117 *
2118 * In the following example a UTF-8 encoded string "R\u00E9sum\u00E9" is transcoded for
2119 * output to ISO-8859-1 encoding, then read back in and transcoded to UTF-8:
2120 *
2121 * string = "R\u00E9sum\u00E9"
2122 *
2123 * open("transcoded.txt", "w:ISO-8859-1") do |io|
2124 * io.write(string)
2125 * end
2126 *
2127 * puts "raw text:"
2128 * p File.binread("transcoded.txt")
2129 * puts
2130 *
2131 * open("transcoded.txt", "r:ISO-8859-1:UTF-8") do |io|
2132 * puts "transcoded text:"
2133 * p io.read
2134 * end
2135 *
2136 * While writing the file, the internal encoding is not specified as it is
2137 * only necessary for reading. While reading the file both the internal and
2138 * external encoding must be specified to obtain the correct result.
2139 *
2140 * $ ruby t.rb
2141 * raw text:
2142 * "R\xE9sum\xE9"
2143 *
2144 * transcoded text:
2145 * "R\u00E9sum\u00E9"
2146 *
2147 */
2148
2149void
2151{
2152 VALUE list;
2153 int i;
2154
2156 rb_define_alloc_func(rb_cEncoding, enc_s_alloc);
2158 rb_define_method(rb_cEncoding, "to_s", enc_name, 0);
2159 rb_define_method(rb_cEncoding, "inspect", enc_inspect, 0);
2160 rb_define_method(rb_cEncoding, "name", enc_name, 0);
2161 rb_define_method(rb_cEncoding, "names", enc_names, 0);
2162 rb_define_method(rb_cEncoding, "dummy?", enc_dummy_p, 0);
2163 rb_define_method(rb_cEncoding, "ascii_compatible?", enc_ascii_compatible_p, 0);
2164 rb_define_method(rb_cEncoding, "replicate", enc_replicate_m, 1);
2165 rb_define_singleton_method(rb_cEncoding, "list", enc_list, 0);
2166 rb_define_singleton_method(rb_cEncoding, "name_list", rb_enc_name_list, 0);
2167 rb_define_singleton_method(rb_cEncoding, "aliases", rb_enc_aliases, 0);
2168 rb_define_singleton_method(rb_cEncoding, "find", enc_find, 1);
2169 rb_define_singleton_method(rb_cEncoding, "compatible?", enc_compatible_p, 2);
2170
2171 rb_define_method(rb_cEncoding, "_dump", enc_dump, -1);
2172 rb_define_singleton_method(rb_cEncoding, "_load", enc_load, 1);
2173
2174 rb_define_singleton_method(rb_cEncoding, "default_external", get_default_external, 0);
2175 rb_define_singleton_method(rb_cEncoding, "default_external=", set_default_external, 1);
2176 rb_define_singleton_method(rb_cEncoding, "default_internal", get_default_internal, 0);
2177 rb_define_singleton_method(rb_cEncoding, "default_internal=", set_default_internal, 1);
2178 rb_define_singleton_method(rb_cEncoding, "locale_charmap", rb_locale_charmap, 0); /* in localeinit.c */
2179
2180 struct enc_table *enc_table = &global_enc_table;
2181
2182 if (DEFAULT_ENCODING_LIST_CAPA < enc_table->count) rb_bug("DEFAULT_ENCODING_LIST_CAPA is too small");
2183
2184 list = rb_additional_encoding_list = rb_ary_new();
2185 RBASIC_CLEAR_CLASS(list);
2187
2188 list = rb_default_encoding_list = rb_ary_new2(DEFAULT_ENCODING_LIST_CAPA);
2189 RBASIC_CLEAR_CLASS(list);
2191
2192 for (i = 0; i < enc_table->count; ++i) {
2193 rb_ary_push(list, enc_new(enc_table->list[i].enc));
2194 }
2195
2196 rb_marshal_define_compat(rb_cEncoding, Qnil, 0, enc_m_loader);
2197}
2198
2199void
2201{
2202 rb_enc_init(&global_enc_table);
2203}
2204
2205/* locale insensitive ctype functions */
2206
2207void
2209{
2210 GLOBAL_ENC_TABLE_EVAL(enc_table, st_foreach(enc_table->names, func, arg));
2211}
void rb_ary_store(VALUE ary, long idx, VALUE val)
Definition: array.c:1141
VALUE rb_ary_push(VALUE ary, VALUE item)
Definition: array.c:1301
VALUE rb_ary_replace(VALUE copy, VALUE orig)
Definition: array.c:4484
VALUE rb_ary_new(void)
Definition: array.c:749
VALUE rb_ary_concat(VALUE x, VALUE y)
Definition: array.c:4859
VALUE rb_ary_entry(VALUE ary, long offset)
Definition: array.c:1672
#define NORETURN(x)
Definition: attributes.h:152
#define UNREACHABLE_RETURN
Definition: assume.h:31
Our own, locale independent, character handling routines.
#define ISUPPER
Definition: ctype.h:39
#define ISDIGIT
Definition: ctype.h:43
#define ISLOWER
Definition: ctype.h:40
#define STRCASECMP
Definition: ctype.h:52
#define ISASCII
Definition: ctype.h:35
#define TOLOWER
Definition: ctype.h:51
#define ISALNUM
Definition: ctype.h:41
#define rb_define_method(klass, mid, func, arity)
Defines klass#mid.
Definition: cxxanyargs.hpp:653
#define rb_define_singleton_method(klass, mid, func, arity)
Defines klass.mid.
Definition: cxxanyargs.hpp:668
Internal header for Encoding.
#define ENCINDEX_BUILTIN_MAX
Definition: encindex.h:55
#define ENCINDEX_UTF_8
Definition: encindex.h:44
#define ENCINDEX_US_ASCII
Definition: encindex.h:45
#define ENCINDEX_ASCII
Definition: encindex.h:43
rb_encoding * rb_find_encoding(VALUE enc)
Definition: encoding.c:336
#define is_data_encoding(obj)
Definition: encoding.c:110
int rb_enc_dummy_p(rb_encoding *enc)
Definition: encoding.c:203
int rb_enc_precise_mbclen(const char *p, const char *e, rb_encoding *enc)
Definition: encoding.c:1230
int rb_enc_get_index(VALUE obj)
Definition: encoding.c:977
int rb_filesystem_encindex(void)
Definition: encoding.c:1589
int rb_enc_replicate(const char *name, rb_encoding *encoding)
Definition: encoding.c:549
int rb_to_encoding_index(VALUE enc)
Definition: encoding.c:267
VALUE rb_enc_associate(VALUE obj, rb_encoding *enc)
Definition: encoding.c:1064
#define ENC_TO_ENCINDEX(enc)
Definition: encoding.c:94
void rb_encdb_declare(const char *name)
Definition: encoding.c:473
rb_encoding * rb_utf8_encoding(void)
Definition: encoding.c:1537
rb_encoding * rb_enc_check_str(VALUE str1, VALUE str2)
Definition: encoding.c:1078
rb_encoding * rb_ascii8bit_encoding(void)
Definition: encoding.c:1525
unsigned int rb_enc_codepoint_len(const char *p, const char *e, int *len_p, rb_encoding *enc)
Definition: encoding.c:1266
#define ENC_INDEX_MASK
Definition: encoding.c:92
rb_encoding * rb_enc_from_index(int index)
Definition: encoding.c:414
rb_encoding * rb_filesystem_encoding(void)
Definition: encoding.c:1602
#define UNSPECIFIED_ENCODING
Definition: encoding.c:99
int rb_enc_autoload(rb_encoding *enc)
Definition: encoding.c:867
#define DEFAULT_ENCODING_LIST_CAPA
Definition: encoding.c:59
rb_encoding * rb_default_internal_encoding(void)
Definition: encoding.c:1734
int rb_enc_register(const char *name, rb_encoding *encoding)
Definition: encoding.c:430
int rb_utf8_encindex(void)
Definition: encoding.c:1543
int rb_enc_fast_mbclen(const char *p, const char *e, rb_encoding *enc)
Definition: encoding.c:1212
void Init_Encoding(void)
Definition: encoding.c:2150
int rb_enc_set_dummy(int index)
Definition: encoding.c:524
rb_encoding * rb_enc_get(VALUE obj)
Definition: encoding.c:1070
int rb_ascii8bit_encindex(void)
Definition: encoding.c:1531
rb_encoding * rb_enc_get_from_index(int index)
Definition: encoding.c:795
void rb_enc_set_base(const char *name, const char *orig)
Definition: encoding.c:509
int rb_enc_unicode_p(rb_encoding *enc)
Definition: encoding.c:688
void rb_enc_copy(VALUE obj1, VALUE obj2)
Definition: encoding.c:1188
#define ENCDB_REGISTER(name, enc)
int rb_enc_to_index(rb_encoding *enc)
Definition: encoding.c:197
void Init_encodings(void)
Definition: encoding.c:2200
int rb_data_is_encoding(VALUE obj)
Definition: encoding.c:114
void rb_enc_set_default_internal(VALUE encoding)
Definition: encoding.c:1784
void rb_encdb_set_unicode(int index)
Definition: encoding.c:759
VALUE rb_enc_default_external(void)
Definition: encoding.c:1661
#define ENCODING_NAMELEN_MAX
Definition: encoding.c:101
void rb_enc_set_index(VALUE obj, int idx)
Definition: encoding.c:1028
rb_encoding * rb_enc_find(const char *name)
Definition: encoding.c:916
int rb_define_dummy_encoding(const char *name)
Definition: encoding.c:616
#define GLOBAL_ENC_TABLE_ENTER(enc_table)
Definition: encoding.c:80
int rb_enc_tolower(int c, rb_encoding *enc)
Definition: encoding.c:1310
int rb_encdb_replicate(const char *name, const char *orig)
Definition: encoding.c:596
#define MUST_STRING(str)
Definition: encoding.c:35
int rb_enc_find_index2(const char *name, long len)
Definition: encoding.c:905
rb_encoding * rb_default_external_encoding(void)
Definition: encoding.c:1647
ID rb_id_encoding(void)
Definition: encoding.c:947
int rb_locale_encindex(void)
Definition: encoding.c:1563
VALUE rb_cEncoding
Definition: encoding.c:57
#define GLOBAL_ENC_TABLE_EVAL(enc_table, expr)
Definition: encoding.c:82
rb_encoding * rb_enc_check(VALUE str1, VALUE str2)
Definition: encoding.c:1089
int rb_enc_mbclen(const char *p, const char *e, rb_encoding *enc)
Definition: encoding.c:1218
int rb_enc_capable(VALUE obj)
Definition: encoding.c:941
#define valid_encoding_name_p(name)
Definition: encoding.c:102
VALUE rb_enc_default_internal(void)
Definition: encoding.c:1743
int rb_encdb_dummy(const char *name)
Definition: encoding.c:632
#define ENC_REGISTER(enc)
rb_encoding * rb_enc_compatible(VALUE str1, VALUE str2)
Definition: encoding.c:1172
rb_encoding * rb_locale_encoding(void)
Definition: encoding.c:1583
#define GLOBAL_ENC_TABLE_LEAVE()
Definition: encoding.c:81
VALUE rb_obj_encoding(VALUE obj)
Definition: encoding.c:1202
#define ENCODING_COUNT
Definition: encoding.c:98
rb_encoding * rb_to_encoding(VALUE enc)
Definition: encoding.c:329
int rb_enc_toupper(int c, rb_encoding *enc)
Definition: encoding.c:1304
rb_encoding * rb_usascii_encoding(void)
Definition: encoding.c:1549
OnigEncodingType rb_raw_encoding
Definition: encoding.c:41
VALUE rb_enc_from_encoding(rb_encoding *encoding)
Definition: encoding.c:188
void rb_enc_set_default_external(VALUE encoding)
Definition: encoding.c:1701
VALUE rb_enc_associate_index(VALUE obj, int idx)
Definition: encoding.c:1036
int rb_locale_charmap_index(void)
Definition: localeinit.c:109
int rb_enc_find_index(const char *name)
Definition: encoding.c:879
int rb_enc_alias(const char *alias, const char *orig)
Definition: encoding.c:720
int rb_enc_codelen(int c, rb_encoding *enc)
Definition: encoding.c:1287
int rb_encdb_alias(const char *alias, const char *orig)
Definition: encoding.c:740
#define ENC_SET_DUMMY(enc)
Definition: encoding.c:96
#define is_obj_encoding(obj)
Definition: encoding.c:111
void rb_enc_foreach_name(int(*func)(st_data_t name, st_data_t idx, st_data_t arg), st_data_t arg)
Definition: encoding.c:2208
int rb_enc_ascget(const char *p, const char *e, int *len, rb_encoding *enc)
Definition: encoding.c:1242
int rb_usascii_encindex(void)
Definition: encoding.c:1555
#define ENC_DUMMY_P(enc)
Definition: encoding.c:95
uint8_t len
Definition: escape.c:17
char str[HTML_ESCAPE_MAX_LEN+1]
Definition: escape.c:18
#define RSTRING_LEN(string)
Definition: fbuffer.h:22
#define RSTRING_PTR(string)
Definition: fbuffer.h:19
#define UNLIKELY(x)
Definition: ffi_common.h:126
#define memcpy(d, s, n)
Definition: ffi_common.h:55
@ RUBY_FL_SHAREABLE
Definition: fl_type.h:169
#define PRIsVALUE
Definition: function.c:10
void rb_gc_register_mark_object(VALUE obj)
Inform the garbage collector that object is a live Ruby object that should not be moved.
Definition: gc.c:8022
#define CLASS_OF
Definition: globals.h:153
VALUE rb_define_class(const char *name, VALUE super)
Defines a top-level class.
Definition: class.c:748
void rb_undef_method(VALUE klass, const char *name)
Definition: class.c:1777
#define FL_SET_RAW
Definition: fl_type.h:129
void rb_raise(VALUE exc, const char *fmt,...)
Definition: error.c:2917
void rb_bug(const char *fmt,...)
Definition: error.c:768
void rb_set_errinfo(VALUE err)
Sets the current exception ($!) to the given value.
Definition: eval.c:1925
VALUE rb_eTypeError
Definition: error.c:1057
VALUE rb_eEncCompatError
Definition: error.c:1064
void rb_warn(const char *fmt,...)
Definition: error.c:408
VALUE rb_eArgError
Definition: error.c:1058
void rb_loaderror(const char *fmt,...)
Definition: error.c:2936
VALUE rb_errinfo(void)
The current exception in the current thread.
Definition: eval.c:1911
VALUE rb_eEncodingError
Definition: error.c:1063
void rb_warning(const char *fmt,...)
Definition: error.c:439
VALUE rb_cObject
Object class.
Definition: object.c:49
void rb_undefined_alloc(VALUE klass)
Definition: object.c:1803
VALUE rb_obj_class(VALUE)
Definition: object.c:245
VALUE rb_obj_freeze(VALUE)
Make the object unmodifiable.
Definition: object.c:1101
VALUE rb_hash_aset(VALUE hash, VALUE key, VALUE val)
Definition: hash.c:2901
VALUE rb_hash_new(void)
Definition: hash.c:1538
#define ENCODING_SET_INLINED(obj, i)
Definition: encoding.h:48
#define ENC_CODERANGE_7BIT
Definition: encoding.h:93
int rb_enc_str_coderange(VALUE)
Definition: string.c:725
#define rb_enc_name(enc)
Definition: encoding.h:168
#define rb_enc_isascii(c, enc)
Definition: encoding.h:221
VALUE rb_locale_charmap(VALUE klass)
Definition: localeinit.c:91
#define rb_enc_mbc_to_codepoint(p, e, enc)
Definition: encoding.h:199
#define MBCLEN_CHARFOUND_LEN(ret)
Definition: encoding.h:183
#define rb_enc_asciicompat(enc)
Definition: encoding.h:236
#define rb_enc_codepoint(p, e, enc)
Definition: encoding.h:198
VALUE rb_enc_sprintf(rb_encoding *, const char *,...)
Definition: sprintf.c:1184
#define ENCODING_INLINE_MAX
Definition: encoding.h:29
int rb_enc_str_asciionly_p(VALUE)
Definition: string.c:739
#define rb_enc_code_to_mbclen(c, enc)
Definition: encoding.h:205
#define rb_enc_mbminlen(enc)
Definition: encoding.h:171
#define MBCLEN_CHARFOUND_P(ret)
Definition: encoding.h:182
#define ENC_CODERANGE_ASCIIONLY(obj)
Definition: encoding.h:98
#define ENCODING_GET_INLINED(obj)
Definition: encoding.h:50
#define ENC_CODERANGE_CLEAR(obj)
Definition: encoding.h:100
Thin wrapper to ruby/config.h.
#define ruby_debug
Definition: error.h:69
#define ruby_verbose
Definition: error.h:68
#define rb_ary_new2
Definition: array.h:72
#define rb_check_frozen
Definition: error.h:72
#define rb_check_arity
Definition: error.h:34
VALUE rb_check_string_type(VALUE)
Definition: string.c:2462
VALUE rb_ivar_get(VALUE, ID)
Definition: variable.c:1234
VALUE rb_attr_get(VALUE, ID)
Definition: variable.c:1242
VALUE rb_ivar_set(VALUE, ID, VALUE)
Definition: variable.c:1493
void rb_define_alloc_func(VALUE, rb_alloc_func_t)
VALUE rb_sym2str(VALUE)
Definition: symbol.c:927
ID rb_intern(const char *)
Definition: symbol.c:785
#define CONST_ID
Definition: symbol.h:47
void rb_define_const(VALUE, const char *, VALUE)
Definition: variable.c:3150
#define strdup(s)
Definition: util.h:39
Internal header aggregating init functions.
int Init_enc_set_filesystem_encoding(void)
Definition: localeinit.c:119
#define NUM2INT
Definition: int.h:44
#define INT2NUM
Definition: int.h:43
Internal header for Encoding.
#define rb_enc_autoload_p(enc)
Definition: encoding.h:15
Internal header for require.
int rb_require_internal(VALUE fname)
Definition: load.c:1165
Internal header for Object.
Internal header for String.
void rb_str_change_terminator_length(VALUE str, const int oldtermlen, const int termlen)
Definition: string.c:2378
VALUE rb_fstring(VALUE)
Definition: string.c:353
char * rb_str_to_cstr(VALUE str)
Definition: string.c:2432
Internal header for RubyVM.
#define rb_fstring_cstr(...)
Definition: internal.h:71
#define rb_funcallv(...)
Definition: internal.h:77
voidpf void uLong size
Definition: ioapi.h:138
typedef int(ZCALLBACK *close_file_func) OF((voidpf opaque
voidpf void * buf
Definition: ioapi.h:138
void rb_marshal_define_compat(VALUE newclass, VALUE oldclass, VALUE(*dumper)(VALUE), VALUE(*loader)(VALUE, VALUE))
Definition: marshal.c:146
#define MEMCPY(p1, p2, type, n)
Definition: memory.h:129
#define REALLOC_N
Definition: memory.h:137
#define ALLOCA_N(type, n)
Definition: memory.h:112
#define RB_GC_GUARD(v)
Definition: memory.h:91
const char * name
Definition: nkf.c:208
const char * alias
Definition: nkf.c:1159
@ UTF_32
Definition: nkf.c:117
@ UTF_16BE
Definition: nkf.c:113
@ UTF_16
Definition: nkf.c:112
@ UTF8_MAC
Definition: nkf.c:111
@ UTF_32BE
Definition: nkf.c:118
@ UTF_8
Definition: nkf.c:108
@ EUC_JP
Definition: nkf.c:99
@ UTF_32LE
Definition: nkf.c:120
@ UTF_16LE
Definition: nkf.c:115
@ ASCII
Definition: nkf.c:87
int count
Definition: nkf.c:5055
#define TRUE
Definition: nkf.h:175
#define FALSE
Definition: nkf.h:174
#define ONIGENC_MBC_ENC_LEN(enc, p, e)
Definition: onigmo.h:361
#define ONIGENC_IS_UNICODE(enc)
Definition: onigmo.h:327
#define UChar
Definition: onigmo.h:76
#define ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(n)
Definition: onigmo.h:352
#define ONIGENC_PRECISE_MBC_ENC_LEN(enc, p, e)
Definition: onigmo.h:356
#define ONIGENC_FLAG_UNICODE
Definition: onigmo.h:313
#define ONIGENC_CODE_TO_MBCLEN(enc, code)
Definition: onigmo.h:367
#define debug(lvl, x...)
Definition: ffi.c:52
#define DATA_PTR(obj)
Definition: rdata.h:56
#define RDATA(obj)
Definition: rdata.h:55
#define NULL
Definition: regenc.h:69
#define ONIGENC_ASCII_CODE_TO_LOWER_CASE(c)
Definition: regenc.h:217
#define ONIGENC_ASCII_CODE_TO_UPPER_CASE(c)
Definition: regenc.h:218
#define ONIGENC_IS_ASCII_CODE(code)
Definition: regenc.h:216
#define StringValue(v)
Definition: rstring.h:50
#define TypedData_Wrap_Struct(klass, data_type, sval)
Definition: rtypeddata.h:101
@ RUBY_TYPED_FREE_IMMEDIATELY
Definition: rtypeddata.h:62
int argc
Definition: ruby.c:240
char ** argv
Definition: ruby.c:241
#define SPECIAL_CONST_P
#define Qtrue
#define Qnil
#define Qfalse
#define NIL_P
VALUE rb_sprintf(const char *,...)
Definition: sprintf.c:1203
@ ST_CONTINUE
Definition: st.h:99
unsigned long st_data_t
Definition: st.h:22
#define st_init_strcasetable
Definition: st.h:114
#define st_foreach
Definition: st.h:142
#define st_insert2
Definition: st.h:126
#define st_lookup
Definition: st.h:128
#define st_insert
Definition: st.h:124
size_t strlen(const char *)
const char * name
Definition: onigmo.h:162
int ruby_encoding_index
Definition: onigmo.h:178
Definition: inftree9.h:24
rb_encoding * enc
Definition: encoding.c:1609
Definition: encoding.c:63
const char * name
Definition: encoding.c:64
rb_encoding * base
Definition: encoding.c:66
rb_encoding * enc
Definition: encoding.c:65
Definition: st.h:79
unsigned long VALUE
Definition: value.h:38
unsigned long ID
Definition: value.h:39
#define T_FILE
Definition: value_type.h:61
#define T_STRING
Definition: value_type.h:77
#define T_DATA
Definition: value_type.h:59
#define T_SYMBOL
Definition: value_type.h:79
#define BUILTIN_TYPE
Definition: value_type.h:84
#define SYMBOL_P
Definition: value_type.h:87
#define T_REGEXP
Definition: value_type.h:76
#define RB_VM_LOCK_ENTER()
Definition: vm_sync.h:121
#define RB_VM_LOCK_LEAVE()
Definition: vm_sync.h:122
void Init_w32_codepage(void)
Definition: file.c:723
if((ID)(DISPID) nameid !=nameid)
Definition: win32ole.c:357
#define xmalloc
Definition: xmalloc.h:44
int def(FILE *source, FILE *dest, int level)
Definition: zpipe.c:36