Ruby 3.0.5p211 (2022-11-24 revision ba5cf0f7c52d4d35cc6a173c89eda98ceffa2dcf)
string.c
Go to the documentation of this file.
1/**********************************************************************
2
3 string.c -
4
5 $Author$
6 created at: Mon Aug 9 17:12:58 JST 1993
7
8 Copyright (C) 1993-2007 Yukihiro Matsumoto
9 Copyright (C) 2000 Network Applied Communication Laboratory, Inc.
10 Copyright (C) 2000 Information-technology Promotion Agency, Japan
11
12**********************************************************************/
13
15
16#include <ctype.h>
17#include <errno.h>
18#include <math.h>
19
20#ifdef HAVE_UNISTD_H
21# include <unistd.h>
22#endif
23
24#if defined HAVE_CRYPT_R
25# if defined HAVE_CRYPT_H
26# include <crypt.h>
27# endif
28#elif !defined HAVE_CRYPT
29# include "missing/crypt.h"
30# define HAVE_CRYPT_R 1
31#endif
32
33#include "debug_counter.h"
34#include "encindex.h"
35#include "gc.h"
36#include "id.h"
37#include "internal.h"
38#include "internal/array.h"
39#include "internal/compar.h"
40#include "internal/compilers.h"
41#include "internal/encoding.h"
42#include "internal/error.h"
43#include "internal/gc.h"
44#include "internal/numeric.h"
45#include "internal/object.h"
46#include "internal/proc.h"
47#include "internal/re.h"
48#include "internal/sanitizers.h"
49#include "internal/string.h"
50#include "internal/transcode.h"
51#include "probes.h"
52#include "ruby/encoding.h"
53#include "ruby/re.h"
54#include "ruby/util.h"
55#include "ruby_assert.h"
56#include "vm_sync.h"
57
/* Index into the `beg` / `end` arrays of the `regs` pointer that must be
 * in scope wherever these macros are expanded. */
#define BEG(idx) (regs->beg[(idx)])
#define END(idx) (regs->end[(idx)])
60
61#undef rb_str_new
62#undef rb_usascii_str_new
63#undef rb_utf8_str_new
64#undef rb_enc_str_new
65#undef rb_str_new_cstr
66#undef rb_tainted_str_new_cstr
67#undef rb_usascii_str_new_cstr
68#undef rb_utf8_str_new_cstr
69#undef rb_enc_str_new_cstr
70#undef rb_external_str_new_cstr
71#undef rb_locale_str_new_cstr
72#undef rb_str_dup_frozen
73#undef rb_str_buf_new_cstr
74#undef rb_str_buf_cat
75#undef rb_str_buf_cat2
76#undef rb_str_cat2
77#undef rb_str_cat_cstr
78#undef rb_fstring_cstr
79
82
83/* FLAGS of RString
84 *
85 * 1: RSTRING_NOEMBED
86 * 2: STR_SHARED (== ELTS_SHARED)
87 * 2-6: RSTRING_EMBED_LEN (5 bits == 32)
88 * 5: STR_SHARED_ROOT (RSTRING_NOEMBED==1 && STR_SHARED == 0, there may be
89 * other strings that rely on this string's buffer)
90 * 6: STR_BORROWED (when RSTRING_NOEMBED==1 && klass==0, unsafe to recycle
91 * early, specific to rb_str_tmp_frozen_{acquire,release})
92 * 7: STR_TMPLOCK
93 * 8-9: ENC_CODERANGE (2 bits)
94 * 10-16: ENCODING (7 bits == 128)
95 * 17: RSTRING_FSTR
96 * 18: STR_NOFREE
97 * 19: STR_FAKESTR
98 */
99
#define RUBY_MAX_CHAR_LEN 16

/* String-specific names for the user flag bits listed in the FLAGS
 * table above. */
#define STR_SHARED_ROOT   FL_USER5
#define STR_BORROWED      FL_USER6
#define STR_TMPLOCK       FL_USER7
#define STR_NOFREE        FL_USER18
#define STR_FAKESTR       FL_USER19
106
/* Store `n` in the embedded-length bit field of str's flags word. */
#define STR_SET_EMBED_LEN(str, n) do { \
    long tmp_n = (n);\
    RBASIC(str)->flags = (RBASIC(str)->flags & ~RSTRING_EMBED_LEN_MASK) | \
                         ((tmp_n) << RSTRING_EMBED_LEN_SHIFT);\
} while (0)

/* Mark str as heap-backed and zero the embedded-length bit field. */
#define STR_SET_NOEMBED(str) do {\
    FL_SET((str), STR_NOEMBED);\
    STR_SET_EMBED_LEN((str), 0);\
} while (0)

/* Mark str as embedded; also drops STR_NOFREE. */
#define STR_SET_EMBED(str) FL_UNSET((str), (STR_NOEMBED|STR_NOFREE))
117
/* Set the length of str to `n`, writing to the heap length field or to
 * the embedded-length flag bits depending on how str is stored. */
#define STR_SET_LEN(str, n) do { \
    if (!STR_EMBED_P(str)) {\
        RSTRING(str)->as.heap.len = (n);\
    }\
    else {\
        STR_SET_EMBED_LEN((str), (n));\
    }\
} while (0)
126
/* Decrease the length of str by one, for either storage form. */
#define STR_DEC_LEN(str) do {\
    if (!STR_EMBED_P(str)) {\
        RSTRING(str)->as.heap.len--;\
    }\
    else {\
        STR_SET_EMBED_LEN((str), RSTRING_LEN(str) - 1);\
    }\
} while (0)
137
/* Terminator width of str: the minimum character length of its encoding. */
#define TERM_LEN(str) rb_enc_mbminlen(rb_enc_get(str))

/* Zero the terminator at `ptr`: a single byte in the common case,
 * `termlen` bytes when the terminator is wider than one byte. */
#define TERM_FILL(ptr, termlen) do {\
    char *const term_fill_ptr = (ptr);\
    const int term_fill_len = (termlen);\
    if (UNLIKELY(term_fill_len > 1))\
        memset(term_fill_ptr, 0, term_fill_len);\
    else\
        *term_fill_ptr = '\0';\
} while (0)
146
/* Resize str's buffer to `capacity` bytes plus the terminator width of
 * its current encoding. */
#define RESIZE_CAPA(str,capacity) do {\
    const int termlen = TERM_LEN(str);\
    RESIZE_CAPA_TERM(str,capacity,termlen);\
} while (0)

/*
 * Resize str's buffer to `capacity` + `termlen` bytes.
 *  - heap string: reallocates in place (must not be STR_SHARED);
 *  - embedded string that no longer fits: allocates a heap buffer,
 *    copies the current bytes and switches str to heap form;
 *  - embedded string that still fits: nothing to do.
 * The embedded contents are copied out before any heap field is
 * written, so the statement order below must be kept.
 */
#define RESIZE_CAPA_TERM(str,capacity,termlen) do {\
    if (!STR_EMBED_P(str)) {\
        assert(!FL_TEST((str), STR_SHARED)); \
        SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char, \
                        (size_t)(capacity) + (termlen), STR_HEAP_SIZE(str)); \
        RSTRING(str)->as.heap.aux.capa = (capacity);\
    }\
    else if (!STR_EMBEDDABLE_P(capacity, termlen)) {\
        char *const tmp = ALLOC_N(char, (size_t)(capacity) + (termlen));\
        const long tlen = RSTRING_LEN(str);\
        memcpy(tmp, RSTRING_PTR(str), tlen);\
        RSTRING(str)->as.heap.ptr = tmp;\
        RSTRING(str)->as.heap.len = tlen;\
        STR_SET_NOEMBED(str);\
        RSTRING(str)->as.heap.aux.capa = (capacity);\
    }\
} while (0)
170
/* Record `shared_str` as the buffer owner of `str` and flag both sides.
 * Does nothing when `str` carries STR_FAKESTR. */
#define STR_SET_SHARED(str, shared_str) do { \
    if (FL_TEST(str, STR_FAKESTR)) break; \
    RB_OBJ_WRITE((str), &RSTRING(str)->as.heap.aux.shared, (shared_str)); \
    FL_SET((str), STR_SHARED); \
    FL_SET((shared_str), STR_SHARED_ROOT); \
    if (RBASIC_CLASS((shared_str)) == 0) { /* for CoW-friendliness */ \
        FL_SET_RAW((shared_str), STR_BORROWED); \
    } \
} while (0)
180
/* Heap buffer pointer of a non-embedded string. */
#define STR_HEAP_PTR(s) (RSTRING(s)->as.heap.ptr)
/* Heap capacity plus the terminator width.
 * TODO: include the terminator size in capa. */
#define STR_HEAP_SIZE(s) ((size_t)RSTRING(s)->as.heap.aux.capa + TERM_LEN(s))

/* Encoding of a string as resolved by get_encoding(). */
#define STR_ENC_GET(s) get_encoding(s)
186
/* By default only a substring that runs to the end of its source
 * (beg + len == end) is considered sharable; defining
 * SHARABLE_MIDDLE_SUBSTRING non-zero makes every substring sharable. */
#ifndef SHARABLE_MIDDLE_SUBSTRING
# define SHARABLE_MIDDLE_SUBSTRING 0
#endif
#if SHARABLE_MIDDLE_SUBSTRING
# define SHARABLE_SUBSTRING_P(beg, len, end) 1
#else
# define SHARABLE_SUBSTRING_P(beg, len, end) ((beg) + (len) == (end))
#endif
195
/* True when `len` bytes plus a `termlen`-byte terminator fit in the
 * embedded buffer of RSTRING_EMBED_LEN_MAX + 1 bytes. */
#define STR_EMBEDDABLE_P(len, termlen) ((len) <= RSTRING_EMBED_LEN_MAX + 1 - (termlen))
198
199static VALUE str_replace_shared_without_enc(VALUE str2, VALUE str);
200static VALUE str_new_shared(VALUE klass, VALUE str);
201static VALUE str_new_frozen(VALUE klass, VALUE orig);
202static VALUE str_new_frozen_buffer(VALUE klass, VALUE orig, int copy_encoding);
203static VALUE str_new_static(VALUE klass, const char *ptr, long len, int encindex);
204static VALUE str_new(VALUE klass, const char *ptr, long len);
205static void str_make_independent_expand(VALUE str, long len, long expand, const int termlen);
206static inline void str_modifiable(VALUE str);
207static VALUE rb_str_downcase(int argc, VALUE *argv, VALUE str);
208
209static inline void
210str_make_independent(VALUE str)
211{
212 long len = RSTRING_LEN(str);
213 int termlen = TERM_LEN(str);
214 str_make_independent_expand((str), len, 0L, termlen);
215}
216
217static inline int str_dependent_p(VALUE str);
218
219void
221{
222 if (str_dependent_p(str)) {
223 str_make_independent(str);
224 }
225}
226
227/* symbols for [up|down|swap]case/capitalize options */
228static VALUE sym_ascii, sym_turkic, sym_lithuanian, sym_fold;
229
230static rb_encoding *
231get_actual_encoding(const int encidx, VALUE str)
232{
233 const unsigned char *q;
234
235 switch (encidx) {
236 case ENCINDEX_UTF_16:
237 if (RSTRING_LEN(str) < 2) break;
238 q = (const unsigned char *)RSTRING_PTR(str);
239 if (q[0] == 0xFE && q[1] == 0xFF) {
241 }
242 if (q[0] == 0xFF && q[1] == 0xFE) {
244 }
245 return rb_ascii8bit_encoding();
246 case ENCINDEX_UTF_32:
247 if (RSTRING_LEN(str) < 4) break;
248 q = (const unsigned char *)RSTRING_PTR(str);
249 if (q[0] == 0 && q[1] == 0 && q[2] == 0xFE && q[3] == 0xFF) {
251 }
252 if (q[3] == 0 && q[2] == 0 && q[1] == 0xFE && q[0] == 0xFF) {
254 }
255 return rb_ascii8bit_encoding();
256 }
257 return rb_enc_from_index(encidx);
258}
259
260static rb_encoding *
261get_encoding(VALUE str)
262{
263 return get_actual_encoding(ENCODING_GET(str), str);
264}
265
266static void
267mustnot_broken(VALUE str)
268{
269 if (is_broken_string(str)) {
270 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(str)));
271 }
272}
273
274static void
275mustnot_wchar(VALUE str)
276{
278 if (rb_enc_mbminlen(enc) > 1) {
279 rb_raise(rb_eArgError, "wide char encoding: %s", rb_enc_name(enc));
280 }
281}
282
283static int fstring_cmp(VALUE a, VALUE b);
284
285static VALUE register_fstring(VALUE str, bool copy);
286
288 fstring_cmp,
290};
291
292#define BARE_STRING_P(str) (!FL_ANY_RAW(str, FL_EXIVAR) && RBASIC_CLASS(str) == rb_cString)
293
296 bool copy;
297};
298
299static int
300fstr_update_callback(st_data_t *key, st_data_t *value, st_data_t data, int existing)
301{
302
303 struct fstr_update_arg *arg = (struct fstr_update_arg *)data;
304 VALUE str = (VALUE)*key;
305
306 if (existing) {
307 /* because of lazy sweep, str may be unmarked already and swept
308 * at next time */
309
311 arg->fstr = Qundef;
312 return ST_DELETE;
313 }
314
315 arg->fstr = str;
316 return ST_STOP;
317 }
318 else {
320 if (arg->copy) {
321 VALUE new_str = str_new(rb_cString, RSTRING(str)->as.heap.ptr, RSTRING(str)->as.heap.len);
322 rb_enc_copy(new_str, str);
323 str = new_str;
324 }
325 else {
326 str = str_new_static(rb_cString, RSTRING(str)->as.heap.ptr,
327 RSTRING(str)->as.heap.len,
329 }
331 }
332 else {
333 if (!OBJ_FROZEN(str))
334 str = str_new_frozen(rb_cString, str);
335 if (STR_SHARED_P(str)) { /* str should not be shared */
336 /* shared substring */
337 str_make_independent(str);
339 }
340 if (!BARE_STRING_P(str)) {
341 str = str_new_frozen(rb_cString, str);
342 }
343 }
344 RBASIC(str)->flags |= RSTRING_FSTR;
345
346 *key = *value = arg->fstr = str;
347 return ST_CONTINUE;
348 }
349}
350
351RUBY_FUNC_EXPORTED
352VALUE
354{
355 VALUE fstr;
356 int bare;
357
358 Check_Type(str, T_STRING);
359
361 return str;
362
363 bare = BARE_STRING_P(str);
364 if (!bare) {
365 if (STR_EMBED_P(str)) {
367 return str;
368 }
371 return str;
372 }
373 }
374
375 if (!OBJ_FROZEN(str))
377
378 fstr = register_fstring(str, FALSE);
379
380 if (!bare) {
381 str_replace_shared_without_enc(str, fstr);
383 return str;
384 }
385 return fstr;
386}
387
388static VALUE
389register_fstring(VALUE str, bool copy)
390{
391 struct fstr_update_arg args;
392 args.copy = copy;
393
395 {
396 st_table *frozen_strings = rb_vm_fstring_table();
397 do {
398 args.fstr = str;
399 st_update(frozen_strings, (st_data_t)str, fstr_update_callback, (st_data_t)&args);
400 } while (args.fstr == Qundef);
401 }
403
404 assert(OBJ_FROZEN(args.fstr));
405 assert(!FL_TEST_RAW(args.fstr, STR_FAKESTR));
406 assert(!FL_TEST_RAW(args.fstr, FL_EXIVAR));
407 assert(RBASIC_CLASS(args.fstr) == rb_cString);
408 return args.fstr;
409}
410
411static VALUE
412setup_fake_str(struct RString *fake_str, const char *name, long len, int encidx)
413{
415 /* SHARED to be allocated by the callback */
416
417 if (!name) {
419 name = "";
420 }
421
422 ENCODING_SET_INLINED((VALUE)fake_str, encidx);
423
424 RBASIC_SET_CLASS_RAW((VALUE)fake_str, rb_cString);
425 fake_str->as.heap.len = len;
426 fake_str->as.heap.ptr = (char *)name;
427 fake_str->as.heap.aux.capa = len;
428 return (VALUE)fake_str;
429}
430
431/*
432 * set up a fake string which refers a static string literal.
433 */
434VALUE
435rb_setup_fake_str(struct RString *fake_str, const char *name, long len, rb_encoding *enc)
436{
437 return setup_fake_str(fake_str, name, len, rb_enc_to_index(enc));
438}
439
440/*
441 * rb_fstring_new and rb_fstring_cstr family create or lookup a frozen
442 * shared string which refers a static string literal. `ptr` must
443 * point a constant string.
444 */
446rb_fstring_new(const char *ptr, long len)
447{
448 struct RString fake_str;
449 return register_fstring(setup_fake_str(&fake_str, ptr, len, ENCINDEX_US_ASCII), FALSE);
450}
451
452VALUE
453rb_fstring_enc_new(const char *ptr, long len, rb_encoding *enc)
454{
455 struct RString fake_str;
456 return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), FALSE);
457}
458
459VALUE
461{
462 return rb_fstring_new(ptr, strlen(ptr));
463}
464
465static int
466fstring_set_class_i(st_data_t key, st_data_t val, st_data_t arg)
467{
468 RBASIC_SET_CLASS((VALUE)key, (VALUE)arg);
469 return ST_CONTINUE;
470}
471
472static int
473fstring_cmp(VALUE a, VALUE b)
474{
475 long alen, blen;
476 const char *aptr, *bptr;
477 RSTRING_GETMEM(a, aptr, alen);
478 RSTRING_GETMEM(b, bptr, blen);
479 return (alen != blen ||
480 ENCODING_GET(a) != ENCODING_GET(b) ||
481 memcmp(aptr, bptr, alen) != 0);
482}
483
484static inline int
485single_byte_optimizable(VALUE str)
486{
487 rb_encoding *enc;
488
489 /* Conservative. It may be ENC_CODERANGE_UNKNOWN. */
491 return 1;
492
493 enc = STR_ENC_GET(str);
494 if (rb_enc_mbmaxlen(enc) == 1)
495 return 1;
496
497 /* Conservative. Possibly single byte.
498 * "\xa1" in Shift_JIS for example. */
499 return 0;
500}
501
503
504static inline const char *
505search_nonascii(const char *p, const char *e)
506{
507 const uintptr_t *s, *t;
508
509#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
510# if SIZEOF_UINTPTR_T == 8
511# define NONASCII_MASK UINT64_C(0x8080808080808080)
512# elif SIZEOF_UINTPTR_T == 4
513# define NONASCII_MASK UINT32_C(0x80808080)
514# else
515# error "don't know what to do."
516# endif
517#else
518# if SIZEOF_UINTPTR_T == 8
519# define NONASCII_MASK ((uintptr_t)0x80808080UL << 32 | (uintptr_t)0x80808080UL)
520# elif SIZEOF_UINTPTR_T == 4
521# define NONASCII_MASK 0x80808080UL /* or...? */
522# else
523# error "don't know what to do."
524# endif
525#endif
526
527 if (UNALIGNED_WORD_ACCESS || e - p >= SIZEOF_VOIDP) {
528#if !UNALIGNED_WORD_ACCESS
529 if ((uintptr_t)p % SIZEOF_VOIDP) {
530 int l = SIZEOF_VOIDP - (uintptr_t)p % SIZEOF_VOIDP;
531 p += l;
532 switch (l) {
533 default: UNREACHABLE;
534#if SIZEOF_VOIDP > 4
535 case 7: if (p[-7]&0x80) return p-7;
536 case 6: if (p[-6]&0x80) return p-6;
537 case 5: if (p[-5]&0x80) return p-5;
538 case 4: if (p[-4]&0x80) return p-4;
539#endif
540 case 3: if (p[-3]&0x80) return p-3;
541 case 2: if (p[-2]&0x80) return p-2;
542 case 1: if (p[-1]&0x80) return p-1;
543 case 0: break;
544 }
545 }
546#endif
547#if defined(HAVE_BUILTIN___BUILTIN_ASSUME_ALIGNED) &&! UNALIGNED_WORD_ACCESS
548#define aligned_ptr(value) \
549 __builtin_assume_aligned((value), sizeof(uintptr_t))
550#else
551#define aligned_ptr(value) (uintptr_t *)(value)
552#endif
553 s = aligned_ptr(p);
554 t = (uintptr_t *)(e - (SIZEOF_VOIDP-1));
555#undef aligned_ptr
556 for (;s < t; s++) {
557 if (*s & NONASCII_MASK) {
558#ifdef WORDS_BIGENDIAN
559 return (const char *)s + (nlz_intptr(*s&NONASCII_MASK)>>3);
560#else
561 return (const char *)s + (ntz_intptr(*s&NONASCII_MASK)>>3);
562#endif
563 }
564 }
565 p = (const char *)s;
566 }
567
568 switch (e - p) {
569 default: UNREACHABLE;
570#if SIZEOF_VOIDP > 4
571 case 7: if (e[-7]&0x80) return e-7;
572 case 6: if (e[-6]&0x80) return e-6;
573 case 5: if (e[-5]&0x80) return e-5;
574 case 4: if (e[-4]&0x80) return e-4;
575#endif
576 case 3: if (e[-3]&0x80) return e-3;
577 case 2: if (e[-2]&0x80) return e-2;
578 case 1: if (e[-1]&0x80) return e-1;
579 case 0: return NULL;
580 }
581}
582
583static int
584coderange_scan(const char *p, long len, rb_encoding *enc)
585{
586 const char *e = p + len;
587
589 /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */
590 p = search_nonascii(p, e);
592 }
593
594 if (rb_enc_asciicompat(enc)) {
595 p = search_nonascii(p, e);
596 if (!p) return ENC_CODERANGE_7BIT;
597 for (;;) {
598 int ret = rb_enc_precise_mbclen(p, e, enc);
600 p += MBCLEN_CHARFOUND_LEN(ret);
601 if (p == e) break;
602 p = search_nonascii(p, e);
603 if (!p) break;
604 }
605 }
606 else {
607 while (p < e) {
608 int ret = rb_enc_precise_mbclen(p, e, enc);
610 p += MBCLEN_CHARFOUND_LEN(ret);
611 }
612 }
613 return ENC_CODERANGE_VALID;
614}
615
616long
617rb_str_coderange_scan_restartable(const char *s, const char *e, rb_encoding *enc, int *cr)
618{
619 const char *p = s;
620
621 if (*cr == ENC_CODERANGE_BROKEN)
622 return e - s;
623
625 /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */
626 if (*cr == ENC_CODERANGE_VALID) return e - s;
627 p = search_nonascii(p, e);
629 return e - s;
630 }
631 else if (rb_enc_asciicompat(enc)) {
632 p = search_nonascii(p, e);
633 if (!p) {
634 if (*cr != ENC_CODERANGE_VALID) *cr = ENC_CODERANGE_7BIT;
635 return e - s;
636 }
637 for (;;) {
638 int ret = rb_enc_precise_mbclen(p, e, enc);
639 if (!MBCLEN_CHARFOUND_P(ret)) {
641 return p - s;
642 }
643 p += MBCLEN_CHARFOUND_LEN(ret);
644 if (p == e) break;
645 p = search_nonascii(p, e);
646 if (!p) break;
647 }
648 }
649 else {
650 while (p < e) {
651 int ret = rb_enc_precise_mbclen(p, e, enc);
652 if (!MBCLEN_CHARFOUND_P(ret)) {
654 return p - s;
655 }
656 p += MBCLEN_CHARFOUND_LEN(ret);
657 }
658 }
660 return e - s;
661}
662
663static inline void
664str_enc_copy(VALUE str1, VALUE str2)
665{
666 rb_enc_set_index(str1, ENCODING_GET(str2));
667}
668
669static void
670rb_enc_cr_str_copy_for_substr(VALUE dest, VALUE src)
671{
672 /* this function is designed for copying encoding and coderange
673 * from src to new string "dest" which is made from the part of src.
674 */
675 str_enc_copy(dest, src);
676 if (RSTRING_LEN(dest) == 0) {
679 else
681 return;
682 }
683 switch (ENC_CODERANGE(src)) {
686 break;
688 if (!rb_enc_asciicompat(STR_ENC_GET(src)) ||
689 search_nonascii(RSTRING_PTR(dest), RSTRING_END(dest)))
691 else
693 break;
694 default:
695 break;
696 }
697}
698
699static void
700rb_enc_cr_str_exact_copy(VALUE dest, VALUE src)
701{
702 str_enc_copy(dest, src);
704}
705
706static int
707enc_coderange_scan(VALUE str, rb_encoding *enc, int encidx)
708{
709 if (rb_enc_mbminlen(enc) > 1 && rb_enc_dummy_p(enc) &&
710 rb_enc_mbminlen(enc = get_actual_encoding(encidx, str)) == 1) {
712 }
713 else {
714 return coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str), enc);
715 }
716}
717
718int
720{
721 return enc_coderange_scan(str, enc, rb_enc_to_index(enc));
722}
723
724int
726{
727 int cr = ENC_CODERANGE(str);
728
729 if (cr == ENC_CODERANGE_UNKNOWN) {
730 int encidx = ENCODING_GET(str);
731 rb_encoding *enc = rb_enc_from_index(encidx);
732 cr = enc_coderange_scan(str, enc, encidx);
734 }
735 return cr;
736}
737
738int
740{
742
743 if (!rb_enc_asciicompat(enc))
744 return FALSE;
746 return TRUE;
747 return FALSE;
748}
749
750static inline void
751str_mod_check(VALUE s, const char *p, long len)
752{
753 if (RSTRING_PTR(s) != p || RSTRING_LEN(s) != len){
754 rb_raise(rb_eRuntimeError, "string modified");
755 }
756}
757
758static size_t
759str_capacity(VALUE str, const int termlen)
760{
761 if (STR_EMBED_P(str)) {
762 return (RSTRING_EMBED_LEN_MAX + 1 - termlen);
763 }
764 else if (FL_TEST(str, STR_SHARED|STR_NOFREE)) {
765 return RSTRING(str)->as.heap.len;
766 }
767 else {
768 return RSTRING(str)->as.heap.aux.capa;
769 }
770}
771
772size_t
774{
775 return str_capacity(str, TERM_LEN(str));
776}
777
778static inline void
779must_not_null(const char *ptr)
780{
781 if (!ptr) {
782 rb_raise(rb_eArgError, "NULL pointer given");
783 }
784}
785
786static inline VALUE
787str_alloc(VALUE klass)
788{
790 return (VALUE)str;
791}
792
793static inline VALUE
794empty_str_alloc(VALUE klass)
795{
796 RUBY_DTRACE_CREATE_HOOK(STRING, 0);
797 return str_alloc(klass);
798}
799
800static VALUE
801str_new0(VALUE klass, const char *ptr, long len, int termlen)
802{
803 VALUE str;
804
805 if (len < 0) {
806 rb_raise(rb_eArgError, "negative string size (or size too big)");
807 }
808
810
811 str = str_alloc(klass);
812 if (!STR_EMBEDDABLE_P(len, termlen)) {
813 RSTRING(str)->as.heap.aux.capa = len;
814 RSTRING(str)->as.heap.ptr = ALLOC_N(char, (size_t)len + termlen);
816 }
817 else if (len == 0) {
819 }
820 if (ptr) {
822 }
824 TERM_FILL(RSTRING_PTR(str) + len, termlen);
825 return str;
826}
827
828static VALUE
829str_new(VALUE klass, const char *ptr, long len)
830{
831 return str_new0(klass, ptr, len, 1);
832}
833
834VALUE
835rb_str_new(const char *ptr, long len)
836{
837 return str_new(rb_cString, ptr, len);
838}
839
840VALUE
841rb_usascii_str_new(const char *ptr, long len)
842{
845 return str;
846}
847
848VALUE
849rb_utf8_str_new(const char *ptr, long len)
850{
851 VALUE str = str_new(rb_cString, ptr, len);
853 return str;
854}
855
856VALUE
857rb_enc_str_new(const char *ptr, long len, rb_encoding *enc)
858{
859 VALUE str;
860
861 if (!enc) return rb_str_new(ptr, len);
862
863 str = str_new0(rb_cString, ptr, len, rb_enc_mbminlen(enc));
864 rb_enc_associate(str, enc);
865 return str;
866}
867
868VALUE
870{
871 must_not_null(ptr);
872 /* rb_str_new_cstr() can take pointer from non-malloc-generated
873 * memory regions, and that cannot be detected by the MSAN. Just
874 * trust the programmer that the argument passed here is a sane C
875 * string. */
876 __msan_unpoison_string(ptr);
877 return rb_str_new(ptr, strlen(ptr));
878}
879
880VALUE
882{
885 return str;
886}
887
888VALUE
890{
893 return str;
894}
895
896VALUE
898{
899 must_not_null(ptr);
900 if (rb_enc_mbminlen(enc) != 1) {
901 rb_raise(rb_eArgError, "wchar encoding given");
902 }
903 return rb_enc_str_new(ptr, strlen(ptr), enc);
904}
905
906static VALUE
907str_new_static(VALUE klass, const char *ptr, long len, int encindex)
908{
909 VALUE str;
910
911 if (len < 0) {
912 rb_raise(rb_eArgError, "negative string size (or size too big)");
913 }
914
915 if (!ptr) {
916 rb_encoding *enc = rb_enc_get_from_index(encindex);
917 str = str_new0(klass, ptr, len, rb_enc_mbminlen(enc));
918 }
919 else {
921 str = str_alloc(klass);
922 RSTRING(str)->as.heap.len = len;
923 RSTRING(str)->as.heap.ptr = (char *)ptr;
924 RSTRING(str)->as.heap.aux.capa = len;
926 RBASIC(str)->flags |= STR_NOFREE;
927 }
928 rb_enc_associate_index(str, encindex);
929 return str;
930}
931
932VALUE
933rb_str_new_static(const char *ptr, long len)
934{
935 return str_new_static(rb_cString, ptr, len, 0);
936}
937
938VALUE
940{
941 return str_new_static(rb_cString, ptr, len, ENCINDEX_US_ASCII);
942}
943
944VALUE
946{
947 return str_new_static(rb_cString, ptr, len, ENCINDEX_UTF_8);
948}
949
950VALUE
951rb_enc_str_new_static(const char *ptr, long len, rb_encoding *enc)
952{
953 return str_new_static(rb_cString, ptr, len, rb_enc_to_index(enc));
954}
955
956VALUE
957rb_tainted_str_new(const char *ptr, long len)
958{
959 rb_warn_deprecated_to_remove("rb_tainted_str_new", "3.2");
960 return rb_str_new(ptr, len);
961}
962
963VALUE
965{
966 rb_warn_deprecated_to_remove("rb_tainted_str_new_cstr", "3.2");
967 return rb_str_new_cstr(ptr);
968}
969
970static VALUE str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
971 rb_encoding *from, rb_encoding *to,
972 int ecflags, VALUE ecopts);
973
974static inline bool
975is_enc_ascii_string(VALUE str, rb_encoding *enc)
976{
977 int encidx = rb_enc_to_index(enc);
978 if (rb_enc_get_index(str) == encidx)
979 return is_ascii_string(str);
980 return enc_coderange_scan(str, enc, encidx) == ENC_CODERANGE_7BIT;
981}
982
983VALUE
985{
986 long len;
987 const char *ptr;
988 VALUE newstr;
989
990 if (!to) return str;
991 if (!from) from = rb_enc_get(str);
992 if (from == to) return str;
993 if ((rb_enc_asciicompat(to) && is_enc_ascii_string(str, from)) ||
994 to == rb_ascii8bit_encoding()) {
995 if (STR_ENC_GET(str) != to) {
996 str = rb_str_dup(str);
998 }
999 return str;
1000 }
1001
1003 newstr = str_cat_conv_enc_opts(rb_str_buf_new(len), 0, ptr, len,
1004 from, to, ecflags, ecopts);
1005 if (NIL_P(newstr)) {
1006 /* some error, return original */
1007 return str;
1008 }
1009 return newstr;
1010}
1011
1012VALUE
1013rb_str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1014 rb_encoding *from, int ecflags, VALUE ecopts)
1015{
1016 long olen;
1017
1018 olen = RSTRING_LEN(newstr);
1019 if (ofs < -olen || olen < ofs)
1020 rb_raise(rb_eIndexError, "index %ld out of string", ofs);
1021 if (ofs < 0) ofs += olen;
1022 if (!from) {
1023 STR_SET_LEN(newstr, ofs);
1024 return rb_str_cat(newstr, ptr, len);
1025 }
1026
1027 rb_str_modify(newstr);
1028 return str_cat_conv_enc_opts(newstr, ofs, ptr, len, from,
1029 rb_enc_get(newstr),
1030 ecflags, ecopts);
1031}
1032
1033VALUE
1035{
1036 STR_SET_LEN(str, 0);
1037 rb_enc_associate(str, enc);
1038 rb_str_cat(str, ptr, len);
1039 return str;
1040}
1041
1042static VALUE
1043str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1044 rb_encoding *from, rb_encoding *to,
1045 int ecflags, VALUE ecopts)
1046{
1047 rb_econv_t *ec;
1049 long olen;
1050 VALUE econv_wrapper;
1051 const unsigned char *start, *sp;
1052 unsigned char *dest, *dp;
1053 size_t converted_output = (size_t)ofs;
1054
1055 olen = rb_str_capacity(newstr);
1056
1057 econv_wrapper = rb_obj_alloc(rb_cEncodingConverter);
1058 RBASIC_CLEAR_CLASS(econv_wrapper);
1059 ec = rb_econv_open_opts(from->name, to->name, ecflags, ecopts);
1060 if (!ec) return Qnil;
1061 DATA_PTR(econv_wrapper) = ec;
1062
1063 sp = (unsigned char*)ptr;
1064 start = sp;
1065 while ((dest = (unsigned char*)RSTRING_PTR(newstr)),
1066 (dp = dest + converted_output),
1067 (ret = rb_econv_convert(ec, &sp, start + len, &dp, dest + olen, 0)),
1069 /* destination buffer short */
1070 size_t converted_input = sp - start;
1071 size_t rest = len - converted_input;
1072 converted_output = dp - dest;
1073 rb_str_set_len(newstr, converted_output);
1074 if (converted_input && converted_output &&
1075 rest < (LONG_MAX / converted_output)) {
1076 rest = (rest * converted_output) / converted_input;
1077 }
1078 else {
1079 rest = olen;
1080 }
1081 olen += rest < 2 ? 2 : rest;
1082 rb_str_resize(newstr, olen);
1083 }
1084 DATA_PTR(econv_wrapper) = 0;
1085 rb_econv_close(ec);
1086 rb_gc_force_recycle(econv_wrapper);
1087 switch (ret) {
1088 case econv_finished:
1089 len = dp - (unsigned char*)RSTRING_PTR(newstr);
1090 rb_str_set_len(newstr, len);
1091 rb_enc_associate(newstr, to);
1092 return newstr;
1093
1094 default:
1095 return Qnil;
1096 }
1097}
1098
1099VALUE
1101{
1102 return rb_str_conv_enc_opts(str, from, to, 0, Qnil);
1103}
1104
1105VALUE
1107{
1108 rb_encoding *ienc;
1109 VALUE str;
1110 const int eidx = rb_enc_to_index(eenc);
1111
1112 if (!ptr) {
1113 return rb_enc_str_new(ptr, len, eenc);
1114 }
1115
1116 /* ASCII-8BIT case, no conversion */
1117 if ((eidx == rb_ascii8bit_encindex()) ||
1118 (eidx == rb_usascii_encindex() && search_nonascii(ptr, ptr + len))) {
1119 return rb_str_new(ptr, len);
1120 }
1121 /* no default_internal or same encoding, no conversion */
1123 if (!ienc || eenc == ienc) {
1124 return rb_enc_str_new(ptr, len, eenc);
1125 }
1126 /* ASCII compatible, and ASCII only string, no conversion in
1127 * default_internal */
1128 if ((eidx == rb_ascii8bit_encindex()) ||
1129 (eidx == rb_usascii_encindex()) ||
1130 (rb_enc_asciicompat(eenc) && !search_nonascii(ptr, ptr + len))) {
1131 return rb_enc_str_new(ptr, len, ienc);
1132 }
1133 /* convert from the given encoding to default_internal */
1134 str = rb_enc_str_new(NULL, 0, ienc);
1135 /* when the conversion failed for some reason, just ignore the
1136 * default_internal and result in the given encoding as-is. */
1137 if (NIL_P(rb_str_cat_conv_enc_opts(str, 0, ptr, len, eenc, 0, Qnil))) {
1138 rb_str_initialize(str, ptr, len, eenc);
1139 }
1140 return str;
1141}
1142
1143VALUE
1145{
1146 int eidx = rb_enc_to_index(eenc);
1147 if (eidx == rb_usascii_encindex() &&
1150 return str;
1151 }
1154}
1155
1156VALUE
1157rb_external_str_new(const char *ptr, long len)
1158{
1160}
1161
1162VALUE
1164{
1166}
1167
1168VALUE
1169rb_locale_str_new(const char *ptr, long len)
1170{
1172}
1173
1174VALUE
1176{
1178}
1179
1180VALUE
1182{
1184}
1185
1186VALUE
1188{
1190}
1191
1192VALUE
1194{
1196}
1197
1198VALUE
1200{
1202}
1203
1204VALUE
1206{
1207 return rb_str_conv_enc(str, STR_ENC_GET(str), enc);
1208}
1209
1210static VALUE
1211str_replace_shared_without_enc(VALUE str2, VALUE str)
1212{
1213 const int termlen = TERM_LEN(str);
1214 char *ptr;
1215 long len;
1216
1218 if (STR_EMBEDDABLE_P(len, termlen)) {
1219 char *ptr2 = RSTRING(str2)->as.ary;
1220 STR_SET_EMBED(str2);
1221 memcpy(ptr2, RSTRING_PTR(str), len);
1222 STR_SET_EMBED_LEN(str2, len);
1223 TERM_FILL(ptr2+len, termlen);
1224 }
1225 else {
1226 VALUE root;
1227 if (STR_SHARED_P(str)) {
1228 root = RSTRING(str)->as.heap.aux.shared;
1230 }
1231 else {
1234 }
1235 if (!STR_EMBED_P(str2) && !FL_TEST_RAW(str2, STR_SHARED|STR_NOFREE)) {
1236 if (FL_TEST_RAW(str2, STR_SHARED_ROOT)) {
1237 rb_fatal("about to free a possible shared root");
1238 }
1239 char *ptr2 = STR_HEAP_PTR(str2);
1240 if (ptr2 != ptr) {
1241 ruby_sized_xfree(ptr2, STR_HEAP_SIZE(str2));
1242 }
1243 }
1244 FL_SET(str2, STR_NOEMBED);
1245 RSTRING(str2)->as.heap.len = len;
1246 RSTRING(str2)->as.heap.ptr = ptr;
1247 STR_SET_SHARED(str2, root);
1248 }
1249 return str2;
1250}
1251
1252static VALUE
1253str_replace_shared(VALUE str2, VALUE str)
1254{
1255 str_replace_shared_without_enc(str2, str);
1256 rb_enc_cr_str_exact_copy(str2, str);
1257 return str2;
1258}
1259
1260static VALUE
1261str_new_shared(VALUE klass, VALUE str)
1262{
1263 return str_replace_shared(str_alloc(klass), str);
1264}
1265
1266VALUE
1268{
1269 return str_new_shared(rb_obj_class(str), str);
1270}
1271
1272VALUE
1274{
1275 if (OBJ_FROZEN(orig)) return orig;
1276 return str_new_frozen(rb_obj_class(orig), orig);
1277}
1278
1279static VALUE
1280rb_str_new_frozen_String(VALUE orig)
1281{
1282 if (OBJ_FROZEN(orig) && rb_obj_class(orig) == rb_cString) return orig;
1283 return str_new_frozen(rb_cString, orig);
1284}
1285
1286VALUE
1288{
1289 if (OBJ_FROZEN_RAW(orig)) return orig;
1290 return str_new_frozen_buffer(0, orig, FALSE);
1291}
1292
1293void
1295{
1296 if (RBASIC_CLASS(tmp) != 0)
1297 return;
1298
1299 if (STR_EMBED_P(tmp)) {
1300 assert(OBJ_FROZEN_RAW(tmp));
1302 }
1303 else if (FL_TEST_RAW(orig, STR_SHARED) &&
1305 VALUE shared = RSTRING(orig)->as.heap.aux.shared;
1306
1307 if (shared == tmp && !FL_TEST_RAW(tmp, STR_BORROWED)) {
1308 FL_UNSET_RAW(orig, STR_SHARED);
1309 assert(RSTRING(orig)->as.heap.ptr == RSTRING(tmp)->as.heap.ptr);
1310 assert(RSTRING(orig)->as.heap.len == RSTRING(tmp)->as.heap.len);
1311 RSTRING(orig)->as.heap.aux.capa = RSTRING(tmp)->as.heap.aux.capa;
1312 RBASIC(orig)->flags |= RBASIC(tmp)->flags & STR_NOFREE;
1313 assert(OBJ_FROZEN_RAW(tmp));
1315 }
1316 }
1317}
1318
1319static VALUE
1320str_new_frozen(VALUE klass, VALUE orig)
1321{
1322 return str_new_frozen_buffer(klass, orig, TRUE);
1323}
1324
1325static VALUE
1326str_new_frozen_buffer(VALUE klass, VALUE orig, int copy_encoding)
1327{
1328 VALUE str;
1329
1330 if (STR_EMBED_P(orig)) {
1331 str = str_new(klass, RSTRING_PTR(orig), RSTRING_LEN(orig));
1332 }
1333 else {
1334 if (FL_TEST_RAW(orig, STR_SHARED)) {
1335 VALUE shared = RSTRING(orig)->as.heap.aux.shared;
1336 long ofs = RSTRING(orig)->as.heap.ptr - RSTRING(shared)->as.heap.ptr;
1337 long rest = RSTRING(shared)->as.heap.len - ofs - RSTRING(orig)->as.heap.len;
1338 assert(!STR_EMBED_P(shared));
1340
1341 if ((ofs > 0) || (rest > 0) ||
1342 (klass != RBASIC(shared)->klass) ||
1343 ENCODING_GET(shared) != ENCODING_GET(orig)) {
1344 str = str_new_shared(klass, shared);
1345 RSTRING(str)->as.heap.ptr += ofs;
1346 RSTRING(str)->as.heap.len -= ofs + rest;
1347 }
1348 else {
1349 if (RBASIC_CLASS(shared) == 0)
1351 return shared;
1352 }
1353 }
1354 else if (STR_EMBEDDABLE_P(RSTRING_LEN(orig), TERM_LEN(orig))) {
1355 str = str_alloc(klass);
1359 TERM_FILL(RSTRING_END(str), TERM_LEN(orig));
1360 }
1361 else {
1362 str = str_alloc(klass);
1364 RSTRING(str)->as.heap.len = RSTRING_LEN(orig);
1365 RSTRING(str)->as.heap.ptr = RSTRING_PTR(orig);
1366 RSTRING(str)->as.heap.aux.capa = RSTRING(orig)->as.heap.aux.capa;
1367 RBASIC(str)->flags |= RBASIC(orig)->flags & STR_NOFREE;
1368 RBASIC(orig)->flags &= ~STR_NOFREE;
1369 STR_SET_SHARED(orig, str);
1370 if (klass == 0)
1372 }
1373 }
1374
1375 if (copy_encoding) rb_enc_cr_str_exact_copy(str, orig);
1376 OBJ_FREEZE(str);
1377 return str;
1378}
1379
1380VALUE
1381rb_str_new_with_class(VALUE obj, const char *ptr, long len)
1382{
1383 return str_new0(rb_obj_class(obj), ptr, len, TERM_LEN(obj));
1384}
1385
1386static VALUE
1387str_new_empty_String(VALUE str)
1388{
1389 VALUE v = rb_str_new(0, 0);
1390 rb_enc_copy(v, str);
1391 return v;
1392}
1393
1394#define STR_BUF_MIN_SIZE 63
1396
1397VALUE
1399{
1400 VALUE str = str_alloc(rb_cString);
1401
1402 if (capa <= RSTRING_EMBED_LEN_MAX) return str;
1403 if (capa < STR_BUF_MIN_SIZE) {
1405 }
1407 RSTRING(str)->as.heap.aux.capa = capa;
1408 RSTRING(str)->as.heap.ptr = ALLOC_N(char, (size_t)capa + 1);
1409 RSTRING(str)->as.heap.ptr[0] = '\0';
1410
1411 return str;
1412}
1413
1414VALUE
1416{
1417 VALUE str;
1418 long len = strlen(ptr);
1419
1422
1423 return str;
1424}
1425
1426VALUE
1428{
1429 return str_new(0, 0, len);
1430}
1431
1432void
1434{
1435 if (FL_TEST(str, RSTRING_FSTR)) {
1436 st_data_t fstr = (st_data_t)str;
1437
1439 {
1441 RB_DEBUG_COUNTER_INC(obj_str_fstr);
1442 }
1444 }
1445
1446 if (STR_EMBED_P(str)) {
1447 RB_DEBUG_COUNTER_INC(obj_str_embed);
1448 }
1449 else if (FL_TEST(str, STR_SHARED | STR_NOFREE)) {
1450 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared, FL_TEST(str, STR_SHARED));
1451 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared, FL_TEST(str, STR_NOFREE));
1452 }
1453 else {
1454 RB_DEBUG_COUNTER_INC(obj_str_ptr);
1456 }
1457}
1458
1459RUBY_FUNC_EXPORTED size_t
1461{
1463 return STR_HEAP_SIZE(str);
1464 }
1465 else {
1466 return 0;
1467 }
1468}
1469
1470VALUE
1472{
1473 return rb_convert_type_with_id(str, T_STRING, "String", idTo_str);
1474}
1475
1476static inline void str_discard(VALUE str);
1477static void str_shared_replace(VALUE str, VALUE str2);
1478
1479void
1481{
1482 if (str != str2) str_shared_replace(str, str2);
1483}
1484
1485static void
1486str_shared_replace(VALUE str, VALUE str2)
1487{
1488 rb_encoding *enc;
1489 int cr;
1490 int termlen;
1491
1492 RUBY_ASSERT(str2 != str);
1493 enc = STR_ENC_GET(str2);
1494 cr = ENC_CODERANGE(str2);
1495 str_discard(str);
1496 termlen = rb_enc_mbminlen(enc);
1497
1498 if (STR_EMBEDDABLE_P(RSTRING_LEN(str2), termlen)) {
1500 memcpy(RSTRING_PTR(str), RSTRING_PTR(str2), (size_t)RSTRING_LEN(str2) + termlen);
1502 rb_enc_associate(str, enc);
1504 }
1505 else {
1508 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1509 RSTRING(str)->as.heap.len = RSTRING_LEN(str2);
1510
1511 if (FL_TEST(str2, STR_SHARED)) {
1512 VALUE shared = RSTRING(str2)->as.heap.aux.shared;
1514 }
1515 else {
1516 RSTRING(str)->as.heap.aux.capa = RSTRING(str2)->as.heap.aux.capa;
1517 }
1518
1519 /* abandon str2 */
1520 STR_SET_EMBED(str2);
1521 RSTRING_PTR(str2)[0] = 0;
1522 STR_SET_EMBED_LEN(str2, 0);
1523 rb_enc_associate(str, enc);
1525 }
1526}
1527
1528VALUE
1530{
1531 VALUE str;
1532
1533 if (RB_TYPE_P(obj, T_STRING)) {
1534 return obj;
1535 }
1536 str = rb_funcall(obj, idTo_s, 0);
1537 return rb_obj_as_string_result(str, obj);
1538}
1539
1542{
1543 if (!RB_TYPE_P(str, T_STRING))
1544 return rb_any_to_s(obj);
1545 return str;
1546}
1547
1548static VALUE
1549str_replace(VALUE str, VALUE str2)
1550{
1551 long len;
1552
1553 len = RSTRING_LEN(str2);
1554 if (STR_SHARED_P(str2)) {
1555 VALUE shared = RSTRING(str2)->as.heap.aux.shared;
1558 RSTRING(str)->as.heap.len = len;
1559 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1561 rb_enc_cr_str_exact_copy(str, str2);
1562 }
1563 else {
1564 str_replace_shared(str, str2);
1565 }
1566
1567 return str;
1568}
1569
1570static inline VALUE
1571ec_str_alloc(struct rb_execution_context_struct *ec, VALUE klass)
1572{
1574 return (VALUE)str;
1575}
1576
1577static inline VALUE
1578str_duplicate_setup(VALUE klass, VALUE str, VALUE dup)
1579{
1580 enum {embed_size = RSTRING_EMBED_LEN_MAX + 1};
1581 const VALUE flag_mask =
1584 FL_FREEZE
1585 ;
1586 VALUE flags = FL_TEST_RAW(str, flag_mask);
1587 int encidx = 0;
1588 MEMCPY(RSTRING(dup)->as.ary, RSTRING(str)->as.ary,
1589 char, embed_size);
1590 if (flags & STR_NOEMBED) {
1591 if (FL_TEST_RAW(str, STR_SHARED)) {
1592 str = RSTRING(str)->as.heap.aux.shared;
1593 }
1594 else if (UNLIKELY(!(flags & FL_FREEZE))) {
1595 str = str_new_frozen(klass, str);
1596 flags = FL_TEST_RAW(str, flag_mask);
1597 }
1598 if (flags & STR_NOEMBED) {
1599 RB_OBJ_WRITE(dup, &RSTRING(dup)->as.heap.aux.shared, str);
1600 flags |= STR_SHARED;
1601 }
1602 else {
1603 MEMCPY(RSTRING(dup)->as.ary, RSTRING(str)->as.ary,
1604 char, embed_size);
1605 }
1606 }
1607 if ((flags & ENCODING_MASK) == (ENCODING_INLINE_MAX<<ENCODING_SHIFT)) {
1608 encidx = rb_enc_get_index(str);
1609 flags &= ~ENCODING_MASK;
1610 }
1611 FL_SET_RAW(dup, flags & ~FL_FREEZE);
1612 if (encidx) rb_enc_associate_index(dup, encidx);
1613 return dup;
1614}
1615
1616static inline VALUE
1617ec_str_duplicate(struct rb_execution_context_struct *ec, VALUE klass, VALUE str)
1618{
1619 VALUE dup = ec_str_alloc(ec, klass);
1620 return str_duplicate_setup(klass, str, dup);
1621}
1622
1623static inline VALUE
1624str_duplicate(VALUE klass, VALUE str)
1625{
1626 VALUE dup = str_alloc(klass);
1627 return str_duplicate_setup(klass, str, dup);
1628}
1629
1630VALUE
1632{
1633 return str_duplicate(rb_obj_class(str), str);
1634}
1635
1636VALUE
1638{
1640 return str_duplicate(rb_cString, str);
1641}
1642
1643VALUE
1645{
1647 return ec_str_duplicate(ec, rb_cString, str);
1648}
1649
1650/*
1651 * call-seq:
1652 * String.new(string = '') -> new_string
1653 * String.new(string = '', encoding: encoding) -> new_string
1654 * String.new(string = '', capacity: size) -> new_string
1655 *
1656 * Returns a new \String that is a copy of +string+.
1657 *
1658 * With no arguments, returns the empty string with the Encoding <tt>ASCII-8BIT</tt>:
1659 * s = String.new
1660 * s # => ""
1661 * s.encoding # => #<Encoding:ASCII-8BIT>
1662 *
1663 * With the single \String argument +string+, returns a copy of +string+
1664 * with the same encoding as +string+:
1665 * s = String.new("Que veut dire \u{e7}a?")
1666 * s # => "Que veut dire \u{e7}a?"
1667 * s.encoding # => #<Encoding:UTF-8>
1668 *
1669 * Literal strings like <tt>""</tt> or here-documents always use
1670 * {script encoding}[Encoding.html#class-Encoding-label-Script+encoding], unlike String.new.
1671 *
1672 * With keyword +encoding+, returns a copy of +string+
1673 * with the specified encoding:
1674 * s = String.new(encoding: 'ASCII')
1675 * s.encoding # => #<Encoding:US-ASCII>
1676 * s = String.new('foo', encoding: 'ASCII')
1677 * s.encoding # => #<Encoding:US-ASCII>
1678 *
1679 * Note that these are equivalent:
1680 * s0 = String.new('foo', encoding: 'ASCII')
1681 * s1 = 'foo'.force_encoding('ASCII')
1682 * s0.encoding == s1.encoding # => true
1683 *
1684 * With keyword +capacity+, returns a copy of +string+;
1685 * the given +capacity+ may set the size of the internal buffer,
1686 * which may affect performance:
1687 * String.new(capacity: 1) # => ""
1688 * String.new(capacity: 4096) # => ""
1689 *
1690 * The +string+, +encoding+, and +capacity+ arguments may all be used together:
1691 * String.new('hello', encoding: 'UTF-8', capacity: 25)
1692 */
1693
1694static VALUE
1695rb_str_init(int argc, VALUE *argv, VALUE str)
1696{
1697 static ID keyword_ids[2];
1698 VALUE orig, opt, venc, vcapa;
1699 VALUE kwargs[2];
1700 rb_encoding *enc = 0;
1701 int n;
1702
1703 if (!keyword_ids[0]) {
1704 keyword_ids[0] = rb_id_encoding();
1705 CONST_ID(keyword_ids[1], "capacity");
1706 }
1707
1708 n = rb_scan_args(argc, argv, "01:", &orig, &opt);
1709 if (!NIL_P(opt)) {
1710 rb_get_kwargs(opt, keyword_ids, 0, 2, kwargs);
1711 venc = kwargs[0];
1712 vcapa = kwargs[1];
1713 if (venc != Qundef && !NIL_P(venc)) {
1714 enc = rb_to_encoding(venc);
1715 }
1716 if (vcapa != Qundef && !NIL_P(vcapa)) {
1717 long capa = NUM2LONG(vcapa);
1718 long len = 0;
1719 int termlen = enc ? rb_enc_mbminlen(enc) : 1;
1720
1721 if (capa < STR_BUF_MIN_SIZE) {
1723 }
1724 if (n == 1) {
1725 StringValue(orig);
1726 len = RSTRING_LEN(orig);
1727 if (capa < len) {
1728 capa = len;
1729 }
1730 if (orig == str) n = 0;
1731 }
1732 str_modifiable(str);
1733 if (STR_EMBED_P(str)) { /* make noembed always */
1734 char *new_ptr = ALLOC_N(char, (size_t)capa + termlen);
1735 memcpy(new_ptr, RSTRING(str)->as.ary, RSTRING_EMBED_LEN_MAX + 1);
1736 RSTRING(str)->as.heap.ptr = new_ptr;
1737 }
1738 else if (FL_TEST(str, STR_SHARED|STR_NOFREE)) {
1739 const size_t size = (size_t)capa + termlen;
1740 const char *const old_ptr = RSTRING_PTR(str);
1741 const size_t osize = RSTRING(str)->as.heap.len + TERM_LEN(str);
1742 char *new_ptr = ALLOC_N(char, (size_t)capa + termlen);
1743 memcpy(new_ptr, old_ptr, osize < size ? osize : size);
1745 RSTRING(str)->as.heap.ptr = new_ptr;
1746 }
1747 else if (STR_HEAP_SIZE(str) != (size_t)capa + termlen) {
1748 SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char,
1749 (size_t)capa + termlen, STR_HEAP_SIZE(str));
1750 }
1751 RSTRING(str)->as.heap.len = len;
1752 TERM_FILL(&RSTRING(str)->as.heap.ptr[len], termlen);
1753 if (n == 1) {
1754 memcpy(RSTRING(str)->as.heap.ptr, RSTRING_PTR(orig), len);
1755 rb_enc_cr_str_exact_copy(str, orig);
1756 }
1758 RSTRING(str)->as.heap.aux.capa = capa;
1759 }
1760 else if (n == 1) {
1761 rb_str_replace(str, orig);
1762 }
1763 if (enc) {
1764 rb_enc_associate(str, enc);
1766 }
1767 }
1768 else if (n == 1) {
1769 rb_str_replace(str, orig);
1770 }
1771 return str;
1772}
1773
1774#ifdef NONASCII_MASK
1775#define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80)
1776
1777/*
1778 * UTF-8 leading bytes have either 0xxxxxxx or 11xxxxxx
1779 * bit representation. (see https://en.wikipedia.org/wiki/UTF-8)
1780 * Therefore, the following pseudocode can detect UTF-8 leading bytes.
1781 *
1782 * if (!(byte & 0x80))
1783 * byte |= 0x40; // turn on bit6
1784 * return ((byte>>6) & 1); // bit6 represent whether this byte is leading or not.
1785 *
1786 * This function calculates whether a byte is leading or not for all bytes
1787 * in the argument word by concurrently using the above logic, and then
1788 * adds up the number of leading bytes in the word.
1789 */
1790static inline uintptr_t
1791count_utf8_lead_bytes_with_word(const uintptr_t *s)
1792{
1793 uintptr_t d = *s;
1794
1795 /* Transform so that bit0 indicates whether we have a UTF-8 leading byte or not. */
1796 d = (d>>6) | (~d>>7);
1797 d &= NONASCII_MASK >> 7;
1798
1799 /* Gather all bytes. */
1800#if defined(HAVE_BUILTIN___BUILTIN_POPCOUNT) && defined(__POPCNT__)
1801 /* use only if it can use POPCNT */
1802 return rb_popcount_intptr(d);
1803#else
1804 d += (d>>8);
1805 d += (d>>16);
1806# if SIZEOF_VOIDP == 8
1807 d += (d>>32);
1808# endif
1809 return (d&0xF);
1810#endif
1811}
1812#endif
1813
1814static inline long
1815enc_strlen(const char *p, const char *e, rb_encoding *enc, int cr)
1816{
1817 long c;
1818 const char *q;
1819
1820 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
1821 long diff = (long)(e - p);
1822 return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
1823 }
1824#ifdef NONASCII_MASK
1825 else if (cr == ENC_CODERANGE_VALID && enc == rb_utf8_encoding()) {
1826 uintptr_t len = 0;
1827 if ((int)sizeof(uintptr_t) * 2 < e - p) {
1828 const uintptr_t *s, *t;
1829 const uintptr_t lowbits = sizeof(uintptr_t) - 1;
1830 s = (const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
1831 t = (const uintptr_t*)(~lowbits & (uintptr_t)e);
1832 while (p < (const char *)s) {
1833 if (is_utf8_lead_byte(*p)) len++;
1834 p++;
1835 }
1836 while (s < t) {
1837 len += count_utf8_lead_bytes_with_word(s);
1838 s++;
1839 }
1840 p = (const char *)s;
1841 }
1842 while (p < e) {
1843 if (is_utf8_lead_byte(*p)) len++;
1844 p++;
1845 }
1846 return (long)len;
1847 }
1848#endif
1849 else if (rb_enc_asciicompat(enc)) {
1850 c = 0;
1851 if (ENC_CODERANGE_CLEAN_P(cr)) {
1852 while (p < e) {
1853 if (ISASCII(*p)) {
1854 q = search_nonascii(p, e);
1855 if (!q)
1856 return c + (e - p);
1857 c += q - p;
1858 p = q;
1859 }
1860 p += rb_enc_fast_mbclen(p, e, enc);
1861 c++;
1862 }
1863 }
1864 else {
1865 while (p < e) {
1866 if (ISASCII(*p)) {
1867 q = search_nonascii(p, e);
1868 if (!q)
1869 return c + (e - p);
1870 c += q - p;
1871 p = q;
1872 }
1873 p += rb_enc_mbclen(p, e, enc);
1874 c++;
1875 }
1876 }
1877 return c;
1878 }
1879
1880 for (c=0; p<e; c++) {
1881 p += rb_enc_mbclen(p, e, enc);
1882 }
1883 return c;
1884}
1885
1886long
1887rb_enc_strlen(const char *p, const char *e, rb_encoding *enc)
1888{
1889 return enc_strlen(p, e, enc, ENC_CODERANGE_UNKNOWN);
1890}
1891
1892/* Returns the character count and stores the coderange computed while
1893 * scanning in *cr.  Note that the value *cr holds on entry is not used.
1894 */
1895long
1896rb_enc_strlen_cr(const char *p, const char *e, rb_encoding *enc, int *cr)
1897{
1898 long c;
1899 const char *q;
1900 int ret;
1901
1902 *cr = 0;
1903 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
1904 long diff = (long)(e - p);
1905 return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
1906 }
1907 else if (rb_enc_asciicompat(enc)) {
1908 c = 0;
1909 while (p < e) {
1910 if (ISASCII(*p)) {
1911 q = search_nonascii(p, e);
1912 if (!q) {
1913 if (!*cr) *cr = ENC_CODERANGE_7BIT;
1914 return c + (e - p);
1915 }
1916 c += q - p;
1917 p = q;
1918 }
1919 ret = rb_enc_precise_mbclen(p, e, enc);
1920 if (MBCLEN_CHARFOUND_P(ret)) {
1921 *cr |= ENC_CODERANGE_VALID;
1922 p += MBCLEN_CHARFOUND_LEN(ret);
1923 }
1924 else {
1926 p++;
1927 }
1928 c++;
1929 }
1930 if (!*cr) *cr = ENC_CODERANGE_7BIT;
1931 return c;
1932 }
1933
1934 for (c=0; p<e; c++) {
1935 ret = rb_enc_precise_mbclen(p, e, enc);
1936 if (MBCLEN_CHARFOUND_P(ret)) {
1937 *cr |= ENC_CODERANGE_VALID;
1938 p += MBCLEN_CHARFOUND_LEN(ret);
1939 }
1940 else {
1942 if (p + rb_enc_mbminlen(enc) <= e)
1943 p += rb_enc_mbminlen(enc);
1944 else
1945 p = e;
1946 }
1947 }
1948 if (!*cr) *cr = ENC_CODERANGE_7BIT;
1949 return c;
1950}
1951
1952/* enc must be str's enc or rb_enc_check(str, str2) */
1953static long
1954str_strlen(VALUE str, rb_encoding *enc)
1955{
1956 const char *p, *e;
1957 int cr;
1958
1959 if (single_byte_optimizable(str)) return RSTRING_LEN(str);
1960 if (!enc) enc = STR_ENC_GET(str);
1961 p = RSTRING_PTR(str);
1962 e = RSTRING_END(str);
1963 cr = ENC_CODERANGE(str);
1964
1965 if (cr == ENC_CODERANGE_UNKNOWN) {
1966 long n = rb_enc_strlen_cr(p, e, enc, &cr);
1967 if (cr) ENC_CODERANGE_SET(str, cr);
1968 return n;
1969 }
1970 else {
1971 return enc_strlen(p, e, enc, cr);
1972 }
1973}
1974
1975long
1977{
1978 return str_strlen(str, NULL);
1979}
1980
1981/*
1982 * call-seq:
1983 * string.length -> integer
1984 *
1985 * Returns the count of characters (not bytes) in +self+:
1986 * "\x80\u3042".length # => 2
1987 * "hello".length # => 5
1988 *
1989 * String#size is an alias for String#length.
1990 *
1991 * Related: String#bytesize.
1992 */
1993
1994VALUE
1996{
1997 return LONG2NUM(str_strlen(str, NULL));
1998}
1999
2000/*
2001 * call-seq:
2002 * string.bytesize -> integer
2003 *
2004 * Returns the count of bytes in +self+:
2005 * "\x80\u3042".bytesize # => 4
2006 * "hello".bytesize # => 5
2007 *
2008 * Related: String#length.
2009 */
2010
2011static VALUE
2012rb_str_bytesize(VALUE str)
2013{
2014 return LONG2NUM(RSTRING_LEN(str));
2015}
2016
2017/*
2018 * call-seq:
2019 * string.empty? -> true or false
2020 *
2021 * Returns +true+ if the length of +self+ is zero, +false+ otherwise:
2022 * "hello".empty? # => false
2023 * " ".empty? # => false
2024 * "".empty? # => true
2025 */
2026
2027static VALUE
2028rb_str_empty(VALUE str)
2029{
2030 if (RSTRING_LEN(str) == 0)
2031 return Qtrue;
2032 return Qfalse;
2033}
2034
2035/*
2036 * call-seq:
2037 * string + other_string -> new_string
2038 *
2039 * Returns a new \String containing +other_string+ concatenated to +self+:
2040 * "Hello from " + self.to_s # => "Hello from main"
2041 */
2042
2043VALUE
2045{
2046 VALUE str3;
2047 rb_encoding *enc;
2048 char *ptr1, *ptr2, *ptr3;
2049 long len1, len2;
2050 int termlen;
2051
2052 StringValue(str2);
2053 enc = rb_enc_check_str(str1, str2);
2054 RSTRING_GETMEM(str1, ptr1, len1);
2055 RSTRING_GETMEM(str2, ptr2, len2);
2056 termlen = rb_enc_mbminlen(enc);
2057 if (len1 > LONG_MAX - len2) {
2058 rb_raise(rb_eArgError, "string size too big");
2059 }
2060 str3 = str_new0(rb_cString, 0, len1+len2, termlen);
2061 ptr3 = RSTRING_PTR(str3);
2062 memcpy(ptr3, ptr1, len1);
2063 memcpy(ptr3+len1, ptr2, len2);
2064 TERM_FILL(&ptr3[len1+len2], termlen);
2065
2068 RB_GC_GUARD(str1);
2069 RB_GC_GUARD(str2);
2070 return str3;
2071}
2072
2073/* A variant of rb_str_plus that does not raise but return Qundef instead. */
2076{
2077 assert(RBASIC_CLASS(str1) == rb_cString);
2078 assert(RBASIC_CLASS(str2) == rb_cString);
2079 long len1, len2;
2080 MAYBE_UNUSED(char) *ptr1, *ptr2;
2081 RSTRING_GETMEM(str1, ptr1, len1);
2082 RSTRING_GETMEM(str2, ptr2, len2);
2083 int enc1 = rb_enc_get_index(str1);
2084 int enc2 = rb_enc_get_index(str2);
2085
2086 if (enc1 < 0) {
2087 return Qundef;
2088 }
2089 else if (enc2 < 0) {
2090 return Qundef;
2091 }
2092 else if (enc1 != enc2) {
2093 return Qundef;
2094 }
2095 else if (len1 > LONG_MAX - len2) {
2096 return Qundef;
2097 }
2098 else {
2099 return rb_str_plus(str1, str2);
2100 }
2101
2102}
2103
2104/*
2105 * call-seq:
2106 * string * integer -> new_string
2107 *
2108 * Returns a new \String containing +integer+ copies of +self+:
2109 * "Ho! " * 3 # => "Ho! Ho! Ho! "
2110 * "Ho! " * 0 # => ""
2111 */
2112
2113VALUE
2115{
2116 VALUE str2;
2117 long n, len;
2118 char *ptr2;
2119 int termlen;
2120
2121 if (times == INT2FIX(1)) {
2122 return str_duplicate(rb_cString, str);
2123 }
2124 if (times == INT2FIX(0)) {
2125 str2 = str_alloc(rb_cString);
2126 rb_enc_copy(str2, str);
2127 return str2;
2128 }
2129 len = NUM2LONG(times);
2130 if (len < 0) {
2131 rb_raise(rb_eArgError, "negative argument");
2132 }
2133 if (RSTRING_LEN(str) == 1 && RSTRING_PTR(str)[0] == 0) {
2134 str2 = str_alloc(rb_cString);
2135 if (!STR_EMBEDDABLE_P(len, 1)) {
2136 RSTRING(str2)->as.heap.aux.capa = len;
2137 RSTRING(str2)->as.heap.ptr = ZALLOC_N(char, (size_t)len + 1);
2138 STR_SET_NOEMBED(str2);
2139 }
2140 STR_SET_LEN(str2, len);
2141 rb_enc_copy(str2, str);
2142 return str2;
2143 }
2144 if (len && LONG_MAX/len < RSTRING_LEN(str)) {
2145 rb_raise(rb_eArgError, "argument too big");
2146 }
2147
2148 len *= RSTRING_LEN(str);
2149 termlen = TERM_LEN(str);
2150 str2 = str_new0(rb_cString, 0, len, termlen);
2151 ptr2 = RSTRING_PTR(str2);
2152 if (len) {
2153 n = RSTRING_LEN(str);
2154 memcpy(ptr2, RSTRING_PTR(str), n);
2155 while (n <= len/2) {
2156 memcpy(ptr2 + n, ptr2, n);
2157 n *= 2;
2158 }
2159 memcpy(ptr2 + n, ptr2, len-n);
2160 }
2161 STR_SET_LEN(str2, len);
2162 TERM_FILL(&ptr2[len], termlen);
2163 rb_enc_cr_str_copy_for_substr(str2, str);
2164
2165 return str2;
2166}
2167
2168/*
2169 * call-seq:
2170 * string % object -> new_string
2171 *
2172 * Returns the result of formatting +object+ into the format specification +self+
2173 * (see Kernel#sprintf for formatting details):
2174 * "%05d" % 123 # => "00123"
2175 *
2176 * If +self+ contains multiple substitutions, +object+ must be
2177 * an \Array or \Hash containing the values to be substituted:
2178 * "%-5s: %016x" % [ "ID", self.object_id ] # => "ID : 00002b054ec93168"
2179 * "foo = %{foo}" % {foo: 'bar'} # => "foo = bar"
2180 * "foo = %{foo}, baz = %{baz}" % {foo: 'bar', baz: 'bat'} # => "foo = bar, baz = bat"
2181 */
2182
2183static VALUE
2184rb_str_format_m(VALUE str, VALUE arg)
2185{
2186 VALUE tmp = rb_check_array_type(arg);
2187
2188 if (!NIL_P(tmp)) {
2189 return rb_str_format(RARRAY_LENINT(tmp), RARRAY_CONST_PTR(tmp), str);
2190 }
2191 return rb_str_format(1, &arg, str);
2192}
2193
2194static inline void
2195rb_check_lockedtmp(VALUE str)
2196{
2197 if (FL_TEST(str, STR_TMPLOCK)) {
2198 rb_raise(rb_eRuntimeError, "can't modify string; temporarily locked");
2199 }
2200}
2201
2202static inline void
2203str_modifiable(VALUE str)
2204{
2205 rb_check_lockedtmp(str);
2207}
2208
2209static inline int
2210str_dependent_p(VALUE str)
2211{
2212 if (STR_EMBED_P(str) || !FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2213 return 0;
2214 }
2215 else {
2216 return 1;
2217 }
2218}
2219
2220static inline int
2221str_independent(VALUE str)
2222{
2223 str_modifiable(str);
2224 return !str_dependent_p(str);
2225}
2226
2227static void
2228str_make_independent_expand(VALUE str, long len, long expand, const int termlen)
2229{
2230 char *ptr;
2231 char *oldptr;
2232 long capa = len + expand;
2233
2234 if (len > capa) len = capa;
2235
2236 if (!STR_EMBED_P(str) && STR_EMBEDDABLE_P(capa, termlen)) {
2237 ptr = RSTRING(str)->as.heap.ptr;
2239 memcpy(RSTRING(str)->as.ary, ptr, len);
2240 TERM_FILL(RSTRING(str)->as.ary + len, termlen);
2242 return;
2243 }
2244
2245 ptr = ALLOC_N(char, (size_t)capa + termlen);
2246 oldptr = RSTRING_PTR(str);
2247 if (oldptr) {
2248 memcpy(ptr, oldptr, len);
2249 }
2251 xfree(oldptr);
2252 }
2255 TERM_FILL(ptr + len, termlen);
2256 RSTRING(str)->as.heap.ptr = ptr;
2257 RSTRING(str)->as.heap.len = len;
2258 RSTRING(str)->as.heap.aux.capa = capa;
2259}
2260
2261void
2263{
2264 if (!str_independent(str))
2265 str_make_independent(str);
2267}
2268
2269void
2271{
2272 int termlen = TERM_LEN(str);
2273 long len = RSTRING_LEN(str);
2274
2275 if (expand < 0) {
2276 rb_raise(rb_eArgError, "negative expanding string size");
2277 }
2278 if (expand >= LONG_MAX - len) {
2279 rb_raise(rb_eArgError, "string size too big");
2280 }
2281
2282 if (!str_independent(str)) {
2283 str_make_independent_expand(str, len, expand, termlen);
2284 }
2285 else if (expand > 0) {
2286 RESIZE_CAPA_TERM(str, len + expand, termlen);
2287 }
2289}
2290
2291/* As rb_str_modify(), but don't clear coderange */
2292static void
2293str_modify_keep_cr(VALUE str)
2294{
2295 if (!str_independent(str))
2296 str_make_independent(str);
2298 /* Force re-scan later */
2300}
2301
2302static inline void
2303str_discard(VALUE str)
2304{
2305 str_modifiable(str);
2306 if (!STR_EMBED_P(str) && !FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2308 RSTRING(str)->as.heap.ptr = 0;
2309 RSTRING(str)->as.heap.len = 0;
2310 }
2311}
2312
2313void
2315{
2316 rb_encoding *enc = rb_enc_get(str);
2317 if (!rb_enc_asciicompat(enc)) {
2318 rb_raise(rb_eEncCompatError, "ASCII incompatible encoding: %s", rb_enc_name(enc));
2319 }
2320}
2321
2322VALUE
2324{
2325 VALUE s = *ptr;
2326 if (!RB_TYPE_P(s, T_STRING)) {
2327 s = rb_str_to_str(s);
2328 *ptr = s;
2329 }
2330 return s;
2331}
2332
2333char *
2335{
2337 return RSTRING_PTR(str);
2338}
2339
/*
 * Returns 1 when the first n bytes at s are all zero, 0 as soon as a
 * non-zero byte is found.  A non-positive n reads nothing and yields 1.
 */
static int
zero_filled(const char *s, int n)
{
    int i;

    for (i = 0; i < n; i++) {
        if (s[i] != '\0') return 0;
    }
    return 1;
}
2348
2349static const char *
2350str_null_char(const char *s, long len, const int minlen, rb_encoding *enc)
2351{
2352 const char *e = s + len;
2353
2354 for (; s + minlen <= e; s += rb_enc_mbclen(s, e, enc)) {
2355 if (zero_filled(s, minlen)) return s;
2356 }
2357 return 0;
2358}
2359
2360static char *
2361str_fill_term(VALUE str, char *s, long len, int termlen)
2362{
2363 /* This function assumes that (capa + termlen) bytes of memory
2364 * is allocated, like many other functions in this file.
2365 */
2366 if (str_dependent_p(str)) {
2367 if (!zero_filled(s + len, termlen))
2368 str_make_independent_expand(str, len, 0L, termlen);
2369 }
2370 else {
2371 TERM_FILL(s + len, termlen);
2372 return s;
2373 }
2374 return RSTRING_PTR(str);
2375}
2376
2377void
2378rb_str_change_terminator_length(VALUE str, const int oldtermlen, const int termlen)
2379{
2380 long capa = str_capacity(str, oldtermlen) + oldtermlen;
2381 long len = RSTRING_LEN(str);
2382
2383 assert(capa >= len);
2384 if (capa - len < termlen) {
2385 rb_check_lockedtmp(str);
2386 str_make_independent_expand(str, len, 0L, termlen);
2387 }
2388 else if (str_dependent_p(str)) {
2389 if (termlen > oldtermlen)
2390 str_make_independent_expand(str, len, 0L, termlen);
2391 }
2392 else {
2393 if (!STR_EMBED_P(str)) {
2394 /* modify capa instead of realloc */
2396 RSTRING(str)->as.heap.aux.capa = capa - termlen;
2397 }
2398 if (termlen > oldtermlen) {
2399 TERM_FILL(RSTRING_PTR(str) + len, termlen);
2400 }
2401 }
2402
2403 return;
2404}
2405
2406static char *
2407str_null_check(VALUE str, int *w)
2408{
2409 char *s = RSTRING_PTR(str);
2410 long len = RSTRING_LEN(str);
2411 rb_encoding *enc = rb_enc_get(str);
2412 const int minlen = rb_enc_mbminlen(enc);
2413
2414 if (minlen > 1) {
2415 *w = 1;
2416 if (str_null_char(s, len, minlen, enc)) {
2417 return NULL;
2418 }
2419 return str_fill_term(str, s, len, minlen);
2420 }
2421 *w = 0;
2422 if (!s || memchr(s, 0, len)) {
2423 return NULL;
2424 }
2425 if (s[len]) {
2426 s = str_fill_term(str, s, len, minlen);
2427 }
2428 return s;
2429}
2430
2431char *
2433{
2434 int w;
2435 return str_null_check(str, &w);
2436}
2437
2438char *
2440{
2442 int w;
2443 char *s = str_null_check(str, &w);
2444 if (!s) {
2445 if (w) {
2446 rb_raise(rb_eArgError, "string contains null char");
2447 }
2448 rb_raise(rb_eArgError, "string contains null byte");
2449 }
2450 return s;
2451}
2452
2453char *
2454rb_str_fill_terminator(VALUE str, const int newminlen)
2455{
2456 char *s = RSTRING_PTR(str);
2457 long len = RSTRING_LEN(str);
2458 return str_fill_term(str, s, len, newminlen);
2459}
2460
2461VALUE
2463{
2464 str = rb_check_convert_type_with_id(str, T_STRING, "String", idTo_str);
2465 return str;
2466}
2467
2468/*
2469 * call-seq:
2470 * String.try_convert(object) -> object, new_string, or nil
2471 *
2472 * If +object+ is a \String object, returns +object+.
2473 *
2474 * Otherwise if +object+ responds to <tt>:to_str</tt>,
2475 * calls <tt>object.to_str</tt> and returns the result.
2476 *
2477 * Returns +nil+ if +object+ does not respond to <tt>:to_str</tt>
2478 *
2479 * Raises an exception unless <tt>object.to_str</tt> returns a \String object.
2480 */
2481static VALUE
2482rb_str_s_try_convert(VALUE dummy, VALUE str)
2483{
2484 return rb_check_string_type(str);
2485}
2486
2487static char*
2488str_nth_len(const char *p, const char *e, long *nthp, rb_encoding *enc)
2489{
2490 long nth = *nthp;
2491 if (rb_enc_mbmaxlen(enc) == 1) {
2492 p += nth;
2493 }
2494 else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2495 p += nth * rb_enc_mbmaxlen(enc);
2496 }
2497 else if (rb_enc_asciicompat(enc)) {
2498 const char *p2, *e2;
2499 int n;
2500
2501 while (p < e && 0 < nth) {
2502 e2 = p + nth;
2503 if (e < e2) {
2504 *nthp = nth;
2505 return (char *)e;
2506 }
2507 if (ISASCII(*p)) {
2508 p2 = search_nonascii(p, e2);
2509 if (!p2) {
2510 nth -= e2 - p;
2511 *nthp = nth;
2512 return (char *)e2;
2513 }
2514 nth -= p2 - p;
2515 p = p2;
2516 }
2517 n = rb_enc_mbclen(p, e, enc);
2518 p += n;
2519 nth--;
2520 }
2521 *nthp = nth;
2522 if (nth != 0) {
2523 return (char *)e;
2524 }
2525 return (char *)p;
2526 }
2527 else {
2528 while (p < e && nth--) {
2529 p += rb_enc_mbclen(p, e, enc);
2530 }
2531 }
2532 if (p > e) p = e;
2533 *nthp = nth;
2534 return (char*)p;
2535}
2536
2537char*
2538rb_enc_nth(const char *p, const char *e, long nth, rb_encoding *enc)
2539{
2540 return str_nth_len(p, e, &nth, enc);
2541}
2542
2543static char*
2544str_nth(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
2545{
2546 if (singlebyte)
2547 p += nth;
2548 else {
2549 p = str_nth_len(p, e, &nth, enc);
2550 }
2551 if (!p) return 0;
2552 if (p > e) p = e;
2553 return (char *)p;
2554}
2555
2556/* char offset to byte offset */
2557static long
2558str_offset(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
2559{
2560 const char *pp = str_nth(p, e, nth, enc, singlebyte);
2561 if (!pp) return e - p;
2562 return pp - p;
2563}
2564
2565long
2567{
2568 return str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
2569 STR_ENC_GET(str), single_byte_optimizable(str));
2570}
2571
2572#ifdef NONASCII_MASK
2573static char *
2574str_utf8_nth(const char *p, const char *e, long *nthp)
2575{
2576 long nth = *nthp;
2577 if ((int)SIZEOF_VOIDP * 2 < e - p && (int)SIZEOF_VOIDP * 2 < nth) {
2578 const uintptr_t *s, *t;
2579 const uintptr_t lowbits = SIZEOF_VOIDP - 1;
2580 s = (const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
2581 t = (const uintptr_t*)(~lowbits & (uintptr_t)e);
2582 while (p < (const char *)s) {
2583 if (is_utf8_lead_byte(*p)) nth--;
2584 p++;
2585 }
2586 do {
2587 nth -= count_utf8_lead_bytes_with_word(s);
2588 s++;
2589 } while (s < t && (int)SIZEOF_VOIDP <= nth);
2590 p = (char *)s;
2591 }
2592 while (p < e) {
2593 if (is_utf8_lead_byte(*p)) {
2594 if (nth == 0) break;
2595 nth--;
2596 }
2597 p++;
2598 }
2599 *nthp = nth;
2600 return (char *)p;
2601}
2602
2603static long
2604str_utf8_offset(const char *p, const char *e, long nth)
2605{
2606 const char *pp = str_utf8_nth(p, e, &nth);
2607 return pp - p;
2608}
2609#endif
2610
2611/* byte offset to char offset */
2612long
2614{
2615 if (single_byte_optimizable(str) || pos < 0)
2616 return pos;
2617 else {
2618 char *p = RSTRING_PTR(str);
2619 return enc_strlen(p, p + pos, STR_ENC_GET(str), ENC_CODERANGE(str));
2620 }
2621}
2622
2623VALUE
2624rb_str_subseq(VALUE str, long beg, long len)
2625{
2626 VALUE str2;
2627
2628 if (!STR_EMBEDDABLE_P(len, TERM_LEN(str)) &&
2630 long olen;
2631 str2 = rb_str_new_shared(rb_str_new_frozen_String(str));
2632 RSTRING(str2)->as.heap.ptr += beg;
2633 olen = RSTRING(str2)->as.heap.len;
2634 if (olen > len) RSTRING(str2)->as.heap.len = len;
2635 }
2636 else {
2637 str2 = rb_str_new(RSTRING_PTR(str)+beg, len);
2639 }
2640
2641 rb_enc_cr_str_copy_for_substr(str2, str);
2642
2643 return str2;
2644}
2645
2646char *
2647rb_str_subpos(VALUE str, long beg, long *lenp)
2648{
2649 long len = *lenp;
2650 long slen = -1L;
2651 long blen = RSTRING_LEN(str);
2652 rb_encoding *enc = STR_ENC_GET(str);
2653 char *p, *s = RSTRING_PTR(str), *e = s + blen;
2654
2655 if (len < 0) return 0;
2656 if (!blen) {
2657 len = 0;
2658 }
2659 if (single_byte_optimizable(str)) {
2660 if (beg > blen) return 0;
2661 if (beg < 0) {
2662 beg += blen;
2663 if (beg < 0) return 0;
2664 }
2665 if (len > blen - beg)
2666 len = blen - beg;
2667 if (len < 0) return 0;
2668 p = s + beg;
2669 goto end;
2670 }
2671 if (beg < 0) {
2672 if (len > -beg) len = -beg;
2673 if (-beg * rb_enc_mbmaxlen(enc) < RSTRING_LEN(str) / 8) {
2674 beg = -beg;
2675 while (beg-- > len && (e = rb_enc_prev_char(s, e, e, enc)) != 0);
2676 p = e;
2677 if (!p) return 0;
2678 while (len-- > 0 && (p = rb_enc_prev_char(s, p, e, enc)) != 0);
2679 if (!p) return 0;
2680 len = e - p;
2681 goto end;
2682 }
2683 else {
2684 slen = str_strlen(str, enc);
2685 beg += slen;
2686 if (beg < 0) return 0;
2687 p = s + beg;
2688 if (len == 0) goto end;
2689 }
2690 }
2691 else if (beg > 0 && beg > RSTRING_LEN(str)) {
2692 return 0;
2693 }
2694 if (len == 0) {
2695 if (beg > str_strlen(str, enc)) return 0; /* str's enc */
2696 p = s + beg;
2697 }
2698#ifdef NONASCII_MASK
2699 else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
2700 enc == rb_utf8_encoding()) {
2701 p = str_utf8_nth(s, e, &beg);
2702 if (beg > 0) return 0;
2703 len = str_utf8_offset(p, e, len);
2704 }
2705#endif
2706 else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2707 int char_sz = rb_enc_mbmaxlen(enc);
2708
2709 p = s + beg * char_sz;
2710 if (p > e) {
2711 return 0;
2712 }
2713 else if (len * char_sz > e - p)
2714 len = e - p;
2715 else
2716 len *= char_sz;
2717 }
2718 else if ((p = str_nth_len(s, e, &beg, enc)) == e) {
2719 if (beg > 0) return 0;
2720 len = 0;
2721 }
2722 else {
2723 len = str_offset(p, e, len, enc, 0);
2724 }
2725 end:
2726 *lenp = len;
2728 return p;
2729}
2730
2731static VALUE str_substr(VALUE str, long beg, long len, int empty);
2732
2733VALUE
2734rb_str_substr(VALUE str, long beg, long len)
2735{
2736 return str_substr(str, beg, len, TRUE);
2737}
2738
2739static VALUE
2740str_substr(VALUE str, long beg, long len, int empty)
2741{
2742 VALUE str2;
2743 char *p = rb_str_subpos(str, beg, &len);
2744
2745 if (!p) return Qnil;
2746 if (!STR_EMBEDDABLE_P(len, TERM_LEN(str)) &&
2747 SHARABLE_SUBSTRING_P(p, len, RSTRING_END(str))) {
2748 long ofs = p - RSTRING_PTR(str);
2749 str2 = rb_str_new_frozen(str);
2750 str2 = str_new_shared(rb_cString, str2);
2751 RSTRING(str2)->as.heap.ptr += ofs;
2752 RSTRING(str2)->as.heap.len = len;
2753 ENC_CODERANGE_CLEAR(str2);
2754 }
2755 else {
2756 if (!len && !empty) return Qnil;
2757 str2 = rb_str_new(p, len);
2759 }
2760 rb_enc_cr_str_copy_for_substr(str2, str);
2761
2762 return str2;
2763}
2764
/*
 * Returns +str+ unchanged when it is already frozen (OBJ_FROZEN);
 * otherwise returns the result of rb_obj_freeze(str).
 *
 * NOTE(review): the listing numbers skip 2766 and 2769, so the line that
 * names this function and one statement before the final return are
 * missing from this listing (presumably rb_str_freeze -- confirm against
 * upstream string.c).
 */
2765VALUE
2767{
2768 if (OBJ_FROZEN(str)) return str;
2770 return rb_obj_freeze(str);
2771}
2772
2773
2774/*
2775 * call-seq:
2776 * +string -> new_string or self
2777 *
2778 * Returns +self+ if +self+ is not frozen.
2779 *
2780 * Otherwise, returns <tt>self.dup</tt>, which is not frozen.
2781 */
2782static VALUE
2783str_uplus(VALUE str)
2784{
2785 if (OBJ_FROZEN(str)) {
2786 return rb_str_dup(str);
2787 }
2788 else {
2789 return str;
2790 }
2791}
2792
2793/*
2794 * call-seq:
2795 * -string -> frozen_string
2796 *
2797 * Returns a frozen, possibly pre-existing copy of the string.
2798 *
2799 * The returned \String will be deduplicated as long as it does not have
2800 * any instance variables set on it.
2801 */
2802static VALUE
2803str_uminus(VALUE str)
2804{
2805 if (!BARE_STRING_P(str) && !rb_obj_frozen_p(str)) {
2806 str = rb_str_dup(str);
2807 }
2808 return rb_fstring(str);
2809}
2810
/* From here on, the name rb_str_dup_frozen expands to rb_str_new_frozen.
 * NOTE(review): listing line 2811 is missing just before this define. */
2812#define rb_str_dup_frozen rb_str_new_frozen
2813
/*
 * Raises RuntimeError when +str+ already carries the STR_TMPLOCK flag;
 * otherwise returns +str+.
 *
 * NOTE(review): listing numbers skip 2815 and 2820: the function-name line
 * and the statement between the check and the return are missing from this
 * listing (presumably rb_str_locktmp, with the statement that sets the
 * flag -- confirm against upstream).
 */
2814VALUE
2816{
2817 if (FL_TEST(str, STR_TMPLOCK)) {
2818 rb_raise(rb_eRuntimeError, "temporal locking already locked string");
2819 }
2821 return str;
2822}
2823
/*
 * Raises RuntimeError when +str+ does NOT carry the STR_TMPLOCK flag;
 * otherwise returns +str+.
 *
 * NOTE(review): listing numbers skip 2825 and 2830: the function-name line
 * and the statement between the check and the return are missing from this
 * listing (presumably rb_str_unlocktmp, with the statement that clears the
 * flag -- confirm against upstream).
 */
2824VALUE
2826{
2827 if (!FL_TEST(str, STR_TMPLOCK)) {
2828 rb_raise(rb_eRuntimeError, "temporal unlocking already unlocked string");
2829 }
2831 return str;
2832}
2833
/*
 * Runs func(arg) through rb_ensure() with rb_str_unlocktmp(str) as the
 * ensure handler, and returns rb_ensure()'s result.
 *
 * NOTE(review): listing numbers skip 2835 and 2837: the function-name /
 * parameter line and the first statement of the body are missing from this
 * listing (presumably rb_str_locktmp_ensure, locking +str+ first -- confirm
 * against upstream).
 */
2834RUBY_FUNC_EXPORTED VALUE
2836{
2838 return rb_ensure(func, arg, rb_str_unlocktmp, str);
2839}
2840
/*
 * Sets the length of +str+ to +len+ without reallocating.
 *
 * Visible checks: the string must pass str_modifiable(); a shared string
 * raises RuntimeError; a +len+ that is negative or larger than
 * str_capacity() is reported through rb_bug().  The terminator is then
 * rewritten at offset +len+ with TERM_FILL.
 *
 * NOTE(review): listing numbers skip 2842 and 2854: the function-name line
 * and the statement before TERM_FILL are missing from this listing
 * (presumably rb_str_set_len and the store of the new length -- confirm
 * against upstream).
 */
2841void
2843{
2844 long capa;
2845 const int termlen = TERM_LEN(str);
2846
2847 str_modifiable(str);
2848 if (STR_SHARED_P(str)) {
2849 rb_raise(rb_eRuntimeError, "can't set length of shared string");
2850 }
2851 if (len > (capa = (long)str_capacity(str, termlen)) || len < 0) {
2852 rb_bug("probable buffer overflow: %ld for %ld", len, capa);
2853 }
2855 TERM_FILL(&RSTRING_PTR(str)[len], termlen);
2856}
2857
/*
 * Changes the byte length of +str+ to +len+ and returns +str+.
 * A negative +len+ raises ArgumentError.
 *
 * Cases handled by the visible code:
 *  - embedded string: nothing to do when the length is unchanged; stays
 *    embedded when the new length is embeddable; otherwise moves to an
 *    independent heap buffer via str_make_independent_expand();
 *  - heap string whose new length is embeddable: up to +len+ bytes are
 *    copied into the embedded array and the old buffer is freed when it
 *    was independently owned;
 *  - heap string that is not independent: made independent (unless the
 *    length is unchanged);
 *  - independent heap string: the buffer is reallocated when capacity is
 *    too small, or when the slack exceeds min(len, 1024).
 * The heap paths end by storing the new length and rewriting the
 * terminator.
 *
 * NOTE(review): listing numbers skip 2859, 2869, 2878, 2886 and 2890, so
 * the function-name line and four statements are missing from this listing
 * (the name is presumably rb_str_resize -- confirm against upstream).
 */
2858VALUE
2860{
2861 long slen;
2862 int independent;
2863
2864 if (len < 0) {
2865 rb_raise(rb_eArgError, "negative string size (or size too big)");
2866 }
2867
2868 independent = str_independent(str);
2870 slen = RSTRING_LEN(str);
2871
2872 {
2873 long capa;
2874 const int termlen = TERM_LEN(str);
2875 if (STR_EMBED_P(str)) {
2876 if (len == slen) return str;
2877 if (STR_EMBEDDABLE_P(len, termlen)) {
2879 TERM_FILL(RSTRING(str)->as.ary + len, termlen);
2880 return str;
2881 }
2882 str_make_independent_expand(str, slen, len - slen, termlen);
2883 }
2884 else if (STR_EMBEDDABLE_P(len, termlen)) {
2885 char *ptr = STR_HEAP_PTR(str);
/* copy no more than the new length */
2887 if (slen > len) slen = len;
2888 if (slen > 0) MEMCPY(RSTRING(str)->as.ary, ptr, char, slen);
2889 TERM_FILL(RSTRING(str)->as.ary + len, termlen);
2891 if (independent) ruby_xfree(ptr);
2892 return str;
2893 }
2894 else if (!independent) {
2895 if (len == slen) return str;
2896 str_make_independent_expand(str, slen, len - slen, termlen);
2897 }
/* reallocate when too small, or when slack exceeds min(len, 1024) */
2898 else if ((capa = RSTRING(str)->as.heap.aux.capa) < len ||
2899 (capa - len) > (len < 1024 ? len : 1024)) {
2900 SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char,
2901 (size_t)len + termlen, STR_HEAP_SIZE(str));
2902 RSTRING(str)->as.heap.aux.capa = len;
2903 }
2904 else if (len == slen) return str;
2905 RSTRING(str)->as.heap.len = len;
2906 TERM_FILL(RSTRING(str)->as.heap.ptr + len, termlen); /* sentinel */
2907 }
2908 return str;
2909}
2910
/*
 * Appends +len+ bytes starting at +ptr+ to +str+, growing the buffer when
 * needed, and returns +str+.
 *
 * +ptr+ may point inside +str+'s own buffer: its offset is remembered in
 * +off+ before any reallocation and +ptr+ is recomputed afterwards.
 * Growth doubles the capacity (plus terminator) until the total fits,
 * except that a total of LONG_MAX/2 or more is used as the capacity
 * directly.  A length sum that would overflow raises ArgumentError.
 *
 * NOTE(review): the early exit for len == 0 returns 0 rather than +str+,
 * although the function is declared to return VALUE; callers that use the
 * return value with len == 0 would not get the string back.  rb_str_cat()
 * filters len == 0 before calling -- check the other callers.
 *
 * NOTE(review): listing line 2923 (between the offset computation and the
 * len == 0 check) is missing from this listing.
 */
2911static VALUE
2912str_buf_cat(VALUE str, const char *ptr, long len)
2913{
2914 long capa, total, olen, off = -1;
2915 char *sptr;
2916 const int termlen = TERM_LEN(str);
2917 assert(termlen < RSTRING_EMBED_LEN_MAX + 1); /* < (LONG_MAX/2) */
2918
2919 RSTRING_GETMEM(str, sptr, olen);
/* remember where ptr sits inside str's own buffer, if it does */
2920 if (ptr >= sptr && ptr <= sptr + olen) {
2921 off = ptr - sptr;
2922 }
2924 if (len == 0) return 0;
2925 if (STR_EMBED_P(str)) {
2926 capa = RSTRING_EMBED_LEN_MAX + 1 - termlen;
2927 sptr = RSTRING(str)->as.ary;
2928 olen = RSTRING_EMBED_LEN(str);
2929 }
2930 else {
2931 capa = RSTRING(str)->as.heap.aux.capa;
2932 sptr = RSTRING(str)->as.heap.ptr;
2933 olen = RSTRING(str)->as.heap.len;
2934 }
2935 if (olen > LONG_MAX - len) {
2936 rb_raise(rb_eArgError, "string sizes too big");
2937 }
2938 total = olen + len;
2939 if (capa < total) {
2940 if (total >= LONG_MAX / 2) {
2941 capa = total;
2942 }
2943 while (total > capa) {
2944 capa = 2 * capa + termlen; /* == 2*(capa+termlen)-termlen */
2945 }
2946 RESIZE_CAPA_TERM(str, capa, termlen);
2947 sptr = RSTRING_PTR(str);
2948 }
/* the buffer may have been replaced above: rebase a self-referencing ptr */
2949 if (off != -1) {
2950 ptr = sptr + off;
2951 }
2952 memcpy(sptr + olen, ptr, len);
2953 STR_SET_LEN(str, total);
2954 TERM_FILL(sptr + total, termlen); /* sentinel */
2955
2956 return str;
2957}
2958
/* Appends a NUL-terminated C string via str_buf_cat().  The +ptr+ argument
 * is expanded twice (as the data pointer and inside strlen), so it must be
 * free of side effects. */
2959#define str_buf_cat2(str, ptr) str_buf_cat((str), (ptr), strlen(ptr))
2960
2961VALUE
2962rb_str_cat(VALUE str, const char *ptr, long len)
2963{
2964 if (len == 0) return str;
2965 if (len < 0) {
2966 rb_raise(rb_eArgError, "negative string size (or size too big)");
2967 }
2968 return str_buf_cat(str, ptr, len);
2969}
2970
/*
 * Appends the NUL-terminated C string +ptr+ to +str+: +ptr+ is checked
 * with must_not_null() and then passed, with its strlen(), to
 * rb_str_buf_cat().
 *
 * NOTE(review): listing line 2972 (the function-name / parameter line) is
 * missing from this listing; presumably rb_str_cat_cstr -- confirm against
 * upstream.
 */
2971VALUE
2973{
2974 must_not_null(ptr);
2975 return rb_str_buf_cat(str, ptr, strlen(ptr));
2976}
2977
2981
/*
 * Appends +len+ bytes at +ptr+ (encoding index +ptr_encindex+, code range
 * +ptr_cr+) to +str+, then sets the encoding and code range of the result.
 *
 * When +ptr_cr_ret+ is non-null, the (possibly freshly scanned) code range
 * of the appended bytes is stored through it.
 *
 * Raises Encoding::CompatibilityError (rb_eEncCompatError) when the two
 * encodings differ and either
 *  - one of them is not ASCII-compatible and neither side is empty, or
 *  - neither code range is 7-bit.
 * Raises ArgumentError for a negative +len+ that reaches the append step.
 *
 * NOTE(review): listing numbers skip 2991, 3005 and 3068.  In particular
 * +str_cr+ is read below but its assignment is not visible here, and the
 * statement before ENCODING_CODERANGE_SET in the "str is empty" branch is
 * missing -- consult upstream string.c for those lines.
 */
2982static VALUE
2983rb_enc_cr_str_buf_cat(VALUE str, const char *ptr, long len,
2984 int ptr_encindex, int ptr_cr, int *ptr_cr_ret)
2985{
2986 int str_encindex = ENCODING_GET(str);
2987 int res_encindex;
2988 int str_cr, res_cr;
2989 rb_encoding *str_enc, *ptr_enc;
2990
2992
2993 if (str_encindex == ptr_encindex) {
/* same encoding: scan ptr only when str's code range is already known */
2994 if (str_cr != ENC_CODERANGE_UNKNOWN && ptr_cr == ENC_CODERANGE_UNKNOWN) {
2995 ptr_cr = coderange_scan(ptr, len, rb_enc_from_index(ptr_encindex));
2996 }
2997 }
2998 else {
2999 str_enc = rb_enc_from_index(str_encindex);
3000 ptr_enc = rb_enc_from_index(ptr_encindex);
3001 if (!rb_enc_asciicompat(str_enc) || !rb_enc_asciicompat(ptr_enc)) {
3002 if (len == 0)
3003 return str;
3004 if (RSTRING_LEN(str) == 0) {
3006 ENCODING_CODERANGE_SET(str, ptr_encindex, ptr_cr);
3007 return str;
3008 }
3009 goto incompatible;
3010 }
3011 if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
3012 ptr_cr = coderange_scan(ptr, len, ptr_enc);
3013 }
3014 if (str_cr == ENC_CODERANGE_UNKNOWN) {
3015 if (ENCODING_IS_ASCII8BIT(str) || ptr_cr != ENC_CODERANGE_7BIT) {
3016 str_cr = rb_enc_str_coderange(str);
3017 }
3018 }
3019 }
3020 if (ptr_cr_ret)
3021 *ptr_cr_ret = ptr_cr;
3022
/* different encodings can only be mixed when at least one side is 7-bit */
3023 if (str_encindex != ptr_encindex &&
3024 str_cr != ENC_CODERANGE_7BIT &&
3025 ptr_cr != ENC_CODERANGE_7BIT) {
3026 str_enc = rb_enc_from_index(str_encindex);
3027 ptr_enc = rb_enc_from_index(ptr_encindex);
3028 goto incompatible;
3029 }
3030
/* choose the encoding index and code range of the concatenated result */
3031 if (str_cr == ENC_CODERANGE_UNKNOWN) {
3032 res_encindex = str_encindex;
3033 res_cr = ENC_CODERANGE_UNKNOWN;
3034 }
3035 else if (str_cr == ENC_CODERANGE_7BIT) {
3036 if (ptr_cr == ENC_CODERANGE_7BIT) {
3037 res_encindex = str_encindex;
3038 res_cr = ENC_CODERANGE_7BIT;
3039 }
3040 else {
3041 res_encindex = ptr_encindex;
3042 res_cr = ptr_cr;
3043 }
3044 }
3045 else if (str_cr == ENC_CODERANGE_VALID) {
3046 res_encindex = str_encindex;
3047 if (ENC_CODERANGE_CLEAN_P(ptr_cr))
3048 res_cr = str_cr;
3049 else
3050 res_cr = ptr_cr;
3051 }
3052 else { /* str_cr == ENC_CODERANGE_BROKEN */
3053 res_encindex = str_encindex;
3054 res_cr = str_cr;
3055 if (0 < len) res_cr = ENC_CODERANGE_UNKNOWN;
3056 }
3057
3058 if (len < 0) {
3059 rb_raise(rb_eArgError, "negative string size (or size too big)");
3060 }
3061 str_buf_cat(str, ptr, len);
3062 ENCODING_CODERANGE_SET(str, res_encindex, res_cr);
3063 return str;
3064
3065 incompatible:
3066 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
3067 rb_enc_name(str_enc), rb_enc_name(ptr_enc));
3069}
3070
/*
 * Appends +len+ bytes at +ptr+, interpreted in encoding +ptr_enc+, to
 * +str+ by delegating to rb_enc_cr_str_buf_cat().
 *
 * NOTE(review): listing line 3075, which carries the remaining arguments
 * of the call, is missing from this listing.
 */
3071VALUE
3072rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *ptr_enc)
3073{
3074 return rb_enc_cr_str_buf_cat(str, ptr, len,
3076}
3077
/*
 * Appends the NUL-terminated ASCII string +ptr+ to +str+ and returns
 * +str+.
 *
 * When +str+'s encoding is ASCII-compatible the bytes are appended in one
 * call, tagged as 7-bit.  Otherwise each byte is treated as a code point,
 * encoded into +str+'s encoding with rb_enc_mbcput(), and appended one
 * character at a time.
 *
 * NOTE(review): listing line 3079 (the function-name / parameter line) is
 * missing from this listing; presumably rb_str_buf_cat_ascii -- confirm
 * against upstream.
 */
3078VALUE
3080{
3081 /* ptr must reference NUL terminated ASCII string. */
3082 int encindex = ENCODING_GET(str);
3083 rb_encoding *enc = rb_enc_from_index(encindex);
3084 if (rb_enc_asciicompat(enc)) {
3085 return rb_enc_cr_str_buf_cat(str, ptr, strlen(ptr),
3086 encindex, ENC_CODERANGE_7BIT, 0);
3087 }
3088 else {
/* scratch buffer sized for the longest character of this encoding */
3089 char *buf = ALLOCA_N(char, rb_enc_mbmaxlen(enc));
3090 while (*ptr) {
3091 unsigned int c = (unsigned char)*ptr;
3092 int len = rb_enc_codelen(c, enc);
3093 rb_enc_mbcput(c, buf, enc);
3094 rb_enc_cr_str_buf_cat(str, buf, len,
3095 encindex, ENC_CODERANGE_VALID, 0);
3096 ptr++;
3097 }
3098 return str;
3099 }
3100}
3101
/*
 * Appends the contents of +str2+ to +str+ and returns +str+.
 *
 * +str2+'s current code range is passed to rb_enc_cr_str_buf_cat(); the
 * code range that call reports back is then stored on +str2+ with
 * ENC_CODERANGE_SET, so a scan performed during the append is kept.
 *
 * NOTE(review): listing line 3103 (the function-name / parameter line) is
 * missing from this listing; presumably rb_str_buf_append -- confirm
 * against upstream.
 */
3102VALUE
3104{
3105 int str2_cr;
3106
3107 str2_cr = ENC_CODERANGE(str2);
3108
3109 rb_enc_cr_str_buf_cat(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
3110 ENCODING_GET(str2), str2_cr, &str2_cr);
3111
3112 ENC_CODERANGE_SET(str2, str2_cr);
3113
3114 return str;
3115}
3116
/*
 * Converts +str2+ with StringValue() and appends it to +str+ through
 * rb_str_buf_append(), returning that call's result.
 *
 * NOTE(review): listing line 3118 (the function-name / parameter line) is
 * missing from this listing; presumably rb_str_append -- confirm against
 * upstream.
 */
3117VALUE
3119{
3120 StringValue(str2);
3121 return rb_str_buf_append(str, str2);
3122}
3123
/* Length threshold used by rb_str_concat_literals(): totals below it start
 * from a copy of the first piece instead of taking the other branch. */
3124#define MIN_PRE_ALLOC_SIZE 48
3125
/*
 * Concatenates the +num+ strings in +strary+ into one new string.
 *
 * Zero pieces give an empty string; one piece gives rb_str_resurrect() of
 * that piece.  Otherwise the total byte length (plus one) is computed;
 * when it is below MIN_PRE_ALLOC_SIZE the result starts as a resurrected
 * copy of the first piece and the remaining pieces are appended, else all
 * pieces are appended to a string that takes its encoding from the first
 * piece.
 *
 * NOTE(review): listing numbers skip 3126, 3142 and 3154: the return-type
 * line, the statement that creates +str+ in the else branch, and the line
 * preceding rb_enc_set_index() are missing from this listing.
 */
3127rb_str_concat_literals(size_t num, const VALUE *strary)
3128{
3129 VALUE str;
3130 size_t i, s;
3131 long len = 1;
3132
3133 if (UNLIKELY(!num)) return rb_str_new(0, 0);
3134 if (UNLIKELY(num == 1)) return rb_str_resurrect(strary[0]);
3135
3136 for (i = 0; i < num; ++i) { len += RSTRING_LEN(strary[i]); }
3137 if (LIKELY(len < MIN_PRE_ALLOC_SIZE)) {
3138 str = rb_str_resurrect(strary[0]);
/* the first piece is already in str: start appending at index 1 */
3139 s = 1;
3140 }
3141 else {
3143 rb_enc_copy(str, strary[0]);
3144 s = 0;
3145 }
3146
3147 for (i = s; i < num; ++i) {
3148 const VALUE v = strary[i];
3149 int encidx = ENCODING_GET(v);
3150
3151 rb_enc_cr_str_buf_cat(str, RSTRING_PTR(v), RSTRING_LEN(v),
3152 encidx, ENC_CODERANGE(v), NULL);
3153 if (encidx != ENCINDEX_US_ASCII) {
3155 rb_enc_set_index(str, encidx);
3156 }
3157 }
3158 return str;
3159}
3160
3161/*
3162 * call-seq:
3163 * string.concat(*objects) -> string
3164 *
3165 * Concatenates each object in +objects+ to +self+ and returns +self+:
3166 *
3167 * s = 'foo'
3168 * s.concat('bar', 'baz') # => "foobarbaz"
3169 * s # => "foobarbaz"
3170 *
3171 * For each given object +object+ that is an \Integer,
3172 * the value is considered a codepoint and converted to a character before concatenation:
3173 * s = 'foo'
3174 * s.concat(32, 'bar', 32, 'baz') # => "foo bar baz"
3175 *
3176 * Related: String#<<, which takes a single argument.
3177 */
3178static VALUE
3179rb_str_concat_multi(int argc, VALUE *argv, VALUE str)
3180{
3181 str_modifiable(str);
3182
3183 if (argc == 1) {
3184 return rb_str_concat(str, argv[0]);
3185 }
3186 else if (argc > 1) {
3187 int i;
3188 VALUE arg_str = rb_str_tmp_new(0);
3189 rb_enc_copy(arg_str, str);
3190 for (i = 0; i < argc; i++) {
3191 rb_str_concat(arg_str, argv[i]);
3192 }
3193 rb_str_buf_append(str, arg_str);
3194 }
3195
3196 return str;
3197}
3198
3199/*
3200 * call-seq:
3201 * string << object -> string
3202 *
3203 * Concatenates +object+ to +self+ and returns +self+:
3204 *
3205 * s = 'foo'
3206 * s << 'bar' # => "foobar"
3207 * s # => "foobar"
3208 *
3209 * If +object+ is an \Integer,
3210 * the value is considered a codepoint and converted to a character before concatenation:
3211 * s = 'foo'
3212 * s << 33 # => "foo!"
3213 *
3214 * Related: String#concat, which takes multiple arguments.
3215 */
/*
 * Appends +str2+ to +str1+ and returns +str1+.
 *
 * A non-Integer +str2+ is delegated to rb_str_append().  An Integer is
 * converted to an unsigned code point; values that do not fit raise
 * RangeError.  For the two ASCII encoding indexes the code point must be
 * at most 0xFF and is appended as a single byte.  For other encodings the
 * code point is encoded with rb_enc_mbcput(), validated with
 * rb_enc_precise_mbclen(), and copied into the resized string.
 *
 * NOTE(review): listing numbers skip 3217, 3247-3248, 3258, 3261 and
 * 3274, so the function-name line, the body of the US-ASCII / code > 127
 * branch, two `case` labels of the switch and the statement controlled by
 * the final `if` are missing from this listing (the name is presumably
 * rb_str_concat -- confirm against upstream).
 */
3216VALUE
3218{
3219 unsigned int code;
3220 rb_encoding *enc = STR_ENC_GET(str1);
3221 int encidx;
3222
3223 if (RB_INTEGER_TYPE_P(str2)) {
/* zero result: conversion succeeded, fall through to the code below */
3224 if (rb_num_to_uint(str2, &code) == 0) {
3225 }
3226 else if (FIXNUM_P(str2)) {
3227 rb_raise(rb_eRangeError, "%ld out of char range", FIX2LONG(str2));
3228 }
3229 else {
3230 rb_raise(rb_eRangeError, "bignum out of char range");
3231 }
3232 }
3233 else {
3234 return rb_str_append(str1, str2);
3235 }
3236
3237 encidx = rb_enc_to_index(enc);
3238 if (encidx == ENCINDEX_ASCII || encidx == ENCINDEX_US_ASCII) {
3239 /* US-ASCII automatically extended to ASCII-8BIT */
3240 char buf[1];
3241 buf[0] = (char)code;
3242 if (code > 0xFF) {
3243 rb_raise(rb_eRangeError, "%u out of char range", code);
3244 }
3245 rb_str_cat(str1, buf, 1);
3246 if (encidx == ENCINDEX_US_ASCII && code > 127) {
3249 }
3250 }
3251 else {
3252 long pos = RSTRING_LEN(str1);
3253 int cr = ENC_CODERANGE(str1);
3254 int len;
3255 char *buf;
3256
3257 switch (len = rb_enc_codelen(code, enc)) {
3259 rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
3260 break;
3262 case 0:
3263 rb_raise(rb_eRangeError, "%u out of char range", code);
3264 break;
3265 }
3266 buf = ALLOCA_N(char, len + 1);
3267 rb_enc_mbcput(code, buf, enc);
/* re-parse the encoded bytes to reject code points the encoding cannot hold */
3268 if (rb_enc_precise_mbclen(buf, buf + len + 1, enc) != len) {
3269 rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
3270 }
3271 rb_str_resize(str1, pos+len);
3272 memcpy(RSTRING_PTR(str1) + pos, buf, len);
3273 if (cr == ENC_CODERANGE_7BIT && code > 127)
3275 ENC_CODERANGE_SET(str1, cr);
3276 }
3277 return str1;
3278}
3279
3280/*
3281 * call-seq:
3282 * string.prepend(*other_strings) -> string
3283 *
3284 * Prepends each string in +other_strings+ to +self+ and returns +self+:
3285 * s = 'foo'
3286 * s.prepend('bar', 'baz') # => "barbazfoo"
3287 * s # => "barbazfoo"
3288 *
3289 * Related: String#concat.
3290 */
3291
3292static VALUE
3293rb_str_prepend_multi(int argc, VALUE *argv, VALUE str)
3294{
3295 str_modifiable(str);
3296
3297 if (argc == 1) {
3298 rb_str_update(str, 0L, 0L, argv[0]);
3299 }
3300 else if (argc > 1) {
3301 int i;
3302 VALUE arg_str = rb_str_tmp_new(0);
3303 rb_enc_copy(arg_str, str);
3304 for (i = 0; i < argc; i++) {
3305 rb_str_append(arg_str, argv[i]);
3306 }
3307 rb_str_update(str, 0L, 0L, arg_str);
3308 }
3309
3310 return str;
3311}
3312
/*
 * Computes a hash of +str+: rb_memhash() over the string's bytes, XORed
 * with +e+, which starts as the string's encoding index and is reset to 0
 * under a condition that is not visible here.
 *
 * NOTE(review): listing numbers skip 3313-3314 and 3317: the return-type
 * and function-name lines and the `if` that guards `e = 0` are missing
 * from this listing (presumably rb_str_hash -- confirm against upstream).
 */
3315{
3316 int e = ENCODING_GET(str);
3318 e = 0;
3319 }
3320 return rb_memhash((const void *)RSTRING_PTR(str), RSTRING_LEN(str)) ^ e;
3321}
3322
/*
 * Equality test in comparison-function form: returns 0 when the two
 * strings have the same byte length, are comparable according to
 * rb_str_comparable(), and hold identical bytes; returns non-zero
 * otherwise.
 *
 * NOTE(review): listing line 3324 (the function-name / parameter line) is
 * missing from this listing; presumably rb_str_hash_cmp -- confirm against
 * upstream.
 */
3323int
3325{
3326 long len1, len2;
3327 const char *ptr1, *ptr2;
3328 RSTRING_GETMEM(str1, ptr1, len1);
3329 RSTRING_GETMEM(str2, ptr2, len2);
3330 return (len1 != len2 ||
3331 !rb_str_comparable(str1, str2) ||
3332 memcmp(ptr1, ptr2, len1) != 0);
3333}
3334
3335/*
3336 * call-seq:
3337 * string.hash -> integer
3338 *
3339 * Returns the integer hash value for +self+.
3340 * The value is based on the length, content and encoding of +self+.
3341 */
3342
3343static VALUE
3344rb_str_hash_m(VALUE str)
3345{
3346 st_index_t hval = rb_str_hash(str);
3347 return ST2FIX(hval);
3348}
3349
/* Smaller of two values.  Each argument is expanded twice, so arguments
 * must be free of side effects. */
3350#define lesser(a,b) (((a)>(b))?(b):(a))
3351
/*
 * Decides whether two strings may be compared byte-wise.
 *
 * Visible rules: TRUE when either string is empty, when both have the same
 * encoding index, or when both code ranges are 7-bit; FALSE when none of
 * the visible rules (nor the two partially visible ones) applies.
 *
 * NOTE(review): listing numbers skip 3353, 3367 and 3371: the
 * function-name line and the conditions that guard the second and third
 * `return TRUE` inside the 7-bit branches are missing from this listing
 * (presumably rb_str_comparable -- confirm against upstream).
 */
3352int
3354{
3355 int idx1, idx2;
3356 int rc1, rc2;
3357
3358 if (RSTRING_LEN(str1) == 0) return TRUE;
3359 if (RSTRING_LEN(str2) == 0) return TRUE;
3360 idx1 = ENCODING_GET(str1);
3361 idx2 = ENCODING_GET(str2);
3362 if (idx1 == idx2) return TRUE;
3363 rc1 = rb_enc_str_coderange(str1);
3364 rc2 = rb_enc_str_coderange(str2);
3365 if (rc1 == ENC_CODERANGE_7BIT) {
3366 if (rc2 == ENC_CODERANGE_7BIT) return TRUE;
3368 return TRUE;
3369 }
3370 if (rc2 == ENC_CODERANGE_7BIT) {
3372 return TRUE;
3373 }
3374 return FALSE;
3375}
3376
/*
 * Three-way comparison of two strings; returns -1, 0 or 1.
 *
 * Identical objects compare equal.  Otherwise the common prefix is
 * compared with memcmp(); on a tie the longer string is greater.  When
 * bytes and lengths are both equal but the strings are not comparable
 * (rb_str_comparable()), the encoding indexes break the tie.
 *
 * NOTE(review): listing line 3378 (the function-name / parameter line) is
 * missing from this listing; presumably rb_str_cmp -- confirm against
 * upstream.
 */
3377int
3379{
3380 long len1, len2;
3381 const char *ptr1, *ptr2;
3382 int retval;
3383
3384 if (str1 == str2) return 0;
3385 RSTRING_GETMEM(str1, ptr1, len1);
3386 RSTRING_GETMEM(str2, ptr2, len2);
/* same buffer or equal common prefix: decide by length, then by encoding */
3387 if (ptr1 == ptr2 || (retval = memcmp(ptr1, ptr2, lesser(len1, len2))) == 0) {
3388 if (len1 == len2) {
3389 if (!rb_str_comparable(str1, str2)) {
3390 if (ENCODING_GET(str1) > ENCODING_GET(str2))
3391 return 1;
3392 return -1;
3393 }
3394 return 0;
3395 }
3396 if (len1 > len2) return 1;
3397 return -1;
3398 }
3399 if (retval > 0) return 1;
3400 return -1;
3401}
3402
3403/*
3404 * call-seq:
3405 * string == object -> true or false
3406 * string === object -> true or false
3407 *
3408 * Returns +true+ if +object+ has the same length and content
3409 * as +self+; +false+ otherwise:
3410 * s = 'foo'
3411 * s == 'foo' # => true
3412 * s == 'food' # => false
3413 * s == 'FOO' # => false
3414 *
3415 * Returns +false+ if the two strings' encodings are not compatible:
3416 * "\u{e4 f6 fc}".encode("ISO-8859-1") == ("\u{c4 d6 dc}") # => false
3417 *
3418 * If +object+ is not an instance of \String but responds to +to_str+, then the
3419 * two strings are compared using <code>object.==</code>.
3420 */
3421
/*
 * Equality as used by String#== : identical objects are equal; a
 * non-String +str2+ is unequal unless it responds to to_str, in which case
 * the answer is rb_equal(str2, str1) (the comparison is delegated to the
 * other object); two Strings are compared by rb_str_eql_internal().
 *
 * NOTE(review): listing line 3423 (the function-name / parameter line) is
 * missing from this listing; presumably rb_str_equal -- confirm against
 * upstream.
 */
3422VALUE
3424{
3425 if (str1 == str2) return Qtrue;
3426 if (!RB_TYPE_P(str2, T_STRING)) {
3427 if (!rb_respond_to(str2, idTo_str)) {
3428 return Qfalse;
3429 }
3430 return rb_equal(str2, str1);
3431 }
3432 return rb_str_eql_internal(str1, str2);
3433}
3434
3435/*
3436 * call-seq:
3437 * string.eql?(object) -> true or false
3438 *
3439 * Returns +true+ if +object+ has the same length and content
3440 * as +self+; +false+ otherwise:
3441 * s = 'foo'
3442 * s.eql?('foo') # => true
3443 * s.eql?('food') # => false
3444 * s.eql?('FOO') # => false
3445 *
3446 * Returns +false+ if the two strings' encodings are not compatible:
3447 * "\u{e4 f6 fc}".encode("ISO-8859-1").eql?("\u{c4 d6 dc}") # => false
3448 */
3449
/*
 * Strict equality: identical objects give Qtrue, a non-String +str2+ gives
 * Qfalse (no to_str conversion is attempted), and two Strings are compared
 * by rb_str_eql_internal().
 *
 * NOTE(review): listing lines 3450-3451 (return type and function-name /
 * parameter line) are missing from this listing; presumably rb_str_eql,
 * which str_casecmp_p() calls -- confirm against upstream.
 */
3452{
3453 if (str1 == str2) return Qtrue;
3454 if (!RB_TYPE_P(str2, T_STRING)) return Qfalse;
3455 return rb_str_eql_internal(str1, str2);
3456}
3457
3458/*
3459 * call-seq:
3460 * string <=> other_string -> -1, 0, 1, or nil
3461 *
3462 * Compares +self+ and +other_string+, returning:
3463 * - -1 if +other_string+ is larger.
3464 * - 0 if the two are equal.
3465 * - 1 if +other_string+ is smaller.
3466 * - +nil+ if the two are incomparable.
3467 *
3468 * Examples:
3469 * 'foo' <=> 'foo' # => 0
3470 * 'foo' <=> 'food' # => -1
3471 * 'food' <=> 'foo' # => 1
3472 * 'FOO' <=> 'foo' # => -1
3473 * 'foo' <=> 'FOO' # => 1
3474 * 'foo' <=> 1 # => nil
3475 */
3476
3477static VALUE
3478rb_str_cmp_m(VALUE str1, VALUE str2)
3479{
3480 int result;
3481 VALUE s = rb_check_string_type(str2);
3482 if (NIL_P(s)) {
3483 return rb_invcmp(str1, str2);
3484 }
3485 result = rb_str_cmp(str1, s);
3486 return INT2FIX(result);
3487}
3488
/* Forward declarations for the helpers that the String#casecmp and
 * String#casecmp? method bodies call before their definitions. */
3489static VALUE str_casecmp(VALUE str1, VALUE str2);
3490static VALUE str_casecmp_p(VALUE str1, VALUE str2);
3491
3492/*
3493 * call-seq:
3494 * string.casecmp(other_string) -> -1, 0, 1, or nil
3495 *
3496 * Compares +self+ and +other_string+, ignoring case, and returning:
3497 * - -1 if +other_string+ is larger.
3498 * - 0 if the two are equal.
3499 * - 1 if +other_string+ is smaller.
3500 * - +nil+ if the two are incomparable.
3501 *
3502 * Examples:
3503 * 'foo'.casecmp('foo') # => 0
3504 * 'foo'.casecmp('food') # => -1
3505 * 'food'.casecmp('foo') # => 1
3506 * 'FOO'.casecmp('foo') # => 0
3507 * 'foo'.casecmp('FOO') # => 0
3508 * 'foo'.casecmp(1) # => nil
3509 */
3510
3511static VALUE
3512rb_str_casecmp(VALUE str1, VALUE str2)
3513{
3514 VALUE s = rb_check_string_type(str2);
3515 if (NIL_P(s)) {
3516 return Qnil;
3517 }
3518 return str_casecmp(str1, s);
3519}
3520
3521static VALUE
3522str_casecmp(VALUE str1, VALUE str2)
3523{
3524 long len;
3525 rb_encoding *enc;
3526 char *p1, *p1end, *p2, *p2end;
3527
3528 enc = rb_enc_compatible(str1, str2);
3529 if (!enc) {
3530 return Qnil;
3531 }
3532
3533 p1 = RSTRING_PTR(str1); p1end = RSTRING_END(str1);
3534 p2 = RSTRING_PTR(str2); p2end = RSTRING_END(str2);
3535 if (single_byte_optimizable(str1) && single_byte_optimizable(str2)) {
3536 while (p1 < p1end && p2 < p2end) {
3537 if (*p1 != *p2) {
3538 unsigned int c1 = TOLOWER(*p1 & 0xff);
3539 unsigned int c2 = TOLOWER(*p2 & 0xff);
3540 if (c1 != c2)
3541 return INT2FIX(c1 < c2 ? -1 : 1);
3542 }
3543 p1++;
3544 p2++;
3545 }
3546 }
3547 else {
3548 while (p1 < p1end && p2 < p2end) {
3549 int l1, c1 = rb_enc_ascget(p1, p1end, &l1, enc);
3550 int l2, c2 = rb_enc_ascget(p2, p2end, &l2, enc);
3551
3552 if (0 <= c1 && 0 <= c2) {
3553 c1 = TOLOWER(c1);
3554 c2 = TOLOWER(c2);
3555 if (c1 != c2)
3556 return INT2FIX(c1 < c2 ? -1 : 1);
3557 }
3558 else {
3559 int r;
3560 l1 = rb_enc_mbclen(p1, p1end, enc);
3561 l2 = rb_enc_mbclen(p2, p2end, enc);
3562 len = l1 < l2 ? l1 : l2;
3563 r = memcmp(p1, p2, len);
3564 if (r != 0)
3565 return INT2FIX(r < 0 ? -1 : 1);
3566 if (l1 != l2)
3567 return INT2FIX(l1 < l2 ? -1 : 1);
3568 }
3569 p1 += l1;
3570 p2 += l2;
3571 }
3572 }
3573 if (RSTRING_LEN(str1) == RSTRING_LEN(str2)) return INT2FIX(0);
3574 if (RSTRING_LEN(str1) > RSTRING_LEN(str2)) return INT2FIX(1);
3575 return INT2FIX(-1);
3576}
3577
3578/*
3579 * call-seq:
3580 * string.casecmp?(other_string) -> true, false, or nil
3581 *
3582 * Returns +true+ if +self+ and +other_string+ are equal after
3583 * Unicode case folding, otherwise +false+:
3584 * 'foo'.casecmp?('foo') # => true
3585 * 'foo'.casecmp?('food') # => false
3586 * 'food'.casecmp?('foo') # => false
3587 * 'FOO'.casecmp?('foo') # => true
3588 * 'foo'.casecmp?('FOO') # => true
3589 *
3590 * Returns +nil+ if the two values are incomparable:
3591 * 'foo'.casecmp?(1) # => nil
3592 */
3593
3594static VALUE
3595rb_str_casecmp_p(VALUE str1, VALUE str2)
3596{
3597 VALUE s = rb_check_string_type(str2);
3598 if (NIL_P(s)) {
3599 return Qnil;
3600 }
3601 return str_casecmp_p(str1, s);
3602}
3603
3604static VALUE
3605str_casecmp_p(VALUE str1, VALUE str2)
3606{
3607 rb_encoding *enc;
3608 VALUE folded_str1, folded_str2;
3609 VALUE fold_opt = sym_fold;
3610
3611 enc = rb_enc_compatible(str1, str2);
3612 if (!enc) {
3613 return Qnil;
3614 }
3615
3616 folded_str1 = rb_str_downcase(1, &fold_opt, str1);
3617 folded_str2 = rb_str_downcase(1, &fold_opt, str2);
3618
3619 return rb_str_eql(folded_str1, folded_str2);
3620}
3621
3622static long
3623strseq_core(const char *str_ptr, const char *str_ptr_end, long str_len,
3624 const char *sub_ptr, long sub_len, long offset, rb_encoding *enc)
3625{
3626 const char *search_start = str_ptr;
3627 long pos, search_len = str_len - offset;
3628
3629 for (;;) {
3630 const char *t;
3631 pos = rb_memsearch(sub_ptr, sub_len, search_start, search_len, enc);
3632 if (pos < 0) return pos;
3633 t = rb_enc_right_char_head(search_start, search_start+pos, str_ptr_end, enc);
3634 if (t == search_start + pos) break;
3635 search_len -= t - search_start;
3636 if (search_len <= 0) return -1;
3637 offset += t - search_start;
3638 search_start = t;
3639 }
3640 return pos + offset;
3641}
3642
/* Convenience form of rb_strseq_index() with in_byte = 0. */
3643#define rb_str_index(str, sub, offset) rb_strseq_index(str, sub, offset, 0)
3644
3645static long
3646rb_strseq_index(VALUE str, VALUE sub, long offset, int in_byte)
3647{
3648 const char *str_ptr, *str_ptr_end, *sub_ptr;
3649 long str_len, sub_len;
3650 rb_encoding *enc;
3651
3652 enc = rb_enc_check(str, sub);
3653 if (is_broken_string(sub)) return -1;
3654
3655 str_ptr = RSTRING_PTR(str);
3656 str_ptr_end = RSTRING_END(str);
3657 str_len = RSTRING_LEN(str);
3658 sub_ptr = RSTRING_PTR(sub);
3659 sub_len = RSTRING_LEN(sub);
3660
3661 if (str_len < sub_len) return -1;
3662
3663 if (offset != 0) {
3664 long str_len_char, sub_len_char;
3665 int single_byte = single_byte_optimizable(str);
3666 str_len_char = (in_byte || single_byte) ? str_len : str_strlen(str, enc);
3667 sub_len_char = in_byte ? sub_len : str_strlen(sub, enc);
3668 if (offset < 0) {
3669 offset += str_len_char;
3670 if (offset < 0) return -1;
3671 }
3672 if (str_len_char - offset < sub_len_char) return -1;
3673 if (!in_byte) offset = str_offset(str_ptr, str_ptr_end, offset, enc, single_byte);
3674 str_ptr += offset;
3675 }
3676 if (sub_len == 0) return offset;
3677
3678 /* need proceed one character at a time */
3679 return strseq_core(str_ptr, str_ptr_end, str_len, sub_ptr, sub_len, offset, enc);
3680}
3681
3682
3683/*
3684 * call-seq:
3685 * string.index(substring, offset = 0) -> integer or nil
3686 * string.index(regexp, offset = 0) -> integer or nil
3687 *
3688 * Returns the \Integer index of the first occurrence of the given +substring+,
3689 * or +nil+ if none found:
3690 * 'foo'.index('f') # => 0
3691 * 'foo'.index('o') # => 1
3692 * 'foo'.index('oo') # => 1
3693 * 'foo'.index('ooo') # => nil
3694 *
3695 * Returns the \Integer index of the first match for the given \Regexp +regexp+,
3696 * or +nil+ if none found:
3697 * 'foo'.index(/f/) # => 0
3698 * 'foo'.index(/o/) # => 1
3699 * 'foo'.index(/oo/) # => 1
3700 * 'foo'.index(/ooo/) # => nil
3701 *
3702 * \Integer argument +offset+, if given, specifies the position in the
3703 * string to begin the search:
3704 * 'foo'.index('o', 1) # => 1
3705 * 'foo'.index('o', 2) # => 2
3706 * 'foo'.index('o', 3) # => nil
3707 *
3708 * If +offset+ is negative, counts backward from the end of +self+:
3709 * 'foo'.index('o', -1) # => 2
3710 * 'foo'.index('o', -2) # => 1
3711 * 'foo'.index('o', -3) # => 1
3712 * 'foo'.index('o', -4) # => nil
3713 *
3714 * Related: String#rindex
3715 */
3716
/*
 * Method body for String#index(sub, pos = 0).
 *
 * A negative +pos+ is taken relative to the string's character length;
 * if it is still negative the result is nil.  For a Regexp, +pos+ is
 * converted to a byte offset, rb_reg_search() is run forward, and the
 * byte position of the match start is converted back to a character
 * index.  For anything else, rb_str_index() is used and its result is
 * converted with rb_str_sublen(); -1 becomes nil.
 *
 * NOTE(review): listing numbers skip 3734, 3749 and 3756: the statement
 * inside the Regexp check of the negative-position branch, the declaration
 * of +match+ used by RMATCH_REGS, and the first statement of the
 * non-Regexp branch are missing from this listing.
 */
3717static VALUE
3718rb_str_index_m(int argc, VALUE *argv, VALUE str)
3719{
3720 VALUE sub;
3721 VALUE initpos;
3722 long pos;
3723
3724 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
3725 pos = NUM2LONG(initpos);
3726 }
3727 else {
3728 pos = 0;
3729 }
3730 if (pos < 0) {
3731 pos += str_strlen(str, NULL);
3732 if (pos < 0) {
3733 if (RB_TYPE_P(sub, T_REGEXP)) {
3735 }
3736 return Qnil;
3737 }
3738 }
3739
3740 if (RB_TYPE_P(sub, T_REGEXP)) {
3741 if (pos > str_strlen(str, NULL))
3742 return Qnil;
/* character index -> byte offset for the regexp engine */
3743 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
3744 rb_enc_check(str, sub), single_byte_optimizable(str));
3745
3746 if (rb_reg_search(sub, str, pos, 0) < 0) {
3747 return Qnil;
3748 } else {
3750 struct re_registers *regs = RMATCH_REGS(match);
/* byte offset of the match start -> character index */
3751 pos = rb_str_sublen(str, BEG(0));
3752 return LONG2NUM(pos);
3753 }
3754 }
3755 else {
3757 pos = rb_str_index(str, sub, pos);
3758 pos = rb_str_sublen(str, pos);
3759 }
3760
3761 if (pos == -1) return Qnil;
3762 return LONG2NUM(pos);
3763}
3764
3765#ifdef HAVE_MEMRCHR
3766static long
3767str_rindex(VALUE str, VALUE sub, const char *s, long pos, rb_encoding *enc)
3768{
3769 char *hit, *adjusted;
3770 int c;
3771 long slen, searchlen;
3772 char *sbeg, *e, *t;
3773
3774 slen = RSTRING_LEN(sub);
3775 if (slen == 0) return pos;
3776 sbeg = RSTRING_PTR(str);
3777 e = RSTRING_END(str);
3778 t = RSTRING_PTR(sub);
3779 c = *t & 0xff;
3780 searchlen = s - sbeg + 1;
3781
3782 do {
3783 hit = memrchr(sbeg, c, searchlen);
3784 if (!hit) break;
3785 adjusted = rb_enc_left_char_head(sbeg, hit, e, enc);
3786 if (hit != adjusted) {
3787 searchlen = adjusted - sbeg;
3788 continue;
3789 }
3790 if (memcmp(hit, t, slen) == 0)
3791 return rb_str_sublen(str, hit - sbeg);
3792 searchlen = adjusted - sbeg;
3793 } while (searchlen > 0);
3794
3795 return -1;
3796}
3797#else
3798static long
3799str_rindex(VALUE str, VALUE sub, const char *s, long pos, rb_encoding *enc)
3800{
3801 long slen;
3802 char *sbeg, *e, *t;
3803
3804 sbeg = RSTRING_PTR(str);
3805 e = RSTRING_END(str);
3806 t = RSTRING_PTR(sub);
3807 slen = RSTRING_LEN(sub);
3808
3809 while (s) {
3810 if (memcmp(s, t, slen) == 0) {
3811 return pos;
3812 }
3813 if (pos == 0) break;
3814 pos--;
3815 s = rb_enc_prev_char(sbeg, s, e, enc);
3816 }
3817
3818 return -1;
3819}
3820#endif
3821
3822static long
3823rb_str_rindex(VALUE str, VALUE sub, long pos)
3824{
3825 long len, slen;
3826 char *sbeg, *s;
3827 rb_encoding *enc;
3828 int singlebyte;
3829
3830 enc = rb_enc_check(str, sub);
3831 if (is_broken_string(sub)) return -1;
3832 singlebyte = single_byte_optimizable(str);
3833 len = singlebyte ? RSTRING_LEN(str) : str_strlen(str, enc); /* rb_enc_check */
3834 slen = str_strlen(sub, enc); /* rb_enc_check */
3835
3836 /* substring longer than string */
3837 if (len < slen) return -1;
3838 if (len - pos < slen) pos = len - slen;
3839 if (len == 0) return pos;
3840
3841 sbeg = RSTRING_PTR(str);
3842
3843 if (pos == 0) {
3844 if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
3845 return 0;
3846 else
3847 return -1;
3848 }
3849
3850 s = str_nth(sbeg, RSTRING_END(str), pos, enc, singlebyte);
3851 return str_rindex(str, sub, s, pos, enc);
3852}
3853
3854/*
3855 * call-seq:
3856 * string.rindex(substring, offset = self.length) -> integer or nil
3857 * string.rindex(regexp, offset = self.length) -> integer or nil
3858 *
3859 * Returns the \Integer index of the _last_ occurrence of the given +substring+,
3860 * or +nil+ if none found:
3861 * 'foo'.rindex('f') # => 0
3862 * 'foo'.rindex('o') # => 2
3863 * 'foo'.rindex('oo') # => 1
3864 * 'foo'.rindex('ooo') # => nil
3865 *
3866 * Returns the \Integer index of the _last_ match for the given \Regexp +regexp+,
3867 * or +nil+ if none found:
3868 * 'foo'.rindex(/f/) # => 0
3869 * 'foo'.rindex(/o/) # => 2
3870 * 'foo'.rindex(/oo/) # => 1
3871 * 'foo'.rindex(/ooo/) # => nil
3872 *
3873 * The _last_ match means starting at the possible last position, not
3874 * the last of longest matches.
3875 *
3876 * 'foo'.rindex(/o+/) # => 2
3877 * $~ #=> #<MatchData "o">
3878 *
3879 * To get the last longest match, combine the pattern with a negative
3880 * lookbehind.
3881 *
3882 * 'foo'.rindex(/(?<!o)o+/) # => 1
3883 * $~ #=> #<MatchData "oo">
3884 *
3885 * Or use String#index with a negative lookahead.
3886 *
3887 * 'foo'.index(/o+(?!.*o)/) # => 1
3888 * $~ #=> #<MatchData "oo">
3889 *
3890 * \Integer argument +offset+, if given and non-negative, specifies the maximum starting position in the
3891 * string to _end_ the search:
3892 * 'foo'.rindex('o', 0) # => nil
3893 * 'foo'.rindex('o', 1) # => 1
3894 * 'foo'.rindex('o', 2) # => 2
3895 * 'foo'.rindex('o', 3) # => 2
3896 *
3897 * If +offset+ is a negative \Integer, the maximum starting position in the
3898 * string to _end_ the search is the sum of the string's length and +offset+:
3899 * 'foo'.rindex('o', -1) # => 2
3900 * 'foo'.rindex('o', -2) # => 1
3901 * 'foo'.rindex('o', -3) # => nil
3902 * 'foo'.rindex('o', -4) # => nil
3903 *
3904 * Related: String#index
3905 */
3906
/*
 * Method body for String#rindex(sub, pos = length).
 *
 * +pos+ defaults to the character length of the string; a negative value
 * is taken relative to the end (nil if still negative) and a value past
 * the end is clamped to the length.  For a Regexp, +pos+ is converted to
 * a byte offset, rb_reg_search() is run in reverse (last argument 1), and
 * the byte position of the match start is converted back to a character
 * index.  For anything else, rb_str_rindex() is used.  Returns nil when
 * nothing is found.
 *
 * NOTE(review): listing numbers skip 3921, 3938 and 3945: the statement
 * inside the Regexp check of the negative-position branch, the declaration
 * of +match+ used by RMATCH_REGS, and the first statement of the
 * non-Regexp branch are missing from this listing.
 */
3907static VALUE
3908rb_str_rindex_m(int argc, VALUE *argv, VALUE str)
3909{
3910 VALUE sub;
3911 VALUE vpos;
3912 rb_encoding *enc = STR_ENC_GET(str);
3913 long pos, len = str_strlen(str, enc); /* str's enc */
3914
3915 if (rb_scan_args(argc, argv, "11", &sub, &vpos) == 2) {
3916 pos = NUM2LONG(vpos);
3917 if (pos < 0) {
3918 pos += len;
3919 if (pos < 0) {
3920 if (RB_TYPE_P(sub, T_REGEXP)) {
3922 }
3923 return Qnil;
3924 }
3925 }
3926 if (pos > len) pos = len;
3927 }
3928 else {
3929 pos = len;
3930 }
3931
3932 if (RB_TYPE_P(sub, T_REGEXP)) {
3933 /* enc = rb_get_check(str, sub); */
/* character index -> byte offset for the regexp engine */
3934 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
3935 enc, single_byte_optimizable(str));
3936
3937 if (rb_reg_search(sub, str, pos, 1) >= 0) {
3939 struct re_registers *regs = RMATCH_REGS(match);
/* byte offset of the match start -> character index */
3940 pos = rb_str_sublen(str, BEG(0));
3941 return LONG2NUM(pos);
3942 }
3943 }
3944 else {
3946 pos = rb_str_rindex(str, sub, pos);
3947 if (pos >= 0) return LONG2NUM(pos);
3948 }
3949 return Qnil;
3950}
3951
3952/*
3953 * call-seq:
3954 * string =~ regexp -> integer or nil
3955 * string =~ object -> integer or nil
3956 *
3957 * Returns the \Integer index of the first substring that matches
3958 * the given +regexp+, or +nil+ if no match found:
3959 * 'foo' =~ /f/ # => 0
3960 * 'foo' =~ /o/ # => 1
3961 * 'foo' =~ /x/ # => nil
3962 *
3963 * Note: also updates
3964 * {Regexp-related global variables}[Regexp.html#class-Regexp-label-Special+global+variables].
3965 *
3966 * If the given +object+ is not a \Regexp, returns the value
3967 * returned by <tt>object =~ self</tt>.
3968 *
3969 * Note that <tt>string =~ regexp</tt> is different from <tt>regexp =~ string</tt>
3970 * (see {Regexp#=~}[https://ruby-doc.org/core-2.7.1/Regexp.html#method-i-3D-7E]):
3971 * number= nil
3972 * "no. 9" =~ /(?<number>\d+)/
3973 * number # => nil (not assigned)
3974 * /(?<number>\d+)/ =~ "no. 9"
3975 * number #=> "9"
3976 */
3977
3978static VALUE
3979rb_str_match(VALUE x, VALUE y)
3980{
3981 switch (OBJ_BUILTIN_TYPE(y)) {
3982 case T_STRING:
3983 rb_raise(rb_eTypeError, "type mismatch: String given");
3984
3985 case T_REGEXP:
3986 return rb_reg_match(y, x);
3987
3988 default:
3989 return rb_funcall(y, idEqTilde, 1, x);
3990 }
3991}
3992
3993
/* Forward declaration: rb_str_match_m() below calls get_pat() on its
 * pattern argument before the definition appears. */
3994static VALUE get_pat(VALUE);
3995
3996
3997/*
3998 * call-seq:
3999 * string.match(pattern, offset = 0) -> matchdata or nil
4000 * string.match(pattern, offset = 0) {|matchdata| ... } -> object
4001 *
4002 * Returns a \MatchData object (or +nil+) based on +self+ and the given +pattern+.
4003 *
4004 * Note: also updates
4005 * {Regexp-related global variables}[Regexp.html#class-Regexp-label-Special+global+variables].
4006 *
4007 * - Computes +regexp+ by converting +pattern+ (if not already a \Regexp).
4008 * regexp = Regexp.new(pattern)
4009 * - Computes +matchdata+, which will be either a \MatchData object or +nil+
4010 * (see Regexp#match):
4011 * matchdata = regexp.match(self)
4012 *
4013 * With no block given, returns the computed +matchdata+:
4014 * 'foo'.match('f') # => #<MatchData "f">
4015 * 'foo'.match('o') # => #<MatchData "o">
4016 * 'foo'.match('x') # => nil
4017 *
4018 * If \Integer argument +offset+ is given, the search begins at index +offset+:
4019 * 'foo'.match('f', 1) # => nil
4020 * 'foo'.match('o', 1) # => #<MatchData "o">
4021 *
4022 * With a block given, calls the block with the computed +matchdata+
4023 * and returns the block's return value:
4024 * 'foo'.match(/o/) {|matchdata| matchdata } # => #<MatchData "o">
4025 * 'foo'.match(/x/) {|matchdata| matchdata } # => nil
4026 * 'foo'.match(/f/, 1) {|matchdata| matchdata } # => nil
4027 */
4028
4029static VALUE
4030rb_str_match_m(int argc, VALUE *argv, VALUE str)
4031{
4032 VALUE re, result;
4033 if (argc < 1)
4034 rb_check_arity(argc, 1, 2);
4035 re = argv[0];
4036 argv[0] = str;
4037 result = rb_funcallv(get_pat(re), rb_intern("match"), argc, argv);
4038 if (!NIL_P(result) && rb_block_given_p()) {
4039 return rb_yield(result);
4040 }
4041 return result;
4042}
4043
4044/*
4045 * call-seq:
4046 * string.match?(pattern, offset = 0) -> true or false
4047 *
4048 * Returns +true+ or +false+ based on whether a match is found for +self+ and +pattern+.
4049 *
4050 * Note: does not update
4051 * {Regexp-related global variables}[Regexp.html#class-Regexp-label-Special+global+variables].
4052 *
4053 * Computes +regexp+ by converting +pattern+ (if not already a \Regexp).
4054 * regexp = Regexp.new(pattern)
4055 *
4056 * Returns +true+ if <tt>self.match(regexp)</tt> returns a \MatchData object,
4057 * +false+ otherwise:
4058 * 'foo'.match?(/o/) # => true
4059 * 'foo'.match?('o') # => true
4060 * 'foo'.match?(/x/) # => false
4061 *
4062 * If \Integer argument +offset+ is given, the search begins at index +offset+:
4063 * 'foo'.match?('f', 1) # => false
4064 * 'foo'.match?('o', 1) # => true
4065 */
4066
4067static VALUE
4068rb_str_match_m_p(int argc, VALUE *argv, VALUE str)
4069{
4070 VALUE re;
4071 rb_check_arity(argc, 1, 2);
4072 re = get_pat(argv[0]);
4073 return rb_reg_match_p(re, str, argc > 1 ? NUM2LONG(argv[1]) : 0);
4074}
4075
4081
/*
 * Overwrites the +len+ bytes at +p+ with the next byte sequence that +enc+
 * reports as one complete character of exactly +len+ bytes.
 *
 * Returns NEIGHBOR_FOUND when such a character has been written to +p+,
 * NEIGHBOR_WRAPPED when the bytes rolled over (or, for mbminlen > 1, the
 * incremented code point needs a different byte length), and
 * NEIGHBOR_NOT_CHAR when a character in an mbminlen > 1 encoding is invalid.
 */
4082static enum neighbor_char
4083enc_succ_char(char *p, long len, rb_encoding *enc)
4084{
4085 long i;
4086 int l;
4087
4088 if (rb_enc_mbminlen(enc) > 1) {
4089 /* wchar, trivial case */
/* decode the code point, add one, and re-encode it in place */
4090 int r = rb_enc_precise_mbclen(p, p + len, enc), c;
4091 if (!MBCLEN_CHARFOUND_P(r)) {
4092 return NEIGHBOR_NOT_CHAR;
4093 }
4094 c = rb_enc_mbc_to_codepoint(p, p + len, enc) + 1;
4095 l = rb_enc_code_to_mbclen(c, enc);
4096 if (!l) return NEIGHBOR_NOT_CHAR;
4097 if (l != len) return NEIGHBOR_WRAPPED;
4098 rb_enc_mbcput(c, p, enc);
/* re-validate: the incremented code point may not be a legal character */
4099 r = rb_enc_precise_mbclen(p, p + len, enc);
4100 if (!MBCLEN_CHARFOUND_P(r)) {
4101 return NEIGHBOR_NOT_CHAR;
4102 }
4103 return NEIGHBOR_FOUND;
4104 }
/* byte-wise increment with carry, repeated until a valid character appears */
4105 while (1) {
/* trailing 0xff bytes roll over to 0x00; i ends on the byte to bump */
4106 for (i = len-1; 0 <= i && (unsigned char)p[i] == 0xff; i--)
4107 p[i] = '\0';
4108 if (i < 0)
4109 return NEIGHBOR_WRAPPED;
4110 ++((unsigned char*)p)[i];
4111 l = rb_enc_precise_mbclen(p, p+len, enc);
4112 if (MBCLEN_CHARFOUND_P(l)) {
4113 l = MBCLEN_CHARFOUND_LEN(l);
4114 if (l == len) {
4115 return NEIGHBOR_FOUND;
4116 }
4117 else {
/* a shorter character was found: saturate the rest so the next pass carries */
4118 memset(p+l, 0xff, len-l);
4119 }
4120 }
4121 if (MBCLEN_INVALID_P(l) && i < len-1) {
/* find the longest prefix that is not invalid, then fill the bytes from
 * index len2+1 onward with 0xff so the next pass carries over them */
4122 long len2;
4123 int l2;
4124 for (len2 = len-1; 0 < len2; len2--) {
4125 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
4126 if (!MBCLEN_INVALID_P(l2))
4127 break;
4128 }
4129 memset(p+len2+1, 0xff, len-(len2+1));
4130 }
4131 }
4132}
4133
/*
 * Overwrites the +len+ bytes at +p+ with the previous byte sequence that
 * +enc+ reports as one complete character of exactly +len+ bytes.
 *
 * Returns NEIGHBOR_FOUND when such a character has been written to +p+,
 * NEIGHBOR_WRAPPED when the bytes rolled under (or, for mbminlen > 1, the
 * decremented code point needs a different byte length), and
 * NEIGHBOR_NOT_CHAR when a character in an mbminlen > 1 encoding is invalid
 * or is code point 0.
 */
4134static enum neighbor_char
4135enc_pred_char(char *p, long len, rb_encoding *enc)
4136{
4137 long i;
4138 int l;
4139 if (rb_enc_mbminlen(enc) > 1) {
4140 /* wchar, trivial case */
/* decode the code point, subtract one, and re-encode it in place */
4141 int r = rb_enc_precise_mbclen(p, p + len, enc), c;
4142 if (!MBCLEN_CHARFOUND_P(r)) {
4143 return NEIGHBOR_NOT_CHAR;
4144 }
4145 c = rb_enc_mbc_to_codepoint(p, p + len, enc);
/* code point 0 has no predecessor */
4146 if (!c) return NEIGHBOR_NOT_CHAR;
4147 --c;
4148 l = rb_enc_code_to_mbclen(c, enc);
4149 if (!l) return NEIGHBOR_NOT_CHAR;
4150 if (l != len) return NEIGHBOR_WRAPPED;
4151 rb_enc_mbcput(c, p, enc);
/* re-validate: the decremented code point may not be a legal character */
4152 r = rb_enc_precise_mbclen(p, p + len, enc);
4153 if (!MBCLEN_CHARFOUND_P(r)) {
4154 return NEIGHBOR_NOT_CHAR;
4155 }
4156 return NEIGHBOR_FOUND;
4157 }
/* byte-wise decrement with borrow, repeated until a valid character appears */
4158 while (1) {
/* trailing 0x00 bytes roll under to 0xff; i ends on the byte to decrement */
4159 for (i = len-1; 0 <= i && (unsigned char)p[i] == 0; i--)
4160 p[i] = '\xff';
4161 if (i < 0)
4162 return NEIGHBOR_WRAPPED;
4163 --((unsigned char*)p)[i];
4164 l = rb_enc_precise_mbclen(p, p+len, enc);
4165 if (MBCLEN_CHARFOUND_P(l)) {
4166 l = MBCLEN_CHARFOUND_LEN(l);
4167 if (l == len) {
4168 return NEIGHBOR_FOUND;
4169 }
4170 else {
/* a shorter character was found: zero the rest so the next pass borrows */
4171 memset(p+l, 0, len-l);
4172 }
4173 }
4174 if (MBCLEN_INVALID_P(l) && i < len-1) {
/* find the longest prefix that is not invalid, then zero the bytes from
 * index len2+1 onward so the next pass borrows across them */
4175 long len2;
4176 int l2;
4177 for (len2 = len-1; 0 < len2; len2--) {
4178 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
4179 if (!MBCLEN_INVALID_P(l2))
4180 break;
4181 }
4182 memset(p+len2+1, 0, len-(len2+1));
4183 }
4184 }
4185}
4186
4187/*
4188 Overwrites +p+ with the succeeding letter in +enc+ and returns
4189 NEIGHBOR_FOUND or NEIGHBOR_WRAPPED.
4190 When NEIGHBOR_WRAPPED is returned, the carried-out letter is stored into +carry+.
4191 Assumes that each range is contiguous and that mbclen
4192 never changes within a range.
4193 NEIGHBOR_NOT_CHAR is returned for an invalid character or when the range has only one
4194 character.
4195 */
4196static enum neighbor_char
4197enc_succ_alnum_char(char *p, long len, rb_encoding *enc, char *carry)
4198{
4199 enum neighbor_char ret;
4200 unsigned int c;
4201 int ctype;
4202 int range;
4203 char save[ONIGENC_CODE_TO_MBC_MAXLEN];
4204
4205 /* skip 03A2, invalid char between GREEK CAPITAL LETTERS */
4206 int try;
4207 const int max_gaps = 1;
4208
4209 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
/* NOTE(review): the listing skips line 4210 here; the condition that selects
 * ONIGENC_CTYPE_DIGIT (the `if` matching the `else if` below) is not visible. */
4211 ctype = ONIGENC_CTYPE_DIGIT;
4212 else if (rb_enc_isctype(c, ONIGENC_CTYPE_ALPHA, enc))
4213 ctype = ONIGENC_CTYPE_ALPHA;
4214 else
4215 return NEIGHBOR_NOT_CHAR;
4216
/* try the raw successor, tolerating up to max_gaps characters of another class */
4217 MEMCPY(save, p, char, len);
4218 for (try = 0; try <= max_gaps; ++try) {
4219 ret = enc_succ_char(p, len, enc);
4220 if (ret == NEIGHBOR_FOUND) {
4221 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
4222 if (rb_enc_isctype(c, ctype, enc))
4223 return NEIGHBOR_FOUND;
4224 }
4225 }
/* no successor in the same class: restore p and walk backwards to the first
 * character of the class run, counting its members in range */
4226 MEMCPY(p, save, char, len);
4227 range = 1;
4228 while (1) {
4229 MEMCPY(save, p, char, len);
4230 ret = enc_pred_char(p, len, enc);
4231 if (ret == NEIGHBOR_FOUND) {
4232 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
4233 if (!rb_enc_isctype(c, ctype, enc)) {
4234 MEMCPY(p, save, char, len);
4235 break;
4236 }
4237 }
4238 else {
4239 MEMCPY(p, save, char, len);
4240 break;
4241 }
4242 range++;
4243 }
/* a run of a single character cannot wrap around */
4244 if (range == 1) {
4245 return NEIGHBOR_NOT_CHAR;
4246 }
4247
/* p now holds the first character of the run; letters carry that character */
4248 if (ctype != ONIGENC_CTYPE_DIGIT) {
4249 MEMCPY(carry, p, char, len);
4250 return NEIGHBOR_WRAPPED;
4251 }
4252
/* digits carry the successor of the first character of the run */
4253 MEMCPY(carry, p, char, len);
4254 enc_succ_char(carry, len, enc);
4255 return NEIGHBOR_WRAPPED;
4256}
4257
4258
4259static VALUE str_succ(VALUE str);
4260
4261/*
4262 * call-seq:
4263 * string.succ -> new_str
4264 *
4265 * Returns the successor to +self+. The successor is calculated by
4266 * incrementing characters.
4267 *
4268 * The first character to be incremented is the rightmost alphanumeric,
4269 * or, if no alphanumerics, the rightmost character:
4270 * 'THX1138'.succ # => "THX1139"
4271 * '<<koala>>'.succ # => "<<koalb>>"
4272 * '***'.succ # => '**+'
4273 *
4274 * The successor to a digit is another digit, "carrying" to the next-left
4275 * character for a "rollover" from 9 to 0, and prepending another digit
4276 * if necessary:
4277 * '00'.succ # => "01"
4278 * '09'.succ # => "10"
4279 * '99'.succ # => "100"
4280 *
4281 * The successor to a letter is another letter of the same case,
4282 * carrying to the next-left character for a rollover,
4283 * and prepending another same-case letter if necessary:
4284 * 'aa'.succ # => "ab"
4285 * 'az'.succ # => "ba"
4286 * 'zz'.succ # => "aaa"
4287 * 'AA'.succ # => "AB"
4288 * 'AZ'.succ # => "BA"
4289 * 'ZZ'.succ # => "AAA"
4290 *
4291 * The successor to a non-alphanumeric character is the next character
4292 * in the underlying character set's collating sequence,
4293 * carrying to the next-left character for a rollover,
4294 * and prepending another character if necessary:
4295 * s = 0.chr * 3
4296 * s # => "\x00\x00\x00"
4297 * s.succ # => "\x00\x00\x01"
4298 * s = 255.chr * 3
4299 * s # => "\xFF\xFF\xFF"
4300 * s.succ # => "\x01\x00\x00\x00"
4301 *
4302 * Carrying can occur between and among mixtures of alphanumeric characters:
4303 * s = 'zz99zz99'
4304 * s.succ # => "aaa00aa00"
4305 * s = '99zz99zz'
4306 * s.succ # => "100aa00aa"
4307 *
4308 * The successor to an empty \String is a new empty \String:
4309 * ''.succ # => ""
4310 *
4311 * String#next is an alias for String#succ.
4312 */
4313
/*
 * Non-destructive successor: copies the bytes of +orig+ into a new string,
 * copies encoding/coderange information with rb_enc_cr_str_copy_for_substr,
 * and returns the result of str_succ on that copy.
 *
 * NOTE(review): the listing skips line 4315, which should carry the function
 * name and parameter list (presumably `rb_str_succ(VALUE orig)`, since the
 * body reads +orig+) -- confirm against the real file.
 */
4314VALUE
4316{
4317 VALUE str;
4318 str = rb_str_new(RSTRING_PTR(orig), RSTRING_LEN(orig));
4319 rb_enc_cr_str_copy_for_substr(str, orig);
4320 return str_succ(str);
4321}
4322
/*
 * In-place successor of +str+; returns +str+.
 *
 * Walks the characters from the end towards the start, incrementing
 * alphanumerics with enc_succ_alnum_char.  If no alphanumeric was found, a
 * second pass increments arbitrary characters with enc_succ_char.  When the
 * walk runs off the front of the string, the bytes in +carry+ are inserted
 * at +carry_pos+.
 *
 * NOTE(review): the listing skips lines 4350, 4369, 4372, 4396 and 4406, so
 * some statements (including the declaration of +tmp+) are not visible.
 */
4323static VALUE
4324str_succ(VALUE str)
4325{
4326 rb_encoding *enc;
4327 char *sbeg, *s, *e, *last_alnum = 0;
4328 int found_alnum = 0;
4329 long l, slen;
4330 char carry[ONIGENC_CODE_TO_MBC_MAXLEN] = "\1";
4331 long carry_pos = 0, carry_len = 1;
4332 enum neighbor_char neighbor = NEIGHBOR_FOUND;
4333
4334 slen = RSTRING_LEN(str);
/* an empty string is returned unchanged */
4335 if (slen == 0) return str;
4336
4337 enc = STR_ENC_GET(str);
4338 sbeg = RSTRING_PTR(str);
4339 s = e = sbeg + slen;
4340
/* first pass: alphanumerics only, right to left */
4341 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
/* after skipping a non-alnum, stop if the class flips between letter and digit */
4342 if (neighbor == NEIGHBOR_NOT_CHAR && last_alnum) {
4343 if (ISALPHA(*last_alnum) ? ISDIGIT(*s) :
4344 ISDIGIT(*last_alnum) ? ISALPHA(*s) : 0) {
4345 break;
4346 }
4347 }
4348 l = rb_enc_precise_mbclen(s, e, enc);
4349 if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue;
4351 neighbor = enc_succ_alnum_char(s, l, enc, carry);
4352 switch (neighbor) {
4353 case NEIGHBOR_NOT_CHAR:
4354 continue;
4355 case NEIGHBOR_FOUND:
4356 return str;
4357 case NEIGHBOR_WRAPPED:
4358 last_alnum = s;
4359 break;
4360 }
/* wrapped: remember where a carry character would have to be inserted */
4361 found_alnum = 1;
4362 carry_pos = s - sbeg;
4363 carry_len = l;
4364 }
4365 if (!found_alnum) { /* str contains no alnum */
4366 s = e;
4367 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
4368 enum neighbor_char neighbor;
4370 l = rb_enc_precise_mbclen(s, e, enc);
4371 if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue;
/* increment a scratch copy so s is only overwritten on FOUND/WRAPPED */
4373 MEMCPY(tmp, s, char, l);
4374 neighbor = enc_succ_char(tmp, l, enc);
4375 switch (neighbor) {
4376 case NEIGHBOR_FOUND:
4377 MEMCPY(s, tmp, char, l);
4378 return str;
4379 break;
4380 case NEIGHBOR_WRAPPED:
4381 MEMCPY(s, tmp, char, l);
4382 break;
4383 case NEIGHBOR_NOT_CHAR:
4384 break;
4385 }
4386 if (rb_enc_precise_mbclen(s, s+l, enc) != l) {
4387 /* wrapped to \0...\0. search next valid char. */
4388 enc_succ_char(s, l, enc);
4389 }
/* for non-ASCII-compatible encodings the carry is a copy of this character */
4390 if (!rb_enc_asciicompat(enc)) {
4391 MEMCPY(carry, s, char, l);
4392 carry_len = l;
4393 }
4394 carry_pos = s - sbeg;
4395 }
4397 }
/* every position wrapped: grow the string and insert the carry at carry_pos */
4398 RESIZE_CAPA(str, slen + carry_len);
4399 sbeg = RSTRING_PTR(str);
4400 s = sbeg + carry_pos;
4401 memmove(s + carry_len, s, slen - carry_pos);
4402 memmove(s, carry, carry_len);
4403 slen += carry_len;
4404 STR_SET_LEN(str, slen);
4405 TERM_FILL(&sbeg[slen], rb_enc_mbminlen(enc));
4407 return str;
4408}
4409
4410
4411/*
4412 * call-seq:
4413 * string.succ! -> self
4414 *
4415 * Equivalent to String#succ, but modifies +self+ in place; returns +self+.
4416 *
4417 * String#next! is an alias for String#succ!.
4418 */
4419
/*
 * Backend of String#succ!: applies str_succ to +str+ and returns +str+.
 *
 * NOTE(review): the listing skips line 4423 (between the opening brace and
 * the str_succ call); whatever statement lives there is not visible.
 */
4420static VALUE
4421rb_str_succ_bang(VALUE str)
4422{
4424 str_succ(str);
4425 return str;
4426}
4427
/*
 * Returns 1 when each of the first +len+ bytes of +s+ satisfies ISDIGIT,
 * 0 as soon as one does not.  A non-positive +len+ yields 1 without reading
 * +s+.
 */
static int
all_digits_p(const char *s, long len)
{
    long i;

    for (i = 0; i < len; i++) {
        if (!ISDIGIT(s[i])) {
            return 0;
        }
    }
    return 1;
}
4437
4438static int
4439str_upto_i(VALUE str, VALUE arg)
4440{
4441 rb_yield(str);
4442 return 0;
4443}
4444
4445/*
4446 * call-seq:
4447 * string.upto(other_string, exclusive = false) {|string| ... } -> self
4448 * string.upto(other_string, exclusive = false) -> new_enumerator
4449 *
4450 * With a block given, calls the block with each \String value
4451 * returned by successive calls to String#succ;
4452 * the first value is +self+, the next is <tt>self.succ</tt>, and so on;
4453 * the sequence terminates when value +other_string+ is reached;
4454 * returns +self+:
4455 * 'a8'.upto('b6') {|s| print s, ' ' } # => "a8"
4456 * Output:
4457 * a8 a9 b0 b1 b2 b3 b4 b5 b6
4458 *
4459 * If argument +exclusive+ is given as a truthy object, the last value is omitted:
4460 * 'a8'.upto('b6', true) {|s| print s, ' ' } # => "a8"
4461 * Output:
4462 * a8 a9 b0 b1 b2 b3 b4 b5
4463 *
4464 * If +other_string+ would not be reached, does not call the block:
4465 * '25'.upto('5') {|s| fail s }
4466 * 'aa'.upto('a') {|s| fail s }
4467 *
4468 * With no block given, returns a new \Enumerator:
4469 * 'a8'.upto('b6') # => #<Enumerator: "a8":upto("b6")>
4470 */
4471
/*
 * Backend of String#upto.  Parses one required argument (+end+) and one
 * optional argument (+exclusive+), then iterates with rb_str_upto_each,
 * using str_upto_i as the per-element callback.
 *
 * NOTE(review): the listing skips line 4478 (between rb_scan_args and the
 * return); the statement there is not visible.
 */
4472static VALUE
4473rb_str_upto(int argc, VALUE *argv, VALUE beg)
4474{
4475 VALUE end, exclusive;
4476
4477 rb_scan_args(argc, argv, "11", &end, &exclusive);
4479 return rb_str_upto_each(beg, end, RTEST(exclusive), str_upto_i, Qnil);
4480}
4481
/*
 * Calls +each+(string, +arg+) for every string from +beg+ up to +end+.
 * A non-zero return value from +each+ stops the iteration.  When +excl+ is
 * non-zero, +end+ itself is not passed to +each+.  Always returns +beg+.
 *
 * Three strategies are used: a byte counter for single ASCII characters, a
 * numeric counter when both ends consist only of digits, and repeated calls
 * to +succ+ otherwise.
 */
4482VALUE
4483rb_str_upto_each(VALUE beg, VALUE end, int excl, int (*each)(VALUE, VALUE), VALUE arg)
4484{
4485 VALUE current, after_end;
4486 ID succ;
4487 int n, ascii;
4488 rb_encoding *enc;
4489
4490 CONST_ID(succ, "succ");
/* NOTE(review): the listing skips line 4491 here; that statement is not visible. */
4492 enc = rb_enc_check(beg, end);
4493 ascii = (is_ascii_string(beg) && is_ascii_string(end));
4494 /* single character */
4495 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1 && ascii) {
4496 char c = RSTRING_PTR(beg)[0];
4497 char e = RSTRING_PTR(end)[0];
4498
4499 if (c > e || (excl && c == e)) return beg;
4500 for (;;) {
4501 if ((*each)(rb_enc_str_new(&c, 1, enc), arg)) break;
4502 if (!excl && c == e) break;
4503 c++;
4504 if (excl && c == e) break;
4505 }
4506 return beg;
4507 }
4508 /* both edges are all digits */
4509 if (ascii && ISDIGIT(RSTRING_PTR(beg)[0]) && ISDIGIT(RSTRING_PTR(end)[0]) &&
4510 all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg)) &&
4511 all_digits_p(RSTRING_PTR(end), RSTRING_LEN(end))) {
4512 VALUE b, e;
4513 int width;
4514
/* width of beg is used as the minimum number of digits when formatting */
4515 width = RSTRING_LENINT(beg);
4516 b = rb_str_to_inum(beg, 10, FALSE);
4517 e = rb_str_to_inum(end, 10, FALSE);
4518 if (FIXNUM_P(b) && FIXNUM_P(e)) {
4519 long bi = FIX2LONG(b);
4520 long ei = FIX2LONG(e);
4521 rb_encoding *usascii = rb_usascii_encoding();
4522
4523 while (bi <= ei) {
4524 if (excl && bi == ei) break;
4525 if ((*each)(rb_enc_sprintf(usascii, "%.*ld", width, bi), arg)) break;
4526 bi++;
4527 }
4528 }
4529 else {
/* at least one bound is not a Fixnum: compare and step via method calls */
4530 ID op = excl ? '<' : idLE;
4531 VALUE args[2], fmt = rb_fstring_lit("%.*d");
4532
4533 args[0] = INT2FIX(width);
/* NOTE(review): the condition tests the raw VALUE from rb_funcall instead of
 * RTEST(); this presumably relies on the false object being 0 -- confirm. */
4534 while (rb_funcall(b, op, 1, e)) {
4535 args[1] = b;
4536 if ((*each)(rb_str_format(numberof(args), args, fmt), arg)) break;
4537 b = rb_funcallv(b, succ, 0, 0);
4538 }
4539 }
4540 return beg;
4541 }
4542 /* normal case */
4543 n = rb_str_cmp(beg, end);
4544 if (n > 0 || (excl && n == 0)) return beg;
4545
4546 after_end = rb_funcallv(end, succ, 0, 0);
4547 current = str_duplicate(rb_cString, beg);
4548 while (!rb_str_equal(current, after_end)) {
4549 VALUE next = Qnil;
/* next is computed before +each+ runs; it stays nil once current equals end
 * in the inclusive case, which ends the loop below */
4550 if (excl || !rb_str_equal(current, end))
4551 next = rb_funcallv(current, succ, 0, 0);
4552 if ((*each)(current, arg)) break;
4553 if (NIL_P(next)) break;
4554 current = next;
4555 StringValue(current);
4556 if (excl && rb_str_equal(current, end)) break;
/* stop once the candidate is longer than end or has become empty */
4557 if (RSTRING_LEN(current) > RSTRING_LEN(end) || RSTRING_LEN(current) == 0)
4558 break;
4559 }
4560
4561 return beg;
4562}
4563
/*
 * Open-ended counterpart of the bounded iteration: calls +each+(string, +arg+)
 * for +beg+ and its successors until +each+ returns non-zero (or, in the
 * generic case, until a successor is empty).  Returns +beg+.
 *
 * NOTE(review): the listing skips line 4565, which should carry the function
 * name and parameter list; the body reads +beg+, +each+ and +arg+.
 */
4564VALUE
4566{
4567 VALUE current;
4568 ID succ;
4569
4570 CONST_ID(succ, "succ");
4571 /* both edges are all digits */
4572 if (is_ascii_string(beg) && ISDIGIT(RSTRING_PTR(beg)[0]) &&
4573 all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg))) {
4574 VALUE b, args[2], fmt = rb_fstring_lit("%.*d");
4575 int width = RSTRING_LENINT(beg);
4576 b = rb_str_to_inum(beg, 10, FALSE);
4577 if (FIXNUM_P(b)) {
4578 long bi = FIX2LONG(b);
4579 rb_encoding *usascii = rb_usascii_encoding();
4580
/* fast path: count with a C long while the value still fits a Fixnum */
4581 while (FIXABLE(bi)) {
4582 if ((*each)(rb_enc_sprintf(usascii, "%.*ld", width, bi), arg)) break;
4583 bi++;
4584 }
4585 b = LONG2NUM(bi);
4586 }
/* NOTE(review): when +each+ asks to stop (non-zero), the loops here are left
 * with `break` only; control then continues into the next loop and finally
 * into the "normal case" below, calling +each+ again.  This looks unintended
 * for callbacks that return non-zero -- confirm whether a return is needed. */
4587 args[0] = INT2FIX(width);
4588 while (1) {
4589 args[1] = b;
4590 if ((*each)(rb_str_format(numberof(args), args, fmt), arg)) break;
4591 b = rb_funcallv(b, succ, 0, 0);
4592 }
4593 }
4594 /* normal case */
4595 current = str_duplicate(rb_cString, beg);
4596 while (1) {
/* the successor is computed before +each+ sees the current string */
4597 VALUE next = rb_funcallv(current, succ, 0, 0);
4598 if ((*each)(current, arg)) break;
4599 current = next;
4600 StringValue(current);
4601 if (RSTRING_LEN(current) == 0)
4602 break;
4603 }
4604
4605 return beg;
4606}
4607
4608static int
4609include_range_i(VALUE str, VALUE arg)
4610{
4611 VALUE *argp = (VALUE *)arg;
4612 if (!rb_equal(str, *argp)) return 0;
4613 *argp = Qnil;
4614 return 1;
4615}
4616
/*
 * Tests whether +val+ lies in the string range +beg+..+end+ (+exclusive+
 * truthy omits +end+).  Returns Qfalse for nil or non-string +val+.  Single
 * ASCII characters are compared directly; everything else is decided by
 * iterating with rb_str_upto_each and include_range_i, which sets +val+ to
 * nil when a match is found.
 *
 * NOTE(review): the listing skips lines 4618, 4620-4622 and 4626-4628, so the
 * function name/parameter list and the statement that opens the block closed
 * at listing line 4655 are not visible.
 */
4617VALUE
4619{
4623 if (NIL_P(val)) return Qfalse;
4624 val = rb_check_string_type(val);
4625 if (NIL_P(val)) return Qfalse;
4629 const char *bp = RSTRING_PTR(beg);
4630 const char *ep = RSTRING_PTR(end);
4631 const char *vp = RSTRING_PTR(val);
4632 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1) {
/* a single-character range can only contain single-character strings */
4633 if (RSTRING_LEN(val) == 0 || RSTRING_LEN(val) > 1)
4634 return Qfalse;
4635 else {
4636 char b = *bp;
4637 char e = *ep;
4638 char v = *vp;
4639
4640 if (ISASCII(b) && ISASCII(e) && ISASCII(v)) {
4641 if (b <= v && v < e) return Qtrue;
4642 if (!RTEST(exclusive) && v == e) return Qtrue;
4643 return Qfalse;
4644 }
4645 }
4646 }
4647#if 0
4648 /* both edges are all digits */
4649 if (ISDIGIT(*bp) && ISDIGIT(*ep) &&
4650 all_digits_p(bp, RSTRING_LEN(beg)) &&
4651 all_digits_p(ep, RSTRING_LEN(end))) {
4652 /* TODO */
4653 }
4654#endif
4655 }
/* generic path: include_range_i overwrites val with nil on a match */
4656 rb_str_upto_each(beg, end, RTEST(exclusive), include_range_i, (VALUE)&val);
4657
4658 return NIL_P(val) ? Qtrue : Qfalse;
4659}
4660
/*
 * Searches +str+ with +re+ from position 0.  On a match, resolves +backref+
 * to a group number with rb_reg_backref_number and returns that group via
 * rb_reg_nth_match; returns Qnil when the search fails.
 *
 * NOTE(review): the listing skips line 4665, where +match+ must be declared
 * and assigned; that statement is not visible.
 */
4661static VALUE
4662rb_str_subpat(VALUE str, VALUE re, VALUE backref)
4663{
4664 if (rb_reg_search(re, str, 0, 0) >= 0) {
4666 int nth = rb_reg_backref_number(match, backref);
4667 return rb_reg_nth_match(nth, match);
4668 }
4669 return Qnil;
4670}
4671
4672static VALUE
4673rb_str_aref(VALUE str, VALUE indx)
4674{
4675 long idx;
4676
4677 if (FIXNUM_P(indx)) {
4678 idx = FIX2LONG(indx);
4679 }
4680 else if (RB_TYPE_P(indx, T_REGEXP)) {
4681 return rb_str_subpat(str, indx, INT2FIX(0));
4682 }
4683 else if (RB_TYPE_P(indx, T_STRING)) {
4684 if (rb_str_index(str, indx, 0) != -1)
4685 return str_duplicate(rb_cString, indx);
4686 return Qnil;
4687 }
4688 else {
4689 /* check if indx is Range */
4690 long beg, len = str_strlen(str, NULL);
4691 switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
4692 case Qfalse:
4693 break;
4694 case Qnil:
4695 return Qnil;
4696 default:
4697 return rb_str_substr(str, beg, len);
4698 }
4699 idx = NUM2LONG(indx);
4700 }
4701
4702 return str_substr(str, idx, 1, FALSE);
4703}
4704
4705
4706/*
4707 * call-seq:
4708 * string[index] -> new_string or nil
4709 * string[start, length] -> new_string or nil
4710 * string[range] -> new_string or nil
4711 * string[regexp, capture = 0] -> new_string or nil
4712 * string[substring] -> new_string or nil
4713 *
4714 * Returns the substring of +self+ specified by the arguments.
4715 *
4716 * When the single \Integer argument +index+ is given,
4717 * returns the 1-character substring found in +self+ at offset +index+:
4718 * 'bar'[2] # => "r"
4719 * Counts backward from the end of +self+ if +index+ is negative:
4720 * 'foo'[-3] # => "f"
4721 * Returns +nil+ if +index+ is out of range:
4722 * 'foo'[3] # => nil
4723 * 'foo'[-4] # => nil
4724 *
4725 * When the two \Integer arguments +start+ and +length+ are given,
4726 * returns the substring of the given +length+ found in +self+ at offset +start+:
4727 * 'foo'[0, 2] # => "fo"
4728 * 'foo'[0, 0] # => ""
4729 * Counts backward from the end of +self+ if +start+ is negative:
4730 * 'foo'[-2, 2] # => "oo"
4731 * Special case: returns a new empty \String if +start+ is equal to the length of +self+:
4732 * 'foo'[3, 2] # => ""
4733 * Returns +nil+ if +start+ is out of range:
4734 * 'foo'[4, 2] # => nil
4735 * 'foo'[-4, 2] # => nil
4736 * Returns the trailing substring of +self+ if +length+ is large:
4737 * 'foo'[1, 50] # => "oo"
4738 * Returns +nil+ if +length+ is negative:
4739 * 'foo'[0, -1] # => nil
4740 *
4741 * When the single \Range argument +range+ is given,
4742 * derives +start+ and +length+ values from the given +range+,
4743 * and returns values as above:
4744 * - <tt>'foo'[0..1]</tt> is equivalent to <tt>'foo'[0, 2]</tt>.
4745 * - <tt>'foo'[0...1]</tt> is equivalent to <tt>'foo'[0, 1]</tt>.
4746 *
4747 * When the \Regexp argument +regexp+ is given,
4748 * and the +capture+ argument is <tt>0</tt>,
4749 * returns the first matching substring found in +self+,
4750 * or +nil+ if none found:
4751 * 'foo'[/o/] # => "o"
4752 * 'foo'[/x/] # => nil
4753 * s = 'hello there'
4754 * s[/[aeiou](.)\1/] # => "ell"
4755 * s[/[aeiou](.)\1/, 0] # => "ell"
4756 *
4757 * If argument +capture+ is given and not <tt>0</tt>,
4758 * it should be either an \Integer capture group index or a \String or \Symbol capture group name;
4759 * the method call returns only the specified capture
4760 * (see {Regexp Capturing}[Regexp.html#class-Regexp-label-Capturing]):
4761 * s = 'hello there'
4762 * s[/[aeiou](.)\1/, 1] # => "l"
4763 * s[/(?<vowel>[aeiou])(?<non_vowel>[^aeiou])/, "non_vowel"] # => "l"
4764 * s[/(?<vowel>[aeiou])(?<non_vowel>[^aeiou])/, :vowel] # => "e"
4765 *
4766 * If an invalid capture group index is given, +nil+ is returned. If an invalid
4767 * capture group name is given, +IndexError+ is raised.
4768 *
4769 * When the single \String argument +substring+ is given,
4770 * returns the substring from +self+ if found, otherwise +nil+:
4771 * 'foo'['oo'] # => "oo"
4772 * 'foo'['xx'] # => nil
4773 *
4774 * String#slice is an alias for String#[].
4775 */
4776
4777static VALUE
4778rb_str_aref_m(int argc, VALUE *argv, VALUE str)
4779{
4780 if (argc == 2) {
4781 if (RB_TYPE_P(argv[0], T_REGEXP)) {
4782 return rb_str_subpat(str, argv[0], argv[1]);
4783 }
4784 else {
4785 long beg = NUM2LONG(argv[0]);
4786 long len = NUM2LONG(argv[1]);
4787 return rb_str_substr(str, beg, len);
4788 }
4789 }
4790 rb_check_arity(argc, 1, 2);
4791 return rb_str_aref(str, argv[0]);
4792}
4793
/*
 * Removes the first +len+ bytes of +str+ in place (+len+ is clamped to the
 * string's byte length) and returns +str+.  If the remainder fits the
 * embedded representation it is moved there; otherwise the heap pointer is
 * advanced past the dropped bytes.
 *
 * NOTE(review): the listing skips lines 4795, 4806 and 4818, so the function
 * name/parameter list and two statements are not visible.
 */
4794VALUE
4796{
4797 char *ptr = RSTRING_PTR(str);
4798 long olen = RSTRING_LEN(str), nlen;
4799
4800 str_modifiable(str);
4801 if (len > olen) len = olen;
4802 nlen = olen - len;
4803 if (STR_EMBEDDABLE_P(nlen, TERM_LEN(str))) {
4804 char *oldptr = ptr;
/* remember the storage flags before the string is switched to embedded form */
4805 int fl = (int)(RBASIC(str)->flags & (STR_NOEMBED|STR_SHARED|STR_NOFREE));
4807 STR_SET_EMBED_LEN(str, nlen);
4808 ptr = RSTRING(str)->as.ary;
4809 memmove(ptr, oldptr + len, nlen);
/* free the old buffer only when it was a plain, unshared heap buffer */
4810 if (fl == STR_NOEMBED) xfree(oldptr);
4811 }
4812 else {
4813 if (!STR_SHARED_P(str)) rb_str_new_frozen(str);
4814 ptr = RSTRING(str)->as.heap.ptr += len;
4815 RSTRING(str)->as.heap.len = nlen;
4816 }
4817 ptr[nlen] = 0;
4819 return str;
4820}
4821
/*
 * Byte-level splice: replaces the +len+ bytes of +str+ starting at byte
 * offset +beg+ with the bytes of +val+, growing the buffer when +val+ is
 * longer than the replaced span.
 *
 * NOTE(review): the listing skips lines 4830, 4842, 4845 and 4861, so the
 * body of the early-return branch, the condition guarding the +cr+
 * assignment, its else branch, and the final statement are not visible.
 */
4822static void
4823rb_str_splice_0(VALUE str, long beg, long len, VALUE val)
4824{
4825 char *sptr;
4826 long slen, vlen = RSTRING_LEN(val);
4827 int cr;
4828
4829 if (beg == 0 && vlen == 0) {
4831 return;
4832 }
4833
4834 str_modify_keep_cr(str);
4835 RSTRING_GETMEM(str, sptr, slen);
4836 if (len < vlen) {
4837 /* expand string */
4838 RESIZE_CAPA(str, slen + vlen - len);
4839 sptr = RSTRING_PTR(str);
4840 }
4841
4843 cr = rb_enc_str_coderange(val);
4844 else
4846
/* shift the tail so the gap at beg is exactly vlen bytes wide */
4847 if (vlen != len) {
4848 memmove(sptr + beg + vlen,
4849 sptr + beg + len,
4850 slen - (beg + len));
4851 }
4852 if (vlen < beg && len < 0) {
4853 MEMZERO(sptr + slen, char, -len);
4854 }
4855 if (vlen > 0) {
4856 memmove(sptr + beg, RSTRING_PTR(val), vlen);
4857 }
4858 slen += vlen - len;
4859 STR_SET_LEN(str, slen);
4860 TERM_FILL(&sptr[slen], TERM_LEN(str));
4862}
4863
/*
 * Character-level splice: replaces +len+ characters of +str+ starting at
 * character index +beg+ with +val+.  A negative +beg+ counts from the end.
 * Raises IndexError for a negative +len+ or an out-of-range +beg+.  The
 * character positions are converted to byte offsets and handed to
 * rb_str_splice_0.
 *
 * NOTE(review): the listing skips lines 4865, 4900 and 4902, so the function
 * name/parameter list, the assignment of +cr+ and the body of the final +if+
 * are not visible.
 */
4864void
4866{
4867 long slen;
4868 char *p, *e;
4869 rb_encoding *enc;
4870 int singlebyte = single_byte_optimizable(str);
4871 int cr;
4872
4873 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
4874
4875 StringValue(val);
4876 enc = rb_enc_check(str, val);
4877 slen = str_strlen(str, enc); /* rb_enc_check */
4878
4879 if ((slen < beg) || ((beg < 0) && (beg + slen < 0))) {
4880 rb_raise(rb_eIndexError, "index %ld out of string", beg);
4881 }
4882 if (beg < 0) {
4883 beg += slen;
4884 }
4885 assert(beg >= 0);
4886 assert(beg <= slen);
/* clamp the span to the characters that actually exist after beg */
4887 if (len > slen - beg) {
4888 len = slen - beg;
4889 }
4890 str_modify_keep_cr(str);
/* locate the byte positions of the first and one-past-last characters */
4891 p = str_nth(RSTRING_PTR(str), RSTRING_END(str), beg, enc, singlebyte);
4892 if (!p) p = RSTRING_END(str);
4893 e = str_nth(p, RSTRING_END(str), len, enc, singlebyte);
4894 if (!e) e = RSTRING_END(str);
4895 /* error check */
4896 beg = p - RSTRING_PTR(str); /* physical position */
4897 len = e - p; /* physical length */
4898 rb_str_splice_0(str, beg, len, val);
4899 rb_enc_associate(str, enc);
4901 if (cr != ENC_CODERANGE_BROKEN)
4903}
4904
4905#define rb_str_splice(str, beg, len, val) rb_str_update(str, beg, len, val)
4906
/*
 * Backend of <tt>str[regexp, backref] = val</tt>.  Searches +str+ with +re+,
 * resolves +backref+ to a group number (negative numbers count from the last
 * group), and replaces the bytes of that group with +val+.
 * Raises IndexError when the regexp does not match, the group number is out
 * of range, or the group did not participate in the match.
 *
 * NOTE(review): the listing skips line 4919, where +match+ must be assigned;
 * that statement is not visible.
 */
4907static void
4908rb_str_subpat_set(VALUE str, VALUE re, VALUE backref, VALUE val)
4909{
4910 int nth;
4911 VALUE match;
4912 long start, end, len;
4913 rb_encoding *enc;
4914 struct re_registers *regs;
4915
4916 if (rb_reg_search(re, str, 0, 0) < 0) {
4917 rb_raise(rb_eIndexError, "regexp not matched");
4918 }
4920 nth = rb_reg_backref_number(match, backref);
4921 regs = RMATCH_REGS(match);
4922 if ((nth >= regs->num_regs) || ((nth < 0) && (-nth >= regs->num_regs))) {
4923 rb_raise(rb_eIndexError, "index %d out of regexp", nth);
4924 }
4925 if (nth < 0) {
4926 nth += regs->num_regs;
4927 }
4928
/* BEG/END read regs->beg / regs->end for the selected group */
4929 start = BEG(nth);
4930 if (start == -1) {
4931 rb_raise(rb_eIndexError, "regexp group %d not matched", nth);
4932 }
4933 end = END(nth);
4934 len = end - start;
4935 StringValue(val);
4936 enc = rb_enc_check_str(str, val);
4937 rb_str_splice_0(str, start, len, val);
4938 rb_enc_associate(str, enc);
4939}
4940
/*
 * Two-argument form of String#[]=; returns +val+.  Dispatches on +indx+:
 * - Regexp: replaces the whole match (group 0);
 * - String: replaces the first occurrence, raising IndexError if absent;
 * - Range: replaces the selected span;
 * - anything else: converted with NUM2LONG and one character is replaced.
 *
 * NOTE(review): the listing skips line 4956 in the T_STRING branch (between
 * the "string not matched" check and the splice); that statement is not
 * visible.
 */
4941static VALUE
4942rb_str_aset(VALUE str, VALUE indx, VALUE val)
4943{
4944 long idx, beg;
4945
4946 switch (TYPE(indx)) {
4947 case T_REGEXP:
4948 rb_str_subpat_set(str, indx, INT2FIX(0), val);
4949 return val;
4950
4951 case T_STRING:
4952 beg = rb_str_index(str, indx, 0);
4953 if (beg < 0) {
4954 rb_raise(rb_eIndexError, "string not matched");
4955 }
4957 rb_str_splice(str, beg, str_strlen(indx, NULL), val);
4958 return val;
4959
4960 default:
4961 /* check if indx is Range */
4962 {
4963 long beg, len;
4964 if (rb_range_beg_len(indx, &beg, &len, str_strlen(str, NULL), 2)) {
4965 rb_str_splice(str, beg, len, val);
4966 return val;
4967 }
4968 }
4969 /* FALLTHROUGH */
4970
4971 case T_FIXNUM:
4972 idx = NUM2LONG(indx);
4973 rb_str_splice(str, idx, 1, val);
4974 return val;
4975 }
4976}
4977
4978/*
4979 * call-seq:
4980 * str[integer] = new_str
4981 * str[integer, integer] = new_str
4982 * str[range] = aString
4983 * str[regexp] = new_str
4984 * str[regexp, integer] = new_str
4985 * str[regexp, name] = new_str
4986 * str[other_str] = new_str
4987 *
4988 * Element Assignment---Replaces some or all of the content of
4989 * <i>str</i>. The portion of the string affected is determined using
4990 * the same criteria as String#[]. If the replacement string is not
4991 * the same length as the text it is replacing, the string will be
4992 * adjusted accordingly. If the regular expression or string used
4993 * as the index doesn't match a position in the string, IndexError is
4994 * raised. If the regular expression form is used, the optional
4995 * second Integer allows you to specify which portion of the match to
4996 * replace (effectively using the MatchData indexing rules). The forms
4997 * that take an Integer will raise an IndexError if the value is out
4998 * of range; the Range form will raise a RangeError, and the Regexp
4999 * and String will raise an IndexError on negative match.
5000 */
5001
/*
 * Backend of String#[]=.  With three arguments and a Regexp first argument
 * it replaces the capture selected by argv[1]; with two arguments it defers
 * to rb_str_aset.  Returns the assigned value.
 *
 * NOTE(review): the listing skips line 5010, so the body of the three-argument
 * non-Regexp branch is not visible.
 */
5002static VALUE
5003rb_str_aset_m(int argc, VALUE *argv, VALUE str)
5004{
5005 if (argc == 3) {
5006 if (RB_TYPE_P(argv[0], T_REGEXP)) {
5007 rb_str_subpat_set(str, argv[0], argv[1], argv[2]);
5008 }
5009 else {
5011 }
5012 return argv[2];
5013 }
5014 rb_check_arity(argc, 2, 3);
5015 return rb_str_aset(str, argv[0], argv[1]);
5016}
5017
5018/*
5019 * call-seq:
5020 * string.insert(index, other_string) -> self
5021 *
5022 * Inserts the given +other_string+ into +self+; returns +self+.
5023 *
5024 * If the \Integer +index+ is positive, inserts +other_string+ at offset +index+:
5025 * 'foo'.insert(1, 'bar') # => "fbaroo"
5026 *
5027 * If the \Integer +index+ is negative, counts backward from the end of +self+
5028 * and inserts +other_string+ at offset <tt>index+1</tt>
5029 * (that is, _after_ <tt>self[index]</tt>):
5030 * 'foo'.insert(-2, 'bar') # => "fobaro"
5031 */
5032
5033static VALUE
5034rb_str_insert(VALUE str, VALUE idx, VALUE str2)
5035{
5036 long pos = NUM2LONG(idx);
5037
5038 if (pos == -1) {
5039 return rb_str_append(str, str2);
5040 }
5041 else if (pos < 0) {
5042 pos++;
5043 }
5044 rb_str_splice(str, pos, 0, str2);
5045 return str;
5046}
5047
5048
5049/*
5050 * call-seq:
5051 * str.slice!(integer) -> new_str or nil
5052 * str.slice!(integer, integer) -> new_str or nil
5053 * str.slice!(range) -> new_str or nil
5054 * str.slice!(regexp) -> new_str or nil
5055 * str.slice!(other_str) -> new_str or nil
5056 *
5057 * Deletes the specified portion from <i>str</i>, and returns the portion
5058 * deleted.
5059 *
5060 * string = "this is a string"
5061 * string.slice!(2) #=> "i"
5062 * string.slice!(3..6) #=> " is "
5063 * string.slice!(/s.*t/) #=> "sa st"
5064 * string.slice!("r") #=> "r"
5065 * string #=> "thing"
5066 */
5067
/*
 * Backend of String#slice!.  Determines a byte offset +beg+ and byte length
 * +len+ from the arguments (Regexp with optional capture, start/length pair,
 * Integer, String or Range), builds the removed portion as +result+, and
 * then deletes those bytes from +str+.  Returns +result+, or Qnil when
 * nothing is selected.
 *
 * NOTE(review): the listing skips lines 5081 and 5137, so the declaration of
 * +match+ and the statement executed when beg == 0 are not visible.
 */
5068static VALUE
5069rb_str_slice_bang(int argc, VALUE *argv, VALUE str)
5070{
5071 VALUE result = Qnil;
5072 VALUE indx;
5073 long beg, len = 1;
5074 char *p;
5075
5076 rb_check_arity(argc, 1, 2);
5077 str_modify_keep_cr(str);
5078 indx = argv[0];
5079 if (RB_TYPE_P(indx, T_REGEXP)) {
5080 if (rb_reg_search(indx, str, 0, 0) < 0) return Qnil;
5082 struct re_registers *regs = RMATCH_REGS(match);
5083 int nth = 0;
/* a negative capture number counts back from the number of groups */
5084 if (argc > 1 && (nth = rb_reg_backref_number(match, argv[1])) < 0) {
5085 if ((nth += regs->num_regs) <= 0) return Qnil;
5086 }
5087 else if (nth >= regs->num_regs) return Qnil;
5088 beg = BEG(nth);
5089 len = END(nth) - beg;
5090 goto subseq;
5091 }
5092 else if (argc == 2) {
5093 beg = NUM2LONG(indx);
5094 len = NUM2LONG(argv[1]);
5095 goto num_index;
5096 }
5097 else if (FIXNUM_P(indx)) {
5098 beg = FIX2LONG(indx);
5099 if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
5100 if (!len) return Qnil;
5101 beg = p - RSTRING_PTR(str);
5102 goto subseq;
5103 }
5104 else if (RB_TYPE_P(indx, T_STRING)) {
5105 beg = rb_str_index(str, indx, 0);
5106 if (beg == -1) return Qnil;
5107 len = RSTRING_LEN(indx);
/* the result is a copy of the argument, so the subseq step is skipped */
5108 result = str_duplicate(rb_cString, indx);
5109 goto squash;
5110 }
5111 else {
5112 switch (rb_range_beg_len(indx, &beg, &len, str_strlen(str, NULL), 0)) {
5113 case Qnil:
5114 return Qnil;
5115 case Qfalse:
/* not a Range: treat the argument as an integer index */
5116 beg = NUM2LONG(indx);
5117 if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
5118 if (!len) return Qnil;
5119 beg = p - RSTRING_PTR(str);
5120 goto subseq;
5121 default:
5122 goto num_index;
5123 }
5124 }
5125
/* num_index: beg/len are a start/length pair to be resolved by rb_str_subpos */
5126 num_index:
5127 if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
5128 beg = p - RSTRING_PTR(str);
5129
/* subseq: beg/len are byte offsets; copy the bytes that will be removed */
5130 subseq:
5131 result = rb_str_new(RSTRING_PTR(str)+beg, len);
5132 rb_enc_cr_str_copy_for_substr(result, str);
5133
/* squash: delete len bytes at beg from str */
5134 squash:
5135 if (len > 0) {
5136 if (beg == 0) {
5138 }
5139 else {
5140 char *sptr = RSTRING_PTR(str);
5141 long slen = RSTRING_LEN(str);
5142 if (beg + len > slen) /* pathological check */
5143 len = slen - beg;
5144 memmove(sptr + beg,
5145 sptr + beg + len,
5146 slen - (beg + len));
5147 slen -= len;
5148 STR_SET_LEN(str, slen);
5149 TERM_FILL(&sptr[slen], TERM_LEN(str));
5150 }
5151 }
5152 return result;
5153}
5154
5155static VALUE
5156get_pat(VALUE pat)
5157{
5158 VALUE val;
5159
5160 switch (OBJ_BUILTIN_TYPE(pat)) {
5161 case T_REGEXP:
5162 return pat;
5163
5164 case T_STRING:
5165 break;
5166
5167 default:
5168 val = rb_check_string_type(pat);
5169 if (NIL_P(val)) {
5170 Check_Type(pat, T_REGEXP);
5171 }
5172 pat = val;
5173 }
5174
5175 return rb_reg_regcomp(pat);
5176}
5177
/*
 * Normalizes a pattern argument without compiling it.  A T_REGEXP is
 * returned as is and a T_STRING is kept; any other object is converted with
 * rb_check_string_type(), and Check_Type(pat, T_REGEXP) is applied to the
 * original object when that conversion yields nil.
 *
 * NOTE(review): this listing carries its own line numbers and they jump from
 * 5197 to 5199, so the statement guarded by the broken-string check below is
 * not visible here -- restore it from upstream before compiling.
 */
5178static VALUE
5179get_pat_quoted(VALUE pat, int check)
5180{
5181 VALUE val;
5182
5183 switch (OBJ_BUILTIN_TYPE(pat)) {
5184 case T_REGEXP:
5185 return pat;
5186
5187 case T_STRING:
5188 break;
5189
5190 default:
5191 val = rb_check_string_type(pat);
5192 if (NIL_P(val)) {
5193 Check_Type(pat, T_REGEXP);
5194 }
5195 pat = val;
5196 }
 /* only reached with a string-like pattern; Regexps returned above */
5197 if (check && is_broken_string(pat)) {
5199 }
5200 return pat;
5201}
5202
/*
 * Searches str for pat starting at pos and returns whatever the underlying
 * search returns: rb_strseq_index() for a T_STRING pattern, rb_reg_search0()
 * for anything else.  set_backref_str is forwarded to rb_reg_search0() and
 * gates the extra work in the string branch.
 *
 * NOTE(review): the embedded listing numbers 5211 and 5214 are absent, so
 * both arms of the `pos >= 0` test are missing their statements here
 * (presumably the back-reference bookkeeping -- confirm against upstream).
 */
5203static long
5204rb_pat_search(VALUE pat, VALUE str, long pos, int set_backref_str)
5205{
5206 if (BUILTIN_TYPE(pat) == T_STRING) {
5207 pos = rb_strseq_index(str, pat, pos, 1);
5208 if (set_backref_str) {
5209 if (pos >= 0) {
 /* only the local `str` is rebound; the caller's variable is unaffected */
5210 str = rb_str_new_frozen_String(str);
5212 }
5213 else {
5215 }
5216 }
5217 return pos;
5218 }
5219 else {
5220 return rb_reg_search0(pat, str, pos, 0, set_backref_str);
5221 }
5222}
5223
5224
5225/*
5226 * call-seq:
5227 * str.sub!(pattern, replacement) -> str or nil
5228 * str.sub!(pattern) {|match| block } -> str or nil
5229 *
5230 * Performs the same substitution as String#sub in-place.
5231 *
5232 * Returns +str+ if a substitution was performed or +nil+ if no substitution
5233 * was performed.
5234 */
5235
5236static VALUE
5237rb_str_sub_bang(int argc, VALUE *argv, VALUE str)
5238{
 /*
  * Replaces the first match of argv[0] inside str in place.  Returns str
  * when a match was found and Qnil otherwise (see the two returns at the
  * bottom).
  *
  * NOTE(review): the embedded listing numbers skip 5270, 5284, 5294, 5303,
  * 5312, 5314, 5318 and 5335-5337.  Among the invisible statements are the
  * assignment of `match` (read at 5271) and of `p`/`len` before they are
  * used at 5293 and 5304.  Restore them from upstream before compiling.
  */
5239 VALUE pat, repl, hash = Qnil;
5240 int iter = 0;
5241 long plen;
 /* a single argument is only accepted when a block is given */
5242 int min_arity = rb_block_given_p() ? 1 : 2;
5243 long beg;
5244
5245 rb_check_arity(argc, min_arity, 2);
5246 if (argc == 1) {
5247 iter = 1;
5248 }
5249 else {
 /* second argument: a hash if it converts to one, otherwise a string */
5250 repl = argv[1];
5251 hash = rb_check_hash_type(argv[1]);
5252 if (NIL_P(hash)) {
5253 StringValue(repl);
5254 }
5255 }
5256
5257 pat = get_pat_quoted(argv[0], 1);
5258
5259 str_modifiable(str);
5260 beg = rb_pat_search(pat, str, 0, 1);
5261 if (beg >= 0) {
5262 rb_encoding *enc;
5263 int cr = ENC_CODERANGE(str);
5264 long beg0, end0;
5265 VALUE match, match0 = Qnil;
5266 struct re_registers *regs;
5267 char *p, *rp;
5268 long len, rlen;
5269
5271 regs = RMATCH_REGS(match);
 /* string pattern: the matched span is [beg, beg + byte length of pat) */
5272 if (RB_TYPE_P(pat, T_STRING)) {
5273 beg0 = beg;
5274 end0 = beg0 + RSTRING_LEN(pat);
5275 match0 = pat;
5276 }
5277 else {
 /* BEG/END index into `regs` (see the macros at the top of the file) */
5278 beg0 = BEG(0);
5279 end0 = END(0);
5280 if (iter) match0 = rb_reg_nth_match(0, match);
5281 }
5282
5283 if (iter || !NIL_P(hash)) {
5285
5286 if (iter) {
5287 repl = rb_obj_as_string(rb_yield(match0));
5288 }
5289 else {
5290 repl = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
5291 repl = rb_obj_as_string(repl);
5292 }
 /* runs after the block/hash lookup above, which executes arbitrary code */
5293 str_mod_check(str, p, len);
5295 }
5296 else {
5297 repl = rb_reg_regsub(repl, str, regs, RB_TYPE_P(pat, T_STRING) ? Qnil : pat);
5298 }
5299
5300 enc = rb_enc_compatible(str, repl);
5301 if (!enc) {
 /* no common encoding: accepted only if the text on both sides of the
  * match scans as 7bit, in which case repl's encoding is adopted */
5302 rb_encoding *str_enc = STR_ENC_GET(str);
5304 if (coderange_scan(p, beg0, str_enc) != ENC_CODERANGE_7BIT ||
5305 coderange_scan(p+end0, len-end0, str_enc) != ENC_CODERANGE_7BIT) {
5306 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
5307 rb_enc_name(str_enc),
5308 rb_enc_name(STR_ENC_GET(repl)));
5309 }
5310 enc = STR_ENC_GET(repl);
5311 }
5313 rb_enc_associate(str, enc);
5315 int cr2 = ENC_CODERANGE(repl);
5316 if (cr2 == ENC_CODERANGE_BROKEN ||
5317 (cr == ENC_CODERANGE_VALID && cr2 == ENC_CODERANGE_7BIT))
5319 else
5320 cr = cr2;
5321 }
5322 plen = end0 - beg0;
5323 rlen = RSTRING_LEN(repl);
5324 len = RSTRING_LEN(str);
 /* grow the buffer first when the replacement is longer than the match */
5325 if (rlen > plen) {
5326 RESIZE_CAPA(str, len + rlen - plen);
5327 }
 /* fetch the pointer only after the possible resize above */
5328 p = RSTRING_PTR(str);
5329 if (rlen != plen) {
 /* shift the text after the match so the replacement fits exactly */
5330 memmove(p + beg0 + rlen, p + beg0 + plen, len - beg0 - plen);
5331 }
5332 rp = RSTRING_PTR(repl);
5333 memmove(p + beg0, rp, rlen);
5334 len += rlen - plen;
5338
5339 return str;
5340 }
5341 return Qnil;
5342}
5343
5344
5345/*
5346 * call-seq:
5347 * str.sub(pattern, replacement) -> new_str
5348 * str.sub(pattern, hash) -> new_str
5349 * str.sub(pattern) {|match| block } -> new_str
5350 *
5351 * Returns a copy of +str+ with the _first_ occurrence of +pattern+
5352 * replaced by the second argument. The +pattern+ is typically a Regexp; if
5353 * given as a String, any regular expression metacharacters it contains will
5354 * be interpreted literally, e.g. <code>\d</code> will match a backslash
5355 * followed by 'd', instead of a digit.
5356 *
5357 * If +replacement+ is a String it will be substituted for the matched text.
5358 * It may contain back-references to the pattern's capture groups of the form
5359 * <code>\d</code>, where <i>d</i> is a group number, or
5360 * <code>\k<n></code>, where <i>n</i> is a group name.
5361 * Similarly, <code>\&</code>, <code>\'</code>, <code>\`</code>, and
5362 * <code>\+</code> correspond to special variables, <code>$&</code>,
5363 * <code>$'</code>, <code>$`</code>, and <code>$+</code>, respectively.
5364 * (See rdoc-ref:regexp.rdoc for details.)
5365 * <code>\0</code> is the same as <code>\&</code>.
5366 * <code>\\\</code> is interpreted as an escape, i.e., a single backslash.
5367 * Note that, within +replacement+ the special match variables, such as
5368 * <code>$&</code>, will not refer to the current match.
5369 *
5370 * If the second argument is a Hash, and the matched text is one of its keys,
5371 * the corresponding value is the replacement string.
5372 *
5373 * In the block form, the current match string is passed in as a parameter,
5374 * and variables such as <code>$1</code>, <code>$2</code>, <code>$`</code>,
5375 * <code>$&</code>, and <code>$'</code> will be set appropriately.
5376 * (See rdoc-ref:regexp.rdoc for details.)
5377 * The value returned by the block will be substituted for the match on each
5378 * call.
5379 *
5380 * "hello".sub(/[aeiou]/, '*') #=> "h*llo"
5381 * "hello".sub(/([aeiou])/, '<\1>') #=> "h<e>llo"
5382 * "hello".sub(/./) {|s| s.ord.to_s + ' ' } #=> "104 ello"
5383 * "hello".sub(/(?<foo>[aeiou])/, '*\k<foo>*') #=> "h*e*llo"
5384 * 'Is SHELL your preferred shell?'.sub(/[[:upper:]]{2,}/, ENV)
5385 * #=> "Is /bin/bash your preferred shell?"
5386 *
5387 * Note that a string literal consumes backslashes.
5388 * (See rdoc-ref:syntax/literals.rdoc for details about string literals.)
5389 * Back-references are typically preceded by an additional backslash.
5390 * For example, if you want to write a back-reference <code>\&</code> in
5391 * +replacement+ with a double-quoted string literal, you need to write:
5392 * <code>"..\\\\&.."</code>.
5393 * If you want to write a non-back-reference string <code>\&</code> in
5394 * +replacement+, you need first to escape the backslash to prevent
5395 * this method from interpreting it as a back-reference, and then you
5396 * need to escape the backslashes again to prevent a string literal from
5397 * consuming them: <code>"..\\\\\\\\&.."</code>.
5398 * You may want to use the block form to avoid a lot of backslashes.
5399 */
5400
5401static VALUE
5402rb_str_sub(int argc, VALUE *argv, VALUE str)
5403{
5404 str = str_duplicate(rb_cString, str);
5405 rb_str_sub_bang(argc, argv, str);
5406 return str;
5407}
5408
/*
 * Shared implementation behind rb_str_gsub (bang == 0) and rb_str_gsub_bang
 * (bang != 0).
 *
 * The result is assembled in a fresh buffer `dest`: for every match the text
 * since the previous match is appended, followed by the replacement.  The
 * replacement comes from rb_yield (ITER), from a hash lookup (MAP) or from
 * the replacement string (STR).  When nothing matches, Qnil is returned for
 * bang and a duplicate of str otherwise.
 *
 * NOTE(review): the embedded listing numbers skip 5423, 5455 and 5458, so the
 * first statement of `case 1`, one statement before the loop and the
 * assignment of `match` (read at 5459) are not visible here.  Restore them
 * from upstream before compiling.
 */
5409static VALUE
5410str_gsub(int argc, VALUE *argv, VALUE str, int bang)
5411{
5412 VALUE pat, val = Qnil, repl, match, match0 = Qnil, dest, hash = Qnil;
5413 struct re_registers *regs;
5414 long beg, beg0, end0;
5415 long offset, blen, slen, len, last;
5416 enum {STR, ITER, MAP} mode = STR;
5417 char *sp, *cp;
 /* -1 = not yet known; settled after the first rb_reg_regsub() call below */
5418 int need_backref = -1;
5419 rb_encoding *str_enc;
5420
5421 switch (argc) {
5422 case 1:
5424 mode = ITER;
5425 break;
5426 case 2:
5427 repl = argv[1];
5428 hash = rb_check_hash_type(argv[1]);
5429 if (NIL_P(hash)) {
5430 StringValue(repl);
5431 }
5432 else {
5433 mode = MAP;
5434 }
5435 break;
5436 default:
5437 rb_error_arity(argc, 1, 2);
5438 }
5439
5440 pat = get_pat_quoted(argv[0], 1);
5441 beg = rb_pat_search(pat, str, 0, need_backref);
5442 if (beg < 0) {
5443 if (bang) return Qnil; /* no match, no substitution */
5444 return str_duplicate(rb_cString, str);
5445 }
5446
5447 offset = 0;
5448 blen = RSTRING_LEN(str) + 30; /* len + margin */
5449 dest = rb_str_buf_new(blen);
 /* sp/slen snapshot the buffer for the str_mod_check() inside the loop */
5450 sp = RSTRING_PTR(str);
5451 slen = RSTRING_LEN(str);
5452 cp = sp;
5453 str_enc = STR_ENC_GET(str);
5454 rb_enc_associate(dest, str_enc);
5456
5457 do {
5459 regs = RMATCH_REGS(match);
 /* string pattern: the matched span is [beg, beg + byte length of pat) */
5460 if (RB_TYPE_P(pat, T_STRING)) {
5461 beg0 = beg;
5462 end0 = beg0 + RSTRING_LEN(pat);
5463 match0 = pat;
5464 }
5465 else {
5466 beg0 = BEG(0);
5467 end0 = END(0);
5468 if (mode == ITER) match0 = rb_reg_nth_match(0, match);
5469 }
5470
 /* non-zero mode means ITER or MAP (STR is the enum's first member) */
5471 if (mode) {
5472 if (mode == ITER) {
5473 val = rb_obj_as_string(rb_yield(match0));
5474 }
5475 else {
5476 val = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
5477 val = rb_obj_as_string(val);
5478 }
5479 str_mod_check(str, sp, slen);
5480 if (val == dest) { /* paranoid check [ruby-dev:24827] */
5481 rb_raise(rb_eRuntimeError, "block should not cheat");
5482 }
5483 }
5484 else if (need_backref) {
5485 val = rb_reg_regsub(repl, str, regs, RB_TYPE_P(pat, T_STRING) ? Qnil : pat);
 /* first pass only: remember whether the substitution changed repl;
  * if it did not, later iterations take the plain `val = repl` branch */
5486 if (need_backref < 0) {
5487 need_backref = val != repl;
5488 }
5489 }
5490 else {
5491 val = repl;
5492 }
5493
5494 len = beg0 - offset; /* copy pre-match substr */
5495 if (len) {
5496 rb_enc_str_buf_cat(dest, cp, len, str_enc);
5497 }
5498
5499 rb_str_buf_append(dest, val);
5500
 /* `last` keeps the offset this iteration started from; it is used for
  * the final rb_pat_search() after the loop */
5501 last = offset;
5502 offset = end0;
5503 if (beg0 == end0) {
5504 /*
5505 * Always consume at least one character of the input string
5506 * in order to prevent infinite loops.
5507 */
5508 if (RSTRING_LEN(str) <= end0) break;
5509 len = rb_enc_fast_mbclen(RSTRING_PTR(str)+end0, RSTRING_END(str), str_enc);
5510 rb_enc_str_buf_cat(dest, RSTRING_PTR(str)+end0, len, str_enc);
5511 offset = end0 + len;
5512 }
5513 cp = RSTRING_PTR(str) + offset;
5514 if (offset > RSTRING_LEN(str)) break;
5515 beg = rb_pat_search(pat, str, offset, need_backref);
5516 } while (beg >= 0);
 /* append whatever follows the last match */
5517 if (RSTRING_LEN(str) > offset) {
5518 rb_enc_str_buf_cat(dest, cp, RSTRING_LEN(str) - offset, str_enc);
5519 }
 /* search once more from `last`, this time with set_backref_str = 1 */
5520 rb_pat_search(pat, str, last, 1);
5521 if (bang) {
5522 str_shared_replace(str, dest);
5523 }
5524 else {
5525 str = dest;
5526 }
5527
5528 return str;
5529}
5530
5531
5532/*
5533 * call-seq:
5534 * str.gsub!(pattern, replacement) -> str or nil
5535 * str.gsub!(pattern, hash) -> str or nil
5536 * str.gsub!(pattern) {|match| block } -> str or nil
5537 * str.gsub!(pattern) -> an_enumerator
5538 *
5539 * Performs the substitutions of String#gsub in place, returning
5540 * <i>str</i>, or <code>nil</code> if no substitutions were
5541 * performed. If no block and no <i>replacement</i> is given, an
5542 * enumerator is returned instead.
5543 */
5544
5545static VALUE
5546rb_str_gsub_bang(int argc, VALUE *argv, VALUE str)
5547{
5548 str_modify_keep_cr(str);
5549 return str_gsub(argc, argv, str, 1);
5550}
5551
5552
5553/*
5554 * call-seq:
5555 * str.gsub(pattern, replacement) -> new_str
5556 * str.gsub(pattern, hash) -> new_str
5557 * str.gsub(pattern) {|match| block } -> new_str
5558 * str.gsub(pattern) -> enumerator
5559 *
5560 * Returns a copy of <i>str</i> with <em>all</em> occurrences of
5561 * <i>pattern</i> substituted for the second argument. The <i>pattern</i> is
5562 * typically a Regexp; if given as a String, any
5563 * regular expression metacharacters it contains will be interpreted
5564 * literally, e.g. <code>\d</code> will match a backslash followed by 'd',
5565 * instead of a digit.
5566 *
5567 * If +replacement+ is a String it will be substituted for the matched text.
5568 * It may contain back-references to the pattern's capture groups of the form
5569 * <code>\d</code>, where <i>d</i> is a group number, or
5570 * <code>\k<n></code>, where <i>n</i> is a group name.
5571 * Similarly, <code>\&</code>, <code>\'</code>, <code>\`</code>, and
5572 * <code>\+</code> correspond to special variables, <code>$&</code>,
5573 * <code>$'</code>, <code>$`</code>, and <code>$+</code>, respectively.
5574 * (See rdoc-ref:regexp.rdoc for details.)
5575 * <code>\0</code> is the same as <code>\&</code>.
5576 * <code>\\\</code> is interpreted as an escape, i.e., a single backslash.
5577 * Note that, within +replacement+ the special match variables, such as
5578 * <code>$&</code>, will not refer to the current match.
5579 *
5580 * If the second argument is a Hash, and the matched text is one
5581 * of its keys, the corresponding value is the replacement string.
5582 *
5583 * In the block form, the current match string is passed in as a parameter,
5584 * and variables such as <code>$1</code>, <code>$2</code>, <code>$`</code>,
5585 * <code>$&</code>, and <code>$'</code> will be set appropriately.
5586 * (See rdoc-ref:regexp.rdoc for details.)
5587 * The value returned by the block will be substituted for the match on each
5588 * call.
5589 *
5590 * When neither a block nor a second argument is supplied, an
5591 * Enumerator is returned.
5592 *
5593 * "hello".gsub(/[aeiou]/, '*') #=> "h*ll*"
5594 * "hello".gsub(/([aeiou])/, '<\1>') #=> "h<e>ll<o>"
5595 * "hello".gsub(/./) {|s| s.ord.to_s + ' '} #=> "104 101 108 108 111 "
5596 * "hello".gsub(/(?<foo>[aeiou])/, '{\k<foo>}') #=> "h{e}ll{o}"
5597 * 'hello'.gsub(/[eo]/, 'e' => 3, 'o' => '*') #=> "h3ll*"
5598 *
5599 * Note that a string literal consumes backslashes.
5600 * (See rdoc-ref:syntax/literals.rdoc for details on string literals.)
5601 * Back-references are typically preceded by an additional backslash.
5602 * For example, if you want to write a back-reference <code>\&</code> in
5603 * +replacement+ with a double-quoted string literal, you need to write:
5604 * <code>"..\\\\&.."</code>.
5605 * If you want to write a non-back-reference string <code>\&</code> in
5606 * +replacement+, you need first to escape the backslash to prevent
5607 * this method from interpreting it as a back-reference, and then you
5608 * need to escape the backslashes again to prevent a string literal from
5609 * consuming them: <code>"..\\\\\\\\&.."</code>.
5610 * You may want to use the block form to avoid a lot of backslashes.
5611 */
5612
5613static VALUE
5614rb_str_gsub(int argc, VALUE *argv, VALUE str)
5615{
5616 return str_gsub(argc, argv, str, 0);
5617}
5618
5619
5620/*
5621 * call-seq:
5622 * str.replace(other_str) -> str
5623 *
5624 * Replaces the contents of <i>str</i> with the corresponding
5625 * values in <i>other_str</i>.
5626 *
5627 * s = "hello" #=> "hello"
5628 * s.replace "world" #=> "world"
5629 */
5630
5631VALUE
/*
 * NOTE(review): the embedded listing number 5632 is absent; that line would
 * hold the function name and parameter list (presumably the implementation
 * of String#replace documented above -- confirm against upstream).  The body
 * uses two values, `str` and `str2`.
 */
5633{
 /* str_modifiable() runs even on the self-assignment path below */
5634 str_modifiable(str);
5635 if (str == str2) return str;
5636
 /* convert the argument before the receiver's old contents are discarded */
5637 StringValue(str2);
5638 str_discard(str);
5639 return str_replace(str, str2);
5640}
5641
5642/*
5643 * call-seq:
5644 * string.clear -> string
5645 *
5646 * Makes string empty.
5647 *
5648 * a = "abcde"
5649 * a.clear #=> ""
5650 */
5651
5652static VALUE
5653rb_str_clear(VALUE str)
5654{
 /*
  * NOTE(review): the embedded listing numbers 5656-5657, 5659-5660 and 5662
  * are absent, so the `else` at 5661 has no visible `if` and its own branch
  * is missing as well.  What remains shows str_discard() being called, a NUL
  * byte stored at the start of the buffer, and str being returned.  Restore
  * the missing statements from upstream before compiling.
  */
5655 str_discard(str);
5658 RSTRING_PTR(str)[0] = 0;
5661 else
5663 return str;
5664}
5665
5666/*
5667 * call-seq:
5668 * string.chr -> string
5669 *
5670 * Returns a one-character string at the beginning of the string.
5671 *
5672 * a = "abcde"
5673 * a.chr #=> "a"
5674 */
5675
5676static VALUE
5677rb_str_chr(VALUE str)
5678{
5679 return rb_str_substr(str, 0, 1);
5680}
5681
5682/*
5683 * call-seq:
5684 * str.getbyte(index) -> 0 .. 255
5685 *
5686 * Returns the <i>index</i>th byte as an integer.
5687 */
5688static VALUE
5689rb_str_getbyte(VALUE str, VALUE index)
5690{
5691 long pos = NUM2LONG(index);
5692
5693 if (pos < 0)
5694 pos += RSTRING_LEN(str);
5695 if (pos < 0 || RSTRING_LEN(str) <= pos)
5696 return Qnil;
5697
5698 return INT2FIX((unsigned char)RSTRING_PTR(str)[pos]);
5699}
5700
5701/*
5702 * call-seq:
5703 * str.setbyte(index, integer) -> integer
5704 *
5705 * Sets the <i>index</i>th byte to <i>integer</i>.
5706 */
5707static VALUE
5708rb_str_setbyte(VALUE str, VALUE index, VALUE value)
5709{
 /*
  * Stores the low 8 bits of `value` at byte position `index` and returns
  * `value` unchanged (see the `end:` label).
  *
  * NOTE(review): the embedded listing numbers skip 5740, 5742, 5744, 5750,
  * 5752 and 5756.  The statements under several if/else arms are therefore
  * invisible, and so is whatever introduces the code following the
  * `goto end` at 5743 (presumably another case label -- confirm against
  * upstream).  Restore them before compiling.
  */
5710 long pos = NUM2LONG(index);
5711 long len = RSTRING_LEN(str);
5712 char *head, *left = 0;
5713 unsigned char *ptr;
5714 rb_encoding *enc;
5715 int cr = ENC_CODERANGE_UNKNOWN, width, nlen;
5716
 /* valid positions are -len .. len-1; negative ones count from the end */
5717 if (pos < -len || len <= pos)
5718 rb_raise(rb_eIndexError, "index %ld out of string", pos);
5719 if (pos < 0)
5720 pos += len;
5721
 /* the 0xff mask is applied with rb_int_and() before NUM2INT converts */
5722 VALUE v = rb_to_int(value);
5723 VALUE w = rb_int_and(v, INT2FIX(0xff));
5724 unsigned char byte = NUM2INT(w) & 0xFF;
5725
5726 if (!str_independent(str))
5727 str_make_independent(str);
5728 enc = STR_ENC_GET(str);
 /* pointers are taken only after the string has been made independent */
5729 head = RSTRING_PTR(str);
5730 ptr = (unsigned char *)&head[pos];
5731 if (!STR_EMBED_P(str)) {
5732 cr = ENC_CODERANGE(str);
5733 switch (cr) {
5734 case ENC_CODERANGE_7BIT:
5735 left = (char *)ptr;
5736 *ptr = byte;
 /* an ASCII byte written into a 7bit string needs no further checks */
5737 if (ISASCII(byte)) goto end;
5738 nlen = rb_enc_precise_mbclen(left, head+len, enc);
5739 if (!MBCLEN_CHARFOUND_P(nlen))
5741 else
5743 goto end;
 /* measure the character containing the byte before and after the write */
5745 left = rb_enc_left_char_head(head, ptr, head+len, enc);
5746 width = rb_enc_precise_mbclen(left, head+len, enc);
5747 *ptr = byte;
5748 nlen = rb_enc_precise_mbclen(left, head+len, enc);
5749 if (!MBCLEN_CHARFOUND_P(nlen))
5751 else if (MBCLEN_CHARFOUND_LEN(nlen) != width || ISASCII(byte))
5753 goto end;
5754 }
5755 }
5757 *ptr = byte;
5758
5759 end:
5760 return value;
5761}
5762
/*
 * Builds a new string from `len` bytes of str starting at byte offset `beg`.
 *
 * Returns Qnil when beg is past the end, when len is negative, or when a
 * negative beg is still negative after the length has been added.  len is
 * clipped to the bytes available; if nothing is left, Qnil is returned
 * unless `empty` is non-zero, in which case the result is an empty string.
 * The result takes over str's encoding through str_enc_copy().
 *
 * NOTE(review): the embedded listing numbers skip 5785, 5798-5799, 5801,
 * 5806 and 5809.  Invisible here are the condition that opens the
 * str_new_shared() branch and the statements inside the branches at the end
 * of the function.  Restore them from upstream before compiling.
 */
5763static VALUE
5764str_byte_substr(VALUE str, long beg, long len, int empty)
5765{
5766 char *p, *s = RSTRING_PTR(str);
5767 long n = RSTRING_LEN(str);
5768 VALUE str2;
5769
5770 if (beg > n || len < 0) return Qnil;
5771 if (beg < 0) {
 /* negative offsets count from the end */
5772 beg += n;
5773 if (beg < 0) return Qnil;
5774 }
5775 if (len > n - beg)
5776 len = n - beg;
5777 if (len <= 0) {
5778 if (!empty) return Qnil;
5779 len = 0;
5780 p = 0;
5781 }
5782 else
5783 p = s + beg;
5784
 /* this branch adjusts the new string's heap pointer and length directly */
5786 str2 = rb_str_new_frozen(str);
5787 str2 = str_new_shared(rb_cString, str2);
5788 RSTRING(str2)->as.heap.ptr += beg;
5789 RSTRING(str2)->as.heap.len = len;
5790 }
5791 else {
5792 str2 = rb_str_new(p, len);
5793 }
5794
5795 str_enc_copy(str2, str);
5796
5797 if (RSTRING_LEN(str2) == 0) {
5800 else
5802 }
5803 else {
5804 switch (ENC_CODERANGE(str)) {
5805 case ENC_CODERANGE_7BIT:
5807 break;
5808 default:
5810 break;
5811 }
5812 }
5813
5814 return str2;
5815}
5816
5817static VALUE
5818str_byte_aref(VALUE str, VALUE indx)
5819{
5820 long idx;
5821 if (FIXNUM_P(indx)) {
5822 idx = FIX2LONG(indx);
5823 }
5824 else {
5825 /* check if indx is Range */
5826 long beg, len = RSTRING_LEN(str);
5827
5828 switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
5829 case Qfalse:
5830 break;
5831 case Qnil:
5832 return Qnil;
5833 default:
5834 return str_byte_substr(str, beg, len, TRUE);
5835 }
5836
5837 idx = NUM2LONG(indx);
5838 }
5839 return str_byte_substr(str, idx, 1, FALSE);
5840}
5841
5842/*
5843 * call-seq:
5844 * str.byteslice(integer) -> new_str or nil
5845 * str.byteslice(integer, integer) -> new_str or nil
5846 * str.byteslice(range) -> new_str or nil
5847 *
5848 * Byte Reference---If passed a single Integer, returns a
5849 * substring of one byte at that position. If passed two Integer
5850 * objects, returns a substring starting at the offset given by the first, and
5851 * a length given by the second. If given a Range, a substring containing
5852 * bytes at offsets given by the range is returned. In all three cases, if
5853 * an offset is negative, it is counted from the end of <i>str</i>. Returns
5854 * <code>nil</code> if the initial offset falls outside the string, the length
5855 * is negative, or the beginning of the range is greater than the end.
5856 * The resulting string keeps the encoding of the original.
5857 *
5858 * "hello".byteslice(1) #=> "e"
5859 * "hello".byteslice(-1) #=> "o"
5860 * "hello".byteslice(1, 2) #=> "el"
5861 * "\x80\u3042".byteslice(1, 3) #=> "\u3042"
5862 * "\x03\u3042\xff".byteslice(1..3) #=> "\u3042"
5863 */
5864
5865static VALUE
5866rb_str_byteslice(int argc, VALUE *argv, VALUE str)
5867{
5868 if (argc == 2) {
5869 long beg = NUM2LONG(argv[0]);
5870 long end = NUM2LONG(argv[1]);
5871 return str_byte_substr(str, beg, end, TRUE);
5872 }
5873 rb_check_arity(argc, 1, 2);
5874 return str_byte_aref(str, argv[0]);
5875}
5876
5877/*
5878 * call-seq:
5879 * str.reverse -> new_str
5880 *
5881 * Returns a new string with the characters from <i>str</i> in reverse order.
5882 *
5883 * "stressed".reverse #=> "desserts"
5884 */
5885
5886static VALUE
5887rb_str_reverse(VALUE str)
5888{
 /*
  * Builds the reversed string in a new buffer `rev` of the same byte length:
  * str is walked forwards (s) while rev is filled from its end backwards
  * (p), one byte or one character at a time depending on the branch.
  *
  * NOTE(review): the embedded listing numbers skip 5918 and 5929, so the
  * operands of the conditional expression started at 5917 and one statement
  * before str_enc_copy() are not visible here.  Restore them from upstream
  * before compiling.
  */
5889 rb_encoding *enc;
5890 VALUE rev;
5891 char *s, *e, *p;
5892 int cr;
5893
 /* nothing to reorder for 0 or 1 bytes: hand back a duplicate */
5894 if (RSTRING_LEN(str) <= 1) return str_duplicate(rb_cString, str);
5895 enc = STR_ENC_GET(str);
5896 rev = rb_str_new(0, RSTRING_LEN(str));
5897 s = RSTRING_PTR(str); e = RSTRING_END(str);
5898 p = RSTRING_END(rev);
5899 cr = ENC_CODERANGE(str);
5900
 /* always true at this point because of the early return above */
5901 if (RSTRING_LEN(str) > 1) {
5902 if (single_byte_optimizable(str)) {
5903 while (s < e) {
5904 *--p = *s++;
5905 }
5906 }
5907 else if (cr == ENC_CODERANGE_VALID) {
 /* copy whole characters so multibyte sequences stay in byte order */
5908 while (s < e) {
5909 int clen = rb_enc_fast_mbclen(s, e, enc);
5910
5911 p -= clen;
5912 memcpy(p, s, clen);
5913 s += clen;
5914 }
5915 }
5916 else {
5917 cr = rb_enc_asciicompat(enc) ?
5919 while (s < e) {
5920 int clen = rb_enc_mbclen(s, e, enc);
5921
 /* a multibyte character or high-bit byte resets cr to UNKNOWN */
5922 if (clen > 1 || (*s & 0x80)) cr = ENC_CODERANGE_UNKNOWN;
5923 p -= clen;
5924 memcpy(p, s, clen);
5925 s += clen;
5926 }
5927 }
5928 }
5930 str_enc_copy(rev, str);
5931 ENC_CODERANGE_SET(rev, cr);
5932
5933 return rev;
5934}
5935
5936
5937/*
5938 * call-seq:
5939 * str.reverse! -> str
5940 *
5941 * Reverses <i>str</i> in place.
5942 */
5943
5944static VALUE
5945rb_str_reverse_bang(VALUE str)
5946{
5947 if (RSTRING_LEN(str) > 1) {
5948 if (single_byte_optimizable(str)) {
5949 char *s, *e, c;
5950
5951 str_modify_keep_cr(str);
5952 s = RSTRING_PTR(str);
5953 e = RSTRING_END(str) - 1;
5954 while (s < e) {
5955 c = *s;
5956 *s++ = *e;
5957 *e-- = c;
5958 }
5959 }
5960 else {
5961 str_shared_replace(str, rb_str_reverse(str));
5962 }
5963 }
5964 else {
5965 str_modify_keep_cr(str);
5966 }
5967 return str;
5968}
5969
5970
5971/*
5972 * call-seq:
5973 * str.include? other_str -> true or false
5974 *
5975 * Returns <code>true</code> if <i>str</i> contains the given string or
5976 * character.
5977 *
5978 * "hello".include? "lo" #=> true
5979 * "hello".include? "ol" #=> false
5980 * "hello".include? ?h #=> true
5981 */
5982
5983static VALUE
5984rb_str_include(VALUE str, VALUE arg)
5985{
5986 long i;
5987
5988 StringValue(arg);
5989 i = rb_str_index(str, arg, 0);
5990
5991 if (i == -1) return Qfalse;
5992 return Qtrue;
5993}
5994
5995
5996/*
5997 * call-seq:
5998 * str.to_i(base=10) -> integer
5999 *
6000 * Returns the result of interpreting leading characters in <i>str</i> as an
6001 * integer base <i>base</i> (between 2 and 36). Extraneous characters past the
6002 * end of a valid number are ignored. If there is not a valid number at the
6003 * start of <i>str</i>, <code>0</code> is returned. This method never raises an
6004 * exception when <i>base</i> is valid.
6005 *
6006 * "12345".to_i #=> 12345
6007 * "99 red balloons".to_i #=> 99
6008 * "0a".to_i #=> 0
6009 * "0a".to_i(16) #=> 10
6010 * "hello".to_i #=> 0
6011 * "1100101".to_i(2) #=> 101
6012 * "1100101".to_i(8) #=> 294977
6013 * "1100101".to_i(10) #=> 1100101
6014 * "1100101".to_i(16) #=> 17826049
6015 */
6016
6017static VALUE
6018rb_str_to_i(int argc, VALUE *argv, VALUE str)
6019{
6020 int base = 10;
6021
6022 if (rb_check_arity(argc, 0, 1) && (base = NUM2INT(argv[0])) < 0) {
6023 rb_raise(rb_eArgError, "invalid radix %d", base);
6024 }
6025 return rb_str_to_inum(str, base, FALSE);
6026}
6027
6028
6029/*
6030 * call-seq:
6031 * str.to_f -> float
6032 *
6033 * Returns the result of interpreting leading characters in <i>str</i> as a
6034 * floating point number. Extraneous characters past the end of a valid number
6035 * are ignored. If there is not a valid number at the start of <i>str</i>,
6036 * <code>0.0</code> is returned. This method never raises an exception.
6037 *
6038 * "123.45e1".to_f #=> 1234.5
6039 * "45.67 degrees".to_f #=> 45.67
6040 * "thx1138".to_f #=> 0.0
6041 */
6042
6043static VALUE
6044rb_str_to_f(VALUE str)
6045{
6046 return DBL2NUM(rb_str_to_dbl(str, FALSE));
6047}
6048
6049
6050/*
6051 * call-seq:
6052 * str.to_s -> str
6053 * str.to_str -> str
6054 *
6055 * Returns +self+.
6056 *
6057 * If called on a subclass of String, converts the receiver to a String object.
6058 */
6059
6060static VALUE
6061rb_str_to_s(VALUE str)
6062{
6063 if (rb_obj_class(str) != rb_cString) {
6064 return str_duplicate(rb_cString, str);
6065 }
6066 return str;
6067}
6068
#if 0
/*
 * Currently compiled out.  Writes code point c into a scratch buffer with
 * rb_enc_mbcput() and appends the rb_enc_codelen() bytes to str.
 */
static void
str_cat_char(VALUE str, unsigned int c, rb_encoding *enc)
{
    char encoded[RUBY_MAX_CHAR_LEN];
    const int nbytes = rb_enc_codelen(c, enc);

    rb_enc_mbcput(c, encoded, enc);
    rb_enc_str_buf_cat(str, encoded, nbytes, enc);
}
#endif
6080
6081#define CHAR_ESC_LEN 13 /* sizeof(\x{ hex of 32bit unsigned int } \0) */
6082
6083int
6084rb_str_buf_cat_escaped_char(VALUE result, unsigned int c, int unicode_p)
6085{
6086 char buf[CHAR_ESC_LEN + 1];
6087 int l;
6088
6089#if SIZEOF_INT > 4
6090 c &= 0xffffffff;
6091#endif
6092 if (unicode_p) {
6093 if (c < 0x7F && ISPRINT(c)) {
6094 snprintf(buf, CHAR_ESC_LEN, "%c", c);
6095 }
6096 else if (c < 0x10000) {
6097 snprintf(buf, CHAR_ESC_LEN, "\\u%04X", c);
6098 }
6099 else {
6100 snprintf(buf, CHAR_ESC_LEN, "\\u{%X}", c);
6101 }
6102 }
6103 else {
6104 if (c < 0x100) {
6105 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", c);
6106 }
6107 else {
6108 snprintf(buf, CHAR_ESC_LEN, "\\x{%X}", c);
6109 }
6110 }
6111 l = (int)strlen(buf); /* CHAR_ESC_LEN cannot exceed INT_MAX */
6112 rb_str_buf_cat(result, buf, l);
6113 return l;
6114}
6115
6116const char *
/*
 * NOTE(review): the embedded listing number 6117 is absent; that line would
 * hold the function name and parameter list.  The body switches on a value
 * named `c`.  Restore the line from upstream before compiling.
 */
6118{
 /*
  * Maps a handful of control characters (NUL, newline, carriage return,
  * tab, form feed, vertical tab, backspace, bell, escape and DEL) to the
  * backslash notation used for them; everything else yields NULL.
  */
6119 switch (c) {
6120 case '\0': return "\\0";
6121 case '\n': return "\\n";
6122 case '\r': return "\\r";
6123 case '\t': return "\\t";
6124 case '\f': return "\\f";
6125 case '\013': return "\\v";
6126 case '\010': return "\\b";
6127 case '\007': return "\\a";
6128 case '\033': return "\\e";
6129 case '\x7f': return "\\c?";
6130 }
6131 return NULL;
6132}
6133
6134VALUE
/*
 * NOTE(review): the embedded listing numbers 6135 (function name and
 * parameter list) and 6181 (one statement before the final return) are
 * absent.  The body reads a string value named `str`.  Restore both lines
 * from upstream before compiling.
 */
6136{
 /*
  * Returns a new string in which the characters of str are either copied
  * verbatim or replaced by an escape sequence.  `prev` marks the start of
  * the pending run of bytes that will be copied unchanged; the run is
  * flushed into `result` whenever an escape has to be emitted.
  */
6137 int encidx = ENCODING_GET(str);
6138 rb_encoding *enc = rb_enc_from_index(encidx);
6139 const char *p = RSTRING_PTR(str);
6140 const char *pend = RSTRING_END(str);
6141 const char *prev = p;
6142 char buf[CHAR_ESC_LEN + 1];
6143 VALUE result = rb_str_buf_new(0);
6144 int unicode_p = rb_enc_unicode_p(enc);
6145 int asciicompat = rb_enc_asciicompat(enc);
6146
6147 while (p < pend) {
6148 unsigned int c;
6149 const char *cc;
6150 int n = rb_enc_precise_mbclen(p, pend, enc);
6151 if (!MBCLEN_CHARFOUND_P(n)) {
 /* no valid character here: emit up to rb_enc_mbminlen() bytes as \xXX */
6152 if (p > prev) str_buf_cat(result, prev, p - prev);
6153 n = rb_enc_mbminlen(enc);
6154 if (pend < p + n)
6155 n = (int)(pend - p);
6156 while (n--) {
6157 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
6158 str_buf_cat(result, buf, strlen(buf));
6159 prev = ++p;
6160 }
6161 continue;
6162 }
6163 n = MBCLEN_CHARFOUND_LEN(n);
6164 c = rb_enc_mbc_to_codepoint(p, pend, enc);
 /* p now points past the character; p - n is where it started */
6165 p += n;
6166 cc = ruby_escaped_char(c);
6167 if (cc) {
6168 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
6169 str_buf_cat(result, cc, strlen(cc));
6170 prev = p;
6171 }
6172 else if (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c)) {
 /* printable ASCII: left in the pending run and copied later */
6173 }
6174 else {
6175 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
6176 rb_str_buf_cat_escaped_char(result, c, unicode_p);
6177 prev = p;
6178 }
6179 }
 /* flush the final pending run */
6180 if (p > prev) str_buf_cat(result, prev, p - prev);
6182
6183 return result;
6184}
6185
6186/*
6187 * call-seq:
6188 * str.inspect -> string
6189 *
6190 * Returns a printable version of _str_, surrounded by quote marks,
6191 * with special characters escaped.
6192 *
6193 * str = "hello"
6194 * str[3] = "\b"
6195 * str.inspect #=> "\"hel\\bo\""
6196 */
6197
6198VALUE
/*
 * NOTE(review): the embedded listing numbers 6199 (function name and
 * parameter list), 6206 (a declaration; `resenc` is used below without a
 * visible one) and 6246 (one operand of the `#` look-ahead condition) are
 * absent.  Restore them from upstream before compiling.
 */
6200{
 /*
  * Builds a double-quoted, escaped rendering of str in `result`.  `prev`
  * marks the start of the pending run of bytes that will be copied
  * unchanged; the run is flushed whenever an escape has to be emitted.
  */
6201 int encidx = ENCODING_GET(str);
6202 rb_encoding *enc = rb_enc_from_index(encidx), *actenc;
6203 const char *p, *pend, *prev;
6204 char buf[CHAR_ESC_LEN + 1];
6205 VALUE result = rb_str_buf_new(0);
6207 int unicode_p = rb_enc_unicode_p(enc);
6208 int asciicompat = rb_enc_asciicompat(enc);
6209
 /* result encoding: fall back to the default external encoding, and to
  * US-ASCII when the chosen one is not ASCII compatible */
6210 if (resenc == NULL) resenc = rb_default_external_encoding();
6211 if (!rb_enc_asciicompat(resenc)) resenc = rb_usascii_encoding();
6212 rb_enc_associate(result, resenc);
6213 str_buf_cat2(result, "\"");
6214
6215 p = RSTRING_PTR(str); pend = RSTRING_END(str);
6216 prev = p;
6217 actenc = get_actual_encoding(encidx, str);
6218 if (actenc != enc) {
6219 enc = actenc;
6220 if (unicode_p) unicode_p = rb_enc_unicode_p(enc);
6221 }
6222 while (p < pend) {
6223 unsigned int c, cc;
6224 int n;
6225
6226 n = rb_enc_precise_mbclen(p, pend, enc);
6227 if (!MBCLEN_CHARFOUND_P(n)) {
 /* no valid character here: emit up to rb_enc_mbminlen() bytes as \xXX */
6228 if (p > prev) str_buf_cat(result, prev, p - prev);
6229 n = rb_enc_mbminlen(enc);
6230 if (pend < p + n)
6231 n = (int)(pend - p);
6232 while (n--) {
6233 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
6234 str_buf_cat(result, buf, strlen(buf));
6235 prev = ++p;
6236 }
6237 continue;
6238 }
6239 n = MBCLEN_CHARFOUND_LEN(n);
6240 c = rb_enc_mbc_to_codepoint(p, pend, enc);
 /* p now points past the character; p - n is where it started */
6241 p += n;
 /* a double quote, a backslash, or a '#' followed by '$', '@' or '{'
  * gets a backslash put in front of it */
6242 if ((asciicompat || unicode_p) &&
6243 (c == '"'|| c == '\\' ||
6244 (c == '#' &&
6245 p < pend &&
6247 (cc = rb_enc_codepoint(p,pend,enc),
6248 (cc == '$' || cc == '@' || cc == '{'))))) {
6249 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
6250 str_buf_cat2(result, "\\");
6251 if (asciicompat || enc == resenc) {
 /* restart the pending run at the character itself so it is copied
  * verbatim right after the backslash */
6252 prev = p - n;
6253 continue;
6254 }
6255 }
 /* control characters that have a one-letter escape */
6256 switch (c) {
6257 case '\n': cc = 'n'; break;
6258 case '\r': cc = 'r'; break;
6259 case '\t': cc = 't'; break;
6260 case '\f': cc = 'f'; break;
6261 case '\013': cc = 'v'; break;
6262 case '\010': cc = 'b'; break;
6263 case '\007': cc = 'a'; break;
6264 case 033: cc = 'e'; break;
6265 default: cc = 0; break;
6266 }
6267 if (cc) {
6268 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
6269 buf[0] = '\\';
6270 buf[1] = (char)cc;
6271 str_buf_cat(result, buf, 2);
6272 prev = p;
6273 continue;
6274 }
 /* printable characters stay in the pending run; all others are
  * written through rb_str_buf_cat_escaped_char() */
6275 if ((enc == resenc && rb_enc_isprint(c, enc)) ||
6276 (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c))) {
6277 continue;
6278 }
6279 else {
6280 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
6281 rb_str_buf_cat_escaped_char(result, c, unicode_p);
6282 prev = p;
6283 continue;
6284 }
6285 }
 /* flush the final pending run and close the quote */
6286 if (p > prev) str_buf_cat(result, prev, p - prev);
6287 str_buf_cat2(result, "\"");
6288
6289 return result;
6290}
6291
/*
 * True when p lies before e and the byte at p is '$', '@' or '{'.
 * Note that p is evaluated more than once, so pass an expression without
 * side effects.
 */
6292#define IS_EVSTR(p,e) ((p) < (e) && (*(p) == '$' || *(p) == '@' || *(p) == '{'))
6293
6294/*
6295 * call-seq:
6296 * str.dump -> new_str
6297 *
6298 * Returns a quoted version of the string with all non-printing characters
6299 * replaced by <code>\xHH</code> notation and all special characters escaped.
6300 *
6301 * This method can be used for round-trip: if the resulting +new_str+ is
6302 * eval'ed, it will produce the original string.
6303 *
6304 * "hello \n ''".dump #=> "\"hello \\n ''\""
6305 * "\f\x00\xff\\\"".dump #=> "\"\\f\\x00\\xFF\\\\\\\"\""
6306 *
6307 * See also String#undump.
6308 */
6309
6310VALUE
6312{
6313 int encidx = rb_enc_get_index(str);
6314 rb_encoding *enc = rb_enc_from_index(encidx);
6315 long len;
6316 const char *p, *pend;
6317 char *q, *qend;
6318 VALUE result;
6319 int u8 = (encidx == rb_utf8_encindex());
6320 static const char nonascii_suffix[] = ".dup.force_encoding(\"%s\")";
6321
6322 len = 2; /* "" */
6323 if (!rb_enc_asciicompat(enc)) {
6324 len += strlen(nonascii_suffix) - rb_strlen_lit("%s");
6325 len += strlen(enc->name);
6326 }
6327
6328 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
6329 while (p < pend) {
6330 int clen;
6331 unsigned char c = *p++;
6332
6333 switch (c) {
6334 case '"': case '\\':
6335 case '\n': case '\r':
6336 case '\t': case '\f':
6337 case '\013': case '\010': case '\007': case '\033':
6338 clen = 2;
6339 break;
6340
6341 case '#':
6342 clen = IS_EVSTR(p, pend) ? 2 : 1;
6343 break;
6344
6345 default:
6346 if (ISPRINT(c)) {
6347 clen = 1;
6348 }
6349 else {
6350 if (u8 && c > 0x7F) { /* \u notation */
6351 int n = rb_enc_precise_mbclen(p-1, pend, enc);
6352 if (MBCLEN_CHARFOUND_P(n)) {
6353 unsigned int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
6354 if (cc <= 0xFFFF)
6355 clen = 6; /* \uXXXX */
6356 else if (cc <= 0xFFFFF)
6357 clen = 9; /* \u{XXXXX} */
6358 else
6359 clen = 10; /* \u{XXXXXX} */
6360 p += MBCLEN_CHARFOUND_LEN(n)-1;
6361 break;
6362 }
6363 }
6364 clen = 4; /* \xNN */
6365 }
6366 break;
6367 }
6368
6369 if (clen > LONG_MAX - len) {
6370 rb_raise(rb_eRuntimeError, "string size too big");
6371 }
6372 len += clen;
6373 }
6374
6375 result = rb_str_new(0, len);
6376 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
6377 q = RSTRING_PTR(result); qend = q + len + 1;
6378
6379 *q++ = '"';
6380 while (p < pend) {
6381 unsigned char c = *p++;
6382
6383 if (c == '"' || c == '\\') {
6384 *q++ = '\\';
6385 *q++ = c;
6386 }
6387 else if (c == '#') {
6388 if (IS_EVSTR(p, pend)) *q++ = '\\';
6389 *q++ = '#';
6390 }
6391 else if (c == '\n') {
6392 *q++ = '\\';
6393 *q++ = 'n';
6394 }
6395 else if (c == '\r') {
6396 *q++ = '\\';
6397 *q++ = 'r';
6398 }
6399 else if (c == '\t') {
6400 *q++ = '\\';
6401 *q++ = 't';
6402 }
6403 else if (c == '\f') {
6404 *q++ = '\\';
6405 *q++ = 'f';
6406 }
6407 else if (c == '\013') {
6408 *q++ = '\\';
6409 *q++ = 'v';
6410 }
6411 else if (c == '\010') {
6412 *q++ = '\\';
6413 *q++ = 'b';
6414 }
6415 else if (c == '\007') {
6416 *q++ = '\\';
6417 *q++ = 'a';
6418 }
6419 else if (c == '\033') {
6420 *q++ = '\\';
6421 *q++ = 'e';
6422 }
6423 else if (ISPRINT(c)) {
6424 *q++ = c;
6425 }
6426 else {
6427 *q++ = '\\';
6428 if (u8) {
6429 int n = rb_enc_precise_mbclen(p-1, pend, enc) - 1;
6430 if (MBCLEN_CHARFOUND_P(n)) {
6431 int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
6432 p += n;
6433 if (cc <= 0xFFFF)
6434 snprintf(q, qend-q, "u%04X", cc); /* \uXXXX */
6435 else
6436 snprintf(q, qend-q, "u{%X}", cc); /* \u{XXXXX} or \u{XXXXXX} */
6437 q += strlen(q);
6438 continue;
6439 }
6440 }
6441 snprintf(q, qend-q, "x%02X", c);
6442 q += 3;
6443 }
6444 }
6445 *q++ = '"';
6446 *q = '\0';
6447 if (!rb_enc_asciicompat(enc)) {
6448 snprintf(q, qend-q, nonascii_suffix, enc->name);
6449 encidx = rb_ascii8bit_encindex();
6450 }
6451 /* result from dump is ASCII */
6452 rb_enc_associate_index(result, encidx);
6454 return result;
6455}
6456
6457static int
6458unescape_ascii(unsigned int c)
6459{
6460 switch (c) {
6461 case 'n':
6462 return '\n';
6463 case 'r':
6464 return '\r';
6465 case 't':
6466 return '\t';
6467 case 'f':
6468 return '\f';
6469 case 'v':
6470 return '\13';
6471 case 'b':
6472 return '\010';
6473 case 'a':
6474 return '\007';
6475 case 'e':
6476 return 033;
6477 }
6479}
6480
6481static void
6482undump_after_backslash(VALUE undumped, const char **ss, const char *s_end, rb_encoding **penc, bool *utf8, bool *binary)
6483{
6484 const char *s = *ss;
6485 unsigned int c;
6486 int codelen;
6487 size_t hexlen;
6488 unsigned char buf[6];
6489 static rb_encoding *enc_utf8 = NULL;
6490
6491 switch (*s) {
6492 case '\\':
6493 case '"':
6494 case '#':
6495 rb_str_cat(undumped, s, 1); /* cat itself */
6496 s++;
6497 break;
6498 case 'n':
6499 case 'r':
6500 case 't':
6501 case 'f':
6502 case 'v':
6503 case 'b':
6504 case 'a':
6505 case 'e':
6506 *buf = unescape_ascii(*s);
6507 rb_str_cat(undumped, (char *)buf, 1);
6508 s++;
6509 break;
6510 case 'u':
6511 if (*binary) {
6512 rb_raise(rb_eRuntimeError, "hex escape and Unicode escape are mixed");
6513 }
6514 *utf8 = true;
6515 if (++s >= s_end) {
6516 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
6517 }
6518 if (enc_utf8 == NULL) enc_utf8 = rb_utf8_encoding();
6519 if (*penc != enc_utf8) {
6520 *penc = enc_utf8;
6521 rb_enc_associate(undumped, enc_utf8);
6522 }
6523 if (*s == '{') { /* handle \u{...} form */
6524 s++;
6525 for (;;) {
6526 if (s >= s_end) {
6527 rb_raise(rb_eRuntimeError, "unterminated Unicode escape");
6528 }
6529 if (*s == '}') {
6530 s++;
6531 break;
6532 }
6533 if (ISSPACE(*s)) {
6534 s++;
6535 continue;
6536 }
6537 c = scan_hex(s, s_end-s, &hexlen);
6538 if (hexlen == 0 || hexlen > 6) {
6539 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
6540 }
6541 if (c > 0x10ffff) {
6542 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint (too large)");
6543 }
6544 if (0xd800 <= c && c <= 0xdfff) {
6545 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
6546 }
6547 codelen = rb_enc_mbcput(c, (char *)buf, *penc);
6548 rb_str_cat(undumped, (char *)buf, codelen);
6549 s += hexlen;
6550 }
6551 }
6552 else { /* handle \uXXXX form */
6553 c = scan_hex(s, 4, &hexlen);
6554 if (hexlen != 4) {
6555 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
6556 }
6557 if (0xd800 <= c && c <= 0xdfff) {
6558 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
6559 }
6560 codelen = rb_enc_mbcput(c, (char *)buf, *penc);
6561 rb_str_cat(undumped, (char *)buf, codelen);
6562 s += hexlen;
6563 }
6564 break;
6565 case 'x':
6566 if (*utf8) {
6567 rb_raise(rb_eRuntimeError, "hex escape and Unicode escape are mixed");
6568 }
6569 *binary = true;
6570 if (++s >= s_end) {
6571 rb_raise(rb_eRuntimeError, "invalid hex escape");
6572 }
6573 *buf = scan_hex(s, 2, &hexlen);
6574 if (hexlen != 2) {
6575 rb_raise(rb_eRuntimeError, "invalid hex escape");
6576 }
6577 rb_str_cat(undumped, (char *)buf, 1);
6578 s += hexlen;
6579 break;
6580 default:
6581 rb_str_cat(undumped, s-1, 2);
6582 s++;
6583 }
6584
6585 *ss = s;
6586}
6587
6588static VALUE rb_str_is_ascii_only_p(VALUE str);
6589
6590/*
6591 * call-seq:
6592 * str.undump -> new_str
6593 *
6594 * Returns an unescaped version of the string.
6595 * This does the inverse of String#dump.
6596 *
6597 * "\"hello \\n ''\"".undump #=> "hello \n ''"
6598 */
6599
6600static VALUE
6601str_undump(VALUE str)
6602{
6603 const char *s = RSTRING_PTR(str);
6604 const char *s_end = RSTRING_END(str);
6605 rb_encoding *enc = rb_enc_get(str);
6606 VALUE undumped = rb_enc_str_new(s, 0L, enc);
6607 bool utf8 = false;
6608 bool binary = false;
6609 int w;
6610
6612 if (rb_str_is_ascii_only_p(str) == Qfalse) {
6613 rb_raise(rb_eRuntimeError, "non-ASCII character detected");
6614 }
6615 if (!str_null_check(str, &w)) {
6616 rb_raise(rb_eRuntimeError, "string contains null byte");
6617 }
6618 if (RSTRING_LEN(str) < 2) goto invalid_format;
6619 if (*s != '"') goto invalid_format;
6620
6621 /* strip '"' at the start */
6622 s++;
6623
6624 for (;;) {
6625 if (s >= s_end) {
6626 rb_raise(rb_eRuntimeError, "unterminated dumped string");
6627 }
6628
6629 if (*s == '"') {
6630 /* epilogue */
6631 s++;
6632 if (s == s_end) {
6633 /* ascii compatible dumped string */
6634 break;
6635 }
6636 else {
6637 static const char force_encoding_suffix[] = ".force_encoding(\""; /* "\")" */
6638 static const char dup_suffix[] = ".dup";
6639 const char *encname;
6640 int encidx;
6641 ptrdiff_t size;
6642
6643 /* check separately for strings dumped by older versions */
6644 size = sizeof(dup_suffix) - 1;
6645 if (s_end - s > size && memcmp(s, dup_suffix, size) == 0) s += size;
6646
6647 size = sizeof(force_encoding_suffix) - 1;
6648 if (s_end - s <= size) goto invalid_format;
6649 if (memcmp(s, force_encoding_suffix, size) != 0) goto invalid_format;
6650 s += size;
6651
6652 if (utf8) {
6653 rb_raise(rb_eRuntimeError, "dumped string contained Unicode escape but used force_encoding");
6654 }
6655
6656 encname = s;
6657 s = memchr(s, '"', s_end-s);
6658 size = s - encname;
6659 if (!s) goto invalid_format;
6660 if (s_end - s != 2) goto invalid_format;
6661 if (s[0] != '"' || s[1] != ')') goto invalid_format;
6662
6663 encidx = rb_enc_find_index2(encname, (long)size);
6664 if (encidx < 0) {
6665 rb_raise(rb_eRuntimeError, "dumped string has unknown encoding name");
6666 }
6667 rb_enc_associate_index(undumped, encidx);
6668 }
6669 break;
6670 }
6671
6672 if (*s == '\\') {
6673 s++;
6674 if (s >= s_end) {
6675 rb_raise(rb_eRuntimeError, "invalid escape");
6676 }
6677 undump_after_backslash(undumped, &s, s_end, &enc, &utf8, &binary);
6678 }
6679 else {
6680 rb_str_cat(undumped, s++, 1);
6681 }
6682 }
6683
6684 return undumped;
6685invalid_format:
6686 rb_raise(rb_eRuntimeError, "invalid dumped string; not wrapped with '\"' nor '\"...\".force_encoding(\"...\")' form");
6687}
6688
6689static void
6690rb_str_check_dummy_enc(rb_encoding *enc)
6691{
6692 if (rb_enc_dummy_p(enc)) {
6693 rb_raise(rb_eEncCompatError, "incompatible encoding with this operation: %s",
6694 rb_enc_name(enc));
6695 }
6696}
6697
6698static rb_encoding *
6699str_true_enc(VALUE str)
6700{
6701 rb_encoding *enc = STR_ENC_GET(str);
6702 rb_str_check_dummy_enc(enc);
6703 return enc;
6704}
6705
6706static OnigCaseFoldType
6707check_case_options(int argc, VALUE *argv, OnigCaseFoldType flags)
6708{
6709 if (argc==0)
6710 return flags;
6711 if (argc>2)
6712 rb_raise(rb_eArgError, "too many options");
6713 if (argv[0]==sym_turkic) {
6715 if (argc==2) {
6716 if (argv[1]==sym_lithuanian)
6718 else
6719 rb_raise(rb_eArgError, "invalid second option");
6720 }
6721 }
6722 else if (argv[0]==sym_lithuanian) {
6724 if (argc==2) {
6725 if (argv[1]==sym_turkic)
6727 else
6728 rb_raise(rb_eArgError, "invalid second option");
6729 }
6730 }
6731 else if (argc>1)
6732 rb_raise(rb_eArgError, "too many options");
6733 else if (argv[0]==sym_ascii)
6734 flags |= ONIGENC_CASE_ASCII_ONLY;
6735 else if (argv[0]==sym_fold) {
6738 else
6739 rb_raise(rb_eArgError, "option :fold only allowed for downcasing");
6740 }
6741 else
6742 rb_raise(rb_eArgError, "invalid option");
6743 return flags;
6744}
6745
6746static inline bool
6747case_option_single_p(OnigCaseFoldType flags, rb_encoding *enc, VALUE str)
6748{
6749 if ((flags & ONIGENC_CASE_ASCII_ONLY) && (enc==rb_utf8_encoding() || rb_enc_mbmaxlen(enc) == 1))
6750 return true;
6752}
6753
6754/* 16 should be long enough to absorb any kind of single character length increase */
6755#define CASE_MAPPING_ADDITIONAL_LENGTH 20
6756#ifndef CASEMAP_DEBUG
6757# define CASEMAP_DEBUG 0
6758#endif
6759
6760struct mapping_buffer;
6761typedef struct mapping_buffer {
6762 size_t capa;
6763 size_t used;
6767
6768static void
6769mapping_buffer_free(void *p)
6770{
6771 mapping_buffer *previous_buffer;
6772 mapping_buffer *current_buffer = p;
6773 while (current_buffer) {
6774 previous_buffer = current_buffer;
6775 current_buffer = current_buffer->next;
6776 ruby_sized_xfree(previous_buffer, previous_buffer->capa);
6777 }
6778}
6779
6780static const rb_data_type_t mapping_buffer_type = {
6781 "mapping_buffer",
6782 {0, mapping_buffer_free,}
6783};
6784
6785static VALUE
6786rb_str_casemap(VALUE source, OnigCaseFoldType *flags, rb_encoding *enc)
6787{
6788 VALUE target;
6789
6790 const OnigUChar *source_current, *source_end;
6791 int target_length = 0;
6792 VALUE buffer_anchor;
6793 mapping_buffer *current_buffer = 0;
6794 mapping_buffer **pre_buffer;
6795 size_t buffer_count = 0;
6796 int buffer_length_or_invalid;
6797
6798 if (RSTRING_LEN(source) == 0) return str_duplicate(rb_cString, source);
6799
6800 source_current = (OnigUChar*)RSTRING_PTR(source);
6801 source_end = (OnigUChar*)RSTRING_END(source);
6802
6803 buffer_anchor = TypedData_Wrap_Struct(0, &mapping_buffer_type, 0);
6804 pre_buffer = (mapping_buffer **)&DATA_PTR(buffer_anchor);
6805 while (source_current < source_end) {
6806 /* increase multiplier using buffer count to converge quickly */
6807 size_t capa = (size_t)(source_end-source_current)*++buffer_count + CASE_MAPPING_ADDITIONAL_LENGTH;
6808 if (CASEMAP_DEBUG) {
6809 fprintf(stderr, "Buffer allocation, capa is %"PRIuSIZE"\n", capa); /* for tuning */
6810 }
6811 current_buffer = xmalloc(offsetof(mapping_buffer, space) + capa);
6812 *pre_buffer = current_buffer;
6813 pre_buffer = &current_buffer->next;
6814 current_buffer->next = NULL;
6815 current_buffer->capa = capa;
6816 buffer_length_or_invalid = enc->case_map(flags,
6817 (const OnigUChar**)&source_current, source_end,
6818 current_buffer->space,
6819 current_buffer->space+current_buffer->capa,
6820 enc);
6821 if (buffer_length_or_invalid < 0) {
6822 current_buffer = DATA_PTR(buffer_anchor);
6823 DATA_PTR(buffer_anchor) = 0;
6824 mapping_buffer_free(current_buffer);
6825 rb_raise(rb_eArgError, "input string invalid");
6826 }
6827 target_length += current_buffer->used = buffer_length_or_invalid;
6828 }
6829 if (CASEMAP_DEBUG) {
6830 fprintf(stderr, "Buffer count is %"PRIuSIZE"\n", buffer_count); /* for tuning */
6831 }
6832
6833 if (buffer_count==1) {
6834 target = rb_str_new((const char*)current_buffer->space, target_length);
6835 }
6836 else {
6837 char *target_current;
6838
6839 target = rb_str_new(0, target_length);
6840 target_current = RSTRING_PTR(target);
6841 current_buffer = DATA_PTR(buffer_anchor);
6842 while (current_buffer) {
6843 memcpy(target_current, current_buffer->space, current_buffer->used);
6844 target_current += current_buffer->used;
6845 current_buffer = current_buffer->next;
6846 }
6847 }
6848 current_buffer = DATA_PTR(buffer_anchor);
6849 DATA_PTR(buffer_anchor) = 0;
6850 mapping_buffer_free(current_buffer);
6851
6852 /* TODO: check about string terminator character */
6853 str_enc_copy(target, source);
6854 /*ENC_CODERANGE_SET(mapped, cr);*/
6855
6856 return target;
6857}
6858
6859static VALUE
6860rb_str_ascii_casemap(VALUE source, VALUE target, OnigCaseFoldType *flags, rb_encoding *enc)
6861{
6862 const OnigUChar *source_current, *source_end;
6863 OnigUChar *target_current, *target_end;
6864 long old_length = RSTRING_LEN(source);
6865 int length_or_invalid;
6866
6867 if (old_length == 0) return Qnil;
6868
6869 source_current = (OnigUChar*)RSTRING_PTR(source);
6870 source_end = (OnigUChar*)RSTRING_END(source);
6871 if (source == target) {
6872 target_current = (OnigUChar*)source_current;
6873 target_end = (OnigUChar*)source_end;
6874 }
6875 else {
6876 target_current = (OnigUChar*)RSTRING_PTR(target);
6877 target_end = (OnigUChar*)RSTRING_END(target);
6878 }
6879
6880 length_or_invalid = onigenc_ascii_only_case_map(flags,
6881 &source_current, source_end,
6882 target_current, target_end, enc);
6883 if (length_or_invalid < 0)
6884 rb_raise(rb_eArgError, "input string invalid");
6885 if (CASEMAP_DEBUG && length_or_invalid != old_length) {
6886 fprintf(stderr, "problem with rb_str_ascii_casemap"
6887 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
6888 rb_raise(rb_eArgError, "internal problem with rb_str_ascii_casemap"
6889 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
6890 }
6891
6892 str_enc_copy(target, source);
6893
6894 return target;
6895}
6896
6897static bool
6898upcase_single(VALUE str)
6899{
6900 char *s = RSTRING_PTR(str), *send = RSTRING_END(str);
6901 bool modified = false;
6902
6903 while (s < send) {
6904 unsigned int c = *(unsigned char*)s;
6905
6906 if (rb_enc_isascii(c, enc) && 'a' <= c && c <= 'z') {
6907 *s = 'A' + (c - 'a');
6908 modified = true;
6909 }
6910 s++;
6911 }
6912 return modified;
6913}
6914
6915/*
6916 * call-seq:
6917 * str.upcase! -> str or nil
6918 * str.upcase!([options]) -> str or nil
6919 *
6920 * Upcases the contents of <i>str</i>, returning <code>nil</code> if no changes
6921 * were made.
6922 *
6923 * See String#downcase for meaning of +options+ and use with different encodings.
6924 */
6925
6926static VALUE
6927rb_str_upcase_bang(int argc, VALUE *argv, VALUE str)
6928{
6929 rb_encoding *enc;
6931
6932 flags = check_case_options(argc, argv, flags);
6933 str_modify_keep_cr(str);
6934 enc = str_true_enc(str);
6935 if (case_option_single_p(flags, enc, str)) {
6936 if (upcase_single(str))
6937 flags |= ONIGENC_CASE_MODIFIED;
6938 }
6939 else if (flags&ONIGENC_CASE_ASCII_ONLY)
6940 rb_str_ascii_casemap(str, str, &flags, enc);
6941 else
6942 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
6943
6944 if (ONIGENC_CASE_MODIFIED&flags) return str;
6945 return Qnil;
6946}
6947
6948
6949/*
6950 * call-seq:
6951 * str.upcase -> new_str
6952 * str.upcase([options]) -> new_str
6953 *
6954 * Returns a copy of <i>str</i> with all lowercase letters replaced with their
6955 * uppercase counterparts.
6956 *
6957 * See String#downcase for meaning of +options+ and use with different encodings.
6958 *
6959 * "hEllO".upcase #=> "HELLO"
6960 */
6961
6962static VALUE
6963rb_str_upcase(int argc, VALUE *argv, VALUE str)
6964{
6965 rb_encoding *enc;
6967 VALUE ret;
6968
6969 flags = check_case_options(argc, argv, flags);
6970 enc = str_true_enc(str);
6971 if (case_option_single_p(flags, enc, str)) {
6973 str_enc_copy(ret, str);
6974 upcase_single(ret);
6975 }
6976 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
6977 ret = rb_str_new(0, RSTRING_LEN(str));
6978 rb_str_ascii_casemap(str, ret, &flags, enc);
6979 }
6980 else {
6981 ret = rb_str_casemap(str, &flags, enc);
6982 }
6983
6984 return ret;
6985}
6986
6987static bool
6988downcase_single(VALUE str)
6989{
6990 char *s = RSTRING_PTR(str), *send = RSTRING_END(str);
6991 bool modified = false;
6992
6993 while (s < send) {
6994 unsigned int c = *(unsigned char*)s;
6995
6996 if (rb_enc_isascii(c, enc) && 'A' <= c && c <= 'Z') {
6997 *s = 'a' + (c - 'A');
6998 modified = true;
6999 }
7000 s++;
7001 }
7002
7003 return modified;
7004}
7005
7006/*
7007 * call-seq:
7008 * str.downcase! -> str or nil
7009 * str.downcase!([options]) -> str or nil
7010 *
7011 * Downcases the contents of <i>str</i>, returning <code>nil</code> if no
7012 * changes were made.
7013 *
7014 * See String#downcase for meaning of +options+ and use with different encodings.
7015 */
7016
7017static VALUE
7018rb_str_downcase_bang(int argc, VALUE *argv, VALUE str)
7019{
7020 rb_encoding *enc;
7022
7023 flags = check_case_options(argc, argv, flags);
7024 str_modify_keep_cr(str);
7025 enc = str_true_enc(str);
7026 if (case_option_single_p(flags, enc, str)) {
7027 if (downcase_single(str))
7028 flags |= ONIGENC_CASE_MODIFIED;
7029 }
7030 else if (flags&ONIGENC_CASE_ASCII_ONLY)
7031 rb_str_ascii_casemap(str, str, &flags, enc);
7032 else
7033 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
7034
7035 if (ONIGENC_CASE_MODIFIED&flags) return str;
7036 return Qnil;
7037}
7038
7039
7040/*
7041 * call-seq:
7042 * str.downcase -> new_str
7043 * str.downcase([options]) -> new_str
7044 *
7045 * Returns a copy of <i>str</i> with all uppercase letters replaced with their
7046 * lowercase counterparts. Which letters exactly are replaced, and by which
7047 * other letters, depends on the presence or absence of options, and on the
7048 * +encoding+ of the string.
7049 *
7050 * The meaning of the +options+ is as follows:
7051 *
7052 * No option ::
7053 * Full Unicode case mapping, suitable for most languages
7054 * (see :turkic and :lithuanian options below for exceptions).
7055 * Context-dependent case mapping as described in Table 3-14 of the
7056 * Unicode standard is currently not supported.
7057 * :ascii ::
7058 * Only the ASCII region, i.e. the characters ``A'' to ``Z'' and
7059 * ``a'' to ``z'', are affected.
7060 * This option cannot be combined with any other option.
7061 * :turkic ::
7062 * Full Unicode case mapping, adapted for Turkic languages
7063 * (Turkish, Azerbaijani, ...). This means that upper case I is mapped to
7064 * lower case dotless i, and so on.
7065 * :lithuanian ::
7066 * Currently, just full Unicode case mapping. In the future, full Unicode
7067 * case mapping adapted for Lithuanian (keeping the dot on the lower case
7068 * i even if there is an accent on top).
7069 * :fold ::
7070 * Only available on +downcase+ and +downcase!+. Unicode case <b>folding</b>,
7071 * which is more far-reaching than Unicode case mapping.
7072 * This option currently cannot be combined with any other option
7073 * (i.e. there is currently no variant for turkic languages).
7074 *
7075 * Please note that several assumptions that are valid for ASCII-only case
7076 * conversions do not hold for more general case conversions. For example,
7077 * the length of the result may not be the same as the length of the input
7078 * (neither in characters nor in bytes), some roundtrip assumptions
7079 * (e.g. str.downcase == str.upcase.downcase) may not apply, and Unicode
7080 * normalization (i.e. String#unicode_normalize) is not necessarily maintained
7081 * by case mapping operations.
7082 *
7083 * Non-ASCII case mapping/folding is currently supported for UTF-8,
7084 * UTF-16BE/LE, UTF-32BE/LE, and ISO-8859-1~16 Strings/Symbols.
7085 * This support will be extended to other encodings.
7086 *
7087 * "hEllO".downcase #=> "hello"
7088 */
7089
7090static VALUE
7091rb_str_downcase(int argc, VALUE *argv, VALUE str)
7092{
7093 rb_encoding *enc;
7095 VALUE ret;
7096
7097 flags = check_case_options(argc, argv, flags);
7098 enc = str_true_enc(str);
7099 if (case_option_single_p(flags, enc, str)) {
7101 str_enc_copy(ret, str);
7102 downcase_single(ret);
7103 }
7104 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
7105 ret = rb_str_new(0, RSTRING_LEN(str));
7106 rb_str_ascii_casemap(str, ret, &flags, enc);
7107 }
7108 else {
7109 ret = rb_str_casemap(str, &flags, enc);
7110 }
7111
7112 return ret;
7113}
7114
7115
7116/*
7117 * call-seq:
7118 * str.capitalize! -> str or nil
7119 * str.capitalize!([options]) -> str or nil
7120 *
7121 * Modifies <i>str</i> by converting the first character to uppercase and the
7122 * remainder to lowercase. Returns <code>nil</code> if no changes are made.
7123 * There is an exception for modern Georgian (mkhedruli/MTAVRULI), where
7124 * the result is the same as for String#downcase, to avoid mixed case.
7125 *
7126 * See String#downcase for meaning of +options+ and use with different encodings.
7127 *
7128 * a = "hello"
7129 * a.capitalize! #=> "Hello"
7130 * a #=> "Hello"
7131 * a.capitalize! #=> nil
7132 */
7133
7134static VALUE
7135rb_str_capitalize_bang(int argc, VALUE *argv, VALUE str)
7136{
7137 rb_encoding *enc;
7139
7140 flags = check_case_options(argc, argv, flags);
7141 str_modify_keep_cr(str);
7142 enc = str_true_enc(str);
7143 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
7144 if (flags&ONIGENC_CASE_ASCII_ONLY)
7145 rb_str_ascii_casemap(str, str, &flags, enc);
7146 else
7147 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
7148
7149 if (ONIGENC_CASE_MODIFIED&flags) return str;
7150 return Qnil;
7151}
7152
7153
7154/*
7155 * call-seq:
7156 * str.capitalize -> new_str
7157 * str.capitalize([options]) -> new_str
7158 *
7159 * Returns a copy of <i>str</i> with the first character converted to uppercase
7160 * and the remainder to lowercase.
7161 *
7162 * See String#downcase for meaning of +options+ and use with different encodings.
7163 *
7164 * "hello".capitalize #=> "Hello"
7165 * "HELLO".capitalize #=> "Hello"
7166 * "123ABC".capitalize #=> "123abc"
7167 */
7168
7169static VALUE
7170rb_str_capitalize(int argc, VALUE *argv, VALUE str)
7171{
7172 rb_encoding *enc;
7174 VALUE ret;
7175
7176 flags = check_case_options(argc, argv, flags);
7177 enc = str_true_enc(str);
7178 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return str;
7179 if (flags&ONIGENC_CASE_ASCII_ONLY) {
7180 ret = rb_str_new(0, RSTRING_LEN(str));
7181 rb_str_ascii_casemap(str, ret, &flags, enc);
7182 }
7183 else {
7184 ret = rb_str_casemap(str, &flags, enc);
7185 }
7186 return ret;
7187}
7188
7189
7190/*
7191 * call-seq:
7192 * str.swapcase! -> str or nil
7193 * str.swapcase!([options]) -> str or nil
7194 *
7195 * Equivalent to String#swapcase, but modifies the receiver in place,
7196 * returning <i>str</i>, or <code>nil</code> if no changes were made.
7197 *
7198 * See String#downcase for meaning of +options+ and use with
7199 * different encodings.
7200 */
7201
7202static VALUE
7203rb_str_swapcase_bang(int argc, VALUE *argv, VALUE str)
7204{
7205 rb_encoding *enc;
7207
7208 flags = check_case_options(argc, argv, flags);
7209 str_modify_keep_cr(str);
7210 enc = str_true_enc(str);
7211 if (flags&ONIGENC_CASE_ASCII_ONLY)
7212 rb_str_ascii_casemap(str, str, &flags, enc);
7213 else
7214 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
7215
7216 if (ONIGENC_CASE_MODIFIED&flags) return str;
7217 return Qnil;
7218}
7219
7220
7221/*
7222 * call-seq:
7223 * str.swapcase -> new_str
7224 * str.swapcase([options]) -> new_str
7225 *
7226 * Returns a copy of <i>str</i> with uppercase alphabetic characters converted
7227 * to lowercase and lowercase characters converted to uppercase.
7228 *
7229 * See String#downcase for meaning of +options+ and use with different encodings.
7230 *
7231 * "Hello".swapcase #=> "hELLO"
7232 * "cYbEr_PuNk11".swapcase #=> "CyBeR_pUnK11"
7233 */
7234
7235static VALUE
7236rb_str_swapcase(int argc, VALUE *argv, VALUE str)
7237{
7238 rb_encoding *enc;
7240 VALUE ret;
7241
7242 flags = check_case_options(argc, argv, flags);
7243 enc = str_true_enc(str);
7244 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return str_duplicate(rb_cString, str);
7245 if (flags&ONIGENC_CASE_ASCII_ONLY) {
7246 ret = rb_str_new(0, RSTRING_LEN(str));
7247 rb_str_ascii_casemap(str, ret, &flags, enc);
7248 }
7249 else {
7250 ret = rb_str_casemap(str, &flags, enc);
7251 }
7252 return ret;
7253}
7254
7255typedef unsigned char *USTR;
7256
/*
 * Cursor over a tr-style character specification; advanced by trnext().
 */
struct tr {
    int gen;            /* non-zero while the codes of a "c1-c2" range are being generated */
    unsigned int now;   /* code most recently read or generated */
    unsigned int max;   /* last code of the range being generated */
    char *p;            /* next unread byte of the specification */
    char *pend;         /* end of the specification */
};
7262
7263static unsigned int
7264trnext(struct tr *t, rb_encoding *enc)
7265{
7266 int n;
7267
7268 for (;;) {
7269 nextpart:
7270 if (!t->gen) {
7271 if (t->p == t->pend) return -1;
7272 if (rb_enc_ascget(t->p, t->pend, &n, enc) == '\\' && t->p + n < t->pend) {
7273 t->p += n;
7274 }
7275 t->now = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
7276 t->p += n;
7277 if (rb_enc_ascget(t->p, t->pend, &n, enc) == '-' && t->p + n < t->pend) {
7278 t->p += n;
7279 if (t->p < t->pend) {
7280 unsigned int c = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
7281 t->p += n;
7282 if (t->now > c) {
7283 if (t->now < 0x80 && c < 0x80) {
7285 "invalid range \"%c-%c\" in string transliteration",
7286 t->now, c);
7287 }
7288 else {
7289 rb_raise(rb_eArgError, "invalid range in string transliteration");
7290 }
7291 continue; /* not reached */
7292 }
7293 t->gen = 1;
7294 t->max = c;
7295 }
7296 }
7297 return t->now;
7298 }
7299 else {
7300 while (ONIGENC_CODE_TO_MBCLEN(enc, ++t->now) <= 0) {
7301 if (t->now == t->max) {
7302 t->gen = 0;
7303 goto nextpart;
7304 }
7305 }
7306 if (t->now < t->max) {
7307 return t->now;
7308 }
7309 else {
7310 t->gen = 0;
7311 return t->max;
7312 }
7313 }
7314 }
7315}
7316
7317static VALUE rb_str_delete_bang(int,VALUE*,VALUE);
7318
7319static VALUE
7320tr_trans(VALUE str, VALUE src, VALUE repl, int sflag)
7321{
7322 const unsigned int errc = -1;
7323 unsigned int trans[256];
7324 rb_encoding *enc, *e1, *e2;
7325 struct tr trsrc, trrepl;
7326 int cflag = 0;
7327 unsigned int c, c0, last = 0;
7328 int modify = 0, i, l;
7329 unsigned char *s, *send;
7330 VALUE hash = 0;
7331 int singlebyte = single_byte_optimizable(str);
7332 int termlen;
7333 int cr;
7334
7335#define CHECK_IF_ASCII(c) \
7336 (void)((cr == ENC_CODERANGE_7BIT && !rb_isascii(c)) ? \
7337 (cr = ENC_CODERANGE_VALID) : 0)
7338
7339 StringValue(src);
7340 StringValue(repl);
7341 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
7342 if (RSTRING_LEN(repl) == 0) {
7343 return rb_str_delete_bang(1, &src, str);
7344 }
7345
7346 cr = ENC_CODERANGE(str);
7347 e1 = rb_enc_check(str, src);
7348 e2 = rb_enc_check(str, repl);
7349 if (e1 == e2) {
7350 enc = e1;
7351 }
7352 else {
7353 enc = rb_enc_check(src, repl);
7354 }
7355 trsrc.p = RSTRING_PTR(src); trsrc.pend = trsrc.p + RSTRING_LEN(src);
7356 if (RSTRING_LEN(src) > 1 &&
7357 rb_enc_ascget(trsrc.p, trsrc.pend, &l, enc) == '^' &&
7358 trsrc.p + l < trsrc.pend) {
7359 cflag = 1;
7360 trsrc.p += l;
7361 }
7362 trrepl.p = RSTRING_PTR(repl);
7363 trrepl.pend = trrepl.p + RSTRING_LEN(repl);
7364 trsrc.gen = trrepl.gen = 0;
7365 trsrc.now = trrepl.now = 0;
7366 trsrc.max = trrepl.max = 0;
7367
7368 if (cflag) {
7369 for (i=0; i<256; i++) {
7370 trans[i] = 1;
7371 }
7372 while ((c = trnext(&trsrc, enc)) != errc) {
7373 if (c < 256) {
7374 trans[c] = errc;
7375 }
7376 else {
7377 if (!hash) hash = rb_hash_new();
7378 rb_hash_aset(hash, UINT2NUM(c), Qtrue);
7379 }
7380 }
7381 while ((c = trnext(&trrepl, enc)) != errc)
7382 /* retrieve last replacer */;
7383 last = trrepl.now;
7384 for (i=0; i<256; i++) {
7385 if (trans[i] != errc) {
7386 trans[i] = last;
7387 }
7388 }
7389 }
7390 else {
7391 unsigned int r;
7392
7393 for (i=0; i<256; i++) {
7394 trans[i] = errc;
7395 }
7396 while ((c = trnext(&trsrc, enc)) != errc) {
7397 r = trnext(&trrepl, enc);
7398 if (r == errc) r = trrepl.now;
7399 if (c < 256) {
7400 trans[c] = r;
7401 if (rb_enc_codelen(r, enc) != 1) singlebyte = 0;
7402 }
7403 else {
7404 if (!hash) hash = rb_hash_new();
7405 rb_hash_aset(hash, UINT2NUM(c), UINT2NUM(r));
7406 }
7407 }
7408 }
7409
7410 if (cr == ENC_CODERANGE_VALID && rb_enc_asciicompat(e1))
7411 cr = ENC_CODERANGE_7BIT;
7412 str_modify_keep_cr(str);
7413 s = (unsigned char *)RSTRING_PTR(str); send = (unsigned char *)RSTRING_END(str);
7414 termlen = rb_enc_mbminlen(enc);
7415 if (sflag) {
7416 int clen, tlen;
7417 long offset, max = RSTRING_LEN(str);
7418 unsigned int save = -1;
7419 unsigned char *buf = ALLOC_N(unsigned char, max + termlen), *t = buf;
7420
7421 while (s < send) {
7422 int may_modify = 0;
7423
7424 c0 = c = rb_enc_codepoint_len((char *)s, (char *)send, &clen, e1);
7425 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
7426
7427 s += clen;
7428 if (c < 256) {
7429 c = trans[c];
7430 }
7431 else if (hash) {
7432 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
7433 if (NIL_P(tmp)) {
7434 if (cflag) c = last;
7435 else c = errc;
7436 }
7437 else if (cflag) c = errc;
7438 else c = NUM2INT(tmp);
7439 }
7440 else {
7441 c = errc;
7442 }
7443 if (c != (unsigned int)-1) {
7444 if (save == c) {
7445 CHECK_IF_ASCII(c);
7446 continue;
7447 }
7448 save = c;
7449 tlen = rb_enc_codelen(c, enc);
7450 modify = 1;
7451 }
7452 else {
7453 save = -1;
7454 c = c0;
7455 if (enc != e1) may_modify = 1;
7456 }
7457 if ((offset = t - buf) + tlen > max) {
7458 size_t MAYBE_UNUSED(old) = max + termlen;
7459 max = offset + tlen + (send - s);
7460 SIZED_REALLOC_N(buf, unsigned char, max + termlen, old);
7461 t = buf + offset;
7462 }
7463 rb_enc_mbcput(c, t, enc);
7464 if (may_modify && memcmp(s, t, tlen) != 0) {
7465 modify = 1;
7466 }
7467 CHECK_IF_ASCII(c);
7468 t += tlen;
7469 }
7470 if (!STR_EMBED_P(str)) {
7472 }
7473 TERM_FILL((char *)t, termlen);
7474 RSTRING(str)->as.heap.ptr = (char *)buf;
7475 RSTRING(str)->as.heap.len = t - buf;
7477 RSTRING(str)->as.heap.aux.capa = max;
7478 }
7479 else if (rb_enc_mbmaxlen(enc) == 1 || (singlebyte && !hash)) {
7480 while (s < send) {
7481 c = (unsigned char)*s;
7482 if (trans[c] != errc) {
7483 if (!cflag) {
7484 c = trans[c];
7485 *s = c;
7486 modify = 1;
7487 }
7488 else {
7489 *s = last;
7490 modify = 1;
7491 }
7492 }
7493 CHECK_IF_ASCII(c);
7494 s++;
7495 }
7496 }
7497 else {
7498 int clen, tlen;
7499 long offset, max = (long)((send - s) * 1.2);
7500 unsigned char *buf = ALLOC_N(unsigned char, max + termlen), *t = buf;
7501
7502 while (s < send) {
7503 int may_modify = 0;
7504 c0 = c = rb_enc_codepoint_len((char *)s, (char *)send, &clen, e1);
7505 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
7506
7507 if (c < 256) {
7508 c = trans[c];
7509 }
7510 else if (hash) {
7511 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
7512 if (NIL_P(tmp)) {
7513 if (cflag) c = last;
7514 else c = errc;
7515 }
7516 else if (cflag) c = errc;
7517 else c = NUM2INT(tmp);
7518 }
7519 else {
7520 c = cflag ? last : errc;
7521 }
7522 if (c != errc) {
7523 tlen = rb_enc_codelen(c, enc);
7524 modify = 1;
7525 }
7526 else {
7527 c = c0;
7528 if (enc != e1) may_modify = 1;
7529 }
7530 if ((offset = t - buf) + tlen > max) {
7531 size_t MAYBE_UNUSED(old) = max + termlen;
7532 max = offset + tlen + (long)((send - s) * 1.2);
7533 SIZED_REALLOC_N(buf, unsigned char, max + termlen, old);
7534 t = buf + offset;
7535 }
7536 if (s != t) {
7537 rb_enc_mbcput(c, t, enc);
7538 if (may_modify && memcmp(s, t, tlen) != 0) {
7539 modify = 1;
7540 }
7541 }
7542 CHECK_IF_ASCII(c);
7543 s += clen;
7544 t += tlen;
7545 }
7546 if (!STR_EMBED_P(str)) {
7548 }
7549 TERM_FILL((char *)t, termlen);
7550 RSTRING(str)->as.heap.ptr = (char *)buf;
7551 RSTRING(str)->as.heap.len = t - buf;
7553 RSTRING(str)->as.heap.aux.capa = max;
7554 }
7555
7556 if (modify) {
7557 if (cr != ENC_CODERANGE_BROKEN)
7559 rb_enc_associate(str, enc);
7560 return str;
7561 }
7562 return Qnil;
7563}
7564
7565
7566/*
7567 * call-seq:
7568 * str.tr!(from_str, to_str) -> str or nil
7569 *
7570 * Translates <i>str</i> in place, using the same rules as
7571 * String#tr. Returns <i>str</i>, or <code>nil</code> if no changes
7572 * were made.
7573 */
7574
7575static VALUE
7576rb_str_tr_bang(VALUE str, VALUE src, VALUE repl)
7577{
7578 return tr_trans(str, src, repl, 0);
7579}
7580
7581
7582/*
7583 * call-seq:
7584 * str.tr(from_str, to_str) => new_str
7585 *
7586 * Returns a copy of +str+ with the characters in +from_str+ replaced by the
7587 * corresponding characters in +to_str+. If +to_str+ is shorter than
7588 * +from_str+, it is padded with its last character in order to maintain the
7589 * correspondence.
7590 *
7591 * "hello".tr('el', 'ip') #=> "hippo"
7592 * "hello".tr('aeiou', '*') #=> "h*ll*"
7593 * "hello".tr('aeiou', 'AA*') #=> "hAll*"
7594 *
7595 * Both strings may use the <code>c1-c2</code> notation to denote ranges of
7596 * characters, and +from_str+ may start with a <code>^</code>, which denotes
7597 * all characters except those listed.
7598 *
7599 * "hello".tr('a-y', 'b-z') #=> "ifmmp"
7600 * "hello".tr('^aeiou', '*') #=> "*e**o"
7601 *
7602 * The backslash character <code>\</code> can be used to escape
7603 * <code>^</code> or <code>-</code> and is otherwise ignored unless it
7604 * appears at the end of a range or the end of the +from_str+ or +to_str+:
7605 *
7606 * "hello^world".tr("\\^aeiou", "*") #=> "h*ll**w*rld"
7607 * "hello-world".tr("a\\-eo", "*") #=> "h*ll**w*rld"
7608 *
7609 * "hello\r\nworld".tr("\r", "") #=> "hello\nworld"
7610 * "hello\r\nworld".tr("\\r", "") #=> "hello\r\nwold"
7611 * "hello\r\nworld".tr("\\\r", "") #=> "hello\nworld"
7612 *
7613 * "X['\\b']".tr("X\\", "") #=> "['b']"
7614 * "X['\\b']".tr("X-\\]", "") #=> "'b'"
7615 */
7616
7617static VALUE
7618rb_str_tr(VALUE str, VALUE src, VALUE repl)
7619{
7620 str = str_duplicate(rb_cString, str);
7621 tr_trans(str, src, repl, 0);
7622 return str;
7623}
7624
7625#define TR_TABLE_MAX (UCHAR_MAX+1)
7626#define TR_TABLE_SIZE (TR_TABLE_MAX+1)
7627static void
7628tr_setup_table(VALUE str, char stable[TR_TABLE_SIZE], int first,
7629 VALUE *tablep, VALUE *ctablep, rb_encoding *enc)
7630{
7631 const unsigned int errc = -1;
7632 char buf[TR_TABLE_MAX];
7633 struct tr tr;
7634 unsigned int c;
7635 VALUE table = 0, ptable = 0;
7636 int i, l, cflag = 0;
7637
7639 tr.gen = tr.now = tr.max = 0;
7640
7641 if (RSTRING_LEN(str) > 1 && rb_enc_ascget(tr.p, tr.pend, &l, enc) == '^') {
7642 cflag = 1;
7643 tr.p += l;
7644 }
7645 if (first) {
7646 for (i=0; i<TR_TABLE_MAX; i++) {
7647 stable[i] = 1;
7648 }
7649 stable[TR_TABLE_MAX] = cflag;
7650 }
7651 else if (stable[TR_TABLE_MAX] && !cflag) {
7652 stable[TR_TABLE_MAX] = 0;
7653 }
7654 for (i=0; i<TR_TABLE_MAX; i++) {
7655 buf[i] = cflag;
7656 }
7657
7658 while ((c = trnext(&tr, enc)) != errc) {
7659 if (c < TR_TABLE_MAX) {
7660 buf[(unsigned char)c] = !cflag;
7661 }
7662 else {
7663 VALUE key = UINT2NUM(c);
7664
7665 if (!table && (first || *tablep || stable[TR_TABLE_MAX])) {
7666 if (cflag) {
7667 ptable = *ctablep;
7668 table = ptable ? ptable : rb_hash_new();
7669 *ctablep = table;
7670 }
7671 else {
7672 table = rb_hash_new();
7673 ptable = *tablep;
7674 *tablep = table;
7675 }
7676 }
7677 if (table && (!ptable || (cflag ^ !NIL_P(rb_hash_aref(ptable, key))))) {
7678 rb_hash_aset(table, key, Qtrue);
7679 }
7680 }
7681 }
7682 for (i=0; i<TR_TABLE_MAX; i++) {
7683 stable[i] = stable[i] && buf[i];
7684 }
7685 if (!table && !cflag) {
7686 *tablep = 0;
7687 }
7688}
7689
7690
7691static int
7692tr_find(unsigned int c, const char table[TR_TABLE_SIZE], VALUE del, VALUE nodel)
7693{
7694 if (c < TR_TABLE_MAX) {
7695 return table[c] != 0;
7696 }
7697 else {
7698 VALUE v = UINT2NUM(c);
7699
7700 if (del) {
7701 if (!NIL_P(rb_hash_lookup(del, v)) &&
7702 (!nodel || NIL_P(rb_hash_lookup(nodel, v)))) {
7703 return TRUE;
7704 }
7705 }
7706 else if (nodel && !NIL_P(rb_hash_lookup(nodel, v))) {
7707 return FALSE;
7708 }
7709 return table[TR_TABLE_MAX] ? TRUE : FALSE;
7710 }
7711}
7712
7713/*
7714 * call-seq:
7715 * str.delete!([other_str]+) -> str or nil
7716 *
7717 * Performs a <code>delete</code> operation in place, returning <i>str</i>, or
7718 * <code>nil</code> if <i>str</i> was not modified.
7719 */
7720
7721static VALUE
7722rb_str_delete_bang(int argc, VALUE *argv, VALUE str)
7723{
7724 char squeez[TR_TABLE_SIZE];
7725 rb_encoding *enc = 0;
7726 char *s, *send, *t;
7727 VALUE del = 0, nodel = 0;
7728 int modify = 0;
7729 int i, ascompat, cr;
7730
7731 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
7733 for (i=0; i<argc; i++) {
7734 VALUE s = argv[i];
7735
7736 StringValue(s);
7737 enc = rb_enc_check(str, s);
7738 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
7739 }
7740
7741 str_modify_keep_cr(str);
7742 ascompat = rb_enc_asciicompat(enc);
7743 s = t = RSTRING_PTR(str);
7744 send = RSTRING_END(str);
7745 cr = ascompat ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
7746 while (s < send) {
7747 unsigned int c;
7748 int clen;
7749
7750 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
7751 if (squeez[c]) {
7752 modify = 1;
7753 }
7754 else {
7755 if (t != s) *t = c;
7756 t++;
7757 }
7758 s++;
7759 }
7760 else {
7761 c = rb_enc_codepoint_len(s, send, &clen, enc);
7762
7763 if (tr_find(c, squeez, del, nodel)) {
7764 modify = 1;
7765 }
7766 else {
7767 if (t != s) rb_enc_mbcput(c, t, enc);
7768 t += clen;
7770 }
7771 s += clen;
7772 }
7773 }
7777
7778 if (modify) return str;
7779 return Qnil;
7780}
7781
7782
7783/*
7784 * call-seq:
7785 * str.delete([other_str]+) -> new_str
7786 *
7787 * Returns a copy of <i>str</i> with all characters in the intersection of its
7788 * arguments deleted. Uses the same rules for building the set of characters as
7789 * String#count.
7790 *
7791 * "hello".delete "l","lo" #=> "heo"
7792 * "hello".delete "lo" #=> "he"
7793 * "hello".delete "aeiou", "^e" #=> "hell"
7794 * "hello".delete "ej-m" #=> "ho"
7795 */
7796
7797static VALUE
7798rb_str_delete(int argc, VALUE *argv, VALUE str)
7799{
7800 str = str_duplicate(rb_cString, str);
7801 rb_str_delete_bang(argc, argv, str);
7802 return str;
7803}
7804
7805
7806/*
7807 * call-seq:
7808 * str.squeeze!([other_str]*) -> str or nil
7809 *
7810 * Squeezes <i>str</i> in place, returning either <i>str</i>, or
7811 * <code>nil</code> if no changes were made.
7812 */
7813
7814static VALUE
7815rb_str_squeeze_bang(int argc, VALUE *argv, VALUE str)
7816{
7817 char squeez[TR_TABLE_SIZE];
7818 rb_encoding *enc = 0;
7819 VALUE del = 0, nodel = 0;
7820 unsigned char *s, *send, *t;
7821 int i, modify = 0;
7822 int ascompat, singlebyte = single_byte_optimizable(str);
7823 unsigned int save;
7824
7825 if (argc == 0) {
7826 enc = STR_ENC_GET(str);
7827 }
7828 else {
7829 for (i=0; i<argc; i++) {
7830 VALUE s = argv[i];
7831
7832 StringValue(s);
7833 enc = rb_enc_check(str, s);
7834 if (singlebyte && !single_byte_optimizable(s))
7835 singlebyte = 0;
7836 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
7837 }
7838 }
7839
7840 str_modify_keep_cr(str);
7841 s = t = (unsigned char *)RSTRING_PTR(str);
7842 if (!s || RSTRING_LEN(str) == 0) return Qnil;
7843 send = (unsigned char *)RSTRING_END(str);
7844 save = -1;
7845 ascompat = rb_enc_asciicompat(enc);
7846
7847 if (singlebyte) {
7848 while (s < send) {
7849 unsigned int c = *s++;
7850 if (c != save || (argc > 0 && !squeez[c])) {
7851 *t++ = save = c;
7852 }
7853 }
7854 }
7855 else {
7856 while (s < send) {
7857 unsigned int c;
7858 int clen;
7859
7860 if (ascompat && (c = *s) < 0x80) {
7861 if (c != save || (argc > 0 && !squeez[c])) {
7862 *t++ = save = c;
7863 }
7864 s++;
7865 }
7866 else {
7867 c = rb_enc_codepoint_len((char *)s, (char *)send, &clen, enc);
7868
7869 if (c != save || (argc > 0 && !tr_find(c, squeez, del, nodel))) {
7870 if (t != s) rb_enc_mbcput(c, t, enc);
7871 save = c;
7872 t += clen;
7873 }
7874 s += clen;
7875 }
7876 }
7877 }
7878
7879 TERM_FILL((char *)t, TERM_LEN(str));
7880 if ((char *)t - RSTRING_PTR(str) != RSTRING_LEN(str)) {
7881 STR_SET_LEN(str, (char *)t - RSTRING_PTR(str));
7882 modify = 1;
7883 }
7884
7885 if (modify) return str;
7886 return Qnil;
7887}
7888
7889
7890/*
7891 * call-seq:
7892 * str.squeeze([other_str]*) -> new_str
7893 *
7894 * Builds a set of characters from the <i>other_str</i> parameter(s)
7895 * using the procedure described for String#count. Returns a new
7896 * string where runs of the same character that occur in this set are
7897 * replaced by a single character. If no arguments are given, all
7898 * runs of identical characters are replaced by a single character.
7899 *
7900 * "yellow moon".squeeze #=> "yelow mon"
 *     "  now   is  the".squeeze(" ")         #=> " now is the"
7902 * "putters shoot balls".squeeze("m-z") #=> "puters shot balls"
7903 */
7904
7905static VALUE
7906rb_str_squeeze(int argc, VALUE *argv, VALUE str)
7907{
7908 str = str_duplicate(rb_cString, str);
7909 rb_str_squeeze_bang(argc, argv, str);
7910 return str;
7911}
7912
7913
7914/*
7915 * call-seq:
7916 * str.tr_s!(from_str, to_str) -> str or nil
7917 *
7918 * Performs String#tr_s processing on <i>str</i> in place,
7919 * returning <i>str</i>, or <code>nil</code> if no changes were made.
7920 */
7921
7922static VALUE
7923rb_str_tr_s_bang(VALUE str, VALUE src, VALUE repl)
7924{
7925 return tr_trans(str, src, repl, 1);
7926}
7927
7928
7929/*
7930 * call-seq:
7931 * str.tr_s(from_str, to_str) -> new_str
7932 *
7933 * Processes a copy of <i>str</i> as described under String#tr, then
7934 * removes duplicate characters in regions that were affected by the
7935 * translation.
7936 *
7937 * "hello".tr_s('l', 'r') #=> "hero"
7938 * "hello".tr_s('el', '*') #=> "h*o"
7939 * "hello".tr_s('el', 'hx') #=> "hhxo"
7940 */
7941
7942static VALUE
7943rb_str_tr_s(VALUE str, VALUE src, VALUE repl)
7944{
7945 str = str_duplicate(rb_cString, str);
7946 tr_trans(str, src, repl, 1);
7947 return str;
7948}
7949
7950
7951/*
7952 * call-seq:
7953 * str.count([other_str]+) -> integer
7954 *
7955 * Each +other_str+ parameter defines a set of characters to count. The
7956 * intersection of these sets defines the characters to count in +str+. Any
7957 * +other_str+ that starts with a caret <code>^</code> is negated. The
7958 * sequence <code>c1-c2</code> means all characters between c1 and c2. The
7959 * backslash character <code>\</code> can be used to escape <code>^</code> or
7960 * <code>-</code> and is otherwise ignored unless it appears at the end of a
7961 * sequence or the end of a +other_str+.
7962 *
7963 * a = "hello world"
7964 * a.count "lo" #=> 5
7965 * a.count "lo", "o" #=> 2
7966 * a.count "hello", "^l" #=> 4
7967 * a.count "ej-m" #=> 4
7968 *
7969 * "hello^world".count "\\^aeiou" #=> 4
7970 * "hello-world".count "a\\-eo" #=> 4
7971 *
7972 * c = "hello world\\r\\n"
7973 * c.count "\\" #=> 2
7974 * c.count "\\A" #=> 0
7975 * c.count "X-\\w" #=> 3
7976 */
7977
7978static VALUE
7979rb_str_count(int argc, VALUE *argv, VALUE str)
7980{
7981 char table[TR_TABLE_SIZE];
7982 rb_encoding *enc = 0;
7983 VALUE del = 0, nodel = 0, tstr;
7984 char *s, *send;
7985 int i;
7986 int ascompat;
7987
7989
7990 tstr = argv[0];
7991 StringValue(tstr);
7992 enc = rb_enc_check(str, tstr);
7993 if (argc == 1) {
7994 const char *ptstr;
7995 if (RSTRING_LEN(tstr) == 1 && rb_enc_asciicompat(enc) &&
7996 (ptstr = RSTRING_PTR(tstr),
7997 ONIGENC_IS_ALLOWED_REVERSE_MATCH(enc, (const unsigned char *)ptstr, (const unsigned char *)ptstr+1)) &&
7998 !is_broken_string(str)) {
7999 int n = 0;
8000 int clen;
8001 unsigned char c = rb_enc_codepoint_len(ptstr, ptstr+1, &clen, enc);
8002
8003 s = RSTRING_PTR(str);
8004 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
8005 send = RSTRING_END(str);
8006 while (s < send) {
8007 if (*(unsigned char*)s++ == c) n++;
8008 }
8009 return INT2NUM(n);
8010 }
8011 }
8012
8013 tr_setup_table(tstr, table, TRUE, &del, &nodel, enc);
8014 for (i=1; i<argc; i++) {
8015 tstr = argv[i];
8016 StringValue(tstr);
8017 enc = rb_enc_check(str, tstr);
8018 tr_setup_table(tstr, table, FALSE, &del, &nodel, enc);
8019 }
8020
8021 s = RSTRING_PTR(str);
8022 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
8023 send = RSTRING_END(str);
8024 ascompat = rb_enc_asciicompat(enc);
8025 i = 0;
8026 while (s < send) {
8027 unsigned int c;
8028
8029 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
8030 if (table[c]) {
8031 i++;
8032 }
8033 s++;
8034 }
8035 else {
8036 int clen;
8037 c = rb_enc_codepoint_len(s, send, &clen, enc);
8038 if (tr_find(c, table, del, nodel)) {
8039 i++;
8040 }
8041 s += clen;
8042 }
8043 }
8044
8045 return INT2NUM(i);
8046}
8047
8048static VALUE
8049rb_fs_check(VALUE val)
8050{
8051 if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING) && !RB_TYPE_P(val, T_REGEXP)) {
8052 val = rb_check_string_type(val);
8053 if (NIL_P(val)) return 0;
8054 }
8055 return val;
8056}
8057
/*
 * Byte-indexed whitespace lookup: 1 for the six bytes listed below, 0 for
 * every other value (unlisted elements are zero-initialized).
 */
static const char isspacetable[256] = {
    [0x09] = 1, /* \t */
    [0x0a] = 1, /* \n */
    [0x0b] = 1, /* \v */
    [0x0c] = 1, /* \f */
    [0x0d] = 1, /* \r */
    [0x20] = 1  /* space */
};

#define ascii_isspace(c) isspacetable[(unsigned char)(c)]
8078
8079static long
8080split_string(VALUE result, VALUE str, long beg, long len, long empty_count)
8081{
8082 if (empty_count >= 0 && len == 0) {
8083 return empty_count + 1;
8084 }
8085 if (empty_count > 0) {
8086 /* make different substrings */
8087 if (result) {
8088 do {
8089 rb_ary_push(result, str_new_empty_String(str));
8090 } while (--empty_count > 0);
8091 }
8092 else {
8093 do {
8094 rb_yield(str_new_empty_String(str));
8095 } while (--empty_count > 0);
8096 }
8097 }
8098 str = rb_str_subseq(str, beg, len);
8099 if (result) {
8100 rb_ary_push(result, str);
8101 }
8102 else {
8103 rb_yield(str);
8104 }
8105 return empty_count;
8106}
8107
/* Splitting strategy chosen by rb_str_split_m() for a given pattern. */
typedef enum {
    SPLIT_TYPE_AWK, SPLIT_TYPE_STRING, SPLIT_TYPE_REGEXP, SPLIT_TYPE_CHARS
} split_type_t;

8112static split_type_t
8113literal_split_pattern(VALUE spat, split_type_t default_type)
8114{
8115 rb_encoding *enc = STR_ENC_GET(spat);
8116 const char *ptr;
8117 long len;
8118 RSTRING_GETMEM(spat, ptr, len);
8119 if (len == 0) {
8120 /* Special case - split into chars */
8121 return SPLIT_TYPE_CHARS;
8122 }
8123 else if (rb_enc_asciicompat(enc)) {
8124 if (len == 1 && ptr[0] == ' ') {
8125 return SPLIT_TYPE_AWK;
8126 }
8127 }
8128 else {
8129 int l;
8130 if (rb_enc_ascget(ptr, ptr + len, &l, enc) == ' ' && len == l) {
8131 return SPLIT_TYPE_AWK;
8132 }
8133 }
8134 return default_type;
8135}
8136
8137/*
8138 * call-seq:
8139 * str.split(pattern=nil, [limit]) -> an_array
8140 * str.split(pattern=nil, [limit]) {|sub| block } -> str
8141 *
8142 * Divides <i>str</i> into substrings based on a delimiter, returning an array
8143 * of these substrings.
8144 *
8145 * If <i>pattern</i> is a String, then its contents are used as
8146 * the delimiter when splitting <i>str</i>. If <i>pattern</i> is a single
8147 * space, <i>str</i> is split on whitespace, with leading and trailing
8148 * whitespace and runs of contiguous whitespace characters ignored.
8149 *
8150 * If <i>pattern</i> is a Regexp, <i>str</i> is divided where the
8151 * pattern matches. Whenever the pattern matches a zero-length string,
8152 * <i>str</i> is split into individual characters. If <i>pattern</i> contains
8153 * groups, the respective matches will be returned in the array as well.
8154 *
8155 * If <i>pattern</i> is <code>nil</code>, the value of <code>$;</code> is used.
8156 * If <code>$;</code> is <code>nil</code> (which is the default), <i>str</i> is
8157 * split on whitespace as if ' ' were specified.
8158 *
8159 * If the <i>limit</i> parameter is omitted, trailing null fields are
8160 * suppressed. If <i>limit</i> is a positive number, at most that number
8161 * of split substrings will be returned (captured groups will be returned
8162 * as well, but are not counted towards the limit).
8163 * If <i>limit</i> is <code>1</code>, the entire
8164 * string is returned as the only entry in an array. If negative, there is no
8165 * limit to the number of fields returned, and trailing null fields are not
8166 * suppressed.
8167 *
8168 * When the input +str+ is empty an empty Array is returned as the string is
8169 * considered to have no fields to split.
8170 *
 *     " now's  the time ".split       #=> ["now's", "the", "time"]
 *     " now's  the time ".split(' ')  #=> ["now's", "the", "time"]
 *     " now's  the time".split(/ /)   #=> ["", "now's", "", "the", "time"]
8174 * "1, 2.34,56, 7".split(%r{,\s*}) #=> ["1", "2.34", "56", "7"]
8175 * "hello".split(//) #=> ["h", "e", "l", "l", "o"]
8176 * "hello".split(//, 3) #=> ["h", "e", "llo"]
8177 * "hi mom".split(%r{\s*}) #=> ["h", "i", "m", "o", "m"]
8178 *
8179 * "mellow yellow".split("ello") #=> ["m", "w y", "w"]
8180 * "1,2,,3,4,,".split(',') #=> ["1", "2", "", "3", "4"]
8181 * "1,2,,3,4,,".split(',', 4) #=> ["1", "2", "", "3,4,,"]
8182 * "1,2,,3,4,,".split(',', -4) #=> ["1", "2", "", "3", "4", "", ""]
8183 *
8184 * "1:2:3".split(/(:)()()/, 2) #=> ["1", ":", "", "", "2:3"]
8185 *
8186 * "".split(',', -1) #=> []
8187 *
8188 * If a block is given, invoke the block with each split substring.
8189 *
8190 */
8191
8192static VALUE
8193rb_str_split_m(int argc, VALUE *argv, VALUE str)
8194{
8195 rb_encoding *enc;
8196 VALUE spat;
8197 VALUE limit;
8198 split_type_t split_type;
8199 long beg, end, i = 0, empty_count = -1;
8200 int lim = 0;
8201 VALUE result, tmp;
8202
8203 result = rb_block_given_p() ? Qfalse : Qnil;
8204 if (rb_scan_args(argc, argv, "02", &spat, &limit) == 2) {
8205 lim = NUM2INT(limit);
8206 if (lim <= 0) limit = Qnil;
8207 else if (lim == 1) {
8208 if (RSTRING_LEN(str) == 0)
8209 return result ? rb_ary_new2(0) : str;
8210 tmp = str_duplicate(rb_cString, str);
8211 if (!result) {
8212 rb_yield(tmp);
8213 return str;
8214 }
8215 return rb_ary_new3(1, tmp);
8216 }
8217 i = 1;
8218 }
8219 if (NIL_P(limit) && !lim) empty_count = 0;
8220
8221 enc = STR_ENC_GET(str);
8222 split_type = SPLIT_TYPE_REGEXP;
8223 if (!NIL_P(spat)) {
8224 spat = get_pat_quoted(spat, 0);
8225 }
8226 else if (NIL_P(spat = rb_fs)) {
8227 split_type = SPLIT_TYPE_AWK;
8228 }
8229 else if (!(spat = rb_fs_check(spat))) {
8230 rb_raise(rb_eTypeError, "value of $; must be String or Regexp");
8231 }
8232 else {
8233 rb_category_warn(RB_WARN_CATEGORY_DEPRECATED, "$; is set to non-nil value");
8234 }
8235 if (split_type != SPLIT_TYPE_AWK) {
8236 switch (BUILTIN_TYPE(spat)) {
8237 case T_REGEXP:
8238 rb_reg_options(spat); /* check if uninitialized */
8239 tmp = RREGEXP_SRC(spat);
8240 split_type = literal_split_pattern(tmp, SPLIT_TYPE_REGEXP);
8241 if (split_type == SPLIT_TYPE_AWK) {
8242 spat = tmp;
8243 split_type = SPLIT_TYPE_STRING;
8244 }
8245 break;
8246
8247 case T_STRING:
8248 mustnot_broken(spat);
8249 split_type = literal_split_pattern(spat, SPLIT_TYPE_STRING);
8250 break;
8251
8252 default:
8254 }
8255 }
8256
8257#define SPLIT_STR(beg, len) (empty_count = split_string(result, str, beg, len, empty_count))
8258
8259 if (result) result = rb_ary_new();
8260 beg = 0;
8261 char *ptr = RSTRING_PTR(str);
8262 char *eptr = RSTRING_END(str);
8263 if (split_type == SPLIT_TYPE_AWK) {
8264 char *bptr = ptr;
8265 int skip = 1;
8266 unsigned int c;
8267
8268 end = beg;
8269 if (is_ascii_string(str)) {
8270 while (ptr < eptr) {
8271 c = (unsigned char)*ptr++;
8272 if (skip) {
8273 if (ascii_isspace(c)) {
8274 beg = ptr - bptr;
8275 }
8276 else {
8277 end = ptr - bptr;
8278 skip = 0;
8279 if (!NIL_P(limit) && lim <= i) break;
8280 }
8281 }
8282 else if (ascii_isspace(c)) {
8283 SPLIT_STR(beg, end-beg);
8284 skip = 1;
8285 beg = ptr - bptr;
8286 if (!NIL_P(limit)) ++i;
8287 }
8288 else {
8289 end = ptr - bptr;
8290 }
8291 }
8292 }
8293 else {
8294 while (ptr < eptr) {
8295 int n;
8296
8297 c = rb_enc_codepoint_len(ptr, eptr, &n, enc);
8298 ptr += n;
8299 if (skip) {
8300 if (rb_isspace(c)) {
8301 beg = ptr - bptr;
8302 }
8303 else {
8304 end = ptr - bptr;
8305 skip = 0;
8306 if (!NIL_P(limit) && lim <= i) break;
8307 }
8308 }
8309 else if (rb_isspace(c)) {
8310 SPLIT_STR(beg, end-beg);
8311 skip = 1;
8312 beg = ptr - bptr;
8313 if (!NIL_P(limit)) ++i;
8314 }
8315 else {
8316 end = ptr - bptr;
8317 }
8318 }
8319 }
8320 }
8321 else if (split_type == SPLIT_TYPE_STRING) {
8322 char *str_start = ptr;
8323 char *substr_start = ptr;
8324 char *sptr = RSTRING_PTR(spat);
8325 long slen = RSTRING_LEN(spat);
8326
8327 mustnot_broken(str);
8328 enc = rb_enc_check(str, spat);
8329 while (ptr < eptr &&
8330 (end = rb_memsearch(sptr, slen, ptr, eptr - ptr, enc)) >= 0) {
8331 /* Check we are at the start of a char */
8332 char *t = rb_enc_right_char_head(ptr, ptr + end, eptr, enc);
8333 if (t != ptr + end) {
8334 ptr = t;
8335 continue;
8336 }
8337 SPLIT_STR(substr_start - str_start, (ptr+end) - substr_start);
8338 ptr += end + slen;
8339 substr_start = ptr;
8340 if (!NIL_P(limit) && lim <= ++i) break;
8341 }
8342 beg = ptr - str_start;
8343 }
8344 else if (split_type == SPLIT_TYPE_CHARS) {
8345 char *str_start = ptr;
8346 int n;
8347
8348 mustnot_broken(str);
8349 enc = rb_enc_get(str);
8350 while (ptr < eptr &&
8351 (n = rb_enc_precise_mbclen(ptr, eptr, enc)) > 0) {
8352 SPLIT_STR(ptr - str_start, n);
8353 ptr += n;
8354 if (!NIL_P(limit) && lim <= ++i) break;
8355 }
8356 beg = ptr - str_start;
8357 }
8358 else {
8359 long len = RSTRING_LEN(str);
8360 long start = beg;
8361 long idx;
8362 int last_null = 0;
8363 struct re_registers *regs;
8364 VALUE match = 0;
8365
8366 for (; rb_reg_search(spat, str, start, 0) >= 0;
8367 (match ? (rb_match_unbusy(match), rb_backref_set(match)) : (void)0)) {
8369 if (!result) rb_match_busy(match);
8370 regs = RMATCH_REGS(match);
8371 end = BEG(0);
8372 if (start == end && BEG(0) == END(0)) {
8373 if (!ptr) {
8374 SPLIT_STR(0, 0);
8375 break;
8376 }
8377 else if (last_null == 1) {
8378 SPLIT_STR(beg, rb_enc_fast_mbclen(ptr+beg, eptr, enc));
8379 beg = start;
8380 }
8381 else {
8382 if (start == len)
8383 start++;
8384 else
8385 start += rb_enc_fast_mbclen(ptr+start,eptr,enc);
8386 last_null = 1;
8387 continue;
8388 }
8389 }
8390 else {
8391 SPLIT_STR(beg, end-beg);
8392 beg = start = END(0);
8393 }
8394 last_null = 0;
8395
8396 for (idx=1; idx < regs->num_regs; idx++) {
8397 if (BEG(idx) == -1) continue;
8398 SPLIT_STR(BEG(idx), END(idx)-BEG(idx));
8399 }
8400 if (!NIL_P(limit) && lim <= ++i) break;
8401 }
8403 }
8404 if (RSTRING_LEN(str) > 0 && (!NIL_P(limit) || RSTRING_LEN(str) > beg || lim < 0)) {
8406 }
8407
8408 return result ? result : str;
8409}
8410
8411VALUE
8412rb_str_split(VALUE str, const char *sep0)
8413{
8414 VALUE sep;
8415
8417 sep = rb_str_new_cstr(sep0);
8418 return rb_str_split_m(1, &sep, str);
8419}
8420
/* Yields 0 when a block is given, otherwise a new array with capacity +size+. */
#define WANTARRAY(m, size) (rb_block_given_p() ? 0 : rb_ary_new_capa(size))
8422
8423static inline int
8424enumerator_element(VALUE ary, VALUE e)
8425{
8426 if (ary) {
8427 rb_ary_push(ary, e);
8428 return 0;
8429 }
8430 else {
8431 rb_yield(e);
8432 return 1;
8433 }
8434}
8435
8436#define ENUM_ELEM(ary, e) enumerator_element(ary, e)
8437
8438static const char *
8439chomp_newline(const char *p, const char *e, rb_encoding *enc)
8440{
8441 const char *prev = rb_enc_prev_char(p, e, e, enc);
8442 if (rb_enc_is_newline(prev, e, enc)) {
8443 e = prev;
8444 prev = rb_enc_prev_char(p, e, e, enc);
8445 if (prev && rb_enc_ascget(prev, e, NULL, enc) == '\r')
8446 e = prev;
8447 }
8448 return e;
8449}
8450
8451static VALUE
8452get_rs(void)
8453{
8454 VALUE rs = rb_rs;
8455 if (!NIL_P(rs) &&
8456 (!RB_TYPE_P(rs, T_STRING) ||
8457 RSTRING_LEN(rs) != 1 ||
8458 RSTRING_PTR(rs)[0] != '\n')) {
8459 rb_category_warn(RB_WARN_CATEGORY_DEPRECATED, "$/ is set to non-default value");
8460 }
8461 return rs;
8462}
8463
8464#define rb_rs get_rs()
8465
8466static VALUE
8467rb_str_enumerate_lines(int argc, VALUE *argv, VALUE str, VALUE ary)
8468{
8469 rb_encoding *enc;
8470 VALUE line, rs, orig = str, opts = Qnil, chomp = Qfalse;
8471 const char *ptr, *pend, *subptr, *subend, *rsptr, *hit, *adjusted;
8472 long pos, len, rslen;
8473 int rsnewline = 0;
8474
8475 if (rb_scan_args(argc, argv, "01:", &rs, &opts) == 0)
8476 rs = rb_rs;
8477 if (!NIL_P(opts)) {
8478 static ID keywords[1];
8479 if (!keywords[0]) {
8480 keywords[0] = rb_intern_const("chomp");
8481 }
8482 rb_get_kwargs(opts, keywords, 0, 1, &chomp);
8483 chomp = (chomp != Qundef && RTEST(chomp));
8484 }
8485
8486 if (NIL_P(rs)) {
8487 if (!ENUM_ELEM(ary, str)) {
8488 return ary;
8489 }
8490 else {
8491 return orig;
8492 }
8493 }
8494
8495 if (!RSTRING_LEN(str)) goto end;
8497 ptr = subptr = RSTRING_PTR(str);
8498 pend = RSTRING_END(str);
8499 len = RSTRING_LEN(str);
8500 StringValue(rs);
8501 rslen = RSTRING_LEN(rs);
8502
8503 if (rs == rb_default_rs)
8504 enc = rb_enc_get(str);
8505 else
8506 enc = rb_enc_check(str, rs);
8507
8508 if (rslen == 0) {
8509 /* paragraph mode */
8510 int n;
8511 const char *eol = NULL;
8512 subend = subptr;
8513 while (subend < pend) {
8514 do {
8515 if (rb_enc_ascget(subend, pend, &n, enc) != '\r')
8516 n = 0;
8517 rslen = n + rb_enc_mbclen(subend + n, pend, enc);
8518 if (rb_enc_is_newline(subend + n, pend, enc)) {
8519 if (eol == subend) break;
8520 subend += rslen;
8521 if (subptr) eol = subend;
8522 }
8523 else {
8524 if (!subptr) subptr = subend;
8525 subend += rslen;
8526 }
8527 rslen = 0;
8528 } while (subend < pend);
8529 if (!subptr) break;
8530 line = rb_str_subseq(str, subptr - ptr,
8531 subend - subptr + (chomp ? 0 : rslen));
8532 if (ENUM_ELEM(ary, line)) {
8533 str_mod_check(str, ptr, len);
8534 }
8535 subptr = eol = NULL;
8536 }
8537 goto end;
8538 }
8539 else {
8540 rsptr = RSTRING_PTR(rs);
8541 if (RSTRING_LEN(rs) == rb_enc_mbminlen(enc) &&
8542 rb_enc_is_newline(rsptr, rsptr + RSTRING_LEN(rs), enc)) {
8543 rsnewline = 1;
8544 }
8545 }
8546
8547 if ((rs == rb_default_rs) && !rb_enc_asciicompat(enc)) {
8548 rs = rb_str_new(rsptr, rslen);
8549 rs = rb_str_encode(rs, rb_enc_from_encoding(enc), 0, Qnil);
8550 rsptr = RSTRING_PTR(rs);
8551 rslen = RSTRING_LEN(rs);
8552 }
8553
8554 while (subptr < pend) {
8555 pos = rb_memsearch(rsptr, rslen, subptr, pend - subptr, enc);
8556 if (pos < 0) break;
8557 hit = subptr + pos;
8558 adjusted = rb_enc_right_char_head(subptr, hit, pend, enc);
8559 if (hit != adjusted) {
8560 subptr = adjusted;
8561 continue;
8562 }
8563 subend = hit += rslen;
8564 if (chomp) {
8565 if (rsnewline) {
8566 subend = chomp_newline(subptr, subend, enc);
8567 }
8568 else {
8569 subend -= rslen;
8570 }
8571 }
8572 line = rb_str_subseq(str, subptr - ptr, subend - subptr);
8573 if (ENUM_ELEM(ary, line)) {
8574 str_mod_check(str, ptr, len);
8575 }
8576 subptr = hit;
8577 }
8578
8579 if (subptr != pend) {
8580 if (chomp) {
8581 if (rsnewline) {
8582 pend = chomp_newline(subptr, pend, enc);
8583 }
8584 else if (pend - subptr >= rslen &&
8585 memcmp(pend - rslen, rsptr, rslen) == 0) {
8586 pend -= rslen;
8587 }
8588 }
8589 line = rb_str_subseq(str, subptr - ptr, pend - subptr);
8590 ENUM_ELEM(ary, line);
8592 }
8593
8594 end:
8595 if (ary)
8596 return ary;
8597 else
8598 return orig;
8599}
8600
8601/*
8602 * call-seq:
8603 * str.each_line(separator=$/, chomp: false) {|substr| block } -> str
8604 * str.each_line(separator=$/, chomp: false) -> an_enumerator
8605 *
8606 * Splits <i>str</i> using the supplied parameter as the record
8607 * separator (<code>$/</code> by default), passing each substring in
8608 * turn to the supplied block. If a zero-length record separator is
8609 * supplied, the string is split into paragraphs delimited by
8610 * multiple successive newlines.
8611 *
8612 * If +chomp+ is +true+, +separator+ will be removed from the end of each
8613 * line.
8614 *
8615 * If no block is given, an enumerator is returned instead.
8616 *
8617 * "hello\nworld".each_line {|s| p s}
8618 * # prints:
8619 * # "hello\n"
8620 * # "world"
8621 *
8622 * "hello\nworld".each_line('l') {|s| p s}
8623 * # prints:
8624 * # "hel"
8625 * # "l"
8626 * # "o\nworl"
8627 * # "d"
8628 *
8629 * "hello\n\n\nworld".each_line('') {|s| p s}
8630 * # prints
8631 * # "hello\n\n"
8632 * # "world"
8633 *
8634 * "hello\nworld".each_line(chomp: true) {|s| p s}
8635 * # prints:
8636 * # "hello"
8637 * # "world"
8638 *
8639 * "hello\nworld".each_line('l', chomp: true) {|s| p s}
8640 * # prints:
8641 * # "he"
8642 * # ""
8643 * # "o\nwor"
8644 * # "d"
8645 *
8646 */
8647
8648static VALUE
8649rb_str_each_line(int argc, VALUE *argv, VALUE str)
8650{
8652 return rb_str_enumerate_lines(argc, argv, str, 0);
8653}
8654
8655/*
8656 * call-seq:
8657 * str.lines(separator=$/, chomp: false) -> an_array
8658 *
8659 * Returns an array of lines in <i>str</i> split using the supplied
8660 * record separator (<code>$/</code> by default). This is a
8661 * shorthand for <code>str.each_line(separator, getline_args).to_a</code>.
8662 *
8663 * If +chomp+ is +true+, +separator+ will be removed from the end of each
8664 * line.
8665 *
8666 * "hello\nworld\n".lines #=> ["hello\n", "world\n"]
 *     "hello  world".lines(' ')               #=> ["hello ", " ", "world"]
8668 * "hello\nworld\n".lines(chomp: true) #=> ["hello", "world"]
8669 *
8670 * If a block is given, which is a deprecated form, works the same as
8671 * <code>each_line</code>.
8672 */
8673
8674static VALUE
8675rb_str_lines(int argc, VALUE *argv, VALUE str)
8676{
8677 VALUE ary = WANTARRAY("lines", 0);
8678 return rb_str_enumerate_lines(argc, argv, str, ary);
8679}
8680
8681static VALUE
8682rb_str_each_byte_size(VALUE str, VALUE args, VALUE eobj)
8683{
8684 return LONG2FIX(RSTRING_LEN(str));
8685}
8686
8687static VALUE
8688rb_str_enumerate_bytes(VALUE str, VALUE ary)
8689{
8690 long i;
8691
8692 for (i=0; i<RSTRING_LEN(str); i++) {
8693 ENUM_ELEM(ary, INT2FIX((unsigned char)RSTRING_PTR(str)[i]));
8694 }
8695 if (ary)
8696 return ary;
8697 else
8698 return str;
8699}
8700
8701/*
8702 * call-seq:
8703 * str.each_byte {|integer| block } -> str
8704 * str.each_byte -> an_enumerator
8705 *
8706 * Passes each byte in <i>str</i> to the given block, or returns an
8707 * enumerator if no block is given.
8708 *
8709 * "hello".each_byte {|c| print c, ' ' }
8710 *
8711 * <em>produces:</em>
8712 *
8713 * 104 101 108 108 111
8714 */
8715
8716static VALUE
8717rb_str_each_byte(VALUE str)
8718{
8719 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_byte_size);
8720 return rb_str_enumerate_bytes(str, 0);
8721}
8722
8723/*
8724 * call-seq:
8725 * str.bytes -> an_array
8726 *
8727 * Returns an array of bytes in <i>str</i>. This is a shorthand for
8728 * <code>str.each_byte.to_a</code>.
8729 *
8730 * If a block is given, which is a deprecated form, works the same as
8731 * <code>each_byte</code>.
8732 */
8733
8734static VALUE
8735rb_str_bytes(VALUE str)
8736{
8737 VALUE ary = WANTARRAY("bytes", RSTRING_LEN(str));
8738 return rb_str_enumerate_bytes(str, ary);
8739}
8740
8741static VALUE
8742rb_str_each_char_size(VALUE str, VALUE args, VALUE eobj)
8743{
8744 return rb_str_length(str);
8745}
8746
8747static VALUE
8748rb_str_enumerate_chars(VALUE str, VALUE ary)
8749{
8750 VALUE orig = str;
8751 long i, len, n;
8752 const char *ptr;
8753 rb_encoding *enc;
8754
8756 ptr = RSTRING_PTR(str);
8757 len = RSTRING_LEN(str);
8758 enc = rb_enc_get(str);
8759
8761 for (i = 0; i < len; i += n) {
8762 n = rb_enc_fast_mbclen(ptr + i, ptr + len, enc);
8763 ENUM_ELEM(ary, rb_str_subseq(str, i, n));
8764 }
8765 }
8766 else {
8767 for (i = 0; i < len; i += n) {
8768 n = rb_enc_mbclen(ptr + i, ptr + len, enc);
8769 ENUM_ELEM(ary, rb_str_subseq(str, i, n));
8770 }
8771 }
8773 if (ary)
8774 return ary;
8775 else
8776 return orig;
8777}
8778
8779/*
8780 * call-seq:
8781 * str.each_char {|cstr| block } -> str
8782 * str.each_char -> an_enumerator
8783 *
8784 * Passes each character in <i>str</i> to the given block, or returns
8785 * an enumerator if no block is given.
8786 *
8787 * "hello".each_char {|c| print c, ' ' }
8788 *
8789 * <em>produces:</em>
8790 *
8791 * h e l l o
8792 */
8793
8794static VALUE
8795rb_str_each_char(VALUE str)
8796{
8797 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
8798 return rb_str_enumerate_chars(str, 0);
8799}
8800
8801/*
8802 * call-seq:
8803 * str.chars -> an_array
8804 *
8805 * Returns an array of characters in <i>str</i>. This is a shorthand
8806 * for <code>str.each_char.to_a</code>.
8807 *
8808 * If a block is given, which is a deprecated form, works the same as
8809 * <code>each_char</code>.
8810 */
8811
8812static VALUE
8813rb_str_chars(VALUE str)
8814{
8815 VALUE ary = WANTARRAY("chars", rb_str_strlen(str));
8816 return rb_str_enumerate_chars(str, ary);
8817}
8818
8819static VALUE
8820rb_str_enumerate_codepoints(VALUE str, VALUE ary)
8821{
8822 VALUE orig = str;
8823 int n;
8824 unsigned int c;
8825 const char *ptr, *end;
8826 rb_encoding *enc;
8827
8828 if (single_byte_optimizable(str))
8829 return rb_str_enumerate_bytes(str, ary);
8830
8832 ptr = RSTRING_PTR(str);
8833 end = RSTRING_END(str);
8834 enc = STR_ENC_GET(str);
8835
8836 while (ptr < end) {
8837 c = rb_enc_codepoint_len(ptr, end, &n, enc);
8838 ENUM_ELEM(ary, UINT2NUM(c));
8839 ptr += n;
8840 }
8842 if (ary)
8843 return ary;
8844 else
8845 return orig;
8846}
8847
8848/*
8849 * call-seq:
8850 * str.each_codepoint {|integer| block } -> str
8851 * str.each_codepoint -> an_enumerator
8852 *
8853 * Passes the Integer ordinal of each character in <i>str</i>,
8854 * also known as a <i>codepoint</i> when applied to Unicode strings to the
8855 * given block. For encodings other than UTF-8/UTF-16(BE|LE)/UTF-32(BE|LE),
8856 * values are directly derived from the binary representation
8857 * of each character.
8858 *
8859 * If no block is given, an enumerator is returned instead.
8860 *
8861 * "hello\u0639".each_codepoint {|c| print c, ' ' }
8862 *
8863 * <em>produces:</em>
8864 *
8865 * 104 101 108 108 111 1593
8866 */
8867
8868static VALUE
8869rb_str_each_codepoint(VALUE str)
8870{
8871 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
8872 return rb_str_enumerate_codepoints(str, 0);
8873}
8874
8875/*
8876 * call-seq:
8877 * str.codepoints -> an_array
8878 *
8879 * Returns an array of the Integer ordinals of the
8880 * characters in <i>str</i>. This is a shorthand for
8881 * <code>str.each_codepoint.to_a</code>.
8882 *
8883 * If a block is given, which is a deprecated form, works the same as
8884 * <code>each_codepoint</code>.
8885 */
8886
8887static VALUE
8888rb_str_codepoints(VALUE str)
8889{
8890 VALUE ary = WANTARRAY("codepoints", rb_str_strlen(str));
8891 return rb_str_enumerate_codepoints(str, ary);
8892}
8893
8894static regex_t *
8895get_reg_grapheme_cluster(rb_encoding *enc)
8896{
8897 int encidx = rb_enc_to_index(enc);
8898 regex_t *reg_grapheme_cluster = NULL;
8899 static regex_t *reg_grapheme_cluster_utf8 = NULL;
8900
8901 /* synchronize */
8902 if (encidx == rb_utf8_encindex() && reg_grapheme_cluster_utf8) {
8903 reg_grapheme_cluster = reg_grapheme_cluster_utf8;
8904 }
8905 if (!reg_grapheme_cluster) {
8906 const OnigUChar source_ascii[] = "\\X";
8907 OnigErrorInfo einfo;
8908 const OnigUChar *source = source_ascii;
8909 size_t source_len = sizeof(source_ascii) - 1;
8910 switch (encidx) {
8911#define CHARS_16BE(x) (OnigUChar)((x)>>8), (OnigUChar)(x)
8912#define CHARS_16LE(x) (OnigUChar)(x), (OnigUChar)((x)>>8)
8913#define CHARS_32BE(x) CHARS_16BE((x)>>16), CHARS_16BE(x)
8914#define CHARS_32LE(x) CHARS_16LE(x), CHARS_16LE((x)>>16)
8915#define CASE_UTF(e) \
8916 case ENCINDEX_UTF_##e: { \
8917 static const OnigUChar source_UTF_##e[] = {CHARS_##e('\\'), CHARS_##e('X')}; \
8918 source = source_UTF_##e; \
8919 source_len = sizeof(source_UTF_##e); \
8920 break; \
8921 }
8922 CASE_UTF(16BE); CASE_UTF(16LE); CASE_UTF(32BE); CASE_UTF(32LE);
8923#undef CASE_UTF
8924#undef CHARS_16BE
8925#undef CHARS_16LE
8926#undef CHARS_32BE
8927#undef CHARS_32LE
8928 }
8929 int r = onig_new(&reg_grapheme_cluster, source, source + source_len,
8931 if (r) {
8933 onig_error_code_to_str(message, r, &einfo);
8934 rb_fatal("cannot compile grapheme cluster regexp: %s", (char *)message);
8935 }
8936 if (encidx == rb_utf8_encindex()) {
8937 reg_grapheme_cluster_utf8 = reg_grapheme_cluster;
8938 }
8939 }
8940 return reg_grapheme_cluster;
8941}
8942
8943static VALUE
8944rb_str_each_grapheme_cluster_size(VALUE str, VALUE args, VALUE eobj)
8945{
8946 size_t grapheme_cluster_count = 0;
8947 regex_t *reg_grapheme_cluster = NULL;
8949 const char *ptr, *end;
8950
8951 if (!rb_enc_unicode_p(enc)) {
8952 return rb_str_length(str);
8953 }
8954
8955 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
8956 ptr = RSTRING_PTR(str);
8957 end = RSTRING_END(str);
8958
8959 while (ptr < end) {
8960 OnigPosition len = onig_match(reg_grapheme_cluster,
8961 (const OnigUChar *)ptr, (const OnigUChar *)end,
8962 (const OnigUChar *)ptr, NULL, 0);
8963 if (len <= 0) break;
8964 grapheme_cluster_count++;
8965 ptr += len;
8966 }
8967
8968 return SIZET2NUM(grapheme_cluster_count);
8969}
8970
8971static VALUE
8972rb_str_enumerate_grapheme_clusters(VALUE str, VALUE ary)
8973{
8974 VALUE orig = str;
8975 regex_t *reg_grapheme_cluster = NULL;
8977 const char *ptr0, *ptr, *end;
8978
8979 if (!rb_enc_unicode_p(enc)) {
8980 return rb_str_enumerate_chars(str, ary);
8981 }
8982
8983 if (!ary) str = rb_str_new_frozen(str);
8984 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
8985 ptr0 = ptr = RSTRING_PTR(str);
8986 end = RSTRING_END(str);
8987
8988 while (ptr < end) {
8989 OnigPosition len = onig_match(reg_grapheme_cluster,
8990 (const OnigUChar *)ptr, (const OnigUChar *)end,
8991 (const OnigUChar *)ptr, NULL, 0);
8992 if (len <= 0) break;
8993 ENUM_ELEM(ary, rb_str_subseq(str, ptr-ptr0, len));
8994 ptr += len;
8995 }
8997 if (ary)
8998 return ary;
8999 else
9000 return orig;
9001}
9002
9003/*
9004 * call-seq:
9005 * str.each_grapheme_cluster {|cstr| block } -> str
9006 * str.each_grapheme_cluster -> an_enumerator
9007 *
9008 * Passes each grapheme cluster in <i>str</i> to the given block, or returns
9009 * an enumerator if no block is given.
9010 * Unlike String#each_char, this enumerates by grapheme clusters defined by
9011 * Unicode Standard Annex #29 http://unicode.org/reports/tr29/
9012 *
9013 * "a\u0300".each_char.to_a.size #=> 2
9014 * "a\u0300".each_grapheme_cluster.to_a.size #=> 1
9015 *
9016 */
9017
9018static VALUE
9019rb_str_each_grapheme_cluster(VALUE str)
9020{
9021 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_grapheme_cluster_size);
9022 return rb_str_enumerate_grapheme_clusters(str, 0);
9023}
9024
9025/*
9026 * call-seq:
9027 * str.grapheme_clusters -> an_array
9028 *
9029 * Returns an array of grapheme clusters in <i>str</i>. This is a shorthand
9030 * for <code>str.each_grapheme_cluster.to_a</code>.
9031 *
9032 * If a block is given, which is a deprecated form, works the same as
9033 * <code>each_grapheme_cluster</code>.
9034 */
9035
9036static VALUE
9037rb_str_grapheme_clusters(VALUE str)
9038{
9039 VALUE ary = WANTARRAY("grapheme_clusters", rb_str_strlen(str));
9040 return rb_str_enumerate_grapheme_clusters(str, ary);
9041}
9042
9043static long
9044chopped_length(VALUE str)
9045{
9046 rb_encoding *enc = STR_ENC_GET(str);
9047 const char *p, *p2, *beg, *end;
9048
9049 beg = RSTRING_PTR(str);
9050 end = beg + RSTRING_LEN(str);
9051 if (beg >= end) return 0;
9052 p = rb_enc_prev_char(beg, end, end, enc);
9053 if (!p) return 0;
9054 if (p > beg && rb_enc_ascget(p, end, 0, enc) == '\n') {
9055 p2 = rb_enc_prev_char(beg, p, end, enc);
9056 if (p2 && rb_enc_ascget(p2, end, 0, enc) == '\r') p = p2;
9057 }
9058 return p - beg;
9059}
9060
9061/*
9062 * call-seq:
9063 * str.chop! -> str or nil
9064 *
9065 * Processes <i>str</i> as for String#chop, returning <i>str</i>, or
9066 * <code>nil</code> if <i>str</i> is the empty string. See also
9067 * String#chomp!.
9068 */
9069
9070static VALUE
9071rb_str_chop_bang(VALUE str)
9072{
9073 str_modify_keep_cr(str);
9074 if (RSTRING_LEN(str) > 0) {
9075 long len;
9076 len = chopped_length(str);
9081 }
9082 return str;
9083 }
9084 return Qnil;
9085}
9086
9087
9088/*
9089 * call-seq:
9090 * str.chop -> new_str
9091 *
9092 * Returns a new String with the last character removed. If the
9093 * string ends with <code>\r\n</code>, both characters are
9094 * removed. Applying <code>chop</code> to an empty string returns an
9095 * empty string. String#chomp is often a safer alternative, as it
9096 * leaves the string unchanged if it doesn't end in a record
9097 * separator.
9098 *
9099 * "string\r\n".chop #=> "string"
9100 * "string\n\r".chop #=> "string\n"
9101 * "string\n".chop #=> "string"
9102 * "string".chop #=> "strin"
9103 * "x".chop.chop #=> ""
9104 */
9105
9106static VALUE
9107rb_str_chop(VALUE str)
9108{
9109 return rb_str_subseq(str, 0, chopped_length(str));
9110}
9111
9112static long
9113smart_chomp(VALUE str, const char *e, const char *p)
9114{
9115 rb_encoding *enc = rb_enc_get(str);
9116 if (rb_enc_mbminlen(enc) > 1) {
9117 const char *pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
9118 if (rb_enc_is_newline(pp, e, enc)) {
9119 e = pp;
9120 }
9121 pp = e - rb_enc_mbminlen(enc);
9122 if (pp >= p) {
9123 pp = rb_enc_left_char_head(p, pp, e, enc);
9124 if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
9125 e = pp;
9126 }
9127 }
9128 }
9129 else {
9130 switch (*(e-1)) { /* not e[-1] to get rid of VC bug */
9131 case '\n':
9132 if (--e > p && *(e-1) == '\r') {
9133 --e;
9134 }
9135 break;
9136 case '\r':
9137 --e;
9138 break;
9139 }
9140 }
9141 return e - p;
9142}
9143
9144static long
9145chompped_length(VALUE str, VALUE rs)
9146{
9147 rb_encoding *enc;
9148 int newline;
9149 char *pp, *e, *rsptr;
9150 long rslen;
9151 char *const p = RSTRING_PTR(str);
9152 long len = RSTRING_LEN(str);
9153
9154 if (len == 0) return 0;
9155 e = p + len;
9156 if (rs == rb_default_rs) {
9157 return smart_chomp(str, e, p);
9158 }
9159
9160 enc = rb_enc_get(str);
9161 RSTRING_GETMEM(rs, rsptr, rslen);
9162 if (rslen == 0) {
9163 if (rb_enc_mbminlen(enc) > 1) {
9164 while (e > p) {
9165 pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
9166 if (!rb_enc_is_newline(pp, e, enc)) break;
9167 e = pp;
9168 pp -= rb_enc_mbminlen(enc);
9169 if (pp >= p) {
9170 pp = rb_enc_left_char_head(p, pp, e, enc);
9171 if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
9172 e = pp;
9173 }
9174 }
9175 }
9176 }
9177 else {
9178 while (e > p && *(e-1) == '\n') {
9179 --e;
9180 if (e > p && *(e-1) == '\r')
9181 --e;
9182 }
9183 }
9184 return e - p;
9185 }
9186 if (rslen > len) return len;
9187
9188 enc = rb_enc_get(rs);
9189 newline = rsptr[rslen-1];
9190 if (rslen == rb_enc_mbminlen(enc)) {
9191 if (rslen == 1) {
9192 if (newline == '\n')
9193 return smart_chomp(str, e, p);
9194 }
9195 else {
9196 if (rb_enc_is_newline(rsptr, rsptr+rslen, enc))
9197 return smart_chomp(str, e, p);
9198 }
9199 }
9200
9201 enc = rb_enc_check(str, rs);
9202 if (is_broken_string(rs)) {
9203 return len;
9204 }
9205 pp = e - rslen;
9206 if (p[len-1] == newline &&
9207 (rslen <= 1 ||
9208 memcmp(rsptr, pp, rslen) == 0)) {
9209 if (rb_enc_left_char_head(p, pp, e, enc) == pp)
9210 return len - rslen;
9211 RB_GC_GUARD(rs);
9212 }
9213 return len;
9214}
9215
9221static VALUE
9222chomp_rs(int argc, const VALUE *argv)
9223{
9224 rb_check_arity(argc, 0, 1);
9225 if (argc > 0) {
9226 VALUE rs = argv[0];
9227 if (!NIL_P(rs)) StringValue(rs);
9228 return rs;
9229 }
9230 else {
9231 return rb_rs;
9232 }
9233}
9234
9235VALUE
9237{
9238 long olen = RSTRING_LEN(str);
9239 long len = chompped_length(str, rs);
9240 if (len >= olen) return Qnil;
9241 str_modify_keep_cr(str);
9246 }
9247 return str;
9248}
9249
9250/*
9251 * call-seq:
9252 * str.chomp!(separator=$/) -> str or nil
9253 *
9254 * Modifies <i>str</i> in place as described for String#chomp,
9255 * returning <i>str</i>, or <code>nil</code> if no modifications were
9256 * made.
9257 */
9258
9259static VALUE
9260rb_str_chomp_bang(int argc, VALUE *argv, VALUE str)
9261{
9262 VALUE rs;
9263 str_modifiable(str);
9264 if (RSTRING_LEN(str) == 0) return Qnil;
9265 rs = chomp_rs(argc, argv);
9266 if (NIL_P(rs)) return Qnil;
9267 return rb_str_chomp_string(str, rs);
9268}
9269
9270
9271/*
9272 * call-seq:
9273 * str.chomp(separator=$/) -> new_str
9274 *
9275 * Returns a new String with the given record separator removed
9276 * from the end of <i>str</i> (if present). If <code>$/</code> has not been
9277 * changed from the default Ruby record separator, then <code>chomp</code> also
9278 * removes carriage return characters (that is it will remove <code>\n</code>,
9279 * <code>\r</code>, and <code>\r\n</code>). If <code>$/</code> is an empty string,
9280 * it will remove all trailing newlines from the string.
9281 *
9282 * "hello".chomp #=> "hello"
9283 * "hello\n".chomp #=> "hello"
9284 * "hello\r\n".chomp #=> "hello"
9285 * "hello\n\r".chomp #=> "hello\n"
9286 * "hello\r".chomp #=> "hello"
9287 * "hello \n there".chomp #=> "hello \n there"
9288 * "hello".chomp("llo") #=> "he"
9289 * "hello\r\n\r\n".chomp('') #=> "hello"
9290 * "hello\r\n\r\r\n".chomp('') #=> "hello\r\n\r"
9291 */
9292
9293static VALUE
9294rb_str_chomp(int argc, VALUE *argv, VALUE str)
9295{
9296 VALUE rs = chomp_rs(argc, argv);
9297 if (NIL_P(rs)) return str_duplicate(rb_cString, str);
9298 return rb_str_subseq(str, 0, chompped_length(str, rs));
9299}
9300
9301static long
9302lstrip_offset(VALUE str, const char *s, const char *e, rb_encoding *enc)
9303{
9304 const char *const start = s;
9305
9306 if (!s || s >= e) return 0;
9307
9308 /* remove spaces at head */
9309 if (single_byte_optimizable(str)) {
9310 while (s < e && (*s == '\0' || ascii_isspace(*s))) s++;
9311 }
9312 else {
9313 while (s < e) {
9314 int n;
9315 unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc);
9316
9317 if (cc && !rb_isspace(cc)) break;
9318 s += n;
9319 }
9320 }
9321 return s - start;
9322}
9323
9324/*
9325 * call-seq:
9326 * str.lstrip! -> self or nil
9327 *
9328 * Removes leading whitespace from the receiver.
9329 * Returns the altered receiver, or +nil+ if no change was made.
9330 * See also String#rstrip! and String#strip!.
9331 *
9332 * Refer to String#strip for the definition of whitespace.
9333 *
9334 * " hello ".lstrip! #=> "hello "
9335 * "hello ".lstrip! #=> nil
9336 * "hello".lstrip! #=> nil
9337 */
9338
9339static VALUE
9340rb_str_lstrip_bang(VALUE str)
9341{
9342 rb_encoding *enc;
9343 char *start, *s;
9344 long olen, loffset;
9345
9346 str_modify_keep_cr(str);
9347 enc = STR_ENC_GET(str);
9348 RSTRING_GETMEM(str, start, olen);
9349 loffset = lstrip_offset(str, start, start+olen, enc);
9350 if (loffset > 0) {
9351 long len = olen-loffset;
9352 s = start + loffset;
9353 memmove(start, s, len);
9355#if !SHARABLE_MIDDLE_SUBSTRING
9356 TERM_FILL(start+len, rb_enc_mbminlen(enc));
9357#endif
9358 return str;
9359 }
9360 return Qnil;
9361}
9362
9363
9364/*
9365 * call-seq:
9366 * str.lstrip -> new_str
9367 *
9368 * Returns a copy of the receiver with leading whitespace removed.
9369 * See also String#rstrip and String#strip.
9370 *
9371 * Refer to String#strip for the definition of whitespace.
9372 *
9373 * " hello ".lstrip #=> "hello "
9374 * "hello".lstrip #=> "hello"
9375 */
9376
9377static VALUE
9378rb_str_lstrip(VALUE str)
9379{
9380 char *start;
9381 long len, loffset;
9382 RSTRING_GETMEM(str, start, len);
9383 loffset = lstrip_offset(str, start, start+len, STR_ENC_GET(str));
9384 if (loffset <= 0) return str_duplicate(rb_cString, str);
9385 return rb_str_subseq(str, loffset, len - loffset);
9386}
9387
9388static long
9389rstrip_offset(VALUE str, const char *s, const char *e, rb_encoding *enc)
9390{
9391 const char *t;
9392
9393 rb_str_check_dummy_enc(enc);
9394 if (!s || s >= e) return 0;
9395 t = e;
9396
9397 /* remove trailing spaces or '\0's */
9398 if (single_byte_optimizable(str)) {
9399 unsigned char c;
9400 while (s < t && ((c = *(t-1)) == '\0' || ascii_isspace(c))) t--;
9401 }
9402 else {
9403 char *tp;
9404
9405 while ((tp = rb_enc_prev_char(s, t, e, enc)) != NULL) {
9406 unsigned int c = rb_enc_codepoint(tp, e, enc);
9407 if (c && !rb_isspace(c)) break;
9408 t = tp;
9409 }
9410 }
9411 return e - t;
9412}
9413
9414/*
9415 * call-seq:
9416 * str.rstrip! -> self or nil
9417 *
9418 * Removes trailing whitespace from the receiver.
9419 * Returns the altered receiver, or +nil+ if no change was made.
9420 * See also String#lstrip! and String#strip!.
9421 *
9422 * Refer to String#strip for the definition of whitespace.
9423 *
9424 * " hello ".rstrip! #=> " hello"
9425 * " hello".rstrip! #=> nil
9426 * "hello".rstrip! #=> nil
9427 */
9428
9429static VALUE
9430rb_str_rstrip_bang(VALUE str)
9431{
9432 rb_encoding *enc;
9433 char *start;
9434 long olen, roffset;
9435
9436 str_modify_keep_cr(str);
9437 enc = STR_ENC_GET(str);
9438 RSTRING_GETMEM(str, start, olen);
9439 roffset = rstrip_offset(str, start, start+olen, enc);
9440 if (roffset > 0) {
9441 long len = olen - roffset;
9442
9444#if !SHARABLE_MIDDLE_SUBSTRING
9445 TERM_FILL(start+len, rb_enc_mbminlen(enc));
9446#endif
9447 return str;
9448 }
9449 return Qnil;
9450}
9451
9452
9453/*
9454 * call-seq:
9455 * str.rstrip -> new_str
9456 *
9457 * Returns a copy of the receiver with trailing whitespace removed.
9458 * See also String#lstrip and String#strip.
9459 *
9460 * Refer to String#strip for the definition of whitespace.
9461 *
9462 * " hello ".rstrip #=> " hello"
9463 * "hello".rstrip #=> "hello"
9464 */
9465
9466static VALUE
9467rb_str_rstrip(VALUE str)
9468{
9469 rb_encoding *enc;
9470 char *start;
9471 long olen, roffset;
9472
9473 enc = STR_ENC_GET(str);
9474 RSTRING_GETMEM(str, start, olen);
9475 roffset = rstrip_offset(str, start, start+olen, enc);
9476
9477 if (roffset <= 0) return str_duplicate(rb_cString, str);
9478 return rb_str_subseq(str, 0, olen-roffset);
9479}
9480
9481
9482/*
9483 * call-seq:
9484 * str.strip! -> self or nil
9485 *
9486 * Removes leading and trailing whitespace from the receiver.
9487 * Returns the altered receiver, or +nil+ if there was no change.
9488 *
9489 * Refer to String#strip for the definition of whitespace.
9490 *
9491 * " hello ".strip! #=> "hello"
9492 * "hello".strip! #=> nil
9493 */
9494
9495static VALUE
9496rb_str_strip_bang(VALUE str)
9497{
9498 char *start;
9499 long olen, loffset, roffset;
9500 rb_encoding *enc;
9501
9502 str_modify_keep_cr(str);
9503 enc = STR_ENC_GET(str);
9504 RSTRING_GETMEM(str, start, olen);
9505 loffset = lstrip_offset(str, start, start+olen, enc);
9506 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
9507
9508 if (loffset > 0 || roffset > 0) {
9509 long len = olen-roffset;
9510 if (loffset > 0) {
9511 len -= loffset;
9512 memmove(start, start + loffset, len);
9513 }
9515#if !SHARABLE_MIDDLE_SUBSTRING
9516 TERM_FILL(start+len, rb_enc_mbminlen(enc));
9517#endif
9518 return str;
9519 }
9520 return Qnil;
9521}
9522
9523
9524/*
9525 * call-seq:
9526 * str.strip -> new_str
9527 *
9528 * Returns a copy of the receiver with leading and trailing whitespace removed.
9529 *
9530 * Whitespace is defined as any of the following characters:
9531 * null, horizontal tab, line feed, vertical tab, form feed, carriage return, space.
9532 *
9533 * " hello ".strip #=> "hello"
9534 * "\tgoodbye\r\n".strip #=> "goodbye"
9535 * "\x00\t\n\v\f\r ".strip #=> ""
9536 * "hello".strip #=> "hello"
9537 */
9538
9539static VALUE
9540rb_str_strip(VALUE str)
9541{
9542 char *start;
9543 long olen, loffset, roffset;
9544 rb_encoding *enc = STR_ENC_GET(str);
9545
9546 RSTRING_GETMEM(str, start, olen);
9547 loffset = lstrip_offset(str, start, start+olen, enc);
9548 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
9549
9550 if (loffset <= 0 && roffset <= 0) return str_duplicate(rb_cString, str);
9551 return rb_str_subseq(str, loffset, olen-loffset-roffset);
9552}
9553
9554static VALUE
9555scan_once(VALUE str, VALUE pat, long *start, int set_backref_str)
9556{
9557 VALUE result, match;
9558 struct re_registers *regs;
9559 int i;
9560 long end, pos = rb_pat_search(pat, str, *start, set_backref_str);
9561 if (pos >= 0) {
9562 if (BUILTIN_TYPE(pat) == T_STRING) {
9563 regs = NULL;
9564 end = pos + RSTRING_LEN(pat);
9565 }
9566 else {
9568 regs = RMATCH_REGS(match);
9569 pos = BEG(0);
9570 end = END(0);
9571 }
9572 if (pos == end) {
9573 rb_encoding *enc = STR_ENC_GET(str);
9574 /*
9575 * Always consume at least one character of the input string
9576 */
9577 if (RSTRING_LEN(str) > end)
9579 RSTRING_END(str), enc);
9580 else
9581 *start = end + 1;
9582 }
9583 else {
9584 *start = end;
9585 }
9586 if (!regs || regs->num_regs == 1) {
9587 result = rb_str_subseq(str, pos, end - pos);
9588 return result;
9589 }
9590 result = rb_ary_new2(regs->num_regs);
9591 for (i=1; i < regs->num_regs; i++) {
9592 VALUE s = Qnil;
9593 if (BEG(i) >= 0) {
9594 s = rb_str_subseq(str, BEG(i), END(i)-BEG(i));
9595 }
9596 rb_ary_push(result, s);
9597 }
9598
9599 return result;
9600 }
9601 return Qnil;
9602}
9603
9604
9605/*
9606 * call-seq:
9607 * str.scan(pattern) -> array
9608 * str.scan(pattern) {|match, ...| block } -> str
9609 *
9610 * Both forms iterate through <i>str</i>, matching the pattern (which may be a
9611 * Regexp or a String). For each match, a result is
9612 * generated and either added to the result array or passed to the block. If
9613 * the pattern contains no groups, each individual result consists of the
9614 * matched string, <code>$&</code>. If the pattern contains groups, each
9615 * individual result is itself an array containing one entry per group.
9616 *
9617 * a = "cruel world"
9618 * a.scan(/\w+/) #=> ["cruel", "world"]
9619 * a.scan(/.../) #=> ["cru", "el ", "wor"]
9620 * a.scan(/(...)/) #=> [["cru"], ["el "], ["wor"]]
9621 * a.scan(/(..)(..)/) #=> [["cr", "ue"], ["l ", "wo"]]
9622 *
9623 * And the block form:
9624 *
9625 * a.scan(/\w+/) {|w| print "<<#{w}>> " }
9626 * print "\n"
9627 * a.scan(/(.)(.)/) {|x,y| print y, x }
9628 * print "\n"
9629 *
9630 * <em>produces:</em>
9631 *
9632 * <<cruel>> <<world>>
9633 * rceu lowlr
9634 */
9635
9636static VALUE
9637rb_str_scan(VALUE str, VALUE pat)
9638{
9639 VALUE result;
9640 long start = 0;
9641 long last = -1, prev = 0;
9642 char *p = RSTRING_PTR(str); long len = RSTRING_LEN(str);
9643
9644 pat = get_pat_quoted(pat, 1);
9645 mustnot_broken(str);
9646 if (!rb_block_given_p()) {
9647 VALUE ary = rb_ary_new();
9648
9649 while (!NIL_P(result = scan_once(str, pat, &start, 0))) {
9650 last = prev;
9651 prev = start;
9652 rb_ary_push(ary, result);
9653 }
9654 if (last >= 0) rb_pat_search(pat, str, last, 1);
9655 else rb_backref_set(Qnil);
9656 return ary;
9657 }
9658
9659 while (!NIL_P(result = scan_once(str, pat, &start, 1))) {
9660 last = prev;
9661 prev = start;
9662 rb_yield(result);
9663 str_mod_check(str, p, len);
9664 }
9665 if (last >= 0) rb_pat_search(pat, str, last, 1);
9666 return str;
9667}
9668
9669
9670/*
9671 * call-seq:
9672 * str.hex -> integer
9673 *
9674 * Treats leading characters from <i>str</i> as a string of hexadecimal digits
9675 * (with an optional sign and an optional <code>0x</code>) and returns the
9676 * corresponding number. Zero is returned on error.
9677 *
9678 * "0x0a".hex #=> 10
9679 * "-1234".hex #=> -4660
9680 * "0".hex #=> 0
9681 * "wombat".hex #=> 0
9682 */
9683
9684static VALUE
9685rb_str_hex(VALUE str)
9686{
9687 return rb_str_to_inum(str, 16, FALSE);
9688}
9689
9690
9691/*
9692 * call-seq:
9693 * str.oct -> integer
9694 *
9695 * Treats leading characters of <i>str</i> as a string of octal digits (with an
9696 * optional sign) and returns the corresponding number. Returns 0 if the
9697 * conversion fails.
9698 *
9699 * "123".oct #=> 83
9700 * "-377".oct #=> -255
9701 * "bad".oct #=> 0
9702 * "0377bad".oct #=> 255
9703 *
9704 * If +str+ starts with <code>0</code>, radix indicators are honored.
9705 * See Kernel#Integer.
9706 */
9707
9708static VALUE
9709rb_str_oct(VALUE str)
9710{
9711 return rb_str_to_inum(str, -8, FALSE);
9712}
9713
9714
9715/*
9716 * call-seq:
9717 * str.crypt(salt_str) -> new_str
9718 *
9719 * Returns the string generated by calling <code>crypt(3)</code>
9720 * standard library function with <code>str</code> and
9721 * <code>salt_str</code>, in this order, as its arguments. Please do
9722 * not use this method any longer. It is legacy; provided only for
9723 * backward compatibility with ruby scripts in earlier days. It is
9724 * bad to use in contemporary programs for several reasons:
9725 *
9726 * * Behaviour of C's <code>crypt(3)</code> depends on the OS it is
9727 * run. The generated string lacks data portability.
9728 *
9729 * * On some OSes such as Mac OS, <code>crypt(3)</code> never fails
9730 * (i.e. silently ends up in unexpected results).
9731 *
9732 * * On some OSes such as Mac OS, <code>crypt(3)</code> is not
9733 * thread safe.
9734 *
9735 * * So-called "traditional" usage of <code>crypt(3)</code> is very
9736 * very very weak. According to its manpage, Linux's traditional
9737 * <code>crypt(3)</code> output has only 2**56 variations; too
9738 * easy to brute force today. And this is the default behaviour.
9739 *
9740 * * In order to make things robust some OSes implement so-called
9741 * "modular" usage. To go through, you have to do a complex
9742 * build-up of the <code>salt_str</code> parameter, by hand.
9743 * Failure in generation of a proper salt string tends not to
9744 * yield any errors; typos in parameters are normally not
9745 * detectable.
9746 *
9747 * * For instance, in the following example, the second invocation
9748 * of String#crypt is wrong; it has a typo in "round=" (lacks
9749 * "s"). However the call does not fail and something unexpected
9750 * is generated.
9751 *
9752 * "foo".crypt("$5$rounds=1000$salt$") # OK, proper usage
9753 * "foo".crypt("$5$round=1000$salt$") # Typo not detected
9754 *
9755 * * Even in the "modular" mode, some hash functions are considered
9756 * archaic and no longer recommended at all; for instance module
9757 * <code>$1$</code> is officially abandoned by its author: see
9758 * http://phk.freebsd.dk/sagas/md5crypt_eol.html . For another
9759 * instance module <code>$3$</code> is considered completely
9760 * broken: see the manpage of FreeBSD.
9761 *
9762 * * On some OS such as Mac OS, there is no modular mode. Yet, as
9763 * written above, <code>crypt(3)</code> on Mac OS never fails.
9764 * This means even if you build up a proper salt string it
9765 * generates a traditional DES hash anyways, and there is no way
9766 * for you to be aware of.
9767 *
9768 * "foo".crypt("$5$rounds=1000$salt$") # => "$5fNPQMxC5j6."
9769 *
9770 * If for some reason you cannot migrate to other secure contemporary
9771 * password hashing algorithms, install the string-crypt gem and
9772 * <code>require 'string/crypt'</code> to continue using it.
9773 */
9774
9775static VALUE
9776rb_str_crypt(VALUE str, VALUE salt)
9777{
9778#ifdef HAVE_CRYPT_R
9779 VALUE databuf;
9780 struct crypt_data *data;
9781# define CRYPT_END() ALLOCV_END(databuf)
9782#else
9783 extern char *crypt(const char *, const char *);
9784# define CRYPT_END() (void)0
9785#endif
9786 VALUE result;
9787 const char *s, *saltp;
9788 char *res;
9789#ifdef BROKEN_CRYPT
9790 char salt_8bit_clean[3];
9791#endif
9792
9793 StringValue(salt);
9794 mustnot_wchar(str);
9795 mustnot_wchar(salt);
9796 if (RSTRING_LEN(salt) < 2) {
9797 goto short_salt;
9798 }
9799
9800 s = StringValueCStr(str);
9801 saltp = RSTRING_PTR(salt);
9802 if (!saltp[0] || !saltp[1]) goto short_salt;
9803#ifdef BROKEN_CRYPT
9804 if (!ISASCII((unsigned char)saltp[0]) || !ISASCII((unsigned char)saltp[1])) {
9805 salt_8bit_clean[0] = saltp[0] & 0x7f;
9806 salt_8bit_clean[1] = saltp[1] & 0x7f;
9807 salt_8bit_clean[2] = '\0';
9808 saltp = salt_8bit_clean;
9809 }
9810#endif
9811#ifdef HAVE_CRYPT_R
9812 data = ALLOCV(databuf, sizeof(struct crypt_data));
9813# ifdef HAVE_STRUCT_CRYPT_DATA_INITIALIZED
9814 data->initialized = 0;
9815# endif
9816 res = crypt_r(s, saltp, data);
9817#else
9818 res = crypt(s, saltp);
9819#endif
9820 if (!res) {
9821 int err = errno;
9822 CRYPT_END();
9823 rb_syserr_fail(err, "crypt");
9824 }
9825 result = rb_str_new_cstr(res);
9826 CRYPT_END();
9827 return result;
9828
9829 short_salt:
9830 rb_raise(rb_eArgError, "salt too short (need >=2 bytes)");
9832}
9833
9834
9835/*
9836 * call-seq:
9837 * str.ord -> integer
9838 *
9839 * Returns the Integer ordinal of a one-character string.
9840 *
9841 * "a".ord #=> 97
9842 */
9843
9844static VALUE
9845rb_str_ord(VALUE s)
9846{
9847 unsigned int c;
9848
9849 c = rb_enc_codepoint(RSTRING_PTR(s), RSTRING_END(s), STR_ENC_GET(s));
9850 return UINT2NUM(c);
9851}
9852/*
9853 * call-seq:
9854 * str.sum(n=16) -> integer
9855 *
9856 * Returns a basic <em>n</em>-bit checksum of the characters in <i>str</i>,
9857 * where <em>n</em> is the optional Integer parameter, defaulting
9858 * to 16. The result is simply the sum of the binary value of each byte in
9859 * <i>str</i> modulo <code>2**n - 1</code>. This is not a particularly good
9860 * checksum.
9861 */
9862
9863static VALUE
9864rb_str_sum(int argc, VALUE *argv, VALUE str)
9865{
9866 int bits = 16;
9867 char *ptr, *p, *pend;
9868 long len;
9869 VALUE sum = INT2FIX(0);
9870 unsigned long sum0 = 0;
9871
9872 if (rb_check_arity(argc, 0, 1) && (bits = NUM2INT(argv[0])) < 0) {
9873 bits = 0;
9874 }
9875 ptr = p = RSTRING_PTR(str);
9876 len = RSTRING_LEN(str);
9877 pend = p + len;
9878
9879 while (p < pend) {
9880 if (FIXNUM_MAX - UCHAR_MAX < sum0) {
9881 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
9882 str_mod_check(str, ptr, len);
9883 sum0 = 0;
9884 }
9885 sum0 += (unsigned char)*p;
9886 p++;
9887 }
9888
9889 if (bits == 0) {
9890 if (sum0) {
9891 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
9892 }
9893 }
9894 else {
9895 if (sum == INT2FIX(0)) {
9896 if (bits < (int)sizeof(long)*CHAR_BIT) {
9897 sum0 &= (((unsigned long)1)<<bits)-1;
9898 }
9899 sum = LONG2FIX(sum0);
9900 }
9901 else {
9902 VALUE mod;
9903
9904 if (sum0) {
9905 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
9906 }
9907
9909 mod = rb_funcall(mod, '-', 1, INT2FIX(1));
9910 sum = rb_funcall(sum, '&', 1, mod);
9911 }
9912 }
9913 return sum;
9914}
9915
9916static VALUE
9917rb_str_justify(int argc, VALUE *argv, VALUE str, char jflag)
9918{
9919 rb_encoding *enc;
9920 VALUE w;
9921 long width, len, flen = 1, fclen = 1;
9922 VALUE res;
9923 char *p;
9924 const char *f = " ";
9925 long n, size, llen, rlen, llen2 = 0, rlen2 = 0;
9926 VALUE pad;
9927 int singlebyte = 1, cr;
9928 int termlen;
9929
9930 rb_scan_args(argc, argv, "11", &w, &pad);
9931 enc = STR_ENC_GET(str);
9932 termlen = rb_enc_mbminlen(enc);
9933 width = NUM2LONG(w);
9934 if (argc == 2) {
9935 StringValue(pad);
9936 enc = rb_enc_check(str, pad);
9937 f = RSTRING_PTR(pad);
9938 flen = RSTRING_LEN(pad);
9939 fclen = str_strlen(pad, enc); /* rb_enc_check */
9940 singlebyte = single_byte_optimizable(pad);
9941 if (flen == 0 || fclen == 0) {
9942 rb_raise(rb_eArgError, "zero width padding");
9943 }
9944 }
9945 len = str_strlen(str, enc); /* rb_enc_check */
9946 if (width < 0 || len >= width) return str_duplicate(rb_cString, str);
9947 n = width - len;
9948 llen = (jflag == 'l') ? 0 : ((jflag == 'r') ? n : n/2);
9949 rlen = n - llen;
9950 cr = ENC_CODERANGE(str);
9951 if (flen > 1) {
9952 llen2 = str_offset(f, f + flen, llen % fclen, enc, singlebyte);
9953 rlen2 = str_offset(f, f + flen, rlen % fclen, enc, singlebyte);
9954 }
9955 size = RSTRING_LEN(str);
9956 if ((len = llen / fclen + rlen / fclen) >= LONG_MAX / flen ||
9957 (len *= flen) >= LONG_MAX - llen2 - rlen2 ||
9958 (len += llen2 + rlen2) >= LONG_MAX - size) {
9959 rb_raise(rb_eArgError, "argument too big");
9960 }
9961 len += size;
9962 res = str_new0(rb_cString, 0, len, termlen);
9963 p = RSTRING_PTR(res);
9964 if (flen <= 1) {
9965 memset(p, *f, llen);
9966 p += llen;
9967 }
9968 else {
9969 while (llen >= fclen) {
9970 memcpy(p,f,flen);
9971 p += flen;
9972 llen -= fclen;
9973 }
9974 if (llen > 0) {
9975 memcpy(p, f, llen2);
9976 p += llen2;
9977 }
9978 }
9979 memcpy(p, RSTRING_PTR(str), size);
9980 p += size;
9981 if (flen <= 1) {
9982 memset(p, *f, rlen);
9983 p += rlen;
9984 }
9985 else {
9986 while (rlen >= fclen) {
9987 memcpy(p,f,flen);
9988 p += flen;
9989 rlen -= fclen;
9990 }
9991 if (rlen > 0) {
9992 memcpy(p, f, rlen2);
9993 p += rlen2;
9994 }
9995 }
9996 TERM_FILL(p, termlen);
9997 STR_SET_LEN(res, p-RSTRING_PTR(res));
9998 rb_enc_associate(res, enc);
9999 if (argc == 2)
10000 cr = ENC_CODERANGE_AND(cr, ENC_CODERANGE(pad));
10001 if (cr != ENC_CODERANGE_BROKEN)
10002 ENC_CODERANGE_SET(res, cr);
10003
10004 RB_GC_GUARD(pad);
10005 return res;
10006}
10007
10008
10009/*
10010 * call-seq:
10011 * str.ljust(integer, padstr=' ') -> new_str
10012 *
10013 * If <i>integer</i> is greater than the length of <i>str</i>, returns a new
10014 * String of length <i>integer</i> with <i>str</i> left justified
10015 * and padded with <i>padstr</i>; otherwise, returns <i>str</i>.
10016 *
10017 * "hello".ljust(4) #=> "hello"
10018 * "hello".ljust(20) #=> "hello "
10019 * "hello".ljust(20, '1234') #=> "hello123412341234123"
10020 */
10021
10022static VALUE
10023rb_str_ljust(int argc, VALUE *argv, VALUE str)
10024{
10025 return rb_str_justify(argc, argv, str, 'l');
10026}
10027
10028
10029/*
10030 * call-seq:
10031 * str.rjust(integer, padstr=' ') -> new_str
10032 *
10033 * If <i>integer</i> is greater than the length of <i>str</i>, returns a new
10034 * String of length <i>integer</i> with <i>str</i> right justified
10035 * and padded with <i>padstr</i>; otherwise, returns <i>str</i>.
10036 *
10037 * "hello".rjust(4) #=> "hello"
10038 * "hello".rjust(20) #=> " hello"
10039 * "hello".rjust(20, '1234') #=> "123412341234123hello"
10040 */
10041
10042static VALUE
10043rb_str_rjust(int argc, VALUE *argv, VALUE str)
10044{
10045 return rb_str_justify(argc, argv, str, 'r');
10046}
10047
10048
10049/*
10050 * call-seq:
10051 * str.center(width, padstr=' ') -> new_str
10052 *
10053 * Centers +str+ in +width+. If +width+ is greater than the length of +str+,
10054 * returns a new String of length +width+ with +str+ centered and padded with
10055 * +padstr+; otherwise, returns +str+.
10056 *
10057 * "hello".center(4) #=> "hello"
10058 * "hello".center(20) #=> " hello "
10059 * "hello".center(20, '123') #=> "1231231hello12312312"
10060 */
10061
10062static VALUE
10063rb_str_center(int argc, VALUE *argv, VALUE str)
10064{
10065 return rb_str_justify(argc, argv, str, 'c');
10066}
10067
10068/*
10069 * call-seq:
10070 * str.partition(sep) -> [head, sep, tail]
10071 * str.partition(regexp) -> [head, match, tail]
10072 *
10073 * Searches <i>sep</i> or pattern (<i>regexp</i>) in the string
10074 * and returns the part before it, the match, and the part
10075 * after it.
10076 * If it is not found, returns two empty strings and <i>str</i>.
10077 *
10078 * "hello".partition("l") #=> ["he", "l", "lo"]
10079 * "hello".partition("x") #=> ["hello", "", ""]
10080 * "hello".partition(/.l/) #=> ["h", "el", "lo"]
10081 */
10082
10083static VALUE
10084rb_str_partition(VALUE str, VALUE sep)
10085{
10086 long pos;
10087
10088 sep = get_pat_quoted(sep, 0);
10089 if (RB_TYPE_P(sep, T_REGEXP)) {
10090 if (rb_reg_search(sep, str, 0, 0) < 0) {
10091 goto failed;
10092 }
10094 struct re_registers *regs = RMATCH_REGS(match);
10095
10096 pos = BEG(0);
10097 sep = rb_str_subseq(str, pos, END(0) - pos);
10098 }
10099 else {
10100 pos = rb_str_index(str, sep, 0);
10101 if (pos < 0) goto failed;
10102 }
10103 return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
10104 sep,
10105 rb_str_subseq(str, pos+RSTRING_LEN(sep),
10106 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
10107
10108 failed:
10109 return rb_ary_new3(3, str_duplicate(rb_cString, str), str_new_empty_String(str), str_new_empty_String(str));
10110}
10111
10112/*
10113 * call-seq:
10114 * str.rpartition(sep) -> [head, sep, tail]
10115 * str.rpartition(regexp) -> [head, match, tail]
10116 *
10117 * Searches <i>sep</i> or pattern (<i>regexp</i>) in the string from the end
10118 * of the string, and returns the part before it, the match, and the part
10119 * after it.
10120 * If it is not found, returns two empty strings and <i>str</i>.
10121 *
10122 * "hello".rpartition("l") #=> ["hel", "l", "o"]
10123 * "hello".rpartition("x") #=> ["", "", "hello"]
10124 * "hello".rpartition(/.l/) #=> ["he", "ll", "o"]
10125 *
10126 * The match from the end means starting at the possible last position, not
10127 * the last of longest matches.
10128 *
10129 * "hello".rpartition(/l+/) #=> ["hel", "l", "o"]
10130 *
10131 * To partition at the last longest match, needs to combine with
10132 * negative lookbehind.
10133 *
10134 * "hello".rpartition(/(?<!l)l+/) #=> ["he", "ll", "o"]
10135 *
10136 * Or String#partition with negative lookforward.
10137 *
10138 * "hello".partition(/l+(?!.*l)/) #=> ["he", "ll", "o"]
10139 */
10140
10141static VALUE
10142rb_str_rpartition(VALUE str, VALUE sep)
10143{
10144 long pos = RSTRING_LEN(str);
10145
10146 sep = get_pat_quoted(sep, 0);
10147 if (RB_TYPE_P(sep, T_REGEXP)) {
10148 if (rb_reg_search(sep, str, pos, 1) < 0) {
10149 goto failed;
10150 }
10152 struct re_registers *regs = RMATCH_REGS(match);
10153
10154 pos = BEG(0);
10155 sep = rb_str_subseq(str, pos, END(0) - pos);
10156 }
10157 else {
10158 pos = rb_str_sublen(str, pos);
10159 pos = rb_str_rindex(str, sep, pos);
10160 if(pos < 0) {
10161 goto failed;
10162 }
10163 pos = rb_str_offset(str, pos);
10164 }
10165
10166 return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
10167 sep,
10168 rb_str_subseq(str, pos+RSTRING_LEN(sep),
10169 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
10170 failed:
10171 return rb_ary_new3(3, str_new_empty_String(str), str_new_empty_String(str), str_duplicate(rb_cString, str));
10172}
10173
10174/*
10175 * call-seq:
10176 * str.start_with?([prefixes]+) -> true or false
10177 *
10178 * Returns true if +str+ starts with one of the +prefixes+ given.
10179 * Each of the +prefixes+ should be a String or a Regexp.
10180 *
10181 * "hello".start_with?("hell") #=> true
10182 * "hello".start_with?(/H/i) #=> true
10183 *
10184 * # returns true if one of the prefixes matches.
10185 * "hello".start_with?("heaven", "hell") #=> true
10186 * "hello".start_with?("heaven", "paradise") #=> false
10187 */
10188
10189static VALUE
10190rb_str_start_with(int argc, VALUE *argv, VALUE str)
10191{
10192 int i;
10193
10194 for (i=0; i<argc; i++) {
10195 VALUE tmp = argv[i];
10196 if (RB_TYPE_P(tmp, T_REGEXP)) {
10197 if (rb_reg_start_with_p(tmp, str))
10198 return Qtrue;
10199 }
10200 else {
10201 StringValue(tmp);
10202 rb_enc_check(str, tmp);
10203 if (RSTRING_LEN(str) < RSTRING_LEN(tmp)) continue;
10204 if (memcmp(RSTRING_PTR(str), RSTRING_PTR(tmp), RSTRING_LEN(tmp)) == 0)
10205 return Qtrue;
10206 }
10207 }
10208 return Qfalse;
10209}
10210
10211/*
10212 * call-seq:
10213 * str.end_with?([suffixes]+) -> true or false
10214 *
10215 * Returns true if +str+ ends with one of the +suffixes+ given.
10216 *
10217 * "hello".end_with?("ello") #=> true
10218 *
10219 * # returns true if one of the +suffixes+ matches.
10220 * "hello".end_with?("heaven", "ello") #=> true
10221 * "hello".end_with?("heaven", "paradise") #=> false
10222 */
10223
10224static VALUE
10225rb_str_end_with(int argc, VALUE *argv, VALUE str)
10226{
10227 int i;
10228 char *p, *s, *e;
10229 rb_encoding *enc;
10230
10231 for (i=0; i<argc; i++) {
10232 VALUE tmp = argv[i];
10233 StringValue(tmp);
10234 enc = rb_enc_check(str, tmp);
10235 if (RSTRING_LEN(str) < RSTRING_LEN(tmp)) continue;
10236 p = RSTRING_PTR(str);
10237 e = p + RSTRING_LEN(str);
10238 s = e - RSTRING_LEN(tmp);
10239 if (rb_enc_left_char_head(p, s, e, enc) != s)
10240 continue;
10241 if (memcmp(s, RSTRING_PTR(tmp), RSTRING_LEN(tmp)) == 0)
10242 return Qtrue;
10243 }
10244 return Qfalse;
10245}
10246
10256static long
10257deleted_prefix_length(VALUE str, VALUE prefix)
10258{
10259 char *strptr, *prefixptr;
10260 long olen, prefixlen;
10261
10263 if (is_broken_string(prefix)) return 0;
10265
10266 /* return 0 if not start with prefix */
10267 prefixlen = RSTRING_LEN(prefix);
10268 if (prefixlen <= 0) return 0;
10269 olen = RSTRING_LEN(str);
10270 if (olen < prefixlen) return 0;
10271 strptr = RSTRING_PTR(str);
10272 prefixptr = RSTRING_PTR(prefix);
10273 if (memcmp(strptr, prefixptr, prefixlen) != 0) return 0;
10274
10275 return prefixlen;
10276}
10277
10278/*
10279 * call-seq:
10280 * str.delete_prefix!(prefix) -> self or nil
10281 *
10282 * Deletes leading <code>prefix</code> from <i>str</i>, returning
10283 * <code>nil</code> if no change was made.
10284 *
10285 * "hello".delete_prefix!("hel") #=> "lo"
10286 * "hello".delete_prefix!("llo") #=> nil
10287 */
10288
10289static VALUE
10290rb_str_delete_prefix_bang(VALUE str, VALUE prefix)
10291{
10292 long prefixlen;
10293 str_modify_keep_cr(str);
10294
10295 prefixlen = deleted_prefix_length(str, prefix);
10296 if (prefixlen <= 0) return Qnil;
10297
10298 return rb_str_drop_bytes(str, prefixlen);
10299}
10300
10301/*
10302 * call-seq:
10303 * str.delete_prefix(prefix) -> new_str
10304 *
10305 * Returns a copy of <i>str</i> with leading <code>prefix</code> deleted.
10306 *
10307 * "hello".delete_prefix("hel") #=> "lo"
10308 * "hello".delete_prefix("llo") #=> "hello"
10309 */
10310
10311static VALUE
10312rb_str_delete_prefix(VALUE str, VALUE prefix)
10313{
10314 long prefixlen;
10315
10316 prefixlen = deleted_prefix_length(str, prefix);
10317 if (prefixlen <= 0) return str_duplicate(rb_cString, str);
10318
10319 return rb_str_subseq(str, prefixlen, RSTRING_LEN(str) - prefixlen);
10320}
10321
10331static long
10332deleted_suffix_length(VALUE str, VALUE suffix)
10333{
10334 char *strptr, *suffixptr, *s;
10335 long olen, suffixlen;
10336 rb_encoding *enc;
10337
10339 if (is_broken_string(suffix)) return 0;
10340 enc = rb_enc_check(str, suffix);
10341
10342 /* return 0 if not start with suffix */
10343 suffixlen = RSTRING_LEN(suffix);
10344 if (suffixlen <= 0) return 0;
10345 olen = RSTRING_LEN(str);
10346 if (olen < suffixlen) return 0;
10347 strptr = RSTRING_PTR(str);
10348 suffixptr = RSTRING_PTR(suffix);
10349 s = strptr + olen - suffixlen;
10350 if (memcmp(s, suffixptr, suffixlen) != 0) return 0;
10351 if (rb_enc_left_char_head(strptr, s, strptr + olen, enc) != s) return 0;
10352
10353 return suffixlen;
10354}
10355
10356/*
10357 * call-seq:
10358 * str.delete_suffix!(suffix) -> self or nil
10359 *
10360 * Deletes trailing <code>suffix</code> from <i>str</i>, returning
10361 * <code>nil</code> if no change was made.
10362 *
10363 * "hello".delete_suffix!("llo") #=> "he"
10364 * "hello".delete_suffix!("hel") #=> nil
10365 */
10366
10367static VALUE
10368rb_str_delete_suffix_bang(VALUE str, VALUE suffix)
10369{
10370 long olen, suffixlen, len;
10371 str_modifiable(str);
10372
10373 suffixlen = deleted_suffix_length(str, suffix);
10374 if (suffixlen <= 0) return Qnil;
10375
10376 olen = RSTRING_LEN(str);
10377 str_modify_keep_cr(str);
10378 len = olen - suffixlen;
10383 }
10384 return str;
10385}
10386
10387/*
10388 * call-seq:
10389 * str.delete_suffix(suffix) -> new_str
10390 *
10391 * Returns a copy of <i>str</i> with trailing <code>suffix</code> deleted.
10392 *
10393 * "hello".delete_suffix("llo") #=> "he"
10394 * "hello".delete_suffix("hel") #=> "hello"
10395 */
10396
10397static VALUE
10398rb_str_delete_suffix(VALUE str, VALUE suffix)
10399{
10400 long suffixlen;
10401
10402 suffixlen = deleted_suffix_length(str, suffix);
10403 if (suffixlen <= 0) return str_duplicate(rb_cString, str);
10404
10405 return rb_str_subseq(str, 0, RSTRING_LEN(str) - suffixlen);
10406}
10407
10408void
10410{
10411 if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING)) {
10412 rb_raise(rb_eTypeError, "value of %"PRIsVALUE" must be String", rb_id2str(id));
10413 }
10414 *var = val;
10415}
10416
10417static void
10418rb_fs_setter(VALUE val, ID id, VALUE *var)
10419{
10420 val = rb_fs_check(val);
10421 if (!val) {
10423 "value of %"PRIsVALUE" must be String or Regexp",
10424 rb_id2str(id));
10425 }
10426 if (!NIL_P(val)) {
10427 rb_warn_deprecated("`$;'", NULL);
10428 }
10429 *var = val;
10430}
10431
10432
10433/*
10434 * call-seq:
10435 * str.force_encoding(encoding) -> str
10436 *
10437 * Changes the encoding to +encoding+ and returns self.
10438 */
10439
10440static VALUE
10441rb_str_force_encoding(VALUE str, VALUE enc)
10442{
10443 str_modifiable(str);
10446 return str;
10447}
10448
10449/*
10450 * call-seq:
10451 * str.b -> str
10452 *
10453 * Returns a copied string whose encoding is ASCII-8BIT.
10454 */
10455
10456static VALUE
10457rb_str_b(VALUE str)
10458{
10459 VALUE str2 = str_alloc(rb_cString);
10460 str_replace_shared_without_enc(str2, str);
10461 ENC_CODERANGE_CLEAR(str2);
10462 return str2;
10463}
10464
10465/*
10466 * call-seq:
10467 * str.valid_encoding? -> true or false
10468 *
10469 * Returns true for a string which is encoded correctly.
10470 *
10471 * "\xc2\xa1".force_encoding("UTF-8").valid_encoding? #=> true
10472 * "\xc2".force_encoding("UTF-8").valid_encoding? #=> false
10473 * "\x80".force_encoding("UTF-8").valid_encoding? #=> false
10474 */
10475
10476static VALUE
10477rb_str_valid_encoding_p(VALUE str)
10478{
10479 int cr = rb_enc_str_coderange(str);
10480
10481 return cr == ENC_CODERANGE_BROKEN ? Qfalse : Qtrue;
10482}
10483
10484/*
10485 * call-seq:
10486 * str.ascii_only? -> true or false
10487 *
10488 * Returns true for a string which has only ASCII characters.
10489 *
10490 * "abc".force_encoding("UTF-8").ascii_only? #=> true
10491 * "abc\u{6666}".force_encoding("UTF-8").ascii_only? #=> false
10492 */
10493
10494static VALUE
10495rb_str_is_ascii_only_p(VALUE str)
10496{
10497 int cr = rb_enc_str_coderange(str);
10498
10499 return cr == ENC_CODERANGE_7BIT ? Qtrue : Qfalse;
10500}
10501
10516VALUE
10518{
10519 static const char ellipsis[] = "...";
10520 const long ellipsislen = sizeof(ellipsis) - 1;
10521 rb_encoding *const enc = rb_enc_get(str);
10522 const long blen = RSTRING_LEN(str);
10523 const char *const p = RSTRING_PTR(str), *e = p + blen;
10524 VALUE estr, ret = 0;
10525
10526 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
10527 if (len * rb_enc_mbminlen(enc) >= blen ||
10528 (e = rb_enc_nth(p, e, len, enc)) - p == blen) {
10529 ret = str;
10530 }
10531 else if (len <= ellipsislen ||
10532 !(e = rb_enc_step_back(p, e, e, len = ellipsislen, enc))) {
10533 if (rb_enc_asciicompat(enc)) {
10534 ret = rb_str_new(ellipsis, len);
10535 rb_enc_associate(ret, enc);
10536 }
10537 else {
10538 estr = rb_usascii_str_new(ellipsis, len);
10539 ret = rb_str_encode(estr, rb_enc_from_encoding(enc), 0, Qnil);
10540 }
10541 }
10542 else if (ret = rb_str_subseq(str, 0, e - p), rb_enc_asciicompat(enc)) {
10543 rb_str_cat(ret, ellipsis, ellipsislen);
10544 }
10545 else {
10546 estr = rb_str_encode(rb_usascii_str_new(ellipsis, ellipsislen),
10547 rb_enc_from_encoding(enc), 0, Qnil);
10548 rb_str_append(ret, estr);
10549 }
10550 return ret;
10551}
10552
10553static VALUE
10554str_compat_and_valid(VALUE str, rb_encoding *enc)
10555{
10556 int cr;
10557 str = StringValue(str);
10559 if (cr == ENC_CODERANGE_BROKEN) {
10560 rb_raise(rb_eArgError, "replacement must be valid byte sequence '%+"PRIsVALUE"'", str);
10561 }
10562 else {
10564 if (cr == ENC_CODERANGE_7BIT ? rb_enc_mbminlen(enc) != 1 : enc != e) {
10565 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
10566 rb_enc_name(enc), rb_enc_name(e));
10567 }
10568 }
10569 return str;
10570}
10571
10572static VALUE enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr);
10573
10579VALUE
10581{
10582 rb_encoding *enc = STR_ENC_GET(str);
10583 return enc_str_scrub(enc, str, repl, ENC_CODERANGE(str));
10584}
10585
10586VALUE
10588{
10589 int cr = ENC_CODERANGE_UNKNOWN;
10590 if (enc == STR_ENC_GET(str)) {
10591 /* cached coderange makes sense only when enc equals the
10592 * actual encoding of str */
10593 cr = ENC_CODERANGE(str);
10594 }
10595 return enc_str_scrub(enc, str, repl, cr);
10596}
10597
10598static VALUE
10599enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr)
10600{
10601 int encidx;
10602 VALUE buf = Qnil;
10603 const char *rep, *p, *e, *p1, *sp;
10604 long replen = -1;
10605 long slen;
10606
10607 if (rb_block_given_p()) {
10608 if (!NIL_P(repl))
10609 rb_raise(rb_eArgError, "both of block and replacement given");
10610 replen = 0;
10611 }
10612
10613 if (ENC_CODERANGE_CLEAN_P(cr))
10614 return Qnil;
10615
10616 if (!NIL_P(repl)) {
10617 repl = str_compat_and_valid(repl, enc);
10618 }
10619
10620 if (rb_enc_dummy_p(enc)) {
10621 return Qnil;
10622 }
10623 encidx = rb_enc_to_index(enc);
10624
10625#define DEFAULT_REPLACE_CHAR(str) do { \
10626 static const char replace[sizeof(str)-1] = str; \
10627 rep = replace; replen = (int)sizeof(replace); \
10628 } while (0)
10629
10630 slen = RSTRING_LEN(str);
10631 p = RSTRING_PTR(str);
10632 e = RSTRING_END(str);
10633 p1 = p;
10634 sp = p;
10635
10636 if (rb_enc_asciicompat(enc)) {
10637 int rep7bit_p;
10638 if (!replen) {
10639 rep = NULL;
10640 rep7bit_p = FALSE;
10641 }
10642 else if (!NIL_P(repl)) {
10643 rep = RSTRING_PTR(repl);
10644 replen = RSTRING_LEN(repl);
10645 rep7bit_p = (ENC_CODERANGE(repl) == ENC_CODERANGE_7BIT);
10646 }
10647 else if (encidx == rb_utf8_encindex()) {
10648 DEFAULT_REPLACE_CHAR("\xEF\xBF\xBD");
10649 rep7bit_p = FALSE;
10650 }
10651 else {
10653 rep7bit_p = TRUE;
10654 }
10655 cr = ENC_CODERANGE_7BIT;
10656
10657 p = search_nonascii(p, e);
10658 if (!p) {
10659 p = e;
10660 }
10661 while (p < e) {
10662 int ret = rb_enc_precise_mbclen(p, e, enc);
10663 if (MBCLEN_NEEDMORE_P(ret)) {
10664 break;
10665 }
10666 else if (MBCLEN_CHARFOUND_P(ret)) {
10668 p += MBCLEN_CHARFOUND_LEN(ret);
10669 }
10670 else if (MBCLEN_INVALID_P(ret)) {
10671 /*
10672 * p1~p: valid ascii/multibyte chars
10673 * p ~e: invalid bytes + unknown bytes
10674 */
10675 long clen = rb_enc_mbmaxlen(enc);
10677 if (p > p1) {
10678 rb_str_buf_cat(buf, p1, p - p1);
10679 }
10680
10681 if (e - p < clen) clen = e - p;
10682 if (clen <= 2) {
10683 clen = 1;
10684 }
10685 else {
10686 const char *q = p;
10687 clen--;
10688 for (; clen > 1; clen--) {
10689 ret = rb_enc_precise_mbclen(q, q + clen, enc);
10690 if (MBCLEN_NEEDMORE_P(ret)) break;
10691 if (MBCLEN_INVALID_P(ret)) continue;
10693 }
10694 }
10695 if (rep) {
10696 rb_str_buf_cat(buf, rep, replen);
10697 if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
10698 }
10699 else {
10700 repl = rb_yield(rb_enc_str_new(p, clen, enc));
10701 str_mod_check(str, sp, slen);
10702 repl = str_compat_and_valid(repl, enc);
10706 }
10707 p += clen;
10708 p1 = p;
10709 p = search_nonascii(p, e);
10710 if (!p) {
10711 p = e;
10712 break;
10713 }
10714 }
10715 else {
10717 }
10718 }
10719 if (NIL_P(buf)) {
10720 if (p == e) {
10722 return Qnil;
10723 }
10725 }
10726 if (p1 < p) {
10727 rb_str_buf_cat(buf, p1, p - p1);
10728 }
10729 if (p < e) {
10730 if (rep) {
10731 rb_str_buf_cat(buf, rep, replen);
10732 if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
10733 }
10734 else {
10735 repl = rb_yield(rb_enc_str_new(p, e-p, enc));
10736 str_mod_check(str, sp, slen);
10737 repl = str_compat_and_valid(repl, enc);
10741 }
10742 }
10743 }
10744 else {
10745 /* ASCII incompatible */
10746 long mbminlen = rb_enc_mbminlen(enc);
10747 if (!replen) {
10748 rep = NULL;
10749 }
10750 else if (!NIL_P(repl)) {
10751 rep = RSTRING_PTR(repl);
10752 replen = RSTRING_LEN(repl);
10753 }
10754 else if (encidx == ENCINDEX_UTF_16BE) {
10755 DEFAULT_REPLACE_CHAR("\xFF\xFD");
10756 }
10757 else if (encidx == ENCINDEX_UTF_16LE) {
10758 DEFAULT_REPLACE_CHAR("\xFD\xFF");
10759 }
10760 else if (encidx == ENCINDEX_UTF_32BE) {
10761 DEFAULT_REPLACE_CHAR("\x00\x00\xFF\xFD");
10762 }
10763 else if (encidx == ENCINDEX_UTF_32LE) {
10764 DEFAULT_REPLACE_CHAR("\xFD\xFF\x00\x00");
10765 }
10766 else {
10768 }
10769
10770 while (p < e) {
10771 int ret = rb_enc_precise_mbclen(p, e, enc);
10772 if (MBCLEN_NEEDMORE_P(ret)) {
10773 break;
10774 }
10775 else if (MBCLEN_CHARFOUND_P(ret)) {
10776 p += MBCLEN_CHARFOUND_LEN(ret);
10777 }
10778 else if (MBCLEN_INVALID_P(ret)) {
10779 const char *q = p;
10780 long clen = rb_enc_mbmaxlen(enc);
10782 if (p > p1) rb_str_buf_cat(buf, p1, p - p1);
10783
10784 if (e - p < clen) clen = e - p;
10785 if (clen <= mbminlen * 2) {
10786 clen = mbminlen;
10787 }
10788 else {
10789 clen -= mbminlen;
10790 for (; clen > mbminlen; clen-=mbminlen) {
10791 ret = rb_enc_precise_mbclen(q, q + clen, enc);
10792 if (MBCLEN_NEEDMORE_P(ret)) break;
10793 if (MBCLEN_INVALID_P(ret)) continue;
10795 }
10796 }
10797 if (rep) {
10798 rb_str_buf_cat(buf, rep, replen);
10799 }
10800 else {
10801 repl = rb_yield(rb_enc_str_new(p, clen, enc));
10802 str_mod_check(str, sp, slen);
10803 repl = str_compat_and_valid(repl, enc);
10805 }
10806 p += clen;
10807 p1 = p;
10808 }
10809 else {
10811 }
10812 }
10813 if (NIL_P(buf)) {
10814 if (p == e) {
10816 return Qnil;
10817 }
10819 }
10820 if (p1 < p) {
10821 rb_str_buf_cat(buf, p1, p - p1);
10822 }
10823 if (p < e) {
10824 if (rep) {
10825 rb_str_buf_cat(buf, rep, replen);
10826 }
10827 else {
10828 repl = rb_yield(rb_enc_str_new(p, e-p, enc));
10829 str_mod_check(str, sp, slen);
10830 repl = str_compat_and_valid(repl, enc);
10832 }
10833 }
10835 }
10837 return buf;
10838}
10839
10840/*
10841 * call-seq:
10842 * str.scrub -> new_str
10843 * str.scrub(repl) -> new_str
10844 * str.scrub{|bytes|} -> new_str
10845 *
10846 * If the string is invalid byte sequence then replace invalid bytes with given replacement
10847 * character, else returns self.
10848 * If block is given, replace invalid bytes with returned value of the block.
10849 *
10850 * "abc\u3042\x81".scrub #=> "abc\u3042\uFFFD"
10851 * "abc\u3042\x81".scrub("*") #=> "abc\u3042*"
10852 * "abc\u3042\xE3\x80".scrub{|bytes| '<'+bytes.unpack('H*')[0]+'>' } #=> "abc\u3042<e380>"
10853 */
10854static VALUE
10855str_scrub(int argc, VALUE *argv, VALUE str)
10856{
10857 VALUE repl = argc ? (rb_check_arity(argc, 0, 1), argv[0]) : Qnil;
10858 VALUE new = rb_str_scrub(str, repl);
10859 return NIL_P(new) ? str_duplicate(rb_cString, str): new;
10860}
10861
10862/*
10863 * call-seq:
10864 * str.scrub! -> str
10865 * str.scrub!(repl) -> str
10866 * str.scrub!{|bytes|} -> str
10867 *
10868 * If the string is invalid byte sequence then replace invalid bytes with given replacement
10869 * character, else returns self.
10870 * If block is given, replace invalid bytes with returned value of the block.
10871 *
10872 * "abc\u3042\x81".scrub! #=> "abc\u3042\uFFFD"
10873 * "abc\u3042\x81".scrub!("*") #=> "abc\u3042*"
10874 * "abc\u3042\xE3\x80".scrub!{|bytes| '<'+bytes.unpack('H*')[0]+'>' } #=> "abc\u3042<e380>"
10875 */
10876static VALUE
10877str_scrub_bang(int argc, VALUE *argv, VALUE str)
10878{
10879 VALUE repl = argc ? (rb_check_arity(argc, 0, 1), argv[0]) : Qnil;
10880 VALUE new = rb_str_scrub(str, repl);
10881 if (!NIL_P(new)) rb_str_replace(str, new);
10882 return str;
10883}
10884
10885static ID id_normalize;
10886static ID id_normalized_p;
10887static VALUE mUnicodeNormalize;
10888
10889static VALUE
10890unicode_normalize_common(int argc, VALUE *argv, VALUE str, ID id)
10891{
10892 static int UnicodeNormalizeRequired = 0;
10893 VALUE argv2[2];
10894
10895 if (!UnicodeNormalizeRequired) {
10896 rb_require("unicode_normalize/normalize.rb");
10897 UnicodeNormalizeRequired = 1;
10898 }
10899 argv2[0] = str;
10900 if (rb_check_arity(argc, 0, 1)) argv2[1] = argv[0];
10901 return rb_funcallv(mUnicodeNormalize, id, argc+1, argv2);
10902}
10903
10904/*
10905 * call-seq:
10906 * str.unicode_normalize(form=:nfc)
10907 *
10908 * Unicode Normalization---Returns a normalized form of +str+,
10909 * using Unicode normalizations NFC, NFD, NFKC, or NFKD.
10910 * The normalization form used is determined by +form+, which can
10911 * be any of the four values +:nfc+, +:nfd+, +:nfkc+, or +:nfkd+.
10912 * The default is +:nfc+.
10913 *
10914 * If the string is not in a Unicode Encoding, then an Exception is raised.
10915 * In this context, 'Unicode Encoding' means any of UTF-8, UTF-16BE/LE,
10916 * and UTF-32BE/LE, as well as GB18030, UCS_2BE, and UCS_4BE.
10917 * Anything other than UTF-8 is implemented by converting to UTF-8,
10918 * which makes it slower than UTF-8.
10919 *
10920 * "a\u0300".unicode_normalize #=> "\u00E0"
10921 * "a\u0300".unicode_normalize(:nfc) #=> "\u00E0"
10922 * "\u00E0".unicode_normalize(:nfd) #=> "a\u0300"
10923 * "\xE0".force_encoding('ISO-8859-1').unicode_normalize(:nfd)
10924 * #=> Encoding::CompatibilityError raised
10925 */
10926static VALUE
10927rb_str_unicode_normalize(int argc, VALUE *argv, VALUE str)
10928{
10929 return unicode_normalize_common(argc, argv, str, id_normalize);
10930}
10931
10932/*
10933 * call-seq:
10934 * str.unicode_normalize!(form=:nfc)
10935 *
10936 * Destructive version of String#unicode_normalize, doing Unicode
10937 * normalization in place.
10938 */
10939static VALUE
10940rb_str_unicode_normalize_bang(int argc, VALUE *argv, VALUE str)
10941{
10942 return rb_str_replace(str, unicode_normalize_common(argc, argv, str, id_normalize));
10943}
10944
10945/* call-seq:
10946 * str.unicode_normalized?(form=:nfc)
10947 *
10948 * Checks whether +str+ is in Unicode normalization form +form+,
10949 * which can be any of the four values +:nfc+, +:nfd+, +:nfkc+, or +:nfkd+.
10950 * The default is +:nfc+.
10951 *
10952 * If the string is not in a Unicode Encoding, then an Exception is raised.
10953 * For details, see String#unicode_normalize.
10954 *
10955 * "a\u0300".unicode_normalized? #=> false
10956 * "a\u0300".unicode_normalized?(:nfd) #=> true
10957 * "\u00E0".unicode_normalized? #=> true
10958 * "\u00E0".unicode_normalized?(:nfd) #=> false
10959 * "\xE0".force_encoding('ISO-8859-1').unicode_normalized?
10960 * #=> Encoding::CompatibilityError raised
10961 */
10962static VALUE
10963rb_str_unicode_normalized_p(int argc, VALUE *argv, VALUE str)
10964{
10965 return unicode_normalize_common(argc, argv, str, id_normalized_p);
10966}
10967
10968/**********************************************************************
10969 * Document-class: Symbol
10970 *
10971 * Symbol objects represent names inside the Ruby interpreter. They
10972 * are generated using the <code>:name</code> and
10973 * <code>:"string"</code> literals syntax, and by the various
10974 * <code>to_sym</code> methods. The same Symbol object will be
10975 * created for a given name or string for the duration of a program's
10976 * execution, regardless of the context or meaning of that name. Thus
10977 * if <code>Fred</code> is a constant in one context, a method in
10978 * another, and a class in a third, the Symbol <code>:Fred</code>
10979 * will be the same object in all three contexts.
10980 *
10981 * module One
10982 * class Fred
10983 * end
10984 * $f1 = :Fred
10985 * end
10986 * module Two
10987 * Fred = 1
10988 * $f2 = :Fred
10989 * end
10990 * def Fred()
10991 * end
10992 * $f3 = :Fred
10993 * $f1.object_id #=> 2514190
10994 * $f2.object_id #=> 2514190
10995 * $f3.object_id #=> 2514190
10996 *
10997 */
10998
10999
11000/*
11001 * call-seq:
11002 * sym == obj -> true or false
11003 *
11004 * Equality---If <i>sym</i> and <i>obj</i> are exactly the same
11005 * symbol, returns <code>true</code>.
11006 */
11007
11008#define sym_equal rb_obj_equal
11009
11010static int
11011sym_printable(const char *s, const char *send, rb_encoding *enc)
11012{
11013 while (s < send) {
11014 int n;
11015 int c = rb_enc_precise_mbclen(s, send, enc);
11016
11017 if (!MBCLEN_CHARFOUND_P(c)) return FALSE;
11018 n = MBCLEN_CHARFOUND_LEN(c);
11019 c = rb_enc_mbc_to_codepoint(s, send, enc);
11020 if (!rb_enc_isprint(c, enc)) return FALSE;
11021 s += n;
11022 }
11023 return TRUE;
11024}
11025
11026int
11028{
11029 rb_encoding *enc;
11030 const char *ptr;
11031 long len;
11033
11034 if (resenc == NULL) resenc = rb_default_external_encoding();
11035 enc = STR_ENC_GET(sym);
11036 ptr = RSTRING_PTR(sym);
11037 len = RSTRING_LEN(sym);
11038 if ((resenc != enc && !rb_str_is_ascii_only_p(sym)) || len != (long)strlen(ptr) ||
11039 !rb_enc_symname2_p(ptr, len, enc) || !sym_printable(ptr, ptr + len, enc)) {
11040 return FALSE;
11041 }
11042 return TRUE;
11043}
11044
11045VALUE
11047{
11048 rb_encoding *enc;
11049 const char *ptr;
11050 long len;
11051 rb_encoding *resenc;
11052
11053 Check_Type(str, T_STRING);
11055 if (resenc == NULL) resenc = rb_default_external_encoding();
11056 enc = STR_ENC_GET(str);
11057 ptr = RSTRING_PTR(str);
11058 len = RSTRING_LEN(str);
11059 if ((resenc != enc && !rb_str_is_ascii_only_p(str)) ||
11060 !sym_printable(ptr, ptr + len, enc)) {
11061 return rb_str_inspect(str);
11062 }
11063 return str;
11064}
11065
11068{
11069 VALUE str = rb_id2str(id);
11070 if (!rb_str_symname_p(str)) {
11071 return rb_str_inspect(str);
11072 }
11073 return str;
11074}
11075
11076/*
11077 * call-seq:
11078 * sym.inspect -> string
11079 *
11080 * Returns the representation of <i>sym</i> as a symbol literal.
11081 *
11082 * :fred.inspect #=> ":fred"
11083 */
11084
11085static VALUE
11086sym_inspect(VALUE sym)
11087{
11089 const char *ptr;
11090 long len;
11091 char *dest;
11092
11093 if (!rb_str_symname_p(str)) {
11095 len = RSTRING_LEN(str);
11096 rb_str_resize(str, len + 1);
11097 dest = RSTRING_PTR(str);
11098 memmove(dest + 1, dest, len);
11099 }
11100 else {
11101 rb_encoding *enc = STR_ENC_GET(str);
11103 str = rb_enc_str_new(0, len + 1, enc);
11104 dest = RSTRING_PTR(str);
11105 memcpy(dest + 1, ptr, len);
11106 }
11107 dest[0] = ':';
11108 return str;
11109}
11110
#if 0 /* for RDoc */
/*
 *  call-seq:
 *     sym.name    -> string
 *
 *  Returns the name or string corresponding to <i>sym</i>. Unlike #to_s, the
 *  returned string is frozen.
 *
 *     :fred.name         #=> "fred"
 *     :fred.name.frozen? #=> true
 *     :fred.to_s         #=> "fred"
 *     :fred.to_s.frozen? #=> false
 */
/* NOTE(review): this stub is never compiled; it only carries the RDoc
 * comment above.  Its name line was missing, so rb_sym2str is assumed
 * here -- confirm against the Symbol#name method registration. */
VALUE
rb_sym2str(VALUE sym)
{

}
#endif
11130
11131
11132/*
11133 * call-seq:
11134 * sym.id2name -> string
11135 * sym.to_s -> string
11136 *
11137 * Returns the name or string corresponding to <i>sym</i>.
11138 *
11139 * :fred.id2name #=> "fred"
11140 * :ginger.to_s #=> "ginger"
11141 *
11142 * Note that this string is not frozen (unlike the symbol itself).
11143 * To get a frozen string, use #name.
11144 */
11145
11146
11147VALUE
11149{
11150 return str_new_shared(rb_cString, rb_sym2str(sym));
11151}
11152
11153
11154/*
11155 * call-seq:
11156 * sym.to_sym -> sym
11157 * sym.intern -> sym
11158 *
11159 * In general, <code>to_sym</code> returns the Symbol corresponding
11160 * to an object. As <i>sym</i> is already a symbol, <code>self</code> is returned
11161 * in this case.
11162 */
11163
11164static VALUE
11165sym_to_sym(VALUE sym)
11166{
11167 return sym;
11168}
11169
11171rb_sym_proc_call(ID mid, int argc, const VALUE *argv, int kw_splat, VALUE passed_proc)
11172{
11173 VALUE obj;
11174
11175 if (argc < 1) {
11176 rb_raise(rb_eArgError, "no receiver given");
11177 }
11178 obj = argv[0];
11179 return rb_funcall_with_block_kw(obj, mid, argc - 1, argv + 1, passed_proc, kw_splat);
11180}
11181
#if 0
/*
 * call-seq:
 *   sym.to_proc
 *
 * Returns a _Proc_ object which responds to the given method by _sym_.
 *
 *   (1..3).collect(&:to_s)  #=> ["1", "2", "3"]
 */

/* NOTE(review): never compiled; this stub only carries the RDoc comment
 * above.  Its name line was missing, so rb_sym_to_proc is assumed --
 * confirm against the Symbol#to_proc method registration. */
VALUE
rb_sym_to_proc(VALUE sym)
{
}
#endif
11197
11198/*
11199 * call-seq:
11200 *
11201 * sym.succ
11202 *
11203 * Same as <code>sym.to_s.succ.intern</code>.
11204 */
11205
11206static VALUE
11207sym_succ(VALUE sym)
11208{
11210}
11211
11212/*
11213 * call-seq:
11214 *
11215 * symbol <=> other_symbol -> -1, 0, +1, or nil
11216 *
11217 * Compares +symbol+ with +other_symbol+ after calling #to_s on each of the
11218 * symbols. Returns -1, 0, +1, or +nil+ depending on whether +symbol+ is
11219 * less than, equal to, or greater than +other_symbol+.
11220 *
11221 * +nil+ is returned if the two values are incomparable.
11222 *
11223 * See String#<=> for more information.
11224 */
11225
11226static VALUE
11227sym_cmp(VALUE sym, VALUE other)
11228{
11229 if (!SYMBOL_P(other)) {
11230 return Qnil;
11231 }
11232 return rb_str_cmp_m(rb_sym2str(sym), rb_sym2str(other));
11233}
11234
11235/*
11236 * call-seq:
11237 * sym.casecmp(other_symbol) -> -1, 0, +1, or nil
11238 *
11239 * Case-insensitive version of Symbol#<=>.
11240 * Currently, case-insensitivity only works on characters A-Z/a-z,
11241 * not all of Unicode. This is different from Symbol#casecmp?.
11242 *
11243 * :aBcDeF.casecmp(:abcde) #=> 1
11244 * :aBcDeF.casecmp(:abcdef) #=> 0
11245 * :aBcDeF.casecmp(:abcdefg) #=> -1
11246 * :abcdef.casecmp(:ABCDEF) #=> 0
11247 *
11248 * +nil+ is returned if the two symbols have incompatible encodings,
11249 * or if +other_symbol+ is not a symbol.
11250 *
11251 * :foo.casecmp(2) #=> nil
11252 * "\u{e4 f6 fc}".encode("ISO-8859-1").to_sym.casecmp(:"\u{c4 d6 dc}") #=> nil
11253 */
11254
11255static VALUE
11256sym_casecmp(VALUE sym, VALUE other)
11257{
11258 if (!SYMBOL_P(other)) {
11259 return Qnil;
11260 }
11261 return str_casecmp(rb_sym2str(sym), rb_sym2str(other));
11262}
11263
11264/*
11265 * call-seq:
11266 * sym.casecmp?(other_symbol) -> true, false, or nil
11267 *
11268 * Returns +true+ if +sym+ and +other_symbol+ are equal after
11269 * Unicode case folding, +false+ if they are not equal.
11270 *
11271 * :aBcDeF.casecmp?(:abcde) #=> false
11272 * :aBcDeF.casecmp?(:abcdef) #=> true
11273 * :aBcDeF.casecmp?(:abcdefg) #=> false
11274 * :abcdef.casecmp?(:ABCDEF) #=> true
11275 * :"\u{e4 f6 fc}".casecmp?(:"\u{c4 d6 dc}") #=> true
11276 *
11277 * +nil+ is returned if the two symbols have incompatible encodings,
11278 * or if +other_symbol+ is not a symbol.
11279 *
11280 * :foo.casecmp?(2) #=> nil
11281 * "\u{e4 f6 fc}".encode("ISO-8859-1").to_sym.casecmp?(:"\u{c4 d6 dc}") #=> nil
11282 */
11283
11284static VALUE
11285sym_casecmp_p(VALUE sym, VALUE other)
11286{
11287 if (!SYMBOL_P(other)) {
11288 return Qnil;
11289 }
11290 return str_casecmp_p(rb_sym2str(sym), rb_sym2str(other));
11291}
11292
11293/*
11294 * call-seq:
11295 * sym =~ obj -> integer or nil
11296 *
11297 * Returns <code>sym.to_s =~ obj</code>.
11298 */
11299
11300static VALUE
11301sym_match(VALUE sym, VALUE other)
11302{
11303 return rb_str_match(rb_sym2str(sym), other);
11304}
11305
11306/*
11307 * call-seq:
11308 * sym.match(pattern) -> matchdata or nil
11309 * sym.match(pattern, pos) -> matchdata or nil
11310 *
11311 * Returns <code>sym.to_s.match</code>.
11312 */
11313
11314static VALUE
11315sym_match_m(int argc, VALUE *argv, VALUE sym)
11316{
11317 return rb_str_match_m(argc, argv, rb_sym2str(sym));
11318}
11319
11320/*
11321 * call-seq:
11322 * sym.match?(pattern) -> true or false
11323 * sym.match?(pattern, pos) -> true or false
11324 *
11325 * Returns <code>sym.to_s.match?</code>.
11326 */
11327
11328static VALUE
11329sym_match_m_p(int argc, VALUE *argv, VALUE sym)
11330{
11331 return rb_str_match_m_p(argc, argv, sym);
11332}
11333
11334/*
11335 * call-seq:
11336 * sym[idx] -> char
11337 * sym[b, n] -> string
11338 * sym.slice(idx) -> char
11339 * sym.slice(b, n) -> string
11340 *
11341 * Returns <code>sym.to_s[]</code>.
11342 */
11343
11344static VALUE
11345sym_aref(int argc, VALUE *argv, VALUE sym)
11346{
11347 return rb_str_aref_m(argc, argv, rb_sym2str(sym));
11348}
11349
11350/*
11351 * call-seq:
11352 * sym.length -> integer
11353 * sym.size -> integer
11354 *
11355 * Same as <code>sym.to_s.length</code>.
11356 */
11357
11358static VALUE
11359sym_length(VALUE sym)
11360{
11361 return rb_str_length(rb_sym2str(sym));
11362}
11363
11364/*
11365 * call-seq:
11366 * sym.empty? -> true or false
11367 *
11368 * Returns whether _sym_ is :"" or not.
11369 */
11370
11371static VALUE
11372sym_empty(VALUE sym)
11373{
11374 return rb_str_empty(rb_sym2str(sym));
11375}
11376
11377/*
11378 * call-seq:
11379 * sym.upcase -> symbol
11380 * sym.upcase([options]) -> symbol
11381 *
11382 * Same as <code>sym.to_s.upcase.intern</code>.
11383 */
11384
11385static VALUE
11386sym_upcase(int argc, VALUE *argv, VALUE sym)
11387{
11388 return rb_str_intern(rb_str_upcase(argc, argv, rb_sym2str(sym)));
11389}
11390
11391/*
11392 * call-seq:
11393 * sym.downcase -> symbol
11394 * sym.downcase([options]) -> symbol
11395 *
11396 * Same as <code>sym.to_s.downcase.intern</code>.
11397 */
11398
11399static VALUE
11400sym_downcase(int argc, VALUE *argv, VALUE sym)
11401{
11402 return rb_str_intern(rb_str_downcase(argc, argv, rb_sym2str(sym)));
11403}
11404
11405/*
11406 * call-seq:
11407 * sym.capitalize -> symbol
11408 * sym.capitalize([options]) -> symbol
11409 *
11410 * Same as <code>sym.to_s.capitalize.intern</code>.
11411 */
11412
11413static VALUE
11414sym_capitalize(int argc, VALUE *argv, VALUE sym)
11415{
11416 return rb_str_intern(rb_str_capitalize(argc, argv, rb_sym2str(sym)));
11417}
11418
11419/*
11420 * call-seq:
11421 * sym.swapcase -> symbol
11422 * sym.swapcase([options]) -> symbol
11423 *
11424 * Same as <code>sym.to_s.swapcase.intern</code>.
11425 */
11426
11427static VALUE
11428sym_swapcase(int argc, VALUE *argv, VALUE sym)
11429{
11430 return rb_str_intern(rb_str_swapcase(argc, argv, rb_sym2str(sym)));
11431}
11432
11433/*
11434 * call-seq:
11435 * sym.start_with?([prefixes]+) -> true or false
11436 *
11437 * Returns true if +sym+ starts with one of the +prefixes+ given.
11438 * Each of the +prefixes+ should be a String or a Regexp.
11439 *
11440 * :hello.start_with?("hell") #=> true
11441 * :hello.start_with?(/H/i) #=> true
11442 *
11443 * # returns true if one of the prefixes matches.
11444 * :hello.start_with?("heaven", "hell") #=> true
11445 * :hello.start_with?("heaven", "paradise") #=> false
11446 */
11447
11448static VALUE
11449sym_start_with(int argc, VALUE *argv, VALUE sym)
11450{
11451 return rb_str_start_with(argc, argv, rb_sym2str(sym));
11452}
11453
11454/*
11455 * call-seq:
11456 * sym.end_with?([suffixes]+) -> true or false
11457 *
11458 * Returns true if +sym+ ends with one of the +suffixes+ given.
11459 *
11460 * :hello.end_with?("ello") #=> true
11461 *
11462 * # returns true if one of the +suffixes+ matches.
11463 * :hello.end_with?("heaven", "ello") #=> true
11464 * :hello.end_with?("heaven", "paradise") #=> false
11465 */
11466
11467static VALUE
11468sym_end_with(int argc, VALUE *argv, VALUE sym)
11469{
11470 return rb_str_end_with(argc, argv, rb_sym2str(sym));
11471}
11472
11473/*
11474 * call-seq:
11475 * sym.encoding -> encoding
11476 *
11477 * Returns the Encoding object that represents the encoding of _sym_.
11478 */
11479
11480static VALUE
11481sym_encoding(VALUE sym)
11482{
11484}
11485
11486static VALUE
11487string_for_symbol(VALUE name)
11488{
11489 if (!RB_TYPE_P(name, T_STRING)) {
11491 if (NIL_P(tmp)) {
11492 rb_raise(rb_eTypeError, "%+"PRIsVALUE" is not a symbol",
11493 name);
11494 }
11495 name = tmp;
11496 }
11497 return name;
11498}
11499
11500ID
11502{
11503 if (SYMBOL_P(name)) {
11504 return SYM2ID(name);
11505 }
11506 name = string_for_symbol(name);
11507 return rb_intern_str(name);
11508}
11509
11510VALUE
11512{
11513 if (SYMBOL_P(name)) {
11514 return name;
11515 }
11516 name = string_for_symbol(name);
11517 return rb_str_intern(name);
11518}
11519
11520/*
11521 * call-seq:
11522 * Symbol.all_symbols => array
11523 *
11524 * Returns an array of all the symbols currently in Ruby's symbol
11525 * table.
11526 *
11527 * Symbol.all_symbols.size #=> 903
11528 * Symbol.all_symbols[1,20] #=> [:floor, :ARGV, :Binding, :symlink,
11529 * :chown, :EOFError, :$;, :String,
11530 * :LOCK_SH, :"setuid?", :$<,
11531 * :default_proc, :compact, :extend,
11532 * :Tms, :getwd, :$=, :ThreadGroup,
11533 * :wait2, :$>]
11534 */
11535
11536static VALUE
11537sym_all_symbols(VALUE _)
11538{
11539 return rb_sym_all_symbols();
11540}
11541
11542VALUE
11544{
11545 return rb_fstring(str);
11546}
11547
11548VALUE
11549rb_interned_str(const char *ptr, long len)
11550{
11551 struct RString fake_str;
11552 return register_fstring(setup_fake_str(&fake_str, ptr, len, ENCINDEX_US_ASCII), TRUE);
11553}
11554
11555VALUE
11557{
11558 return rb_interned_str(ptr, strlen(ptr));
11559}
11560
11561VALUE
11562rb_enc_interned_str(const char *ptr, long len, rb_encoding *enc)
11563{
11564 if (UNLIKELY(rb_enc_autoload_p(enc))) {
11565 rb_enc_autoload(enc);
11566 }
11567
11568 struct RString fake_str;
11569 return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), TRUE);
11570}
11571
11572VALUE
11574{
11575 return rb_enc_interned_str(ptr, strlen(ptr), enc);
11576}
11577
11578/*
11579 * A String object holds and manipulates an arbitrary sequence of
11580 * bytes, typically representing characters. String objects may be created
11581 * using String::new or as literals.
11582 *
11583 * Because of aliasing issues, users of strings should be aware of the methods
11584 * that modify the contents of a String object. Typically,
11585 * methods with names ending in ``!'' modify their receiver, while those
11586 * without a ``!'' return a new String. However, there are
11587 * exceptions, such as String#[]=.
11588 *
11589 */
11590
11591void
11593{
/*
 * Registers String and Symbol methods (plus a few related globals) with the
 * interpreter.
 *
 * NOTE(review): this copy is incomplete.  The embedded listing numbers skip in
 * several places (e.g. 11591 -> 11593, 11593 -> 11596, 11602 -> 11606,
 * 11756 -> 11761, 11765 -> 11767), so the line naming this function and a
 * number of statements are missing here.  Restore them from the upstream
 * source before relying on this listing.
 */
/* Run fstring_set_class_i over every entry of the VM fstring table, with
 * rb_cString as the extra argument. */
11596 st_foreach(rb_vm_fstring_table(), fstring_set_class_i, rb_cString);
11598 rb_define_alloc_func(rb_cString, empty_str_alloc);
11599 rb_define_singleton_method(rb_cString, "try_convert", rb_str_s_try_convert, 1);
11600 rb_define_method(rb_cString, "initialize", rb_str_init, -1);
11601 rb_define_method(rb_cString, "initialize_copy", rb_str_replace, 1);
11602 rb_define_method(rb_cString, "<=>", rb_str_cmp_m, 1);
11606 rb_define_method(rb_cString, "hash", rb_str_hash_m, 0);
11607 rb_define_method(rb_cString, "casecmp", rb_str_casecmp, 1);
11608 rb_define_method(rb_cString, "casecmp?", rb_str_casecmp_p, 1);
11611 rb_define_method(rb_cString, "%", rb_str_format_m, 1);
11612 rb_define_method(rb_cString, "[]", rb_str_aref_m, -1);
11613 rb_define_method(rb_cString, "[]=", rb_str_aset_m, -1);
11614 rb_define_method(rb_cString, "insert", rb_str_insert, 2);
11617 rb_define_method(rb_cString, "bytesize", rb_str_bytesize, 0);
11618 rb_define_method(rb_cString, "empty?", rb_str_empty, 0);
11619 rb_define_method(rb_cString, "=~", rb_str_match, 1);
11620 rb_define_method(rb_cString, "match", rb_str_match_m, -1);
11621 rb_define_method(rb_cString, "match?", rb_str_match_m_p, -1);
11623 rb_define_method(rb_cString, "succ!", rb_str_succ_bang, 0);
11625 rb_define_method(rb_cString, "next!", rb_str_succ_bang, 0);
11626 rb_define_method(rb_cString, "upto", rb_str_upto, -1);
11627 rb_define_method(rb_cString, "index", rb_str_index_m, -1);
11628 rb_define_method(rb_cString, "rindex", rb_str_rindex_m, -1);
11630 rb_define_method(rb_cString, "clear", rb_str_clear, 0);
11631 rb_define_method(rb_cString, "chr", rb_str_chr, 0);
11632 rb_define_method(rb_cString, "getbyte", rb_str_getbyte, 1);
11633 rb_define_method(rb_cString, "setbyte", rb_str_setbyte, 2);
11634 rb_define_method(rb_cString, "byteslice", rb_str_byteslice, -1);
11635 rb_define_method(rb_cString, "scrub", str_scrub, -1);
11636 rb_define_method(rb_cString, "scrub!", str_scrub_bang, -1);
11638 rb_define_method(rb_cString, "+@", str_uplus, 0);
11639 rb_define_method(rb_cString, "-@", str_uminus, 0);
11640
11641 rb_define_method(rb_cString, "to_i", rb_str_to_i, -1);
11642 rb_define_method(rb_cString, "to_f", rb_str_to_f, 0);
11643 rb_define_method(rb_cString, "to_s", rb_str_to_s, 0);
11644 rb_define_method(rb_cString, "to_str", rb_str_to_s, 0);
11647 rb_define_method(rb_cString, "undump", str_undump, 0);
11648
/* Cache the :ascii, :turkic, :lithuanian and :fold symbols in file-level
 * variables (presumably the option symbols of the case-mapping methods
 * registered next -- confirm). */
11649 sym_ascii = ID2SYM(rb_intern_const("ascii"));
11650 sym_turkic = ID2SYM(rb_intern_const("turkic"));
11651 sym_lithuanian = ID2SYM(rb_intern_const("lithuanian"));
11652 sym_fold = ID2SYM(rb_intern_const("fold"));
11653
11654 rb_define_method(rb_cString, "upcase", rb_str_upcase, -1);
11655 rb_define_method(rb_cString, "downcase", rb_str_downcase, -1);
11656 rb_define_method(rb_cString, "capitalize", rb_str_capitalize, -1);
11657 rb_define_method(rb_cString, "swapcase", rb_str_swapcase, -1);
11658
11659 rb_define_method(rb_cString, "upcase!", rb_str_upcase_bang, -1);
11660 rb_define_method(rb_cString, "downcase!", rb_str_downcase_bang, -1);
11661 rb_define_method(rb_cString, "capitalize!", rb_str_capitalize_bang, -1);
11662 rb_define_method(rb_cString, "swapcase!", rb_str_swapcase_bang, -1);
11663
11664 rb_define_method(rb_cString, "hex", rb_str_hex, 0);
11665 rb_define_method(rb_cString, "oct", rb_str_oct, 0);
11666 rb_define_method(rb_cString, "split", rb_str_split_m, -1);
11667 rb_define_method(rb_cString, "lines", rb_str_lines, -1);
11668 rb_define_method(rb_cString, "bytes", rb_str_bytes, 0);
11669 rb_define_method(rb_cString, "chars", rb_str_chars, 0);
11670 rb_define_method(rb_cString, "codepoints", rb_str_codepoints, 0);
11671 rb_define_method(rb_cString, "grapheme_clusters", rb_str_grapheme_clusters, 0);
11672 rb_define_method(rb_cString, "reverse", rb_str_reverse, 0);
11673 rb_define_method(rb_cString, "reverse!", rb_str_reverse_bang, 0);
11674 rb_define_method(rb_cString, "concat", rb_str_concat_multi, -1);
11676 rb_define_method(rb_cString, "prepend", rb_str_prepend_multi, -1);
11677 rb_define_method(rb_cString, "crypt", rb_str_crypt, 1);
11678 rb_define_method(rb_cString, "intern", rb_str_intern, 0); /* in symbol.c */
11679 rb_define_method(rb_cString, "to_sym", rb_str_intern, 0); /* in symbol.c */
11680 rb_define_method(rb_cString, "ord", rb_str_ord, 0);
11681
11682 rb_define_method(rb_cString, "include?", rb_str_include, 1);
11683 rb_define_method(rb_cString, "start_with?", rb_str_start_with, -1);
11684 rb_define_method(rb_cString, "end_with?", rb_str_end_with, -1);
11685
11686 rb_define_method(rb_cString, "scan", rb_str_scan, 1);
11687
11688 rb_define_method(rb_cString, "ljust", rb_str_ljust, -1);
11689 rb_define_method(rb_cString, "rjust", rb_str_rjust, -1);
11690 rb_define_method(rb_cString, "center", rb_str_center, -1);
11691
11692 rb_define_method(rb_cString, "sub", rb_str_sub, -1);
11693 rb_define_method(rb_cString, "gsub", rb_str_gsub, -1);
11694 rb_define_method(rb_cString, "chop", rb_str_chop, 0);
11695 rb_define_method(rb_cString, "chomp", rb_str_chomp, -1);
11696 rb_define_method(rb_cString, "strip", rb_str_strip, 0);
11697 rb_define_method(rb_cString, "lstrip", rb_str_lstrip, 0);
11698 rb_define_method(rb_cString, "rstrip", rb_str_rstrip, 0);
11699 rb_define_method(rb_cString, "delete_prefix", rb_str_delete_prefix, 1);
11700 rb_define_method(rb_cString, "delete_suffix", rb_str_delete_suffix, 1);
11701
11702 rb_define_method(rb_cString, "sub!", rb_str_sub_bang, -1);
11703 rb_define_method(rb_cString, "gsub!", rb_str_gsub_bang, -1);
11704 rb_define_method(rb_cString, "chop!", rb_str_chop_bang, 0);
11705 rb_define_method(rb_cString, "chomp!", rb_str_chomp_bang, -1);
11706 rb_define_method(rb_cString, "strip!", rb_str_strip_bang, 0);
11707 rb_define_method(rb_cString, "lstrip!", rb_str_lstrip_bang, 0);
11708 rb_define_method(rb_cString, "rstrip!", rb_str_rstrip_bang, 0);
11709 rb_define_method(rb_cString, "delete_prefix!", rb_str_delete_prefix_bang, 1);
11710 rb_define_method(rb_cString, "delete_suffix!", rb_str_delete_suffix_bang, 1);
11711
11712 rb_define_method(rb_cString, "tr", rb_str_tr, 2);
11713 rb_define_method(rb_cString, "tr_s", rb_str_tr_s, 2);
11714 rb_define_method(rb_cString, "delete", rb_str_delete, -1);
11715 rb_define_method(rb_cString, "squeeze", rb_str_squeeze, -1);
11716 rb_define_method(rb_cString, "count", rb_str_count, -1);
11717
11718 rb_define_method(rb_cString, "tr!", rb_str_tr_bang, 2);
11719 rb_define_method(rb_cString, "tr_s!", rb_str_tr_s_bang, 2);
11720 rb_define_method(rb_cString, "delete!", rb_str_delete_bang, -1);
11721 rb_define_method(rb_cString, "squeeze!", rb_str_squeeze_bang, -1);
11722
11723 rb_define_method(rb_cString, "each_line", rb_str_each_line, -1);
11724 rb_define_method(rb_cString, "each_byte", rb_str_each_byte, 0);
11725 rb_define_method(rb_cString, "each_char", rb_str_each_char, 0);
11726 rb_define_method(rb_cString, "each_codepoint", rb_str_each_codepoint, 0);
11727 rb_define_method(rb_cString, "each_grapheme_cluster", rb_str_each_grapheme_cluster, 0);
11728
11729 rb_define_method(rb_cString, "sum", rb_str_sum, -1);
11730
11731 rb_define_method(rb_cString, "slice", rb_str_aref_m, -1);
11732 rb_define_method(rb_cString, "slice!", rb_str_slice_bang, -1);
11733
11734 rb_define_method(rb_cString, "partition", rb_str_partition, 1);
11735 rb_define_method(rb_cString, "rpartition", rb_str_rpartition, 1);
11736
11737 rb_define_method(rb_cString, "encoding", rb_obj_encoding, 0); /* in encoding.c */
11738 rb_define_method(rb_cString, "force_encoding", rb_str_force_encoding, 1);
11739 rb_define_method(rb_cString, "b", rb_str_b, 0);
11740 rb_define_method(rb_cString, "valid_encoding?", rb_str_valid_encoding_p, 0);
11741 rb_define_method(rb_cString, "ascii_only?", rb_str_is_ascii_only_p, 0);
11742
11743 /* define UnicodeNormalize module here so that we don't have to look it up */
11744 mUnicodeNormalize = rb_define_module("UnicodeNormalize");
11745 id_normalize = rb_intern_const("normalize");
11746 id_normalized_p = rb_intern_const("normalized?");
11747
11748 rb_define_method(rb_cString, "unicode_normalize", rb_str_unicode_normalize, -1);
11749 rb_define_method(rb_cString, "unicode_normalize!", rb_str_unicode_normalize_bang, -1);
11750 rb_define_method(rb_cString, "unicode_normalized?", rb_str_unicode_normalized_p, -1);
11751
/* $; and $-F are both backed by rb_fs (initially nil); no getter hook is
 * supplied (0) and both share rb_fs_setter as their setter hook. */
11752 rb_fs = Qnil;
11753 rb_define_hooked_variable("$;", &rb_fs, 0, rb_fs_setter);
11754 rb_define_hooked_variable("$-F", &rb_fs, 0, rb_fs_setter);
11756
/* Symbol: singleton and instance method registrations. */
11761 rb_define_singleton_method(rb_cSymbol, "all_symbols", sym_all_symbols, 0);
11762
11765 rb_define_method(rb_cSymbol, "inspect", sym_inspect, 0);
11767 rb_define_method(rb_cSymbol, "id2name", rb_sym_to_s, 0);
11769 rb_define_method(rb_cSymbol, "intern", sym_to_sym, 0);
11770 rb_define_method(rb_cSymbol, "to_sym", sym_to_sym, 0);
11772 rb_define_method(rb_cSymbol, "succ", sym_succ, 0);
11773 rb_define_method(rb_cSymbol, "next", sym_succ, 0);
11774
11775 rb_define_method(rb_cSymbol, "<=>", sym_cmp, 1);
11776 rb_define_method(rb_cSymbol, "casecmp", sym_casecmp, 1);
11777 rb_define_method(rb_cSymbol, "casecmp?", sym_casecmp_p, 1);
11778 rb_define_method(rb_cSymbol, "=~", sym_match, 1);
11779
11780 rb_define_method(rb_cSymbol, "[]", sym_aref, -1);
11781 rb_define_method(rb_cSymbol, "slice", sym_aref, -1);
11782 rb_define_method(rb_cSymbol, "length", sym_length, 0);
11783 rb_define_method(rb_cSymbol, "size", sym_length, 0);
11784 rb_define_method(rb_cSymbol, "empty?", sym_empty, 0);
11785 rb_define_method(rb_cSymbol, "match", sym_match_m, -1);
11786 rb_define_method(rb_cSymbol, "match?", sym_match_m_p, -1);
11787
11788 rb_define_method(rb_cSymbol, "upcase", sym_upcase, -1);
11789 rb_define_method(rb_cSymbol, "downcase", sym_downcase, -1);
11790 rb_define_method(rb_cSymbol, "capitalize", sym_capitalize, -1);
11791 rb_define_method(rb_cSymbol, "swapcase", sym_swapcase, -1);
11792
11793 rb_define_method(rb_cSymbol, "start_with?", sym_start_with, -1);
11794 rb_define_method(rb_cSymbol, "end_with?", sym_end_with, -1);
11795
11796 rb_define_method(rb_cSymbol, "encoding", sym_encoding, 0);
11797}
#define offsetof(p_type, field)
Definition: addrinfo.h:186
VALUE rb_ary_push(VALUE ary, VALUE item)
Definition: array.c:1301
VALUE rb_ary_new(void)
Definition: array.c:749
VALUE rb_check_array_type(VALUE ary)
Definition: array.c:988
#define L(x)
Definition: asm.h:125
#define RUBY_ASSERT(expr)
Asserts that the given expression is truthy iff RUBY_DEBUG is truthy.
Definition: assert.h:177
#define RUBY_ASSERT_ALWAYS(expr)
A variant of RUBY_ASSERT that does not interface with RUBY_DEBUG.
Definition: assert.h:167
#define RUBY_ALIAS_FUNCTION(prot, name, args)
Definition: attributes.h:144
#define UNREACHABLE
Definition: assume.h:30
#define UNREACHABLE_RETURN
Definition: assume.h:31
#define rb_category_warn(category,...)
Definition: bigdecimal.h:163
VALUE rb_str_to_inum(VALUE str, int base, int badcheck)
Definition: bignum.c:4274
int bits(struct state *s, int need)
Definition: blast.c:72
VALUE rb_invcmp(VALUE x, VALUE y)
Definition: compar.c:50
VALUE rb_mComparable
Definition: compar.c:19
Internal header absorbing C compiler differences.
#define OBJ_BUILTIN_TYPE(obj)
Definition: compilers.h:68
#define FLEX_ARY_LEN
Definition: compilers.h:88
char * crypt_r(const char *key, const char *setting, struct crypt_data *data)
Definition: crypt.c:396
Our own, locale independent, character handling routines.
#define ISSPACE
Definition: ctype.h:38
#define ISDIGIT
Definition: ctype.h:43
#define ISALPHA
Definition: ctype.h:42
#define ISASCII
Definition: ctype.h:35
#define TOLOWER
Definition: ctype.h:51
#define ISPRINT
Definition: ctype.h:36
#define rb_define_method(klass, mid, func, arity)
Defines klass#mid.
Definition: cxxanyargs.hpp:653
#define rb_define_singleton_method(klass, mid, func, arity)
Defines klass.mid.
Definition: cxxanyargs.hpp:668
#define sub(x, y)
Definition: date_strftime.c:24
#define mod(x, y)
Definition: date_strftime.c:28
#define range(low, item, hi)
Definition: date_strftime.c:21
struct RIMemo * ptr
Definition: debug.c:88
#define RB_DEBUG_COUNTER_INC_IF(type, cond)
#define RB_DEBUG_COUNTER_INC(type)
#define MJIT_FUNC_EXPORTED
Definition: dllexport.h:55
#define assert(x)
Definition: dlmalloc.c:1176
#define DBL2NUM
Definition: double.h:29
#define ENCINDEX_UTF_32BE
Definition: encindex.h:48
#define ENCINDEX_UTF_32LE
Definition: encindex.h:49
#define ENCINDEX_UTF_16BE
Definition: encindex.h:46
#define ENCINDEX_UTF_16
Definition: encindex.h:50
#define ENCINDEX_UTF_8
Definition: encindex.h:44
#define ENCINDEX_UTF_16LE
Definition: encindex.h:47
#define rb_ascii8bit_encindex()
Definition: encindex.h:57
#define rb_usascii_encindex()
Definition: encindex.h:59
int rb_enc_find_index2(const char *name, long len)
Definition: encoding.c:905
#define ENCINDEX_UTF_32
Definition: encindex.h:51
#define rb_utf8_encindex()
Definition: encindex.h:58
#define ENCINDEX_US_ASCII
Definition: encindex.h:45
#define ENCINDEX_ASCII
Definition: encindex.h:43
int rb_enc_dummy_p(rb_encoding *enc)
Definition: encoding.c:203
int rb_enc_precise_mbclen(const char *p, const char *e, rb_encoding *enc)
Definition: encoding.c:1230
int rb_enc_get_index(VALUE obj)
Definition: encoding.c:977
VALUE rb_enc_associate(VALUE obj, rb_encoding *enc)
Definition: encoding.c:1064
rb_encoding * rb_utf8_encoding(void)
Definition: encoding.c:1537
rb_encoding * rb_enc_check_str(VALUE str1, VALUE str2)
Definition: encoding.c:1078
rb_encoding * rb_ascii8bit_encoding(void)
Definition: encoding.c:1525
unsigned int rb_enc_codepoint_len(const char *p, const char *e, int *len_p, rb_encoding *enc)
Definition: encoding.c:1266
rb_encoding * rb_enc_from_index(int index)
Definition: encoding.c:414
rb_encoding * rb_filesystem_encoding(void)
Definition: encoding.c:1602
int rb_enc_autoload(rb_encoding *enc)
Definition: encoding.c:867
rb_encoding * rb_default_internal_encoding(void)
Definition: encoding.c:1734
int rb_enc_fast_mbclen(const char *p, const char *e, rb_encoding *enc)
Definition: encoding.c:1212
rb_encoding * rb_enc_get(VALUE obj)
Definition: encoding.c:1070
rb_encoding * rb_enc_get_from_index(int index)
Definition: encoding.c:795
int rb_enc_unicode_p(rb_encoding *enc)
Definition: encoding.c:688
void rb_enc_copy(VALUE obj1, VALUE obj2)
Definition: encoding.c:1188
int rb_enc_to_index(rb_encoding *enc)
Definition: encoding.c:197
void rb_enc_set_index(VALUE obj, int idx)
Definition: encoding.c:1028
rb_encoding * rb_default_external_encoding(void)
Definition: encoding.c:1647
ID rb_id_encoding(void)
Definition: encoding.c:947
rb_encoding * rb_enc_check(VALUE str1, VALUE str2)
Definition: encoding.c:1089
int rb_enc_mbclen(const char *p, const char *e, rb_encoding *enc)
Definition: encoding.c:1218
rb_encoding * rb_enc_compatible(VALUE str1, VALUE str2)
Definition: encoding.c:1172
rb_encoding * rb_locale_encoding(void)
Definition: encoding.c:1583
VALUE rb_obj_encoding(VALUE obj)
Definition: encoding.c:1202
rb_encoding * rb_to_encoding(VALUE enc)
Definition: encoding.c:329
rb_encoding * rb_usascii_encoding(void)
Definition: encoding.c:1549
VALUE rb_enc_from_encoding(rb_encoding *encoding)
Definition: encoding.c:188
VALUE rb_enc_associate_index(VALUE obj, int idx)
Definition: encoding.c:1036
int rb_enc_codelen(int c, rb_encoding *enc)
Definition: encoding.c:1287
int rb_enc_ascget(const char *p, const char *e, int *len, rb_encoding *enc)
Definition: encoding.c:1242
int root
Definition: enough.c:226
big_t * num
Definition: enough.c:232
int max
Definition: enough.c:225
#define sym(name)
Definition: enumerator.c:4007
uint8_t len
Definition: escape.c:17
char str[HTML_ESCAPE_MAX_LEN+1]
Definition: escape.c:18
#define numberof(array)
Definition: etc.c:649
#define RSTRING_LEN(string)
Definition: fbuffer.h:22
#define RSTRING_PTR(string)
Definition: fbuffer.h:19
#define MAYBE_UNUSED
Definition: ffi_common.h:30
#define UNLIKELY(x)
Definition: ffi_common.h:126
#define memcpy(d, s, n)
Definition: ffi_common.h:55
#define LIKELY(x)
Definition: ffi_common.h:125
#define FL_EXIVAR
Definition: fl_type.h:58
#define FL_WB_PROTECTED
Definition: fl_type.h:50
#define FL_FREEZE
Definition: fl_type.h:59
@ RUBY_FL_FREEZE
Definition: fl_type.h:173
#define PRIsVALUE
Definition: function.c:10
void ruby_xfree(void *x)
Deallocates a storage instance.
Definition: gc.c:10914
int rb_objspace_garbage_object_p(VALUE obj)
Definition: gc.c:3933
void rb_gc_force_recycle(VALUE obj)
Definition: gc.c:7968
void rb_gc_register_address(VALUE *addr)
Inform the garbage collector that valptr points to a live Ruby object that should not be moved.
Definition: gc.c:8040
#define rb_intern_str(string)
Definition: generator.h:16
#define CLASS_OF
Definition: globals.h:153
void rb_include_module(VALUE klass, VALUE module)
Definition: class.c:962
VALUE rb_define_class(const char *name, VALUE super)
Defines a top-level class.
Definition: class.c:748
VALUE rb_define_module(const char *name)
Definition: class.c:871
void rb_undef_method(VALUE klass, const char *name)
Definition: class.c:1777
int rb_scan_args(int argc, const VALUE *argv, const char *fmt,...)
Definition: class.c:2296
int rb_block_given_p(void)
Determines if the current method is given a block.
Definition: eval.c:935
int rb_get_kwargs(VALUE keyword_hash, const ID *table, int required, int optional, VALUE *values)
Definition: class.c:2085
#define FL_UNSET_RAW
Definition: fl_type.h:133
#define OBJ_FROZEN
Definition: fl_type.h:136
#define OBJ_FREEZE_RAW
Definition: fl_type.h:135
#define OBJ_FREEZE
Definition: fl_type.h:134
#define FL_TEST_RAW
Definition: fl_type.h:131
#define FL_SET
Definition: fl_type.h:128
#define FL_TEST
Definition: fl_type.h:130
#define FL_UNSET
Definition: fl_type.h:132
#define FL_SET_RAW
Definition: fl_type.h:129
#define OBJ_FROZEN_RAW
Definition: fl_type.h:137
void rb_syserr_fail(int e, const char *mesg)
Definition: error.c:3029
void rb_raise(VALUE exc, const char *fmt,...)
Definition: error.c:2917
void rb_warn_deprecated_to_remove(const char *fmt, const char *removal,...)
Definition: error.c:496
void rb_exc_raise(VALUE mesg)
Raises an exception in the current thread.
Definition: eval.c:712
void rb_bug(const char *fmt,...)
Definition: error.c:768
VALUE rb_eRangeError
Definition: error.c:1061
VALUE rb_eTypeError
Definition: error.c:1057
void rb_fatal(const char *fmt,...)
Definition: error.c:2968
VALUE rb_eEncCompatError
Definition: error.c:1064
void rb_warn_deprecated(const char *fmt, const char *suggest,...)
Definition: error.c:480
VALUE rb_eRuntimeError
Definition: error.c:1055
VALUE rb_eArgError
Definition: error.c:1058
VALUE rb_eIndexError
Definition: error.c:1059
VALUE rb_ensure(VALUE(*b_proc)(VALUE), VALUE data1, VALUE(*e_proc)(VALUE), VALUE data2)
An equivalent to ensure clause.
Definition: eval.c:1148
VALUE rb_cObject
Object class.
Definition: object.c:49
VALUE rb_any_to_s(VALUE)
Default implementation of #to_s.
Definition: object.c:561
VALUE rb_obj_alloc(VALUE)
Allocates an instance of klass.
Definition: object.c:1900
VALUE rb_obj_frozen_p(VALUE)
Definition: object.c:1113
double rb_str_to_dbl(VALUE, int)
Parses a string representation of a floating point number.
Definition: object.c:3409
VALUE rb_obj_class(VALUE)
Definition: object.c:245
VALUE rb_check_convert_type_with_id(VALUE, int, const char *, ID)
Definition: object.c:2987
VALUE rb_convert_type_with_id(VALUE v, int t, const char *nam, ID mid)
Definition: object.c:2944
VALUE rb_equal(VALUE, VALUE)
This function is an optimized version of calling #==.
Definition: object.c:157
VALUE rb_obj_freeze(VALUE)
Make the object unmodifiable.
Definition: object.c:1101
VALUE rb_str_escape(VALUE str)
Definition: string.c:6135
VALUE rb_to_int(VALUE)
Converts val into Integer.
Definition: object.c:3051
unsigned char suffix[65536]
Definition: gun.c:164
unsigned char match[65280+2]
Definition: gun.c:165
unsigned short prefix[65536]
Definition: gun.c:163
void skip(file *in, unsigned n)
Definition: gzappend.c:202
VALUE rb_check_hash_type(VALUE hash)
Definition: hash.c:1860
VALUE rb_hash_aref(VALUE hash, VALUE key)
Definition: hash.c:2046
VALUE rb_hash_aset(VALUE hash, VALUE key, VALUE val)
Definition: hash.c:2901
VALUE rb_hash_lookup(VALUE hash, VALUE key)
Definition: hash.c:2072
VALUE rb_hash_new(void)
Definition: hash.c:1538
@ idLTLT
Definition: id.h:90
@ idLE
Definition: id.h:93
@ idEqTilde
Definition: id.h:103
#define ENCODING_SET_INLINED(obj, i)
Definition: encoding.h:48
#define ENC_CODERANGE_7BIT
Definition: encoding.h:93
#define ENC_CODERANGE_VALID
Definition: encoding.h:94
#define rb_enc_left_char_head(s, p, e, enc)
Definition: encoding.h:213
#define rb_enc_mbcput(c, buf, enc)
Definition: encoding.h:208
#define ENC_CODERANGE_CLEAN_P(cr)
Definition: encoding.h:96
#define rb_enc_isctype(c, t, enc)
Definition: encoding.h:220
#define ENC_CODERANGE_AND(a, b)
Definition: encoding.h:101
rb_econv_result_t rb_econv_convert(rb_econv_t *ec, const unsigned char **source_buffer_ptr, const unsigned char *source_buffer_end, unsigned char **destination_buffer_ptr, unsigned char *destination_buffer_end, int flags)
Definition: transcode.c:1438
rb_econv_result_t
Definition: encoding.h:288
@ econv_finished
Definition: encoding.h:293
@ econv_destination_buffer_full
Definition: encoding.h:291
#define rb_enc_step_back(s, p, e, n, enc)
Definition: encoding.h:215
long rb_memsearch(const void *, long, const void *, long, rb_encoding *)
Definition: re.c:247
#define rb_enc_prev_char(s, p, e, enc)
Definition: encoding.h:211
int rb_enc_symname2_p(const char *, long, rb_encoding *)
Definition: symbol.c:407
#define ENC_CODERANGE(obj)
Definition: encoding.h:97
#define ENC_CODERANGE_UNKNOWN
Definition: encoding.h:92
#define rb_enc_name(enc)
Definition: encoding.h:168
#define rb_enc_isascii(c, enc)
Definition: encoding.h:221
#define rb_enc_mbmaxlen(enc)
Definition: encoding.h:172
#define ENCODING_GET(obj)
Definition: encoding.h:51
#define ENC_CODERANGE_MASK
Definition: encoding.h:91
#define rb_enc_mbc_to_codepoint(p, e, enc)
Definition: encoding.h:199
rb_econv_t * rb_econv_open_opts(const char *source_encoding, const char *destination_encoding, int ecflags, VALUE ecopts)
Definition: transcode.c:2577
#define MBCLEN_CHARFOUND_LEN(ret)
Definition: encoding.h:183
#define rb_enc_asciicompat(enc)
Definition: encoding.h:236
#define rb_enc_codepoint(p, e, enc)
Definition: encoding.h:198
VALUE rb_enc_sprintf(rb_encoding *, const char *,...)
Definition: sprintf.c:1184
VALUE rb_str_encode(VALUE str, VALUE to, int ecflags, VALUE ecopts)
Definition: transcode.c:2892
#define ENCODING_INLINE_MAX
Definition: encoding.h:29
#define MBCLEN_INVALID_P(ret)
Definition: encoding.h:184
#define rb_enc_code_to_mbclen(c, enc)
Definition: encoding.h:205
#define rb_enc_isprint(c, enc)
Definition: encoding.h:227
#define MBCLEN_NEEDMORE_P(ret)
Definition: encoding.h:185
#define rb_enc_mbminlen(enc)
Definition: encoding.h:171
#define ENC_CODERANGE_BROKEN
Definition: encoding.h:95
#define MBCLEN_CHARFOUND_P(ret)
Definition: encoding.h:182
#define ENCODING_SHIFT
Definition: encoding.h:30
void rb_econv_close(rb_econv_t *ec)
Definition: transcode.c:1694
#define rb_enc_right_char_head(s, p, e, enc)
Definition: encoding.h:214
#define ENCODING_GET_INLINED(obj)
Definition: encoding.h:50
#define ENC_CODERANGE_CLEAR(obj)
Definition: encoding.h:100
#define ENCODING_IS_ASCII8BIT(obj)
Definition: encoding.h:52
#define ENC_CODERANGE_SET(obj, cr)
Definition: encoding.h:99
#define ENCODING_CODERANGE_SET(obj, encindex, cr)
Definition: encoding.h:102
#define rb_enc_is_newline(p, end, enc)
Definition: encoding.h:218
#define ENCODING_MASK
Definition: encoding.h:31
#define FIXABLE
Definition: fixnum.h:25
#define FIXNUM_MAX
Definition: fixnum.h:26
Thin wrapper to ruby/config.h.
#define UNALIGNED_WORD_ACCESS
Definition: config.h:122
@ RB_WARN_CATEGORY_DEPRECATED
Definition: error.h:34
VALUE rb_funcall(VALUE, ID, int,...)
Calls a method.
Definition: vm_eval.c:1077
VALUE rb_funcall_with_block_kw(VALUE, ID, int, const VALUE *, VALUE, int)
Definition: vm_eval.c:1173
#define rb_ary_new3
Definition: array.h:73
#define rb_ary_new2
Definition: array.h:72
#define RETURN_SIZED_ENUMERATOR(obj, argc, argv, size_fn)
Definition: enumerator.h:64
#define RETURN_ENUMERATOR(obj, argc, argv)
Definition: enumerator.h:74
#define UNLIMITED_ARGUMENTS
Definition: error.h:29
#define rb_check_frozen
Definition: error.h:72
void rb_error_arity(int, int, int)
#define rb_check_arity
Definition: error.h:34
VALUE rb_default_rs
Definition: io.c:202
VALUE rb_backref_get(void)
Definition: vm.c:1544
VALUE rb_sym_all_symbols(void)
Definition: symbol.c:983
void rb_backref_set(VALUE)
Definition: vm.c:1550
VALUE rb_range_beg_len(VALUE, long *, long *, long, int)
Definition: range.c:1398
int rb_reg_backref_number(VALUE match, VALUE backref)
Definition: re.c:1191
void rb_match_busy(VALUE)
Definition: re.c:1305
int rb_reg_options(VALUE)
Definition: re.c:3593
VALUE rb_reg_match(VALUE, VALUE)
Definition: re.c:3194
VALUE rb_reg_nth_match(int, VALUE)
Definition: re.c:1725
#define rb_str_buf_cat2
Definition: string.h:284
#define rb_utf8_str_new_cstr(str)
Definition: string.h:246
#define rb_str_cat2
Definition: string.h:285
st_index_t rb_memhash(const void *ptr, long len)
Definition: random.c:1629
#define rb_str_new(str, len)
Definition: string.h:213
#define rb_str_buf_cat
Definition: string.h:283
#define rb_usascii_str_new(str, len)
Definition: string.h:224
#define rb_str_buf_new_cstr(str)
Definition: string.h:261
#define rb_usascii_str_new_cstr(str)
Definition: string.h:241
rb_gvar_setter_t rb_str_setter
Definition: string.h:130
#define rb_external_str_new_cstr(str)
Definition: string.h:251
#define rb_strlen_lit(str)
Definition: string.h:286
VALUE rb_str_intern(VALUE)
Definition: symbol.c:840
#define rb_locale_str_new_cstr(str)
Definition: string.h:256
VALUE rb_str_locktmp(VALUE)
#define rb_str_cat_cstr(buf, str)
Definition: string.h:266
#define rb_utf8_str_new(str, len)
Definition: string.h:230
#define rb_tainted_str_new_cstr(str)
Definition: string.h:236
#define rb_str_new_cstr(str)
Definition: string.h:219
int rb_respond_to(VALUE, ID)
Definition: vm_method.c:2561
void rb_define_alloc_func(VALUE, rb_alloc_func_t)
void rb_undef_alloc_func(VALUE)
Definition: vm_method.c:954
#define ID2SYM
Definition: symbol.h:44
#define SYM2ID
Definition: symbol.h:45
VALUE rb_sym2str(VALUE)
Definition: symbol.c:927
ID rb_intern(const char *)
Definition: symbol.c:785
#define CONST_ID
Definition: symbol.h:47
void * memmove(void *, const void *, size_t)
Definition: memmove.c:7
char * crypt(const char *, const char *)
VALUE rb_reg_regcomp(VALUE)
Definition: re.c:2984
VALUE rb_reg_regsub(VALUE, VALUE, struct re_registers *, VALUE)
Definition: re.c:3790
long rb_reg_search(VALUE, VALUE, long, int)
Definition: re.c:1628
#define scan_hex(s, l, e)
Definition: util.h:24
#define NUM2INT
Definition: int.h:44
#define INT2NUM
Definition: int.h:43
#define UINT2NUM
Definition: int.h:46
Internal header for Array.
Internal header for Comparable.
Internal header for Encoding.
#define rb_enc_autoload_p(enc)
Definition: encoding.h:15
Internal header for GC.
#define SIZED_REALLOC_N(v, T, m, n)
Definition: gc.h:159
#define ruby_sized_xfree
Definition: gc.h:166
#define RB_EC_NEWOBJ_OF(ec, var, T, c, f)
Definition: gc.h:34
Internal header for Numeric.
VALUE rb_int_and(VALUE x, VALUE y)
Definition: numeric.c:4436
int rb_num_to_uint(VALUE val, unsigned int *ret)
Definition: numeric.c:250
Internal header for Object.
Internal header for Proc.
VALUE rb_sym_to_proc(VALUE sym)
Definition: proc.c:1443
Internal header for Regexp.
void rb_backref_set_string(VALUE string, long pos, long len)
Definition: re.c:1358
VALUE rb_reg_check_preprocess(VALUE)
Definition: re.c:2718
long rb_reg_search0(VALUE, VALUE, long, int, int)
Definition: re.c:1622
VALUE rb_reg_match_p(VALUE re, VALUE str, long pos)
Definition: re.c:3353
bool rb_reg_start_with_p(VALUE re, VALUE str)
Definition: re.c:1634
void rb_match_unbusy(VALUE)
Definition: re.c:1311
#define STATIC_ASSERT
Definition: static_assert.h:14
Internal header for String.
#define rb_fstring_lit(str)
Definition: string.h:78
#define STR_SHARED
Definition: string.h:20
#define STR_NOEMBED
Definition: string.h:19
st_table * rb_vm_fstring_table(void)
Definition: vm.c:3787
#define RUBY_DTRACE_CREATE_HOOK(name, arg)
Definition: vm.h:120
#define bp()
Definition: internal.h:105
#define rb_fstring_cstr(...)
Definition: internal.h:71
#define rp(obj)
Definition: internal.h:95
#define rb_funcallv(...)
Definition: internal.h:77
#define PRIuSIZE
Definition: inttypes.h:127
voidpf void uLong size
Definition: ioapi.h:138
typedef long(ZCALLBACK *tell_file_func) OF((voidpf opaque
voidpf uLong offset
Definition: ioapi.h:144
typedef int(ZCALLBACK *close_file_func) OF((voidpf opaque
const char int mode
Definition: ioapi.h:137
voidpf void * buf
Definition: ioapi.h:138
VALUE rb_yield(VALUE)
Definition: vm_eval.c:1341
#define CHAR_BIT
Definition: limits.h:44
#define LONG_MAX
Definition: limits.h:36
#define INT2FIX
Definition: long.h:48
#define LONG2FIX
Definition: long.h:49
#define LONG2NUM
Definition: long.h:50
#define FIX2LONG
Definition: long.h:46
#define NUM2LONG
Definition: long.h:51
unsigned char u8
Definition: many2.c:13
Internal header for Math.
int memcmp(const void *s1, const void *s2, size_t len)
Definition: memcmp.c:7
#define MEMCPY(p1, p2, type, n)
Definition: memory.h:129
#define ALLOCA_N(type, n)
Definition: memory.h:112
#define ALLOCV
Definition: memory.h:138
#define MEMZERO(p, type, n)
Definition: memory.h:128
#define ZALLOC_N
Definition: memory.h:135
#define ALLOC_N
Definition: memory.h:133
#define RB_GC_GUARD(v)
Definition: memory.h:91
void rb_define_hooked_variable(const char *q, VALUE *w, type *e, void_type *r)
Define a function-backended global variable.
Definition: cxxanyargs.hpp:136
#define NEWOBJ_OF
Definition: newobj.h:35
const char * name
Definition: nkf.c:208
unsigned int last
Definition: nkf.c:4324
#define TRUE
Definition: nkf.h:175
#define FALSE
Definition: nkf.h:174
#define ONIGENC_CTYPE_DIGIT
Definition: onigmo.h:298
ONIG_EXTERN int onig_error_code_to_str(OnigUChar *s, OnigPosition err_code,...)
#define ONIGENC_CASE_ASCII_ONLY
Definition: onigmo.h:125
unsigned char OnigUChar
Definition: onigmo.h:79
unsigned int OnigCaseFoldType
Definition: onigmo.h:95
#define ONIG_MAX_ERROR_MESSAGE_LEN
Definition: onigmo.h:443
ONIG_EXTERN int onig_new(OnigRegex *, const OnigUChar *pattern, const OnigUChar *pattern_end, OnigOptionType option, OnigEncoding enc, const OnigSyntaxType *syntax, OnigErrorInfo *einfo)
#define ONIGENC_CASE_MODIFIED
Definition: onigmo.h:119
#define ONIGENC_MBCLEN_CHARFOUND_LEN(r)
Definition: onigmo.h:347
#define ONIGENC_CTYPE_ALPHA
Definition: onigmo.h:295
#define UChar
Definition: onigmo.h:76
#define ONIGENC_CODE_TO_MBC_MAXLEN
Definition: onigmo.h:289
ptrdiff_t OnigPosition
Definition: onigmo.h:83
#define ONIGENC_MBCLEN_CHARFOUND_P(r)
Definition: onigmo.h:346
#define ONIGENC_CASE_UPCASE
Definition: onigmo.h:113
#define ONIGENC_CASE_FOLD
Definition: onigmo.h:120
#define ONIGERR_TOO_BIG_WIDE_CHAR_VALUE
Definition: onigmo.h:691
ONIG_EXTERN const OnigSyntaxType * OnigDefaultSyntax
Definition: onigmo.h:515
#define ONIGENC_CASE_DOWNCASE
Definition: onigmo.h:114
#define ONIGENC_IS_ALLOWED_REVERSE_MATCH(enc, s, end)
Definition: onigmo.h:334
ONIG_EXTERN int onigenc_ascii_only_case_map(OnigCaseFoldType *flagP, const OnigUChar **pp, const OnigUChar *end, OnigUChar *to, OnigUChar *to_end, const struct OnigEncodingTypeST *enc)
Definition: regenc.c:955
ONIG_EXTERN OnigPosition onig_match(OnigRegex, const OnigUChar *str, const OnigUChar *end, const OnigUChar *at, OnigRegion *region, OnigOptionType option)
#define ONIGENC_CASE_FOLD_TURKISH_AZERI
Definition: onigmo.h:122
#define ONIGENC_CASE_TITLECASE
Definition: onigmo.h:115
#define ONIGENC_CASE_FOLD_LITHUANIAN
Definition: onigmo.h:124
#define ONIGENC_CODE_TO_MBCLEN(enc, code)
Definition: onigmo.h:367
#define ONIGERR_INVALID_CODE_POINT_VALUE
Definition: onigmo.h:689
#define ONIG_OPTION_DEFAULT
Definition: onigmo.h:447
#define RARRAY_CONST_PTR(s)
Definition: psych_emitter.c:4
#define RBASIC(obj)
Definition: rbasic.h:34
#define RBASIC_CLASS
Definition: rbasic.h:35
#define DATA_PTR(obj)
Definition: rdata.h:56
#define NULL
Definition: regenc.h:69
#define RGENGC_WB_PROTECTED_STRING
Definition: rgengc.h:60
#define RB_OBJ_WRITE(a, slot, b)
WB for new reference from ‘a’ to ‘b’.
Definition: rgengc.h:107
#define StringValue(v)
Definition: rstring.h:50
#define RSTRING_NOEMBED
Definition: rstring.h:36
#define RSTRING_GETMEM(str, ptrvar, lenvar)
Definition: rstring.h:211
#define RSTRING_EMBED_LEN_MAX
Definition: rstring.h:39
#define RSTRING_FSTR
Definition: rstring.h:40
#define RSTRING(obj)
Definition: rstring.h:35
#define StringValueCStr(v)
Definition: rstring.h:52
#define RSTRING_EMBED_LEN_MASK
Definition: rstring.h:37
#define TypedData_Wrap_Struct(klass, data_type, sval)
Definition: rtypeddata.h:101
VALUE rb_require(const char *)
Definition: load.c:1199
int argc
Definition: ruby.c:240
char ** argv
Definition: ruby.c:241
#define RB_INTEGER_TYPE_P(obj)
Definition: ruby_missing.h:15
#define ST2FIX(h)
Definition: ruby_missing.h:21
Internal header for ASAN / MSAN / etc.
#define SIZET2NUM
Definition: size_t.h:52
#define Qundef
#define Qtrue
#define RTEST
#define Qnil
#define Qfalse
#define NIL_P
#define FIXNUM_P
#define f
VALUE rb_str_format(int, const VALUE *, VALUE)
Definition: sprintf.c:214
@ ST_STOP
Definition: st.h:99
@ ST_DELETE
Definition: st.h:99
@ ST_CONTINUE
Definition: st.h:99
unsigned long st_data_t
Definition: st.h:22
#define st_foreach
Definition: st.h:142
#define st_delete
Definition: st.h:118
st_data_t st_index_t
Definition: st.h:50
#define st_update
Definition: st.h:136
#define _(args)
Definition: stdarg.h:31
size_t strlen(const char *)
VALUE rb_str_to_interned_str(VALUE str)
Definition: string.c:11543
#define sym_equal
Definition: string.c:11008
VALUE rb_setup_fake_str(struct RString *fake_str, const char *name, long len, rb_encoding *enc)
Definition: string.c:435
VALUE rb_str_initialize(VALUE str, const char *ptr, long len, rb_encoding *enc)
Definition: string.c:1034
#define STR_SET_LEN(str, n)
Definition: string.c:118
#define STR_EMBEDDABLE_P(len, termlen)
Definition: string.c:196
int rb_str_symname_p(VALUE sym)
Definition: string.c:11027
VALUE rb_str_locktmp_ensure(VALUE str, VALUE(*func)(VALUE), VALUE arg)
Definition: string.c:2835
VALUE rb_str_include_range_p(VALUE beg, VALUE end, VALUE val, VALUE exclusive)
Definition: string.c:4618
void rb_str_free(VALUE str)
Definition: string.c:1433
VALUE rb_str_times(VALUE str, VALUE times)
Definition: string.c:2114
#define SHARABLE_SUBSTRING_P(beg, len, end)
Definition: string.c:191
#define STR_ENC_GET(str)
Definition: string.c:185
long rb_str_coderange_scan_restartable(const char *s, const char *e, rb_encoding *enc, int *cr)
Definition: string.c:617
#define STR_HEAP_PTR(str)
Definition: string.c:181
VALUE rb_str_new_shared(VALUE str)
Definition: string.c:1267
#define STR_NOFREE
Definition: string.c:104
const char * ruby_escaped_char(int c)
Definition: string.c:6117
#define TR_TABLE_SIZE
Definition: string.c:7626
void rb_str_change_terminator_length(VALUE str, const int oldtermlen, const int termlen)
Definition: string.c:2378
VALUE rb_str_new_frozen(VALUE orig)
Definition: string.c:1273
VALUE rb_str_eql(VALUE str1, VALUE str2)
Definition: string.c:3451
#define aligned_ptr(value)
VALUE rb_str_buf_cat_ascii(VALUE str, const char *ptr)
Definition: string.c:3079
VALUE rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to)
Definition: string.c:1100
#define STR_SET_NOEMBED(str)
Definition: string.c:107
#define STR_SET_SHARED(str, shared_str)
Definition: string.c:171
VALUE rb_str_buf_append(VALUE str, VALUE str2)
Definition: string.c:3103
VALUE rb_str_cat(VALUE str, const char *ptr, long len)
Definition: string.c:2962
VALUE rb_filesystem_str_new(const char *ptr, long len)
Definition: string.c:1181
long rb_enc_strlen(const char *p, const char *e, rb_encoding *enc)
Definition: string.c:1887
#define rb_str_splice(str, beg, len, val)
Definition: string.c:4905
VALUE rb_str_export(VALUE str)
Definition: string.c:1193
#define DEFAULT_REPLACE_CHAR(str)
char * rb_string_value_cstr(volatile VALUE *ptr)
Definition: string.c:2439
#define lesser(a, b)
Definition: string.c:3350
VALUE rb_sym_to_s(VALUE sym)
Definition: string.c:11148
VALUE rb_sym_proc_call(ID mid, int argc, const VALUE *argv, int kw_splat, VALUE passed_proc)
Definition: string.c:11171
void rb_str_shared_replace(VALUE str, VALUE str2)
Definition: string.c:1480
VALUE rb_external_str_new(const char *ptr, long len)
Definition: string.c:1157
VALUE rb_str_tmp_new(long len)
Definition: string.c:1427
char * rb_str_fill_terminator(VALUE str, const int newminlen)
Definition: string.c:2454
long rb_str_offset(VALUE str, long pos)
Definition: string.c:2566
char * rb_str_subpos(VALUE str, long beg, long *lenp)
Definition: string.c:2647
VALUE rb_str_succ(VALUE orig)
Definition: string.c:4315
#define CASE_MAPPING_ADDITIONAL_LENGTH
Definition: string.c:6755
int rb_str_hash_cmp(VALUE str1, VALUE str2)
Definition: string.c:3324
VALUE rb_str_subseq(VALUE str, long beg, long len)
Definition: string.c:2624
VALUE rb_str_ellipsize(VALUE str, long len)
Shortens str and adds three dots, an ellipsis, if it is longer than len characters.
Definition: string.c:10517
#define RUBY_MAX_CHAR_LEN
Definition: string.c:100
VALUE rb_str_new_static(const char *ptr, long len)
*_str_new_static functions are intended for C string literals.
Definition: string.c:933
int rb_enc_str_coderange(VALUE str)
Definition: string.c:725
VALUE rb_str_chomp_string(VALUE str, VALUE rs)
Definition: string.c:9236
#define STR_SHARED_ROOT
Definition: string.c:101
#define ENUM_ELEM(ary, e)
Definition: string.c:8436
#define CRYPT_END()
VALUE rb_str_upto_each(VALUE beg, VALUE end, int excl, int(*each)(VALUE, VALUE), VALUE arg)
Definition: string.c:4483
size_t rb_str_capacity(VALUE str)
Definition: string.c:773
split_type_t
Definition: string.c:8108
@ SPLIT_TYPE_AWK
Definition: string.c:8109
@ SPLIT_TYPE_CHARS
Definition: string.c:8109
@ SPLIT_TYPE_REGEXP
Definition: string.c:8109
@ SPLIT_TYPE_STRING
Definition: string.c:8109
VALUE rb_str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len, rb_encoding *from, int ecflags, VALUE ecopts)
Definition: string.c:1013
#define STR_SET_EMBED(str)
Definition: string.c:111
const struct st_hash_type rb_fstring_hash_type
Definition: string.c:287
#define BARE_STRING_P(str)
Definition: string.c:292
VALUE rb_str_dup(VALUE str)
Definition: string.c:1631
void Init_String(void)
Definition: string.c:11592
void rb_str_modify(VALUE str)
Definition: string.c:2262
VALUE rb_enc_str_new_static(const char *ptr, long len, rb_encoding *enc)
Definition: string.c:951
VALUE rb_str_to_str(VALUE str)
Definition: string.c:1471
st_index_t rb_str_hash(VALUE str)
Definition: string.c:3314
VALUE rb_fstring_enc_new(const char *ptr, long len, rb_encoding *enc)
Definition: string.c:453
long rb_str_strlen(VALUE str)
Definition: string.c:1976
VALUE rb_str_resurrect(VALUE str)
Definition: string.c:1637
VALUE rb_str_quote_unprintable(VALUE str)
Definition: string.c:11046
VALUE rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
Definition: string.c:984
VALUE rb_external_str_with_enc(VALUE str, rb_encoding *eenc)
Definition: string.c:1144
VALUE rb_enc_interned_str(const char *ptr, long len, rb_encoding *enc)
Definition: string.c:11562
VALUE rb_str_opt_plus(VALUE str1, VALUE str2)
Definition: string.c:2075
#define IS_EVSTR(p, e)
Definition: string.c:6292
#define CASEMAP_DEBUG
Definition: string.c:6757
#define ascii_isspace(c)
Definition: string.c:8077
VALUE rb_fs
Definition: string.c:502
#define WANTARRAY(m, size)
Definition: string.c:8421
VALUE rb_interned_str_cstr(const char *ptr)
Definition: string.c:11556
VALUE rb_filesystem_str_new_cstr(const char *ptr)
Definition: string.c:1187
#define rb_str_index(str, sub, offset)
Definition: string.c:3643
size_t rb_str_memsize(VALUE str)
Definition: string.c:1460
#define BEG(no)
Definition: string.c:58
int rb_str_buf_cat_escaped_char(VALUE result, unsigned int c, int unicode_p)
Definition: string.c:6084
VALUE rb_str_plus(VALUE str1, VALUE str2)
Definition: string.c:2044
long rb_str_sublen(VALUE str, long pos)
Definition: string.c:2613
VALUE rb_str_equal(VALUE str1, VALUE str2)
Definition: string.c:3423
VALUE rb_str_tmp_frozen_acquire(VALUE orig)
Definition: string.c:1287
VALUE rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *eenc)
Definition: string.c:1106
VALUE rb_enc_str_new_cstr(const char *ptr, rb_encoding *enc)
Definition: string.c:897
VALUE rb_str_concat_literals(size_t num, const VALUE *strary)
Definition: string.c:3127
VALUE rb_str_replace(VALUE str, VALUE str2)
Definition: string.c:5632
VALUE rb_check_string_type(VALUE str)
Definition: string.c:2462
void rb_str_set_len(VALUE str, long len)
Definition: string.c:2842
#define END(no)
Definition: string.c:59
VALUE rb_str_export_locale(VALUE str)
Definition: string.c:1199
VALUE rb_str_inspect(VALUE str)
Definition: string.c:6199
VALUE rb_enc_str_new(const char *ptr, long len, rb_encoding *enc)
Definition: string.c:857
VALUE rb_interned_str(const char *ptr, long len)
Definition: string.c:11549
void rb_str_make_independent(VALUE str)
Definition: string.c:220
#define CHECK_IF_ASCII(c)
VALUE rb_cSymbol
Definition: string.c:81
VALUE rb_tainted_str_new(const char *ptr, long len)
Definition: string.c:957
VALUE rb_str_length(VALUE str)
Definition: string.c:1995
int rb_str_comparable(VALUE str1, VALUE str2)
Definition: string.c:3353
#define str_buf_cat2(str, ptr)
Definition: string.c:2959
VALUE rb_obj_as_string_result(VALUE str, VALUE obj)
Definition: string.c:1541
#define MIN_PRE_ALLOC_SIZE
Definition: string.c:3124
VALUE rb_str_append(VALUE str, VALUE str2)
Definition: string.c:3118
#define RESIZE_CAPA_TERM(str, capacity, termlen)
Definition: string.c:151
VALUE rb_fstring_new(const char *ptr, long len)
Definition: string.c:446
VALUE rb_str_freeze(VALUE str)
Definition: string.c:2766
VALUE rb_string_value(volatile VALUE *ptr)
Definition: string.c:2323
void rb_str_modify_expand(VALUE str, long expand)
Definition: string.c:2270
#define RESIZE_CAPA(str, capacity)
Definition: string.c:147
unsigned char * USTR
Definition: string.c:7255
VALUE rb_str_scrub(VALUE str, VALUE repl)
Definition: string.c:10580
#define STR_BORROWED
Definition: string.c:102
#define STR_TMPLOCK
Definition: string.c:103
int rb_enc_str_asciionly_p(VALUE str)
Definition: string.c:739
#define CHAR_ESC_LEN
Definition: string.c:6081
#define STR_FAKESTR
Definition: string.c:105
VALUE rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *ptr_enc)
Definition: string.c:3072
#define STR_SET_EMBED_LEN(str, n)
Definition: string.c:112
VALUE rb_str_new_with_class(VALUE obj, const char *ptr, long len)
Definition: string.c:1381
#define rb_str_dup_frozen
int rb_str_cmp(VALUE str1, VALUE str2)
Definition: string.c:3378
VALUE rb_to_symbol(VALUE name)
Definition: string.c:11511
char * rb_enc_nth(const char *p, const char *e, long nth, rb_encoding *enc)
Definition: string.c:2538
void rb_str_update(VALUE str, long beg, long len, VALUE val)
Definition: string.c:4865
char * rb_str_to_cstr(VALUE str)
Definition: string.c:2432
VALUE rb_enc_interned_str_cstr(const char *ptr, rb_encoding *enc)
Definition: string.c:11573
ID rb_to_id(VALUE name)
Definition: string.c:11501
VALUE rb_str_substr(VALUE str, long beg, long len)
Definition: string.c:2734
VALUE rb_str_unlocktmp(VALUE str)
Definition: string.c:2825
VALUE rb_id_quote_unprintable(ID id)
Definition: string.c:11067
VALUE rb_str_upto_endless_each(VALUE beg, int(*each)(VALUE, VALUE), VALUE arg)
Definition: string.c:4565
VALUE rb_str_resize(VALUE str, long len)
Definition: string.c:2859
void rb_must_asciicompat(VALUE str)
Definition: string.c:2314
VALUE rb_utf8_str_new_static(const char *ptr, long len)
Definition: string.c:945
VALUE rb_str_split(VALUE str, const char *sep0)
Definition: string.c:8412
char * rb_string_value_ptr(volatile VALUE *ptr)
Definition: string.c:2334
#define TERM_LEN(str)
Definition: string.c:138
VALUE rb_str_dump(VALUE str)
Definition: string.c:6311
#define rb_rs
Definition: string.c:8464
VALUE rb_str_concat(VALUE str1, VALUE str2)
Definition: string.c:3217
void rb_str_tmp_frozen_release(VALUE orig, VALUE tmp)
Definition: string.c:1294
VALUE rb_locale_str_new(const char *ptr, long len)
Definition: string.c:1169
#define CASE_UTF(e)
VALUE rb_str_buf_new(long capa)
Definition: string.c:1398
#define STR_BUF_MIN_SIZE
Definition: string.c:1394
#define SPLIT_STR(beg, len)
#define STR_HEAP_SIZE(str)
Definition: string.c:182
VALUE rb_str_drop_bytes(VALUE str, long len)
Definition: string.c:4795
int rb_enc_str_coderange_scan(VALUE str, rb_encoding *enc)
Definition: string.c:719
long rb_enc_strlen_cr(const char *p, const char *e, rb_encoding *enc, int *cr)
Definition: string.c:1896
#define TERM_FILL(ptr, termlen)
Definition: string.c:139
VALUE rb_ec_str_resurrect(struct rb_execution_context_struct *ec, VALUE str)
Definition: string.c:1644
VALUE rb_cString
Definition: string.c:80
VALUE rb_str_export_to_enc(VALUE str, rb_encoding *enc)
Definition: string.c:1205
#define TR_TABLE_MAX
Definition: string.c:7625
VALUE rb_fstring(VALUE str)
Definition: string.c:353
neighbor_char
Definition: string.c:4076
@ NEIGHBOR_FOUND
Definition: string.c:4078
@ NEIGHBOR_WRAPPED
Definition: string.c:4079
@ NEIGHBOR_NOT_CHAR
Definition: string.c:4077
VALUE rb_enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl)
Definition: string.c:10587
VALUE rb_usascii_str_new_static(const char *ptr, long len)
Definition: string.c:939
VALUE rb_obj_as_string(VALUE obj)
Definition: string.c:1529
const char * name
Definition: onigmo.h:162
int(* case_map)(OnigCaseFoldType *flagP, const OnigUChar **pp, const OnigUChar *end, OnigUChar *to, OnigUChar *to_end, const struct OnigEncodingTypeST *enc)
Definition: onigmo.h:177
VALUE flags
Definition: rbasic.h:48
struct RBasic basic
Definition: rstring.h:74
long capa
Definition: rstring.h:80
long len
Definition: rstring.h:77
VALUE shared
Definition: rstring.h:81
char * ptr
Definition: rstring.h:78
union RString::@100 as
union RString::@100::@101::@102 aux
struct RString::@100::@101 heap
Definition: inftree9.h:24
VALUE fstr
Definition: string.c:295
size_t capa
Definition: string.c:6762
OnigUChar space[FLEX_ARY_LEN]
Definition: string.c:6765
struct mapping_buffer * next
Definition: string.c:6764
size_t used
Definition: string.c:6763
OnigPosition * beg
Definition: onigmo.h:719
int num_regs
Definition: onigmo.h:718
OnigPosition * end
Definition: onigmo.h:720
Definition: st.h:79
Definition: string.c:7257
int gen
Definition: string.c:7258
unsigned int now
Definition: string.c:7259
unsigned int max
Definition: string.c:7259
char * p
Definition: string.c:7260
char * pend
Definition: string.c:7260
#define snprintf
Definition: subst.h:14
#define t
Definition: symbol.c:253
Internal header for Encoding::Converter.
VALUE rb_cEncodingConverter
Definition: transcode.c:34
unsigned long VALUE
Definition: value.h:38
unsigned long ID
Definition: value.h:39
#define TYPE(_)
Definition: value_type.h:105
#define T_STRING
Definition: value_type.h:77
#define T_FIXNUM
Definition: value_type.h:62
#define BUILTIN_TYPE
Definition: value_type.h:84
#define SYMBOL_P
Definition: value_type.h:87
#define T_REGEXP
Definition: value_type.h:76
#define rb_id2str(id)
Definition: vm_backtrace.c:30
#define dp(v)
Definition: vm_debug.h:20
#define RB_VM_LOCK_ENTER()
Definition: vm_sync.h:121
#define RB_VM_LOCK_LEAVE()
Definition: vm_sync.h:122
int err
Definition: win32.c:142
unsigned int uintptr_t
Definition: win32.h:106
#define xfree
Definition: xmalloc.h:49
#define xmalloc
Definition: xmalloc.h:44