Skip to content

Commit e86d372

Browse files
committed
[GR-50320] [GR-50321] [GR-50322] Backports for 23.1.
PullRequest: truffleruby/4067
2 parents e976a4d + 152a876 commit e86d372

File tree

9 files changed

+145
-68
lines changed

9 files changed

+145
-68
lines changed

CHANGELOG.md

+8
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,11 @@
1+
# 23.1.2
2+
3+
Bug fixes:
4+
5+
* Fix `rb_enc_left_char_head()` so it is not always `ArgumentError` (#3267, @eregon).
6+
* Fix `IO.copy_stream` with a `Tempfile` destination (#3280, @eregon).
7+
* Fix `Regexp.union` negotiating the wrong result encoding (#3287, @nirvdrum, @simonlevasseur).
8+
19
# 23.1.0
210

311
New features:

lib/cext/ABI_check.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
2
1+
3

spec/ruby/core/io/copy_stream_spec.rb

+27-6
Original file line numberDiff line numberDiff line change
@@ -69,9 +69,12 @@
6969
end
7070

7171
it "raises an IOError if the destination IO is not open for writing" do
72-
@to_io.close
73-
@to_io = new_io @to_name, "r"
74-
-> { IO.copy_stream @object.from, @to_io }.should raise_error(IOError)
72+
to_io = new_io __FILE__, "r"
73+
begin
74+
-> { IO.copy_stream @object.from, to_io }.should raise_error(IOError)
75+
ensure
76+
to_io.close
77+
end
7578
end
7679

7780
it "does not close the destination IO" do
@@ -109,7 +112,8 @@
109112
end
110113

111114
after :each do
112-
rm_r @to_name, @from_bigfile
115+
rm_r @to_name if @to_name
116+
rm_r @from_bigfile
113117
end
114118

115119
describe "from an IO" do
@@ -164,6 +168,25 @@
164168
it_behaves_like :io_copy_stream_to_io, nil, IOSpecs::CopyStream
165169
it_behaves_like :io_copy_stream_to_io_with_offset, nil, IOSpecs::CopyStream
166170
end
171+
172+
describe "to a Tempfile" do
173+
before :all do
174+
require 'tempfile'
175+
end
176+
177+
before :each do
178+
@to_io = Tempfile.new("rubyspec_copy_stream", encoding: Encoding::BINARY, mode: File::RDONLY)
179+
@to_name = @to_io.path
180+
end
181+
182+
after :each do
183+
@to_io.close!
184+
@to_name = nil # do not rm_r it, already done by Tempfile#close!
185+
end
186+
187+
it_behaves_like :io_copy_stream_to_io, nil, IOSpecs::CopyStream
188+
it_behaves_like :io_copy_stream_to_io_with_offset, nil, IOSpecs::CopyStream
189+
end
167190
end
168191

169192
describe "from a file name" do
@@ -277,10 +300,8 @@
277300
@io.should_not_receive(:pos)
278301
IO.copy_stream(@io, @to_name)
279302
end
280-
281303
end
282304

283-
284305
describe "with a destination that does partial reads" do
285306
before do
286307
@from_out, @from_in = IO.pipe

spec/ruby/core/regexp/union_spec.rb

+37-14
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,27 @@
4343
Regexp.union("\u00A9".encode("ISO-8859-1"), "a".encode("UTF-8")).encoding.should == Encoding::ISO_8859_1
4444
end
4545

46+
it "returns ASCII-8BIT if the regexp encodings are ASCII-8BIT and at least one has non-ASCII characters" do
47+
us_ascii_implicit, us_ascii_explicit, binary = /abc/, /[\x00-\x7f]/n, /[\x80-\xBF]/n
48+
us_ascii_implicit.encoding.should == Encoding::US_ASCII
49+
us_ascii_explicit.encoding.should == Encoding::US_ASCII
50+
binary.encoding.should == Encoding::BINARY
51+
52+
Regexp.union(us_ascii_implicit, us_ascii_explicit, binary).encoding.should == Encoding::BINARY
53+
Regexp.union(us_ascii_implicit, binary, us_ascii_explicit).encoding.should == Encoding::BINARY
54+
Regexp.union(us_ascii_explicit, us_ascii_implicit, binary).encoding.should == Encoding::BINARY
55+
Regexp.union(us_ascii_explicit, binary, us_ascii_implicit).encoding.should == Encoding::BINARY
56+
Regexp.union(binary, us_ascii_implicit, us_ascii_explicit).encoding.should == Encoding::BINARY
57+
Regexp.union(binary, us_ascii_explicit, us_ascii_implicit).encoding.should == Encoding::BINARY
58+
end
59+
60+
it "return US-ASCII if all patterns are ASCII-only" do
61+
Regexp.union(/abc/e, /def/e).encoding.should == Encoding::US_ASCII
62+
Regexp.union(/abc/n, /def/n).encoding.should == Encoding::US_ASCII
63+
Regexp.union(/abc/s, /def/s).encoding.should == Encoding::US_ASCII
64+
Regexp.union(/abc/u, /def/u).encoding.should == Encoding::US_ASCII
65+
end
66+
4667
it "returns a Regexp with UTF-8 if one part is UTF-8" do
4768
Regexp.union(/probl[éeè]me/i, /help/i).encoding.should == Encoding::UTF_8
4869
end
@@ -54,83 +75,83 @@
5475
it "raises ArgumentError if the arguments include conflicting ASCII-incompatible Strings" do
5576
-> {
5677
Regexp.union("a".encode("UTF-16LE"), "b".encode("UTF-16BE"))
57-
}.should raise_error(ArgumentError)
78+
}.should raise_error(ArgumentError, 'incompatible encodings: UTF-16LE and UTF-16BE')
5879
end
5980

6081
it "raises ArgumentError if the arguments include conflicting ASCII-incompatible Regexps" do
6182
-> {
6283
Regexp.union(Regexp.new("a".encode("UTF-16LE")),
6384
Regexp.new("b".encode("UTF-16BE")))
64-
}.should raise_error(ArgumentError)
85+
}.should raise_error(ArgumentError, 'incompatible encodings: UTF-16LE and UTF-16BE')
6586
end
6687

6788
it "raises ArgumentError if the arguments include conflicting fixed encoding Regexps" do
6889
-> {
6990
Regexp.union(Regexp.new("a".encode("UTF-8"), Regexp::FIXEDENCODING),
7091
Regexp.new("b".encode("US-ASCII"), Regexp::FIXEDENCODING))
71-
}.should raise_error(ArgumentError)
92+
}.should raise_error(ArgumentError, 'incompatible encodings: UTF-8 and US-ASCII')
7293
end
7394

7495
it "raises ArgumentError if the arguments include a fixed encoding Regexp and a String containing non-ASCII-compatible characters in a different encoding" do
7596
-> {
7697
Regexp.union(Regexp.new("a".encode("UTF-8"), Regexp::FIXEDENCODING),
7798
"\u00A9".encode("ISO-8859-1"))
78-
}.should raise_error(ArgumentError)
99+
}.should raise_error(ArgumentError, 'incompatible encodings: UTF-8 and ISO-8859-1')
79100
end
80101

81102
it "raises ArgumentError if the arguments include a String containing non-ASCII-compatible characters and a fixed encoding Regexp in a different encoding" do
82103
-> {
83104
Regexp.union("\u00A9".encode("ISO-8859-1"),
84105
Regexp.new("a".encode("UTF-8"), Regexp::FIXEDENCODING))
85-
}.should raise_error(ArgumentError)
106+
}.should raise_error(ArgumentError, 'incompatible encodings: ISO-8859-1 and UTF-8')
86107
end
87108

88109
it "raises ArgumentError if the arguments include an ASCII-incompatible String and an ASCII-only String" do
89110
-> {
90111
Regexp.union("a".encode("UTF-16LE"), "b".encode("UTF-8"))
91-
}.should raise_error(ArgumentError)
112+
}.should raise_error(ArgumentError, /ASCII incompatible encoding: UTF-16LE|incompatible encodings: UTF-16LE and US-ASCII/)
92113
end
93114

94115
it "raises ArgumentError if the arguments include an ASCII-incompatible Regexp and an ASCII-only String" do
95116
-> {
96117
Regexp.union(Regexp.new("a".encode("UTF-16LE")), "b".encode("UTF-8"))
97-
}.should raise_error(ArgumentError)
118+
}.should raise_error(ArgumentError, /ASCII incompatible encoding: UTF-16LE|incompatible encodings: UTF-16LE and US-ASCII/)
98119
end
99120

100121
it "raises ArgumentError if the arguments include an ASCII-incompatible String and an ASCII-only Regexp" do
101122
-> {
102123
Regexp.union("a".encode("UTF-16LE"), Regexp.new("b".encode("UTF-8")))
103-
}.should raise_error(ArgumentError)
124+
}.should raise_error(ArgumentError, /ASCII incompatible encoding: UTF-16LE|incompatible encodings: UTF-16LE and US-ASCII/)
104125
end
105126

106127
it "raises ArgumentError if the arguments include an ASCII-incompatible Regexp and an ASCII-only Regexp" do
107128
-> {
108129
Regexp.union(Regexp.new("a".encode("UTF-16LE")), Regexp.new("b".encode("UTF-8")))
109-
}.should raise_error(ArgumentError)
130+
}.should raise_error(ArgumentError, /ASCII incompatible encoding: UTF-16LE|incompatible encodings: UTF-16LE and US-ASCII/)
110131
end
111132

112133
it "raises ArgumentError if the arguments include an ASCII-incompatible String and a String containing non-ASCII-compatible characters in a different encoding" do
113134
-> {
114135
Regexp.union("a".encode("UTF-16LE"), "\u00A9".encode("ISO-8859-1"))
115-
}.should raise_error(ArgumentError)
136+
}.should raise_error(ArgumentError, 'incompatible encodings: UTF-16LE and ISO-8859-1')
116137
end
117138

118139
it "raises ArgumentError if the arguments include an ASCII-incompatible Regexp and a String containing non-ASCII-compatible characters in a different encoding" do
119140
-> {
120141
Regexp.union(Regexp.new("a".encode("UTF-16LE")), "\u00A9".encode("ISO-8859-1"))
121-
}.should raise_error(ArgumentError)
142+
}.should raise_error(ArgumentError, 'incompatible encodings: UTF-16LE and ISO-8859-1')
122143
end
123144

124145
it "raises ArgumentError if the arguments include an ASCII-incompatible String and a Regexp containing non-ASCII-compatible characters in a different encoding" do
125146
-> {
126147
Regexp.union("a".encode("UTF-16LE"), Regexp.new("\u00A9".encode("ISO-8859-1")))
127-
}.should raise_error(ArgumentError)
148+
}.should raise_error(ArgumentError, 'incompatible encodings: UTF-16LE and ISO-8859-1')
128149
end
129150

130151
it "raises ArgumentError if the arguments include an ASCII-incompatible Regexp and a Regexp containing non-ASCII-compatible characters in a different encoding" do
131152
-> {
132153
Regexp.union(Regexp.new("a".encode("UTF-16LE")), Regexp.new("\u00A9".encode("ISO-8859-1")))
133-
}.should raise_error(ArgumentError)
154+
}.should raise_error(ArgumentError, 'incompatible encodings: UTF-16LE and ISO-8859-1')
134155
end
135156

136157
it "uses to_str to convert arguments (if not Regexp)" do
@@ -154,6 +175,8 @@
154175
not_supported_on :opal do
155176
Regexp.union([/dogs/, /cats/i]).should == /(?-mix:dogs)|(?i-mx:cats)/
156177
end
157-
->{Regexp.union(["skiing", "sledding"], [/dogs/, /cats/i])}.should raise_error(TypeError)
178+
-> {
179+
Regexp.union(["skiing", "sledding"], [/dogs/, /cats/i])
180+
}.should raise_error(TypeError, 'no implicit conversion of Array into String')
158181
end
159182
end

spec/ruby/optional/capi/encoding_spec.rb

+16
Original file line numberDiff line numberDiff line change
@@ -674,6 +674,22 @@
674674
end
675675
end
676676

677+
describe "rb_enc_left_char_head" do
678+
it 'returns the head position of a character' do
679+
@s.rb_enc_left_char_head("é", 1).should == 0
680+
@s.rb_enc_left_char_head("éééé", 7).should == 6
681+
682+
@s.rb_enc_left_char_head("a", 0).should == 0
683+
684+
# unclear if this is intended to work
685+
@s.rb_enc_left_char_head("a", 1).should == 1
686+
687+
# Works because for single-byte encodings rb_enc_left_char_head() just returns the pointer
688+
@s.rb_enc_left_char_head("a".force_encoding(Encoding::US_ASCII), 88).should == 88
689+
@s.rb_enc_left_char_head("a".b, 88).should == 88
690+
end
691+
end
692+
677693
describe "ONIGENC_MBC_CASE_FOLD" do
678694
it "returns the correct case fold for the given string" do
679695
@s.ONIGENC_MBC_CASE_FOLD("lower").should == ["l", 1]

spec/ruby/optional/capi/ext/encoding_spec.c

+7
Original file line numberDiff line numberDiff line change
@@ -307,6 +307,12 @@ static VALUE encoding_spec_rb_enc_strlen(VALUE self, VALUE str, VALUE length, VA
307307
return LONG2FIX(rb_enc_strlen(p, e, rb_to_encoding(encoding)));
308308
}
309309

310+
static VALUE encoding_spec_rb_enc_left_char_head(VALUE self, VALUE str, VALUE offset) {
311+
char *ptr = RSTRING_PTR(str);
312+
char *result = rb_enc_left_char_head(ptr, ptr + NUM2INT(offset), RSTRING_END(str), rb_enc_get(str));
313+
return LONG2NUM(result - ptr);
314+
}
315+
310316
void Init_encoding_spec(void) {
311317
VALUE cls;
312318
native_rb_encoding_pointer = (rb_encoding**) malloc(sizeof(rb_encoding*));
@@ -364,6 +370,7 @@ void Init_encoding_spec(void) {
364370
rb_define_method(cls, "rb_enc_str_asciionly_p", encoding_spec_rb_enc_str_asciionly_p, 1);
365371
rb_define_method(cls, "rb_uv_to_utf8", encoding_spec_rb_uv_to_utf8, 2);
366372
rb_define_method(cls, "ONIGENC_MBC_CASE_FOLD", encoding_spec_ONIGENC_MBC_CASE_FOLD, 1);
373+
rb_define_method(cls, "rb_enc_left_char_head", encoding_spec_rb_enc_left_char_head, 2);
367374
}
368375

369376
#ifdef __cplusplus

src/main/c/cext/encoding.c

+4-1
Original file line numberDiff line numberDiff line change
@@ -226,7 +226,10 @@ int rb_enc_get_index(VALUE obj) {
226226
}
227227

228228
char* rb_enc_left_char_head(const char *start, const char *p, const char *end, rb_encoding *enc) {
229-
int length = start - end;
229+
if (p <= start || p >= end) {
230+
return p;
231+
}
232+
int length = end - start;
230233
int position = polyglot_as_i32(polyglot_invoke(RUBY_CEXT, "rb_enc_left_char_head",
231234
rb_tr_unwrap(rb_enc_from_encoding(enc)),
232235
rb_tr_unwrap(rb_str_new(start, length)),

src/main/ruby/truffleruby/core/io.rb

+11-15
Original file line numberDiff line numberDiff line change
@@ -356,24 +356,20 @@ def initialize(from, to, length, offset)
356356
@method = read_method @from
357357
end
358358

359+
# From copy_stream_body in io.c in CRuby
360+
# The first element is true if obj can be used as an IO directly
359361
def to_io(obj, mode)
360-
if Primitive.is_a?(obj, IO)
361-
flag = true
362-
io = obj
363-
else
364-
flag = false
365-
366-
if Primitive.is_a?(obj, String)
367-
io = File.open obj, mode
368-
elsif obj.respond_to? :to_path
369-
path = Truffle::Type.coerce_to obj, String, :to_path
370-
io = File.open path, mode
371-
else
372-
io = obj
373-
end
362+
unless Primitive.is_a?(obj, IO) || Primitive.is_a?(obj, String) || obj.respond_to?(:to_path)
363+
return [false, obj]
374364
end
375365

376-
[flag, io]
366+
if io = IO.try_convert(obj)
367+
[true, io]
368+
else
369+
path = Truffle::Type.coerce_to obj, String, :to_path
370+
io = File.open path, mode
371+
[false, io]
372+
end
377373
end
378374

379375
def read_method(obj)

src/main/ruby/truffleruby/core/regexp.rb

+34-31
Original file line numberDiff line numberDiff line change
@@ -55,22 +55,27 @@ def self.try_convert(obj)
5555
Truffle::Type.try_convert obj, Regexp, :to_regexp
5656
end
5757

58-
def self.convert(pattern)
59-
return pattern if Primitive.is_a?(pattern, Regexp)
60-
if Primitive.is_a?(pattern, Array)
61-
union(*pattern)
62-
else
63-
Regexp.quote(pattern.to_s)
64-
end
65-
end
58+
def self.negotiate_union_encoding(*patterns)
59+
compatible_enc = nil
60+
61+
patterns.each do |pattern|
62+
converted = Primitive.is_a?(pattern, Regexp) ? pattern : Regexp.quote(pattern)
63+
64+
enc = converted.encoding
65+
66+
if Primitive.nil?(compatible_enc)
67+
compatible_enc = enc
68+
else
69+
if test = Primitive.encoding_compatible?(enc, compatible_enc)
70+
compatible_enc = test
71+
else
72+
raise ArgumentError, "incompatible encodings: #{compatible_enc} and #{enc}"
73+
end
6674

67-
def self.compatible?(*patterns)
68-
encodings = patterns.map { |r| convert(r).encoding }
69-
last_enc = encodings.pop
70-
encodings.each do |encoding|
71-
raise ArgumentError, "incompatible encodings: #{encoding} and #{last_enc}" unless Primitive.encoding_compatible?(last_enc, encoding)
72-
last_enc = encoding
75+
end
7376
end
77+
78+
compatible_enc
7479
end
7580

7681
def self.last_match(index = nil)
@@ -96,37 +101,35 @@ def self.last_match(index = nil)
96101
def self.union(*patterns)
97102
case patterns.size
98103
when 0
99-
return %r/(?!)/
104+
%r/(?!)/
100105
when 1
101106
pattern = patterns.first
102107
case pattern
103108
when Array
104-
return union(*pattern)
109+
union(*pattern)
105110
else
106111
converted = Truffle::Type.rb_check_convert_type(pattern, Regexp, :to_regexp)
107112
if Primitive.nil? converted
108-
return Regexp.new(Regexp.quote(pattern))
113+
Regexp.new(Regexp.quote(pattern))
109114
else
110-
return converted
115+
converted
111116
end
112117
end
113118
else
114-
compatible?(*patterns)
115-
enc = convert(patterns.first).encoding
116-
end
119+
patterns = patterns.map do |pat|
120+
if Primitive.is_a?(pat, Regexp)
121+
pat
122+
else
123+
StringValue(pat)
124+
end
125+
end
117126

118-
sep = '|'.encode(enc)
119-
str = ''.encode(enc)
127+
enc = negotiate_union_encoding(*patterns)
128+
sep = '|'.encode(enc)
129+
str = ''.encode(enc)
120130

121-
patterns = patterns.map do |pat|
122-
if Primitive.is_a?(pat, Regexp)
123-
pat
124-
else
125-
StringValue(pat)
126-
end
131+
Truffle::RegexpOperations.union(str, sep, *patterns)
127132
end
128-
129-
Truffle::RegexpOperations.union(str, sep, *patterns)
130133
end
131134
Truffle::Graal.always_split(method(:union))
132135

0 commit comments

Comments
 (0)