[GR-50320] [GR-50321] [GR-50322] Backports for 23.1.

eregon · eregon · commit e86d3721c8ab · 2023-11-23T17:08:16.000Z
PullRequest: truffleruby/4067
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,11 @@
+# 23.1.2
+
+Bug fixes:
+
+* Fix `rb_enc_left_char_head()` so it no longer always raises `ArgumentError` (#3267, @eregon).
+* Fix `IO.copy_stream` with a `Tempfile` destination (#3280, @eregon).
+* Fix `Regexp.union` negotiating the wrong result encoding (#3287, @nirvdrum, @simonlevasseur).
+
 # 23.1.0
 
 New features:
diff --git a/lib/cext/ABI_check.txt b/lib/cext/ABI_check.txt
@@ -1 +1 @@
-2
+3
diff --git a/spec/ruby/core/io/copy_stream_spec.rb b/spec/ruby/core/io/copy_stream_spec.rb
@@ -69,9 +69,12 @@
   end
 
   it "raises an IOError if the destination IO is not open for writing" do
-    @to_io.close
-    @to_io = new_io @to_name, "r"
-    -> { IO.copy_stream @object.from, @to_io }.should raise_error(IOError)
+    to_io = new_io __FILE__, "r"
+    begin
+      -> { IO.copy_stream @object.from, to_io }.should raise_error(IOError)
+    ensure
+      to_io.close
+    end
   end
 
   it "does not close the destination IO" do
@@ -109,7 +112,8 @@
   end
 
   after :each do
-    rm_r @to_name, @from_bigfile
+    rm_r @to_name if @to_name
+    rm_r @from_bigfile
   end
 
   describe "from an IO" do
@@ -164,6 +168,25 @@
       it_behaves_like :io_copy_stream_to_io, nil, IOSpecs::CopyStream
       it_behaves_like :io_copy_stream_to_io_with_offset, nil, IOSpecs::CopyStream
     end
+
+    describe "to a Tempfile" do
+      before :all do
+        require 'tempfile'
+      end
+
+      before :each do
+        @to_io = Tempfile.new("rubyspec_copy_stream", encoding: Encoding::BINARY, mode: File::RDONLY)
+        @to_name = @to_io.path
+      end
+
+      after :each do
+        @to_io.close!
+        @to_name = nil # do not rm_r it, already done by Tempfile#close!
+      end
+
+      it_behaves_like :io_copy_stream_to_io, nil, IOSpecs::CopyStream
+      it_behaves_like :io_copy_stream_to_io_with_offset, nil, IOSpecs::CopyStream
+    end
   end
 
   describe "from a file name" do
@@ -277,10 +300,8 @@
       @io.should_not_receive(:pos)
       IO.copy_stream(@io, @to_name)
     end
-
   end
 
-
   describe "with a destination that does partial reads" do
     before do
       @from_out, @from_in = IO.pipe
diff --git a/spec/ruby/core/regexp/union_spec.rb b/spec/ruby/core/regexp/union_spec.rb
@@ -43,6 +43,27 @@
     Regexp.union("\u00A9".encode("ISO-8859-1"), "a".encode("UTF-8")).encoding.should == Encoding::ISO_8859_1
   end
 
+  it "returns ASCII-8BIT if the regexp encodings are ASCII-8BIT and at least one has non-ASCII characters" do
+    us_ascii_implicit, us_ascii_explicit, binary = /abc/, /[\x00-\x7f]/n, /[\x80-\xBF]/n
+    us_ascii_implicit.encoding.should == Encoding::US_ASCII
+    us_ascii_explicit.encoding.should == Encoding::US_ASCII
+    binary.encoding.should == Encoding::BINARY
+
+    Regexp.union(us_ascii_implicit, us_ascii_explicit, binary).encoding.should == Encoding::BINARY
+    Regexp.union(us_ascii_implicit, binary, us_ascii_explicit).encoding.should == Encoding::BINARY
+    Regexp.union(us_ascii_explicit, us_ascii_implicit, binary).encoding.should == Encoding::BINARY
+    Regexp.union(us_ascii_explicit, binary, us_ascii_implicit).encoding.should == Encoding::BINARY
+    Regexp.union(binary, us_ascii_implicit, us_ascii_explicit).encoding.should == Encoding::BINARY
+    Regexp.union(binary, us_ascii_explicit, us_ascii_implicit).encoding.should == Encoding::BINARY
+  end
+
+  it "return US-ASCII if all patterns are ASCII-only" do
+    Regexp.union(/abc/e, /def/e).encoding.should == Encoding::US_ASCII
+    Regexp.union(/abc/n, /def/n).encoding.should == Encoding::US_ASCII
+    Regexp.union(/abc/s, /def/s).encoding.should == Encoding::US_ASCII
+    Regexp.union(/abc/u, /def/u).encoding.should == Encoding::US_ASCII
+  end
+
   it "returns a Regexp with UTF-8 if one part is UTF-8" do
     Regexp.union(/probl[éeè]me/i, /help/i).encoding.should == Encoding::UTF_8
   end
@@ -54,83 +75,83 @@
   it "raises ArgumentError if the arguments include conflicting ASCII-incompatible Strings" do
     -> {
       Regexp.union("a".encode("UTF-16LE"), "b".encode("UTF-16BE"))
-    }.should raise_error(ArgumentError)
+    }.should raise_error(ArgumentError, 'incompatible encodings: UTF-16LE and UTF-16BE')
   end
 
   it "raises ArgumentError if the arguments include conflicting ASCII-incompatible Regexps" do
     -> {
       Regexp.union(Regexp.new("a".encode("UTF-16LE")),
                    Regexp.new("b".encode("UTF-16BE")))
-    }.should raise_error(ArgumentError)
+    }.should raise_error(ArgumentError, 'incompatible encodings: UTF-16LE and UTF-16BE')
   end
 
   it "raises ArgumentError if the arguments include conflicting fixed encoding Regexps" do
     -> {
       Regexp.union(Regexp.new("a".encode("UTF-8"),    Regexp::FIXEDENCODING),
                    Regexp.new("b".encode("US-ASCII"), Regexp::FIXEDENCODING))
-    }.should raise_error(ArgumentError)
+    }.should raise_error(ArgumentError, 'incompatible encodings: UTF-8 and US-ASCII')
   end
 
   it "raises ArgumentError if the arguments include a fixed encoding Regexp and a String containing non-ASCII-compatible characters in a different encoding" do
     -> {
       Regexp.union(Regexp.new("a".encode("UTF-8"), Regexp::FIXEDENCODING),
                    "\u00A9".encode("ISO-8859-1"))
-    }.should raise_error(ArgumentError)
+    }.should raise_error(ArgumentError, 'incompatible encodings: UTF-8 and ISO-8859-1')
   end
 
   it "raises ArgumentError if the arguments include a String containing non-ASCII-compatible characters and a fixed encoding Regexp in a different encoding" do
     -> {
       Regexp.union("\u00A9".encode("ISO-8859-1"),
                    Regexp.new("a".encode("UTF-8"), Regexp::FIXEDENCODING))
-    }.should raise_error(ArgumentError)
+    }.should raise_error(ArgumentError, 'incompatible encodings: ISO-8859-1 and UTF-8')
   end
 
   it "raises ArgumentError if the arguments include an ASCII-incompatible String and an ASCII-only String" do
     -> {
       Regexp.union("a".encode("UTF-16LE"), "b".encode("UTF-8"))
-    }.should raise_error(ArgumentError)
+    }.should raise_error(ArgumentError, /ASCII incompatible encoding: UTF-16LE|incompatible encodings: UTF-16LE and US-ASCII/)
   end
 
   it "raises ArgumentError if the arguments include an ASCII-incompatible Regexp and an ASCII-only String" do
     -> {
       Regexp.union(Regexp.new("a".encode("UTF-16LE")), "b".encode("UTF-8"))
-    }.should raise_error(ArgumentError)
+    }.should raise_error(ArgumentError, /ASCII incompatible encoding: UTF-16LE|incompatible encodings: UTF-16LE and US-ASCII/)
   end
 
   it "raises ArgumentError if the arguments include an ASCII-incompatible String and an ASCII-only Regexp" do
     -> {
       Regexp.union("a".encode("UTF-16LE"), Regexp.new("b".encode("UTF-8")))
-    }.should raise_error(ArgumentError)
+    }.should raise_error(ArgumentError, /ASCII incompatible encoding: UTF-16LE|incompatible encodings: UTF-16LE and US-ASCII/)
   end
 
   it "raises ArgumentError if the arguments include an ASCII-incompatible Regexp and an ASCII-only Regexp" do
     -> {
       Regexp.union(Regexp.new("a".encode("UTF-16LE")), Regexp.new("b".encode("UTF-8")))
-    }.should raise_error(ArgumentError)
+    }.should raise_error(ArgumentError, /ASCII incompatible encoding: UTF-16LE|incompatible encodings: UTF-16LE and US-ASCII/)
   end
 
   it "raises ArgumentError if the arguments include an ASCII-incompatible String and a String containing non-ASCII-compatible characters in a different encoding" do
     -> {
       Regexp.union("a".encode("UTF-16LE"), "\u00A9".encode("ISO-8859-1"))
-    }.should raise_error(ArgumentError)
+    }.should raise_error(ArgumentError, 'incompatible encodings: UTF-16LE and ISO-8859-1')
   end
 
   it "raises ArgumentError if the arguments include an ASCII-incompatible Regexp and a String containing non-ASCII-compatible characters in a different encoding" do
     -> {
       Regexp.union(Regexp.new("a".encode("UTF-16LE")), "\u00A9".encode("ISO-8859-1"))
-    }.should raise_error(ArgumentError)
+    }.should raise_error(ArgumentError, 'incompatible encodings: UTF-16LE and ISO-8859-1')
   end
 
   it "raises ArgumentError if the arguments include an ASCII-incompatible String and a Regexp containing non-ASCII-compatible characters in a different encoding" do
     -> {
       Regexp.union("a".encode("UTF-16LE"), Regexp.new("\u00A9".encode("ISO-8859-1")))
-    }.should raise_error(ArgumentError)
+    }.should raise_error(ArgumentError, 'incompatible encodings: UTF-16LE and ISO-8859-1')
   end
 
   it "raises ArgumentError if the arguments include an ASCII-incompatible Regexp and a Regexp containing non-ASCII-compatible characters in a different encoding" do
     -> {
       Regexp.union(Regexp.new("a".encode("UTF-16LE")), Regexp.new("\u00A9".encode("ISO-8859-1")))
-    }.should raise_error(ArgumentError)
+    }.should raise_error(ArgumentError, 'incompatible encodings: UTF-16LE and ISO-8859-1')
   end
 
   it "uses to_str to convert arguments (if not Regexp)" do
@@ -154,6 +175,8 @@
     not_supported_on :opal do
       Regexp.union([/dogs/, /cats/i]).should == /(?-mix:dogs)|(?i-mx:cats)/
     end
-    ->{Regexp.union(["skiing", "sledding"], [/dogs/, /cats/i])}.should raise_error(TypeError)
+    -> {
+      Regexp.union(["skiing", "sledding"], [/dogs/, /cats/i])
+    }.should raise_error(TypeError, 'no implicit conversion of Array into String')
   end
 end
diff --git a/spec/ruby/optional/capi/encoding_spec.rb b/spec/ruby/optional/capi/encoding_spec.rb
@@ -674,6 +674,22 @@
     end
   end
 
+  describe "rb_enc_left_char_head" do
+    it 'returns the head position of a character' do
+      @s.rb_enc_left_char_head("é", 1).should == 0
+      @s.rb_enc_left_char_head("éééé", 7).should == 6
+
+      @s.rb_enc_left_char_head("a", 0).should == 0
+
+      # offset 1 is one past the last byte of "a" (i.e. the end pointer); unclear if this is intended to work
+      @s.rb_enc_left_char_head("a", 1).should == 1
+
+      # Works because for single-byte encodings rb_enc_left_char_head() just returns the pointer
+      @s.rb_enc_left_char_head("a".force_encoding(Encoding::US_ASCII), 88).should == 88
+      @s.rb_enc_left_char_head("a".b, 88).should == 88
+    end
+  end
+
   describe "ONIGENC_MBC_CASE_FOLD" do
     it "returns the correct case fold for the given string" do
       @s.ONIGENC_MBC_CASE_FOLD("lower").should == ["l", 1]
diff --git a/spec/ruby/optional/capi/ext/encoding_spec.c b/spec/ruby/optional/capi/ext/encoding_spec.c
@@ -307,6 +307,12 @@ static VALUE encoding_spec_rb_enc_strlen(VALUE self, VALUE str, VALUE length, VA
   return LONG2FIX(rb_enc_strlen(p, e, rb_to_encoding(encoding)));
 }
 
+static VALUE encoding_spec_rb_enc_left_char_head(VALUE self, VALUE str, VALUE offset) {
+  char *ptr = RSTRING_PTR(str);
+  char *result = rb_enc_left_char_head(ptr, ptr + NUM2INT(offset), RSTRING_END(str), rb_enc_get(str));
+  return LONG2NUM(result - ptr);
+}
+
 void Init_encoding_spec(void) {
   VALUE cls;
   native_rb_encoding_pointer = (rb_encoding**) malloc(sizeof(rb_encoding*));
@@ -364,6 +370,7 @@ void Init_encoding_spec(void) {
   rb_define_method(cls, "rb_enc_str_asciionly_p", encoding_spec_rb_enc_str_asciionly_p, 1);
   rb_define_method(cls, "rb_uv_to_utf8", encoding_spec_rb_uv_to_utf8, 2);
   rb_define_method(cls, "ONIGENC_MBC_CASE_FOLD", encoding_spec_ONIGENC_MBC_CASE_FOLD, 1);
+  rb_define_method(cls, "rb_enc_left_char_head", encoding_spec_rb_enc_left_char_head, 2);
 }
 
 #ifdef __cplusplus
diff --git a/src/main/c/cext/encoding.c b/src/main/c/cext/encoding.c
@@ -226,7 +226,10 @@ int rb_enc_get_index(VALUE obj) {
 }
 
 char* rb_enc_left_char_head(const char *start, const char *p, const char *end, rb_encoding *enc) {
-  int length = start - end;
+  if (p <= start || p >= end) {
+    return p;
+  }
+  int length = end - start;
   int position = polyglot_as_i32(polyglot_invoke(RUBY_CEXT, "rb_enc_left_char_head",
       rb_tr_unwrap(rb_enc_from_encoding(enc)),
       rb_tr_unwrap(rb_str_new(start, length)),
diff --git a/src/main/ruby/truffleruby/core/io.rb b/src/main/ruby/truffleruby/core/io.rb
@@ -356,24 +356,20 @@ def initialize(from, to, length, offset)
       @method = read_method @from
     end
 
+    # From copy_stream_body in io.c in CRuby
+    # Returns a two-element array; the first element is true if obj can be used as an IO directly
     def to_io(obj, mode)
-      if Primitive.is_a?(obj, IO)
-        flag = true
-        io = obj
-      else
-        flag = false
-
-        if Primitive.is_a?(obj, String)
-          io = File.open obj, mode
-        elsif obj.respond_to? :to_path
-          path = Truffle::Type.coerce_to obj, String, :to_path
-          io = File.open path, mode
-        else
-          io = obj
-        end
+      unless Primitive.is_a?(obj, IO) || Primitive.is_a?(obj, String) || obj.respond_to?(:to_path)
+        return [false, obj]
       end
 
-      [flag, io]
+      if io = IO.try_convert(obj)
+        [true, io]
+      else
+        path = Truffle::Type.coerce_to obj, String, :to_path
+        io = File.open path, mode
+        [false, io]
+      end
     end
 
     def read_method(obj)
diff --git a/src/main/ruby/truffleruby/core/regexp.rb b/src/main/ruby/truffleruby/core/regexp.rb
@@ -55,22 +55,27 @@ def self.try_convert(obj)
     Truffle::Type.try_convert obj, Regexp, :to_regexp
   end
 
-  def self.convert(pattern)
-    return pattern if Primitive.is_a?(pattern, Regexp)
-    if Primitive.is_a?(pattern, Array)
-      union(*pattern)
-    else
-      Regexp.quote(pattern.to_s)
-    end
-  end
+  def self.negotiate_union_encoding(*patterns)
+    compatible_enc = nil
+
+    patterns.each do |pattern|
+      converted = Primitive.is_a?(pattern, Regexp) ? pattern : Regexp.quote(pattern)
+
+      enc = converted.encoding
+
+      if Primitive.nil?(compatible_enc)
+        compatible_enc = enc
+      else
+        if test = Primitive.encoding_compatible?(enc, compatible_enc)
+          compatible_enc = test
+        else
+          raise ArgumentError, "incompatible encodings: #{compatible_enc} and #{enc}"
+        end
 
-  def self.compatible?(*patterns)
-    encodings = patterns.map { |r| convert(r).encoding }
-    last_enc = encodings.pop
-    encodings.each do |encoding|
-      raise ArgumentError, "incompatible encodings: #{encoding} and #{last_enc}" unless Primitive.encoding_compatible?(last_enc, encoding)
-      last_enc = encoding
+      end
     end
+
+    compatible_enc
   end
 
   def self.last_match(index = nil)
@@ -96,37 +101,35 @@ def self.last_match(index = nil)
   def self.union(*patterns)
     case patterns.size
     when 0
-      return %r/(?!)/
+      %r/(?!)/
     when 1
       pattern = patterns.first
       case pattern
       when Array
-        return union(*pattern)
+        union(*pattern)
       else
         converted = Truffle::Type.rb_check_convert_type(pattern, Regexp, :to_regexp)
         if Primitive.nil? converted
-          return Regexp.new(Regexp.quote(pattern))
+          Regexp.new(Regexp.quote(pattern))
         else
-          return converted
+          converted
         end
       end
     else
-      compatible?(*patterns)
-      enc = convert(patterns.first).encoding
-    end
+      patterns = patterns.map do |pat|
+        if Primitive.is_a?(pat, Regexp)
+          pat
+        else
+          StringValue(pat)
+        end
+      end
 
-    sep = '|'.encode(enc)
-    str = ''.encode(enc)
+      enc = negotiate_union_encoding(*patterns)
+      sep = '|'.encode(enc)
+      str = ''.encode(enc)
 
-    patterns = patterns.map do |pat|
-      if Primitive.is_a?(pat, Regexp)
-        pat
-      else
-        StringValue(pat)
-      end
+      Truffle::RegexpOperations.union(str, sep, *patterns)
     end
-
-    Truffle::RegexpOperations.union(str, sep, *patterns)
   end
   Truffle::Graal.always_split(method(:union))