Skip to content

Commit fd28f98

Browse files
committed
Remove noshift SubString constructor and add unsafe_substring
This commit includes two changes: * It removes an undocumented SubString constructor which triage has agreed can be considered internal * It adds a new public, but unexported function `unsafe_substring`, which creates a substring without checking for valid string indices.
1 parent 4f1e471 commit fd28f98

File tree

8 files changed

+89
-25
lines changed

8 files changed

+89
-25
lines changed

NEWS.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,8 @@ New library functions
5757
* `Base.donotdelete` is now public. It prevents deadcode elimination of its arguments ([#55774]).
5858
* `Sys.sysimage_target()` returns the CPU target string used to build the current system image ([#58970]).
5959
* `Iterators.findeach` is a lazy version of `findall` ([#54124])
60+
* `Base.unsafe_substring` is an unexported, public constuctor to build a `SubString` without checking for
61+
valid string indices.
6062

6163
New library features
6264
--------------------

base/public.jl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,7 @@ public
107107

108108
# Strings
109109
escape_raw_string,
110+
unsafe_substring,
110111

111112
# IO
112113
# types

base/regex.jl

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -447,12 +447,10 @@ end
447447

448448
function _annotatedmatch(m::RegexMatch{S}, str::AnnotatedString{S}) where {S<:AbstractString}
449449
RegexMatch{AnnotatedString{S}}(
450-
(@inbounds SubString{AnnotatedString{S}}(
451-
str, m.match.offset, m.match.ncodeunits, Val(:noshift))),
450+
(@inbounds unsafe_substring(str, m.match.offset + 1, m.match.ncodeunits)),
452451
Union{Nothing,SubString{AnnotatedString{S}}}[
453452
if !isnothing(cap)
454-
(@inbounds SubString{AnnotatedString{S}}(
455-
str, cap.offset, cap.ncodeunits, Val(:noshift)))
453+
(@inbounds unsafe_substring(str, cap.offset + 1, cap.ncodeunits))
456454
end for cap in m.captures],
457455
m.offset, m.offsets, m.regex)
458456
end

base/strings/annotated.jl

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -158,6 +158,13 @@ eltype(::Type{<:AnnotatedString{S}}) where {S} = AnnotatedChar{eltype(S)}
158158
firstindex(s::AnnotatedString) = firstindex(s.string)
159159
lastindex(s::AnnotatedString) = lastindex(s.string)
160160

161+
plain_substring(s::AnnotatedString) = @inbounds unsafe_substring(s.string, 1, ncodeunits(s))
162+
163+
function plain_substring(s::SubString{<:AnnotatedString})
164+
@inbounds unsafe_substring(s.string.string, s.offset + 1, s.ncodeunits)
165+
end
166+
167+
161168
function getindex(s::AnnotatedString, i::Integer)
162169
@boundscheck checkbounds(s, i)
163170
@inbounds if isvalid(s, i)
@@ -204,16 +211,14 @@ cmp(a::AnnotatedString, b::AnnotatedString) = cmp(a.string, b.string)
204211
# To prevent substring equality from hitting the generic fallback
205212

206213
function ==(a::SubString{<:AnnotatedString}, b::SubString{<:AnnotatedString})
207-
SubString(a.string.string, a.offset, a.ncodeunits, Val(:noshift)) ==
208-
SubString(b.string.string, b.offset, b.ncodeunits, Val(:noshift)) &&
209-
annotations(a) == annotations(b)
214+
plain_substring(a) == plain_substring(b) && annotations(a) == annotations(b)
210215
end
211216

212217
==(a::SubString{<:AnnotatedString}, b::AnnotatedString) =
213-
annotations(a) == annotations(b) && SubString(a.string.string, a.offset, a.ncodeunits, Val(:noshift)) == b.string
218+
annotations(a) == annotations(b) && plain_substring(a) == b.string
214219

215220
==(a::SubString{<:AnnotatedString}, b::AbstractString) =
216-
isempty(annotations(a)) && SubString(a.string.string, a.offset, a.ncodeunits, Val(:noshift)) == b
221+
isempty(annotations(a)) && plain_substring(a) == b
217222

218223
==(a::AbstractString, b::SubString{<:AnnotatedString}) = b == a
219224

@@ -262,7 +267,7 @@ function annotatedstring(xs...)
262267
push!(annotations, setindex(annot, rstart:rstop, :region))
263268
end
264269
end
265-
print(s, SubString(x.string.string, x.offset, x.ncodeunits, Val(:noshift)))
270+
print(s, plain_substring(x))
266271
elseif x isa AnnotatedChar
267272
for annot in x.annotations
268273
push!(annotations, (region=1+size:1+size, annot...))

base/strings/substring.jl

Lines changed: 60 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -36,18 +36,67 @@ struct SubString{T<:AbstractString} <: AbstractString
3636
end
3737
return new(s, i-1, nextind(s,j)-i)
3838
end
39-
function SubString{T}(s::T, i::Int, j::Int, ::Val{:noshift}) where T<:AbstractString
40-
@boundscheck if !(i == j == 0)
41-
si, sj = i + 1, prevind(s, j + i + 1)
42-
@inbounds isvalid(s, si) || string_index_err(s, si)
43-
@inbounds isvalid(s, sj) || string_index_err(s, sj)
44-
end
45-
new(s, i, j)
39+
# We don't expose this, because the exposed constructor needs to avoid constructing
40+
# a SubString{SubString{T}} when passed a substring.
41+
global function _unsafe_substring(s::T, offset::Int, ncodeunits::Int) where {T <: AbstractString}
42+
new{T}(s, offset, ncodeunits)
43+
end
44+
end
45+
46+
function check_codeunit_bounds(s::AbstractString, first_index::Int, n_codeunits::Int)
47+
last_index = first_index + n_codeunits - 1
48+
bad_index = if first_index < 1
49+
first_index
50+
elseif last_index > ncodeunits(s)
51+
last_index
52+
else
53+
return nothing
4654
end
55+
throw(BoundsError(s, bad_index))
56+
end
57+
58+
"""
59+
unsafe_substring(s::AbstractString, first_index::Int, n_codeunits::Int)::SubString{typeof(s)}
60+
unsafe_substring(s::SubString{S}, first_index::Int, n_codeunits::Int)::SubString{S}
61+
62+
Create a substring of `s` spanning the codeunits `first_index:(first_index + n_codeunits - 1)`.
63+
64+
If `first_index` < 1, or `first_index + n_codeunits - 1 > ncodeunits(s)`, throw a `BoundsError`.
65+
66+
This function does check bounds, but does not validate that the arguments corresponds to valid
67+
start and end indices in `s`, and so the resulting substring may contain truncated characters.
68+
The presence of truncated characters is safe and well-defined for `String` and `SubString{String}`,
69+
but may not be permitted for custom subtypes of `AbstractString`.
70+
71+
# Examples
72+
```jldoctest
73+
julia> s = "Hello, Bjørn!";
74+
75+
julia> ss = unsafe_substring(s, 3, 10)
76+
"lo, Bjørn"
77+
78+
julia> typeof(ss)
79+
SubString{String}
80+
81+
julia> ss2 = unsafe_substring(ss, 2, 6)
82+
"o, Bj\\xc3"
83+
84+
julia> typeof(ss2)
85+
SubString{String}
86+
```
87+
"""
88+
function unsafe_substring(s::AbstractString, first_index::Int, n_codeunits::Int)
89+
@boundscheck @inline checkbounds(codeunits(s), first_index:(first_index + n_codeunits - 1))
90+
return _unsafe_substring(s, first_index - 1, n_codeunits)
91+
end
92+
93+
function unsafe_substring(s::SubString, first_index::Int, n_codeunits::Int)
94+
@boundscheck @inline check_codeunit_bounds(s, first_index, n_codeunits)
95+
string = s.string
96+
return _unsafe_substring(string, first_index + s.offset - 1, n_codeunits)
4797
end
4898

4999
@propagate_inbounds SubString(s::T, i::Int, j::Int) where {T<:AbstractString} = SubString{T}(s, i, j)
50-
@propagate_inbounds SubString(s::T, i::Int, j::Int, v::Val{:noshift}) where {T<:AbstractString} = SubString{T}(s, i, j, v)
51100
@propagate_inbounds SubString(s::AbstractString, i::Integer, j::Integer=lastindex(s)) = SubString(s, Int(i)::Int, Int(j)::Int)
52101
@propagate_inbounds SubString(s::AbstractString, r::AbstractUnitRange{<:Integer}) = SubString(s, first(r), last(r))
53102

@@ -56,8 +105,9 @@ end
56105
SubString(s.string, s.offset+i, s.offset+j)
57106
end
58107

59-
SubString(s::AbstractString) = SubString(s, 1, lastindex(s)::Int)
60-
SubString{T}(s::T) where {T<:AbstractString} = SubString{T}(s, 1, lastindex(s)::Int)
108+
SubString(s::AbstractString) = @inbounds unsafe_substring(s, 1, Int(ncodeunits(s))::Int)
109+
SubString{T}(s::T) where {T<:AbstractString} = SubString(s)
110+
SubString(s::SubString) = s
61111

62112
@propagate_inbounds view(s::AbstractString, r::AbstractUnitRange{<:Integer}) = SubString(s, r)
63113
@propagate_inbounds maybeview(s::AbstractString, r::AbstractUnitRange{<:Integer}) = view(s, r)

base/strings/util.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -377,7 +377,7 @@ end
377377
end
378378
off = s isa String ? 0 : s.offset
379379
par = s isa String ? s : s.string
380-
@inbounds @inline SubString{String}(par, off, len, Val{:noshift}())
380+
@inbounds unsafe_substring(s, 1, len)
381381
end
382382
"""
383383
lstrip([pred=isspace,] str::AbstractString)::SubString

doc/src/base/strings.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ Base.repeat(::AbstractChar, ::Integer)
1515
Base.repr(::Any)
1616
Core.String(::AbstractString)
1717
Base.SubString
18+
Base.unsafe_substring
1819
Base.LazyString
1920
Base.@lazy_str
2021
Base.transcode

test/strings/basic.jl

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -222,10 +222,17 @@ end
222222
@test (@views (x[3], x[1:2], x[[1,4]])) == ('c', "ab", "ad")
223223
end
224224

225-
@testset ":noshift constructor" begin
226-
@test SubString("", 0, 0, Val(:noshift)) == ""
227-
@test SubString("abcd", 0, 1, Val(:noshift)) == "a"
228-
@test SubString("abcd", 0, 4, Val(:noshift)) == "abcd"
225+
@testset "unsafe_substring" begin
226+
s = "abcdefgøø"
227+
@test unsafe_substring(s, 1, 11) == s
228+
@test unsafe_substring(s, 1, 3) == "abc"
229+
@test unsafe_substring(s, 3, 3) == "cde"
230+
@test unsafe_substring(s, 5, 4) == String(codeunits(s)[5:8])
231+
@test unsafe_substring(s, 1, 2) isa SubString{String}
232+
@test unsafe_substring(unsafe_substring(s, 2, 8), 1, 3) isa SubString{String}
233+
234+
@test_throws BoundsError unsafe_substring(s, 0, 2)
235+
@test_throws BoundsError unsafe_substring(s, 2, 11)
229236
end
230237
end
231238

0 commit comments

Comments
 (0)