Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,8 @@ New library functions
* `Base.donotdelete` is now public. It prevents deadcode elimination of its arguments ([#55774]).
* `Sys.sysimage_target()` returns the CPU target string used to build the current system image ([#58970]).
* `Iterators.findeach` is a lazy version of `findall` ([#54124])
* `Base.unsafe_substring` is an unexported, public function that builds a `SubString` from a code unit
index and length; it checks bounds, but not that the span starts and ends on valid string indices.

New library features
--------------------
Expand Down
2 changes: 2 additions & 0 deletions base/public.jl
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,8 @@ public

# Strings
escape_raw_string,
unsafe_substring,
unannotate,

# IO
# types
Expand Down
6 changes: 2 additions & 4 deletions base/regex.jl
Original file line number Diff line number Diff line change
Expand Up @@ -447,12 +447,10 @@ end

# Rebuild the regex match `m` as a match over the annotated string `str`.
#
# The overall match and every capture that is not `nothing` are re-sliced out of `str` using
# the code unit offset and length recorded in `m`; captures that are `nothing` stay `nothing`.
# `m.offset`, `m.offsets` and `m.regex` are passed through unchanged.
#
# `unsafe_substring` takes a 1-based first index, hence the `offset + 1`.
function _annotatedmatch(m::RegexMatch{S}, str::AnnotatedString{S}) where {S<:AbstractString}
RegexMatch{AnnotatedString{S}}(
(@inbounds SubString{AnnotatedString{S}}(
str, m.match.offset, m.match.ncodeunits, Val(:noshift))),
(@inbounds unsafe_substring(str, m.match.offset + 1, m.match.ncodeunits)),
Union{Nothing,SubString{AnnotatedString{S}}}[
if !isnothing(cap)
(@inbounds SubString{AnnotatedString{S}}(
str, cap.offset, cap.ncodeunits, Val(:noshift)))
(@inbounds unsafe_substring(str, cap.offset + 1, cap.ncodeunits))
end for cap in m.captures],
m.offset, m.offsets, m.regex)
end
Expand Down
36 changes: 30 additions & 6 deletions base/strings/annotated.jl
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,32 @@ eltype(::Type{<:AnnotatedString{S}}) where {S} = AnnotatedChar{eltype(S)}
firstindex(s::AnnotatedString) = firstindex(s.string)
lastindex(s::AnnotatedString) = lastindex(s.string)

"""
    unannotate(s::AnnotatedString{S})::S
    unannotate(s::SubString{AnnotatedString{S}})::SubString{S}

Get the underlying string of `s`, without copying.

For an `AnnotatedString` this is the wrapped string itself. For a `SubString` of an
`AnnotatedString` it is a `SubString` of the wrapped string covering the same code units.
The annotations of `s` are not part of the result.

# Examples
```jldoctest; setup=:(using Base: AnnotatedString, unannotate)
julia> s = AnnotatedString("abcde", [(1:3, :A, 4)])
"abcde"

julia> u = unannotate(s)
"abcde"

julia> typeof(u)
String
```
"""
unannotate(s::AnnotatedString) = s.string

function unannotate(s::SubString{<:AnnotatedString})
    # Re-slice the wrapped plain string at the position `s` occupies in its parent.
    start_index = first(parentindices(s)[1])
    @inbounds unsafe_substring(parent(s).string, start_index, ncodeunits(s))
end


function getindex(s::AnnotatedString, i::Integer)
@boundscheck checkbounds(s, i)
@inbounds if isvalid(s, i)
Expand Down Expand Up @@ -204,16 +230,14 @@ cmp(a::AnnotatedString, b::AnnotatedString) = cmp(a.string, b.string)
# To prevent substring equality from hitting the generic fallback

# Two substrings of annotated strings are equal when both their underlying text and their
# annotations are equal.
function ==(a::SubString{<:AnnotatedString}, b::SubString{<:AnnotatedString})
SubString(a.string.string, a.offset, a.ncodeunits, Val(:noshift)) ==
SubString(b.string.string, b.offset, b.ncodeunits, Val(:noshift)) &&
annotations(a) == annotations(b)
unannotate(a) == unannotate(b) && annotations(a) == annotations(b)
end

# Annotated substring vs. annotated string: the annotations must match, and the text of `a`
# must equal the string wrapped by `b`.
==(a::SubString{<:AnnotatedString}, b::AnnotatedString) =
annotations(a) == annotations(b) && SubString(a.string.string, a.offset, a.ncodeunits, Val(:noshift)) == b.string
annotations(a) == annotations(b) && unannotate(a) == b.string

# Annotated substring vs. any other string: equal only if `a` carries no annotations and its
# text equals `b`.
==(a::SubString{<:AnnotatedString}, b::AbstractString) =
isempty(annotations(a)) && SubString(a.string.string, a.offset, a.ncodeunits, Val(:noshift)) == b
isempty(annotations(a)) && unannotate(a) == b

# Symmetric case: swap the arguments and reuse the method above.
==(a::AbstractString, b::SubString{<:AnnotatedString}) = b == a

Expand Down Expand Up @@ -262,7 +286,7 @@ function annotatedstring(xs...)
push!(annotations, setindex(annot, rstart:rstop, :region))
end
end
print(s, SubString(x.string.string, x.offset, x.ncodeunits, Val(:noshift)))
print(s, unannotate(x))
elseif x isa AnnotatedChar
for annot in x.annotations
push!(annotations, (region=1+size:1+size, annot...))
Expand Down
70 changes: 60 additions & 10 deletions base/strings/substring.jl
Original file line number Diff line number Diff line change
Expand Up @@ -36,18 +36,67 @@ struct SubString{T<:AbstractString} <: AbstractString
end
return new(s, i-1, nextind(s,j)-i)
end
function SubString{T}(s::T, i::Int, j::Int, ::Val{:noshift}) where T<:AbstractString
@boundscheck if !(i == j == 0)
si, sj = i + 1, prevind(s, j + i + 1)
@inbounds isvalid(s, si) || string_index_err(s, si)
@inbounds isvalid(s, sj) || string_index_err(s, sj)
end
new(s, i, j)
# We don't expose this, because the exposed constructor needs to avoid constructing
# a SubString{SubString{T}} when passed a substring.
global function _unsafe_substring(s::T, offset::Int, ncodeunits::Int) where {T <: AbstractString}
new{T}(s, offset, ncodeunits)
end
end

# Verify that the `n_codeunits` code units of `s` starting at `first_index` all exist.
#
# Returns `nothing` when the span `first_index:(first_index + n_codeunits - 1)` lies within
# `1:ncodeunits(s)`. An empty span is accepted for any `first_index` in
# `1:(ncodeunits(s) + 1)`.
#
# Throws a `BoundsError` carrying the offending index when the span starts before index 1 or
# ends past the last code unit, and an `ArgumentError` when `n_codeunits` is negative, since
# a negative length can never describe a valid substring.
function check_codeunit_bounds(s::AbstractString, first_index::Int, n_codeunits::Int)
    first_index < 1 && throw(BoundsError(s, first_index))
    n_codeunits < 0 && throw(ArgumentError(
        "number of code units must be non-negative, got $n_codeunits"))
    # Compare the requested length with the number of code units available from
    # `first_index` onwards instead of computing the last index first: the sum
    # `first_index + n_codeunits - 1` can overflow and wrap around to a small value,
    # which would let a huge `n_codeunits` slip through the check.
    available = ncodeunits(s) - first_index + 1
    if n_codeunits > available
        # Only used for the error message; saturate rather than report a wrapped index.
        last_index = if n_codeunits > typemax(Int) - (first_index - 1)
            typemax(Int)
        else
            first_index - 1 + n_codeunits
        end
        throw(BoundsError(s, last_index))
    end
    return nothing
end

"""
    unsafe_substring(s::AbstractString, first_index::Int, n_codeunits::Int)::SubString{typeof(s)}
    unsafe_substring(s::SubString{S}, first_index::Int, n_codeunits::Int)::SubString{S}

Create a substring of `s` spanning the code units
`first_index:(first_index + n_codeunits - 1)`.

Throw a `BoundsError` if `first_index < 1` or if
`first_index + n_codeunits - 1 > ncodeunits(s)`.

This function does check bounds, but it does not validate that the arguments correspond to
valid start and end indices in `s`, and so the resulting substring may contain truncated
characters. The presence of truncated characters is safe and well-defined for `String` and
`SubString{String}`, but may not be permitted for custom subtypes of `AbstractString`.

When `s` is itself a `SubString`, the result is a substring of the parent of `s`, so that
substrings are never nested.

# Examples
```jldoctest; setup=:(using Base: unsafe_substring)
julia> s = "Hello, Bjørn!";

julia> ss = unsafe_substring(s, 4, 10)
"lo, Bjørn"

julia> typeof(ss)
SubString{String}

julia> ss2 = unsafe_substring(ss, 2, 6)
"o, Bj\\xc3"

julia> typeof(ss2)
SubString{String}
```
"""
function unsafe_substring(s::AbstractString, first_index::Int, n_codeunits::Int)
    # `checkbounds` on a code unit range accepts every empty range, which would let an
    # out-of-range `first_index` through whenever `n_codeunits == 0`. Use the same explicit
    # check as the `SubString` method so both methods follow the documented rule.
    @boundscheck @inline check_codeunit_bounds(s, first_index, n_codeunits)
    # The struct stores a zero-based offset, hence the `- 1`.
    return _unsafe_substring(s, first_index - 1, n_codeunits)
end

# Substring of a substring: validate the span against `s`, then build the result directly on
# the parent of `s` by shifting the start by the offset `s` already has in that parent.
function unsafe_substring(s::SubString, first_index::Int, n_codeunits::Int)
    @boundscheck @inline check_codeunit_bounds(s, first_index, n_codeunits)
    return _unsafe_substring(s.string, first_index + s.offset - 1, n_codeunits)
end

# Convenience constructors that infer the type parameter from `s` and forward to the
# `SubString{T}` constructors.
@propagate_inbounds SubString(s::T, i::Int, j::Int) where {T<:AbstractString} = SubString{T}(s, i, j)
@propagate_inbounds SubString(s::T, i::Int, j::Int, v::Val{:noshift}) where {T<:AbstractString} = SubString{T}(s, i, j, v)
# Any `Integer` indices are converted to `Int`; `j` defaults to the last index of `s`.
@propagate_inbounds SubString(s::AbstractString, i::Integer, j::Integer=lastindex(s)) = SubString(s, Int(i)::Int, Int(j)::Int)
# A unit range is split into its first and last index.
@propagate_inbounds SubString(s::AbstractString, r::AbstractUnitRange{<:Integer}) = SubString(s, first(r), last(r))

Expand All @@ -56,8 +105,9 @@ end
SubString(s.string, s.offset+i, s.offset+j)
end

# Whole-string views: the resulting substring spans all of `s`.
SubString(s::AbstractString) = SubString(s, 1, lastindex(s)::Int)
SubString{T}(s::T) where {T<:AbstractString} = SubString{T}(s, 1, lastindex(s)::Int)
# Spans code units `1:ncodeunits(s)`; the bounds check is skipped via `@inbounds`.
SubString(s::AbstractString) = @inbounds unsafe_substring(s, 1, Int(ncodeunits(s))::Int)
# An explicitly parameterised call must return exactly a `SubString{T}`. Forwarding to
# `SubString(s)` would not guarantee that: when `T` is itself a `SubString` type, that call
# returns `s` unchanged, which is not a `SubString{T}`. Build the value directly instead,
# covering all code units of `s`.
SubString{T}(s::T) where {T<:AbstractString} = _unsafe_substring(s, 0, Int(ncodeunits(s))::Int)
# A value that is already a `SubString` is returned unchanged rather than wrapped again.
SubString(s::SubString) = s

@propagate_inbounds view(s::AbstractString, r::AbstractUnitRange{<:Integer}) = SubString(s, r)
@propagate_inbounds maybeview(s::AbstractString, r::AbstractUnitRange{<:Integer}) = view(s, r)
Expand Down
2 changes: 1 addition & 1 deletion base/strings/util.jl
Original file line number Diff line number Diff line change
Expand Up @@ -377,7 +377,7 @@ end
end
off = s isa String ? 0 : s.offset
par = s isa String ? s : s.string
@inbounds @inline SubString{String}(par, off, len, Val{:noshift}())
@inbounds unsafe_substring(s, 1, len)
end
"""
lstrip([pred=isspace,] str::AbstractString)::SubString
Expand Down
2 changes: 2 additions & 0 deletions doc/src/base/strings.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ Base.repeat(::AbstractChar, ::Integer)
Base.repr(::Any)
Core.String(::AbstractString)
Base.SubString
Base.unsafe_substring
Base.LazyString
Base.@lazy_str
Base.transcode
Expand Down Expand Up @@ -110,4 +111,5 @@ Base.AnnotatedChar
Base.annotatedstring
Base.annotations
Base.annotate!
Base.unannotate
```
11 changes: 11 additions & 0 deletions test/strings/annotated.jl
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,17 @@

@test Bool === Base.infer_return_type(isvalid, Tuple{Base.AnnotatedString, Vararg})
@test Int === Base.infer_return_type(ncodeunits, Tuple{Base.AnnotatedString})

@testset "unannotate" begin
    plain = "some string"
    annotated = Base.AnnotatedString(plain, [(2:5, :A, 3)])
    # Unwrapping a whole annotated string hands back the wrapped string itself.
    @test Base.unannotate(annotated) === plain

    # Unwrapping a substring yields a plain substring over the same span.
    span = 2:9
    unwrapped = Base.unannotate(SubString(annotated, span))
    @test unwrapped isa SubString{String}
    @test unwrapped == SubString(plain, span)
end
end

@testset "AnnotatedChar" begin
Expand Down
15 changes: 11 additions & 4 deletions test/strings/basic.jl
Original file line number Diff line number Diff line change
Expand Up @@ -222,10 +222,17 @@ end
@test (@views (x[3], x[1:2], x[[1,4]])) == ('c', "ab", "ad")
end

@testset ":noshift constructor" begin
@test SubString("", 0, 0, Val(:noshift)) == ""
@test SubString("abcd", 0, 1, Val(:noshift)) == "a"
@test SubString("abcd", 0, 4, Val(:noshift)) == "abcd"
@testset "unsafe_substring" begin
    # `unsafe_substring` is public but not exported from Base, so it is qualified here.
    s = "abcdefgøø"  # 7 one-byte characters followed by 2 two-byte characters: 11 code units
    @test Base.unsafe_substring(s, 1, 11) == s
    @test Base.unsafe_substring(s, 1, 3) == "abc"
    @test Base.unsafe_substring(s, 3, 3) == "cde"
    @test Base.unsafe_substring(s, 5, 4) == String(codeunits(s)[5:8])
    @test Base.unsafe_substring(s, 1, 2) isa SubString{String}
    # A substring of a substring is flattened onto the original parent.
    @test Base.unsafe_substring(Base.unsafe_substring(s, 2, 8), 1, 3) isa SubString{String}
    # An empty span directly after the last code unit is allowed.
    @test Base.unsafe_substring(s, 12, 0) == ""

    @test_throws BoundsError Base.unsafe_substring(s, 0, 2)
    @test_throws BoundsError Base.unsafe_substring(s, 2, 11)
end
end

Expand Down