Line | Exclusive | Inclusive | Code |
---|---|---|---|
1 | # This file is a part of Julia. License is MIT: https://julialang.org/license | ||
2 | |||
"""
    StringIndexError(str, i)

Exception raised when `str` is accessed at index `i` and that index is not valid.

The string is stored in the `string` field and the index in the `index` field.
"""
struct StringIndexError <: Exception
    string::AbstractString
    index::Integer
end
# Throw a `StringIndexError` for string `s` at index `i` (converted to `Int`).
# Kept out of line (`@noinline`) so the throwing code is not inlined into callers.
@noinline function string_index_err(s::AbstractString, i::Integer)
    idx = Int(i)
    throw(StringIndexError(s, idx))
end
14 | |||
# Vectors of 8-bit elements, unsigned or signed.
const ByteArray = Union{Vector{UInt8}, Vector{Int8}}
16 | |||
# Return `true` when `lo ≤ b ≤ hi`. All three arguments must share one integer type.
# Both comparisons are always evaluated (bitwise `&`, no short-circuit).
@inline function between(b::T, lo::T, hi::T) where {T<:Integer}
    not_below = lo ≤ b
    not_above = b ≤ hi
    return not_below & not_above
end
18 | |||
19 | ## constructors and conversions ## | ||
20 | |||
21 | # String constructor docstring from boot.jl, workaround for #16730 | ||
22 | # and the unavailability of @doc in boot.jl context. | ||
"""
    String(v::AbstractVector{UInt8})

Build a new `String` from the byte vector `v`, whose contents are UTF-8 encoded
characters. When `v` is a `Vector{UInt8}` it is truncated to zero length, and later
modification of `v` cannot affect the contents of the returned string.
Use `String(copy(v))` to avoid the truncation.

The memory of `v` is reused without copying whenever possible. This is guaranteed
for byte vectors returned by [`take!`](@ref) on a writable [`IOBuffer`](@ref) and by
calls to [`read(io, nb)`](@ref), which allows zero-copy conversion of I/O data to
strings. In other cases `Vector{UInt8}` data may be copied, but `v` is truncated
anyway to guarantee consistent behavior.
"""
function String(v::AbstractVector{UInt8})
    # Copy the bytes into a fresh string-backed vector, then convert that vector.
    buf = StringVector(length(v))
    copyto!(buf, v)
    return String(buf)
end
39 | 1 (2.38%) | 1 (2.38%) |
1 (2.38%) samples spent in Type
# Method for `Vector{UInt8}`: passes `v` to the runtime routine
# `jl_array_to_string` through `ccall` and returns the resulting `String`.
String(v::Vector{UInt8}) = ccall(:jl_array_to_string, Ref{String}, (Any,), v)
1 (100.00%) (ex.), 1 (100.00%) (incl.) when called from print_to_string line 124 |
40 | |||
"""
    unsafe_string(p::Ptr{UInt8}, [length::Integer])

Copy a string from the address of a C-style (NUL-terminated) string encoded as UTF-8.
(The pointer can be safely freed afterwards.) If `length` is given (the length of the
data in bytes), the string does not have to be NUL-terminated.

This function is labeled "unsafe" because it will crash if `p` is not
a valid memory address to data of the requested length.

Throws an `ArgumentError` when `p` is `C_NULL`.
"""
function unsafe_string(p::Union{Ptr{UInt8},Ptr{Int8}}, len::Integer)
    if p == C_NULL
        throw(ArgumentError("cannot convert NULL to string"))
    end
    return ccall(:jl_pchar_to_string, Ref{String}, (Ptr{UInt8}, Int), p, len)
end
function unsafe_string(p::Union{Ptr{UInt8},Ptr{Int8}})
    if p == C_NULL
        throw(ArgumentError("cannot convert NULL to string"))
    end
    return ccall(:jl_cstr_to_string, Ref{String}, (Ptr{UInt8},), p)
end
59 | |||
60 | 5 (11.90%) | 5 (11.90%) |
5 (11.90%) samples spent in _string_n
# Allocate a `String` of `n` code units through the runtime routine `jl_alloc_string`.
# Callers in this file (`getindex` on ranges, `repeat`) fill the bytes afterwards
# through `pointer`.
_string_n(n::Integer) = ccall(:jl_alloc_string, Ref{String}, (Csize_t,), n)
5 (100.00%) (ex.), 5 (100.00%) (incl.) when called from StringVector line 31 |
61 | |||
"""
    String(s::AbstractString)

Convert a string to a contiguous byte array representation encoded as UTF-8 bytes.
This representation is often appropriate for passing strings to C.
"""
function String(s::AbstractString)
    return print_to_string(s)
end

# Build a `String` from a `Symbol` by reading the symbol's name as a C string.
function String(s::Symbol)
    name_ptr = unsafe_convert(Ptr{UInt8}, s)
    return unsafe_string(name_ptr)
end
70 | |||
71 | 6 (14.29%) | 6 (14.29%) |
6 (14.29%) samples spent in StringVector
# Obtain a `Vector{UInt8}` for the string `s` from the runtime routine
# `jl_string_to_array`.
# NOTE(review): presumably the vector aliases the string's memory rather than
# copying it — confirm against the runtime before mutating the result.
unsafe_wrap(::Type{Vector{UInt8}}, s::String) = ccall(:jl_string_to_array, Ref{Vector{UInt8}}, (Any,), s)
6 (100.00%) (ex.), 6 (100.00%) (incl.) when called from #IOBuffer#302 line 114 |
72 | |||
# Copy the code units of a `String`-backed `CodeUnits` wrapper into a new byte vector.
function (::Type{Vector{UInt8}})(s::CodeUnits{UInt8,String})
    bytes = Vector{UInt8}(undef, length(s))
    return copyto!(bytes, s)
end

# Return the `String` stored in the wrapper's `s` field (no copy is made here).
function String(s::CodeUnits{UInt8,String})
    return s.s
end
76 | |||
77 | ## low-level functions ## | ||
78 | |||
# Raw pointer to the first byte of `s`.
function pointer(s::String)
    return unsafe_convert(Ptr{UInt8}, s)
end

# Raw pointer to the `i`-th byte of `s` (1-based); no bounds checking is done.
function pointer(s::String, i::Integer)
    first_byte = pointer(s)
    return first_byte + (i - 1)
end
81 | |||
# Number of code units in `s`; for `String` this is its size in bytes.
function ncodeunits(s::String)
    return Core.sizeof(s)
end

# Size of `s` in bytes.
function sizeof(s::String)
    return Core.sizeof(s)
end

# Code unit type of `String`.
function codeunit(s::String)
    return UInt8
end
85 | |||
# Return the `i`-th byte of `s`. Bounds are checked unless the caller elides the
# check with `@inbounds`; `s` is kept alive while its memory is read via raw pointer.
@inline function codeunit(s::String, i::Integer)
    @boundscheck checkbounds(s, i)
    byte = GC.@preserve s unsafe_load(pointer(s, i))
    return byte
end
90 | |||
91 | ## comparison ## | ||
92 | |||
# Three-way comparison of two strings by their bytes: compare the common prefix
# with `memcmp`, and break ties by comparing the byte lengths.
function cmp(a::String, b::String)
    na = sizeof(a)
    nb = sizeof(b)
    shared = min(na, nb)
    order = ccall(:memcmp, Int32, (Ptr{UInt8}, Ptr{UInt8}, UInt), a, b, shared)
    if order < 0
        return -1
    elseif order > 0
        return +1
    else
        return cmp(na, nb)
    end
end
99 | |||
# Two strings are equal when they have the same byte length and identical bytes.
function ==(a::String, b::String)
    nbytes = sizeof(a)
    nbytes == sizeof(b) || return false
    return 0 == ccall(:memcmp, Int32, (Ptr{UInt8}, Ptr{UInt8}, UInt), a, b, nbytes)
end
104 | |||
# The smallest `String` is the empty string.
function typemin(::Type{String})
    return ""
end

# Instance form: forwards to the type-level method.
function typemin(::String)
    return typemin(String)
end
107 | |||
108 | ## thisind, nextind ## | ||
109 | |||
110 | 1 (2.38%) | 1 (2.38%) |
1 (2.38%) samples spent in lastindex
# `thisind` for `String`: forwards to `_thisind_str`, the implementation whose
# header comment says it accepts `String` or `SubString{String}`.
thisind(s::String, i::Int) = _thisind_str(s, i)
1 (100.00%) (ex.), 1 (100.00%) (incl.) when called from tostr_sizehint line 107 |
111 | |||
112 | # s should be String or SubString{String} | ||
113 |
1 (2.38%) samples spent in _thisind_str
# Return the index of the first byte of the character that byte `i` belongs to.
# `i == 0` and `i == ncodeunits(s) + 1` are returned unchanged; any other index
# outside `1:ncodeunits(s)` throws a `BoundsError`.
function _thisind_str(s, i::Int)
    i == 0 && return 0
    n = ncodeunits(s)
    i == n + 1 && return i
    @boundscheck between(i, 1, n) || throw(BoundsError(s, i))
    @inbounds b = codeunit(s, i)
    # Walk back at most three bytes. A lead byte found `back` positions earlier
    # only owns byte `i` if it starts a sequence longer than `back` bytes, hence
    # the rising lower bound (0xc0: 2+ bytes, 0xe0: 3+ bytes, 0xf0: 4 bytes).
    for (back, lead_lo) in ((1, 0b11000000), (2, 0b11100000), (3, 0b11110000))
        # Stop when the byte just examined is not a continuation byte or we
        # would step before the start of the string.
        (b & 0xc0 == 0x80) & (i - back > 0) || return i
        @inbounds b = codeunit(s, i - back)
        between(b, lead_lo, 0b11110111) && return i - back
    end
    return i
end
130 | |||
# `nextind` for `String`: forwards to the shared implementation `_nextind_str`.
function nextind(s::String, i::Int)
    return _nextind_str(s, i)
end
132 | |||
133 | # s should be String or SubString{String} | ||
# Return the index of the byte that follows the character containing byte `i`.
# `i == 0` yields 1; any other index outside `1:ncodeunits(s)` throws a `BoundsError`.
function _nextind_str(s, i::Int)
    i == 0 && return 1
    n = ncodeunits(s)
    @boundscheck between(i, 1, n) || throw(BoundsError(s, i))
    @inbounds lead = codeunit(s, i)
    # Bytes below 0x80 or at/above 0xf8 always occupy exactly one index.
    if (lead < 0x80) | (0xf8 ≤ lead)
        return i + 1
    end
    if lead < 0xc0
        # `i` is a continuation byte: restart from the character it belongs to,
        # or step one byte when it does not belong to any character.
        start = thisind(s, i)
        return start < i ? nextind(s, start) : i + 1
    end
    # first continuation byte
    i += 1
    i > n && return i
    @inbounds cont = codeunit(s, i)
    cont & 0xc0 ≠ 0x80 && return i
    i += 1
    (i > n) | (lead < 0xe0) && return i
    # second continuation byte
    @inbounds cont = codeunit(s, i)
    cont & 0xc0 ≠ 0x80 && return i
    i += 1
    (i > n) | (lead < 0xf0) && return i
    # third continuation byte
    @inbounds cont = codeunit(s, i)
    return ifelse(cont & 0xc0 ≠ 0x80, i, i + 1)
end
157 | |||
158 | ## checking UTF-8 & ASCII validity ## | ||
159 | |||
# Classify raw bytes with the runtime routine `u8_isvalid`.
# Result: 0 = neither valid ASCII nor UTF-8, 1 = valid ASCII, 2 = valid UTF-8.
function byte_string_classify(data::Vector{UInt8})
    nbytes = length(data)
    return ccall(:u8_isvalid, Int32, (Ptr{UInt8}, Int), data, nbytes)
end
function byte_string_classify(s::String)
    nbytes = sizeof(s)
    return ccall(:u8_isvalid, Int32, (Ptr{UInt8}, Int), s, nbytes)
end
164 | # 0: neither valid ASCII nor UTF-8 | ||
165 | # 1: valid ASCII | ||
166 | # 2: valid UTF-8 | ||
167 | |||
# `true` when the bytes of `s` are valid ASCII or valid UTF-8, i.e. when
# `byte_string_classify` returns a nonzero class.
function isvalid(::Type{String}, s::Union{Vector{UInt8},String})
    return byte_string_classify(s) ≠ 0
end

# Validity of a `String` as UTF-8 data.
function isvalid(s::String)
    return isvalid(String, s)
end
170 | |||
# `true` when the top two bits of `c` are `10`, the UTF-8 continuation-byte pattern.
function is_valid_continuation(c)
    top_bits = c & 0xc0
    return top_bits == 0x80
end
172 | |||
173 | ## required core functionality ## | ||
174 | |||
# Iterate over the characters of `s`: return `(char, next_index)` for the character
# starting at byte `i`, or `nothing` once `i` is past the last code unit.
@propagate_inbounds function iterate(s::String, i::Int=firstindex(s))
    i > ncodeunits(s) && return nothing
    b = codeunit(s, i)
    # The first byte occupies the most significant byte of the `Char` bits.
    u = UInt32(b) << 24
    if between(b, 0x80, 0xf7)
        # Continuation or multi-byte lead byte: let the slow path finish the job.
        return next_continued(s, i, u)
    end
    return reinterpret(Char, u), i + 1
end
182 | |||
# Slow path of `iterate`: `u` holds the first byte of the character in its top
# 8 bits. Append up to three continuation bytes and return `(char, next_index)`.
# Decoding stops at the first byte that is not a continuation byte, at the end of
# the string, or once the lead byte's sequence length is reached.
function next_continued(s::String, i::Int, u::UInt32)
    # A lone continuation byte is a one-byte character.
    u < 0xc0000000 && return reinterpret(Char, u), i + 1
    n = ncodeunits(s)
    # first continuation byte
    i += 1
    i > n && return reinterpret(Char, u), i
    @inbounds b = codeunit(s, i)
    b & 0xc0 == 0x80 || return reinterpret(Char, u), i
    u |= UInt32(b) << 16
    # second continuation byte
    i += 1
    (i > n) | (u < 0xe0000000) && return reinterpret(Char, u), i
    @inbounds b = codeunit(s, i)
    b & 0xc0 == 0x80 || return reinterpret(Char, u), i
    u |= UInt32(b) << 8
    # third continuation byte
    i += 1
    (i > n) | (u < 0xf0000000) && return reinterpret(Char, u), i
    @inbounds b = codeunit(s, i)
    b & 0xc0 == 0x80 || return reinterpret(Char, u), i
    u |= UInt32(b)
    return reinterpret(Char, u), i + 1
end
204 | |||
# Return the character that starts at byte index `i` of `s`.
@propagate_inbounds function getindex(s::String, i::Int)
    b = codeunit(s, i)
    # The first byte occupies the most significant byte of the `Char` bits.
    u = UInt32(b) << 24
    if between(b, 0x80, 0xf7)
        # Continuation or multi-byte lead byte: handled by the slow path.
        return getindex_continued(s, i, u)
    end
    return reinterpret(Char, u)
end
211 | |||
# Slow path of `getindex(s, i)`: `u` holds byte `i` in its top 8 bits. Throws a
# `StringIndexError` when `i` is a continuation byte that is not itself a valid
# index; otherwise appends up to three continuation bytes and returns the `Char`.
function getindex_continued(s::String, i::Int, u::UInt32)
    if u < 0xc0000000
        # called from `getindex` which checks bounds
        @inbounds isvalid(s, i) && return reinterpret(Char, u)
        string_index_err(s, i)
    end
    n = ncodeunits(s)

    i += 1
    i > n && return reinterpret(Char, u)
    @inbounds b = codeunit(s, i) # cont byte 1
    b & 0xc0 == 0x80 || return reinterpret(Char, u)
    u |= UInt32(b) << 16

    i += 1
    (i > n) | (u < 0xe0000000) && return reinterpret(Char, u)
    @inbounds b = codeunit(s, i) # cont byte 2
    b & 0xc0 == 0x80 || return reinterpret(Char, u)
    u |= UInt32(b) << 8

    i += 1
    (i > n) | (u < 0xf0000000) && return reinterpret(Char, u)
    @inbounds b = codeunit(s, i) # cont byte 3
    b & 0xc0 == 0x80 || return reinterpret(Char, u)
    u |= UInt32(b)
    return reinterpret(Char, u)
end
237 | |||
# Range indexing with any integer endpoint type: convert both endpoints to `Int`
# and index again with the resulting `Int` range.
function getindex(s::String, r::UnitRange{<:Integer})
    lo = Int(first(r))
    hi = Int(last(r))
    return s[lo:hi]
end
239 | |||
# Return a new `String` holding the characters of `s` from index `first(r)` through
# index `last(r)`; both endpoints must be valid character indices.
#
# An empty range yields `""`. Otherwise, unless bounds checking is elided, a
# `BoundsError` is thrown for an out-of-range `r` and a `StringIndexError` for an
# endpoint that is not the start of a character.
function getindex(s::String, r::UnitRange{Int})
    isempty(r) && return ""
    i, j = first(r), last(r)
    @boundscheck begin
        checkbounds(s, r)
        @inbounds isvalid(s, i) || string_index_err(s, i)
        @inbounds isvalid(s, j) || string_index_err(s, j)
    end
    # Extend `j` to the last byte of the character that starts at `j`.
    j = nextind(s, j) - 1
    n = j - i + 1
    ss = _string_n(n)
    # Copy the byte span in one operation instead of a per-byte load/store loop;
    # both strings are kept alive while their raw pointers are in use.
    GC.@preserve s ss unsafe_copyto!(pointer(ss), pointer(s, i), n)
    return ss
end
257 | |||
# Number of characters in `s` between byte indices `i` and `j` inclusive.
# Unless bounds checking is elided, `i` must lie in `1:ncodeunits(s)+1` and `j` in
# `0:ncodeunits(s)`, otherwise a `BoundsError` is thrown. Returns 0 when `j < i`.
function length(s::String, i::Int, j::Int)
    @boundscheck begin
        0 < i ≤ ncodeunits(s)+1 || throw(BoundsError(s, i))
        0 ≤ j < ncodeunits(s)+1 || throw(BoundsError(s, j))
    end
    j < i && return 0
    # Move back to the start of the character that contains byte `i`.
    start = @inbounds thisind(s, i)
    # Initial count is the number of bytes from `start` to `j`, less one when
    # `start` had to move back from `i`; the 4-argument method then subtracts
    # one for each continuation byte it finds.
    c = j - start + (start == i)
    return length(s, start, j, c)
end
268 | |||
# Number of characters in `s`: start from the byte count and let the 4-argument
# method subtract the continuation bytes.
function length(s::String)
    n = ncodeunits(s)
    return length(s, 1, n, n)
end
270 | |||
# Counting kernel: starting from the initial count `c`, scan bytes `i` through `n`
# of `s` and subtract one for every continuation byte that follows a lead byte
# within that lead byte's sequence length. Returns the adjusted count.
@inline function length(s::String, i::Int, n::Int, c::Int)
    i < n || return c
    @inbounds b = codeunit(s, i)
    @inbounds while true
        # Advance until `b` is a multi-byte lead byte (0xc0 through 0xf7).
        while true
            i += 1
            i ≤ n || return c
            0xc0 ≤ b ≤ 0xf7 && break
            b = codeunit(s, i)
        end
        lead = b
        b = codeunit(s, i) # cont byte 1
        iscont = b & 0xc0 == 0x80
        c -= iscont
        iscont & (lead ≥ 0xe0) || continue

        i += 1
        i ≤ n || return c
        b = codeunit(s, i) # cont byte 2
        iscont = b & 0xc0 == 0x80
        c -= iscont
        iscont & (lead ≥ 0xf0) || continue

        i += 1
        i ≤ n || return c
        b = codeunit(s, i) # cont byte 3
        c -= (b & 0xc0 == 0x80)
    end
end
295 | |||
296 | # TODO: delete or move to char.jl | ||
# First byte of the UTF-8 encoding of `c`, stored in the top 8 bits of its `Char` bits.
function first_utf8_byte(c::Char)
    bits = reinterpret(UInt32, c)
    return (bits >> 24) % UInt8
end
298 | |||
299 | ## overload methods for efficiency ## | ||
300 | |||
# `true` when `i` is within the bounds of `s` and is the first byte of a character.
function isvalid(s::String, i::Int)
    checkbounds(Bool, s, i) || return false
    return thisind(s, i) == i
end
302 | |||
303 | # UTF-8 encoding length of a character | ||
304 | # TODO: delete or move to char.jl | ||
# Number of bytes (1 to 4) that `c` occupies: four minus the count of whole zero
# bytes at the low end of its `Char` bits. OR-ing in 0xff000000 keeps the first
# byte from ever being counted as unused.
function codelen(c::Char)
    bits = 0xff000000 | reinterpret(UInt32, c)
    unused_bytes = trailing_zeros(bits) >> 3
    return 4 - unused_bytes
end
306 | |||
"""
    repeat(c::AbstractChar, r::Integer) -> String

Return a string made of the character `c` repeated `r` times. Calling
[`c^r`](@ref ^) gives the same result.

# Examples
```jldoctest
julia> repeat('A', 3)
"AAA"
```
"""
function repeat(c::AbstractChar, r::Integer)
    # fallback: convert to `Char` and use the specialized method
    return repeat(Char(c), r)
end
# Build a `String` consisting of `c` repeated `r` times by writing the encoded
# bytes of `c` directly into a freshly allocated string.
#
# Returns `""` for `r == 0` and throws an `ArgumentError` for negative `r`.
function repeat(c::Char, r::Integer)
    r == 0 && return ""
    r < 0 && throw(ArgumentError("can't repeat a character $r times"))
    # Byte-swap so the first encoded byte sits in the lowest 8 bits of `u`.
    u = bswap(reinterpret(UInt32, c))
    # Encoded length in bytes; OR-ing 0xff means the first byte always counts.
    n = 4 - (leading_zeros(u | 0xff) >> 3)
    s = _string_n(n*r)
    p = pointer(s)
    # Keep `s` alive for as long as its memory is written through raw pointers.
    GC.@preserve s if n == 1
        ccall(:memset, Ptr{Cvoid}, (Ptr{UInt8}, Cint, Csize_t), p, u % UInt8, r)
    elseif n == 2
        p16 = reinterpret(Ptr{UInt16}, p)
        for i = 1:r
            unsafe_store!(p16, u % UInt16, i)
        end
    elseif n == 3
        # No 3-byte store exists, so write the three bytes one at a time.
        b1 = (u >> 0) % UInt8
        b2 = (u >> 8) % UInt8
        b3 = (u >> 16) % UInt8
        for i = 0:r-1
            unsafe_store!(p, b1, 3i + 1)
            unsafe_store!(p, b2, 3i + 2)
            unsafe_store!(p, b3, 3i + 3)
        end
    elseif n == 4
        p32 = reinterpret(Ptr{UInt32}, p)
        for i = 1:r
            unsafe_store!(p32, u, i)
        end
    end
    return s
end