@@ -118,15 +118,37 @@ pub fn substr(args: &[ArrayRef]) -> Result<ArrayRef> {
118
118
}
119
119
}
120
120
121
- // Return the exact byte index for [start, end), set count to -1 to ignore count
122
- fn get_true_start_end ( input : & str , start : usize , count : i64 ) -> ( usize , usize ) {
121
+ // Convert the given `start` and `count` to valid byte indices within `input` string
122
+ // Input `start` and `count` are equivalent to PostgreSQL's `substr(s, start, count)`
123
+ // `start` is 1-based; if `count` is not provided, the substring extends to the end of the string
124
+ // Input indices are character-based, and return values are byte indices
125
+ // The input bounds can be outside the string bounds; this function returns
126
+ // the intersection of the input bounds and the valid string bounds
127
+ //
128
+ // * Example
129
+ // 'Hi🌏' in-mem (`[]` for one char, `x` for one byte): [x][x][xxxx]
130
+ // `get_true_start_end('Hi🌏', 1, None) -> (0, 6)`
131
+ // `get_true_start_end('Hi🌏', 1, 1) -> (0, 1)`
132
+ // `get_true_start_end('Hi🌏', -10, 2) -> (0, 0)`
133
+ fn get_true_start_end ( input : & str , start : i64 , count : Option < u64 > ) -> ( usize , usize ) {
134
+ let start = start - 1 ;
135
+ let end = match count {
136
+ Some ( count) => start + count as i64 ,
137
+ None => input. len ( ) as i64 ,
138
+ } ;
139
+ let count_to_end = count. is_some ( ) ;
140
+
141
+ let start = start. clamp ( 0 , input. len ( ) as i64 ) as usize ;
142
+ let end = end. clamp ( 0 , input. len ( ) as i64 ) as usize ;
143
+ let count = end - start;
144
+
123
145
let ( mut st, mut ed) = ( input. len ( ) , input. len ( ) ) ;
124
146
let mut start_counting = false ;
125
147
let mut cnt = 0 ;
126
148
for ( char_cnt, ( byte_cnt, _) ) in input. char_indices ( ) . enumerate ( ) {
127
149
if char_cnt == start {
128
150
st = byte_cnt;
129
- if count != - 1 {
151
+ if count_to_end {
130
152
start_counting = true ;
131
153
} else {
132
154
break ;
@@ -153,20 +175,15 @@ fn make_and_append_view(
153
175
start : u32 ,
154
176
) {
155
177
let substr_len = substr. len ( ) ;
156
- if substr_len == 0 {
157
- null_builder . append_null ( ) ;
158
- views_buffer . push ( 0 ) ;
178
+ let sub_view = if substr_len > 12 {
179
+ let view = ByteView :: from ( * raw ) ;
180
+ make_view ( substr . as_bytes ( ) , view . buffer_index , view . offset + start )
159
181
} else {
160
- let sub_view = if substr_len > 12 {
161
- let view = ByteView :: from ( * raw) ;
162
- make_view ( substr. as_bytes ( ) , view. buffer_index , view. offset + start)
163
- } else {
164
- // inline value does not need block id or offset
165
- make_view ( substr. as_bytes ( ) , 0 , 0 )
166
- } ;
167
- views_buffer. push ( sub_view) ;
168
- null_builder. append_non_null ( ) ;
169
- }
182
+ // inline value does not need block id or offset
183
+ make_view ( substr. as_bytes ( ) , 0 , 0 )
184
+ } ;
185
+ views_buffer. push ( sub_view) ;
186
+ null_builder. append_non_null ( ) ;
170
187
}
171
188
172
189
// The decoding process refs the trait at: arrow/arrow-data/src/byte_view.rs:44
@@ -180,32 +197,26 @@ fn string_view_substr(
180
197
181
198
let start_array = as_int64_array ( & args[ 0 ] ) ?;
182
199
200
+ // For both `substr(s, i)` and `substr(s, i, cnt)`:
201
+ // if any input argument is `NULL`, the result is `NULL`
183
202
match args. len ( ) {
184
203
1 => {
185
- for ( idx, ( raw, start) ) in string_view_array
186
- . views ( )
204
+ for ( ( str_opt, raw_view) , start_opt) in string_view_array
187
205
. iter ( )
206
+ . zip ( string_view_array. views ( ) . iter ( ) )
188
207
. zip ( start_array. iter ( ) )
189
- . enumerate ( )
190
208
{
191
- if let Some ( start) = start {
192
- let start = ( start - 1 ) . max ( 0 ) as usize ;
193
-
194
- // Safety:
195
- // idx is always smaller or equal to string_view_array.views.len()
196
- unsafe {
197
- let str = string_view_array. value_unchecked ( idx) ;
198
- let ( start, end) = get_true_start_end ( str, start, -1 ) ;
199
- let substr = & str[ start..end] ;
209
+ if let ( Some ( str) , Some ( start) ) = ( str_opt, start_opt) {
210
+ let ( start, end) = get_true_start_end ( str, start, None ) ;
211
+ let substr = & str[ start..end] ;
200
212
201
- make_and_append_view (
202
- & mut views_buf,
203
- & mut null_builder,
204
- raw,
205
- substr,
206
- start as u32 ,
207
- ) ;
208
- }
213
+ make_and_append_view (
214
+ & mut views_buf,
215
+ & mut null_builder,
216
+ raw_view,
217
+ substr,
218
+ start as u32 ,
219
+ ) ;
209
220
} else {
210
221
null_builder. append_null ( ) ;
211
222
views_buf. push ( 0 ) ;
@@ -214,35 +225,31 @@ fn string_view_substr(
214
225
}
215
226
2 => {
216
227
let count_array = as_int64_array ( & args[ 1 ] ) ?;
217
- for ( idx, ( ( raw, start) , count) ) in string_view_array
218
- . views ( )
228
+ for ( ( ( str_opt, raw_view) , start_opt) , count_opt) in string_view_array
219
229
. iter ( )
230
+ . zip ( string_view_array. views ( ) . iter ( ) )
220
231
. zip ( start_array. iter ( ) )
221
232
. zip ( count_array. iter ( ) )
222
- . enumerate ( )
223
233
{
224
- if let ( Some ( start) , Some ( count) ) = ( start, count) {
225
- let start = ( start - 1 ) . max ( 0 ) as usize ;
234
+ if let ( Some ( str) , Some ( start) , Some ( count) ) =
235
+ ( str_opt, start_opt, count_opt)
236
+ {
226
237
if count < 0 {
227
238
return exec_err ! (
228
239
"negative substring length not allowed: substr(<str>, {start}, {count})"
229
240
) ;
230
241
} else {
231
- // Safety:
232
- // idx is always smaller or equal to string_view_array.views.len()
233
- unsafe {
234
- let str = string_view_array. value_unchecked ( idx) ;
235
- let ( start, end) = get_true_start_end ( str, start, count) ;
236
- let substr = & str[ start..end] ;
237
-
238
- make_and_append_view (
239
- & mut views_buf,
240
- & mut null_builder,
241
- raw,
242
- substr,
243
- start as u32 ,
244
- ) ;
245
- }
242
+ let ( start, end) =
243
+ get_true_start_end ( str, start, Some ( count as u64 ) ) ;
244
+ let substr = & str[ start..end] ;
245
+
246
+ make_and_append_view (
247
+ & mut views_buf,
248
+ & mut null_builder,
249
+ raw_view,
250
+ substr,
251
+ start as u32 ,
252
+ ) ;
246
253
}
247
254
} else {
248
255
null_builder. append_null ( ) ;
0 commit comments