Skip to content

Commit e721d35

Browse files
authored
slightly faster String allocation (#29254)
This saves only a few percent, but it essentially removes the cost of computing the size class, so it should also save on processor resources (frontend, BTP). It also makes the computation correct: previously the cutoff was slightly offset, so it would, for example, switch to the next larger size class at 609 instead of at 624.
1 parent 61696f9 commit e721d35

File tree

2 files changed

+28
-34
lines changed

2 files changed

+28
-34
lines changed

src/array.c

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -467,7 +467,8 @@ JL_DLLEXPORT jl_value_t *jl_array_to_string(jl_array_t *a)
467467

468468
JL_DLLEXPORT jl_value_t *jl_pchar_to_string(const char *str, size_t len)
469469
{
470-
jl_value_t *s = jl_gc_alloc(jl_get_ptls_states(), sizeof(size_t)+len+1, jl_string_type);
470+
size_t sz = sizeof(size_t) + len + 1; // add space for trailing \nul protector and size
471+
jl_value_t *s = jl_gc_alloc_(jl_get_ptls_states(), sz, jl_string_type); // force inlining
471472
*(size_t*)s = len;
472473
memcpy((char*)s + sizeof(size_t), str, len);
473474
((char*)s + sizeof(size_t))[len] = 0;
@@ -476,7 +477,8 @@ JL_DLLEXPORT jl_value_t *jl_pchar_to_string(const char *str, size_t len)
476477

477478
JL_DLLEXPORT jl_value_t *jl_alloc_string(size_t len)
478479
{
479-
jl_value_t *s = jl_gc_alloc(jl_get_ptls_states(), sizeof(size_t)+len+1, jl_string_type);
480+
size_t sz = sizeof(size_t) + len + 1; // add space for trailing \nul protector and size
481+
jl_value_t *s = jl_gc_alloc_(jl_get_ptls_states(), sz, jl_string_type); // force inlining
480482
*(size_t*)s = len;
481483
((char*)s + sizeof(size_t))[len] = 0;
482484
return s;

src/julia_internal.h

Lines changed: 24 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -164,7 +164,7 @@ void gc_sweep_sysimg(void);
164164

165165

166166
// pools are 16376 bytes large (GC_POOL_SZ - GC_PAGE_OFFSET)
167-
static const int jl_gc_sizeclasses[JL_GC_N_POOLS] = {
167+
static const int jl_gc_sizeclasses[] = {
168168
#ifdef _P64
169169
8,
170170
#elif defined(_CPU_ARM_) || defined(_CPU_PPC_)
@@ -197,6 +197,7 @@ static const int jl_gc_sizeclasses[JL_GC_N_POOLS] = {
197197
// 15, 14, 13, 12, 11, 10, 9, 8, /pool
198198
// 64, 32, 160, 64, 16, 64, 112, 128, bytes lost
199199
};
200+
static_assert(sizeof(jl_gc_sizeclasses) / sizeof(jl_gc_sizeclasses[0]) == JL_GC_N_POOLS, "");
200201

201202
STATIC_INLINE int jl_gc_alignment(size_t sz)
202203
{
@@ -220,35 +221,31 @@ STATIC_INLINE int jl_gc_alignment(size_t sz)
220221
}
221222
JL_DLLEXPORT int jl_alignment(size_t sz);
222223

223-
STATIC_INLINE int JL_CONST_FUNC jl_gc_szclass(size_t sz)
224+
// the following table is computed from jl_gc_sizeclasses via the formula:
225+
// [searchsortedfirst(jl_gc_sizeclasses, i) - 1 for i = 0:16:jl_gc_sizeclasses[end]]
226+
static const uint8_t szclass_table[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 20, 21, 21, 22, 22, 23, 23, 23, 24, 24, 24, 25, 25, 25, 26, 26, 27, 27, 27, 28, 28, 28, 29, 29, 29, 29, 30, 30, 30, 30, 30, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 36, 36, 36, 36, 36, 36, 36, 37, 37, 37, 37, 37, 37, 37, 37, 38, 38, 38, 38, 38, 38, 38, 38, 38, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40};
227+
static_assert(sizeof(szclass_table) == 128, "");
228+
229+
STATIC_INLINE uint8_t JL_CONST_FUNC jl_gc_szclass(unsigned sz)
224230
{
231+
assert(sz <= 2032);
232+
uint8_t klass = szclass_table[(sz + 15) / 16];
225233
#ifdef _P64
226-
if (sz <= 8)
234+
if (sz <= 8)
227235
return 0;
228236
const int N = 0;
229237
#elif defined(_CPU_ARM_) || defined(_CPU_PPC_)
230-
if (sz <= 8)
231-
return (sz + 3) / 4 - 1;
238+
if (sz <= 8)
239+
return (sz >= 4 ? 1 : 0);
232240
const int N = 1;
233241
#else
234-
if (sz <= 12)
235-
return (sz + 3) / 4 - 1;
242+
if (sz <= 12)
243+
return (sz >= 8 ? 2 : (sz >= 4 ? 1 : 0));
236244
const int N = 2;
237245
#endif
238-
if (sz <= 256)
239-
return (sz + 15) / 16 + N;
240-
if (sz <= 496)
241-
return 16 - 16376 / 4 / LLT_ALIGN(sz, 16 * 4) + 16 + N;
242-
if (sz <= 1008)
243-
return 16 - 16376 / 2 / LLT_ALIGN(sz, 16 * 2) + 24 + N;
244-
return 16 - 16376 / 1 / LLT_ALIGN(sz, 16 * 1) + 32 + N;
246+
return klass + N;
245247
}
246248

247-
#ifdef __GNUC__
248-
# define jl_is_constexpr(e) __builtin_constant_p(e)
249-
#else
250-
# define jl_is_constexpr(e) (0)
251-
#endif
252249
#define JL_SMALL_BYTE_ALIGNMENT 16
253250
#define JL_CACHE_BYTE_ALIGNMENT 64
254251
// JL_HEAP_ALIGNMENT is the maximum alignment that the GC can provide
@@ -257,23 +254,17 @@ STATIC_INLINE int JL_CONST_FUNC jl_gc_szclass(size_t sz)
257254

258255
STATIC_INLINE jl_value_t *jl_gc_alloc_(jl_ptls_t ptls, size_t sz, void *ty)
259256
{
260-
const size_t allocsz = sz + sizeof(jl_taggedvalue_t);
261-
if (allocsz < sz) // overflow in adding offs, size was "negative"
262-
jl_throw(jl_memory_exception);
263257
jl_value_t *v;
264-
if (allocsz <= GC_MAX_SZCLASS + sizeof(jl_taggedvalue_t)) {
258+
const size_t allocsz = sz + sizeof(jl_taggedvalue_t);
259+
if (sz <= GC_MAX_SZCLASS) {
265260
int pool_id = jl_gc_szclass(allocsz);
266261
jl_gc_pool_t *p = &ptls->heap.norm_pools[pool_id];
267-
int osize;
268-
if (jl_is_constexpr(allocsz)) {
269-
osize = jl_gc_sizeclasses[pool_id];
270-
}
271-
else {
272-
osize = p->osize;
273-
}
262+
int osize = jl_gc_sizeclasses[pool_id];
274263
v = jl_gc_pool_alloc(ptls, (char*)p - (char*)ptls, osize);
275264
}
276265
else {
266+
if (allocsz < sz) // overflow in adding offs, size was "negative"
267+
jl_throw(jl_memory_exception);
277268
v = jl_gc_big_alloc(ptls, allocsz);
278269
}
279270
jl_set_typeof(v, ty);
@@ -282,8 +273,9 @@ STATIC_INLINE jl_value_t *jl_gc_alloc_(jl_ptls_t ptls, size_t sz, void *ty)
282273
JL_DLLEXPORT jl_value_t *jl_gc_alloc(jl_ptls_t ptls, size_t sz, void *ty);
283274
// On GCC, only inline when sz is constant
284275
#ifdef __GNUC__
285-
# define jl_gc_alloc(ptls, sz, ty) \
286-
(__builtin_constant_p(sz) ? jl_gc_alloc_(ptls, sz, ty) : \
276+
# define jl_gc_alloc(ptls, sz, ty) \
277+
(__builtin_constant_p(sz) ? \
278+
jl_gc_alloc_(ptls, sz, ty) : \
287279
(jl_gc_alloc)(ptls, sz, ty))
288280
#else
289281
# define jl_gc_alloc(ptls, sz, ty) jl_gc_alloc_(ptls, sz, ty)

0 commit comments

Comments
 (0)