Skip to content

Commit

Permalink
Optimizations for Armv8-A
Browse files Browse the repository at this point in the history
  • Loading branch information
volyrique committed Feb 20, 2019
1 parent 1d2b8a1 commit c04a8f7
Showing 1 changed file with 76 additions and 2 deletions.
78 changes: 76 additions & 2 deletions picohttpparser.c
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,9 @@
#include <x86intrin.h>
#endif
#endif
#ifdef __ARM_NEON
#include <arm_neon.h>
#endif
#include "picohttpparser.h"

/* $Id$ */
Expand Down Expand Up @@ -73,9 +76,8 @@
#define ADVANCE_TOKEN(tok, toklen) \
do { \
const char *tok_start = buf; \
static const char ALIGNED(16) ranges2[] = "\000\040\177\177"; \
int found2; \
buf = findchar_fast(buf, buf_end, ranges2, sizeof(ranges2) - 1, &found2); \
buf = findchar_nonprintable_fast(buf, buf_end, &found2); \
if (!found2) { \
CHECK_EOF(); \
} \
Expand Down Expand Up @@ -133,6 +135,46 @@ static const char *findchar_fast(const char *buf, const char *buf_end, const cha
return buf;
}

static const char *findchar_nonprintable_fast(const char *buf, const char *buf_end, int *found)
{
#if defined(__ARM_64BIT_STATE) && defined(__ARM_FEATURE_UNALIGNED) && !defined(__ARM_BIG_ENDIAN)
*found = 0;

for (size_t i = (buf_end - buf) / sizeof(uint8x16_t); i; i--) {
// This mask makes it possible to pack the comparison result into half a vector,
// which has the same size as uint64_t.
const uint8x16_t mask = vreinterpretq_u8_u16(vmovq_n_u16(0x8008));
uint8x16_t v = vld1q_u8((const uint8_t *) buf);

v = vorrq_u8(vcltq_u8(v, vmovq_n_u8('\041')), vceqq_u8(v, vmovq_n_u8('\177')));
// After masking, a byte in the result does not have the same bits set as any of its neighbours.
v = vandq_u8(v, mask);
// Pack the comparison result into 64 bits.
v = vpaddq_u8(v, v);

uint64_t offset = vgetq_lane_u64(vreinterpretq_u64_u8(v), 0);

if (offset) {
*found = 1;
__asm__ ("rbit %x0, %x0" : "+r" (offset));
static_assert(sizeof(unsigned long long) == sizeof(uint64_t),
"Need the number of leading 0-bits in uint64_t.");
// offset uses 4 bits per byte of input.
buf += __builtin_clzll(offset) / 4;
break;
}

buf += sizeof(v);
}

return buf;
#else
static const char ALIGNED(16) ranges2[] = "\000\040\177\177";

return findchar_fast(buf, buf_end, ranges2, sizeof(ranges2) - 1, found);
#endif
}

static const char *get_token_to_eol(const char *buf, const char *buf_end, const char **token, size_t *token_len, int *ret)
{
const char *token_start = buf;
Expand All @@ -149,6 +191,38 @@ static const char *get_token_to_eol(const char *buf, const char *buf_end, const
buf = findchar_fast(buf, buf_end, ranges1, sizeof(ranges1) - 1, &found);
if (found)
goto FOUND_CTL;
#elif defined(__ARM_64BIT_STATE) && defined(__ARM_FEATURE_UNALIGNED) && !defined(__ARM_BIG_ENDIAN)
for (size_t i = (buf_end - buf) / sizeof(uint8x16_t); i; i--) {
const uint8x16_t space = vmovq_n_u8('\040');
const uint8x16_t v = vld1q_u8((const uint8_t *) buf);
uint8x16_t v2 = vcgeq_u8(vsubq_u8(v, space), vmovq_n_u8(0137u));

// Pack the comparison result into half a vector, i.e. 64 bits; the result will still be non-zero
// even if any adjacent bytes are the same (either 0 or 0xFF).
v2 = vpaddq_u8(v2, v2);

if (vgetq_lane_u64(vreinterpretq_u64_u8(v2), 0)) {
v2 = vandq_u8(vcltq_u8(v, space), vmvnq_u8(vceqq_u8(v, vmovq_n_u8('\011'))));
v2 = vorrq_u8(v2, vceqq_u8(v, vmovq_n_u8('\177')));
// After masking, a byte in the result does not have the same bits set as any of its neighbours.
v2 = vandq_u8(v2, vreinterpretq_u8_u16(vmovq_n_u16(0x8008)));
// Pack the comparison result into 64 bits.
v2 = vpaddq_u8(v2, v2);

uint64_t offset = vgetq_lane_u64(vreinterpretq_u64_u8(v2), 0);

if (offset) {
__asm__ ("rbit %x0, %x0" : "+r" (offset));
static_assert(sizeof(unsigned long long) == sizeof(uint64_t),
"Need the number of leading 0-bits in uint64_t.");
// offset uses 4 bits per byte of input.
buf += __builtin_clzll(offset) / 4;
goto FOUND_CTL;
}
}

buf += sizeof(v);
}
#else
/* find non-printable char within the next 8 bytes, this is the hottest code; manually inlined */
while (likely(buf_end - buf >= 8)) {
Expand Down

0 comments on commit c04a8f7

Please sign in to comment.