Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Port to web assembly #367

Open
wants to merge 13 commits into
base: develop
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 32 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -6,6 +6,9 @@ set (HS_MINOR_VERSION 4)
set (HS_PATCH_VERSION 0)
set (HS_VERSION ${HS_MAJOR_VERSION}.${HS_MINOR_VERSION}.${HS_PATCH_VERSION})

option(HYPERSCAN_WASM32 "Enable hyperscan compilation to wasm32" OFF)
option(HYPERSCAN_EMSCRIPTEN_BOOST "Use boost library that is bundled with emscripten" OFF)

set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake)
include(CheckCCompilerFlag)
include(CheckCXXCompilerFlag)
@@ -40,6 +43,11 @@ endif()
set(BINDIR "${PROJECT_BINARY_DIR}/bin")
set(LIBDIR "${PROJECT_BINARY_DIR}/lib")

if (HYPERSCAN_WASM32)
string(APPEND CMAKE_C_FLAGS " -msse -msse2 -mssse3 -msimd128")
string(APPEND CMAKE_CXX_FLAGS " -msse -msse2 -mssse3 -msimd128 -fwasm-exceptions")
endif()

set(INCLUDE_INSTALL_DIR ${CMAKE_INSTALL_INCLUDEDIR})

# First for the generic no-config case
@@ -64,12 +72,21 @@ if (XCODE OR CMAKE_CXX_COMPILER_ID MATCHES "Intel")
set(CMAKE_INCLUDE_SYSTEM_FLAG_CXX "-isystem")
endif ()

if (HYPERSCAN_WASM32)
set(CMAKE_INCLUDE_SYSTEM_FLAG_CXX "-Xclang -isystem")
endif()

set(CMAKE_INCLUDE_CURRENT_DIR 1)
include_directories(${PROJECT_SOURCE_DIR}/src)
include_directories(${PROJECT_BINARY_DIR})
include_directories(SYSTEM include)

include (${CMAKE_MODULE_PATH}/boost.cmake)
if (HYPERSCAN_EMSCRIPTEN_BOOST)
string(APPEND CMAKE_CXX_FLAGS " -sUSE_BOOST_HEADERS")
else()
include (${CMAKE_MODULE_PATH}/boost.cmake)
endif()


# -- make this work? set(python_ADDITIONAL_VERSIONS 2.7 2.6)
find_package(PythonInterp)
@@ -294,6 +311,11 @@ CHECK_INCLUDE_FILE_CXX(intrin.h HAVE_CXX_INTRIN_H)
CHECK_INCLUDE_FILES(x86intrin.h HAVE_C_X86INTRIN_H)
CHECK_INCLUDE_FILE_CXX(x86intrin.h HAVE_CXX_X86INTRIN_H)

if (HYPERSCAN_WASM32)
CHECK_INCLUDE_FILES(tmmintrin.h HAVE_C_TMMINTRIN_H)
set(HAVE_CXX_TMMINTRIN_H 1)
endif()

CHECK_FUNCTION_EXISTS(posix_memalign HAVE_POSIX_MEMALIGN)
CHECK_FUNCTION_EXISTS(_aligned_malloc HAVE__ALIGNED_MALLOC)

@@ -482,7 +504,9 @@ if (CORRECT_PCRE_VERSION AND PCRE_BUILD_SOURCE AND BUILD_STATIC_LIBS)
set(BUILD_CHIMERA TRUE)
endif()

add_subdirectory(unit)
if (NOT HYPERSCAN_WASM32)
add_subdirectory(unit)
endif()
if (EXISTS ${CMAKE_SOURCE_DIR}/tools/CMakeLists.txt)
add_subdirectory(tools)
endif()
@@ -1190,6 +1214,12 @@ if (NOT FAT_RUNTIME)
src/hs_valid_platform.c
$<TARGET_OBJECTS:hs_exec>
$<TARGET_OBJECTS:hs_compile>)
if (HYPERSCAN_WASM32)
target_compile_definitions(hs_exec PRIVATE WASM32)
target_compile_definitions(hs_runtime PRIVATE WASM32)
target_compile_definitions(hs_compile PRIVATE WASM32)
target_compile_definitions(hs PRIVATE WASM32)
endif()
endif (BUILD_STATIC_LIBS)

if (BUILD_STATIC_AND_SHARED OR BUILD_SHARED_LIBS)
4 changes: 4 additions & 0 deletions chimera/ch_runtime.c
Original file line number Diff line number Diff line change
@@ -326,6 +326,10 @@ ch_error_t catchupPcre(struct HybridContext *hyctx, unsigned int id,
} else if (cbrv == CH_CALLBACK_SKIP_PATTERN) {
DEBUG_PRINTF("user callback told us to skip this pattern\n");
pd->scanStart = hyctx->length;
if (top_id == id) {
break;
}
continue;
}

if (top_id == id) {
2 changes: 2 additions & 0 deletions cmake/arch.cmake
Original file line number Diff line number Diff line change
@@ -6,6 +6,8 @@ if (HAVE_C_X86INTRIN_H)
set (INTRIN_INC_H "x86intrin.h")
elseif (HAVE_C_INTRIN_H)
set (INTRIN_INC_H "intrin.h")
elseif (HAVE_C_TMMINTRIN_H)
set (INTRIN_INC_H "tmmintrin.h")
else ()
message (FATAL_ERROR "No intrinsics header found")
endif ()
2 changes: 1 addition & 1 deletion cmake/build_wrapper.sh
Original file line number Diff line number Diff line change
@@ -17,7 +17,7 @@ KEEPSYMS=$(mktemp -p /tmp keep.syms.XXXXX)
LIBC_SO=$("$@" --print-file-name=libc.so.6)
cp ${KEEPSYMS_IN} ${KEEPSYMS}
# get all symbols from libc and turn them into patterns
nm -f p -g -D ${LIBC_SO} | sed -s 's/\([^ ]*\).*/^\1$/' >> ${KEEPSYMS}
nm -f p -g -D ${LIBC_SO} | sed -s 's/\([^ @]*\).*/^\1$/' >> ${KEEPSYMS}
# build the object
"$@"
# rename the symbols in the object
6 changes: 6 additions & 0 deletions cmake/config.h.in
Original file line number Diff line number Diff line change
@@ -48,6 +48,12 @@
/* C compiler has intrin.h */
#cmakedefine HAVE_C_INTRIN_H

/* C++ compiler has tmmintrin.h */
#cmakedefine HAVE_CXX_TMMINTRIN_H

/* C compiler has tmmintrin.h */
#cmakedefine HAVE_C_TMMINTRIN_H

/* Define to 1 if you have the declaration of `pthread_setaffinity_np', and to
0 if you don't. */
#cmakedefine HAVE_DECL_PTHREAD_SETAFFINITY_NP
4 changes: 4 additions & 0 deletions cmake/platform.cmake
Original file line number Diff line number Diff line change
@@ -5,5 +5,9 @@ CHECK_C_SOURCE_COMPILES("#if !(defined(__x86_64__) || defined(_M_X64))\n#error n

CHECK_C_SOURCE_COMPILES("#if !(defined(__i386__) || defined(_M_IX86))\n#error not 64bit\n#endif\nint main(void) { return 0; }" ARCH_32_BIT)

if (HYPERSCAN_WASM32)
set(ARCH_32_BIT 1)
endif()

set(ARCH_X86_64 ${ARCH_64_BIT})
set(ARCH_IA32 ${ARCH_32_BIT})
9 changes: 7 additions & 2 deletions src/compiler/compiler.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2015-2020, Intel Corporation
* Copyright (c) 2015-2021, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@@ -323,7 +323,8 @@ void addExpression(NG &ng, unsigned index, const char *expression,
}

// Ensure that our pattern isn't too long (in characters).
if (strlen(expression) > cc.grey.limitPatternLength) {
size_t maxlen = cc.grey.limitPatternLength + 1;
if (strnlen(expression, maxlen) >= maxlen) {
throw CompileError("Pattern length exceeds limit.");
}

@@ -416,6 +417,10 @@ void addLitExpression(NG &ng, unsigned index, const char *expression,
"HS_FLAG_SOM_LEFTMOST are supported in literal API.");
}

if (!strcmp(expression, "")) {
throw CompileError("Pure literal API doesn't support empty string.");
}

// This expression must be a pure literal, we can build ue2_literal
// directly based on expression text.
ParsedLitExpression ple(index, expression, expLength, flags, id);
8 changes: 7 additions & 1 deletion src/hs.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2015-2020, Intel Corporation
* Copyright (c) 2015-2021, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@@ -514,6 +514,12 @@ hs_error_t hs_expression_info_int(const char *expression, unsigned int flags,
return HS_COMPILER_ERROR;
}

if (flags & HS_FLAG_COMBINATION) {
*error = generateCompileError("Invalid parameter: unsupported "
"logical combination expression", -1);
return HS_COMPILER_ERROR;
}

*info = nullptr;
*error = nullptr;

12 changes: 3 additions & 9 deletions src/hs_compile.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2015-2020, Intel Corporation
* Copyright (c) 2015-2021, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@@ -748,10 +748,7 @@ hs_error_t HS_CDECL hs_free_compile_error(hs_compile_error_t *error);
* - HS_FLAG_PREFILTER - Compile pattern in prefiltering mode.
* - HS_FLAG_SOM_LEFTMOST - Report the leftmost start of match offset
* when a match is found.
* - HS_FLAG_COMBINATION - Parse the expression in logical combination
* syntax.
* - HS_FLAG_QUIET - Ignore match reporting for this expression. Used for
* the sub-expressions in logical combinations.
* - HS_FLAG_QUIET - This flag will be ignored.
*
* @param info
* On success, a pointer to the pattern information will be returned in
@@ -814,10 +811,7 @@ hs_error_t HS_CDECL hs_expression_info(const char *expression,
* - HS_FLAG_PREFILTER - Compile pattern in prefiltering mode.
* - HS_FLAG_SOM_LEFTMOST - Report the leftmost start of match offset
* when a match is found.
* - HS_FLAG_COMBINATION - Parse the expression in logical combination
* syntax.
* - HS_FLAG_QUIET - Ignore match reporting for this expression. Used for
* the sub-expressions in logical combinations.
* - HS_FLAG_QUIET - This flag will be ignored.
*
* @param ext
* A pointer to a filled @ref hs_expr_ext_t structure that defines
6 changes: 4 additions & 2 deletions src/hs_internal.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019, Intel Corporation
* Copyright (c) 2019-2021, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@@ -80,7 +80,9 @@ extern "C"
| HS_FLAG_PREFILTER \
| HS_FLAG_SINGLEMATCH \
| HS_FLAG_ALLOWEMPTY \
| HS_FLAG_SOM_LEFTMOST)
| HS_FLAG_SOM_LEFTMOST \
| HS_FLAG_COMBINATION \
| HS_FLAG_QUIET)

#ifdef __cplusplus
} /* extern "C" */
6 changes: 4 additions & 2 deletions src/nfa/mcclellancompile.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2015-2020, Intel Corporation
* Copyright (c) 2015-2021, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@@ -1082,7 +1082,9 @@ void find_better_daddy(dfa_info &info, dstate_id_t curr_id, bool using8bit,
// Use the daddy already set for this state so long as it isn't already
// a Sherman state.
dstate_id_t daddy = currState.daddy;
if (!info.is_sherman(daddy) && !info.is_widestate(daddy)) {
if (info.is_widestate(daddy)) {
return;
} else if (!info.is_sherman(daddy)) {
hinted.insert(currState.daddy);
} else {
// Fall back to granddaddy, which has already been processed (due
23 changes: 22 additions & 1 deletion src/rose/program_runtime.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2015-2020, Intel Corporation
* Copyright (c) 2015-2021, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@@ -3110,6 +3110,7 @@ hwlmcb_rv_t roseRunProgram_l(const struct RoseEngine *t,

const char in_catchup = prog_flags & ROSE_PROG_FLAG_IN_CATCHUP;
const char from_mpv = prog_flags & ROSE_PROG_FLAG_FROM_MPV;
const char skip_mpv_catchup = prog_flags & ROSE_PROG_FLAG_SKIP_MPV_CATCHUP;

const char *pc_base = getByOffset(t, programOffset);
const char *pc = pc_base;
@@ -3206,13 +3207,33 @@ hwlmcb_rv_t roseRunProgram_l(const struct RoseEngine *t,
}
L_PROGRAM_NEXT_INSTRUCTION

L_PROGRAM_CASE(CATCH_UP_MPV) {
if (from_mpv || skip_mpv_catchup) {
DEBUG_PRINTF("skipping mpv catchup\n");
} else if (roseCatchUpMPV(t,
end - scratch->core_info.buf_offset,
scratch) == HWLM_TERMINATE_MATCHING) {
return HWLM_TERMINATE_MATCHING;
}
}
L_PROGRAM_NEXT_INSTRUCTION

L_PROGRAM_CASE(SOM_FROM_REPORT) {
som = handleSomExternal(scratch, &ri->som, end);
DEBUG_PRINTF("som from report %u is %llu\n", ri->som.onmatch,
som);
}
L_PROGRAM_NEXT_INSTRUCTION

L_PROGRAM_CASE(TRIGGER_SUFFIX) {
if (roseTriggerSuffix(t, scratch, ri->queue, ri->event, som,
end) == HWLM_TERMINATE_MATCHING) {
return HWLM_TERMINATE_MATCHING;
}
work_done = 1;
}
L_PROGRAM_NEXT_INSTRUCTION

L_PROGRAM_CASE(DEDUPE) {
updateSeqPoint(tctxt, end, from_mpv);
const char do_som = t->hasSom; // TODO: constant propagate
6 changes: 5 additions & 1 deletion src/util/alloc.h
Original file line number Diff line number Diff line change
@@ -76,7 +76,11 @@ class AlignedAllocator {

T *allocate(std::size_t size) const {
size_t alloc_size = size * sizeof(T);
return static_cast<T *>(aligned_malloc_internal(alloc_size, N));
T *ptr = static_cast<T *>(aligned_malloc_internal(alloc_size, N));
if (!ptr) {
throw std::bad_alloc();
}
return ptr;
}

void deallocate(T *x, std::size_t) const noexcept {
2 changes: 1 addition & 1 deletion src/util/arch.h
Original file line number Diff line number Diff line change
@@ -83,7 +83,7 @@
/*
* MSVC uses a different form of inline asm
*/
#if defined(_WIN32) && defined(_MSC_VER)
#if defined(WASM32) || defined(_WIN32) && defined(_MSC_VER)
#define NO_ASM
#endif

2 changes: 1 addition & 1 deletion src/util/cpuid_flags.c
Original file line number Diff line number Diff line change
@@ -33,7 +33,7 @@
#include "hs_internal.h"
#include "util/arch.h"

#if !defined(_WIN32) && !defined(CPUID_H_)
#if !defined(_WIN32) && !defined(CPUID_H_) && !defined(WASM32)
#include <cpuid.h>
#endif

2 changes: 1 addition & 1 deletion src/util/cpuid_flags.h
Original file line number Diff line number Diff line change
@@ -31,7 +31,7 @@

#include "ue2common.h"

#if !defined(_WIN32) && !defined(CPUID_H_)
#if !defined(_WIN32) && !defined(CPUID_H_) && !defined(WASM32)
#include <cpuid.h>
/* system header doesn't have a header guard */
#define CPUID_H_
16 changes: 13 additions & 3 deletions src/util/cpuid_inline.h
Original file line number Diff line number Diff line change
@@ -32,7 +32,7 @@
#include "ue2common.h"
#include "cpuid_flags.h"

#if !defined(_WIN32) && !defined(CPUID_H_)
#if !defined(_WIN32) && !defined(CPUID_H_) && !defined(WASM32)
#include <cpuid.h>
/* system header doesn't have a header guard */
#define CPUID_H_
@@ -46,7 +46,14 @@ extern "C"
static inline
void cpuid(unsigned int op, unsigned int leaf, unsigned int *eax,
unsigned int *ebx, unsigned int *ecx, unsigned int *edx) {
#ifndef _WIN32
#if defined(WASM32)
(void)(op);
(void)(leaf);
*eax = 0;
*ebx = 0;
*ecx = 0;
*edx = 0;
#elif !defined(_WIN32)
__cpuid_count(op, leaf, *eax, *ebx, *ecx, *edx);
#else
int a[4];
@@ -95,7 +102,10 @@ void cpuid(unsigned int op, unsigned int leaf, unsigned int *eax,

static inline
u64a xgetbv(u32 op) {
#if defined(_WIN32) || defined(__INTEL_COMPILER)
#if defined(WASM32)
(void) op;
return 0;
#elif defined(_WIN32) || defined(__INTEL_COMPILER)
return _xgetbv(op);
#else
u32 a, d;
12 changes: 12 additions & 0 deletions src/util/intrinsics.h
Original file line number Diff line number Diff line change
@@ -55,10 +55,22 @@
# endif
#endif

#ifdef __cplusplus
# if defined(HAVE_CXX_TMMINTRIN_H)
# define USE_TMMINTRIN_H
# endif
#else // C
# if defined(HAVE_C_TMMINTRIN_H)
# define USE_TMMINTRIN_H
# endif
#endif

#if defined(USE_X86INTRIN_H)
#include <x86intrin.h>
#elif defined(USE_INTRIN_H)
#include <intrin.h>
#elif defined(USE_TMMINTRIN_H)
#include <tmmintrin.h>
#else
#error no intrinsics file
#endif
26 changes: 13 additions & 13 deletions src/util/simd_utils.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2015-2020, Intel Corporation
* Copyright (c) 2015-2021, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@@ -156,6 +156,16 @@ static really_inline u32 movd(const m128 in) {
return _mm_cvtsi128_si32(in);
}

static really_inline u64a movq(const m128 in) {
#if defined(ARCH_X86_64)
return _mm_cvtsi128_si64(in);
#else // 32-bit - this is horrific
u32 lo = movd(in);
u32 hi = movd(_mm_srli_epi64(in, 32));
return (u64a)hi << 32 | lo;
#endif
}

#if defined(HAVE_AVX512)
static really_inline u32 movd512(const m512 in) {
// NOTE: seems gcc doesn't support _mm512_cvtsi512_si32(in),
@@ -166,20 +176,10 @@ static really_inline u32 movd512(const m512 in) {
static really_inline u64a movq512(const m512 in) {
// NOTE: seems AVX512 doesn't support _mm512_cvtsi512_si64(in),
// so we use 2-step convertions to work around.
return _mm_cvtsi128_si64(_mm512_castsi512_si128(in));
return movq(_mm512_castsi512_si128(in));
}
#endif

static really_inline u64a movq(const m128 in) {
#if defined(ARCH_X86_64)
return _mm_cvtsi128_si64(in);
#else // 32-bit - this is horrific
u32 lo = movd(in);
u32 hi = movd(_mm_srli_epi64(in, 32));
return (u64a)hi << 32 | lo;
#endif
}

/* another form of movq */
static really_inline
m128 load_m128_from_u64a(const u64a *p) {
@@ -791,7 +791,7 @@ m128 movdq_lo(m256 x) {
#define lshift128_m256(a, count_immed) _mm256_slli_si256(a, count_immed)
#define extract64from256(a, imm) _mm_extract_epi64(_mm256_extracti128_si256(a, imm >> 1), imm % 2)
#define extract32from256(a, imm) _mm_extract_epi32(_mm256_extracti128_si256(a, imm >> 2), imm % 4)
#define extractlow64from256(a) _mm_cvtsi128_si64(cast256to128(a))
#define extractlow64from256(a) movq(cast256to128(a))
#define extractlow32from256(a) movd(cast256to128(a))
#define interleave256hi(a, b) _mm256_unpackhi_epi8(a, b)
#define interleave256lo(a, b) _mm256_unpacklo_epi8(a, b)
4 changes: 2 additions & 2 deletions util/ng_corpus_editor.cpp
Original file line number Diff line number Diff line change
@@ -268,12 +268,12 @@ void CorpusEditorUtf8::flip_case(vector<unichar> &corpus) {
unichar CorpusEditorUtf8::chooseCodePoint(void) {
/* We need to ensure that we don't pick a surrogate cp */
const u32 range =
MAX_UNICODE + 1 - (UNICODE_SURROGATE_MAX + UNICODE_SURROGATE_MIN + 1);
MAX_UNICODE + 1 - (UNICODE_SURROGATE_MAX - UNICODE_SURROGATE_MIN + 1);
unichar raw = props.rand(0, range - 1);
if (raw < UNICODE_SURROGATE_MIN) {
return raw;
} else {
return raw + UNICODE_SURROGATE_MAX + 1;
return raw + UNICODE_SURROGATE_MAX - UNICODE_SURROGATE_MIN + 1;
}
}

4 changes: 2 additions & 2 deletions util/ng_corpus_generator.cpp
Original file line number Diff line number Diff line change
@@ -477,14 +477,14 @@ void CorpusGeneratorUtf8::generateCorpus(vector<string> &data) {
* that we've been asked for. */
unichar CorpusGeneratorUtf8::getRandomChar() {
u32 range = MAX_UNICODE + 1
- (UNICODE_SURROGATE_MAX + UNICODE_SURROGATE_MIN + 1);
- (UNICODE_SURROGATE_MAX - UNICODE_SURROGATE_MIN + 1);
range = min(cProps.alphabetSize, range);
assert(range);

unichar c = 'a' + cProps.rand(0, range - 1);

if (c >= UNICODE_SURROGATE_MIN) {
c =+ UNICODE_SURROGATE_MAX + 1;
c += UNICODE_SURROGATE_MAX - UNICODE_SURROGATE_MIN + 1;
}

return c % (MAX_UNICODE + 1);