Skip to content

Commit 6e7e9fc

Browse files
committed
Draft of the extended vectorscan API
This commit extends the vectorscan API with access to low-level algorithms. This would enable a developer to bypass most of the overhead of the regular vectorscan scan when the pattern to check is simple. Currently it only targets pure literal patterns. Signed-off-by: Yoan Picchi <[email protected]>
1 parent 4f09e78 commit 6e7e9fc

File tree

7 files changed

+894
-0
lines changed

7 files changed

+894
-0
lines changed

CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -291,6 +291,7 @@ set (hs_exec_SRCS
291291
src/crc32.h
292292
src/report.h
293293
src/runtime.c
294+
src/hs_extended_api.cpp
294295
src/stream_compress.c
295296
src/stream_compress.h
296297
src/stream_compress_impl.h

benchmarks/CMakeLists.txt

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,4 +6,9 @@ if (NOT FAT_RUNTIME AND (BUILD_SHARED_LIBS OR BUILD_STATIC_LIBS))
66
set_source_files_properties(benchmarks.cpp PROPERTIES COMPILE_FLAGS
77
"-Wall -Wno-unused-variable")
88
target_link_libraries(benchmarks hs)
9+
10+
add_executable(test_api test_extended_api.cpp)
11+
set_source_files_properties(test_extended_api.cpp PROPERTIES COMPILE_FLAGS
12+
"-Wall -Wno-unused-variable")
13+
target_link_libraries(test_api hs)
914
endif()

benchmarks/test_extended_api.cpp

Lines changed: 161 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,161 @@
1+
/*
2+
* Copyright (c) 2024-2025, Arm ltd
3+
*
4+
* Redistribution and use in source and binary forms, with or without
5+
* modification, are permitted provided that the following conditions are met:
6+
*
7+
* * Redistributions of source code must retain the above copyright notice,
8+
* this list of conditions and the following disclaimer.
9+
* * Redistributions in binary form must reproduce the above copyright
10+
* notice, this list of conditions and the following disclaimer in the
11+
* documentation and/or other materials provided with the distribution.
12+
* * Neither the name of Intel Corporation nor the names of its contributors
13+
* may be used to endorse or promote products derived from this software
14+
* without specific prior written permission.
15+
*
16+
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17+
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18+
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19+
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20+
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21+
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22+
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23+
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24+
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25+
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26+
* POSSIBILITY OF SUCH DAMAGE.
27+
*/
28+
29+
#include <cstdlib>
30+
#include <cstring>
31+
#include <iostream>
32+
33+
#include "hs_compile.h"
34+
#include "hs_runtime.h"
35+
36+
#include "hwlm/hwlm_literal.h"
37+
#include "hwlm/noodle_build.h"
38+
#include "hwlm/noodle_engine.h"
39+
#include "hwlm/noodle_internal.h"
40+
41+
/* Lowercase haystack scanned by test_noodle(); buf1_len is its length in bytes. */
const char *buf1 = "azertyuioperty";
42+
int buf1_len = 14;
43+
/* Uppercase counterpart of buf1, scanned by test_noodle() with the "ERT" pattern. */
const char *buf2 = "AZERTYUIOPERTY";
44+
int buf2_len = 14;
45+
46+
/* Bookkeeping shared between test_noodle() and callback() for one scan. */
typedef struct context {
47+
/* array of end offsets in the scanned buffer where a match is expected */
48+
size_t *expected_array;
49+
/* number of entries of expected_array that callback() compares against */
size_t array_size;
50+
/* counter of matches happening at a position in expected_array */
51+
size_t number_matched;
52+
/* counter of matches happening at a position NOT in expected_array */
53+
size_t number_wrong;
54+
} context_t;
55+
56+
/**
 * Match callback handed to hs_short_literal_search() by test_noodle().
 *
 * Compares end_offset against the first array_size entries of the context's
 * expected_array. On a hit number_matched is incremented; otherwise
 * number_wrong is incremented and the offending offset is printed.
 * id, start and flags are ignored.
 *
 * @param end_offset   offset reported for the match
 * @param raw_context  pointer to a context_t, received as void *
 * @return always CB_CONTINUE_MATCHING
 */
int callback(unsigned int id, unsigned long long start,
57+
unsigned long long end_offset, unsigned int flags,
58+
void *raw_context) {
59+
(void)id;
60+
(void)start;
61+
(void)flags;
62+
context_t *context = (context_t *)raw_context;
63+
bool matched = false;
64+
// Check whether the match is expected (the whole array is always walked)
65+
for (size_t i = 0; i < context->array_size; i++) {
66+
if (end_offset == context->expected_array[i]) {
67+
matched = true;
68+
}
69+
}
70+
// Tally the right counter depending on whether the match was expected
71+
if (matched) {
72+
context->number_matched += 1;
73+
} else {
74+
context->number_wrong += 1;
75+
printf("unplanned match at index %llu\n", end_offset);
76+
}
77+
78+
return CB_CONTINUE_MATCHING;
79+
}
80+
81+
/**
 * Exercises hs_compile_short_literal_search() / hs_short_literal_search()
 * with the 3-byte patterns "ert" and "ERT" on buf1 and buf2, each scanned
 * once from the start and once from offset 4.
 *
 * @return 1 if compiling a pattern or running a scan does not return
 *         HS_SUCCESS, 0 otherwise.
 *
 * NOTE(review): a match-count mismatch is only printed; the function still
 * returns 0, so main() prints "all test passed" even when matches are missed.
 * NOTE(review): context.number_wrong is never inspected here; unexpected
 * matches are only visible through the message printed by callback().
 * NOTE(review): the mismatch messages print size_t values with %lu; %zu is the
 * portable conversion for size_t.
 * NOTE(review): printf is used but no <cstdio> include is visible in this
 * file; it presumably arrives through another header - confirm.
 */
int test_noodle() {
82+
const char *pattern = "ert";
83+
hs_short_literal_compiled_pattern_t noodle_database;
84+
85+
hs_error_t ret =
86+
hs_compile_short_literal_search(pattern, 3, &noodle_database);
87+
if (ret != HS_SUCCESS) {
88+
printf("Fail to build the pattern\n");
89+
return 1;
90+
}
91+
92+
// Case 1: "ert" in buf1; the test expects matches reported at offsets 4 and 12.
size_t expected_array[2] = {4, 12};
93+
context_t context = {&(expected_array[0]), 2, 0, 0};
94+
ret = hs_short_literal_search(&noodle_database, buf1, buf1_len, nullptr,
95+
callback, &context);
96+
if (ret != HS_SUCCESS) {
97+
printf("Fail to run noodle\n");
98+
return 1;
99+
}
100+
if (context.number_matched != context.array_size) {
101+
printf("1- missed some matches. Expected: %lu, got %lu\n",
102+
context.array_size, context.number_matched);
103+
}
104+
105+
// Case 2: same pattern on buf1 + 4; only one match is expected, at offset 8
// relative to the start of the shortened buffer.
expected_array[0] = 8;
106+
context = {&(expected_array[0]), 1, 0, 0};
107+
ret = hs_short_literal_search(&noodle_database, buf1 + 4, buf1_len - 4,
108+
nullptr, callback, &context);
109+
if (ret != HS_SUCCESS) {
110+
printf("Fail to run noodle\n");
111+
return 1;
112+
}
113+
if (context.number_matched != context.array_size) {
114+
printf("2- missed some matches. Expected: %lu, got %lu\n",
115+
context.array_size, context.number_matched);
116+
}
117+
118+
// Recompile into the same storage with the uppercase pattern.
pattern = "ERT";
119+
ret = hs_compile_short_literal_search(pattern, 3, &noodle_database);
120+
if (ret != HS_SUCCESS) {
121+
printf("Fail to build the pattern\n");
122+
return 1;
123+
}
124+
125+
// Case 3: "ERT" in buf2. expected_array[1] still holds 12 from its
// initialisation, so the expected offsets are again 4 and 12.
expected_array[0] = 4;
126+
context = {&(expected_array[0]), 2, 0, 0};
127+
ret = hs_short_literal_search(&noodle_database, buf2, buf2_len, nullptr,
128+
callback, &context);
129+
if (ret != HS_SUCCESS) {
130+
printf("Fail to run noodle\n");
131+
return 1;
132+
}
133+
if (context.number_matched != context.array_size) {
134+
printf("3- missed some matches. Expected: %lu, got %lu\n",
135+
context.array_size, context.number_matched);
136+
}
137+
138+
// Case 4: "ERT" on buf2 + 4; one match expected at offset 8.
expected_array[0] = 8;
139+
context = {&(expected_array[0]), 1, 0, 0};
140+
ret = hs_short_literal_search(&noodle_database, buf2 + 4, buf2_len - 4,
141+
nullptr, callback, &context);
142+
if (ret != HS_SUCCESS) {
143+
printf("Fail to run noodle\n");
144+
return 1;
145+
}
146+
if (context.number_matched != context.array_size) {
147+
printf("4- missed some matches. Expected: %lu, got %lu\n",
148+
context.array_size, context.number_matched);
149+
}
150+
151+
return 0;
152+
}
153+
154+
/**
 * Runs test_noodle() and prints a success line when it returns 0.
 *
 * NOTE(review): the process exit status is 0 whatever test_noodle() returns,
 * so a failing run cannot be detected from the exit code.
 */
int main() {
155+
// test_plain_noodle();
156+
if (!test_noodle()) {
157+
printf("all test passed\n");
158+
}
159+
160+
return 0;
161+
}

src/hs_common.h

Lines changed: 150 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
/*
22
* Copyright (c) 2015-2019, Intel Corporation
3+
* Copyright (c) 2024-2025, Arm ltd
34
*
45
* Redistribution and use in source and binary forms, with or without
56
* modification, are permitted provided that the following conditions are met:
@@ -583,6 +584,155 @@ hs_error_t HS_CDECL hs_valid_platform(void);
583584
*/
584585
#define HS_UNKNOWN_ERROR (-13)
585586

587+
/**
588+
* The following functions and types are part of the extended API and are not
589+
* cross compatible with hyperscan. This extension intends to provide the
590+
* developer with minimal overhead search functions.
591+
*/
592+
593+
/**
594+
* The size threshold after which a pattern is considered long and must be fed
595+
* to @ref hs_compile_long_literal_search(). Patterns up to this length may be
596+
* fed to hs_compile_short_literal_search() instead.
597+
*/
598+
#define HS_SHORT_PATTERN_THRESHOLD 16
599+
600+
/**
601+
* The compiled pattern type for searching short literals
602+
* Generated by @ref hs_compile_short_literal_search()
603+
*/
604+
typedef struct {
605+
unsigned char data[32];
606+
} hs_short_literal_compiled_pattern_t;
607+
608+
/**
609+
* The compiled pattern type for searching long literals
610+
* Generated by @ref hs_compile_long_literal_search()
611+
* Note that given the unbounded nature of the pattern size, it is impossible to
612+
* give a constant size to the structure. You'll need to use @ref
613+
* hs_get_long_literal_search_database_size() and allocate enough memory to
614+
* store the compiled pattern.
615+
*/
616+
typedef struct hs_long_literal_compiled_pattern hs_long_literal_compiled_pattern_t;
617+
618+
/**
619+
* RFC: hs_compile_long_literal_search() and a few others don't have any obvious
620+
* bounds on the size of the pattern. To make it generic we need to get the size
621+
* at runtime. The issue is that it is slow. I'd like to also provide an
622+
* optional static size in some way when the backing algorithm allows it. It
623+
* might be in the form of a polynomial function in a #define that takes in the
624+
* expected max size of the pattern and the number of patterns. This would allow
625+
* the user to use a static size in their own algo if they know in advance what
626+
* kind of pattern they'll use. The problem with this though is that I'm not
627+
* sure how often such a feature would be used and I don't want to clutter the
628+
* API. Any feedback?
629+
*/
630+
631+
/**
632+
* This function calculates the size needed to store the compiled version of the
633+
* given @p expression.
634+
*
635+
* @param expression
636+
* The expression to parse. Note that this string must represent ONLY the
637+
* pattern to be matched, with no delimiters. Null characters are accepted as
638+
* part of the expression.
639+
*
640+
* @param pattern_len
641+
* The length of the expression in bytes.
642+
*
643+
* @return
644+
* On success, the size in bytes that needs to be allocated to store the
645+
* given pattern. Otherwise returns 0.
646+
*/
647+
size_t HS_CDECL hs_get_long_literal_search_database_size(const char *expression,
648+
size_t pattern_len);
649+
650+
/**
651+
* The compiled pattern type for searching several long literals
652+
* Generated by @ref hs_compile_multi_literal_search()
653+
* Note that given the unbounded nature of the pattern size, it is impossible to
654+
* give a constant size to the structure. You'll need to use @ref
655+
* hs_get_multi_literal_search_database_size() and allocate enough memory to
656+
* store the compiled pattern.
657+
*/
658+
typedef struct hs_multi_literal_compiled_pattern hs_multi_literal_compiled_pattern_t;
659+
660+
/**
661+
* This function calculates the size needed to store the compiled version of the
662+
* given @p expression array.
663+
*
664+
* @param expression
665+
* The array of expressions to parse. Note that the strings must represent
666+
* ONLY the patterns to be matched, with no delimiters. Null characters are
667+
* accepted as part of the expression.
668+
*
669+
* @param pattern_count
670+
* The number of expressions in the @p expression array.
671+
*
672+
* @param pattern_len
673+
* The array of lengths of each expression in the @p expression array.
674+
* Expressed in bytes.
675+
*
676+
* @return
677+
* On success, the size in bytes that needs to be allocated to store the
678+
* given patterns. Otherwise returns 0.
679+
*/
680+
size_t HS_CDECL hs_get_multi_literal_search_database_size(
681+
const char **expression, size_t pattern_count, size_t *pattern_len);
682+
683+
/**
684+
* The compiled pattern type for searching a single character
685+
* Generated by @ref hs_compile_single_char_search()
686+
*/
687+
typedef struct {
688+
unsigned char data[1];
689+
} hs_single_char_compiled_pattern_t;
690+
691+
/**
692+
* The compiled pattern type for searching a character set
693+
* Generated by @ref hs_compile_multi_char_search()
694+
*/
695+
typedef struct {
696+
unsigned char data[32];
697+
} hs_multi_char_compiled_pattern_t;
698+
699+
/**
700+
* The compiled pattern type for searching a character pair
701+
* Generated by @ref hs_compile_char_pair_search()
702+
*/
703+
typedef struct {
704+
unsigned char data[32];
705+
} hs_single_char_pair_compiled_pattern_t;
706+
707+
/**
708+
* The compiled pattern type for searching a set of character pairs
709+
* Generated by @ref hs_compile_multi_char_pair_search()
710+
* Note that given the unbounded maximum number of pairs, it is impossible to
711+
* give a constant size to the structure. You'll need to use @ref
712+
* hs_get_multi_char_pair_search_database_size() and allocate enough memory to
713+
* store the compiled pattern.
714+
*/
715+
typedef struct hs_multi_char_pair_compiled_pattern hs_multi_char_pair_compiled_pattern_t;
716+
717+
/**
718+
* This function calculates the size needed to store the compiled version of the
719+
* given @p expression.
720+
*
721+
* @param expression
722+
* The concatenation of all pairs to be parsed. If one wants to search for
723+
* "ab" or "Cd", then @p expression would be ['a','b','C','d']. A null terminator
724+
* is optional.
725+
*
726+
* @param pair_count
727+
* The number of character pairs in @p expression
728+
*
729+
* @return
730+
* On success, the size in bytes that needs to be allocated to store the
731+
* given pattern. Otherwise returns 0.
732+
*/
733+
size_t HS_CDECL hs_get_multi_char_pair_search_database_size(
734+
const char *expression, size_t pair_count);
735+
586736
/** @} */
587737

588738
#ifdef __cplusplus

0 commit comments

Comments
 (0)