From cf57dee68e9f4107ae6ab2b98c22f113d0140bd0 Mon Sep 17 00:00:00 2001
From: Mike Dalessio
Date: Tue, 7 Feb 2023 15:21:58 -0500
Subject: [PATCH 1/3] prefactor: expose gumbo_free so it can be used in gumbo.c

---
 ext/nokogiri/gumbo.c              | 2 +-
 gumbo-parser/src/nokogiri_gumbo.h | 2 ++
 gumbo-parser/src/util.h           | 1 -
 3 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/ext/nokogiri/gumbo.c b/ext/nokogiri/gumbo.c
index 82e50217b7c..80027194ba2 100644
--- a/ext/nokogiri/gumbo.c
+++ b/ext/nokogiri/gumbo.c
@@ -273,7 +273,7 @@ add_errors(const GumboOutput *output, VALUE rdoc, VALUE input, VALUE url)
     char *msg;
     size_t size = gumbo_caret_diagnostic_to_string(err, input_str, input_len, &msg);
     VALUE err_str = rb_utf8_str_new(msg, size);
-    free(msg);
+    gumbo_free(msg);
     VALUE syntax_error = rb_class_new_instance(1, &err_str, cNokogiriXmlSyntaxError);
     const char *error_code = gumbo_error_code(err);
     VALUE str1 = error_code ? rb_utf8_str_new_static(error_code, strlen(error_code)) : Qnil;
diff --git a/gumbo-parser/src/nokogiri_gumbo.h b/gumbo-parser/src/nokogiri_gumbo.h
index f98efa53f15..51f69547364 100644
--- a/gumbo-parser/src/nokogiri_gumbo.h
+++ b/gumbo-parser/src/nokogiri_gumbo.h
@@ -937,6 +937,8 @@ void gumbo_print_caret_diagnostic (
   size_t source_length
 );
 
+void gumbo_free(void* ptr);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/gumbo-parser/src/util.h b/gumbo-parser/src/util.h
index 64170e5cd08..2d0a828d368 100644
--- a/gumbo-parser/src/util.h
+++ b/gumbo-parser/src/util.h
@@ -18,7 +18,6 @@ char* gumbo_strdup(const char* str) XMALLOC NONNULL_ARGS;
 
 void* gumbo_alloc(size_t size) XMALLOC;
 void* gumbo_realloc(void* ptr, size_t size) RETURNS_NONNULL;
-void gumbo_free(void* ptr);
 
 // Debug wrapper for printf
 #ifdef GUMBO_DEBUG

From c86906c4e72de5cfaf741b0b864f5093127af5cb Mon Sep 17 00:00:00 2001
From: Mike Dalessio
Date: Tue, 7 Feb 2023 15:26:07 -0500
Subject: [PATCH 2/3] prefactor: gumbo_realloc takes previous size

so we can implement an arena-based realloc
---
 gumbo-parser/src/string_buffer.c | 3 ++-
 gumbo-parser/src/token_buffer.c  | 3 ++-
 gumbo-parser/src/util.c          | 4 ++--
 gumbo-parser/src/util.h          | 2 +-
 gumbo-parser/src/vector.c        | 3 ++-
 5 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/gumbo-parser/src/string_buffer.c b/gumbo-parser/src/string_buffer.c
index 44852b78941..f4b9389ba76 100644
--- a/gumbo-parser/src/string_buffer.c
+++ b/gumbo-parser/src/string_buffer.c
@@ -27,12 +27,13 @@ static void maybe_resize_string_buffer (
   GumboStringBuffer* buffer
 ) {
   size_t new_length = buffer->length + additional_chars;
+  size_t prev_capacity = buffer->capacity;
   size_t new_capacity = buffer->capacity;
   while (new_capacity < new_length) {
     new_capacity *= 2;
   }
   if (new_capacity != buffer->capacity) {
-    buffer->data = gumbo_realloc(buffer->data, new_capacity);
+    buffer->data = gumbo_realloc(buffer->data, prev_capacity, new_capacity);
     buffer->capacity = new_capacity;
   }
 }
diff --git a/gumbo-parser/src/token_buffer.c b/gumbo-parser/src/token_buffer.c
index 9cd6252f9ef..2270850b02f 100644
--- a/gumbo-parser/src/token_buffer.c
+++ b/gumbo-parser/src/token_buffer.c
@@ -40,12 +40,13 @@ void gumbo_character_token_buffer_append (
   assert(token->type == GUMBO_TOKEN_WHITESPACE
          || token->type == GUMBO_TOKEN_CHARACTER);
   if (buffer->length == buffer->capacity) {
+    size_t prev_bytes = sizeof(*buffer->data) * buffer->capacity;
     if (buffer->capacity == 0)
       buffer->capacity = 10;
     else
       buffer->capacity *= 2;
     size_t bytes = sizeof(*buffer->data) * buffer->capacity;
-    buffer->data = gumbo_realloc(buffer->data, bytes);
+    buffer->data = gumbo_realloc(buffer->data, prev_bytes, bytes);
   }
   size_t index = buffer->length++;
   buffer->data[index].position = token->position;
diff --git a/gumbo-parser/src/util.c b/gumbo-parser/src/util.c
index 6238c296057..ea986dde21d 100644
--- a/gumbo-parser/src/util.c
+++ b/gumbo-parser/src/util.c
@@ -30,8 +30,8 @@ void* gumbo_alloc(size_t size) {
   return ptr;
 }
 
-void* gumbo_realloc(void* ptr, size_t size) {
-  ptr = realloc(ptr, size);
+void* gumbo_realloc(void* prev_ptr, size_t prev_size, size_t size) {
+  void* ptr = realloc(prev_ptr, size);
   if (unlikely(ptr == NULL)) {
     perror(__func__);
     abort();
diff --git a/gumbo-parser/src/util.h b/gumbo-parser/src/util.h
index 2d0a828d368..128b3653be4 100644
--- a/gumbo-parser/src/util.h
+++ b/gumbo-parser/src/util.h
@@ -17,7 +17,7 @@ extern "C" {
 char* gumbo_strdup(const char* str) XMALLOC NONNULL_ARGS;
 
 void* gumbo_alloc(size_t size) XMALLOC;
-void* gumbo_realloc(void* ptr, size_t size) RETURNS_NONNULL;
+void* gumbo_realloc(void* prev_ptr, size_t prev_size, size_t size) RETURNS_NONNULL;
 
 // Debug wrapper for printf
 #ifdef GUMBO_DEBUG
diff --git a/gumbo-parser/src/vector.c b/gumbo-parser/src/vector.c
index 4782407787c..5b5e18f2de6 100644
--- a/gumbo-parser/src/vector.c
+++ b/gumbo-parser/src/vector.c
@@ -40,9 +40,10 @@ void gumbo_vector_destroy(GumboVector* vector) {
 static void enlarge_vector_if_full(GumboVector* vector) {
   if (vector->length >= vector->capacity) {
     if (vector->capacity) {
+      size_t prev_num_bytes = sizeof(void*) * vector->capacity;
       vector->capacity *= 2;
       size_t num_bytes = sizeof(void*) * vector->capacity;
-      vector->data = gumbo_realloc(vector->data, num_bytes);
+      vector->data = gumbo_realloc(vector->data, prev_num_bytes, num_bytes);
     } else {
       // 0-capacity vector; no previous array to deallocate.
       vector->capacity = 2;

From 2000e35f2a91ec0be4389e6c950c0181a486f5ed Mon Sep 17 00:00:00 2001
From: Mike Dalessio
Date: Tue, 7 Feb 2023 15:29:19 -0500
Subject: [PATCH 3/3] wip: implement a scrappy arena allocator

https://www.gingerbill.org/article/2019/02/08/memory-allocation-strategies-002/

so I can benchmark the speedup from not having to free chunks
---
Review fixups folded into this revision (not part of the commit message):
- gumbo_arena_init: check the malloc result and abort like gumbo_alloc does.
- gumbo_arena_resize_align: bounds-check in-place growth; zero the grown tail
  starting at prev_offset + old_size rather than at the new curr_offset;
  do not memmove into a NULL allocation.
- gumbo_arena_alloc_align: capacity check can no longer wrap around.
- is_power_of_two: reject 0.
- The commit and blob hashes in this mail predate these fixups.

 ext/nokogiri/gumbo.c              |   7 ++
 gumbo-parser/src/nokogiri_gumbo.h |   2 +
 gumbo-parser/src/util.c           | 167 ++++++++++++++++++++++++++++++
 gumbo-parser/src/util.h           |   2 +
 4 files changed, 178 insertions(+)

diff --git a/ext/nokogiri/gumbo.c b/ext/nokogiri/gumbo.c
index 80027194ba2..2c3ef8728ef 100644
--- a/ext/nokogiri/gumbo.c
+++ b/ext/nokogiri/gumbo.c
@@ -37,6 +37,8 @@ VALUE cNokogiriHtml5Document;
 static ID internal_subset;
 static ID parent;
 
+#define GUMBO_ARENA_SIZE (10 * 1024 * 1024)
+
 /* Backwards compatibility to Ruby 2.1.0 */
 #if RUBY_API_VERSION_CODE < 20200
 #define ONIG_ESCAPE_UCHAR_COLLISION 1
@@ -313,6 +315,7 @@ parse_cleanup(VALUE parse_args)
   if (args->doc != NULL) {
     xmlFreeDoc(args->doc);
   }
+  gumbo_arena_free_all();
   return Qnil;
 }
 
@@ -329,6 +332,8 @@ parse(VALUE self, VALUE input, VALUE url, VALUE max_attributes, VALUE max_errors
   options.max_errors = NUM2INT(max_errors);
   options.max_tree_depth = NUM2INT(max_depth);
 
+  gumbo_arena_init(GUMBO_ARENA_SIZE);
+
   GumboOutput *output = perform_parse(&options, input);
   ParseArgs args = {
     .output = output,
@@ -547,6 +552,8 @@ fragment(
   options.quirks_mode = quirks_mode;
   options.fragment_context_has_form_ancestor = form;
 
+  gumbo_arena_init(GUMBO_ARENA_SIZE);
+
   GumboOutput *output = perform_parse(&options, tags);
   ParseArgs args = {
     .output = output,
diff --git a/gumbo-parser/src/nokogiri_gumbo.h b/gumbo-parser/src/nokogiri_gumbo.h
index 51f69547364..5dad54f283d 100644
--- a/gumbo-parser/src/nokogiri_gumbo.h
+++ b/gumbo-parser/src/nokogiri_gumbo.h
@@ -938,6 +938,8 @@ void gumbo_print_caret_diagnostic (
 );
 
 void gumbo_free(void* ptr);
+void gumbo_arena_init(size_t backing_buffer_length);
+void gumbo_arena_free_all(void);
 
 #ifdef __cplusplus
 }
diff --git a/gumbo-parser/src/util.c b/gumbo-parser/src/util.c
index ea986dde21d..be97d588a47 100644
--- a/gumbo-parser/src/util.c
+++ b/gumbo-parser/src/util.c
@@ -21,8 +21,167 @@
 #include "util.h"
 #include "nokogiri_gumbo.h"
 
+#if GUMBO_USE_ARENA
+#include <stdbool.h>
+#include <stdint.h>
+
+// True iff 'x' is a non-zero power of two (zero is rejected).
+static bool is_power_of_two(uintptr_t x) {
+  return x != 0 && (x & (x-1)) == 0;
+}
+
+// Round address 'ptr' up to the next multiple of 'align' (a power of two).
+static uintptr_t align_forward(uintptr_t ptr, size_t align) {
+  uintptr_t p, a, modulo;
+
+  assert(is_power_of_two(align));
+
+  p = ptr;
+  a = (uintptr_t)align;
+  // Same as (p % a) but faster as 'a' is a power of two
+  modulo = p & (a-1);
+
+  if (modulo != 0) {
+    // If 'p' address is not aligned, push the address to the
+    // next value which is aligned
+    p += a - modulo;
+  }
+  return p;
+}
+
+#ifndef DEFAULT_ALIGNMENT
+#define DEFAULT_ALIGNMENT (2*sizeof(void *))
+#endif
+
+// Bump allocator over one malloc'd buffer; offsets are relative to 'buf'.
+typedef struct Arena Arena;
+struct Arena {
+  unsigned char *buf;
+  size_t buf_len;
+  size_t prev_offset; // Start of the most recent allocation (in-place resize)
+  size_t curr_offset; // First unused byte
+};
+
+static Arena gumbo_arena;
+
+// Allocate the backing buffer and reset the arena. Aborts if malloc fails,
+// matching gumbo_alloc's out-of-memory policy.
+void gumbo_arena_init(size_t backing_buffer_length) {
+  void* backing_buffer = malloc(backing_buffer_length);
+  if (unlikely(backing_buffer == NULL && backing_buffer_length != 0)) {
+    perror(__func__);
+    abort();
+  }
+  gumbo_arena.buf = (unsigned char *)backing_buffer;
+  gumbo_arena.buf_len = backing_buffer_length;
+  gumbo_arena.curr_offset = 0;
+  gumbo_arena.prev_offset = 0;
+}
+
+// Release the backing buffer; every pointer handed out becomes invalid.
+void gumbo_arena_free_all(void) {
+  free(gumbo_arena.buf);
+  gumbo_arena.buf = NULL;
+  gumbo_arena.buf_len = 0;
+  gumbo_arena.curr_offset = 0;
+  gumbo_arena.prev_offset = 0;
+}
+
+// Carve 'size' zeroed bytes aligned to 'align' out of the arena.
+// Returns NULL (after asserting) when the arena cannot satisfy the request.
+static void *gumbo_arena_alloc_align(size_t size, size_t align) {
+  // Align 'curr_offset' forward to the specified alignment
+  uintptr_t curr_ptr = (uintptr_t)gumbo_arena.buf + (uintptr_t)gumbo_arena.curr_offset;
+  uintptr_t offset = align_forward(curr_ptr, align);
+  offset -= (uintptr_t)gumbo_arena.buf; // Change to relative offset
+
+  // Check to see if the backing memory has space left. Written as two
+  // comparisons so that 'offset+size' cannot wrap around.
+  if (offset <= gumbo_arena.buf_len && size <= gumbo_arena.buf_len - offset) {
+    void *ptr = &gumbo_arena.buf[offset];
+    gumbo_arena.prev_offset = offset;
+    gumbo_arena.curr_offset = offset+size;
+
+    // Zero new memory by default
+    memset(ptr, 0, size);
+    return ptr;
+  }
+  // Return NULL if the arena is out of memory (or handle differently)
+  assert(0 && "arena out of memory");
+  return NULL;
+}
+
+// Because C doesn't have default parameters
+static void *gumbo_arena_alloc(size_t size) {
+  return gumbo_arena_alloc_align(size, DEFAULT_ALIGNMENT);
+}
+
+// Individual frees are no-ops; memory is reclaimed by gumbo_arena_free_all.
+static void gumbo_arena_free(void *ptr) {
+  (void)ptr;
+}
+
+// Resize an arena allocation. The most recent allocation is grown or shrunk
+// in place; any other allocation is copied into a fresh one. Returns NULL
+// (after asserting) if the arena is exhausted or 'old_memory' is foreign.
+static void *gumbo_arena_resize_align(void *old_memory, size_t old_size, size_t new_size, size_t align) {
+  unsigned char *old_mem = (unsigned char *)old_memory;
+
+  assert(is_power_of_two(align));
+
+  if (old_mem == NULL || old_size == 0) {
+    return gumbo_arena_alloc_align(new_size, align);
+  } else if (gumbo_arena.buf <= old_mem && old_mem < gumbo_arena.buf+gumbo_arena.buf_len) {
+    if (gumbo_arena.buf+gumbo_arena.prev_offset == old_mem) {
+      // Growing in place must not run past the end of the backing buffer.
+      if (new_size > gumbo_arena.buf_len - gumbo_arena.prev_offset) {
+        assert(0 && "arena out of memory");
+        return NULL;
+      }
+      if (new_size > old_size) {
+        // Zero only the newly exposed tail, [old_size, new_size)
+        memset(&old_mem[old_size], 0, new_size-old_size);
+      }
+      gumbo_arena.curr_offset = gumbo_arena.prev_offset + new_size;
+      return old_memory;
+    } else {
+      void *new_memory = gumbo_arena_alloc_align(new_size, align);
+      if (new_memory == NULL) {
+        return NULL;
+      }
+      size_t copy_size = old_size < new_size ? old_size : new_size;
+      // Copy across old memory to the new memory
+      memmove(new_memory, old_memory, copy_size);
+      return new_memory;
+    }
+
+  } else {
+    assert(0 && "Memory is out of bounds of the buffer in this arena");
+    return NULL;
+  }
+
+}
+
+// Because C doesn't have default parameters
+static void *gumbo_arena_resize(void *old_memory, size_t old_size, size_t new_size) {
+  return gumbo_arena_resize_align(old_memory, old_size, new_size, DEFAULT_ALIGNMENT);
+}
+#else
+// Arena disabled: the public entry points are no-ops.
+void gumbo_arena_init(size_t backing_buffer_length) {
+  (void)backing_buffer_length;
+}
+
+void gumbo_arena_free_all(void) {
+}
+#endif /* GUMBO_USE_ARENA */
+
 void* gumbo_alloc(size_t size) {
+#if GUMBO_USE_ARENA
+  void* ptr = gumbo_arena_alloc(size);
+#else
   void* ptr = malloc(size);
+#endif
   if (unlikely(ptr == NULL)) {
     perror(__func__);
     abort();
@@ -31,7 +190,11 @@ void* gumbo_alloc(size_t size) {
 }
 
 void* gumbo_realloc(void* prev_ptr, size_t prev_size, size_t size) {
+#if GUMBO_USE_ARENA
+  void* ptr = gumbo_arena_resize(prev_ptr, prev_size, size);
+#else
   void* ptr = realloc(prev_ptr, size);
+#endif
   if (unlikely(ptr == NULL)) {
     perror(__func__);
     abort();
@@ -40,7 +203,11 @@ void* gumbo_realloc(void* prev_ptr, size_t prev_size, size_t size) {
 }
 
 void gumbo_free(void* ptr) {
+#if GUMBO_USE_ARENA
+  gumbo_arena_free(ptr);
+#else
   free(ptr);
+#endif
 }
 
 char* gumbo_strdup(const char* str) {
diff --git a/gumbo-parser/src/util.h b/gumbo-parser/src/util.h
index 128b3653be4..da3014d0be5 100644
--- a/gumbo-parser/src/util.h
+++ b/gumbo-parser/src/util.h
@@ -5,6 +5,8 @@
 #include <stddef.h>
 #include "macros.h"
 
+#define GUMBO_USE_ARENA 1
+
 #ifdef __cplusplus
 extern "C" {
 #endif