From 1aa7d799daacf33f1de85648a79365fea43d0b0a Mon Sep 17 00:00:00 2001 From: Water-Melon Date: Tue, 10 Oct 2023 11:16:20 +0000 Subject: [PATCH] optimize regular expression component --- docs/Melon Developer Guide.txt | 15 ++- include/mln_regexp.h | 26 ++++-- src/mln_regexp.c | 163 ++++++++++++--------------------- src/mln_websocket.c | 10 +- 4 files changed, 96 insertions(+), 118 deletions(-) diff --git a/docs/Melon Developer Guide.txt b/docs/Melon Developer Guide.txt index 1b3498d6..b6bcb486 100644 --- a/docs/Melon Developer Guide.txt +++ b/docs/Melon Developer Guide.txt @@ -1541,10 +1541,10 @@ Their definitions can be found in melon/include/mln_types.h. own component. Providing this feature is trying to make the capability of parsing HTTP URI better. There are only two functions here. - a) int mln_reg_match(mln_string_t *exp, mln_string_t *text, mln_reg_match_t **head, mln_reg_match_t **tail); + a) int mln_reg_match(mln_string_t *exp, mln_string_t *text, mln_reg_match_result_t *matches); 'exp' is the regular expression that user provided. 'text' is the string that will be matched by 'exp'. - 'head' and 'tail' are the chain to store matched string. If you don't need matched result, just set them NULL. + 'matches' is an array to store matched pieces. Return value: 0 - no matched result >0 - the number of matched result @@ -1554,8 +1554,15 @@ Their definitions can be found in melon/include/mln_types.h. This function will return a non-zero if 'text' is completely matched by 'exp'. Otherwise, zero will be returned. - c) mln_reg_match_result_free(mln_reg_match_t *results); - Free the result chain generated by 'mln_reg_match'. + c) mln_reg_match_result_new(prealloc); + Create a new matched result object. + 'prealloc' is the pre-allocated array elements. + + d) mln_reg_match_result_free(res); + Free the matched results. + + e) mln_reg_match_result_get(res); + Get the array pointer of matched results. Melon support these symbols below, a) * diff --git a/include/mln_regexp.h b/include/mln_regexp.h index 83e5796f..3de6d0e9 100644 --- a/include/mln_regexp.h +++ b/include/mln_regexp.h @@ -7,6 +7,9 @@ #define __MLN_REGEXP_H #include "mln_string.h" +#include "mln_array.h" + +typedef mln_array_t mln_reg_match_result_t; #define M_REGEXP_MASK_SQUARE ((unsigned int)0x00800000) #define M_REGEXP_MASK_OR ((unsigned int)0x01000000) @@ -33,15 +36,22 @@ #define M_REGEXP_SUB 173 #define M_REGEXP_OR 174 -typedef struct mln_reg_match_s { - mln_string_t data; - struct mln_reg_match_s *prev; - struct mln_reg_match_s *next; -} mln_reg_match_t; -extern int mln_reg_match(mln_string_t *exp, mln_string_t *text, mln_reg_match_t **head, mln_reg_match_t **tail); -extern int mln_reg_equal(mln_string_t *exp, mln_string_t *text); -extern void mln_reg_match_result_free(mln_reg_match_t *results); +#define mln_reg_match_result_new(prealloc) ({\ + struct mln_array_attr attr;\ + attr.pool = NULL;\ + attr.pool_alloc = NULL;\ + attr.pool_free = NULL;\ + attr.free = NULL;\ + attr.size = sizeof(mln_string_t);\ + attr.nalloc = (prealloc);\ + mln_array_new(&attr);\ +}) +#define mln_reg_match_result_free(res) mln_array_free(res) +#define mln_reg_match_result_get(res) (mln_string_t *)mln_array_elts(res) + +extern int mln_reg_match(mln_string_t *exp, mln_string_t *text, mln_reg_match_result_t *matches) __NONNULL3(1,2,3); +extern int mln_reg_equal(mln_string_t *exp, mln_string_t *text) __NONNULL2(1,2); #endif diff --git a/src/mln_regexp.c b/src/mln_regexp.c index 89d2738d..4178f7a3 100644 --- a/src/mln_regexp.c +++ b/src/mln_regexp.c @@ -10,37 +10,29 @@ #include #include "mln_regexp.h" -MLN_CHAIN_FUNC_DECLARE(mln_reg_match, \ - mln_reg_match_t, \ - static inline void, ); -MLN_CHAIN_FUNC_DEFINE(mln_reg_match, \ - mln_reg_match_t, \ - static inline void, \ - prev, next); - static int mln_match_star(char *mregexp, int mreglen, \ char *regexp, char *text, \ int reglen, int textlen, \ - mln_reg_match_t **head, mln_reg_match_t **tail); + mln_reg_match_result_t *matches); static int mln_match_here(unsigned int flag, \ char *regexp, char *text, \ int reglen, int textlen, \ - mln_reg_match_t **head, mln_reg_match_t **tail); + mln_reg_match_result_t *matches); static int mln_match_plus(char *mregexp, int mreglen, \ char *regexp, char *text, \ int reglen, int textlen, \ - mln_reg_match_t **head, mln_reg_match_t **tail); + mln_reg_match_result_t *matches); static int mln_match_question(char *mregexp, int mreglen, \ char *regexp, char *text, \ int reglen, int textlen, \ - mln_reg_match_t **head, mln_reg_match_t **tail); + mln_reg_match_result_t *matches); static int mln_match_brace(char *mregexp, int mreglen, \ char *regexp, char *text, \ int reglen, int textlen, \ int min, int max, \ - mln_reg_match_t **head, mln_reg_match_t **tail); + mln_reg_match_result_t *matches); static inline int -mln_match_square(char *regexp, int reglen, char **text, int *textlen, mln_reg_match_t **head, mln_reg_match_t **tail); +mln_match_square(char *regexp, int reglen, char **text, int *textlen, mln_reg_match_result_t *matches); static inline void mln_match_get_limit(char *regexp, int reglen, int *min, int *max); static inline int mln_get_char(unsigned int flag, char *s, int len); @@ -49,18 +41,16 @@ static inline int mln_process_or(unsigned int flag, \ char **regexp, int *reglen, \ char **text, int *textlen, \ - mln_reg_match_t **head, mln_reg_match_t **tail); + mln_reg_match_result_t *matches); static int mln_or_return_val(char **regexp, int *reglen, char *rexp, int rlen, int rv); static inline void mln_adjust_or_pos(unsigned int flag, char **rexp, int *rlen); -static inline mln_reg_match_t *mln_reg_match_new(mln_u8ptr_t data, mln_size_t len); -static inline void mln_reg_match_free(mln_reg_match_t *match); static int mln_match_here(unsigned int flag, \ char *regexp, char *text, \ int reglen, int textlen, \ - mln_reg_match_t **head, mln_reg_match_t **tail) + mln_reg_match_result_t *matches) { int steplen, count, c_0, len_0, c_n, len_n, ret; @@ -75,7 +65,7 @@ static int mln_match_here(unsigned int flag, \ if (!(flag & M_REGEXP_SPECIAL_MASK)) { if (!(flag & M_REGEXP_MASK_OR)) { - ret = mln_process_or(flag, ®exp, ®len, &text, &textlen, head, tail); + ret = mln_process_or(flag, ®exp, ®len, &text, &textlen, matches); if (ret < 0) { return -1; } else if (ret > 0) { @@ -131,19 +121,19 @@ static int mln_match_here(unsigned int flag, \ return mln_match_star(regexp, steplen, \ regexp+steplen+len_n, text, \ reglen-steplen-len_n, textlen, \ - head, tail); + matches); } if (c_n == M_REGEXP_PLUS) { return mln_match_plus(regexp, steplen, \ regexp+steplen+len_n, text, \ reglen-steplen-len_n, textlen, \ - head, tail); + matches); } if (c_n == M_REGEXP_QUES) { return mln_match_question(regexp, steplen, \ regexp+steplen+len_n, text, \ reglen-steplen-len_n, textlen, \ - head, tail); + matches); } if (c_n == M_REGEXP_LBRACE) { int part = 1, min, max, existent = 0; @@ -182,7 +172,7 @@ static int mln_match_here(unsigned int flag, \ return mln_match_brace(regexp, steplen, \ regexp+steplen+(reglen-len)+l, text, \ len-steplen-l, textlen, \ - min, max, head, tail); + min, max, matches); } } @@ -248,7 +238,7 @@ static int mln_match_here(unsigned int flag, \ } if (c_0 == M_REGEXP_LSQUAR) { - if (mln_match_square(regexp, steplen, &text, &textlen, head, tail) < 0) { + if (mln_match_square(regexp, steplen, &text, &textlen, matches) < 0) { return -1; } regexp += steplen; @@ -257,16 +247,16 @@ static int mln_match_here(unsigned int flag, \ } if (c_0 == M_REGEXP_LPAR) { - int left = mln_match_here(M_REGEXP_MASK_NEW, regexp+len_0, text, steplen-(len_0<<1), textlen, head, tail); + int left = mln_match_here(M_REGEXP_MASK_NEW, regexp+len_0, text, steplen-(len_0<<1), textlen, matches); if (left < 0) { return -1; } - if (head != NULL && tail != NULL) { - mln_reg_match_t *match; - if ((match = mln_reg_match_new((mln_u8ptr_t)text, textlen-left)) == NULL) { + if (matches != NULL) { + mln_string_t *s; + if ((s = (mln_string_t *)mln_array_push(matches)) == NULL) { return -1; } - mln_reg_match_chain_add(head, tail, match); + mln_string_nset(s, text, textlen - left); } regexp += steplen; reglen -= steplen; @@ -291,7 +281,7 @@ static inline int mln_process_or(unsigned int flag, \ char **regexp, int *reglen, \ char **text, int *textlen, \ - mln_reg_match_t **head, mln_reg_match_t **tail) + mln_reg_match_result_t *matches) { char *rexp = *regexp; int rlen = *reglen; @@ -372,7 +362,7 @@ mln_process_or(unsigned int flag, \ match_len = rlen - left; match: - ret = mln_match_here(flag|M_REGEXP_MASK_OR, rexp, *text, match_len, *textlen, head, tail); + ret = mln_match_here(flag|M_REGEXP_MASK_OR, rexp, *text, match_len, *textlen, matches); rexp += match_len; rlen -= match_len; @@ -478,7 +468,7 @@ mln_adjust_or_pos(unsigned int flag, char **rexp, int *rlen) } static inline int -mln_match_square(char *regexp, int reglen, char **text, int *textlen, mln_reg_match_t **head, mln_reg_match_t **tail) +mln_match_square(char *regexp, int reglen, char **text, int *textlen, mln_reg_match_result_t *matches) { int c, len, reverse = 0, count, left, steplen; int end_c, tmp_c, tmp_len; @@ -555,7 +545,7 @@ mln_match_square(char *regexp, int reglen, char **text, int *textlen, mln_reg_ma } if (*textlen <= 0) return -1; - left = mln_match_here(M_REGEXP_MASK_SQUARE, regexp, *text, steplen, *textlen, head, tail); + left = mln_match_here(M_REGEXP_MASK_SQUARE, regexp, *text, steplen, *textlen, matches); if (left >= 0) { if (!reverse) { (*text) += (*textlen - left); @@ -605,7 +595,7 @@ mln_match_get_limit(char *regexp, int reglen, int *min, int *max) static int mln_match_star(char *mregexp, int mreglen, \ char *regexp, char *text, \ int reglen, int textlen, \ - mln_reg_match_t **head, mln_reg_match_t **tail) + mln_reg_match_result_t *matches) { int ret; char dot = (char)M_REGEXP_DOT; @@ -617,10 +607,10 @@ static int mln_match_star(char *mregexp, int mreglen, \ if (mreglen > 1) { int found = 0; again: - ret = mln_match_here(0, mregexp, text, mreglen, textlen, head, tail); + ret = mln_match_here(0, mregexp, text, mreglen, textlen, matches); if (ret < 0) { if (reglen <= 0) return found? textlen: ret; - ret = mln_match_here(0, regexp, text, reglen, textlen, head, tail); + ret = mln_match_here(0, regexp, text, reglen, textlen, matches); if (found) { return ret < 0? textlen: ret; } else { @@ -631,19 +621,19 @@ static int mln_match_star(char *mregexp, int mreglen, \ text += (textlen - ret); textlen = ret; if (textlen > 0) goto again; - if (reglen > 0) return mln_match_here(0, regexp, text, reglen, textlen, head, tail); + if (reglen > 0) return mln_match_here(0, regexp, text, reglen, textlen, matches); return 0; } } while (textlen > 0 && \ - (mln_match_here(M_REGEXP_STAR, mregexp, text, mreglen, textlen, head, tail) >= 0 || \ - mln_match_here(M_REGEXP_STAR, mregexp, &dot, mreglen, 1, head, tail) >= 0)) + (mln_match_here(M_REGEXP_STAR, mregexp, text, mreglen, textlen, matches) >= 0 || \ + mln_match_here(M_REGEXP_STAR, mregexp, &dot, mreglen, 1, matches) >= 0)) { ++text; --textlen; if (reglen > 0) { - if (mln_match_here(0, regexp, text, reglen, textlen, NULL, NULL) >= 0) { + if (mln_match_here(0, regexp, text, reglen, textlen, NULL) >= 0) { record_text = text; record_len = textlen; } @@ -655,7 +645,7 @@ static int mln_match_star(char *mregexp, int mreglen, \ text = record_text; textlen = record_len; } - return mln_match_here(0, regexp, text, reglen, textlen, head, tail); + return mln_match_here(0, regexp, text, reglen, textlen, matches); } return textlen; @@ -664,30 +654,30 @@ static int mln_match_star(char *mregexp, int mreglen, \ static int mln_match_plus(char *mregexp, int mreglen, \ char *regexp, char *text, \ int reglen, int textlen, \ - mln_reg_match_t **head, mln_reg_match_t **tail) + mln_reg_match_result_t *matches) { int ret, found = 0; char dot = (char)M_REGEXP_DOT; if (mreglen > 1) { again: - ret = mln_match_here(0, mregexp, text, mreglen, textlen, head, tail); + ret = mln_match_here(0, mregexp, text, mreglen, textlen, matches); if (ret < 0) { if (found == 0) return ret; - return mln_match_here(0, regexp, text, reglen, textlen, head, tail); + return mln_match_here(0, regexp, text, reglen, textlen, matches); } else { found = 1; text += (textlen - ret); textlen = ret; if (textlen > 0) goto again; - if (reglen > 0) return mln_match_here(0, regexp, text, reglen, textlen, head, tail); + if (reglen > 0) return mln_match_here(0, regexp, text, reglen, textlen, matches); return 0; } } while (textlen > 0 && \ - (mln_match_here(M_REGEXP_PLUS, mregexp, text, mreglen, textlen, head, tail) >= 0 || \ - mln_match_here(M_REGEXP_PLUS, mregexp, &dot, mreglen, 1, head, tail) >= 0)) + (mln_match_here(M_REGEXP_PLUS, mregexp, text, mreglen, textlen, matches) >= 0 || \ + mln_match_here(M_REGEXP_PLUS, mregexp, &dot, mreglen, 1, matches) >= 0)) { found = 1; ++text; @@ -695,7 +685,7 @@ static int mln_match_plus(char *mregexp, int mreglen, \ } if (found) { if (textlen > 0) - return mln_match_here(0, regexp, text, reglen, textlen, head, tail); + return mln_match_here(0, regexp, text, reglen, textlen, matches); return textlen; } @@ -705,52 +695,52 @@ static int mln_match_plus(char *mregexp, int mreglen, \ static int mln_match_question(char *mregexp, int mreglen, \ char *regexp, char *text, \ int reglen, int textlen, \ - mln_reg_match_t **head, mln_reg_match_t **tail) + mln_reg_match_result_t *matches) { int ret; if (mreglen > 1) { - ret = mln_match_here(0, mregexp, text, mreglen, textlen, head, tail); + ret = mln_match_here(0, mregexp, text, mreglen, textlen, matches); if (ret >= 0) { text += (textlen - ret); textlen = ret; } - return mln_match_here(0, regexp, text, reglen, textlen, head, tail); + return mln_match_here(0, regexp, text, reglen, textlen, matches); } - if (mln_match_here(M_REGEXP_QUES, mregexp, text, mreglen, textlen, head, tail) >= 0) - return mln_match_here(0, regexp, text+1, reglen, textlen-1, head, tail); - return mln_match_here(0, regexp, text, reglen, textlen, head, tail); + if (mln_match_here(M_REGEXP_QUES, mregexp, text, mreglen, textlen, matches) >= 0) + return mln_match_here(0, regexp, text+1, reglen, textlen-1, matches); + return mln_match_here(0, regexp, text, reglen, textlen, matches); } static int mln_match_brace(char *mregexp, int mreglen, \ char *regexp, char *text, \ int reglen, int textlen, \ int min, int max, \ - mln_reg_match_t **head, mln_reg_match_t **tail) + mln_reg_match_result_t *matches) { int ret, found = 0; char dot = (char)M_REGEXP_DOT; if (mreglen > 1) { again: - ret = mln_match_here(0, mregexp, text, mreglen, textlen, head, tail); + ret = mln_match_here(0, mregexp, text, mreglen, textlen, matches); if (ret < 0) { if (reglen <= 0 || found < min) return ret; - return mln_match_here(0, regexp, text, reglen, textlen, head, tail); + return mln_match_here(0, regexp, text, reglen, textlen, matches); } else { ++found; text += (textlen - ret); textlen = ret; if (textlen > 0 && (max < 0 || found < max)) goto again; if (textlen <= 0 && reglen <= 0) return 0; - return mln_match_here(0, regexp, text, reglen, textlen, head, tail); + return mln_match_here(0, regexp, text, reglen, textlen, matches); } } while (textlen > 0 && \ - (mln_match_here(M_REGEXP_LBRACE, mregexp, text, mreglen, textlen, head, tail) >= 0 || \ - mln_match_here(M_REGEXP_LBRACE, mregexp, &dot, mreglen, 1, head, tail) >= 0)) + (mln_match_here(M_REGEXP_LBRACE, mregexp, text, mreglen, textlen, matches) >= 0 || \ + mln_match_here(M_REGEXP_LBRACE, mregexp, &dot, mreglen, 1, matches) >= 0)) { ++found; ++text; @@ -759,7 +749,7 @@ static int mln_match_brace(char *mregexp, int mreglen, \ } if (found >= min) { if (textlen > 0 || reglen > 0) - return mln_match_here(0, regexp, text, reglen, textlen, head, tail); + return mln_match_here(0, regexp, text, reglen, textlen, matches); return textlen; } @@ -851,64 +841,27 @@ static inline int mln_get_length(char *s, int len) return 1; } -static inline mln_reg_match_t *mln_reg_match_new(mln_u8ptr_t data, mln_size_t len) -{ - mln_reg_match_t *match; - if ((match = (mln_reg_match_t *)malloc(sizeof(mln_reg_match_t))) == NULL) { - return NULL; - } - match->data.data = data; - match->data.len = len; - match->prev = match->next = NULL; - return match; -} - -static inline void mln_reg_match_free(mln_reg_match_t *match) -{ - if (match == NULL) return; - free(match); -} - -int mln_reg_match(mln_string_t *exp, mln_string_t *text, mln_reg_match_t **head, mln_reg_match_t **tail) +int mln_reg_match(mln_string_t *exp, mln_string_t *text, mln_reg_match_result_t *matches) { int ret; - mln_reg_match_t *match, *h = NULL, *t = NULL; + mln_string_t *s; - ret = mln_match_here(M_REGEXP_MASK_NEW, (char *)(exp->data), (char *)(text->data), exp->len, text->len, &h, &t); + ret = mln_match_here(M_REGEXP_MASK_NEW, (char *)(exp->data), (char *)(text->data), exp->len, text->len, matches); if (ret < 0) { - mln_reg_match_result_free(h); return -1; } if (text->len - ret > 0) { - if ((match = mln_reg_match_new(text->data, text->len-ret)) == NULL) { - mln_reg_match_result_free(h); + if ((s = (mln_string_t *)mln_array_push(matches)) == NULL) { return -1; } - mln_reg_match_chain_add(&h, &t, match); - for (ret = 0, match = h; match != NULL; match = match->next, ++ret) - ; - if (head != NULL) *head = h; - if (tail != NULL) *tail = t; - if (head == NULL && tail == NULL) { - mln_reg_match_result_free(h); - } - return ret; + mln_string_nset(s, text->data, text->len - ret); + return mln_array_nelts(matches); } return 0; } int mln_reg_equal(mln_string_t *exp, mln_string_t *text) { - return !mln_match_here(M_REGEXP_MASK_NEW, (char *)(exp->data), (char *)(text->data), exp->len, text->len, NULL, NULL); -} - -void mln_reg_match_result_free(mln_reg_match_t *results) -{ - mln_reg_match_t *fr; - while ((fr = results) != NULL) { - results = results->next; - mln_reg_match_free(fr); - } - mln_reg_match_chain_del(NULL, NULL, NULL); + return !mln_match_here(M_REGEXP_MASK_NEW, (char *)(exp->data), (char *)(text->data), exp->len, text->len, NULL); } diff --git a/src/mln_websocket.c b/src/mln_websocket.c index f7245740..58a71354 100644 --- a/src/mln_websocket.c +++ b/src/mln_websocket.c @@ -235,9 +235,17 @@ int mln_websocket_match(mln_websocket_t *ws) static int mln_websocket_match_iterate_handler(void *key, void *val, void *data) { + mln_reg_match_result_t *res = NULL; mln_string_t *tmp = mln_http_field_get((mln_http_t *)data, (mln_string_t *)key); if (tmp == NULL) return -1; - if (val != NULL && mln_reg_match((mln_string_t *)val, tmp, NULL, NULL) <= 0) return -1; + if ((res = mln_reg_match_result_new(1)) == NULL) { + return -1; + } + if (val != NULL && mln_reg_match((mln_string_t *)val, tmp, res) <= 0) { + mln_reg_match_result_free(res); + return -1; + } + mln_reg_match_result_free(res); return 0; }