diff options
author | robot-contrib <robot-contrib@yandex-team.com> | 2025-05-12 22:18:09 +0300 |
---|---|---|
committer | robot-contrib <robot-contrib@yandex-team.com> | 2025-05-12 22:33:22 +0300 |
commit | d22c08a7d678e53cb36b1a6068f0e3c14a1713b3 (patch) | |
tree | 8702e6f4148d64b22a1915a38f0cdc815e76efb7 /contrib/restricted/aws/aws-c-sdkutils/source/endpoints_regex.c | |
parent | 8925676461a506dd08a6f6ad0c37cb858df5c4da (diff) | |
download | ydb-d22c08a7d678e53cb36b1a6068f0e3c14a1713b3.tar.gz |
Update contrib/restricted/aws/aws-c-sdkutils to 0.1.16
commit_hash:e8c3b737590c152923b35ced443a6b28db57d1d1
Diffstat (limited to 'contrib/restricted/aws/aws-c-sdkutils/source/endpoints_regex.c')
-rw-r--r-- | contrib/restricted/aws/aws-c-sdkutils/source/endpoints_regex.c | 435 |
1 files changed, 435 insertions, 0 deletions
diff --git a/contrib/restricted/aws/aws-c-sdkutils/source/endpoints_regex.c b/contrib/restricted/aws/aws-c-sdkutils/source/endpoints_regex.c new file mode 100644 index 00000000000..38dd369252f --- /dev/null +++ b/contrib/restricted/aws/aws-c-sdkutils/source/endpoints_regex.c @@ -0,0 +1,435 @@ +/** + * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0. + */ + +#include <aws/sdkutils/private/endpoints_regex.h> + +/* + * Minimal regex implementation. + * Inspired by + * https://www.cs.princeton.edu/courses/archive/spr09/cos333/beautiful.html and + * https://github.com/kokke/tiny-regex-c. + * + * Why write our own regex implementation? + * Unfortunately, state of cross-platform regex support for c is rather limited. + * Posix has regex support, but implementation support varies cross platform. + * Windows supports regex, but only exposes it through C++ interface. + * For 3p implementations tiny-regex-c comes closest to what we need, but has + * several deal-breaking limitations, ex. not being thread safe, lack of + * alternations support. + * Other 3p C implementations are very bloated for what we need. + * Hence, since we need a very minimal regex support for endpoint resolution we + * just implement our own. + * + * What is supported? + * - ascii only matching (no unicode or other encoding support) + * - multithread safe iterative matching (stack friendly, since this is + * typically called deep in call stack) + * - char matching (plain ascii chars, alpha/digit wildcards) + * - star and plus (refer to limitations sections for limitations on how they work) + * - alternation groups + * + * Limitations? + * - star and plus are greedy (match as much as they can), but do not backtrace. + * This is major deviation from how regex matching should work. + * Note: regions in aws have a predefined pattern where sections are separated + * by '-', so current implementation just matches until it hits separator. + * - grouping using ( and ) is only supported for alternations. + * - regex must match the whole text, i.e. start with ^ and end with $ + * - features not called out above are not supported + * - alternations pick first occurrence that matches and do not backtrack to see + * if there are any other occurrences + * + * Examples + * current implementation is targeted towards matching typical aws region + * patterns like "^(us|eu|ap|sa|ca|me|af|il)\\-\\w+\\-\\d+$" (aws partition) or + * "^us\\-gov\\-\\w+\\-\\d+$" (aws gov partition). + * All current regions follow + * "country code(2 chars)-meta(like gov or iso, optional)-direction-digit" + * and implementation should provide enough features to match those regions. + * Patterns that would not match correctly are things like "a*a" (star will + * exhaustively match a and will not give last a back) or (ab|abc), which will + * not match abc because alternation will lock into ab. + */ + +enum regex_symbol_type { + AWS_ENDPOINTS_REGEX_SYMBOL_DOT, + AWS_ENDPOINTS_REGEX_SYMBOL_STAR, + AWS_ENDPOINTS_REGEX_SYMBOL_PLUS, + AWS_ENDPOINTS_REGEX_SYMBOL_DIGIT, + AWS_ENDPOINTS_REGEX_SYMBOL_ALPHA, + AWS_ENDPOINTS_REGEX_SYMBOL_CHAR, + AWS_ENDPOINTS_REGEX_SYMBOL_ALTERNATION_GROUP, +}; + +struct aws_endpoints_regex_symbol { + enum regex_symbol_type type; + + union { + uint8_t ch; + struct aws_string *alternation; + } info; +}; + +struct aws_endpoints_regex { + struct aws_array_list symbols; +}; + +/* Somewhat arbitrary limits on size of regex and text to avoid overly large + * inputs. */ +enum { + s_max_regex_length = 60, + s_max_text_length = 50, + s_max_elements_per_alteration = 20, +}; + +static void s_clean_up_symbols(struct aws_array_list *symbols) { + for (size_t i = 0; i < aws_array_list_length(symbols); ++i) { + struct aws_endpoints_regex_symbol *element = NULL; + aws_array_list_get_at_ptr(symbols, (void **)&element, i); + + if (element->type == AWS_ENDPOINTS_REGEX_SYMBOL_ALTERNATION_GROUP) { + aws_string_destroy(element->info.alternation); + } + } +} + +int s_validate_regex(const struct aws_endpoints_regex *regex) { + AWS_FATAL_PRECONDITION(regex != NULL); + + for (size_t sym_idx = 0; sym_idx < aws_array_list_length(®ex->symbols); ++sym_idx) { + struct aws_endpoints_regex_symbol *symbol = NULL; + aws_array_list_get_at_ptr(®ex->symbols, (void **)&symbol, sym_idx); + + if (symbol->type == AWS_ENDPOINTS_REGEX_SYMBOL_PLUS || symbol->type == AWS_ENDPOINTS_REGEX_SYMBOL_STAR) { + + /* first symbol */ + if (sym_idx == 0) { + AWS_LOGF_ERROR( + AWS_LS_SDKUTILS_ENDPOINTS_REGEX, "Invalid regex pattern. Regex cannot start with star or plus."); + return aws_raise_error(AWS_ERROR_INVALID_ARGUMENT); + } + + struct aws_endpoints_regex_symbol *prev_symbol = NULL; + aws_array_list_get_at_ptr(®ex->symbols, (void **)&prev_symbol, sym_idx - 1); + + /* reasonable symbol before */ + enum regex_symbol_type prev_type = prev_symbol->type; + if (!(prev_type == AWS_ENDPOINTS_REGEX_SYMBOL_DOT || prev_type == AWS_ENDPOINTS_REGEX_SYMBOL_DIGIT || + prev_type == AWS_ENDPOINTS_REGEX_SYMBOL_ALPHA || prev_type == AWS_ENDPOINTS_REGEX_SYMBOL_CHAR || + prev_type == AWS_ENDPOINTS_REGEX_SYMBOL_ALTERNATION_GROUP)) { + AWS_LOGF_ERROR( + AWS_LS_SDKUTILS_ENDPOINTS_REGEX, + "Unsupported regex pattern. Star or plus after unsupported character."); + return aws_raise_error(AWS_ERROR_SDKUTILS_ENDPOINTS_UNSUPPORTED_REGEX); + } + + /* ends with - delimiter */ + if (sym_idx != aws_array_list_length(®ex->symbols) - 1) { + struct aws_endpoints_regex_symbol *next_symbol = NULL; + aws_array_list_get_at_ptr(®ex->symbols, (void **)&next_symbol, sym_idx + 1); + + if (next_symbol->type != AWS_ENDPOINTS_REGEX_SYMBOL_CHAR || next_symbol->info.ch != '-') { + AWS_LOGF_ERROR( + AWS_LS_SDKUTILS_ENDPOINTS_REGEX, + "Unsupported regex pattern. Star or plus must be followed by - delimiter."); + return aws_raise_error(AWS_ERROR_SDKUTILS_ENDPOINTS_UNSUPPORTED_REGEX); + } + } + } else if (symbol->type == AWS_ENDPOINTS_REGEX_SYMBOL_ALTERNATION_GROUP) { + + struct aws_byte_cursor alternation = aws_byte_cursor_from_string(symbol->info.alternation); + + /* Not empty */ + if (alternation.len == 0) { + AWS_LOGF_ERROR(AWS_LS_SDKUTILS_ENDPOINTS_REGEX, "Invalid regex pattern. Empty group."); + return aws_raise_error(AWS_ERROR_INVALID_ARGUMENT); + } + + /* Verify that group is only used for alternation. */ + for (size_t i = 0; i < alternation.len; ++i) { + if (!aws_isalnum(alternation.ptr[i]) && alternation.ptr[i] != '|') { + AWS_LOGF_ERROR( + AWS_LS_SDKUTILS_ENDPOINTS_REGEX, + "Unsupported regex pattern. Only alternation groups are supported."); + return aws_raise_error(AWS_ERROR_SDKUTILS_ENDPOINTS_UNSUPPORTED_REGEX); + } + } + + /* alternation elements are unique and not subsets of each other */ + struct aws_byte_cursor elements[s_max_elements_per_alteration]; + size_t num_elements = 0; + struct aws_byte_cursor split = {0}; + while (aws_byte_cursor_next_split(&alternation, '|', &split)) { + if (num_elements == s_max_elements_per_alteration) { + AWS_LOGF_ERROR( + AWS_LS_SDKUTILS_ENDPOINTS_REGEX, "Unsupported regex pattern. Too many element in alternation"); + return aws_raise_error(AWS_ERROR_SDKUTILS_ENDPOINTS_UNSUPPORTED_REGEX); + } + + if (split.len == 0) { + AWS_LOGF_ERROR( + AWS_LS_SDKUTILS_ENDPOINTS_REGEX, "Invalid regex pattern. Alternation element cannot be empty"); + return aws_raise_error(AWS_ERROR_INVALID_ARGUMENT); + } + + elements[num_elements] = split; + ++num_elements; + } + + struct aws_byte_cursor input; + struct aws_byte_cursor prefix; + for (size_t i = 0; i < num_elements; ++i) { + for (size_t j = i + 1; j < num_elements; ++j) { + + if (elements[i].len <= elements[j].len) { + input = elements[j]; + prefix = elements[i]; + } else { + input = elements[i]; + prefix = elements[j]; + } + + if (aws_byte_cursor_starts_with(&input, &prefix)) { + AWS_LOGF_ERROR( + AWS_LS_SDKUTILS_ENDPOINTS_REGEX, + "Unsupported regex pattern. One alternation element cannot be a prefix of another " + "element."); + return aws_raise_error(AWS_ERROR_SDKUTILS_ENDPOINTS_UNSUPPORTED_REGEX); + } + } + } + } + } + + return AWS_OP_SUCCESS; +} + +struct aws_endpoints_regex *aws_endpoints_regex_new( + struct aws_allocator *allocator, + struct aws_byte_cursor regex_pattern) { + + if (regex_pattern.len == 0 || regex_pattern.len > s_max_regex_length) { + AWS_LOGF_ERROR( + AWS_LS_SDKUTILS_ENDPOINTS_REGEX, + "Invalid regex pattern size. Must be between 1 and %d", + s_max_regex_length); + aws_raise_error(AWS_ERROR_INVALID_ARGUMENT); + return NULL; + } + + if (regex_pattern.ptr[0] != '^' || regex_pattern.ptr[regex_pattern.len - 1] != '$') { + AWS_LOGF_ERROR( + AWS_LS_SDKUTILS_ENDPOINTS_REGEX, + "Unsupported regex pattern. Supported patterns must match the whole text."); + aws_raise_error(AWS_ERROR_SDKUTILS_ENDPOINTS_UNSUPPORTED_REGEX); + return NULL; + } + + /* Ignore begin/end chars */ + aws_byte_cursor_advance(®ex_pattern, 1); + --regex_pattern.len; + + struct aws_endpoints_regex *re = aws_mem_calloc(allocator, 1, sizeof(struct aws_endpoints_regex)); + aws_array_list_init_dynamic(&re->symbols, allocator, regex_pattern.len, sizeof(struct aws_endpoints_regex_symbol)); + + while (regex_pattern.len > 0) { + uint8_t ch = regex_pattern.ptr[0]; + aws_byte_cursor_advance(®ex_pattern, 1); + + struct aws_endpoints_regex_symbol symbol; + switch (ch) { + case '.': + symbol.type = AWS_ENDPOINTS_REGEX_SYMBOL_DOT; + break; + case '*': + symbol.type = AWS_ENDPOINTS_REGEX_SYMBOL_STAR; + break; + case '+': + symbol.type = AWS_ENDPOINTS_REGEX_SYMBOL_PLUS; + break; + case '\\': + if (regex_pattern.len == 0) { + AWS_LOGF_ERROR( + AWS_LS_SDKUTILS_ENDPOINTS_REGEX, "Invalid regex pattern. Pattern ends with escape character."); + aws_raise_error(AWS_ERROR_INVALID_ARGUMENT); + goto on_error; + } + switch (regex_pattern.ptr[0]) { + /* Predefined patterns */ + case 'd': + symbol.type = AWS_ENDPOINTS_REGEX_SYMBOL_DIGIT; + break; + case 'w': + symbol.type = AWS_ENDPOINTS_REGEX_SYMBOL_ALPHA; + break; + + /* Escaped chars, ex. * or + */ + default: + symbol.type = AWS_ENDPOINTS_REGEX_SYMBOL_CHAR; + symbol.info.ch = regex_pattern.ptr[0]; + break; + } + aws_byte_cursor_advance(®ex_pattern, 1); + break; + case '(': { + struct aws_byte_cursor group = {0}; + if (!aws_byte_cursor_next_split(®ex_pattern, ')', &group)) { + AWS_LOGF_ERROR(AWS_LS_SDKUTILS_ENDPOINTS_REGEX, "Invalid regex pattern. Invalid group syntax."); + aws_raise_error(AWS_ERROR_INVALID_ARGUMENT); + goto on_error; + } + + aws_byte_cursor_advance(®ex_pattern, group.len); + if (regex_pattern.len == 0 || regex_pattern.ptr[0] != ')') { + AWS_LOGF_ERROR( + AWS_LS_SDKUTILS_ENDPOINTS_REGEX, "Invalid regex pattern. Missing closing parenthesis."); + aws_raise_error(AWS_ERROR_INVALID_ARGUMENT); + goto on_error; + } + aws_byte_cursor_advance(®ex_pattern, 1); + + symbol.type = AWS_ENDPOINTS_REGEX_SYMBOL_ALTERNATION_GROUP; + symbol.info.alternation = aws_string_new_from_cursor(allocator, &group); + break; + } + + default: { + if (!aws_isalnum(ch)) { + AWS_LOGF_ERROR( + AWS_LS_SDKUTILS_ENDPOINTS_REGEX, "Unsupported regex pattern. Unknown character %c", ch); + aws_raise_error(AWS_ERROR_SDKUTILS_ENDPOINTS_UNSUPPORTED_REGEX); + goto on_error; + } + + symbol.type = AWS_ENDPOINTS_REGEX_SYMBOL_CHAR; + symbol.info.ch = ch; + break; + } + } + + aws_array_list_push_back(&re->symbols, &symbol); + } + + if (s_validate_regex(re)) { + goto on_error; + } + + return re; + +on_error: + aws_endpoints_regex_destroy(re); + return NULL; +} + +void aws_endpoints_regex_destroy(struct aws_endpoints_regex *regex) { + if (regex == NULL) { + return; + } + + struct aws_allocator *allocator = regex->symbols.alloc; + s_clean_up_symbols(®ex->symbols); + aws_array_list_clean_up(®ex->symbols); + aws_mem_release(allocator, ®ex->symbols); +} + +static bool s_match_one(const struct aws_endpoints_regex_symbol *symbol, struct aws_byte_cursor *text) { + if (text->len == 0) { + return false; + } + + uint8_t ch = text->ptr[0]; + switch (symbol->type) { + case AWS_ENDPOINTS_REGEX_SYMBOL_ALPHA: + return aws_isalpha(ch); + case AWS_ENDPOINTS_REGEX_SYMBOL_DIGIT: + return aws_isdigit(ch); + case AWS_ENDPOINTS_REGEX_SYMBOL_CHAR: + return ch == symbol->info.ch; + case AWS_ENDPOINTS_REGEX_SYMBOL_DOT: + return true; + default: + AWS_FATAL_ASSERT(true); + } + + return false; +} + +static void s_match_star(const struct aws_endpoints_regex_symbol *symbol, struct aws_byte_cursor *text) { + while (s_match_one(symbol, text)) { + aws_byte_cursor_advance(text, 1); + } +} + +static bool s_match_plus(const struct aws_endpoints_regex_symbol *symbol, struct aws_byte_cursor *text) { + if (!s_match_one(symbol, text)) { + return false; + } + + aws_byte_cursor_advance(text, 1); + s_match_star(symbol, text); + return true; +} + +int aws_endpoints_regex_match(const struct aws_endpoints_regex *regex, struct aws_byte_cursor text) { + AWS_PRECONDITION(regex); + + if (text.len == 0 || text.len > s_max_text_length) { + AWS_LOGF_ERROR( + AWS_LS_SDKUTILS_ENDPOINTS_REGEX, "Invalid text size. Must be between 1 and %d", s_max_text_length); + return aws_raise_error(AWS_ERROR_INVALID_ARGUMENT); + } + + for (size_t i = 0; i < aws_array_list_length(®ex->symbols); ++i) { + struct aws_endpoints_regex_symbol *symbol = NULL; + aws_array_list_get_at_ptr(®ex->symbols, (void **)&symbol, i); + + /* looks forward to check if symbol has * or + modifier */ + if (i + 1 < aws_array_list_length(®ex->symbols)) { + struct aws_endpoints_regex_symbol *next_symbol = NULL; + aws_array_list_get_at_ptr(®ex->symbols, (void **)&next_symbol, i + 1); + + if (next_symbol->type == AWS_ENDPOINTS_REGEX_SYMBOL_STAR || + next_symbol->type == AWS_ENDPOINTS_REGEX_SYMBOL_PLUS) { + if (next_symbol->type == AWS_ENDPOINTS_REGEX_SYMBOL_STAR) { + s_match_star(symbol, &text); + } else if (next_symbol->type == AWS_ENDPOINTS_REGEX_SYMBOL_PLUS) { + if (!s_match_plus(symbol, &text)) { + return aws_raise_error(AWS_ERROR_SDKUTILS_ENDPOINTS_REGEX_NO_MATCH); + } + } + ++i; + continue; + } + } + + switch (symbol->type) { + case AWS_ENDPOINTS_REGEX_SYMBOL_ALTERNATION_GROUP: { + struct aws_byte_cursor variant = {0}; + struct aws_byte_cursor alternation = aws_byte_cursor_from_string(symbol->info.alternation); + size_t chars_in_match = 0; + while (aws_byte_cursor_next_split(&alternation, '|', &variant)) { + if (aws_byte_cursor_starts_with(&text, &variant)) { + chars_in_match = variant.len; + break; + } + } + + if (chars_in_match == 0) { + return aws_raise_error(AWS_ERROR_SDKUTILS_ENDPOINTS_REGEX_NO_MATCH); + } + aws_byte_cursor_advance(&text, chars_in_match); + break; + } + default: + if (!s_match_one(symbol, &text)) { + return aws_raise_error(AWS_ERROR_SDKUTILS_ENDPOINTS_REGEX_NO_MATCH); + } + aws_byte_cursor_advance(&text, 1); + break; + } + } + + return AWS_OP_SUCCESS; +} |