summaryrefslogtreecommitdiff
path: root/yarp/regexp.c
diff options
context:
space:
mode:
authorJemma Issroff <[email protected]>2023-06-20 11:53:02 -0400
committerTakashi Kokubun <[email protected]>2023-06-21 11:25:39 -0700
commitcc7f765f2c12a9ba050b0d95f9d85f3923c8d944 (patch)
tree5b5c60c1950240900dc749773083324a0e39748a /yarp/regexp.c
parent08478fefca827276d68e33f2e6a5940c85957a51 (diff)
[Feature #19741] Sync all files in yarp
This commit is the initial sync of all files from ruby/yarp into ruby/ruby. Notably, it does the following: * Sync all ruby/yarp/lib/ files to ruby/ruby/lib/yarp * Sync all ruby/yarp/src/ files to ruby/ruby/yarp/ * Sync all ruby/yarp/test/ files to ruby/ruby/test/yarp
Notes
Notes: Merged: https://github.com/ruby/ruby/pull/7964
Diffstat (limited to 'yarp/regexp.c')
-rw-r--r--yarp/regexp.c518
1 files changed, 518 insertions, 0 deletions
diff --git a/yarp/regexp.c b/yarp/regexp.c
new file mode 100644
index 0000000000..e1de6de7c1
--- /dev/null
+++ b/yarp/regexp.c
@@ -0,0 +1,518 @@
+#include "yarp/regexp.h"
+
+// This is the parser that is going to handle parsing regular expressions.
+typedef struct {
+ const char *start;
+ const char *cursor;
+ const char *end;
+ yp_string_list_t *named_captures;
+} yp_regexp_parser_t;
+
+// This initializes a new parser with the given source.
+static void
+yp_regexp_parser_init(yp_regexp_parser_t *parser, const char *start, const char *end, yp_string_list_t *named_captures) {
+ *parser = (yp_regexp_parser_t) {
+ .start = start,
+ .cursor = start,
+ .end = end,
+ .named_captures = named_captures
+ };
+}
+
+// This appends a new string to the list of named captures.
+static void
+yp_regexp_parser_named_capture(yp_regexp_parser_t *parser, const char *start, const char *end) {
+ yp_string_t string;
+ yp_string_shared_init(&string, start, end);
+ yp_string_list_append(parser->named_captures, &string);
+ yp_string_free(&string);
+}
+
+// Returns true if the next character is the end of the source.
+static inline bool
+yp_regexp_char_is_eof(yp_regexp_parser_t *parser) {
+ return parser->cursor >= parser->end;
+}
+
+// Optionally accept a char and consume it if it exists.
+static inline bool
+yp_regexp_char_accept(yp_regexp_parser_t *parser, char value) {
+ if (!yp_regexp_char_is_eof(parser) && *parser->cursor == value) {
+ parser->cursor++;
+ return true;
+ }
+ return false;
+}
+
+// Expect a character to be present and consume it.
+static inline bool
+yp_regexp_char_expect(yp_regexp_parser_t *parser, char value) {
+ if (!yp_regexp_char_is_eof(parser) && *parser->cursor == value) {
+ parser->cursor++;
+ return true;
+ }
+ return false;
+}
+
+// This advances the current token to the next instance of the given character.
+static bool
+yp_regexp_char_find(yp_regexp_parser_t *parser, char value) {
+ const char *end = (const char *) memchr(parser->cursor, value, (size_t) (parser->end - parser->cursor));
+ if (end == NULL) {
+ return false;
+ }
+
+ parser->cursor = end + 1;
+ return true;
+}
+
+// Range quantifiers are a special class of quantifiers that look like
+//
+// * {digit}
+// * {digit,}
+// * {digit,digit}
+// * {,digit}
+//
+// Unfortunately, if there are any spaces in between, then this just becomes a
+// regular character match expression and we have to backtrack. So when this
+// function first starts running, we'll create a "save" point and then attempt
+// to parse the quantifier. If it fails, we'll restore the save point and
+// return.
+//
+// The properly track everything, we're going to build a little state machine.
+// It looks something like the following:
+//
+// ┌───────┐ ┌─────────┐ ────────────┐
+// ──── lbrace ───> │ start │ ──── digit ───> │ minimum │ │
+// └───────┘ └─────────┘ <─── digit ─┘
+// │ │ │
+// ┌───────┐ │ │ rbrace
+// │ comma │ <───── comma ┌──── comma ───────┘ │
+// └───────┘ V V
+// │ ┌─────────┐ ┌─────────┐
+// └── digit ──> │ maximum │ ── rbrace ──> │| final |│
+// └─────────┘ └─────────┘
+// │ ^
+// └─ digit ─┘
+//
+// Note that by the time we've hit this function, the lbrace has already been
+// consumed so we're in the start state.
+static bool
+yp_regexp_parse_range_quantifier(yp_regexp_parser_t *parser) {
+ const char *savepoint = parser->cursor;
+
+ enum {
+ YP_REGEXP_RANGE_QUANTIFIER_STATE_START,
+ YP_REGEXP_RANGE_QUANTIFIER_STATE_MINIMUM,
+ YP_REGEXP_RANGE_QUANTIFIER_STATE_MAXIMUM,
+ YP_REGEXP_RANGE_QUANTIFIER_STATE_COMMA
+ } state = YP_REGEXP_RANGE_QUANTIFIER_STATE_START;
+
+ while (1) {
+ switch (state) {
+ case YP_REGEXP_RANGE_QUANTIFIER_STATE_START:
+ switch (*parser->cursor) {
+ case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9':
+ parser->cursor++;
+ state = YP_REGEXP_RANGE_QUANTIFIER_STATE_MINIMUM;
+ break;
+ case ',':
+ parser->cursor++;
+ state = YP_REGEXP_RANGE_QUANTIFIER_STATE_COMMA;
+ break;
+ default:
+ parser->cursor = savepoint;
+ return true;
+ }
+ break;
+ case YP_REGEXP_RANGE_QUANTIFIER_STATE_MINIMUM:
+ switch (*parser->cursor) {
+ case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9':
+ parser->cursor++;
+ break;
+ case ',':
+ parser->cursor++;
+ state = YP_REGEXP_RANGE_QUANTIFIER_STATE_MAXIMUM;
+ break;
+ case '}':
+ parser->cursor++;
+ return true;
+ default:
+ parser->cursor = savepoint;
+ return true;
+ }
+ break;
+ case YP_REGEXP_RANGE_QUANTIFIER_STATE_COMMA:
+ switch (*parser->cursor) {
+ case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9':
+ parser->cursor++;
+ state = YP_REGEXP_RANGE_QUANTIFIER_STATE_MAXIMUM;
+ break;
+ default:
+ parser->cursor = savepoint;
+ return true;
+ }
+ break;
+ case YP_REGEXP_RANGE_QUANTIFIER_STATE_MAXIMUM:
+ switch (*parser->cursor) {
+ case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9':
+ parser->cursor++;
+ break;
+ case '}':
+ parser->cursor++;
+ return true;
+ default:
+ parser->cursor = savepoint;
+ return true;
+ }
+ break;
+ }
+ }
+
+ return true;
+}
+
+// quantifier : star-quantifier
+// | plus-quantifier
+// | optional-quantifier
+// | range-quantifier
+// | <empty>
+// ;
+static bool
+yp_regexp_parse_quantifier(yp_regexp_parser_t *parser) {
+ switch (*parser->cursor) {
+ case '*':
+ case '+':
+ case '?':
+ parser->cursor++;
+ return true;
+ case '{':
+ parser->cursor++;
+ return yp_regexp_parse_range_quantifier(parser);
+ default:
+ // In this case there is no quantifier.
+ return true;
+ }
+}
+
+// match-posix-class : '[' '[' ':' '^'? CHAR+ ':' ']' ']'
+// ;
+static bool
+yp_regexp_parse_posix_class(yp_regexp_parser_t *parser) {
+ if (!yp_regexp_char_expect(parser, ':')) {
+ return false;
+ }
+
+ yp_regexp_char_accept(parser, '^');
+
+ return (
+ yp_regexp_char_find(parser, ':') &&
+ yp_regexp_char_expect(parser, ']') &&
+ yp_regexp_char_expect(parser, ']')
+ );
+}
+
+// Forward declaration because character sets can be nested.
+static bool
+yp_regexp_parse_lbracket(yp_regexp_parser_t *parser);
+
+// match-char-set : '[' '^'? (match-range | match-char)* ']'
+// ;
+static bool
+yp_regexp_parse_character_set(yp_regexp_parser_t *parser) {
+ yp_regexp_char_accept(parser, '^');
+
+ while (!yp_regexp_char_is_eof(parser) && *parser->cursor != ']') {
+ switch (*parser->cursor++) {
+ case '[':
+ yp_regexp_parse_lbracket(parser);
+ break;
+ case '\\':
+ if (!yp_regexp_char_is_eof(parser)) {
+ parser->cursor++;
+ }
+ break;
+ default:
+ // do nothing, we've already advanced the cursor
+ break;
+ }
+ }
+
+ return yp_regexp_char_expect(parser, ']');
+}
+
+// A left bracket can either mean a POSIX class or a character set.
+static bool
+yp_regexp_parse_lbracket(yp_regexp_parser_t *parser) {
+ const char *reset = parser->cursor;
+
+ if ((parser->cursor + 2 < parser->end) && parser->cursor[0] == '[' && parser->cursor[1] == ':') {
+ parser->cursor++;
+ if (yp_regexp_parse_posix_class(parser)) return true;
+
+ parser->cursor = reset;
+ }
+
+ return yp_regexp_parse_character_set(parser);
+}
+
+// Forward declaration here since parsing groups needs to go back up the grammar
+// to parse expressions within them.
+static bool
+yp_regexp_parse_expression(yp_regexp_parser_t *parser);
+
+// These are the states of the options that are configurable on the regular
+// expression (or from within a group).
+typedef enum {
+ YP_REGEXP_OPTION_STATE_INVALID,
+ YP_REGEXP_OPTION_STATE_TOGGLEABLE,
+ YP_REGEXP_OPTION_STATE_ADDABLE
+} yp_regexp_option_state_t;
+
+// This is the set of options that are configurable on the regular expression.
+typedef yp_regexp_option_state_t yp_regexp_options_t[128];
+
+// Initialize a new set of options to their default values.
+static void
+yp_regexp_options_init(yp_regexp_options_t *options) {
+ memset(options, YP_REGEXP_OPTION_STATE_INVALID, sizeof(yp_regexp_option_state_t) * 128);
+ (*options)['i'] = YP_REGEXP_OPTION_STATE_TOGGLEABLE;
+ (*options)['m'] = YP_REGEXP_OPTION_STATE_TOGGLEABLE;
+ (*options)['x'] = YP_REGEXP_OPTION_STATE_TOGGLEABLE;
+ (*options)['d'] = YP_REGEXP_OPTION_STATE_ADDABLE;
+ (*options)['a'] = YP_REGEXP_OPTION_STATE_ADDABLE;
+ (*options)['u'] = YP_REGEXP_OPTION_STATE_ADDABLE;
+}
+
+// Attempt to add the given option to the set of options. Returns true if it was
+// added, false if it was already present.
+static bool
+yp_regexp_options_add(yp_regexp_options_t *options, unsigned char option) {
+ switch ((*options)[option]) {
+ case YP_REGEXP_OPTION_STATE_INVALID:
+ return false;
+ case YP_REGEXP_OPTION_STATE_TOGGLEABLE:
+ case YP_REGEXP_OPTION_STATE_ADDABLE:
+ (*options)[option] = YP_REGEXP_OPTION_STATE_INVALID;
+ return true;
+ }
+ return false;
+}
+
+// Attempt to remove the given option from the set of options. Returns true if
+// it was removed, false if it was already absent.
+static bool
+yp_regexp_options_remove(yp_regexp_options_t *options, unsigned char option) {
+ switch ((*options)[option]) {
+ case YP_REGEXP_OPTION_STATE_INVALID:
+ case YP_REGEXP_OPTION_STATE_ADDABLE:
+ return false;
+ case YP_REGEXP_OPTION_STATE_TOGGLEABLE:
+ (*options)[option] = YP_REGEXP_OPTION_STATE_INVALID;
+ return true;
+ }
+ return false;
+}
+
+// Groups can have quite a few different patterns for syntax. They basically
+// just wrap a set of expressions, but they can potentially have options after a
+// question mark. If there _isn't_ a question mark, then it's just a set of
+// expressions. If there _is_, then here are the options:
+//
+// * (?#...) - inline comments
+// * (?:subexp) - non-capturing group
+// * (?=subexp) - positive lookahead
+// * (?!subexp) - negative lookahead
+// * (?>subexp) - atomic group
+// * (?~subexp) - absence operator
+// * (?<=subexp) - positive lookbehind
+// * (?<!subexp) - negative lookbehind
+// * (?<name>subexp) - named capturing group
+// * (?'name'subexp) - named capturing group
+// * (?(cond)yes-subexp) - conditional expression
+// * (?(cond)yes-subexp|no-subexp) - conditional expression
+// * (?imxdau-imx) - turn on and off configuration
+// * (?imxdau-imx:subexp) - turn on and off configuration for an expression
+//
+static bool
+yp_regexp_parse_group(yp_regexp_parser_t *parser) {
+ // First, parse any options for the group.
+ if (yp_regexp_char_accept(parser, '?')) {
+ yp_regexp_options_t options;
+ yp_regexp_options_init(&options);
+
+ switch (*parser->cursor) {
+ case '#': { // inline comments
+ bool found = yp_regexp_char_find(parser, ')');
+ // the close paren we found is escaped, we need to find another
+ while (parser->start <= parser->cursor - 2 && *(parser->cursor - 2) == '\\') {
+ found = yp_regexp_char_find(parser, ')');
+ }
+ return found;
+ }
+ case ':': // non-capturing group
+ case '=': // positive lookahead
+ case '!': // negative lookahead
+ case '>': // atomic group
+ case '~': // absence operator
+ parser->cursor++;
+ break;
+ case '<':
+ parser->cursor++;
+ if (yp_regexp_char_is_eof(parser)) {
+ return false;
+ }
+
+ switch (*parser->cursor) {
+ case '=': // positive lookbehind
+ case '!': // negative lookbehind
+ parser->cursor++;
+ break;
+ default: { // named capture group
+ const char *start = parser->cursor;
+ if (!yp_regexp_char_find(parser, '>')) {
+ return false;
+ }
+ yp_regexp_parser_named_capture(parser, start, parser->cursor - 1);
+ break;
+ }
+ }
+ break;
+ case '\'': { // named capture group
+ const char *start = ++parser->cursor;
+ if (!yp_regexp_char_find(parser, '\'')) {
+ return false;
+ }
+
+ yp_regexp_parser_named_capture(parser, start, parser->cursor - 1);
+ break;
+ }
+ case '(': // conditional expression
+ if (!yp_regexp_char_find(parser, ')')) {
+ return false;
+ }
+ break;
+ case 'i': case 'm': case 'x': case 'd': case 'a': case 'u': // options
+ while (!yp_regexp_char_is_eof(parser) && *parser->cursor != '-' && *parser->cursor != ':' && *parser->cursor != ')') {
+ if (!yp_regexp_options_add(&options, (unsigned char) *parser->cursor)) {
+ return false;
+ }
+ parser->cursor++;
+ }
+
+ if (yp_regexp_char_is_eof(parser)) {
+ return false;
+ }
+
+ // If we hit a -, then we're done parsing options.
+ if (*parser->cursor != '-') break;
+
+ // Otherwise, fallthrough to the - case.
+ /* fallthrough */
+ case '-':
+ parser->cursor++;
+ while (!yp_regexp_char_is_eof(parser) && *parser->cursor != ':' && *parser->cursor != ')') {
+ if (!yp_regexp_options_remove(&options, (unsigned char) *parser->cursor)) {
+ return false;
+ }
+ parser->cursor++;
+ }
+
+ if (yp_regexp_char_is_eof(parser)) {
+ return false;
+ }
+ break;
+ default:
+ return false;
+ }
+ }
+
+ // Now, parse the expressions within this group.
+ while (!yp_regexp_char_is_eof(parser) && *parser->cursor != ')') {
+ if (!yp_regexp_parse_expression(parser)) {
+ return false;
+ }
+ yp_regexp_char_accept(parser, '|');
+ }
+
+ // Finally, make sure we have a closing parenthesis.
+ return yp_regexp_char_expect(parser, ')');
+}
+
+// item : anchor
+// | match-posix-class
+// | match-char-set
+// | match-char-class
+// | match-char-prop
+// | match-char
+// | match-any
+// | group
+// | quantified
+// ;
+static bool
+yp_regexp_parse_item(yp_regexp_parser_t *parser) {
+ switch (*parser->cursor++) {
+ case '^':
+ case '$':
+ return true;
+ case '\\':
+ if (!yp_regexp_char_is_eof(parser)) {
+ parser->cursor++;
+ }
+ return yp_regexp_parse_quantifier(parser);
+ case '(':
+ return yp_regexp_parse_group(parser) && yp_regexp_parse_quantifier(parser);
+ case '[':
+ return yp_regexp_parse_lbracket(parser) && yp_regexp_parse_quantifier(parser);
+ default:
+ return yp_regexp_parse_quantifier(parser);
+ }
+}
+
+// expression : item+
+// ;
+static bool
+yp_regexp_parse_expression(yp_regexp_parser_t *parser) {
+ if (!yp_regexp_parse_item(parser)) {
+ return false;
+ }
+
+ while (!yp_regexp_char_is_eof(parser) && *parser->cursor != ')' && *parser->cursor != '|') {
+ if (!yp_regexp_parse_item(parser)) {
+ return false;
+ }
+ }
+
+ return true;
+}
+
+// pattern : EOF
+// | expression EOF
+// | expression '|' pattern
+// ;
+static bool
+yp_regexp_parse_pattern(yp_regexp_parser_t *parser) {
+ return (
+ (
+ // Exit early if the pattern is empty.
+ yp_regexp_char_is_eof(parser) ||
+ // Parse the first expression in the pattern.
+ yp_regexp_parse_expression(parser)
+ ) &&
+ (
+ // Return now if we've parsed the entire pattern.
+ yp_regexp_char_is_eof(parser) ||
+ // Otherwise, we should have a pipe character.
+ (yp_regexp_char_expect(parser, '|') && yp_regexp_parse_pattern(parser))
+ )
+ );
+}
+
+// Parse a regular expression and extract the names of all of the named capture
+// groups.
+YP_EXPORTED_FUNCTION bool
+yp_regexp_named_capture_group_names(const char *source, size_t size, yp_string_list_t *named_captures) {
+ yp_regexp_parser_t parser;
+ yp_regexp_parser_init(&parser, source, source + size, named_captures);
+ return yp_regexp_parse_pattern(&parser);
+}