diff options
author | yui-knk <[email protected]> | 2024-02-10 10:05:18 +0900 |
---|---|---|
committer | Yuichiro Kaneko <[email protected]> | 2024-02-21 08:06:48 +0900 |
commit | e7ab5d891c3272e72caef6879e90ad8ae4e13dea (patch) | |
tree | 018b9b38b1ef3aaf87b6a6a8baf8a72d78460b27 | |
parent | 97d4363d3b4125d30120ab07cb69d3fc34b44bfc (diff) |
Introduce NODE_REGX to manage regexp literal
-rw-r--r-- | ast.c | 2 | ||||
-rw-r--r-- | common.mk | 1 | ||||
-rw-r--r-- | compile.c | 20 | ||||
-rw-r--r-- | internal/ruby_parser.h | 1 | ||||
-rw-r--r-- | misc/lldb_rb/utils.py | 2 | ||||
-rw-r--r-- | node.c | 6 | ||||
-rw-r--r-- | node_dump.c | 11 | ||||
-rw-r--r-- | parse.y | 267 | ||||
-rw-r--r-- | ruby_parser.c | 11 | ||||
-rw-r--r-- | rubyparser.h | 14 |
10 files changed, 283 insertions, 52 deletions
@@ -567,6 +567,8 @@ node_children(rb_ast_t *ast, const NODE *node) return rb_ary_new_from_args(1, rb_node_rational_literal_val(node)); case NODE_IMAGINARY: return rb_ary_new_from_args(1, rb_node_imaginary_literal_val(node)); + case NODE_REGX: + return rb_ary_new_from_args(1, rb_node_regx_string_val(node)); case NODE_ONCE: return rb_ary_new_from_node_args(ast, 1, RNODE_ONCE(node)->nd_body); case NODE_DSTR: @@ -15931,6 +15931,7 @@ ruby_parser.$(OBJEXT): $(top_srcdir)/internal/fixnum.h ruby_parser.$(OBJEXT): $(top_srcdir)/internal/imemo.h ruby_parser.$(OBJEXT): $(top_srcdir)/internal/numeric.h ruby_parser.$(OBJEXT): $(top_srcdir)/internal/rational.h +ruby_parser.$(OBJEXT): $(top_srcdir)/internal/re.h ruby_parser.$(OBJEXT): $(top_srcdir)/internal/ruby_parser.h ruby_parser.$(OBJEXT): $(top_srcdir)/internal/serial.h ruby_parser.$(OBJEXT): $(top_srcdir)/internal/static_assert.h @@ -1931,6 +1931,9 @@ iseq_set_arguments_keywords(rb_iseq_t *iseq, LINK_ANCHOR *const optargs, case NODE_SYM: dv = rb_node_sym_string_val(val_node); break; + case NODE_REGX: + dv = rb_node_regx_string_val(val_node); + break; case NODE_LINE: dv = rb_node_line_lineno_val(val_node); break; @@ -4499,6 +4502,7 @@ compile_branch_condition(rb_iseq_t *iseq, LINK_ANCHOR *ret, const NODE *cond, case NODE_IMAGINARY: /* NODE_IMAGINARY is always true */ case NODE_TRUE: case NODE_STR: + case NODE_REGX: case NODE_ZLIST: case NODE_LAMBDA: /* printf("useless condition eliminate (%s)\n", ruby_node_name(nd_type(cond))); */ @@ -4702,6 +4706,7 @@ static_literal_node_p(const NODE *node, const rb_iseq_t *iseq, bool hash_key) switch (nd_type(node)) { case NODE_LIT: case NODE_SYM: + case NODE_REGX: case NODE_LINE: case NODE_ENCODING: case NODE_INTEGER: @@ -4740,6 +4745,8 @@ static_literal_value(const NODE *node, rb_iseq_t *iseq) return Qfalse; case NODE_SYM: return rb_node_sym_string_val(node); + case NODE_REGX: + return rb_node_regx_string_val(node); case NODE_LINE: return rb_node_line_lineno_val(node); case NODE_ENCODING: @@ -5785,6 +5792,7 @@ defined_expr0(rb_iseq_t *iseq, LINK_ANCHOR *const ret, case NODE_STR: case NODE_LIT: case NODE_SYM: + case NODE_REGX: case NODE_LINE: case NODE_FILE: case NODE_ENCODING: @@ -7212,6 +7220,7 @@ iseq_compile_pattern_each(rb_iseq_t *iseq, LINK_ANCHOR *const ret, const NODE *c } case NODE_LIT: case NODE_SYM: + case NODE_REGX: case NODE_LINE: case NODE_INTEGER: case NODE_FLOAT: @@ -9637,7 +9646,7 @@ compile_match(rb_iseq_t *iseq, LINK_ANCHOR *const ret, const NODE *const node, i INIT_ANCHOR(val); switch ((int)type) { case NODE_MATCH: - ADD_INSN1(recv, node, putobject, RNODE_MATCH(node)->nd_lit); + ADD_INSN1(recv, node, putobject, rb_node_regx_string_val(node)); ADD_INSN2(val, node, getspecial, INT2FIX(0), INT2FIX(0)); break; @@ -9799,6 +9808,7 @@ compile_kw_arg(rb_iseq_t *iseq, LINK_ANCHOR *const ret, const NODE *const node, } else if (nd_type_p(default_value, NODE_LIT) || nd_type_p(default_value, NODE_SYM) || + nd_type_p(default_value, NODE_REGX) || nd_type_p(default_value, NODE_LINE) || nd_type_p(default_value, NODE_INTEGER) || nd_type_p(default_value, NODE_FLOAT) || @@ -10385,6 +10395,14 @@ iseq_compile_each0(rb_iseq_t *iseq, LINK_ANCHOR *const ret, const NODE *const no case NODE_EVSTR: CHECK(compile_evstr(iseq, ret, RNODE_EVSTR(node)->nd_body, popped)); break; + case NODE_REGX:{ + if (!popped) { + VALUE lit = rb_node_regx_string_val(node); + ADD_INSN1(ret, node, putobject, lit); + RB_OBJ_WRITTEN(iseq, Qundef, lit); + } + break; + } case NODE_DREGX: compile_dregx(iseq, ret, node, popped); break; diff --git a/internal/ruby_parser.h b/internal/ruby_parser.h index 2435c207df..7b4c715268 100644 --- a/internal/ruby_parser.h +++ b/internal/ruby_parser.h @@ -23,6 +23,7 @@ VALUE rb_str_new_parser_string(rb_parser_string_t *str); VALUE rb_node_str_string_val(const NODE *); VALUE rb_node_sym_string_val(const NODE *); VALUE rb_node_dstr_string_val(const NODE *); +VALUE rb_node_regx_string_val(const NODE *); VALUE rb_node_dregx_string_val(const NODE *); VALUE rb_node_line_lineno_val(const NODE *); VALUE rb_node_file_path_val(const NODE *); diff --git a/misc/lldb_rb/utils.py b/misc/lldb_rb/utils.py index b52c3906ba..a321426234 100644 --- a/misc/lldb_rb/utils.py +++ b/misc/lldb_rb/utils.py @@ -379,6 +379,8 @@ class RbInspector(LLDBInterface): self._append_expression("*(struct RNode_DXSTR *) %0#x" % val.GetValueAsUnsigned()) elif nd_type == self.ruby_globals["NODE_EVSTR"]: self._append_expression("*(struct RNode_EVSTR *) %0#x" % val.GetValueAsUnsigned()) + elif nd_type == self.ruby_globals["NODE_REGX"]: + self._append_expression("*(struct RNode_REGX *) %0#x" % val.GetValueAsUnsigned()) elif nd_type == self.ruby_globals["NODE_DREGX"]: self._append_expression("*(struct RNode_DREGX *) %0#x" % val.GetValueAsUnsigned()) elif nd_type == self.ruby_globals["NODE_ONCE"]: @@ -195,6 +195,10 @@ free_ast_value(rb_ast_t *ast, void *ctx, NODE *node) case NODE_SYM: parser_string_free(ast, RNODE_SYM(node)->string); break; + case NODE_REGX: + case NODE_MATCH: + parser_string_free(ast, RNODE_REGX(node)->string); + break; case NODE_DSYM: parser_string_free(ast, RNODE_DSYM(node)->string); break; @@ -268,7 +272,6 @@ static bool nodetype_markable_p(enum node_type type) { switch (type) { - case NODE_MATCH: case NODE_LIT: return true; default: @@ -374,7 +377,6 @@ mark_and_move_ast_value(rb_ast_t *ast, void *ctx, NODE *node) #endif switch (nd_type(node)) { - case NODE_MATCH: case NODE_LIT: rb_gc_mark_and_move(&RNODE_LIT(node)->nd_lit); break; diff --git a/node_dump.c b/node_dump.c index fd6fa20aa4..bf13e6d5d1 100644 --- a/node_dump.c +++ b/node_dump.c @@ -678,7 +678,8 @@ dump_node(VALUE buf, VALUE indent, int comment, const NODE * node) ANN("match expression (against $_ implicitly)"); ANN("format: [nd_lit] (in condition)"); ANN("example: if /foo/; foo; end"); - F_LIT(nd_lit, RNODE_MATCH, "regexp"); + LAST_NODE; + F_VALUE(string, rb_node_regx_string_val(node), "string"); return; case NODE_MATCH2: @@ -750,6 +751,14 @@ dump_node(VALUE buf, VALUE indent, int comment, const NODE * node) F_VALUE(val, rb_node_imaginary_literal_val(node), "val"); return; + case NODE_REGX: + ANN("regexp literal"); + ANN("format: [string]"); + ANN("example: /foo/"); + LAST_NODE; + F_VALUE(string, rb_node_regx_string_val(node), "string"); + return; + case NODE_ONCE: ANN("once evaluation"); ANN("format: [nd_body]"); @@ -88,6 +88,7 @@ hash_literal_key_p(VALUE k) case NODE_IMAGINARY: case NODE_STR: case NODE_SYM: + case NODE_REGX: case NODE_LINE: case NODE_FILE: case NODE_ENCODING: @@ -137,6 +138,13 @@ node_imaginary_cmp(rb_node_imaginary_t *n1, rb_node_imaginary_t *n2) } static int +rb_parser_regx_hash_cmp(rb_node_regx_t *n1, rb_node_regx_t *n2) +{ + return (n1->options != n2->options || + rb_parser_string_hash_cmp(n1->string, n2->string)); +} + +static int node_integer_line_cmp(const NODE *node_i, const NODE *line) { VALUE num = rb_node_integer_literal_val(node_i); @@ -190,6 +198,8 @@ node_cdhash_cmp(VALUE val, VALUE lit) return rb_parser_string_hash_cmp(RNODE_STR(node_val)->string, RNODE_STR(node_lit)->string); case NODE_SYM: return rb_parser_string_hash_cmp(RNODE_SYM(node_val)->string, RNODE_SYM(node_lit)->string); + case NODE_REGX: + return rb_parser_regx_hash_cmp(RNODE_REGX(node_val), RNODE_REGX(node_lit)); case NODE_LINE: return node_val->nd_loc.beg_pos.lineno != node_lit->nd_loc.beg_pos.lineno; case NODE_FILE: @@ -236,6 +246,8 @@ node_cdhash_hash(VALUE a) return rb_parser_str_hash(RNODE_STR(node)->string); case NODE_SYM: return rb_parser_str_hash(RNODE_SYM(node)->string); + case NODE_REGX: + return rb_parser_str_hash(RNODE_REGX(node)->string); case NODE_LINE: /* Same with NODE_INTEGER FIXNUM case */ return (st_index_t)node->nd_loc.beg_pos.lineno; @@ -1211,6 +1223,7 @@ static rb_node_dstr_t *rb_node_dstr_new(struct parser_params *p, rb_parser_strin static rb_node_xstr_t *rb_node_xstr_new(struct parser_params *p, rb_parser_string_t *string, const YYLTYPE *loc); static rb_node_dxstr_t *rb_node_dxstr_new(struct parser_params *p, rb_parser_string_t *string, long nd_alen, NODE *nd_next, const YYLTYPE *loc); static rb_node_evstr_t *rb_node_evstr_new(struct parser_params *p, NODE *nd_body, const YYLTYPE *loc); +static rb_node_regx_t *rb_node_regx_new(struct parser_params *p, rb_parser_string_t *string, int options, const YYLTYPE *loc); static rb_node_once_t *rb_node_once_new(struct parser_params *p, NODE *nd_body, const YYLTYPE *loc); static rb_node_args_t *rb_node_args_new(struct parser_params *p, const YYLTYPE *loc); static rb_node_args_aux_t *rb_node_args_aux_new(struct parser_params *p, ID nd_pid, long nd_plen, const YYLTYPE *loc); @@ -1319,6 +1332,7 @@ static rb_node_error_t *rb_node_error_new(struct parser_params *p, const YYLTYPE #define NEW_XSTR(s,loc) (NODE *)rb_node_xstr_new(p,s,loc) #define NEW_DXSTR(s,l,n,loc) (NODE *)rb_node_dxstr_new(p,s,l,n,loc) #define NEW_EVSTR(n,loc) (NODE *)rb_node_evstr_new(p,n,loc) +#define NEW_REGX(str,opts,loc) (NODE *)rb_node_regx_new(p,str,opts,loc) #define NEW_ONCE(b,loc) (NODE *)rb_node_once_new(p,b,loc) #define NEW_ARGS(loc) rb_node_args_new(p,loc) #define NEW_ARGS_AUX(r,b,loc) rb_node_args_aux_new(p,r,b,loc) @@ -1567,8 +1581,8 @@ static NODE *match_op(struct parser_params*,NODE*,NODE*,const YYLTYPE*,const YYL static rb_ast_id_table_t *local_tbl(struct parser_params*); -static VALUE reg_compile(struct parser_params*, VALUE, int); -static void reg_fragment_setenc(struct parser_params*, VALUE, int); +static VALUE reg_compile(struct parser_params*, rb_parser_string_t*, int); +static void reg_fragment_setenc(struct parser_params*, rb_parser_string_t*, int); #define reg_fragment_check rb_parser_reg_fragment_check int reg_fragment_check(struct parser_params*, rb_parser_string_t*, int); @@ -1592,7 +1606,7 @@ static int id_is_var(struct parser_params *p, ID id); RUBY_SYMBOL_EXPORT_BEGIN VALUE rb_parser_reg_compile(struct parser_params* p, VALUE str, int options); -int rb_reg_fragment_setenc(struct parser_params*, VALUE, int); +int rb_reg_fragment_setenc(struct parser_params*, rb_parser_string_t *, int); enum lex_state_e rb_parser_trace_lex_state(struct parser_params *, enum lex_state_e, enum lex_state_e, int); VALUE rb_parser_lex_state_name(struct parser_params *p, enum lex_state_e state); void rb_parser_show_bitstack(struct parser_params *, stack_type, const char *, int); @@ -1647,6 +1661,9 @@ static void numparam_pop(struct parser_params *p, NODE *prev_inner); #define idFWD_ALL idDot3 #define arg_FWD_BLOCK idFWD_BLOCK +#define RE_ONIG_OPTION_IGNORECASE 1 +#define RE_ONIG_OPTION_EXTEND (RE_ONIG_OPTION_IGNORECASE<<1) +#define RE_ONIG_OPTION_MULTILINE (RE_ONIG_OPTION_EXTEND<<1) #define RE_OPTION_ONCE (1<<16) #define RE_OPTION_ENCODING_SHIFT 8 #define RE_OPTION_ENCODING(e) (((e)&0xff)<<RE_OPTION_ENCODING_SHIFT) @@ -2237,6 +2254,14 @@ rb_parser_str_get_encoding(rb_parser_string_t *str) return str->enc; } +#ifndef RIPPER +static bool +PARSER_ENCODING_IS_ASCII8BIT(struct parser_params *p, rb_parser_string_t *str) +{ + return rb_parser_str_get_encoding(str) == rb_ascii8bit_encoding(); +} +#endif + static int PARSER_ENC_CODERANGE(rb_parser_string_t *str) { @@ -2257,11 +2282,19 @@ PARSER_ENCODING_CODERANGE_SET(rb_parser_string_t *str, rb_encoding *enc, enum rb } static void -PARSER_ENCODING_CODERANGE_CLEAR(rb_parser_string_t *str) +PARSER_ENC_CODERANGE_CLEAR(rb_parser_string_t *str) { str->coderange = RB_PARSER_ENC_CODERANGE_UNKNOWN; } +#ifndef RIPPER +static bool +PARSER_ENC_CODERANGE_ASCIIONLY(rb_parser_string_t *str) +{ + return PARSER_ENC_CODERANGE(str) == RB_PARSER_ENC_CODERANGE_7BIT; +} +#endif + static bool PARSER_ENC_CODERANGE_CLEAN_P(int cr) { @@ -2325,6 +2358,21 @@ rb_parser_enc_str_coderange(struct parser_params *p, rb_parser_string_t *str) return cr; } +#ifndef RIPPER +static rb_parser_string_t * +rb_parser_enc_associate(struct parser_params *p, rb_parser_string_t *str, rb_encoding *enc) +{ + if (rb_parser_str_get_encoding(str) == enc) + return str; + if (!PARSER_ENC_CODERANGE_ASCIIONLY(str) || + !rb_enc_asciicompat(enc)) { + PARSER_ENC_CODERANGE_CLEAR(str); + } + rb_parser_string_set_encoding(str, enc); + return str; +} +#endif + static bool rb_parser_is_ascii_string(struct parser_params *p, rb_parser_string_t *str) { @@ -2394,7 +2442,7 @@ rb_parser_enc_compatible(struct parser_params *p, rb_parser_string_t *str1, rb_p static void rb_parser_str_modify(rb_parser_string_t *str) { - PARSER_ENCODING_CODERANGE_CLEAR(str); + PARSER_ENC_CODERANGE_CLEAR(str); } static void @@ -2557,7 +2605,7 @@ rb_parser_str_resize(struct parser_params *p, rb_parser_string_t *str, long len) long slen = PARSER_STRING_LEN(str); if (slen > len && PARSER_ENC_CODERANGE(str) != RB_PARSER_ENC_CODERANGE_7BIT) { - PARSER_ENCODING_CODERANGE_CLEAR(str); + PARSER_ENC_CODERANGE_CLEAR(str); } { @@ -6828,6 +6876,7 @@ singleton : var_ref case NODE_DSTR: case NODE_XSTR: case NODE_DXSTR: + case NODE_REGX: case NODE_DREGX: case NODE_LIT: case NODE_SYM: @@ -8394,6 +8443,61 @@ tokadd_escape(struct parser_params *p) } static int +char_to_option(int c) +{ + int val; + + switch (c) { + case 'i': + val = RE_ONIG_OPTION_IGNORECASE; + break; + case 'x': + val = RE_ONIG_OPTION_EXTEND; + break; + case 'm': + val = RE_ONIG_OPTION_MULTILINE; + break; + default: + val = 0; + break; + } + return val; +} + +#define ARG_ENCODING_FIXED 16 +#define ARG_ENCODING_NONE 32 +#define ENC_ASCII8BIT 1 +#define ENC_EUC_JP 2 +#define ENC_Windows_31J 3 +#define ENC_UTF8 4 + +static int +char_to_option_kcode(int c, int *option, int *kcode) +{ + *option = 0; + + switch (c) { + case 'n': + *kcode = ENC_ASCII8BIT; + return (*option = ARG_ENCODING_NONE); + case 'e': + *kcode = ENC_EUC_JP; + break; + case 's': + *kcode = ENC_Windows_31J; + break; + case 'u': + *kcode = ENC_UTF8; + break; + default: + *kcode = -1; + return (*option = char_to_option(c)); + } + *option = ARG_ENCODING_FIXED; + return 1; +} + +static int regx_options(struct parser_params *p) { int kcode = 0; @@ -8406,9 +8510,9 @@ regx_options(struct parser_params *p) if (c == 'o') { options |= RE_OPTION_ONCE; } - else if (rb_char_to_option_kcode(c, &opt, &kc)) { + else if (char_to_option_kcode(c, &opt, &kc)) { if (kc >= 0) { - if (kc != rb_ascii8bit_encindex()) kcode = c; + if (kc != ENC_ASCII8BIT) kcode = c; kopt = opt; } else { @@ -12222,6 +12326,16 @@ rb_node_evstr_new(struct parser_params *p, NODE *nd_body, const YYLTYPE *loc) return n; } +static rb_node_regx_t * +rb_node_regx_new(struct parser_params *p, rb_parser_string_t *string, int options, const YYLTYPE *loc) +{ + rb_node_regx_t *n = NODE_NEWNODE(NODE_REGX, rb_node_regx_t, loc); + n->string = string; + n->options = options & RE_OPTION_MASK; + + return n; +} + static rb_node_call_t * rb_node_call_new(struct parser_params *p, NODE *nd_recv, ID nd_mid, NODE *nd_args, const YYLTYPE *loc) { @@ -12848,6 +12962,18 @@ str2dstr(struct parser_params *p, NODE *node) } static NODE * +str2regx(struct parser_params *p, NODE *node, int options) +{ + NODE *new_node = (NODE *)NODE_NEW_INTERNAL(NODE_REGX, rb_node_regx_t); + nd_copy_flag(new_node, node); + RNODE_REGX(new_node)->string = RNODE_STR(node)->string; + RNODE_REGX(new_node)->options = options; + RNODE_STR(node)->string = 0; + + return new_node; +} + +static NODE * evstr2dstr(struct parser_params *p, NODE *node) { if (nd_type_p(node, NODE_EVSTR)) { @@ -12949,9 +13075,9 @@ match_op(struct parser_params *p, NODE *node1, NODE *node2, const YYLTYPE *op_lo return match; } - case NODE_LIT: - if (RB_TYPE_P(RNODE_LIT(n)->nd_lit, T_REGEXP)) { - const VALUE lit = RNODE_LIT(n)->nd_lit; + case NODE_REGX: + { + const VALUE lit = rb_node_regx_string_val(n); NODE *match = NEW_MATCH2(node1, node2, loc); RNODE_MATCH2(match)->nd_args = reg_named_capture_assign(p, lit, loc); nd_set_line(match, line); @@ -12964,9 +13090,6 @@ match_op(struct parser_params *p, NODE *node1, NODE *node2, const YYLTYPE *op_lo NODE *match3; switch (nd_type(n)) { - case NODE_LIT: - if (!RB_TYPE_P(RNODE_LIT(n)->nd_lit, T_REGEXP)) break; - /* fallthru */ case NODE_DREGX: match3 = NEW_MATCH3(node2, node1, loc); return match3; @@ -13210,16 +13333,18 @@ new_regexp(struct parser_params *p, NODE *node, int options, const YYLTYPE *loc) NODE *prev; if (!node) { - node = NEW_LIT(reg_compile(p, STR_NEW0(), options), loc); - RB_OBJ_WRITTEN(p->ast, Qnil, RNODE_LIT(node)->nd_lit); + /* Check string is valid regex */ + rb_parser_string_t *str = STRING_NEW0(); + reg_compile(p, str, options); + node = NEW_REGX(str, options, loc); return node; } switch (nd_type(node)) { case NODE_STR: { - VALUE src = rb_node_str_string_val(node); - node = NEW_LIT(reg_compile(p, src, options), loc); - RB_OBJ_WRITTEN(p->ast, Qnil, RNODE_LIT(node)->nd_lit); + /* Check string is valid regex */ + reg_compile(p, RNODE_STR(node)->string, options); + node = str2regx(p, node, options); } break; default: @@ -13255,9 +13380,8 @@ new_regexp(struct parser_params *p, NODE *node, int options, const YYLTYPE *loc) } } if (!RNODE_DREGX(node)->nd_next) { - VALUE src = rb_node_dregx_string_val(node); /* Check string is valid regex */ - reg_compile(p, src, options); + reg_compile(p, RNODE_DREGX(node)->string, options); } if (options & RE_OPTION_ONCE) { node = NEW_ONCE(node, loc); @@ -13916,6 +14040,8 @@ shareable_literal_value(struct parser_params *p, NODE *node) return rb_node_imaginary_literal_val(node); case NODE_ENCODING: return rb_node_encoding_val(node); + case NODE_REGX: + return rb_node_regx_string_val(node); case NODE_LIT: return RNODE_LIT(node)->nd_lit; default: @@ -13943,6 +14069,7 @@ shareable_literal_constant(struct parser_params *p, enum shareability shareable, case NODE_NIL: case NODE_LIT: case NODE_SYM: + case NODE_REGX: case NODE_LINE: case NODE_INTEGER: case NODE_FLOAT: @@ -14305,6 +14432,7 @@ void_expr(struct parser_params *p, NODE *node) case NODE_IMAGINARY: case NODE_STR: case NODE_DSTR: + case NODE_REGX: case NODE_DREGX: useless = "a literal"; break; @@ -14441,6 +14569,7 @@ is_static_content(NODE *node) } while ((node = RNODE_LIST(node)->nd_next) != 0); case NODE_LIT: case NODE_SYM: + case NODE_REGX: case NODE_LINE: case NODE_FILE: case NODE_ENCODING: @@ -14537,6 +14666,11 @@ cond0(struct parser_params *p, NODE *node, enum cond_type type, const YYLTYPE *l SWITCH_BY_COND_TYPE(type, warn, "string "); break; + case NODE_REGX: + if (!e_option_supplied(p)) SWITCH_BY_COND_TYPE(type, warn, "regex "); + nd_set_type(node, NODE_MATCH); + break; + case NODE_DREGX: if (!e_option_supplied(p)) SWITCH_BY_COND_TYPE(type, warning, "regex "); @@ -14573,12 +14707,8 @@ cond0(struct parser_params *p, NODE *node, enum cond_type type, const YYLTYPE *l break; case NODE_LIT: - if (RB_TYPE_P(RNODE_LIT(node)->nd_lit, T_REGEXP)) { - if (!e_option_supplied(p)) SWITCH_BY_COND_TYPE(type, warn, "regex "); - nd_set_type(node, NODE_MATCH); - } - else if (RNODE_LIT(node)->nd_lit == Qtrue || - RNODE_LIT(node)->nd_lit == Qfalse) { + if (RNODE_LIT(node)->nd_lit == Qtrue || + RNODE_LIT(node)->nd_lit == Qfalse) { /* booleans are OK, e.g., while true */ } else if (SYMBOL_P(RNODE_LIT(node)->nd_lit)) { @@ -14963,6 +15093,7 @@ nd_type_st_key_enable_p(NODE *node) case NODE_IMAGINARY: case NODE_STR: case NODE_SYM: + case NODE_REGX: case NODE_LINE: case NODE_FILE: case NODE_ENCODING: @@ -14984,6 +15115,7 @@ nd_st_key(struct parser_params *p, NODE *node) case NODE_RATIONAL: case NODE_IMAGINARY: case NODE_SYM: + case NODE_REGX: case NODE_LINE: case NODE_ENCODING: case NODE_FILE: @@ -15012,6 +15144,8 @@ nd_value(struct parser_params *p, NODE *node) return rb_node_imaginary_literal_val(node); case NODE_SYM: return rb_node_sym_string_val(node); + case NODE_REGX: + return rb_node_regx_string_val(node); case NODE_LINE: return rb_node_line_lineno_val(node); case NODE_ENCODING: @@ -15634,43 +15768,83 @@ dvar_curr(struct parser_params *p, ID id) } static void -reg_fragment_enc_error(struct parser_params* p, VALUE str, int c) +reg_fragment_enc_error(struct parser_params* p, rb_parser_string_t *str, int c) { compile_error(p, "regexp encoding option '%c' differs from source encoding '%s'", - c, rb_enc_name(rb_enc_get(str))); + c, rb_enc_name(rb_parser_str_get_encoding(str))); } #ifndef RIPPER +static rb_encoding * +find_enc(struct parser_params* p, const char *name) +{ + int idx = rb_enc_find_index(name); + if (idx < 0) { + rb_bug("unknown encoding name: %s", name); + } + + return rb_enc_from_index(idx); +} + +static rb_encoding * +kcode_to_enc(struct parser_params* p, int kcode) +{ + rb_encoding *enc; + + switch (kcode) { + case ENC_ASCII8BIT: + enc = rb_ascii8bit_encoding(); + break; + case ENC_EUC_JP: + enc = find_enc(p, "EUC-JP"); + break; + case ENC_Windows_31J: + enc = find_enc(p, "Windows-31J"); + break; + case ENC_UTF8: + enc = rb_utf8_encoding(); + break; + default: + enc = NULL; + break; + } + + return enc; +} + int -rb_reg_fragment_setenc(struct parser_params* p, VALUE str, int options) +rb_reg_fragment_setenc(struct parser_params* p, rb_parser_string_t *str, int options) { int c = RE_OPTION_ENCODING_IDX(options); if (c) { int opt, idx; - rb_char_to_option_kcode(c, &opt, &idx); - if (idx != ENCODING_GET(str) && - !is_ascii_string(str)) { + rb_encoding *enc; + + char_to_option_kcode(c, &opt, &idx); + enc = kcode_to_enc(p, idx); + if (enc != rb_parser_str_get_encoding(str) && + !rb_parser_is_ascii_string(p, str)) { goto error; } - ENCODING_SET(str, idx); + rb_parser_string_set_encoding(str, enc); } else if (RE_OPTION_ENCODING_NONE(options)) { - if (!ENCODING_IS_ASCII8BIT(str) && - !is_ascii_string(str)) { + if (!PARSER_ENCODING_IS_ASCII8BIT(p, str) && + !rb_parser_is_ascii_string(p, str)) { c = 'n'; goto error; } - rb_enc_associate(str, rb_ascii8bit_encoding()); + rb_parser_enc_associate(p, str, rb_ascii8bit_encoding()); } else if (rb_is_usascii_enc(p->enc)) { - if (!is_ascii_string(str)) { + if (!rb_parser_is_ascii_string(p, str)) { /* raise in re.c */ - rb_enc_associate(str, rb_usascii_encoding()); + rb_parser_enc_associate(p, str, rb_usascii_encoding()); } else { - rb_enc_associate(str, rb_ascii8bit_encoding()); + rb_parser_enc_associate(p, str, rb_ascii8bit_encoding()); } } return 0; @@ -15681,7 +15855,7 @@ rb_reg_fragment_setenc(struct parser_params* p, VALUE str, int options) #endif static void -reg_fragment_setenc(struct parser_params* p, VALUE str, int options) +reg_fragment_setenc(struct parser_params* p, rb_parser_string_t *str, int options) { int c = rb_reg_fragment_setenc(p, str, options); if (c) reg_fragment_enc_error(p, str, c); @@ -15692,10 +15866,9 @@ int reg_fragment_check(struct parser_params* p, rb_parser_string_t *str, int options) { VALUE err, str2; + reg_fragment_setenc(p, str, options); /* TODO */ str2 = rb_str_new_parser_string(str); - reg_fragment_setenc(p, str2, options); - str->enc = rb_enc_get(str2); err = rb_reg_check_preprocess(str2); if (err != Qnil) { err = rb_obj_as_string(err); @@ -15769,10 +15942,12 @@ rb_reg_named_capture_assign_iter_impl(struct parser_params *p, const char *s, lo #endif static VALUE -parser_reg_compile(struct parser_params* p, VALUE str, int options) +parser_reg_compile(struct parser_params* p, rb_parser_string_t *str, int options) { + VALUE str2; reg_fragment_setenc(p, str, options); - return rb_parser_reg_compile(p, str, options); + str2 = rb_str_new_parser_string(str); + return rb_parser_reg_compile(p, str2, options); } #ifndef RIPPER @@ -15784,7 +15959,7 @@ rb_parser_reg_compile(struct parser_params* p, VALUE str, int options) #endif static VALUE -reg_compile(struct parser_params* p, VALUE str, int options) +reg_compile(struct parser_params* p, rb_parser_string_t *str, int options) { VALUE re; VALUE err; diff --git a/ruby_parser.c b/ruby_parser.c index c4e37e3353..c233b9801b 100644 --- a/ruby_parser.c +++ b/ruby_parser.c @@ -1,5 +1,6 @@ /* This is a wrapper for parse.y */ +#include "internal/re.h" #include "internal/ruby_parser.h" #include "node.h" @@ -1011,6 +1012,16 @@ rb_node_dregx_string_val(const NODE *node) } VALUE +rb_node_regx_string_val(const NODE *node) +{ + rb_node_regx_t *node_reg = RNODE_REGX(node); + rb_parser_string_t *string = node_reg->string; + VALUE str = rb_enc_str_new(string->ptr, string->len, string->enc); + + return rb_reg_compile(str, node_reg->options, NULL, 0); +} + +VALUE rb_node_line_lineno_val(const NODE *node) { return INT2FIX(node->nd_loc.beg_pos.lineno); diff --git a/rubyparser.h b/rubyparser.h index f3bd76d2eb..3fcfd32c9c 100644 --- a/rubyparser.h +++ b/rubyparser.h @@ -127,6 +127,7 @@ enum node_type { NODE_XSTR, NODE_DXSTR, NODE_EVSTR, + NODE_REGX, NODE_DREGX, NODE_ONCE, NODE_ARGS, @@ -612,11 +613,12 @@ typedef struct RNode_BACK_REF { long nd_nth; } rb_node_back_ref_t; -/* RNode_MATCH and RNode_LIT should be same structure */ +/* RNode_MATCH and RNode_REGX should be same structure */ typedef struct RNode_MATCH { NODE node; - VALUE nd_lit; + struct rb_parser_string *string; + int options; } rb_node_match_t; typedef struct RNode_MATCH2 { @@ -719,6 +721,13 @@ typedef struct RNode_EVSTR { struct RNode *nd_body; } rb_node_evstr_t; +typedef struct RNode_REGX { + NODE node; + + struct rb_parser_string *string; + int options; +} rb_node_regx_t; + typedef struct RNode_DREGX { NODE node; @@ -1093,6 +1102,7 @@ typedef struct RNode_ERROR { #define RNODE_XSTR(node) ((struct RNode_XSTR *)(node)) #define RNODE_DXSTR(node) ((struct RNode_DXSTR *)(node)) #define RNODE_EVSTR(node) ((struct RNode_EVSTR *)(node)) +#define RNODE_REGX(node) ((struct RNode_REGX *)(node)) #define RNODE_DREGX(node) ((struct RNode_DREGX *)(node)) #define RNODE_ONCE(node) ((struct RNode_ONCE *)(node)) #define RNODE_ARGS(node) ((struct RNode_ARGS *)(node)) |