[ruby/prism] Faster lex_identifier

https://github.com/ruby/prism/commit/e44a9ae742
author: Haldun Bayhantopcu <[email protected]> 2023-10-30 15:47:46 +0100
committer: git <[email protected]> 2023-10-30 16:19:54 +0000
commit: 3a21da9591d3325d8a14dcbac3ad6aeaadebef81 (patch)
tree: 52e9062060779eeef3479b1acb2803767ee33af9
parent: 2ab247d217c7bc312c3bcbb74636a60328f64109 (diff)
3 files changed, 16 insertions, 7 deletions
diff --git a/prism/enc/pm_encoding.h b/prism/enc/pm_encoding.h
index 5236a0b3c4..232bc97dd4 100644
--- a/prism/enc/pm_encoding.h
+++ b/prism/enc/pm_encoding.h
@@ -57,6 +57,7 @@ bool pm_encoding_ascii_isupper_char(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptr
 // the parser so they need to be internally visible.
 size_t pm_encoding_utf_8_alpha_char(const uint8_t *b, ptrdiff_t n);
 size_t pm_encoding_utf_8_alnum_char(const uint8_t *b, ptrdiff_t n);
+bool pm_encoding_utf_8_isupper_char(const uint8_t *b, ptrdiff_t n);
 
 // This lookup table is referenced in both the UTF-8 encoding file and the
 // parser directly in order to speed up the default encoding processing.
diff --git a/prism/enc/pm_unicode.c b/prism/enc/pm_unicode.c
index ab10044424..ee776fa2ad 100644
--- a/prism/enc/pm_unicode.c
+++ b/prism/enc/pm_unicode.c
@@ -2285,7 +2285,7 @@ pm_encoding_utf_8_alnum_char(const uint8_t *b, ptrdiff_t n) {
     }
 }
 
-static bool
+bool
 pm_encoding_utf_8_isupper_char(const uint8_t *b, ptrdiff_t n) {
     if (*b < 0x80) {
         return (pm_encoding_unicode_table[*b] & PRISM_ENCODING_UPPERCASE_BIT) ? true : false;
diff --git a/prism/prism.c b/prism/prism.c
index 83e75b4b69..1809587eb0 100644
--- a/prism/prism.c
+++ b/prism/prism.c
@@ -6044,16 +6044,21 @@ static pm_token_type_t
 lex_identifier(pm_parser_t *parser, bool previous_command_start) {
     // Lex as far as we can into the current identifier.
     size_t width;
-    while (parser->current.end < parser->end && (width = char_is_identifier(parser, parser->current.end)) > 0) {
-        parser->current.end += width;
+    const uint8_t *end = parser->end;
+    const uint8_t *current_start = parser->current.start;
+    const uint8_t *current_end = parser->current.end;
+
+    while (current_end < end && (width = char_is_identifier(parser, current_end)) > 0) {
+        current_end += width;
     }
+    parser->current.end = current_end;
 
     // Now cache the length of the identifier so that we can quickly compare it
     // against known keywords.
-    width = (size_t) (parser->current.end - parser->current.start);
+    width = (size_t) (current_end - current_start);
 
-    if (parser->current.end < parser->end) {
-        if (((parser->current.end + 1 >= parser->end) || (parser->current.end[1] != '=')) && (match(parser, '!') || match(parser, '?'))) {
+    if (current_end < end) {
+        if (((current_end + 1 >= end) || (current_end[1] != '=')) && (match(parser, '!') || match(parser, '?'))) {
             // First we'll attempt to extend the identifier by a ! or ?. Then we'll
             // check if we're returning the defined? keyword or just an identifier.
             width++;
@@ -6163,7 +6168,10 @@ lex_identifier(pm_parser_t *parser, bool previous_command_start) {
         }
     }
 
-    return parser->encoding.isupper_char(parser->current.start, parser->end - parser->current.start) ? PM_TOKEN_CONSTANT : PM_TOKEN_IDENTIFIER;
+    if (parser->encoding_changed) {
+        return parser->encoding.isupper_char(current_start, end - current_start) ? PM_TOKEN_CONSTANT : PM_TOKEN_IDENTIFIER;
+    }
+    return pm_encoding_utf_8_isupper_char(current_start, end - current_start) ? PM_TOKEN_CONSTANT : PM_TOKEN_IDENTIFIER;
 }
 
 // Returns true if the current token that the parser is considering is at the
author	Haldun Bayhantopcu <[email protected]>	2023-10-30 15:47:46 +0100
committer	git <[email protected]>	2023-10-30 16:19:54 +0000
commit	3a21da9591d3325d8a14dcbac3ad6aeaadebef81 (patch)
tree	52e9062060779eeef3479b1acb2803767ee33af9
parent	2ab247d217c7bc312c3bcbb74636a60328f64109 (diff)