C++: Support for UTF-8 in the lexer

This will save us toLatin1() conversions in CppTools (which already holds UTF-8 encoded QByteArrays) and thus loss of information (see QTCREATORBUG-7356). It also gives us support for non-latin1 identifiers. API-wise the following functions are added to Token. In follow-up patches these will come in handy in combination with QStrings. utf16chars() - equivalent of bytes() utf16charsBegin() - equivalent of bytesBegin() utf16charsEnd() - equivalent of bytesEnd() Next steps: * Adapt functions from TranslationUnit. They should work with utf16 chars in order to calculate lines and columns correctly also for UTF-8 multi-byte code points. * Adapt the higher level clients: * Cpp{Tools,Editor} should expect UTF-8 encoded Literals. * Cpp{Tools,Editor}: When dealing with identifiers on the QString/QTextDocument layer, code points represented by two QChars need to be respected, too. * Ensure Macro::offsets() and Document::MacroUse::{begin,end}() report offsets usable in CppEditor/CppTools. Addresses QTCREATORBUG-7356. Change-Id: I0791b5236be8215d24fb8e38a1f7cb0d279454c0 Reviewed-by: Erik Verbruggen <[email protected]>
author: Nikolai Kosjar <[email protected]> 2014-02-25 13:44:11 -0300
committer: Nikolai Kosjar <[email protected]> 2014-05-23 14:23:15 +0200
commit: 70122b3061ee3fbb07442beb0158edf849ceb98e (patch)
tree: e8c272ec1df948acd27378a44764dd683ab5b426 /src
parent: 4fefb1ca2a5270752acf00d586393f472fb1b9a3 (diff)
6 files changed, 59 insertions, 19 deletions
diff --git a/src/libs/3rdparty/cplusplus/Lexer.cpp b/src/libs/3rdparty/cplusplus/Lexer.cpp
index f2729fa5319..914b3c22753 100644
--- a/src/libs/3rdparty/cplusplus/Lexer.cpp
+++ b/src/libs/3rdparty/cplusplus/Lexer.cpp
@@ -29,6 +29,13 @@
 
 using namespace CPlusPlus;
 
+/*!
+    \class Lexer
+    \brief The Lexer generates tokens from a UTF-8 encoded source text.
+
+    \sa Token
+*/
+
 Lexer::Lexer(TranslationUnit *unit)
     : _translationUnit(unit),
       _control(unit->control()),
@@ -63,6 +70,7 @@ void Lexer::setSource(const char *firstChar, const char *lastChar)
     _firstChar = firstChar;
     _lastChar = lastChar;
     _currentChar = _firstChar - 1;
+    _currentCharUtf16 = -1;
     _tokenStart = _currentChar;
     _yychar = '\n';
 }
@@ -109,6 +117,7 @@ void Lexer::scan(Token *tok)
     tok->reset();
     scan_helper(tok);
     tok->f.bytes = _currentChar - _tokenStart;
+    tok->f.utf16chars = _currentCharUtf16 - _tokenStartUtf16;
 }
 
 void Lexer::scan_helper(Token *tok)
@@ -143,6 +152,9 @@ void Lexer::scan_helper(Token *tok)
     _tokenStart = _currentChar;
     tok->byteOffset = _currentChar - _firstChar;
 
+    _tokenStartUtf16 = _currentCharUtf16;
+    tok->utf16charOffset = _currentCharUtf16;
+
     if (_yychar) {
         s._newlineExpected = false;
     } else if (s._tokenKind) {
@@ -621,8 +633,8 @@ void Lexer::scan_helper(Token *tok)
             } else {
                 scanIdentifier(tok);
             }
-        } else if (std::isalpha(ch) || ch == '_' || ch == '$') {
-            scanIdentifier(tok);
+        } else if (std::isalpha(ch) || ch == '_' || ch == '$' || isByteOfMultiByteCodePoint(ch)) {
+            scanIdentifier(tok, _currentChar - _tokenStart - 1);
         } else if (std::isdigit(ch)) {
             scanNumericLiteral(tok);
         } else {
@@ -776,8 +788,10 @@ void Lexer::scanNumericLiteral(Token *tok)
 void Lexer::scanIdentifier(Token *tok, unsigned extraProcessedChars)
 {
     const char *yytext = _currentChar - 1 - extraProcessedChars;
-    while (std::isalnum(_yychar) || _yychar == '_' || _yychar == '$')
+    while (std::isalnum(_yychar) || _yychar == '_' || _yychar == '$'
+            || isByteOfMultiByteCodePoint(_yychar)) {
         yyinp();
+    }
     int yylen = _currentChar - yytext;
     if (f._scanKeywords)
         tok->f.kind = classify(yytext, yylen, _languageFeatures);
diff --git a/src/libs/3rdparty/cplusplus/Lexer.h b/src/libs/3rdparty/cplusplus/Lexer.h
index 43a877e7a84..8d63d2ba1db 100644
--- a/src/libs/3rdparty/cplusplus/Lexer.h
+++ b/src/libs/3rdparty/cplusplus/Lexer.h
@@ -62,6 +62,7 @@ public:
     void setLanguageFeatures(LanguageFeatures features) { _languageFeatures = features; }
 
 private:
+    void pushLineStartOffset();
     void scan_helper(Token *tok);
     void setSource(const char *firstChar, const char *lastChar);
     static int classify(const char *string, int length, LanguageFeatures features);
@@ -77,15 +78,32 @@ private:
     void scanBackslash(Kind type);
     void scanCppComment(Kind type);
 
-    inline void yyinp()
+    static bool isByteOfMultiByteCodePoint(unsigned char byte)
+    { return byte & 0x80; } // Check if most significant bit is set
+
+    void yyinp()
     {
-        _yychar = *++_currentChar;
+        ++_currentCharUtf16;
+
+        // Process multi-byte UTF-8 code point (non-latin1)
+        if (CPLUSPLUS_UNLIKELY(isByteOfMultiByteCodePoint(_yychar))) {
+            unsigned trailingBytesCurrentCodePoint = 1;
+            for (unsigned char c = _yychar << 2; isByteOfMultiByteCodePoint(c); c <<= 1)
+                ++trailingBytesCurrentCodePoint;
+            // Code points >= 0x00010000 are represented by two UTF16 code units
+            if (trailingBytesCurrentCodePoint >= 3)
+                ++_currentCharUtf16;
+            _yychar = *(_currentChar += trailingBytesCurrentCodePoint + 1);
+
+        // Process single-byte UTF-8 code point (latin1)
+        } else {
+            _yychar = *++_currentChar;
+        }
+
         if (CPLUSPLUS_UNLIKELY(_yychar == '\n'))
             pushLineStartOffset();
     }
 
-    void pushLineStartOffset();
-
 private:
     struct Flags {
         unsigned _scanCommentTokens: 1;
@@ -105,6 +123,10 @@ private:
     const char *_lastChar;
     const char *_tokenStart;
     unsigned char _yychar;
+
+    unsigned _currentCharUtf16;
+    unsigned _tokenStartUtf16;
+
     union {
         unsigned char _state;
         State s;
@@ -113,6 +135,7 @@ private:
         unsigned _flags;
         Flags f;
     };
+
     unsigned _currentLine;
     LanguageFeatures _languageFeatures;
 };
diff --git a/src/libs/3rdparty/cplusplus/Token.cpp b/src/libs/3rdparty/cplusplus/Token.cpp
index 57e36c3ea5c..8be67571318 100644
--- a/src/libs/3rdparty/cplusplus/Token.cpp
+++ b/src/libs/3rdparty/cplusplus/Token.cpp
@@ -85,6 +85,7 @@ void Token::reset()
 {
     flags = 0;
     byteOffset = 0;
+    utf16charOffset = 0;
     ptr = 0;
 }
 
diff --git a/src/libs/3rdparty/cplusplus/Token.h b/src/libs/3rdparty/cplusplus/Token.h
index 02d7f5ebe9a..ec104838520 100644
--- a/src/libs/3rdparty/cplusplus/Token.h
+++ b/src/libs/3rdparty/cplusplus/Token.h
@@ -285,7 +285,7 @@ enum Kind {
 class CPLUSPLUS_EXPORT Token
 {
 public:
-    Token() : flags(0), byteOffset(0), ptr(0) {}
+    Token() : flags(0), byteOffset(0), utf16charOffset(0), ptr(0) {}
 
     inline bool is(unsigned k) const    { return f.kind == k; }
     inline bool isNot(unsigned k) const { return f.kind != k; }
@@ -298,13 +298,14 @@ public:
     inline bool joined() const { return f.joined; }
     inline bool expanded() const { return f.expanded; }
     inline bool generated() const { return f.generated; }
-    inline unsigned bytes() const { return f.bytes; }
 
-    inline unsigned bytesBegin() const
-    { return byteOffset; }
+    inline unsigned bytes() const { return f.bytes; }
+    inline unsigned bytesBegin() const { return byteOffset; }
+    inline unsigned bytesEnd() const { return byteOffset + f.bytes; }
 
-    inline unsigned bytesEnd() const
-    { return byteOffset + f.bytes; }
+    inline unsigned utf16chars() const { return f.utf16chars; }
+    inline unsigned utf16charsBegin() const { return utf16charOffset; }
+    inline unsigned utf16charsEnd() const { return utf16charOffset + f.utf16chars; }
 
     inline bool isLiteral() const
     { return f.kind >= T_FIRST_LITERAL && f.kind <= T_LAST_LITERAL; }
@@ -354,15 +355,17 @@ public:
         unsigned generated     : 1;
         // Unused...
         unsigned pad           : 3;
-        // The token length in bytes.
+        // The token length in bytes and UTF16 chars.
         unsigned bytes         : 16;
+        unsigned utf16chars    : 16;
     };
     union {
-        unsigned flags;
+        unsigned long flags;
         Flags f;
     };
 
     unsigned byteOffset;
+    unsigned utf16charOffset;
 
     union {
         void *ptr;
@@ -393,5 +396,4 @@ struct LanguageFeatures
 
 } // namespace CPlusPlus
 
-
 #endif // CPLUSPLUS_TOKEN_H
diff --git a/src/libs/cplusplus/SimpleLexer.cpp b/src/libs/cplusplus/SimpleLexer.cpp
index 8e539acb84a..95c6c051a59 100644
--- a/src/libs/cplusplus/SimpleLexer.cpp
+++ b/src/libs/cplusplus/SimpleLexer.cpp
@@ -61,11 +61,11 @@ bool SimpleLexer::endedJoined() const
     return _endedJoined;
 }
 
-QList<Token> SimpleLexer::operator()(const QString &text, int state)
+QList<Token> SimpleLexer::operator()(const QString &text, int state, bool convertToUtf8)
 {
     QList<Token> tokens;
 
-    const QByteArray bytes = text.toLatin1();
+    const QByteArray bytes = convertToUtf8 ? text.toUtf8() : text.toLatin1();
     const char *firstChar = bytes.constData();
     const char *lastChar = firstChar + bytes.size();
 
diff --git a/src/libs/cplusplus/SimpleLexer.h b/src/libs/cplusplus/SimpleLexer.h
index 1eb4ab6c3bc..a5b7d3e4ac0 100644
--- a/src/libs/cplusplus/SimpleLexer.h
+++ b/src/libs/cplusplus/SimpleLexer.h
@@ -54,7 +54,7 @@ public:
 
     bool endedJoined() const;
 
-    QList<Token> operator()(const QString &text, int state = 0);
+    QList<Token> operator()(const QString &text, int state = 0, bool convertToUtf8 = false);
 
     int state() const
     { return _lastState; }
author	Nikolai Kosjar <[email protected]>	2014-02-25 13:44:11 -0300
committer	Nikolai Kosjar <[email protected]>	2014-05-23 14:23:15 +0200
commit	70122b3061ee3fbb07442beb0158edf849ceb98e (patch)
tree	e8c272ec1df948acd27378a44764dd683ab5b426 /src
parent	4fefb1ca2a5270752acf00d586393f472fb1b9a3 (diff)