diff options
| author | Nikolai Kosjar <[email protected]> | 2014-02-25 13:44:11 -0300 |
|---|---|---|
| committer | Nikolai Kosjar <[email protected]> | 2014-05-23 14:23:15 +0200 |
| commit | 70122b3061ee3fbb07442beb0158edf849ceb98e (patch) | |
| tree | e8c272ec1df948acd27378a44764dd683ab5b426 /src | |
| parent | 4fefb1ca2a5270752acf00d586393f472fb1b9a3 (diff) | |
C++: Support for UTF-8 in the lexer
This will save us toLatin1() conversions in CppTools (which already
holds UTF-8 encoded QByteArrays) and thus loss of information (see
QTCREATORBUG-7356). It also gives us support for non-latin1 identifiers.
API-wise the following functions are added to Token. In follow-up
patches these will become handy in combination with QStrings.
utf16chars() - equivalent of bytes()
utf16charsBegin() - equivalent of bytesBegin()
utf16charsEnd() - equivalent of bytesEnd()
Next steps:
* Adapt functions from TranslationUnit. They should work with utf16
chars in order to calculate lines and columns correctly also for
UTF-8 multi-byte code points.
* Adapt the higher level clients:
* Cpp{Tools,Editor} should expect UTF-8 encoded Literals.
* Cpp{Tools,Editor}: When dealing with identifiers on the
QString/QTextDocument layer, code points
represented by two QChars need to be respected, too.
* Ensure Macro::offsets() and Document::MacroUse::{begin,end}() report
offsets usable in CppEditor/CppTools.
Addresses QTCREATORBUG-7356.
Change-Id: I0791b5236be8215d24fb8e38a1f7cb0d279454c0
Reviewed-by: Erik Verbruggen <[email protected]>
Diffstat (limited to 'src')
| -rw-r--r-- | src/libs/3rdparty/cplusplus/Lexer.cpp | 20 | ||||
| -rw-r--r-- | src/libs/3rdparty/cplusplus/Lexer.h | 31 | ||||
| -rw-r--r-- | src/libs/3rdparty/cplusplus/Token.cpp | 1 | ||||
| -rw-r--r-- | src/libs/3rdparty/cplusplus/Token.h | 20 | ||||
| -rw-r--r-- | src/libs/cplusplus/SimpleLexer.cpp | 4 | ||||
| -rw-r--r-- | src/libs/cplusplus/SimpleLexer.h | 2 |
6 files changed, 59 insertions, 19 deletions
diff --git a/src/libs/3rdparty/cplusplus/Lexer.cpp b/src/libs/3rdparty/cplusplus/Lexer.cpp index f2729fa5319..914b3c22753 100644 --- a/src/libs/3rdparty/cplusplus/Lexer.cpp +++ b/src/libs/3rdparty/cplusplus/Lexer.cpp @@ -29,6 +29,13 @@ using namespace CPlusPlus; +/*! + \class Lexer + \brief The Lexer generates tokens from an UTF-8 encoded source text. + + \sa Token +*/ + Lexer::Lexer(TranslationUnit *unit) : _translationUnit(unit), _control(unit->control()), @@ -63,6 +70,7 @@ void Lexer::setSource(const char *firstChar, const char *lastChar) _firstChar = firstChar; _lastChar = lastChar; _currentChar = _firstChar - 1; + _currentCharUtf16 = -1; _tokenStart = _currentChar; _yychar = '\n'; } @@ -109,6 +117,7 @@ void Lexer::scan(Token *tok) tok->reset(); scan_helper(tok); tok->f.bytes = _currentChar - _tokenStart; + tok->f.utf16chars = _currentCharUtf16 - _tokenStartUtf16; } void Lexer::scan_helper(Token *tok) @@ -143,6 +152,9 @@ void Lexer::scan_helper(Token *tok) _tokenStart = _currentChar; tok->byteOffset = _currentChar - _firstChar; + _tokenStartUtf16 = _currentCharUtf16; + tok->utf16charOffset = _currentCharUtf16; + if (_yychar) { s._newlineExpected = false; } else if (s._tokenKind) { @@ -621,8 +633,8 @@ void Lexer::scan_helper(Token *tok) } else { scanIdentifier(tok); } - } else if (std::isalpha(ch) || ch == '_' || ch == '$') { - scanIdentifier(tok); + } else if (std::isalpha(ch) || ch == '_' || ch == '$' || isByteOfMultiByteCodePoint(ch)) { + scanIdentifier(tok, _currentChar - _tokenStart - 1); } else if (std::isdigit(ch)) { scanNumericLiteral(tok); } else { @@ -776,8 +788,10 @@ void Lexer::scanNumericLiteral(Token *tok) void Lexer::scanIdentifier(Token *tok, unsigned extraProcessedChars) { const char *yytext = _currentChar - 1 - extraProcessedChars; - while (std::isalnum(_yychar) || _yychar == '_' || _yychar == '$') + while (std::isalnum(_yychar) || _yychar == '_' || _yychar == '$' + || isByteOfMultiByteCodePoint(_yychar)) { yyinp(); + } int yylen = _currentChar 
- yytext; if (f._scanKeywords) tok->f.kind = classify(yytext, yylen, _languageFeatures); diff --git a/src/libs/3rdparty/cplusplus/Lexer.h b/src/libs/3rdparty/cplusplus/Lexer.h index 43a877e7a84..8d63d2ba1db 100644 --- a/src/libs/3rdparty/cplusplus/Lexer.h +++ b/src/libs/3rdparty/cplusplus/Lexer.h @@ -62,6 +62,7 @@ public: void setLanguageFeatures(LanguageFeatures features) { _languageFeatures = features; } private: + void pushLineStartOffset(); void scan_helper(Token *tok); void setSource(const char *firstChar, const char *lastChar); static int classify(const char *string, int length, LanguageFeatures features); @@ -77,15 +78,32 @@ private: void scanBackslash(Kind type); void scanCppComment(Kind type); - inline void yyinp() + static bool isByteOfMultiByteCodePoint(unsigned char byte) + { return byte & 0x80; } // Check if most significant bit is set + + void yyinp() { - _yychar = *++_currentChar; + ++_currentCharUtf16; + + // Process multi-byte UTF-8 code point (non-latin1) + if (CPLUSPLUS_UNLIKELY(isByteOfMultiByteCodePoint(_yychar))) { + unsigned trailingBytesCurrentCodePoint = 1; + for (unsigned char c = _yychar << 2; isByteOfMultiByteCodePoint(c); c <<= 1) + ++trailingBytesCurrentCodePoint; + // Code points >= 0x00010000 are represented by two UTF16 code units + if (trailingBytesCurrentCodePoint >= 3) + ++_currentCharUtf16; + _yychar = *(_currentChar += trailingBytesCurrentCodePoint + 1); + + // Process single-byte UTF-8 code point (latin1) + } else { + _yychar = *++_currentChar; + } + if (CPLUSPLUS_UNLIKELY(_yychar == '\n')) pushLineStartOffset(); } - void pushLineStartOffset(); - private: struct Flags { unsigned _scanCommentTokens: 1; @@ -105,6 +123,10 @@ private: const char *_lastChar; const char *_tokenStart; unsigned char _yychar; + + unsigned _currentCharUtf16; + unsigned _tokenStartUtf16; + union { unsigned char _state; State s; @@ -113,6 +135,7 @@ private: unsigned _flags; Flags f; }; + unsigned _currentLine; LanguageFeatures _languageFeatures; }; diff 
--git a/src/libs/3rdparty/cplusplus/Token.cpp b/src/libs/3rdparty/cplusplus/Token.cpp index 57e36c3ea5c..8be67571318 100644 --- a/src/libs/3rdparty/cplusplus/Token.cpp +++ b/src/libs/3rdparty/cplusplus/Token.cpp @@ -85,6 +85,7 @@ void Token::reset() { flags = 0; byteOffset = 0; + utf16charOffset = 0; ptr = 0; } diff --git a/src/libs/3rdparty/cplusplus/Token.h b/src/libs/3rdparty/cplusplus/Token.h index 02d7f5ebe9a..ec104838520 100644 --- a/src/libs/3rdparty/cplusplus/Token.h +++ b/src/libs/3rdparty/cplusplus/Token.h @@ -285,7 +285,7 @@ enum Kind { class CPLUSPLUS_EXPORT Token { public: - Token() : flags(0), byteOffset(0), ptr(0) {} + Token() : flags(0), byteOffset(0), utf16charOffset(0), ptr(0) {} inline bool is(unsigned k) const { return f.kind == k; } inline bool isNot(unsigned k) const { return f.kind != k; } @@ -298,13 +298,14 @@ public: inline bool joined() const { return f.joined; } inline bool expanded() const { return f.expanded; } inline bool generated() const { return f.generated; } - inline unsigned bytes() const { return f.bytes; } - inline unsigned bytesBegin() const - { return byteOffset; } + inline unsigned bytes() const { return f.bytes; } + inline unsigned bytesBegin() const { return byteOffset; } + inline unsigned bytesEnd() const { return byteOffset + f.bytes; } - inline unsigned bytesEnd() const - { return byteOffset + f.bytes; } + inline unsigned utf16chars() const { return f.utf16chars; } + inline unsigned utf16charsBegin() const { return utf16charOffset; } + inline unsigned utf16charsEnd() const { return utf16charOffset + f.utf16chars; } inline bool isLiteral() const { return f.kind >= T_FIRST_LITERAL && f.kind <= T_LAST_LITERAL; } @@ -354,15 +355,17 @@ public: unsigned generated : 1; // Unused... unsigned pad : 3; - // The token length in bytes. + // The token length in bytes and UTF16 chars. 
unsigned bytes : 16; + unsigned utf16chars : 16; }; union { - unsigned flags; + unsigned long flags; Flags f; }; unsigned byteOffset; + unsigned utf16charOffset; union { void *ptr; @@ -393,5 +396,4 @@ struct LanguageFeatures } // namespace CPlusPlus - #endif // CPLUSPLUS_TOKEN_H diff --git a/src/libs/cplusplus/SimpleLexer.cpp b/src/libs/cplusplus/SimpleLexer.cpp index 8e539acb84a..95c6c051a59 100644 --- a/src/libs/cplusplus/SimpleLexer.cpp +++ b/src/libs/cplusplus/SimpleLexer.cpp @@ -61,11 +61,11 @@ bool SimpleLexer::endedJoined() const return _endedJoined; } -QList<Token> SimpleLexer::operator()(const QString &text, int state) +QList<Token> SimpleLexer::operator()(const QString &text, int state, bool convertToUtf8) { QList<Token> tokens; - const QByteArray bytes = text.toLatin1(); + const QByteArray bytes = convertToUtf8 ? text.toUtf8() : text.toLatin1(); const char *firstChar = bytes.constData(); const char *lastChar = firstChar + bytes.size(); diff --git a/src/libs/cplusplus/SimpleLexer.h b/src/libs/cplusplus/SimpleLexer.h index 1eb4ab6c3bc..a5b7d3e4ac0 100644 --- a/src/libs/cplusplus/SimpleLexer.h +++ b/src/libs/cplusplus/SimpleLexer.h @@ -54,7 +54,7 @@ public: bool endedJoined() const; - QList<Token> operator()(const QString &text, int state = 0); + QList<Token> operator()(const QString &text, int state = 0, bool convertToUtf8 = false); int state() const { return _lastState; } |
