From 70122b3061ee3fbb07442beb0158edf849ceb98e Mon Sep 17 00:00:00 2001 From: Nikolai Kosjar Date: Tue, 25 Feb 2014 13:44:11 -0300 Subject: C++: Support for UTF-8 in the lexer This will save us toLatin1() conversions in CppTools (which already holds UTF-8 encoded QByteArrays) and thus loss of information (see QTCREATORBUG-7356). It also gives us support for non-latin1 identifiers. API-wise the following functions are added to Token. In follow-up patches these will become handy in combination with QStrings. utf16chars() - equivalent of bytes() utf16charsBegin() - equivalent of bytesBegin() utf16charsEnd() - equivalent of bytesEnd() Next steps: * Adapt functions from TranslationUnit. They should work with utf16 chars in order to calculate lines and columns correctly also for UTF-8 multi-byte code points. * Adapt the higher level clients: * Cpp{Tools,Editor} should expect UTF-8 encoded Literals. * Cpp{Tools,Editor}: When dealing with identifiers on the QString/QTextDocument layer, code points represented by two QChars need to be respected, too. * Ensure Macro::offsets() and Document::MacroUse::{begin,end}() report offsets usable in CppEditor/CppTools. Addresses QTCREATORBUG-7356. Change-Id: I0791b5236be8215d24fb8e38a1f7cb0d279454c0 Reviewed-by: Erik Verbruggen --- src/libs/cplusplus/SimpleLexer.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'src/libs/cplusplus/SimpleLexer.cpp') diff --git a/src/libs/cplusplus/SimpleLexer.cpp b/src/libs/cplusplus/SimpleLexer.cpp index 8e539acb84a..95c6c051a59 100644 --- a/src/libs/cplusplus/SimpleLexer.cpp +++ b/src/libs/cplusplus/SimpleLexer.cpp @@ -61,11 +61,11 @@ bool SimpleLexer::endedJoined() const return _endedJoined; } -QList SimpleLexer::operator()(const QString &text, int state) +QList SimpleLexer::operator()(const QString &text, int state, bool convertToUtf8) { QList tokens; - const QByteArray bytes = text.toLatin1(); + const QByteArray bytes = convertToUtf8 ? 
text.toUtf8() : text.toLatin1(); const char *firstChar = bytes.constData(); const char *lastChar = firstChar + bytes.size(); -- cgit v1.2.3