Skip to content

Commit 3eb67d2

Browse files
committed
[Clang] Handle non-ASCII after line splicing
`int a\` followed by `ス;` on the next line failed to be parsed as a valid identifier. Fixes #65156. Reviewed By: tahonermann. Differential Revision: https://reviews.llvm.org/D159345
1 parent 89a81ec commit 3eb67d2

File tree

4 files changed

+70
-18
lines changed

4 files changed

+70
-18
lines changed

clang/docs/ReleaseNotes.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -214,6 +214,8 @@ Bug Fixes in This Version
214214
(`#64987 <https://github.com/llvm/llvm-project/issues/64987>`_)
215215
- Support MSVC predefined macro expressions in constant expressions and in
216216
local structs.
217+
- Correctly parse non-ASCII identifiers that appear immediately after a line splice
218+
(`#65156 <https://github.com/llvm/llvm-project/issues/65156>`_)
217219

218220
Bug Fixes to Compiler Builtins
219221
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

clang/include/clang/Lex/Lexer.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -805,9 +805,10 @@ class Lexer : public PreprocessorLexer {
805805
/// Try to consume an identifier character encoded in UTF-8.
806806
/// \param CurPtr Points to the start of the (potential) UTF-8 code unit
807807
/// sequence. On success, updated to point past the end of it.
808+
/// \param Result The token being formed.
808809
/// \return \c true if a UTF-8 sequence mapping to an acceptable identifier
809810
/// character was lexed, \c false otherwise.
810-
bool tryConsumeIdentifierUTF8Char(const char *&CurPtr);
811+
bool tryConsumeIdentifierUTF8Char(const char *&CurPtr, Token &Result);
811812
};
812813

813814
} // namespace clang

clang/lib/Lex/Lexer.cpp

Lines changed: 28 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1750,15 +1750,21 @@ bool Lexer::tryConsumeIdentifierUCN(const char *&CurPtr, unsigned Size,
17501750
return true;
17511751
}
17521752

1753-
bool Lexer::tryConsumeIdentifierUTF8Char(const char *&CurPtr) {
1754-
const char *UnicodePtr = CurPtr;
1753+
bool Lexer::tryConsumeIdentifierUTF8Char(const char *&CurPtr, Token &Result) {
17551754
llvm::UTF32 CodePoint;
1756-
llvm::ConversionResult Result =
1757-
llvm::convertUTF8Sequence((const llvm::UTF8 **)&UnicodePtr,
1758-
(const llvm::UTF8 *)BufferEnd,
1759-
&CodePoint,
1760-
llvm::strictConversion);
1761-
if (Result != llvm::conversionOK)
1755+
1756+
// If a UTF-8 codepoint appears immediately after an escaped new line,
1757+
// CurPtr may point to the splicing \ on the preceding line,
1758+
// so we need to skip it.
1759+
unsigned FirstCodeUnitSize;
1760+
getCharAndSize(CurPtr, FirstCodeUnitSize);
1761+
const char *CharStart = CurPtr + FirstCodeUnitSize - 1;
1762+
const char *UnicodePtr = CharStart;
1763+
1764+
llvm::ConversionResult ConvResult = llvm::convertUTF8Sequence(
1765+
(const llvm::UTF8 **)&UnicodePtr, (const llvm::UTF8 *)BufferEnd,
1766+
&CodePoint, llvm::strictConversion);
1767+
if (ConvResult != llvm::conversionOK)
17621768
return false;
17631769

17641770
bool IsExtension = false;
@@ -1771,21 +1777,26 @@ bool Lexer::tryConsumeIdentifierUTF8Char(const char *&CurPtr) {
17711777
!PP->isPreprocessedOutput())
17721778
diagnoseInvalidUnicodeCodepointInIdentifier(
17731779
PP->getDiagnostics(), LangOpts, CodePoint,
1774-
makeCharRange(*this, CurPtr, UnicodePtr), /*IsFirst=*/false);
1780+
makeCharRange(*this, CharStart, UnicodePtr), /*IsFirst=*/false);
17751781
// We got a unicode codepoint that is neither a space nor a
17761782
// valid identifier part. Carry on as if the codepoint was
17771783
// valid for recovery purposes.
17781784
} else if (!isLexingRawMode()) {
17791785
if (IsExtension)
1780-
diagnoseExtensionInIdentifier(PP->getDiagnostics(), CodePoint,
1781-
makeCharRange(*this, CurPtr, UnicodePtr));
1786+
diagnoseExtensionInIdentifier(
1787+
PP->getDiagnostics(), CodePoint,
1788+
makeCharRange(*this, CharStart, UnicodePtr));
17821789
maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint,
1783-
makeCharRange(*this, CurPtr, UnicodePtr),
1790+
makeCharRange(*this, CharStart, UnicodePtr),
17841791
/*IsFirst=*/false);
17851792
maybeDiagnoseUTF8Homoglyph(PP->getDiagnostics(), CodePoint,
1786-
makeCharRange(*this, CurPtr, UnicodePtr));
1793+
makeCharRange(*this, CharStart, UnicodePtr));
17871794
}
17881795

1796+
// Once we successfully parsed some UTF-8,
1797+
// calling ConsumeChar ensures the NeedsCleaning flag is set on the token
1798+
// being lexed, and that warnings about trailing spaces are emitted.
1799+
ConsumeChar(CurPtr, FirstCodeUnitSize, Result);
17891800
CurPtr = UnicodePtr;
17901801
return true;
17911802
}
@@ -1865,7 +1876,7 @@ bool Lexer::LexIdentifierContinue(Token &Result, const char *CurPtr) {
18651876
}
18661877
if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result))
18671878
continue;
1868-
if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr))
1879+
if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr, Result))
18691880
continue;
18701881
// Neither an expected Unicode codepoint nor a UCN.
18711882
break;
@@ -1985,7 +1996,7 @@ bool Lexer::LexNumericConstant(Token &Result, const char *CurPtr) {
19851996
// If we have a UCN or UTF-8 character (perhaps in a ud-suffix), continue.
19861997
if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result))
19871998
return LexNumericConstant(Result, CurPtr);
1988-
if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr))
1999+
if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr, Result))
19892000
return LexNumericConstant(Result, CurPtr);
19902001

19912002
// Update the location of token as well as BufferPtr.
@@ -2009,7 +2020,7 @@ const char *Lexer::LexUDSuffix(Token &Result, const char *CurPtr,
20092020
if (!isAsciiIdentifierStart(C)) {
20102021
if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result))
20112022
Consumed = true;
2012-
else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr))
2023+
else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr, Result))
20132024
Consumed = true;
20142025
else
20152026
return CurPtr;
@@ -2079,7 +2090,7 @@ const char *Lexer::LexUDSuffix(Token &Result, const char *CurPtr,
20792090
if (isAsciiIdentifierContinue(C)) {
20802091
CurPtr = ConsumeChar(CurPtr, Size, Result);
20812092
} else if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) {
2082-
} else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr)) {
2093+
} else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr, Result)) {
20832094
} else
20842095
break;
20852096
}
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
// RUN: %clang_cc1 -verify=expected,c -x c -Wunused %s
2+
// RUN: %clang_cc1 -verify=expected,cpp -x c++ -Wunused %s
3+
4+
void gh65156(void) {
5+
6+
int a\
7+
= 42;
8+
// expected-warning@-2 {{unused variable 'aス'}}
9+
10+
int b\
11+
\
12+
= 42;
13+
// expected-warning@-2 {{backslash and newline separated by space}}
14+
// expected-warning@-4 {{backslash and newline separated by space}}
15+
// expected-warning@-5 {{unused variable 'bス'}}
16+
17+
int \
18+
= 42;
19+
// expected-warning@-2 {{unused variable 'スス'}}
20+
21+
int \
22+
= 42;
23+
// expected-warning@-2 {{unused variable 'ス'}}
24+
25+
}
26+
27+
void gh65156_err(void) {
28+
29+
int \
30+
= 0;
31+
// cpp-error@-2 {{expected unqualified-id}}
32+
// c-error@-3 {{expected identifier}}
33+
34+
35+
int a\
36+
= 0;
37+
// expected-error@-1 {{character <U+274C> not allowed in an identifier}}
38+
}

0 commit comments

Comments
 (0)