@@ -1750,15 +1750,21 @@ bool Lexer::tryConsumeIdentifierUCN(const char *&CurPtr, unsigned Size,
1750
1750
return true ;
1751
1751
}
1752
1752
1753
- bool Lexer::tryConsumeIdentifierUTF8Char (const char *&CurPtr) {
1754
- const char *UnicodePtr = CurPtr;
1753
+ bool Lexer::tryConsumeIdentifierUTF8Char (const char *&CurPtr, Token &Result) {
1755
1754
llvm::UTF32 CodePoint;
1756
- llvm::ConversionResult Result =
1757
- llvm::convertUTF8Sequence ((const llvm::UTF8 **)&UnicodePtr,
1758
- (const llvm::UTF8 *)BufferEnd,
1759
- &CodePoint,
1760
- llvm::strictConversion);
1761
- if (Result != llvm::conversionOK)
1755
+
1756
+ // If a UTF-8 codepoint appears immediately after an escaped new line,
1757
+ // CurPtr may point to the splicing \ on the preceding line,
1758
+ // so we need to skip it.
1759
+ unsigned FirstCodeUnitSize;
1760
+ getCharAndSize (CurPtr, FirstCodeUnitSize);
1761
+ const char *CharStart = CurPtr + FirstCodeUnitSize - 1 ;
1762
+ const char *UnicodePtr = CharStart;
1763
+
1764
+ llvm::ConversionResult ConvResult = llvm::convertUTF8Sequence (
1765
+ (const llvm::UTF8 **)&UnicodePtr, (const llvm::UTF8 *)BufferEnd,
1766
+ &CodePoint, llvm::strictConversion);
1767
+ if (ConvResult != llvm::conversionOK)
1762
1768
return false ;
1763
1769
1764
1770
bool IsExtension = false ;
@@ -1771,21 +1777,26 @@ bool Lexer::tryConsumeIdentifierUTF8Char(const char *&CurPtr) {
1771
1777
!PP->isPreprocessedOutput ())
1772
1778
diagnoseInvalidUnicodeCodepointInIdentifier (
1773
1779
PP->getDiagnostics (), LangOpts, CodePoint,
1774
- makeCharRange (*this , CurPtr , UnicodePtr), /* IsFirst=*/ false );
1780
+ makeCharRange (*this , CharStart , UnicodePtr), /* IsFirst=*/ false );
1775
1781
// We got a unicode codepoint that is neither a space nor a
1776
1782
// valid identifier part. Carry on as if the codepoint was
1777
1783
// valid for recovery purposes.
1778
1784
} else if (!isLexingRawMode ()) {
1779
1785
if (IsExtension)
1780
- diagnoseExtensionInIdentifier (PP->getDiagnostics (), CodePoint,
1781
- makeCharRange (*this , CurPtr, UnicodePtr));
1786
+ diagnoseExtensionInIdentifier (
1787
+ PP->getDiagnostics (), CodePoint,
1788
+ makeCharRange (*this , CharStart, UnicodePtr));
1782
1789
maybeDiagnoseIDCharCompat (PP->getDiagnostics (), CodePoint,
1783
- makeCharRange (*this , CurPtr , UnicodePtr),
1790
+ makeCharRange (*this , CharStart , UnicodePtr),
1784
1791
/* IsFirst=*/ false );
1785
1792
maybeDiagnoseUTF8Homoglyph (PP->getDiagnostics (), CodePoint,
1786
- makeCharRange (*this , CurPtr , UnicodePtr));
1793
+ makeCharRange (*this , CharStart , UnicodePtr));
1787
1794
}
1788
1795
1796
+ // Once we have successfully parsed some UTF-8,
1797
+ // calling ConsumeChar ensures the NeedsCleaning flag is set on the token
1798
+ // being lexed, and that warnings about trailing spaces are emitted.
1799
+ ConsumeChar (CurPtr, FirstCodeUnitSize, Result);
1789
1800
CurPtr = UnicodePtr;
1790
1801
return true ;
1791
1802
}
@@ -1865,7 +1876,7 @@ bool Lexer::LexIdentifierContinue(Token &Result, const char *CurPtr) {
1865
1876
}
1866
1877
if (C == ' \\ ' && tryConsumeIdentifierUCN (CurPtr, Size , Result))
1867
1878
continue ;
1868
- if (!isASCII (C) && tryConsumeIdentifierUTF8Char (CurPtr))
1879
+ if (!isASCII (C) && tryConsumeIdentifierUTF8Char (CurPtr, Result ))
1869
1880
continue ;
1870
1881
// Neither an expected Unicode codepoint nor a UCN.
1871
1882
break ;
@@ -1985,7 +1996,7 @@ bool Lexer::LexNumericConstant(Token &Result, const char *CurPtr) {
1985
1996
// If we have a UCN or UTF-8 character (perhaps in a ud-suffix), continue.
1986
1997
if (C == ' \\ ' && tryConsumeIdentifierUCN (CurPtr, Size , Result))
1987
1998
return LexNumericConstant (Result, CurPtr);
1988
- if (!isASCII (C) && tryConsumeIdentifierUTF8Char (CurPtr))
1999
+ if (!isASCII (C) && tryConsumeIdentifierUTF8Char (CurPtr, Result ))
1989
2000
return LexNumericConstant (Result, CurPtr);
1990
2001
1991
2002
// Update the location of token as well as BufferPtr.
@@ -2009,7 +2020,7 @@ const char *Lexer::LexUDSuffix(Token &Result, const char *CurPtr,
2009
2020
if (!isAsciiIdentifierStart (C)) {
2010
2021
if (C == ' \\ ' && tryConsumeIdentifierUCN (CurPtr, Size , Result))
2011
2022
Consumed = true ;
2012
- else if (!isASCII (C) && tryConsumeIdentifierUTF8Char (CurPtr))
2023
+ else if (!isASCII (C) && tryConsumeIdentifierUTF8Char (CurPtr, Result ))
2013
2024
Consumed = true ;
2014
2025
else
2015
2026
return CurPtr;
@@ -2079,7 +2090,7 @@ const char *Lexer::LexUDSuffix(Token &Result, const char *CurPtr,
2079
2090
if (isAsciiIdentifierContinue (C)) {
2080
2091
CurPtr = ConsumeChar (CurPtr, Size , Result);
2081
2092
} else if (C == ' \\ ' && tryConsumeIdentifierUCN (CurPtr, Size , Result)) {
2082
- } else if (!isASCII (C) && tryConsumeIdentifierUTF8Char (CurPtr)) {
2093
+ } else if (!isASCII (C) && tryConsumeIdentifierUTF8Char (CurPtr, Result )) {
2083
2094
} else
2084
2095
break ;
2085
2096
}
0 commit comments