13 #include "llvm/ADT/StringExtras.h" 14 #include "llvm/ADT/StringSwitch.h" 15 #include "llvm/Support/ConvertUTF.h" 16 #include "llvm/Support/ErrorHandling.h" 22 llvm::errs() <<
"comments::Token Kind=" <<
Kind <<
" ";
23 Loc.
print(llvm::errs(), SM);
24 llvm::errs() <<
" " << Length <<
" \"" << L.
getSpelling(*
this, SM) <<
"\"\n";
40 llvm::BumpPtrAllocator &Allocator,
42 char *Resolved = Allocator.Allocate<
char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT);
43 char *ResolvedPtr = Resolved;
44 if (llvm::ConvertCodePointToUTF8(CodePoint, ResolvedPtr))
45 return StringRef(Resolved, ResolvedPtr - Resolved);
52 #include "clang/AST/CommentHTMLTags.inc" 53 #include "clang/AST/CommentHTMLNamedCharacterReferences.inc" 57 StringRef Lexer::resolveHTMLNamedCharacterReference(StringRef Name)
const {
59 return llvm::StringSwitch<StringRef>(Name)
66 .Default(translateHTMLNamedCharacterReferenceToUTF8(Name));
69 StringRef Lexer::resolveHTMLDecimalCharacterReference(StringRef Name)
const {
70 unsigned CodePoint = 0;
71 for (
unsigned i = 0, e = Name.size(); i != e; ++i) {
74 CodePoint += Name[i] -
'0';
79 StringRef Lexer::resolveHTMLHexCharacterReference(StringRef Name)
const {
80 unsigned CodePoint = 0;
81 for (
unsigned i = 0, e = Name.size(); i != e; ++i) {
83 const char C = Name[i];
85 CodePoint += llvm::hexDigitValue(C);
90 void Lexer::skipLineStartingDecorations() {
92 assert(CommentState == LCS_InsideCComment);
94 if (BufferPtr == CommentEnd)
102 const char *NewBufferPtr = BufferPtr;
104 if (NewBufferPtr == CommentEnd)
107 char C = *NewBufferPtr;
110 if (NewBufferPtr == CommentEnd)
115 BufferPtr = NewBufferPtr + 1;
126 const char *findNewline(
const char *BufferPtr,
const char *BufferEnd) {
127 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
134 const char *
skipNewline(
const char *BufferPtr,
const char *BufferEnd) {
135 if (BufferPtr == BufferEnd)
138 if (*BufferPtr ==
'\n')
141 assert(*BufferPtr ==
'\r');
143 if (BufferPtr != BufferEnd && *BufferPtr ==
'\n')
149 const char *skipNamedCharacterReference(
const char *BufferPtr,
150 const char *BufferEnd) {
151 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
158 const char *skipDecimalCharacterReference(
const char *BufferPtr,
159 const char *BufferEnd) {
160 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
167 const char *skipHexCharacterReference(
const char *BufferPtr,
168 const char *BufferEnd) {
169 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
176 bool isHTMLIdentifierStartingCharacter(
char C) {
180 bool isHTMLIdentifierCharacter(
char C) {
184 const char *skipHTMLIdentifier(
const char *BufferPtr,
const char *BufferEnd) {
185 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
186 if (!isHTMLIdentifierCharacter(*BufferPtr))
196 const char *skipHTMLQuotedString(
const char *BufferPtr,
const char *BufferEnd)
198 const char Quote = *BufferPtr;
199 assert(Quote ==
'\"' || Quote ==
'\'');
202 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
203 const char C = *BufferPtr;
204 if (C == Quote && BufferPtr[-1] !=
'\\')
210 const char *
skipWhitespace(
const char *BufferPtr,
const char *BufferEnd) {
211 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
218 bool isWhitespace(
const char *BufferPtr,
const char *BufferEnd) {
222 bool isCommandNameStartCharacter(
char C) {
226 bool isCommandNameCharacter(
char C) {
230 const char *skipCommandName(
const char *BufferPtr,
const char *BufferEnd) {
231 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
232 if (!isCommandNameCharacter(*BufferPtr))
240 const char *findBCPLCommentEnd(
const char *BufferPtr,
const char *BufferEnd) {
241 const char *CurPtr = BufferPtr;
242 while (CurPtr != BufferEnd) {
245 if (CurPtr == BufferEnd)
249 const char *EscapePtr = CurPtr - 1;
253 if (*EscapePtr ==
'\\' ||
254 (EscapePtr - 2 >= BufferPtr && EscapePtr[0] ==
'/' &&
255 EscapePtr[-1] ==
'?' && EscapePtr[-2] ==
'?')) {
266 const char *findCCommentEnd(
const char *BufferPtr,
const char *BufferEnd) {
267 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
268 if (*BufferPtr ==
'*') {
269 assert(BufferPtr + 1 != BufferEnd);
270 if (*(BufferPtr + 1) ==
'/')
274 llvm_unreachable(
"buffer end hit before '*/' was seen");
279 void Lexer::formTokenWithChars(
Token &Result,
const char *TokEnd,
281 const unsigned TokLen = TokEnd - BufferPtr;
286 Result.TextPtr =
"<UNSET>";
292 void Lexer::lexCommentText(
Token &T) {
293 assert(CommentState == LCS_InsideBCPLComment ||
294 CommentState == LCS_InsideCComment);
297 auto HandleNonCommandToken = [&]() ->
void {
298 assert(
State == LS_Normal);
300 const char *TokenPtr = BufferPtr;
301 assert(TokenPtr < CommentEnd);
308 if (CommentState == LCS_InsideCComment)
309 skipLineStartingDecorations();
313 StringRef TokStartSymbols = ParseCommands ?
"\n\r\\@&<" :
"\n\r";
314 size_t End = StringRef(TokenPtr, CommentEnd - TokenPtr)
315 .find_first_of(TokStartSymbols);
316 if (End != StringRef::npos)
319 TokenPtr = CommentEnd;
320 formTextToken(T, TokenPtr);
327 return HandleNonCommandToken();
332 case LS_VerbatimBlockFirstLine:
333 lexVerbatimBlockFirstLine(T);
335 case LS_VerbatimBlockBody:
336 lexVerbatimBlockBody(T);
338 case LS_VerbatimLineText:
339 lexVerbatimLineText(T);
341 case LS_HTMLStartTag:
349 assert(
State == LS_Normal);
350 const char *TokenPtr = BufferPtr;
351 assert(TokenPtr < CommentEnd);
361 if (TokenPtr == CommentEnd) {
362 formTextToken(T, TokenPtr);
370 case '\\':
case '@':
case '&':
case '$':
371 case '#':
case '<':
case '>':
case '%':
372 case '\"':
case '.':
case ':':
375 if (C ==
':' && TokenPtr != CommentEnd && *TokenPtr ==
':') {
379 StringRef UnescapedText(BufferPtr + 1, TokenPtr - (BufferPtr + 1));
380 formTokenWithChars(T, TokenPtr,
tok::text);
386 if (!isCommandNameStartCharacter(*TokenPtr)) {
387 formTextToken(T, TokenPtr);
391 TokenPtr = skipCommandName(TokenPtr, CommentEnd);
392 unsigned Length = TokenPtr - (BufferPtr + 1);
396 if (Length == 1 && TokenPtr[-1] ==
'f' && TokenPtr != CommentEnd) {
398 if (C ==
'$' || C ==
'[' || C ==
']' || C ==
'{' || C ==
'}') {
404 StringRef CommandName(BufferPtr + 1, Length);
406 const CommandInfo *Info = Traits.getCommandInfoOrNULL(CommandName);
408 if ((Info = Traits.getTypoCorrectCommandInfo(CommandName))) {
409 StringRef CorrectedName = Info->
Name;
414 Diag(Loc, diag::warn_correct_comment_command_name)
415 << FullRange << CommandName << CorrectedName
426 setupAndLexVerbatimBlock(T, TokenPtr, *BufferPtr, Info);
430 setupAndLexVerbatimLine(T, TokenPtr, Info);
433 formTokenWithChars(T, TokenPtr, CommandKind);
439 lexHTMLCharacterReference(T);
444 if (TokenPtr == CommentEnd) {
445 formTextToken(T, TokenPtr);
448 const char C = *TokenPtr;
449 if (isHTMLIdentifierStartingCharacter(C))
450 setupAndLexHTMLStartTag(T);
452 setupAndLexHTMLEndTag(T);
454 formTextToken(T, TokenPtr);
459 return HandleNonCommandToken();
463 void Lexer::setupAndLexVerbatimBlock(
Token &T,
464 const char *TextBegin,
468 VerbatimBlockEndCommandName.clear();
469 VerbatimBlockEndCommandName.append(Marker ==
'\\' ?
"\\" :
"@");
478 if (BufferPtr != CommentEnd &&
481 State = LS_VerbatimBlockBody;
485 State = LS_VerbatimBlockFirstLine;
488 void Lexer::lexVerbatimBlockFirstLine(
Token &T) {
490 assert(BufferPtr < CommentEnd);
496 const char *Newline = findNewline(BufferPtr, CommentEnd);
497 StringRef
Line(BufferPtr, Newline - BufferPtr);
500 size_t Pos = Line.find(VerbatimBlockEndCommandName);
502 const char *NextLine;
503 if (Pos == StringRef::npos) {
507 }
else if (Pos == 0) {
509 const char *
End = BufferPtr + VerbatimBlockEndCommandName.size();
510 StringRef Name(BufferPtr + 1, End - (BufferPtr + 1));
517 TextEnd = BufferPtr + Pos;
526 StringRef
Text(BufferPtr, TextEnd - BufferPtr);
530 State = LS_VerbatimBlockBody;
533 void Lexer::lexVerbatimBlockBody(
Token &T) {
534 assert(
State == LS_VerbatimBlockBody);
536 if (CommentState == LCS_InsideCComment)
537 skipLineStartingDecorations();
539 if (BufferPtr == CommentEnd) {
545 lexVerbatimBlockFirstLine(T);
548 void Lexer::setupAndLexVerbatimLine(
Token &T,
const char *TextBegin,
554 State = LS_VerbatimLineText;
557 void Lexer::lexVerbatimLineText(
Token &T) {
558 assert(
State == LS_VerbatimLineText);
561 const char *Newline = findNewline(BufferPtr, CommentEnd);
562 StringRef
Text(BufferPtr, Newline - BufferPtr);
569 void Lexer::lexHTMLCharacterReference(
Token &T) {
570 const char *TokenPtr = BufferPtr;
571 assert(*TokenPtr ==
'&');
573 if (TokenPtr == CommentEnd) {
574 formTextToken(T, TokenPtr);
579 bool isDecimal =
false;
583 TokenPtr = skipNamedCharacterReference(TokenPtr, CommentEnd);
585 }
else if (C ==
'#') {
587 if (TokenPtr == CommentEnd) {
588 formTextToken(T, TokenPtr);
594 TokenPtr = skipDecimalCharacterReference(TokenPtr, CommentEnd);
596 }
else if (C ==
'x' || C ==
'X') {
599 TokenPtr = skipHexCharacterReference(TokenPtr, CommentEnd);
601 formTextToken(T, TokenPtr);
605 formTextToken(T, TokenPtr);
608 if (NamePtr == TokenPtr || TokenPtr == CommentEnd ||
610 formTextToken(T, TokenPtr);
613 StringRef Name(NamePtr, TokenPtr - NamePtr);
617 Resolved = resolveHTMLNamedCharacterReference(Name);
619 Resolved = resolveHTMLDecimalCharacterReference(Name);
621 Resolved = resolveHTMLHexCharacterReference(Name);
623 if (Resolved.empty()) {
624 formTextToken(T, TokenPtr);
627 formTokenWithChars(T, TokenPtr,
tok::text);
631 void Lexer::setupAndLexHTMLStartTag(
Token &T) {
632 assert(BufferPtr[0] ==
'<' &&
633 isHTMLIdentifierStartingCharacter(BufferPtr[1]));
634 const char *TagNameEnd = skipHTMLIdentifier(BufferPtr + 2, CommentEnd);
635 StringRef Name(BufferPtr + 1, TagNameEnd - (BufferPtr + 1));
636 if (!isHTMLTagName(Name)) {
637 formTextToken(T, TagNameEnd);
646 const char C = *BufferPtr;
647 if (BufferPtr != CommentEnd &&
648 (C ==
'>' || C ==
'/' || isHTMLIdentifierStartingCharacter(C)))
649 State = LS_HTMLStartTag;
652 void Lexer::lexHTMLStartTag(
Token &T) {
653 assert(
State == LS_HTMLStartTag);
655 const char *TokenPtr = BufferPtr;
657 if (isHTMLIdentifierCharacter(C)) {
658 TokenPtr = skipHTMLIdentifier(TokenPtr, CommentEnd);
659 StringRef Ident(BufferPtr, TokenPtr - BufferPtr);
670 const char *OpenQuote = TokenPtr;
671 TokenPtr = skipHTMLQuotedString(TokenPtr, CommentEnd);
672 const char *ClosingQuote = TokenPtr;
673 if (TokenPtr != CommentEnd)
677 ClosingQuote - (OpenQuote + 1)));
687 if (TokenPtr != CommentEnd && *TokenPtr ==
'>') {
691 formTextToken(T, TokenPtr);
701 if (BufferPtr == CommentEnd) {
707 if (!isHTMLIdentifierStartingCharacter(C) &&
708 C !=
'=' && C !=
'\"' && C !=
'\'' && C !=
'>') {
714 void Lexer::setupAndLexHTMLEndTag(
Token &T) {
715 assert(BufferPtr[0] ==
'<' && BufferPtr[1] ==
'/');
717 const char *TagNameBegin =
skipWhitespace(BufferPtr + 2, CommentEnd);
718 const char *TagNameEnd = skipHTMLIdentifier(TagNameBegin, CommentEnd);
719 StringRef Name(TagNameBegin, TagNameEnd - TagNameBegin);
720 if (!isHTMLTagName(Name)) {
721 formTextToken(T, TagNameEnd);
730 if (BufferPtr != CommentEnd && *BufferPtr ==
'>')
731 State = LS_HTMLEndTag;
734 void Lexer::lexHTMLEndTag(
Token &T) {
735 assert(BufferPtr != CommentEnd && *BufferPtr ==
'>');
743 const char *BufferStart,
const char *BufferEnd,
745 : Allocator(Allocator), Diags(Diags), Traits(Traits),
746 BufferStart(BufferStart), BufferEnd(BufferEnd), FileLoc(FileLoc),
747 BufferPtr(BufferStart), CommentState(LCS_BeforeComment),
State(LS_Normal),
748 ParseCommands(ParseCommands) {}
752 switch (CommentState) {
753 case LCS_BeforeComment:
754 if (BufferPtr == BufferEnd) {
755 formTokenWithChars(T, BufferPtr,
tok::eof);
759 assert(*BufferPtr ==
'/');
765 if (BufferPtr != BufferEnd) {
770 const char C = *BufferPtr;
771 if (C ==
'/' || C ==
'!')
778 if (BufferPtr != BufferEnd && *BufferPtr ==
'<')
781 CommentState = LCS_InsideBCPLComment;
782 if (State != LS_VerbatimBlockBody && State != LS_VerbatimBlockFirstLine)
784 CommentEnd = findBCPLCommentEnd(BufferPtr, BufferEnd);
791 const char C = *BufferPtr;
792 if ((C ==
'*' && *(BufferPtr + 1) !=
'/') || C ==
'!')
796 if (BufferPtr != BufferEnd && *BufferPtr ==
'<')
799 CommentState = LCS_InsideCComment;
801 CommentEnd = findCCommentEnd(BufferPtr, BufferEnd);
805 llvm_unreachable(
"second character of comment should be '/' or '*'");
808 case LCS_BetweenComments: {
811 const char *EndWhitespace = BufferPtr;
812 while(EndWhitespace != BufferEnd && *EndWhitespace !=
'/')
821 CommentState = LCS_BeforeComment;
825 case LCS_InsideBCPLComment:
826 case LCS_InsideCComment:
827 if (BufferPtr != CommentEnd) {
832 if (CommentState == LCS_InsideCComment) {
833 assert(BufferPtr[0] ==
'*' && BufferPtr[1] ==
'/');
835 assert(BufferPtr <= BufferEnd);
841 CommentState = LCS_BetweenComments;
845 CommentState = LCS_BetweenComments;
857 bool InvalidTemp =
false;
858 StringRef File = SourceMgr.
getBufferData(LocInfo.first, &InvalidTemp);
862 const char *
Begin = File.data() + LocInfo.second;
863 return StringRef(Begin, Tok.
getLength());
static DiagnosticBuilder Diag(DiagnosticsEngine *Diags, const LangOptions &Features, FullSourceLoc TokLoc, const char *TokBegin, const char *TokRangeBegin, const char *TokRangeEnd, unsigned DiagID)
Produce a diagnostic highlighting some portion of a literal.
SourceLocation getLocWithOffset(int Offset) const
Return a source location with the specified offset from this SourceLocation.
StringRef getBufferData(FileID FID, bool *Invalid=nullptr) const
Return a StringRef to the source buffer data for the specified FileID.
LLVM_READONLY bool isHorizontalWhitespace(unsigned char c)
Returns true if this character is horizontal ASCII whitespace: ' ', '\t', '\f', '\v'.
void print(raw_ostream &OS, const SourceManager &SM) const
LLVM_READONLY bool isLetter(unsigned char c)
Return true if this character is an ASCII letter: [a-zA-Z].
LLVM_READONLY bool isWhitespace(unsigned char c)
Return true if this character is horizontal or vertical ASCII whitespace: ' ', '\t', '\f', '\v', '\n', '\r'.
Concrete class used by the front-end to report problems and issues.
static unsigned skipWhitespace(unsigned Idx, StringRef Str, unsigned Length)
Skip over whitespace in the string, starting at the given index.
static unsigned skipNewline(const char *&First, const char *End)
const AnnotatedLine * Line
LLVM_READONLY bool isAlphanumeric(unsigned char c)
Return true if this character is an ASCII letter or digit: [a-zA-Z0-9].
static bool isNamed(const NamedDecl *ND, const char(&Str)[Len])
Encodes a location in the source.
Dataflow Directional Tag Classes.
LLVM_READONLY bool isVerticalWhitespace(unsigned char c)
Returns true if this character is vertical ASCII whitespace: '\n', '\r'.
LLVM_READONLY bool isHexDigit(unsigned char c)
Return true if this character is an ASCII hex digit: [0-9a-fA-F].
LLVM_READONLY bool isDigit(unsigned char c)
Return true if this character is an ASCII digit: [0-9].
static FixItHint CreateReplacement(CharSourceRange RemoveRange, StringRef Code)
Create a code modification hint that replaces the given source range with the given code string...
A trivial tuple used to represent a source range.
This class handles loading and caching of source files into memory.
std::pair< FileID, unsigned > getDecomposedLoc(SourceLocation Loc) const
Decompose the specified location into a raw FileID + Offset pair.