20 #include "llvm/Support/Regex.h" 29 Column(Column), TrailingWhitespace(0), SourceMgr(SourceMgr), ID(ID),
31 Keywords(IdentTable), Encoding(Encoding), FirstInLineIndex(0),
32 FormattingDisabled(
false), MacroBlockBeginRegex(Style.MacroBlockBegin),
33 MacroBlockEndRegex(Style.MacroBlockEnd) {
36 Lex->SetKeepWhitespaceMode(
true);
38 for (
const std::string &ForEachMacro : Style.ForEachMacros)
39 Macros.insert({&IdentTable.
get(ForEachMacro), TT_ForEachMacro});
40 for (
const std::string &StatementMacro : Style.StatementMacros)
41 Macros.insert({&IdentTable.
get(StatementMacro), TT_StatementMacro});
42 for (
const std::string &TypenameMacro : Style.TypenameMacros)
43 Macros.insert({&IdentTable.
get(TypenameMacro), TT_TypenameMacro});
44 for (
const std::string &NamespaceMacro : Style.NamespaceMacros)
45 Macros.insert({&IdentTable.
get(NamespaceMacro), TT_NamespaceMacro});
49 assert(Tokens.empty());
50 assert(FirstInLineIndex == 0);
52 Tokens.push_back(getNextToken());
53 if (Style.Language == FormatStyle::LK_JavaScript) {
54 tryParseJSRegexLiteral();
55 handleTemplateStrings();
57 if (Style.Language == FormatStyle::LK_TextProto)
58 tryParsePythonComment();
59 tryMergePreviousTokens();
60 if (Tokens.back()->NewlinesBefore > 0 || Tokens.back()->IsMultiline)
61 FirstInLineIndex = Tokens.size() - 1;
62 }
while (Tokens.back()->Tok.isNot(
tok::eof));
66 void FormatTokenLexer::tryMergePreviousTokens() {
67 if (tryMerge_TMacro())
69 if (tryMergeConflictMarkers())
71 if (tryMergeLessLess())
74 if (Style.isCSharp()) {
75 if (tryMergeCSharpKeywordVariables())
77 if (tryMergeCSharpVerbatimStringLiteral())
79 if (tryMergeCSharpDoubleQuestion())
81 if (tryMergeCSharpNullConditionals())
83 if (tryTransformCSharpForEach())
85 static const tok::TokenKind JSRightArrow[] = {tok::equal, tok::greater};
86 if (tryMergeTokens(JSRightArrow, TT_JsFatArrow))
90 if (tryMergeNSStringLiteral())
93 if (Style.Language == FormatStyle::LK_JavaScript) {
94 static const tok::TokenKind JSIdentity[] = {tok::equalequal, tok::equal};
97 static const tok::TokenKind JSShiftEqual[] = {tok::greater, tok::greater,
99 static const tok::TokenKind JSRightArrow[] = {tok::equal, tok::greater};
100 static const tok::TokenKind JSExponentiation[] = {tok::star, tok::star};
103 static const tok::TokenKind JSNullPropagatingOperator[] = {tok::question,
109 if (tryMergeTokens(JSIdentity, TT_BinaryOperator))
111 if (tryMergeTokens(JSNotIdentity, TT_BinaryOperator))
113 if (tryMergeTokens(JSShiftEqual, TT_BinaryOperator))
115 if (tryMergeTokens(JSRightArrow, TT_JsFatArrow))
117 if (tryMergeTokens(JSExponentiation, TT_JsExponentiation))
119 if (tryMergeTokens(JSExponentiationEqual, TT_JsExponentiationEqual)) {
120 Tokens.back()->Tok.setKind(tok::starequal);
123 if (tryMergeTokens(JSNullishOperator, TT_JsNullishCoalescingOperator))
125 if (tryMergeTokens(JSNullPropagatingOperator,
126 TT_JsNullPropagatingOperator)) {
128 Tokens.back()->Tok.setKind(tok::period);
131 if (tryMergeJSPrivateIdentifier())
135 if (Style.Language == FormatStyle::LK_Java) {
137 tok::greater, tok::greater, tok::greaterequal};
138 if (tryMergeTokens(JavaRightLogicalShiftAssign, TT_BinaryOperator))
143 bool FormatTokenLexer::tryMergeNSStringLiteral() {
144 if (Tokens.size() < 2)
146 auto &At = *(Tokens.end() - 2);
147 auto &String = *(Tokens.end() - 1);
148 if (!At->is(tok::at) || !String->is(tok::string_literal))
150 At->Tok.setKind(tok::string_literal);
151 At->TokenText = StringRef(At->TokenText.begin(),
152 String->TokenText.end() - At->TokenText.begin());
153 At->ColumnWidth += String->ColumnWidth;
154 At->Type = TT_ObjCStringLiteral;
155 Tokens.erase(Tokens.end() - 1);
159 bool FormatTokenLexer::tryMergeJSPrivateIdentifier() {
162 if (Tokens.size() < 2)
164 auto &Hash = *(Tokens.end() - 2);
166 if (!Hash->is(tok::hash) || !
Identifier->is(tok::identifier))
168 Hash->Tok.setKind(tok::identifier);
170 StringRef(Hash->TokenText.begin(),
171 Identifier->TokenText.end() - Hash->TokenText.begin());
173 Hash->Type = TT_JsPrivateIdentifier;
174 Tokens.erase(Tokens.end() - 1);
181 bool FormatTokenLexer::tryMergeCSharpVerbatimStringLiteral() {
182 if (Tokens.size() < 2)
184 auto &At = *(Tokens.end() - 2);
185 auto &String = *(Tokens.end() - 1);
188 if (!(At->is(tok::at) || At->TokenText ==
"$") ||
189 !String->is(tok::string_literal))
192 if (Tokens.size() >= 2 && At->is(tok::at)) {
193 auto &Dollar = *(Tokens.end() - 3);
194 if (Dollar->TokenText ==
"$") {
196 Dollar->Tok.setKind(tok::string_literal);
198 StringRef(Dollar->TokenText.begin(),
199 String->TokenText.end() - Dollar->TokenText.begin());
200 Dollar->ColumnWidth += (At->ColumnWidth + String->ColumnWidth);
201 Dollar->Type = TT_CSharpStringLiteral;
202 Tokens.erase(Tokens.end() - 2);
203 Tokens.erase(Tokens.end() - 1);
209 At->Tok.setKind(tok::string_literal);
210 At->TokenText = StringRef(At->TokenText.begin(),
211 String->TokenText.end() - At->TokenText.begin());
212 At->ColumnWidth += String->ColumnWidth;
213 At->Type = TT_CSharpStringLiteral;
214 Tokens.erase(Tokens.end() - 1);
218 bool FormatTokenLexer::tryMergeCSharpDoubleQuestion() {
219 if (Tokens.size() < 2)
221 auto &FirstQuestion = *(Tokens.end() - 2);
222 auto &SecondQuestion = *(Tokens.end() - 1);
223 if (!FirstQuestion->is(tok::question) || !SecondQuestion->is(tok::question))
225 FirstQuestion->Tok.setKind(tok::question);
226 FirstQuestion->TokenText = StringRef(FirstQuestion->TokenText.begin(),
227 SecondQuestion->TokenText.end() -
228 FirstQuestion->TokenText.begin());
229 FirstQuestion->ColumnWidth += SecondQuestion->ColumnWidth;
230 FirstQuestion->Type = TT_CSharpNullCoalescing;
231 Tokens.erase(Tokens.end() - 1);
235 bool FormatTokenLexer::tryMergeCSharpKeywordVariables() {
236 if (Tokens.size() < 2)
238 auto &At = *(Tokens.end() - 2);
239 auto &Keyword = *(Tokens.end() - 1);
240 if (!At->is(tok::at))
245 At->Tok.setKind(tok::identifier);
246 At->TokenText = StringRef(At->TokenText.begin(),
247 Keyword->TokenText.end() - At->TokenText.begin());
248 At->ColumnWidth += Keyword->ColumnWidth;
249 At->Type = Keyword->Type;
250 Tokens.erase(Tokens.end() - 1);
255 bool FormatTokenLexer::tryMergeCSharpNullConditionals() {
256 if (Tokens.size() < 2)
259 auto &Question = *(Tokens.end() - 1);
260 if (!
Identifier->isOneOf(tok::r_square, tok::identifier) ||
261 !Question->is(tok::question))
265 Question->TokenText.end() -
Identifier->TokenText.begin());
266 Identifier->ColumnWidth += Question->ColumnWidth;
267 Tokens.erase(Tokens.end() - 1);
272 bool FormatTokenLexer::tryTransformCSharpForEach() {
273 if (Tokens.size() < 1)
286 bool FormatTokenLexer::tryMergeLessLess() {
288 if (Tokens.size() < 3)
291 bool FourthTokenIsLess =
false;
292 if (Tokens.size() > 3)
293 FourthTokenIsLess = (Tokens.end() - 4)[0]->is(tok::less);
295 auto First = Tokens.end() - 3;
296 if (
First[2]->is(tok::less) ||
First[1]->isNot(tok::less) ||
297 First[0]->isNot(tok::less) || FourthTokenIsLess)
301 if (
First[1]->WhitespaceRange.getBegin() !=
302 First[1]->WhitespaceRange.getEnd())
305 First[0]->Tok.setKind(tok::lessless);
306 First[0]->TokenText =
"<<";
307 First[0]->ColumnWidth += 1;
308 Tokens.erase(Tokens.end() - 2);
314 if (Tokens.size() < Kinds.size())
318 Tokens.end() - Kinds.size();
319 if (!First[0]->is(Kinds[0]))
321 unsigned AddLength = 0;
322 for (
unsigned i = 1; i < Kinds.size(); ++i) {
323 if (!First[i]->is(Kinds[i]) || First[i]->WhitespaceRange.getBegin() !=
324 First[i]->WhitespaceRange.getEnd())
326 AddLength += First[i]->TokenText.size();
328 Tokens.resize(Tokens.size() - Kinds.size() + 1);
329 First[0]->TokenText = StringRef(First[0]->TokenText.data(),
330 First[0]->TokenText.size() + AddLength);
331 First[0]->ColumnWidth += AddLength;
332 First[0]->Type = NewType;
341 return Tok->
isOneOf(tok::period, tok::l_paren, tok::comma, tok::l_brace,
342 tok::r_brace, tok::l_square, tok::semi, tok::exclaim,
343 tok::colon, tok::question, tok::tilde) ||
344 Tok->
isOneOf(tok::kw_return, tok::kw_do, tok::kw_case, tok::kw_throw,
345 tok::kw_else, tok::kw_new, tok::kw_delete, tok::kw_void,
350 bool FormatTokenLexer::canPrecedeRegexLiteral(
FormatToken *Prev) {
360 if (Prev->
isOneOf(tok::plusplus, tok::minusminus, tok::exclaim))
361 return (Tokens.size() < 3 || precedesOperand(Tokens[Tokens.size() - 3]));
365 if (!precedesOperand(Prev))
375 void FormatTokenLexer::tryParseJSRegexLiteral() {
377 if (!RegexToken->
isOneOf(tok::slash, tok::slashequal))
381 for (
auto I = Tokens.rbegin() + 1, E = Tokens.rend(); I != E; ++I) {
384 if ((*I)->isNot(tok::comment)) {
390 if (!canPrecedeRegexLiteral(Prev))
394 const char *
Offset = Lex->getBufferLocation();
395 const char *RegexBegin = Offset - RegexToken->
TokenText.size();
396 StringRef Buffer = Lex->getBuffer();
397 bool InCharacterClass =
false;
398 bool HaveClosingSlash =
false;
399 for (; !HaveClosingSlash && Offset != Buffer.end(); ++
Offset) {
409 InCharacterClass =
true;
412 InCharacterClass =
false;
415 if (!InCharacterClass)
416 HaveClosingSlash =
true;
421 RegexToken->
Type = TT_RegexLiteral;
424 RegexToken->
TokenText = StringRef(RegexBegin, Offset - RegexBegin);
427 resetLexer(SourceMgr.
getFileOffset(Lex->getSourceLocation(Offset)));
430 void FormatTokenLexer::handleTemplateStrings() {
433 if (BacktickToken->
is(tok::l_brace)) {
437 if (BacktickToken->
is(tok::r_brace)) {
438 if (StateStack.size() == 1)
444 }
else if (BacktickToken->
is(tok::unknown) &&
452 const char *
Offset = Lex->getBufferLocation();
453 const char *TmplBegin = Offset - BacktickToken->
TokenText.size();
454 for (; Offset != Lex->getBuffer().end(); ++
Offset) {
455 if (Offset[0] ==
'`') {
459 if (Offset[0] ==
'\\') {
461 }
else if (Offset + 1 < Lex->getBuffer().end() && Offset[0] ==
'$' &&
470 StringRef LiteralText(TmplBegin, Offset - TmplBegin + 1);
471 BacktickToken->
Type = TT_TemplateString;
472 BacktickToken->
Tok.
setKind(tok::string_literal);
476 size_t FirstBreak = LiteralText.find(
'\n');
477 StringRef FirstLineText = FirstBreak == StringRef::npos
479 : LiteralText.substr(0, FirstBreak);
481 FirstLineText, BacktickToken->
OriginalColumn, Style.TabWidth, Encoding);
482 size_t LastBreak = LiteralText.rfind(
'\n');
483 if (LastBreak != StringRef::npos) {
485 unsigned StartColumn = 0;
487 LiteralText.substr(LastBreak + 1, LiteralText.size()), StartColumn,
488 Style.TabWidth, Encoding);
492 ? Lex->getSourceLocation(Offset + 1)
497 void FormatTokenLexer::tryParsePythonComment() {
499 if (!HashToken->
isOneOf(tok::hash, tok::hashhash))
502 const char *CommentBegin =
503 Lex->getBufferLocation() - HashToken->
TokenText.size();
504 size_t From = CommentBegin - Lex->getBuffer().begin();
505 size_t To = Lex->getBuffer().find_first_of(
'\n', From);
506 if (To == StringRef::npos)
507 To = Lex->getBuffer().size();
508 size_t Len = To - From;
509 HashToken->
Type = TT_LineComment;
511 HashToken->
TokenText = Lex->getBuffer().substr(From, Len);
513 ? Lex->getSourceLocation(CommentBegin + Len)
518 bool FormatTokenLexer::tryMerge_TMacro() {
519 if (Tokens.size() < 4)
522 if (!Last->
is(tok::r_paren))
529 if (!Tokens[Tokens.size() - 3]->is(tok::l_paren))
536 const char *Start = Macro->
TokenText.data();
538 String->
TokenText = StringRef(Start, End - Start);
551 Tokens.back() = String;
555 bool FormatTokenLexer::tryMergeConflictMarkers() {
556 if (Tokens.back()->NewlinesBefore == 0 && Tokens.back()->isNot(
tok::eof))
570 unsigned FirstInLineOffset;
572 Tokens[FirstInLineIndex]->getStartOfNonWhitespace());
573 StringRef Buffer = SourceMgr.
getBuffer(ID)->getBuffer();
575 auto LineOffset = Buffer.rfind(
'\n', FirstInLineOffset);
576 if (LineOffset == StringRef::npos) {
582 auto FirstSpace = Buffer.find_first_of(
" \n", LineOffset);
584 if (FirstSpace == StringRef::npos) {
585 LineStart = Buffer.substr(LineOffset);
587 LineStart = Buffer.substr(LineOffset, FirstSpace - LineOffset);
591 if (LineStart ==
"<<<<<<<" || LineStart ==
">>>>") {
592 Type = TT_ConflictStart;
593 }
else if (LineStart ==
"|||||||" || LineStart ==
"=======" ||
594 LineStart ==
"====") {
595 Type = TT_ConflictAlternative;
596 }
else if (LineStart ==
">>>>>>>" || LineStart ==
"<<<<") {
597 Type = TT_ConflictEnd;
600 if (Type != TT_Unknown) {
603 Tokens.resize(FirstInLineIndex + 1);
607 Tokens.back()->Type = Type;
608 Tokens.back()->Tok.setKind(tok::kw___unknown_anytype);
610 Tokens.push_back(Next);
620 StringRef TokenText = FormatTok->
TokenText;
623 FormatTok =
new (Allocator.Allocate())
FormatToken;
639 return getStashedToken();
642 FormatTok =
new (Allocator.Allocate())
FormatToken;
643 readRawToken(*FormatTok);
646 FormatTok->
IsFirst = IsFirstToken;
647 IsFirstToken =
false;
650 unsigned WhitespaceLength = TrailingWhitespace;
651 while (FormatTok->
Tok.
is(tok::unknown)) {
653 auto EscapesNewline = [&](
int pos) {
655 if (pos >= 0 && Text[pos] ==
'\r')
662 for (; pos >= 0; --pos, ++count)
663 if (Text[pos] !=
'\\')
669 for (
int i = 0, e = Text.size(); i != e; ++i) {
690 Style.TabWidth - (Style.TabWidth ? Column % Style.TabWidth : 0);
693 if (i + 1 == e || (Text[i + 1] !=
'\r' && Text[i + 1] !=
'\n'))
694 FormatTok->
Type = TT_ImplicitStringLiteral;
697 FormatTok->
Type = TT_ImplicitStringLiteral;
700 if (FormatTok->
Type == TT_ImplicitStringLiteral)
704 if (FormatTok->
is(TT_ImplicitStringLiteral))
708 readRawToken(*FormatTok);
718 if ((Style.Language == FormatStyle::LK_JavaScript ||
719 Style.Language == FormatStyle::LK_Java) &&
720 FormatTok->
is(tok::comment) && FormatTok->
TokenText.startswith(
"//")) {
721 size_t BackslashPos = FormatTok->
TokenText.find(
'\\');
722 while (BackslashPos != StringRef::npos) {
723 if (BackslashPos + 1 < FormatTok->
TokenText.size() &&
724 FormatTok->
TokenText[BackslashPos + 1] ==
'\n') {
725 const char *
Offset = Lex->getBufferLocation();
727 Offset += BackslashPos + 1;
728 resetLexer(SourceMgr.
getFileOffset(Lex->getSourceLocation(Offset)));
735 BackslashPos = FormatTok->
TokenText.find(
'\\', BackslashPos + 1);
744 unsigned SkippedWhitespace = 0;
747 SkippedWhitespace = 3;
748 else if (FormatTok->
TokenText[1] ==
'\n')
749 SkippedWhitespace = 2;
754 WhitespaceLength += SkippedWhitespace;
765 TrailingWhitespace = 0;
766 if (FormatTok->
Tok.
is(tok::comment)) {
768 StringRef UntrimmedText = FormatTok->
TokenText;
770 TrailingWhitespace = UntrimmedText.size() - FormatTok->
TokenText.size();
771 }
else if (FormatTok->
Tok.
is(tok::raw_identifier)) {
775 if (Style.Language == FormatStyle::LK_Java &&
776 FormatTok->
isOneOf(tok::kw_struct, tok::kw_union, tok::kw_delete,
780 }
else if (Style.Language == FormatStyle::LK_JavaScript &&
781 FormatTok->
isOneOf(tok::kw_struct, tok::kw_union,
786 }
else if (FormatTok->
Tok.
is(tok::greatergreater)) {
791 }
else if (FormatTok->
Tok.
is(tok::lessless)) {
801 size_t FirstNewlinePos = Text.find(
'\n');
802 if (FirstNewlinePos == StringRef::npos) {
813 Text.substr(0, FirstNewlinePos), Column, Style.TabWidth, Encoding);
818 Text.substr(Text.find_last_of(
'\n') + 1), 0, Style.TabWidth, Encoding);
824 if (!(Tokens.size() > 0 && Tokens.back()->Tok.getIdentifierInfo() &&
825 Tokens.back()->Tok.getIdentifierInfo()->getPPKeywordID() ==
827 it != Macros.end()) {
828 FormatTok->
Type = it->second;
829 }
else if (FormatTok->
is(tok::identifier)) {
830 if (MacroBlockBeginRegex.match(Text)) {
831 FormatTok->
Type = TT_MacroBlockBegin;
832 }
else if (MacroBlockEndRegex.match(Text)) {
833 FormatTok->
Type = TT_MacroBlockEnd;
841 void FormatTokenLexer::readRawToken(
FormatToken &Tok) {
842 Lex->LexFromRawLexer(Tok.
Tok);
847 if (Tok.
is(tok::unknown)) {
851 }
else if (Style.Language == FormatStyle::LK_JavaScript &&
857 if ((Style.Language == FormatStyle::LK_JavaScript ||
858 Style.Language == FormatStyle::LK_Proto ||
859 Style.Language == FormatStyle::LK_TextProto) &&
860 Tok.
is(tok::char_constant)) {
864 if (Tok.
is(tok::comment) && (Tok.
TokenText ==
"// clang-format on" ||
865 Tok.
TokenText ==
"/* clang-format on */")) {
866 FormattingDisabled =
false;
871 if (Tok.
is(tok::comment) && (Tok.
TokenText ==
"// clang-format off" ||
872 Tok.
TokenText ==
"/* clang-format off */")) {
873 FormattingDisabled =
true;
877 void FormatTokenLexer::resetLexer(
unsigned Offset) {
881 Buffer.begin() +
Offset, Buffer.end()));
882 Lex->SetKeepWhitespaceMode(
true);
883 TrailingWhitespace = 0;
SourceLocation getLocForStartOfFile(FileID FID) const
Return the source location corresponding to the first byte of the specified file. ...
Lexer - This provides a simple interface that turns a text buffer into a stream of tokens...
SourceLocation getLocWithOffset(int Offset) const
Return a source location with the specified offset from this SourceLocation.
SourceLocation getLocForEndOfFile(FileID FID) const
Return the source location corresponding to the last byte of the specified file.
bool is(tok::TokenKind K) const
is/isNot - Predicates to check if this token is a specific kind, as in "if (Tok.is(tok::l_brace)) {...
Defines the SourceManager interface.
const char * getCharacterData(SourceLocation SL, bool *Invalid=nullptr) const
Return a pointer to the start of the specified location in the appropriate spelling MemoryBuffer...
The base class of the type hierarchy.
StringRef getBufferData(FileID FID, bool *Invalid=nullptr) const
Return a StringRef to the source buffer data for the specified FileID.
One of these records is kept for each identifier that is lexed.
Token - This structure provides full information about a lexed token.
void setKind(tok::TokenKind K)
tok::TokenKind getTokenID() const
If this is a source-language token (e.g. 'for'), this API can be used to cause the lexer to map identifiers to source-language tokens.
SourceLocation getLocation() const
Return a source location identifier for the specified offset in the current file. ...
unsigned getFileOffset(SourceLocation SpellingLoc) const
Returns the offset from the start of the file that the specified SourceLocation represents.
Encodes a location in the source.
IdentifierInfo & get(StringRef Name)
Return the identifier token info for the specified named identifier.
IdentifierInfo * getIdentifierInfo() const
void setIdentifierInfo(IdentifierInfo *II)
TokenKind
Provides a simple uniform namespace for tokens from all C languages.
An opaque identifier used by SourceManager which refers to a source file (MemoryBuffer) along with it...
const llvm::MemoryBuffer * getBuffer(FileID FID, SourceLocation Loc, bool *Invalid=nullptr) const
Return the buffer for the specified FileID.
Dataflow Directional Tag Classes.
unsigned getLength() const
Defines the clang::SourceLocation class and associated facilities.
void setLocation(SourceLocation L)
A trivial tuple used to represent a source range.
This class handles loading and caching of source files into memory.
std::pair< FileID, unsigned > getDecomposedLoc(SourceLocation Loc) const
Decompose the specified location into a raw FileID + Offset pair.