21 #include "llvm/Support/Regex.h" 30 Column(Column), TrailingWhitespace(0), SourceMgr(SourceMgr), ID(ID),
32 Keywords(IdentTable), Encoding(Encoding), FirstInLineIndex(0),
37 Lex->SetKeepWhitespaceMode(
true);
40 Macros.insert({&IdentTable.
get(ForEachMacro), TT_ForEachMacro});
42 Macros.insert({&IdentTable.
get(StatementMacro), TT_StatementMacro});
46 assert(Tokens.empty());
47 assert(FirstInLineIndex == 0);
49 Tokens.push_back(getNextToken());
51 tryParseJSRegexLiteral();
52 handleTemplateStrings();
55 tryParsePythonComment();
56 tryMergePreviousTokens();
57 if (Tokens.back()->NewlinesBefore > 0 || Tokens.back()->IsMultiline)
58 FirstInLineIndex = Tokens.size() - 1;
59 }
while (Tokens.back()->Tok.isNot(
tok::eof));
63 void FormatTokenLexer::tryMergePreviousTokens() {
64 if (tryMerge_TMacro())
66 if (tryMergeConflictMarkers())
68 if (tryMergeLessLess())
70 if (tryMergeNSStringLiteral())
74 static const tok::TokenKind JSIdentity[] = {tok::equalequal, tok::equal};
77 static const tok::TokenKind JSShiftEqual[] = {tok::greater, tok::greater,
79 static const tok::TokenKind JSRightArrow[] = {tok::equal, tok::greater};
80 static const tok::TokenKind JSExponentiation[] = {tok::star, tok::star};
85 if (tryMergeTokens(JSIdentity, TT_BinaryOperator))
87 if (tryMergeTokens(JSNotIdentity, TT_BinaryOperator))
89 if (tryMergeTokens(JSShiftEqual, TT_BinaryOperator))
91 if (tryMergeTokens(JSRightArrow, TT_JsFatArrow))
93 if (tryMergeTokens(JSExponentiation, TT_JsExponentiation))
95 if (tryMergeTokens(JSExponentiationEqual, TT_JsExponentiationEqual)) {
96 Tokens.back()->Tok.setKind(tok::starequal);
103 tok::greater, tok::greater, tok::greaterequal};
104 if (tryMergeTokens(JavaRightLogicalShiftAssign, TT_BinaryOperator))
109 bool FormatTokenLexer::tryMergeNSStringLiteral() {
110 if (Tokens.size() < 2)
112 auto &At = *(Tokens.end() - 2);
113 auto &String = *(Tokens.end() - 1);
114 if (!At->is(tok::at) || !String->is(tok::string_literal))
116 At->Tok.setKind(tok::string_literal);
117 At->TokenText = StringRef(At->TokenText.begin(),
118 String->TokenText.end() - At->TokenText.begin());
119 At->ColumnWidth += String->ColumnWidth;
120 At->Type = TT_ObjCStringLiteral;
121 Tokens.erase(Tokens.end() - 1);
125 bool FormatTokenLexer::tryMergeLessLess() {
127 if (Tokens.size() < 3)
130 bool FourthTokenIsLess =
false;
131 if (Tokens.size() > 3)
132 FourthTokenIsLess = (Tokens.end() - 4)[0]->is(tok::less);
134 auto First = Tokens.end() - 3;
135 if (
First[2]->is(tok::less) ||
First[1]->isNot(tok::less) ||
136 First[0]->isNot(tok::less) || FourthTokenIsLess)
140 if (
First[1]->WhitespaceRange.getBegin() !=
141 First[1]->WhitespaceRange.getEnd())
144 First[0]->Tok.setKind(tok::lessless);
145 First[0]->TokenText =
"<<";
146 First[0]->ColumnWidth += 1;
147 Tokens.erase(Tokens.end() - 2);
153 if (Tokens.size() < Kinds.size())
157 Tokens.end() - Kinds.size();
158 if (!First[0]->is(Kinds[0]))
160 unsigned AddLength = 0;
161 for (
unsigned i = 1; i < Kinds.size(); ++i) {
162 if (!First[i]->is(Kinds[i]) || First[i]->WhitespaceRange.getBegin() !=
163 First[i]->WhitespaceRange.getEnd())
165 AddLength += First[i]->TokenText.size();
167 Tokens.resize(Tokens.size() - Kinds.size() + 1);
168 First[0]->TokenText = StringRef(First[0]->TokenText.data(),
169 First[0]->TokenText.size() + AddLength);
170 First[0]->ColumnWidth += AddLength;
171 First[0]->Type = NewType;
180 return Tok->
isOneOf(tok::period, tok::l_paren, tok::comma, tok::l_brace,
181 tok::r_brace, tok::l_square, tok::semi, tok::exclaim,
182 tok::colon, tok::question, tok::tilde) ||
183 Tok->
isOneOf(tok::kw_return, tok::kw_do, tok::kw_case, tok::kw_throw,
184 tok::kw_else, tok::kw_new, tok::kw_delete, tok::kw_void,
189 bool FormatTokenLexer::canPrecedeRegexLiteral(
FormatToken *Prev) {
199 if (Prev->
isOneOf(tok::plusplus, tok::minusminus, tok::exclaim))
200 return (Tokens.size() < 3 || precedesOperand(Tokens[Tokens.size() - 3]));
204 if (!precedesOperand(Prev))
214 void FormatTokenLexer::tryParseJSRegexLiteral() {
216 if (!RegexToken->
isOneOf(tok::slash, tok::slashequal))
220 for (
auto I = Tokens.rbegin() + 1, E = Tokens.rend(); I != E; ++I) {
223 if ((*I)->isNot(tok::comment)) {
229 if (!canPrecedeRegexLiteral(Prev))
233 const char *
Offset = Lex->getBufferLocation();
234 const char *RegexBegin = Offset - RegexToken->
TokenText.size();
235 StringRef Buffer = Lex->getBuffer();
236 bool InCharacterClass =
false;
237 bool HaveClosingSlash =
false;
238 for (; !HaveClosingSlash && Offset != Buffer.end(); ++
Offset) {
248 InCharacterClass =
true;
251 InCharacterClass =
false;
254 if (!InCharacterClass)
255 HaveClosingSlash =
true;
260 RegexToken->
Type = TT_RegexLiteral;
263 RegexToken->
TokenText = StringRef(RegexBegin, Offset - RegexBegin);
266 resetLexer(SourceMgr.
getFileOffset(Lex->getSourceLocation(Offset)));
269 void FormatTokenLexer::handleTemplateStrings() {
272 if (BacktickToken->
is(tok::l_brace)) {
276 if (BacktickToken->
is(tok::r_brace)) {
277 if (StateStack.size() == 1)
283 }
else if (BacktickToken->
is(tok::unknown) &&
291 const char *
Offset = Lex->getBufferLocation();
292 const char *TmplBegin = Offset - BacktickToken->
TokenText.size();
293 for (; Offset != Lex->getBuffer().end(); ++
Offset) {
294 if (Offset[0] ==
'`') {
298 if (Offset[0] ==
'\\') {
300 }
else if (Offset + 1 < Lex->getBuffer().end() && Offset[0] ==
'$' &&
309 StringRef LiteralText(TmplBegin, Offset - TmplBegin + 1);
310 BacktickToken->
Type = TT_TemplateString;
311 BacktickToken->
Tok.
setKind(tok::string_literal);
315 size_t FirstBreak = LiteralText.find(
'\n');
316 StringRef FirstLineText = FirstBreak == StringRef::npos
318 : LiteralText.substr(0, FirstBreak);
321 size_t LastBreak = LiteralText.rfind(
'\n');
322 if (LastBreak != StringRef::npos) {
324 unsigned StartColumn = 0;
326 LiteralText.substr(LastBreak + 1, LiteralText.size()), StartColumn,
331 ? Lex->getSourceLocation(Offset + 1)
336 void FormatTokenLexer::tryParsePythonComment() {
338 if (!HashToken->
isOneOf(tok::hash, tok::hashhash))
341 const char *CommentBegin =
342 Lex->getBufferLocation() - HashToken->
TokenText.size();
343 size_t From = CommentBegin - Lex->getBuffer().begin();
344 size_t To = Lex->getBuffer().find_first_of(
'\n', From);
345 if (To == StringRef::npos)
346 To = Lex->getBuffer().size();
347 size_t Len = To - From;
348 HashToken->
Type = TT_LineComment;
350 HashToken->
TokenText = Lex->getBuffer().substr(From, Len);
352 ? Lex->getSourceLocation(CommentBegin + Len)
357 bool FormatTokenLexer::tryMerge_TMacro() {
358 if (Tokens.size() < 4)
361 if (!Last->
is(tok::r_paren))
368 if (!Tokens[Tokens.size() - 3]->is(tok::l_paren))
375 const char *Start = Macro->
TokenText.data();
377 String->
TokenText = StringRef(Start, End - Start);
390 Tokens.back() = String;
394 bool FormatTokenLexer::tryMergeConflictMarkers() {
395 if (Tokens.back()->NewlinesBefore == 0 && Tokens.back()->isNot(
tok::eof))
409 unsigned FirstInLineOffset;
411 Tokens[FirstInLineIndex]->getStartOfNonWhitespace());
412 StringRef Buffer = SourceMgr.
getBuffer(ID)->getBuffer();
414 auto LineOffset = Buffer.rfind(
'\n', FirstInLineOffset);
415 if (LineOffset == StringRef::npos) {
421 auto FirstSpace = Buffer.find_first_of(
" \n", LineOffset);
423 if (FirstSpace == StringRef::npos) {
424 LineStart = Buffer.substr(LineOffset);
426 LineStart = Buffer.substr(LineOffset, FirstSpace - LineOffset);
430 if (LineStart ==
"<<<<<<<" || LineStart ==
">>>>") {
431 Type = TT_ConflictStart;
432 }
else if (LineStart ==
"|||||||" || LineStart ==
"=======" ||
433 LineStart ==
"====") {
434 Type = TT_ConflictAlternative;
435 }
else if (LineStart ==
">>>>>>>" || LineStart ==
"<<<<") {
436 Type = TT_ConflictEnd;
439 if (Type != TT_Unknown) {
442 Tokens.resize(FirstInLineIndex + 1);
446 Tokens.back()->Type = Type;
447 Tokens.back()->Tok.setKind(tok::kw___unknown_anytype);
449 Tokens.push_back(Next);
459 StringRef TokenText = FormatTok->
TokenText;
462 FormatTok =
new (Allocator.Allocate())
FormatToken;
// NOTE(review): this is a fragment of FormatTokenLexer::getNextToken(). The
// scrape that produced this file dropped the signature and a large share of
// the ~130-line body; the embedded numbers ("478", "481", ...) are the
// ORIGINAL file's line numbers, not code. Left byte-identical below — too
// much is missing to reconstruct safely; recover the full body from the
// project's revision history. Comments keyed to those embedded line numbers.
// 478: returning a previously stashed token (second half of a split '>>'
// or '<<') — presumably guarded by a TOKEN_STASHED state check; confirm.
478 return getStashedToken();
// 481-482: allocate a fresh FormatToken from the bump allocator and fill it
// from the raw lexer.
481 FormatTok =
new (Allocator.Allocate())
FormatToken;
482 readRawToken(*FormatTok);
// 485-486: only the very first token of the file gets IsFirst.
485 FormatTok->
IsFirst = IsFirstToken;
486 IsFirstToken =
false;
// 489+: accumulate the width of whitespace/escaped-newline text preceding
// this token; tok::unknown tokens here are whitespace runs kept by
// SetKeepWhitespaceMode.
489 unsigned WhitespaceLength = TrailingWhitespace;
490 while (FormatTok->
Tok.
is(tok::unknown)) {
// 492: helper lambda — checks whether position `pos` in Text ends a
// backslash-escaped newline (counting a run of preceding backslashes and
// tolerating a '\r' before the '\n').
492 auto EscapesNewline = [&](
int pos) {
494 if (pos >= 0 && Text[pos] ==
'\r')
501 for (; pos >= 0; --pos, ++count)
502 if (Text[pos] !=
'\\')
// 508+: walk the whitespace text character by character (newlines, tabs,
// escaped newlines) — interior cases dropped by the scrape.
508 for (
int i = 0, e = Text.size(); i != e; ++i) {
// 531-535: a stray '\\' not followed by a newline makes the token an
// implicit string literal (e.g. inside preprocessor continuations).
531 if (i + 1 == e || (Text[i + 1] !=
'\r' && Text[i + 1] !=
'\n'))
532 FormatTok->
Type = TT_ImplicitStringLiteral;
535 FormatTok->
Type = TT_ImplicitStringLiteral;
538 if (FormatTok->
Type == TT_ImplicitStringLiteral)
542 if (FormatTok->
is(TT_ImplicitStringLiteral))
// 546: not an implicit string literal — lex the next raw token and loop.
546 readRawToken(*FormatTok);
// 558+: JavaScript-specific (presumably — guard dropped): a '\' + newline
// inside a "//" line comment ends the comment; truncate the token there and
// reset the lexer just past the backslash.
558 FormatTok->
is(tok::comment) && FormatTok->
TokenText.startswith(
"//")) {
559 size_t BackslashPos = FormatTok->
TokenText.find(
'\\');
560 while (BackslashPos != StringRef::npos) {
561 if (BackslashPos + 1 < FormatTok->
TokenText.size() &&
562 FormatTok->
TokenText[BackslashPos + 1] ==
'\n') {
563 const char *
Offset = Lex->getBufferLocation();
565 Offset += BackslashPos + 1;
566 resetLexer(SourceMgr.
getFileOffset(Lex->getSourceLocation(Offset)));
573 BackslashPos = FormatTok->
TokenText.find(
'\\', BackslashPos + 1);
// 582-592: count whitespace the raw lexer silently skipped (escaped
// newlines: 3 chars for "\\\r\n", 2 for "\\\n") so column bookkeeping
// stays accurate.
582 unsigned SkippedWhitespace = 0;
585 SkippedWhitespace = 3;
586 else if (FormatTok->
TokenText[1] ==
'\n')
587 SkippedWhitespace = 2;
592 WhitespaceLength += SkippedWhitespace;
// 603-608: comments keep their trailing whitespace out of TokenText; record
// it as TrailingWhitespace for the next token instead.
603 TrailingWhitespace = 0;
604 if (FormatTok->
Tok.
is(tok::comment)) {
606 StringRef UntrimmedText = FormatTok->
TokenText;
608 TrailingWhitespace = UntrimmedText.size() - FormatTok->
TokenText.size();
609 }
// 609+: raw identifiers get keyword lookup (interior lines dropped);
// the isOneOf(kw_struct, ...) checks presumably special-case Objective-C /
// JavaScript keyword handling — confirm against the full source.
else if (FormatTok->
Tok.
is(tok::raw_identifier)) {
614 FormatTok->
isOneOf(tok::kw_struct, tok::kw_union, tok::kw_delete,
619 FormatTok->
isOneOf(tok::kw_struct, tok::kw_union,
624 }
// 624-629: '>>' and '<<' are split into two tokens (the second is stashed)
// so the parser can treat nested template closers — bodies dropped.
else if (FormatTok->
Tok.
is(tok::greatergreater)) {
629 }
else if (FormatTok->
Tok.
is(tok::lessless)) {
// 639+: compute column width, handling multiline tokens line by line.
639 size_t FirstNewlinePos = Text.find(
'\n');
640 if (FirstNewlinePos == StringRef::npos) {
651 Text.substr(0, FirstNewlinePos), Column, Style.
TabWidth, Encoding);
656 Text.substr(Text.find_last_of(
'\n') + 1), 0, Style.
TabWidth, Encoding);
// 662-671: classify identifiers — style-registered macros (ForEach /
// Statement macros) unless this is a PP directive line, and the
// MacroBlockBegin/End regexes from the style.
662 if (!(Tokens.size() > 0 && Tokens.back()->Tok.getIdentifierInfo() &&
663 Tokens.back()->Tok.getIdentifierInfo()->getPPKeywordID() ==
665 it != Macros.end()) {
666 FormatTok->
Type = it->second;
667 }
else if (FormatTok->
is(tok::identifier)) {
668 if (MacroBlockBeginRegex.match(Text)) {
669 FormatTok->
Type = TT_MacroBlockBegin;
670 }
else if (MacroBlockEndRegex.match(Text)) {
671 FormatTok->
Type = TT_MacroBlockEnd;
679 void FormatTokenLexer::readRawToken(
FormatToken &Tok) {
680 Lex->LexFromRawLexer(Tok.
Tok);
685 if (Tok.
is(tok::unknown)) {
698 Tok.
is(tok::char_constant)) {
702 if (Tok.
is(tok::comment) && (Tok.
TokenText ==
"// clang-format on" ||
703 Tok.
TokenText ==
"/* clang-format on */")) {
704 FormattingDisabled =
false;
709 if (Tok.
is(tok::comment) && (Tok.
TokenText ==
"// clang-format off" ||
710 Tok.
TokenText ==
"/* clang-format off */")) {
711 FormattingDisabled =
true;
715 void FormatTokenLexer::resetLexer(
unsigned Offset) {
719 Buffer.begin() +
Offset, Buffer.end()));
720 Lex->SetKeepWhitespaceMode(
true);
721 TrailingWhitespace = 0;
SourceLocation getLocForStartOfFile(FileID FID) const
Return the source location corresponding to the first byte of the specified file.
Lexer - This provides a simple interface that turns a text buffer into a stream of tokens.
SourceLocation getLocWithOffset(int Offset) const
Return a source location with the specified offset from this SourceLocation.
SourceLocation getLocForEndOfFile(FileID FID) const
Return the source location corresponding to the last byte of the specified file.
bool is(tok::TokenKind K) const
is/isNot - Predicates to check if this token is a specific kind, as in "if (Tok.is(tok::l_brace)) {...}".
Defines the SourceManager interface.
const char * getCharacterData(SourceLocation SL, bool *Invalid=nullptr) const
Return a pointer to the start of the specified location in the appropriate spelling MemoryBuffer...
The base class of the type hierarchy.
StringRef getBufferData(FileID FID, bool *Invalid=nullptr) const
Return a StringRef to the source buffer data for the specified FileID.
One of these records is kept for each identifier that is lexed.
Token - This structure provides full information about a lexed token.
void setKind(tok::TokenKind K)
tok::TokenKind getTokenID() const
If this is a source-language token (e.g. 'for'), this API can be used to get its ID.
SourceLocation getLocation() const
Return a source location identifier for the specified offset in the current file. ...
unsigned getFileOffset(SourceLocation SpellingLoc) const
Returns the offset from the start of the file that the specified SourceLocation represents.
llvm::MemoryBuffer * getBuffer(FileID FID, SourceLocation Loc, bool *Invalid=nullptr) const
Return the buffer for the specified FileID.
Encodes a location in the source.
IdentifierInfo & get(StringRef Name)
Return the identifier token info for the specified named identifier.
IdentifierInfo * getIdentifierInfo() const
void setIdentifierInfo(IdentifierInfo *II)
TokenKind
Provides a simple uniform namespace for tokens from all C languages.
An opaque identifier used by SourceManager which refers to a source file (MemoryBuffer) along with it...
Dataflow Directional Tag Classes.
unsigned getLength() const
Defines the clang::SourceLocation class and associated facilities.
void setLocation(SourceLocation L)
A trivial tuple used to represent a source range.
This class handles loading and caching of source files into memory.
std::pair< FileID, unsigned > getDecomposedLoc(SourceLocation Loc) const
Decompose the specified location into a raw FileID + Offset pair.