clang  10.0.0git
FormatTokenLexer.cpp
Go to the documentation of this file.
1 //===--- FormatTokenLexer.cpp - Lex FormatTokens -------------*- C++ ----*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 ///
9 /// \file
10 /// This file implements FormatTokenLexer, which tokenizes a source file
11 /// into a FormatToken stream suitable for ClangFormat.
12 ///
13 //===----------------------------------------------------------------------===//
14 
15 #include "FormatTokenLexer.h"
16 #include "FormatToken.h"
19 #include "clang/Format/Format.h"
20 #include "llvm/Support/Regex.h"
21 
22 namespace clang {
23 namespace format {
24 
26  unsigned Column, const FormatStyle &Style,
28  : FormatTok(nullptr), IsFirstToken(true), StateStack({LexerState::NORMAL}),
29  Column(Column), TrailingWhitespace(0), SourceMgr(SourceMgr), ID(ID),
30  Style(Style), IdentTable(getFormattingLangOpts(Style)),
31  Keywords(IdentTable), Encoding(Encoding), FirstInLineIndex(0),
32  FormattingDisabled(false), MacroBlockBeginRegex(Style.MacroBlockBegin),
33  MacroBlockEndRegex(Style.MacroBlockEnd) {
34  Lex.reset(new Lexer(ID, SourceMgr.getBuffer(ID), SourceMgr,
35  getFormattingLangOpts(Style)));
36  Lex->SetKeepWhitespaceMode(true);
37 
38  for (const std::string &ForEachMacro : Style.ForEachMacros)
39  Macros.insert({&IdentTable.get(ForEachMacro), TT_ForEachMacro});
40  for (const std::string &StatementMacro : Style.StatementMacros)
41  Macros.insert({&IdentTable.get(StatementMacro), TT_StatementMacro});
42  for (const std::string &TypenameMacro : Style.TypenameMacros)
43  Macros.insert({&IdentTable.get(TypenameMacro), TT_TypenameMacro});
44  for (const std::string &NamespaceMacro : Style.NamespaceMacros)
45  Macros.insert({&IdentTable.get(NamespaceMacro), TT_NamespaceMacro});
46 }
47 
49  assert(Tokens.empty());
50  assert(FirstInLineIndex == 0);
51  do {
52  Tokens.push_back(getNextToken());
53  if (Style.Language == FormatStyle::LK_JavaScript) {
54  tryParseJSRegexLiteral();
55  handleTemplateStrings();
56  }
57  if (Style.Language == FormatStyle::LK_TextProto)
58  tryParsePythonComment();
59  tryMergePreviousTokens();
60  if (Tokens.back()->NewlinesBefore > 0 || Tokens.back()->IsMultiline)
61  FirstInLineIndex = Tokens.size() - 1;
62  } while (Tokens.back()->Tok.isNot(tok::eof));
63  return Tokens;
64 }
65 
66 void FormatTokenLexer::tryMergePreviousTokens() {
67  if (tryMerge_TMacro())
68  return;
69  if (tryMergeConflictMarkers())
70  return;
71  if (tryMergeLessLess())
72  return;
73 
74  if (Style.isCSharp()) {
75  if (tryMergeCSharpKeywordVariables())
76  return;
77  if (tryMergeCSharpVerbatimStringLiteral())
78  return;
79  if (tryMergeCSharpDoubleQuestion())
80  return;
81  if (tryMergeCSharpNullConditionals())
82  return;
83  if (tryTransformCSharpForEach())
84  return;
85  static const tok::TokenKind JSRightArrow[] = {tok::equal, tok::greater};
86  if (tryMergeTokens(JSRightArrow, TT_JsFatArrow))
87  return;
88  }
89 
90  if (tryMergeNSStringLiteral())
91  return;
92 
93  if (Style.Language == FormatStyle::LK_JavaScript) {
94  static const tok::TokenKind JSIdentity[] = {tok::equalequal, tok::equal};
95  static const tok::TokenKind JSNotIdentity[] = {tok::exclaimequal,
96  tok::equal};
97  static const tok::TokenKind JSShiftEqual[] = {tok::greater, tok::greater,
98  tok::greaterequal};
99  static const tok::TokenKind JSRightArrow[] = {tok::equal, tok::greater};
100  static const tok::TokenKind JSExponentiation[] = {tok::star, tok::star};
101  static const tok::TokenKind JSExponentiationEqual[] = {tok::star,
102  tok::starequal};
103  static const tok::TokenKind JSNullPropagatingOperator[] = {tok::question,
104  tok::period};
105  static const tok::TokenKind JSNullishOperator[] = {tok::question,
106  tok::question};
107 
108  // FIXME: Investigate what token type gives the correct operator priority.
109  if (tryMergeTokens(JSIdentity, TT_BinaryOperator))
110  return;
111  if (tryMergeTokens(JSNotIdentity, TT_BinaryOperator))
112  return;
113  if (tryMergeTokens(JSShiftEqual, TT_BinaryOperator))
114  return;
115  if (tryMergeTokens(JSRightArrow, TT_JsFatArrow))
116  return;
117  if (tryMergeTokens(JSExponentiation, TT_JsExponentiation))
118  return;
119  if (tryMergeTokens(JSExponentiationEqual, TT_JsExponentiationEqual)) {
120  Tokens.back()->Tok.setKind(tok::starequal);
121  return;
122  }
123  if (tryMergeTokens(JSNullishOperator, TT_JsNullishCoalescingOperator))
124  return;
125  if (tryMergeTokens(JSNullPropagatingOperator,
126  TT_JsNullPropagatingOperator)) {
127  // Treat like a regular "." access.
128  Tokens.back()->Tok.setKind(tok::period);
129  return;
130  }
131  if (tryMergeJSPrivateIdentifier())
132  return;
133  }
134 
135  if (Style.Language == FormatStyle::LK_Java) {
136  static const tok::TokenKind JavaRightLogicalShiftAssign[] = {
137  tok::greater, tok::greater, tok::greaterequal};
138  if (tryMergeTokens(JavaRightLogicalShiftAssign, TT_BinaryOperator))
139  return;
140  }
141 }
142 
143 bool FormatTokenLexer::tryMergeNSStringLiteral() {
144  if (Tokens.size() < 2)
145  return false;
146  auto &At = *(Tokens.end() - 2);
147  auto &String = *(Tokens.end() - 1);
148  if (!At->is(tok::at) || !String->is(tok::string_literal))
149  return false;
150  At->Tok.setKind(tok::string_literal);
151  At->TokenText = StringRef(At->TokenText.begin(),
152  String->TokenText.end() - At->TokenText.begin());
153  At->ColumnWidth += String->ColumnWidth;
154  At->Type = TT_ObjCStringLiteral;
155  Tokens.erase(Tokens.end() - 1);
156  return true;
157 }
158 
159 bool FormatTokenLexer::tryMergeJSPrivateIdentifier() {
160  // Merges #idenfier into a single identifier with the text #identifier
161  // but the token tok::identifier.
162  if (Tokens.size() < 2)
163  return false;
164  auto &Hash = *(Tokens.end() - 2);
165  auto &Identifier = *(Tokens.end() - 1);
166  if (!Hash->is(tok::hash) || !Identifier->is(tok::identifier))
167  return false;
168  Hash->Tok.setKind(tok::identifier);
169  Hash->TokenText =
170  StringRef(Hash->TokenText.begin(),
171  Identifier->TokenText.end() - Hash->TokenText.begin());
172  Hash->ColumnWidth += Identifier->ColumnWidth;
173  Hash->Type = TT_JsPrivateIdentifier;
174  Tokens.erase(Tokens.end() - 1);
175  return true;
176 }
177 
178 // Search for verbatim or interpolated string literals @"ABC" or
179 // $"aaaaa{abc}aaaaa" i and mark the token as TT_CSharpStringLiteral, and to
180 // prevent splitting of @, $ and ".
181 bool FormatTokenLexer::tryMergeCSharpVerbatimStringLiteral() {
182  if (Tokens.size() < 2)
183  return false;
184  auto &At = *(Tokens.end() - 2);
185  auto &String = *(Tokens.end() - 1);
186 
187  // Look for $"aaaaaa" @"aaaaaa".
188  if (!(At->is(tok::at) || At->TokenText == "$") ||
189  !String->is(tok::string_literal))
190  return false;
191 
192  if (Tokens.size() >= 2 && At->is(tok::at)) {
193  auto &Dollar = *(Tokens.end() - 3);
194  if (Dollar->TokenText == "$") {
195  // This looks like $@"aaaaa" so we need to combine all 3 tokens.
196  Dollar->Tok.setKind(tok::string_literal);
197  Dollar->TokenText =
198  StringRef(Dollar->TokenText.begin(),
199  String->TokenText.end() - Dollar->TokenText.begin());
200  Dollar->ColumnWidth += (At->ColumnWidth + String->ColumnWidth);
201  Dollar->Type = TT_CSharpStringLiteral;
202  Tokens.erase(Tokens.end() - 2);
203  Tokens.erase(Tokens.end() - 1);
204  return true;
205  }
206  }
207 
208  // Convert back into just a string_literal.
209  At->Tok.setKind(tok::string_literal);
210  At->TokenText = StringRef(At->TokenText.begin(),
211  String->TokenText.end() - At->TokenText.begin());
212  At->ColumnWidth += String->ColumnWidth;
213  At->Type = TT_CSharpStringLiteral;
214  Tokens.erase(Tokens.end() - 1);
215  return true;
216 }
217 
218 bool FormatTokenLexer::tryMergeCSharpDoubleQuestion() {
219  if (Tokens.size() < 2)
220  return false;
221  auto &FirstQuestion = *(Tokens.end() - 2);
222  auto &SecondQuestion = *(Tokens.end() - 1);
223  if (!FirstQuestion->is(tok::question) || !SecondQuestion->is(tok::question))
224  return false;
225  FirstQuestion->Tok.setKind(tok::question);
226  FirstQuestion->TokenText = StringRef(FirstQuestion->TokenText.begin(),
227  SecondQuestion->TokenText.end() -
228  FirstQuestion->TokenText.begin());
229  FirstQuestion->ColumnWidth += SecondQuestion->ColumnWidth;
230  FirstQuestion->Type = TT_CSharpNullCoalescing;
231  Tokens.erase(Tokens.end() - 1);
232  return true;
233 }
234 
235 bool FormatTokenLexer::tryMergeCSharpKeywordVariables() {
236  if (Tokens.size() < 2)
237  return false;
238  auto &At = *(Tokens.end() - 2);
239  auto &Keyword = *(Tokens.end() - 1);
240  if (!At->is(tok::at))
241  return false;
242  if (!Keywords.isCSharpKeyword(*Keyword))
243  return false;
244 
245  At->Tok.setKind(tok::identifier);
246  At->TokenText = StringRef(At->TokenText.begin(),
247  Keyword->TokenText.end() - At->TokenText.begin());
248  At->ColumnWidth += Keyword->ColumnWidth;
249  At->Type = Keyword->Type;
250  Tokens.erase(Tokens.end() - 1);
251  return true;
252 }
253 
254 // In C# merge the Identifier and the ? together e.g. arg?.
255 bool FormatTokenLexer::tryMergeCSharpNullConditionals() {
256  if (Tokens.size() < 2)
257  return false;
258  auto &Identifier = *(Tokens.end() - 2);
259  auto &Question = *(Tokens.end() - 1);
260  if (!Identifier->isOneOf(tok::r_square, tok::identifier) ||
261  !Question->is(tok::question))
262  return false;
263  Identifier->TokenText =
264  StringRef(Identifier->TokenText.begin(),
265  Question->TokenText.end() - Identifier->TokenText.begin());
266  Identifier->ColumnWidth += Question->ColumnWidth;
267  Tokens.erase(Tokens.end() - 1);
268  return true;
269 }
270 
271 // In C# transform identifier foreach into kw_foreach
272 bool FormatTokenLexer::tryTransformCSharpForEach() {
273  if (Tokens.size() < 1)
274  return false;
275  auto &Identifier = *(Tokens.end() - 1);
276  if (!Identifier->is(tok::identifier))
277  return false;
278  if (Identifier->TokenText != "foreach")
279  return false;
280 
281  Identifier->Type = TT_ForEachMacro;
282  Identifier->Tok.setKind(tok::kw_for);
283  return true;
284 }
285 
286 bool FormatTokenLexer::tryMergeLessLess() {
287  // Merge X,less,less,Y into X,lessless,Y unless X or Y is less.
288  if (Tokens.size() < 3)
289  return false;
290 
291  bool FourthTokenIsLess = false;
292  if (Tokens.size() > 3)
293  FourthTokenIsLess = (Tokens.end() - 4)[0]->is(tok::less);
294 
295  auto First = Tokens.end() - 3;
296  if (First[2]->is(tok::less) || First[1]->isNot(tok::less) ||
297  First[0]->isNot(tok::less) || FourthTokenIsLess)
298  return false;
299 
300  // Only merge if there currently is no whitespace between the two "<".
301  if (First[1]->WhitespaceRange.getBegin() !=
302  First[1]->WhitespaceRange.getEnd())
303  return false;
304 
305  First[0]->Tok.setKind(tok::lessless);
306  First[0]->TokenText = "<<";
307  First[0]->ColumnWidth += 1;
308  Tokens.erase(Tokens.end() - 2);
309  return true;
310 }
311 
312 bool FormatTokenLexer::tryMergeTokens(ArrayRef<tok::TokenKind> Kinds,
313  TokenType NewType) {
314  if (Tokens.size() < Kinds.size())
315  return false;
316 
318  Tokens.end() - Kinds.size();
319  if (!First[0]->is(Kinds[0]))
320  return false;
321  unsigned AddLength = 0;
322  for (unsigned i = 1; i < Kinds.size(); ++i) {
323  if (!First[i]->is(Kinds[i]) || First[i]->WhitespaceRange.getBegin() !=
324  First[i]->WhitespaceRange.getEnd())
325  return false;
326  AddLength += First[i]->TokenText.size();
327  }
328  Tokens.resize(Tokens.size() - Kinds.size() + 1);
329  First[0]->TokenText = StringRef(First[0]->TokenText.data(),
330  First[0]->TokenText.size() + AddLength);
331  First[0]->ColumnWidth += AddLength;
332  First[0]->Type = NewType;
333  return true;
334 }
335 
336 // Returns \c true if \p Tok can only be followed by an operand in JavaScript.
337 bool FormatTokenLexer::precedesOperand(FormatToken *Tok) {
338  // NB: This is not entirely correct, as an r_paren can introduce an operand
339  // location in e.g. `if (foo) /bar/.exec(...);`. That is a rare enough
340  // corner case to not matter in practice, though.
341  return Tok->isOneOf(tok::period, tok::l_paren, tok::comma, tok::l_brace,
342  tok::r_brace, tok::l_square, tok::semi, tok::exclaim,
343  tok::colon, tok::question, tok::tilde) ||
344  Tok->isOneOf(tok::kw_return, tok::kw_do, tok::kw_case, tok::kw_throw,
345  tok::kw_else, tok::kw_new, tok::kw_delete, tok::kw_void,
346  tok::kw_typeof, Keywords.kw_instanceof, Keywords.kw_in) ||
347  Tok->isBinaryOperator();
348 }
349 
350 bool FormatTokenLexer::canPrecedeRegexLiteral(FormatToken *Prev) {
351  if (!Prev)
352  return true;
353 
354  // Regex literals can only follow after prefix unary operators, not after
355  // postfix unary operators. If the '++' is followed by a non-operand
356  // introducing token, the slash here is the operand and not the start of a
357  // regex.
358  // `!` is an unary prefix operator, but also a post-fix operator that casts
359  // away nullability, so the same check applies.
360  if (Prev->isOneOf(tok::plusplus, tok::minusminus, tok::exclaim))
361  return (Tokens.size() < 3 || precedesOperand(Tokens[Tokens.size() - 3]));
362 
363  // The previous token must introduce an operand location where regex
364  // literals can occur.
365  if (!precedesOperand(Prev))
366  return false;
367 
368  return true;
369 }
370 
371 // Tries to parse a JavaScript Regex literal starting at the current token,
372 // if that begins with a slash and is in a location where JavaScript allows
373 // regex literals. Changes the current token to a regex literal and updates
374 // its text if successful.
375 void FormatTokenLexer::tryParseJSRegexLiteral() {
376  FormatToken *RegexToken = Tokens.back();
377  if (!RegexToken->isOneOf(tok::slash, tok::slashequal))
378  return;
379 
380  FormatToken *Prev = nullptr;
381  for (auto I = Tokens.rbegin() + 1, E = Tokens.rend(); I != E; ++I) {
382  // NB: Because previous pointers are not initialized yet, this cannot use
383  // Token.getPreviousNonComment.
384  if ((*I)->isNot(tok::comment)) {
385  Prev = *I;
386  break;
387  }
388  }
389 
390  if (!canPrecedeRegexLiteral(Prev))
391  return;
392 
393  // 'Manually' lex ahead in the current file buffer.
394  const char *Offset = Lex->getBufferLocation();
395  const char *RegexBegin = Offset - RegexToken->TokenText.size();
396  StringRef Buffer = Lex->getBuffer();
397  bool InCharacterClass = false;
398  bool HaveClosingSlash = false;
399  for (; !HaveClosingSlash && Offset != Buffer.end(); ++Offset) {
400  // Regular expressions are terminated with a '/', which can only be
401  // escaped using '\' or a character class between '[' and ']'.
402  // See http://www.ecma-international.org/ecma-262/5.1/#sec-7.8.5.
403  switch (*Offset) {
404  case '\\':
405  // Skip the escaped character.
406  ++Offset;
407  break;
408  case '[':
409  InCharacterClass = true;
410  break;
411  case ']':
412  InCharacterClass = false;
413  break;
414  case '/':
415  if (!InCharacterClass)
416  HaveClosingSlash = true;
417  break;
418  }
419  }
420 
421  RegexToken->Type = TT_RegexLiteral;
422  // Treat regex literals like other string_literals.
423  RegexToken->Tok.setKind(tok::string_literal);
424  RegexToken->TokenText = StringRef(RegexBegin, Offset - RegexBegin);
425  RegexToken->ColumnWidth = RegexToken->TokenText.size();
426 
427  resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(Offset)));
428 }
429 
430 void FormatTokenLexer::handleTemplateStrings() {
431  FormatToken *BacktickToken = Tokens.back();
432 
433  if (BacktickToken->is(tok::l_brace)) {
434  StateStack.push(LexerState::NORMAL);
435  return;
436  }
437  if (BacktickToken->is(tok::r_brace)) {
438  if (StateStack.size() == 1)
439  return;
440  StateStack.pop();
441  if (StateStack.top() != LexerState::TEMPLATE_STRING)
442  return;
443  // If back in TEMPLATE_STRING, fallthrough and continue parsing the
444  } else if (BacktickToken->is(tok::unknown) &&
445  BacktickToken->TokenText == "`") {
446  StateStack.push(LexerState::TEMPLATE_STRING);
447  } else {
448  return; // Not actually a template
449  }
450 
451  // 'Manually' lex ahead in the current file buffer.
452  const char *Offset = Lex->getBufferLocation();
453  const char *TmplBegin = Offset - BacktickToken->TokenText.size(); // at "`"
454  for (; Offset != Lex->getBuffer().end(); ++Offset) {
455  if (Offset[0] == '`') {
456  StateStack.pop();
457  break;
458  }
459  if (Offset[0] == '\\') {
460  ++Offset; // Skip the escaped character.
461  } else if (Offset + 1 < Lex->getBuffer().end() && Offset[0] == '$' &&
462  Offset[1] == '{') {
463  // '${' introduces an expression interpolation in the template string.
464  StateStack.push(LexerState::NORMAL);
465  ++Offset;
466  break;
467  }
468  }
469 
470  StringRef LiteralText(TmplBegin, Offset - TmplBegin + 1);
471  BacktickToken->Type = TT_TemplateString;
472  BacktickToken->Tok.setKind(tok::string_literal);
473  BacktickToken->TokenText = LiteralText;
474 
475  // Adjust width for potentially multiline string literals.
476  size_t FirstBreak = LiteralText.find('\n');
477  StringRef FirstLineText = FirstBreak == StringRef::npos
478  ? LiteralText
479  : LiteralText.substr(0, FirstBreak);
481  FirstLineText, BacktickToken->OriginalColumn, Style.TabWidth, Encoding);
482  size_t LastBreak = LiteralText.rfind('\n');
483  if (LastBreak != StringRef::npos) {
484  BacktickToken->IsMultiline = true;
485  unsigned StartColumn = 0; // The template tail spans the entire line.
487  LiteralText.substr(LastBreak + 1, LiteralText.size()), StartColumn,
488  Style.TabWidth, Encoding);
489  }
490 
491  SourceLocation loc = Offset < Lex->getBuffer().end()
492  ? Lex->getSourceLocation(Offset + 1)
493  : SourceMgr.getLocForEndOfFile(ID);
494  resetLexer(SourceMgr.getFileOffset(loc));
495 }
496 
497 void FormatTokenLexer::tryParsePythonComment() {
498  FormatToken *HashToken = Tokens.back();
499  if (!HashToken->isOneOf(tok::hash, tok::hashhash))
500  return;
501  // Turn the remainder of this line into a comment.
502  const char *CommentBegin =
503  Lex->getBufferLocation() - HashToken->TokenText.size(); // at "#"
504  size_t From = CommentBegin - Lex->getBuffer().begin();
505  size_t To = Lex->getBuffer().find_first_of('\n', From);
506  if (To == StringRef::npos)
507  To = Lex->getBuffer().size();
508  size_t Len = To - From;
509  HashToken->Type = TT_LineComment;
510  HashToken->Tok.setKind(tok::comment);
511  HashToken->TokenText = Lex->getBuffer().substr(From, Len);
512  SourceLocation Loc = To < Lex->getBuffer().size()
513  ? Lex->getSourceLocation(CommentBegin + Len)
514  : SourceMgr.getLocForEndOfFile(ID);
515  resetLexer(SourceMgr.getFileOffset(Loc));
516 }
517 
518 bool FormatTokenLexer::tryMerge_TMacro() {
519  if (Tokens.size() < 4)
520  return false;
521  FormatToken *Last = Tokens.back();
522  if (!Last->is(tok::r_paren))
523  return false;
524 
525  FormatToken *String = Tokens[Tokens.size() - 2];
526  if (!String->is(tok::string_literal) || String->IsMultiline)
527  return false;
528 
529  if (!Tokens[Tokens.size() - 3]->is(tok::l_paren))
530  return false;
531 
532  FormatToken *Macro = Tokens[Tokens.size() - 4];
533  if (Macro->TokenText != "_T")
534  return false;
535 
536  const char *Start = Macro->TokenText.data();
537  const char *End = Last->TokenText.data() + Last->TokenText.size();
538  String->TokenText = StringRef(Start, End - Start);
539  String->IsFirst = Macro->IsFirst;
540  String->LastNewlineOffset = Macro->LastNewlineOffset;
541  String->WhitespaceRange = Macro->WhitespaceRange;
542  String->OriginalColumn = Macro->OriginalColumn;
544  String->TokenText, String->OriginalColumn, Style.TabWidth, Encoding);
545  String->NewlinesBefore = Macro->NewlinesBefore;
546  String->HasUnescapedNewline = Macro->HasUnescapedNewline;
547 
548  Tokens.pop_back();
549  Tokens.pop_back();
550  Tokens.pop_back();
551  Tokens.back() = String;
552  return true;
553 }
554 
555 bool FormatTokenLexer::tryMergeConflictMarkers() {
556  if (Tokens.back()->NewlinesBefore == 0 && Tokens.back()->isNot(tok::eof))
557  return false;
558 
559  // Conflict lines look like:
560  // <marker> <text from the vcs>
561  // For example:
562  // >>>>>>> /file/in/file/system at revision 1234
563  //
564  // We merge all tokens in a line that starts with a conflict marker
565  // into a single token with a special token type that the unwrapped line
566  // parser will use to correctly rebuild the underlying code.
567 
568  FileID ID;
569  // Get the position of the first token in the line.
570  unsigned FirstInLineOffset;
571  std::tie(ID, FirstInLineOffset) = SourceMgr.getDecomposedLoc(
572  Tokens[FirstInLineIndex]->getStartOfNonWhitespace());
573  StringRef Buffer = SourceMgr.getBuffer(ID)->getBuffer();
574  // Calculate the offset of the start of the current line.
575  auto LineOffset = Buffer.rfind('\n', FirstInLineOffset);
576  if (LineOffset == StringRef::npos) {
577  LineOffset = 0;
578  } else {
579  ++LineOffset;
580  }
581 
582  auto FirstSpace = Buffer.find_first_of(" \n", LineOffset);
583  StringRef LineStart;
584  if (FirstSpace == StringRef::npos) {
585  LineStart = Buffer.substr(LineOffset);
586  } else {
587  LineStart = Buffer.substr(LineOffset, FirstSpace - LineOffset);
588  }
589 
590  TokenType Type = TT_Unknown;
591  if (LineStart == "<<<<<<<" || LineStart == ">>>>") {
592  Type = TT_ConflictStart;
593  } else if (LineStart == "|||||||" || LineStart == "=======" ||
594  LineStart == "====") {
595  Type = TT_ConflictAlternative;
596  } else if (LineStart == ">>>>>>>" || LineStart == "<<<<") {
597  Type = TT_ConflictEnd;
598  }
599 
600  if (Type != TT_Unknown) {
601  FormatToken *Next = Tokens.back();
602 
603  Tokens.resize(FirstInLineIndex + 1);
604  // We do not need to build a complete token here, as we will skip it
605  // during parsing anyway (as we must not touch whitespace around conflict
606  // markers).
607  Tokens.back()->Type = Type;
608  Tokens.back()->Tok.setKind(tok::kw___unknown_anytype);
609 
610  Tokens.push_back(Next);
611  return true;
612  }
613 
614  return false;
615 }
616 
617 FormatToken *FormatTokenLexer::getStashedToken() {
618  // Create a synthesized second '>' or '<' token.
619  Token Tok = FormatTok->Tok;
620  StringRef TokenText = FormatTok->TokenText;
621 
622  unsigned OriginalColumn = FormatTok->OriginalColumn;
623  FormatTok = new (Allocator.Allocate()) FormatToken;
624  FormatTok->Tok = Tok;
625  SourceLocation TokLocation =
626  FormatTok->Tok.getLocation().getLocWithOffset(Tok.getLength() - 1);
627  FormatTok->Tok.setLocation(TokLocation);
628  FormatTok->WhitespaceRange = SourceRange(TokLocation, TokLocation);
629  FormatTok->TokenText = TokenText;
630  FormatTok->ColumnWidth = 1;
631  FormatTok->OriginalColumn = OriginalColumn + 1;
632 
633  return FormatTok;
634 }
635 
636 FormatToken *FormatTokenLexer::getNextToken() {
637  if (StateStack.top() == LexerState::TOKEN_STASHED) {
638  StateStack.pop();
639  return getStashedToken();
640  }
641 
642  FormatTok = new (Allocator.Allocate()) FormatToken;
643  readRawToken(*FormatTok);
644  SourceLocation WhitespaceStart =
645  FormatTok->Tok.getLocation().getLocWithOffset(-TrailingWhitespace);
646  FormatTok->IsFirst = IsFirstToken;
647  IsFirstToken = false;
648 
649  // Consume and record whitespace until we find a significant token.
650  unsigned WhitespaceLength = TrailingWhitespace;
651  while (FormatTok->Tok.is(tok::unknown)) {
652  StringRef Text = FormatTok->TokenText;
653  auto EscapesNewline = [&](int pos) {
654  // A '\r' here is just part of '\r\n'. Skip it.
655  if (pos >= 0 && Text[pos] == '\r')
656  --pos;
657  // See whether there is an odd number of '\' before this.
658  // FIXME: This is wrong. A '\' followed by a newline is always removed,
659  // regardless of whether there is another '\' before it.
660  // FIXME: Newlines can also be escaped by a '?' '?' '/' trigraph.
661  unsigned count = 0;
662  for (; pos >= 0; --pos, ++count)
663  if (Text[pos] != '\\')
664  break;
665  return count & 1;
666  };
667  // FIXME: This miscounts tok:unknown tokens that are not just
668  // whitespace, e.g. a '`' character.
669  for (int i = 0, e = Text.size(); i != e; ++i) {
670  switch (Text[i]) {
671  case '\n':
672  ++FormatTok->NewlinesBefore;
673  FormatTok->HasUnescapedNewline = !EscapesNewline(i - 1);
674  FormatTok->LastNewlineOffset = WhitespaceLength + i + 1;
675  Column = 0;
676  break;
677  case '\r':
678  FormatTok->LastNewlineOffset = WhitespaceLength + i + 1;
679  Column = 0;
680  break;
681  case '\f':
682  case '\v':
683  Column = 0;
684  break;
685  case ' ':
686  ++Column;
687  break;
688  case '\t':
689  Column +=
690  Style.TabWidth - (Style.TabWidth ? Column % Style.TabWidth : 0);
691  break;
692  case '\\':
693  if (i + 1 == e || (Text[i + 1] != '\r' && Text[i + 1] != '\n'))
694  FormatTok->Type = TT_ImplicitStringLiteral;
695  break;
696  default:
697  FormatTok->Type = TT_ImplicitStringLiteral;
698  break;
699  }
700  if (FormatTok->Type == TT_ImplicitStringLiteral)
701  break;
702  }
703 
704  if (FormatTok->is(TT_ImplicitStringLiteral))
705  break;
706  WhitespaceLength += FormatTok->Tok.getLength();
707 
708  readRawToken(*FormatTok);
709  }
710 
711  // JavaScript and Java do not allow to escape the end of the line with a
712  // backslash. Backslashes are syntax errors in plain source, but can occur in
713  // comments. When a single line comment ends with a \, it'll cause the next
714  // line of code to be lexed as a comment, breaking formatting. The code below
715  // finds comments that contain a backslash followed by a line break, truncates
716  // the comment token at the backslash, and resets the lexer to restart behind
717  // the backslash.
718  if ((Style.Language == FormatStyle::LK_JavaScript ||
719  Style.Language == FormatStyle::LK_Java) &&
720  FormatTok->is(tok::comment) && FormatTok->TokenText.startswith("//")) {
721  size_t BackslashPos = FormatTok->TokenText.find('\\');
722  while (BackslashPos != StringRef::npos) {
723  if (BackslashPos + 1 < FormatTok->TokenText.size() &&
724  FormatTok->TokenText[BackslashPos + 1] == '\n') {
725  const char *Offset = Lex->getBufferLocation();
726  Offset -= FormatTok->TokenText.size();
727  Offset += BackslashPos + 1;
728  resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(Offset)));
729  FormatTok->TokenText = FormatTok->TokenText.substr(0, BackslashPos + 1);
731  FormatTok->TokenText, FormatTok->OriginalColumn, Style.TabWidth,
732  Encoding);
733  break;
734  }
735  BackslashPos = FormatTok->TokenText.find('\\', BackslashPos + 1);
736  }
737  }
738 
739  // In case the token starts with escaped newlines, we want to
740  // take them into account as whitespace - this pattern is quite frequent
741  // in macro definitions.
742  // FIXME: Add a more explicit test.
743  while (FormatTok->TokenText.size() > 1 && FormatTok->TokenText[0] == '\\') {
744  unsigned SkippedWhitespace = 0;
745  if (FormatTok->TokenText.size() > 2 &&
746  (FormatTok->TokenText[1] == '\r' && FormatTok->TokenText[2] == '\n'))
747  SkippedWhitespace = 3;
748  else if (FormatTok->TokenText[1] == '\n')
749  SkippedWhitespace = 2;
750  else
751  break;
752 
753  ++FormatTok->NewlinesBefore;
754  WhitespaceLength += SkippedWhitespace;
755  FormatTok->LastNewlineOffset = SkippedWhitespace;
756  Column = 0;
757  FormatTok->TokenText = FormatTok->TokenText.substr(SkippedWhitespace);
758  }
759 
760  FormatTok->WhitespaceRange = SourceRange(
761  WhitespaceStart, WhitespaceStart.getLocWithOffset(WhitespaceLength));
762 
763  FormatTok->OriginalColumn = Column;
764 
765  TrailingWhitespace = 0;
766  if (FormatTok->Tok.is(tok::comment)) {
767  // FIXME: Add the trimmed whitespace to Column.
768  StringRef UntrimmedText = FormatTok->TokenText;
769  FormatTok->TokenText = FormatTok->TokenText.rtrim(" \t\v\f");
770  TrailingWhitespace = UntrimmedText.size() - FormatTok->TokenText.size();
771  } else if (FormatTok->Tok.is(tok::raw_identifier)) {
772  IdentifierInfo &Info = IdentTable.get(FormatTok->TokenText);
773  FormatTok->Tok.setIdentifierInfo(&Info);
774  FormatTok->Tok.setKind(Info.getTokenID());
775  if (Style.Language == FormatStyle::LK_Java &&
776  FormatTok->isOneOf(tok::kw_struct, tok::kw_union, tok::kw_delete,
777  tok::kw_operator)) {
778  FormatTok->Tok.setKind(tok::identifier);
779  FormatTok->Tok.setIdentifierInfo(nullptr);
780  } else if (Style.Language == FormatStyle::LK_JavaScript &&
781  FormatTok->isOneOf(tok::kw_struct, tok::kw_union,
782  tok::kw_operator)) {
783  FormatTok->Tok.setKind(tok::identifier);
784  FormatTok->Tok.setIdentifierInfo(nullptr);
785  }
786  } else if (FormatTok->Tok.is(tok::greatergreater)) {
787  FormatTok->Tok.setKind(tok::greater);
788  FormatTok->TokenText = FormatTok->TokenText.substr(0, 1);
789  ++Column;
790  StateStack.push(LexerState::TOKEN_STASHED);
791  } else if (FormatTok->Tok.is(tok::lessless)) {
792  FormatTok->Tok.setKind(tok::less);
793  FormatTok->TokenText = FormatTok->TokenText.substr(0, 1);
794  ++Column;
795  StateStack.push(LexerState::TOKEN_STASHED);
796  }
797 
798  // Now FormatTok is the next non-whitespace token.
799 
800  StringRef Text = FormatTok->TokenText;
801  size_t FirstNewlinePos = Text.find('\n');
802  if (FirstNewlinePos == StringRef::npos) {
803  // FIXME: ColumnWidth actually depends on the start column, we need to
804  // take this into account when the token is moved.
805  FormatTok->ColumnWidth =
806  encoding::columnWidthWithTabs(Text, Column, Style.TabWidth, Encoding);
807  Column += FormatTok->ColumnWidth;
808  } else {
809  FormatTok->IsMultiline = true;
810  // FIXME: ColumnWidth actually depends on the start column, we need to
811  // take this into account when the token is moved.
813  Text.substr(0, FirstNewlinePos), Column, Style.TabWidth, Encoding);
814 
815  // The last line of the token always starts in column 0.
816  // Thus, the length can be precomputed even in the presence of tabs.
818  Text.substr(Text.find_last_of('\n') + 1), 0, Style.TabWidth, Encoding);
819  Column = FormatTok->LastLineColumnWidth;
820  }
821 
822  if (Style.isCpp()) {
823  auto it = Macros.find(FormatTok->Tok.getIdentifierInfo());
824  if (!(Tokens.size() > 0 && Tokens.back()->Tok.getIdentifierInfo() &&
825  Tokens.back()->Tok.getIdentifierInfo()->getPPKeywordID() ==
826  tok::pp_define) &&
827  it != Macros.end()) {
828  FormatTok->Type = it->second;
829  } else if (FormatTok->is(tok::identifier)) {
830  if (MacroBlockBeginRegex.match(Text)) {
831  FormatTok->Type = TT_MacroBlockBegin;
832  } else if (MacroBlockEndRegex.match(Text)) {
833  FormatTok->Type = TT_MacroBlockEnd;
834  }
835  }
836  }
837 
838  return FormatTok;
839 }
840 
841 void FormatTokenLexer::readRawToken(FormatToken &Tok) {
842  Lex->LexFromRawLexer(Tok.Tok);
843  Tok.TokenText = StringRef(SourceMgr.getCharacterData(Tok.Tok.getLocation()),
844  Tok.Tok.getLength());
845  // For formatting, treat unterminated string literals like normal string
846  // literals.
847  if (Tok.is(tok::unknown)) {
848  if (!Tok.TokenText.empty() && Tok.TokenText[0] == '"') {
849  Tok.Tok.setKind(tok::string_literal);
850  Tok.IsUnterminatedLiteral = true;
851  } else if (Style.Language == FormatStyle::LK_JavaScript &&
852  Tok.TokenText == "''") {
853  Tok.Tok.setKind(tok::string_literal);
854  }
855  }
856 
857  if ((Style.Language == FormatStyle::LK_JavaScript ||
858  Style.Language == FormatStyle::LK_Proto ||
859  Style.Language == FormatStyle::LK_TextProto) &&
860  Tok.is(tok::char_constant)) {
861  Tok.Tok.setKind(tok::string_literal);
862  }
863 
864  if (Tok.is(tok::comment) && (Tok.TokenText == "// clang-format on" ||
865  Tok.TokenText == "/* clang-format on */")) {
866  FormattingDisabled = false;
867  }
868 
869  Tok.Finalized = FormattingDisabled;
870 
871  if (Tok.is(tok::comment) && (Tok.TokenText == "// clang-format off" ||
872  Tok.TokenText == "/* clang-format off */")) {
873  FormattingDisabled = true;
874  }
875 }
876 
877 void FormatTokenLexer::resetLexer(unsigned Offset) {
878  StringRef Buffer = SourceMgr.getBufferData(ID);
879  Lex.reset(new Lexer(SourceMgr.getLocForStartOfFile(ID),
880  getFormattingLangOpts(Style), Buffer.begin(),
881  Buffer.begin() + Offset, Buffer.end()));
882  Lex->SetKeepWhitespaceMode(true);
883  TrailingWhitespace = 0;
884 }
885 
886 } // namespace format
887 } // namespace clang
SourceLocation getLocForStartOfFile(FileID FID) const
Return the source location corresponding to the first byte of the specified file. ...
StringRef Identifier
Definition: Format.cpp:1833
Lexer - This provides a simple interface that turns a text buffer into a stream of tokens...
Definition: Lexer.h:76
SourceLocation getLocWithOffset(int Offset) const
Return a source location with the specified offset from this SourceLocation.
SourceLocation getLocForEndOfFile(FileID FID) const
Return the source location corresponding to the last byte of the specified file.
Token Tok
The Token.
Definition: FormatToken.h:135
bool is(tok::TokenKind K) const
is/isNot - Predicates to check if this token is a specific kind, as in "if (Tok.is(tok::l_brace)) {...
Definition: Token.h:97
Defines the SourceManager interface.
unsigned OriginalColumn
The original 0-based column of this token, including expanded tabs.
Definition: FormatToken.h:222
const char * getCharacterData(SourceLocation SL, bool *Invalid=nullptr) const
Return a pointer to the start of the specified location in the appropriate spelling MemoryBuffer...
The base class of the type hierarchy.
Definition: Type.h:1450
StringRef getBufferData(FileID FID, bool *Invalid=nullptr) const
Return a StringRef to the source buffer data for the specified FileID.
bool IsMultiline
Whether the token text contains newlines (escaped or not).
Definition: FormatToken.h:164
bool IsFirst
Indicates that this is the first token of the file.
Definition: FormatToken.h:167
bool isBinaryOperator() const
Definition: FormatToken.h:429
unsigned NewlinesBefore
The number of newlines immediately before the Token.
Definition: FormatToken.h:141
One of these records is kept for each identifier that is lexed.
Token - This structure provides full information about a lexed token.
Definition: Token.h:34
unsigned columnWidthWithTabs(StringRef Text, unsigned StartColumn, unsigned TabWidth, Encoding Encoding)
Returns the number of columns required to display the Text, starting from the StartColumn on a termin...
Definition: Encoding.h:61
void setKind(tok::TokenKind K)
Definition: Token.h:93
This file contains FormatTokenLexer, which tokenizes a source file into a token stream suitable for C...
tok::TokenKind getTokenID() const
If this is a source-language token (e.g.
const FormatToken & Tok
bool isCSharpKeyword(const FormatToken &Tok) const
Returns true if Tok is a C# keyword, returns false if it is anything else.
Definition: FormatToken.h:904
bool isOneOf(A K1, B K2) const
Definition: FormatToken.h:323
unsigned Offset
Definition: Format.cpp:1827
SourceLocation End
FormatTokenLexer(const SourceManager &SourceMgr, FileID ID, unsigned Column, const FormatStyle &Style, encoding::Encoding Encoding)
LangOptions getFormattingLangOpts(const FormatStyle &Style=getLLVMStyle())
Returns the LangOpts that the formatter expects you to set.
Definition: Format.cpp:2509
SourceLocation getLocation() const
Return a source location identifier for the specified offset in the current file. ...
Definition: Token.h:126
A wrapper around a Token storing information about the whitespace characters preceding it...
Definition: FormatToken.h:131
unsigned LastNewlineOffset
The offset just past the last '\n' in this token's leading whitespace (relative to WhiteSpaceStart)...
Definition: FormatToken.h:152
unsigned getFileOffset(SourceLocation SpellingLoc) const
Returns the offset from the start of the file that the specified SourceLocation represents.
Encodes a location in the source.
IdentifierInfo & get(StringRef Name)
Return the identifier token info for the specified named identifier.
bool is(tok::TokenKind Kind) const
Definition: FormatToken.h:314
Various functions to configurably format source code.
IdentifierInfo * getIdentifierInfo() const
Definition: Token.h:179
void setIdentifierInfo(IdentifierInfo *II)
Definition: Token.h:188
SourceRange WhitespaceRange
The range of the whitespace immediately preceding the Token.
Definition: FormatToken.h:148
TokenKind
Provides a simple uniform namespace for tokens from all C languages.
Definition: TokenKinds.h:24
bool IsUnterminatedLiteral
Set to true if this token is an unterminated literal.
Definition: FormatToken.h:182
StringRef TokenText
The raw text of the token.
Definition: FormatToken.h:179
The FormatStyle is used to configure the formatting to follow specific guidelines.
Definition: Format.h:49
An opaque identifier used by SourceManager which refers to a source file (MemoryBuffer) along with it...
const llvm::MemoryBuffer * getBuffer(FileID FID, SourceLocation Loc, bool *Invalid=nullptr) const
Return the buffer for the specified FileID.
Dataflow Directional Tag Classes.
unsigned ColumnWidth
The width of the non-whitespace parts of the token (or its first line for multi-line tokens) in colum...
Definition: FormatToken.h:157
bool Finalized
If true, this token has been fully formatted (indented and potentially re-formatted inside)...
Definition: FormatToken.h:312
unsigned getLength() const
Definition: Token.h:129
Defines the clang::SourceLocation class and associated facilities.
This file contains the declaration of the FormatToken, a wrapper around Token with additional informa...
StringRef Text
Definition: Format.cpp:1826
void setLocation(SourceLocation L)
Definition: Token.h:134
#define true
Definition: stdbool.h:16
A trivial tuple used to represent a source range.
bool HasUnescapedNewline
Whether there is at least one unescaped newline before the Token.
Definition: FormatToken.h:145
This class handles loading and caching of source files into memory.
ArrayRef< FormatToken * > lex()
std::pair< FileID, unsigned > getDecomposedLoc(SourceLocation Loc) const
Decompose the specified location into a raw FileID + Offset pair.
unsigned LastLineColumnWidth
Contains the width in columns of the last line of a multi-line token.
Definition: FormatToken.h:161
const encoding::Encoding Encoding
const FormatStyle & Style