clang-tools  8.0.0
SourceCode.cpp
Go to the documentation of this file.
1 //===--- SourceCode.cpp - Manipulating source code as strings ---*- C++ -*-===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 #include "SourceCode.h"
10 
11 #include "Logger.h"
12 #include "clang/AST/ASTContext.h"
13 #include "clang/Basic/SourceManager.h"
14 #include "clang/Lex/Lexer.h"
15 #include "llvm/Support/Errc.h"
16 #include "llvm/Support/Error.h"
17 #include "llvm/Support/Path.h"
18 
19 namespace clang {
20 namespace clangd {
21 
22 // Here be dragons. LSP positions use columns measured in *UTF-16 code units*!
23 // Clangd uses UTF-8 and byte-offsets internally, so conversion is nontrivial.
24 
25 // Iterates over unicode codepoints in the (UTF-8) string. For each,
26 // invokes CB(UTF-8 length, UTF-16 length), and breaks if it returns true.
27 // Returns true if CB returned true, false if we hit the end of string.
28 template <typename Callback>
29 static bool iterateCodepoints(llvm::StringRef U8, const Callback &CB) {
30  for (size_t I = 0; I < U8.size();) {
31  unsigned char C = static_cast<unsigned char>(U8[I]);
32  if (LLVM_LIKELY(!(C & 0x80))) { // ASCII character.
33  if (CB(1, 1))
34  return true;
35  ++I;
36  continue;
37  }
38  // This convenient property of UTF-8 holds for all non-ASCII characters.
39  size_t UTF8Length = llvm::countLeadingOnes(C);
40  // 0xxx is ASCII, handled above. 10xxx is a trailing byte, invalid here.
41  // 11111xxx is not valid UTF-8 at all. Assert because it's probably our bug.
42  assert((UTF8Length >= 2 && UTF8Length <= 4) &&
43  "Invalid UTF-8, or transcoding bug?");
44  I += UTF8Length; // Skip over all trailing bytes.
45  // A codepoint takes two UTF-16 code unit if it's astral (outside BMP).
46  // Astral codepoints are encoded as 4 bytes in UTF-8 (11110xxx ...)
47  if (CB(UTF8Length, UTF8Length == 4 ? 2 : 1))
48  return true;
49  }
50  return false;
51 }
52 
53 // Returns the offset into the string that matches \p Units UTF-16 code units.
54 // Conceptually, this converts to UTF-16, truncates to CodeUnits, converts back
55 // to UTF-8, and returns the length in bytes.
56 static size_t measureUTF16(llvm::StringRef U8, int U16Units, bool &Valid) {
57  size_t Result = 0;
58  Valid = U16Units == 0 || iterateCodepoints(U8, [&](int U8Len, int U16Len) {
59  Result += U8Len;
60  U16Units -= U16Len;
61  return U16Units <= 0;
62  });
63  if (U16Units < 0) // Offset was into the middle of a surrogate pair.
64  Valid = false;
65  // Don't return an out-of-range index if we overran.
66  return std::min(Result, U8.size());
67 }
68 
69 // Like most strings in clangd, the input is UTF-8 encoded.
70 size_t lspLength(llvm::StringRef Code) {
71  // A codepoint takes two UTF-16 code unit if it's astral (outside BMP).
72  // Astral codepoints are encoded as 4 bytes in UTF-8, starting with 11110xxx.
73  size_t Count = 0;
74  iterateCodepoints(Code, [&](int U8Len, int U16Len) {
75  Count += U16Len;
76  return false;
77  });
78  return Count;
79 }
80 
81 llvm::Expected<size_t> positionToOffset(llvm::StringRef Code, Position P,
82  bool AllowColumnsBeyondLineLength) {
83  if (P.line < 0)
84  return llvm::make_error<llvm::StringError>(
85  llvm::formatv("Line value can't be negative ({0})", P.line),
86  llvm::errc::invalid_argument);
87  if (P.character < 0)
88  return llvm::make_error<llvm::StringError>(
89  llvm::formatv("Character value can't be negative ({0})", P.character),
90  llvm::errc::invalid_argument);
91  size_t StartOfLine = 0;
92  for (int I = 0; I != P.line; ++I) {
93  size_t NextNL = Code.find('\n', StartOfLine);
94  if (NextNL == llvm::StringRef::npos)
95  return llvm::make_error<llvm::StringError>(
96  llvm::formatv("Line value is out of range ({0})", P.line),
97  llvm::errc::invalid_argument);
98  StartOfLine = NextNL + 1;
99  }
100 
101  size_t NextNL = Code.find('\n', StartOfLine);
102  if (NextNL == llvm::StringRef::npos)
103  NextNL = Code.size();
104 
105  bool Valid;
106  size_t ByteOffsetInLine = measureUTF16(
107  Code.substr(StartOfLine, NextNL - StartOfLine), P.character, Valid);
108  if (!Valid && !AllowColumnsBeyondLineLength)
109  return llvm::make_error<llvm::StringError>(
110  llvm::formatv("UTF-16 offset {0} is invalid for line {1}", P.character,
111  P.line),
112  llvm::errc::invalid_argument);
113  return StartOfLine + ByteOffsetInLine;
114 }
115 
116 Position offsetToPosition(llvm::StringRef Code, size_t Offset) {
117  Offset = std::min(Code.size(), Offset);
118  llvm::StringRef Before = Code.substr(0, Offset);
119  int Lines = Before.count('\n');
120  size_t PrevNL = Before.rfind('\n');
121  size_t StartOfLine = (PrevNL == llvm::StringRef::npos) ? 0 : (PrevNL + 1);
122  Position Pos;
123  Pos.line = Lines;
124  Pos.character = lspLength(Before.substr(StartOfLine));
125  return Pos;
126 }
127 
128 Position sourceLocToPosition(const SourceManager &SM, SourceLocation Loc) {
129  // We use the SourceManager's line tables, but its column number is in bytes.
130  FileID FID;
131  unsigned Offset;
132  std::tie(FID, Offset) = SM.getDecomposedSpellingLoc(Loc);
133  Position P;
134  P.line = static_cast<int>(SM.getLineNumber(FID, Offset)) - 1;
135  bool Invalid = false;
136  llvm::StringRef Code = SM.getBufferData(FID, &Invalid);
137  if (!Invalid) {
138  auto ColumnInBytes = SM.getColumnNumber(FID, Offset) - 1;
139  auto LineSoFar = Code.substr(Offset - ColumnInBytes, ColumnInBytes);
140  P.character = lspLength(LineSoFar);
141  }
142  return P;
143 }
144 
145 Range halfOpenToRange(const SourceManager &SM, CharSourceRange R) {
146  // Clang is 1-based, LSP uses 0-based indexes.
147  Position Begin = sourceLocToPosition(SM, R.getBegin());
148  Position End = sourceLocToPosition(SM, R.getEnd());
149 
150  return {Begin, End};
151 }
152 
153 std::pair<size_t, size_t> offsetToClangLineColumn(llvm::StringRef Code,
154  size_t Offset) {
155  Offset = std::min(Code.size(), Offset);
156  llvm::StringRef Before = Code.substr(0, Offset);
157  int Lines = Before.count('\n');
158  size_t PrevNL = Before.rfind('\n');
159  size_t StartOfLine = (PrevNL == llvm::StringRef::npos) ? 0 : (PrevNL + 1);
160  return {Lines + 1, Offset - StartOfLine + 1};
161 }
162 
163 std::pair<llvm::StringRef, llvm::StringRef>
164 splitQualifiedName(llvm::StringRef QName) {
165  size_t Pos = QName.rfind("::");
166  if (Pos == llvm::StringRef::npos)
167  return {llvm::StringRef(), QName};
168  return {QName.substr(0, Pos + 2), QName.substr(Pos + 2)};
169 }
170 
171 TextEdit replacementToEdit(llvm::StringRef Code,
172  const tooling::Replacement &R) {
173  Range ReplacementRange = {
174  offsetToPosition(Code, R.getOffset()),
175  offsetToPosition(Code, R.getOffset() + R.getLength())};
176  return {ReplacementRange, R.getReplacementText()};
177 }
178 
179 std::vector<TextEdit> replacementsToEdits(llvm::StringRef Code,
180  const tooling::Replacements &Repls) {
181  std::vector<TextEdit> Edits;
182  for (const auto &R : Repls)
183  Edits.push_back(replacementToEdit(Code, R));
184  return Edits;
185 }
186 
187 llvm::Optional<std::string> getCanonicalPath(const FileEntry *F,
188  const SourceManager &SourceMgr) {
189  if (!F)
190  return None;
191 
192  llvm::SmallString<128> FilePath = F->getName();
193  if (!llvm::sys::path::is_absolute(FilePath)) {
194  if (auto EC =
195  SourceMgr.getFileManager().getVirtualFileSystem()->makeAbsolute(
196  FilePath)) {
197  elog("Could not turn relative path '{0}' to absolute: {1}", FilePath,
198  EC.message());
199  return None;
200  }
201  }
202 
203  // Handle the symbolic link path case where the current working directory
204  // (getCurrentWorkingDirectory) is a symlink./ We always want to the real
205  // file path (instead of the symlink path) for the C++ symbols.
206  //
207  // Consider the following example:
208  //
209  // src dir: /project/src/foo.h
210  // current working directory (symlink): /tmp/build -> /project/src/
211  //
212  // The file path of Symbol is "/project/src/foo.h" instead of
213  // "/tmp/build/foo.h"
214  if (const DirectoryEntry *Dir = SourceMgr.getFileManager().getDirectory(
215  llvm::sys::path::parent_path(FilePath))) {
216  llvm::SmallString<128> RealPath;
217  llvm::StringRef DirName = SourceMgr.getFileManager().getCanonicalName(Dir);
218  llvm::sys::path::append(RealPath, DirName,
219  llvm::sys::path::filename(FilePath));
220  return RealPath.str().str();
221  }
222 
223  return FilePath.str().str();
224 }
225 
226 TextEdit toTextEdit(const FixItHint &FixIt, const SourceManager &M,
227  const LangOptions &L) {
229  Result.range =
230  halfOpenToRange(M, Lexer::makeFileCharRange(FixIt.RemoveRange, M, L));
231  Result.newText = FixIt.CodeToInsert;
232  return Result;
233 }
234 
235 bool IsRangeConsecutive(const Range &Left, const Range &Right) {
236  return Left.end.line == Right.start.line &&
237  Left.end.character == Right.start.character;
238 }
239 
240 FileDigest digest(llvm::StringRef Content) {
241  return llvm::SHA1::hash({(const uint8_t *)Content.data(), Content.size()});
242 }
243 
244 llvm::Optional<FileDigest> digestFile(const SourceManager &SM, FileID FID) {
245  bool Invalid = false;
246  llvm::StringRef Content = SM.getBufferData(FID, &Invalid);
247  if (Invalid)
248  return None;
249  return digest(Content);
250 }
251 
252 } // namespace clangd
253 } // namespace clang
SourceLocation Loc
'#' location in the include directive
Position start
The range's start position.
Definition: Protocol.h:156
size_t lspLength(llvm::StringRef Code)
Definition: SourceCode.cpp:70
decltype(llvm::SHA1::hash({})) FileDigest
Definition: SourceCode.h:31
bool IsRangeConsecutive(const Range &Left, const Range &Right)
Definition: SourceCode.cpp:235
llvm::unique_function< void(llvm::Expected< T >)> Callback
A Callback<T> is a void function that accepts Expected<T>.
Definition: Function.h:29
void elog(const char *Fmt, Ts &&... Vals)
Definition: Logger.h:57
std::pair< size_t, size_t > offsetToClangLineColumn(llvm::StringRef Code, size_t Offset)
Definition: SourceCode.cpp:153
TextEdit toTextEdit(const FixItHint &FixIt, const SourceManager &M, const LangOptions &L)
Definition: SourceCode.cpp:226
std::string newText
The string to be inserted.
Definition: Protocol.h:207
Position offsetToPosition(llvm::StringRef Code, size_t Offset)
Turn an offset in Code into a [line, column] pair.
Definition: SourceCode.cpp:116
llvm::Expected< size_t > positionToOffset(llvm::StringRef Code, Position P, bool AllowColumnsBeyondLineLength)
Turn a [line, column] pair into an offset in Code.
Definition: SourceCode.cpp:81
Range range
The range of the text document to be manipulated.
Definition: Protocol.h:203
llvm::Optional< llvm::Expected< tooling::AtomicChanges > > Result
llvm::Optional< FileDigest > digestFile(const SourceManager &SM, FileID FID)
Definition: SourceCode.cpp:244
Position sourceLocToPosition(const SourceManager &SM, SourceLocation Loc)
Turn a SourceLocation into a [line, column] pair.
Definition: SourceCode.cpp:128
Position Pos
FileDigest digest(llvm::StringRef Content)
Definition: SourceCode.cpp:240
int line
Line position in a document (zero-based).
Definition: Protocol.h:127
int character
Character offset on a line in a document (zero-based).
Definition: Protocol.h:132
===-- Representation.cpp - ClangDoc Representation -----------*- C++ -*-===//
TextEdit replacementToEdit(llvm::StringRef Code, const tooling::Replacement &R)
Definition: SourceCode.cpp:171
std::pair< llvm::StringRef, llvm::StringRef > splitQualifiedName(llvm::StringRef QName)
From "a::b::c", return {"a::b::", "c"}.
Definition: SourceCode.cpp:164
llvm::Optional< std::string > getCanonicalPath(const FileEntry *F, const SourceManager &SourceMgr)
Get the canonical path of F.
Definition: SourceCode.cpp:187
std::vector< TextEdit > replacementsToEdits(llvm::StringRef Code, const tooling::Replacements &Repls)
Definition: SourceCode.cpp:179
llvm::Optional< FixItHint > FixIt
Position end
The range's end position.
Definition: Protocol.h:159
static bool iterateCodepoints(llvm::StringRef U8, const Callback &CB)
Definition: SourceCode.cpp:29
static size_t measureUTF16(llvm::StringRef U8, int U16Units, bool &Valid)
Definition: SourceCode.cpp:56
unsigned Lines
Range halfOpenToRange(const SourceManager &SM, CharSourceRange R)
Definition: SourceCode.cpp:145