clang API Documentation
00001 //===--- Lexer.cpp - C Language Family Lexer ------------------------------===// 00002 // 00003 // The LLVM Compiler Infrastructure 00004 // 00005 // This file is distributed under the University of Illinois Open Source 00006 // License. See LICENSE.TXT for details. 00007 // 00008 //===----------------------------------------------------------------------===// 00009 // 00010 // This file implements the Lexer and Token interfaces. 00011 // 00012 //===----------------------------------------------------------------------===// 00013 // 00014 // TODO: GCC Diagnostics emitted by the lexer: 00015 // PEDWARN: (form feed|vertical tab) in preprocessing directive 00016 // 00017 // Universal characters, unicode, char mapping: 00018 // WARNING: `%.*s' is not in NFKC 00019 // WARNING: `%.*s' is not in NFC 00020 // 00021 // Other: 00022 // TODO: Options to support: 00023 // -fexec-charset,-fwide-exec-charset 00024 // 00025 //===----------------------------------------------------------------------===// 00026 00027 #include "clang/Lex/Lexer.h" 00028 #include "clang/Lex/Preprocessor.h" 00029 #include "clang/Lex/LexDiagnostic.h" 00030 #include "clang/Lex/CodeCompletionHandler.h" 00031 #include "clang/Basic/SourceManager.h" 00032 #include "llvm/ADT/StringSwitch.h" 00033 #include "llvm/ADT/STLExtras.h" 00034 #include "llvm/Support/Compiler.h" 00035 #include "llvm/Support/MemoryBuffer.h" 00036 #include <cstring> 00037 using namespace clang; 00038 00039 static void InitCharacterInfo(); 00040 00041 //===----------------------------------------------------------------------===// 00042 // Token Class Implementation 00043 //===----------------------------------------------------------------------===// 00044 00045 /// isObjCAtKeyword - Return true if we have an ObjC keyword identifier. 00046 bool Token::isObjCAtKeyword(tok::ObjCKeywordKind objcKey) const { 00047 if (IdentifierInfo *II = getIdentifierInfo()) 00048 return II->getObjCKeywordID() == objcKey; 00049 return false; 00050 } 00051 00052 /// getObjCKeywordID - Return the ObjC keyword kind. 00053 tok::ObjCKeywordKind Token::getObjCKeywordID() const { 00054 IdentifierInfo *specId = getIdentifierInfo(); 00055 return specId ? specId->getObjCKeywordID() : tok::objc_not_keyword; 00056 } 00057 00058 00059 //===----------------------------------------------------------------------===// 00060 // Lexer Class Implementation 00061 //===----------------------------------------------------------------------===// 00062 00063 void Lexer::anchor() { } 00064 00065 void Lexer::InitLexer(const char *BufStart, const char *BufPtr, 00066 const char *BufEnd) { 00067 InitCharacterInfo(); 00068 00069 BufferStart = BufStart; 00070 BufferPtr = BufPtr; 00071 BufferEnd = BufEnd; 00072 00073 assert(BufEnd[0] == 0 && 00074 "We assume that the input buffer has a null character at the end" 00075 " to simplify lexing!"); 00076 00077 // Check whether we have a BOM in the beginning of the buffer. If yes - act 00078 // accordingly. Right now we support only UTF-8 with and without BOM, so, just 00079 // skip the UTF-8 BOM if it's present. 00080 if (BufferStart == BufferPtr) { 00081 // Determine the size of the BOM. 00082 StringRef Buf(BufferStart, BufferEnd - BufferStart); 00083 size_t BOMLength = llvm::StringSwitch<size_t>(Buf) 00084 .StartsWith("\xEF\xBB\xBF", 3) // UTF-8 BOM 00085 .Default(0); 00086 00087 // Skip the BOM. 00088 BufferPtr += BOMLength; 00089 } 00090 00091 Is_PragmaLexer = false; 00092 CurrentConflictMarkerState = CMK_None; 00093 00094 // Start of the file is a start of line. 00095 IsAtStartOfLine = true; 00096 00097 // We are not after parsing a #. 00098 ParsingPreprocessorDirective = false; 00099 00100 // We are not after parsing #include. 00101 ParsingFilename = false; 00102 00103 // We are not in raw mode. Raw mode disables diagnostics and interpretation 00104 // of tokens (e.g. identifiers, thus disabling macro expansion). It is used 00105 // to quickly lex the tokens of the buffer, e.g. when handling a "#if 0" block 00106 // or otherwise skipping over tokens. 00107 LexingRawMode = false; 00108 00109 // Default to not keeping comments. 00110 ExtendedTokenMode = 0; 00111 } 00112 00113 /// Lexer constructor - Create a new lexer object for the specified buffer 00114 /// with the specified preprocessor managing the lexing process. This lexer 00115 /// assumes that the associated file buffer and Preprocessor objects will 00116 /// outlive it, so it doesn't take ownership of either of them. 00117 Lexer::Lexer(FileID FID, const llvm::MemoryBuffer *InputFile, Preprocessor &PP) 00118 : PreprocessorLexer(&PP, FID), 00119 FileLoc(PP.getSourceManager().getLocForStartOfFile(FID)), 00120 LangOpts(PP.getLangOpts()) { 00121 00122 InitLexer(InputFile->getBufferStart(), InputFile->getBufferStart(), 00123 InputFile->getBufferEnd()); 00124 00125 // Default to keeping comments if the preprocessor wants them. 00126 SetCommentRetentionState(PP.getCommentRetentionState()); 00127 } 00128 00129 /// Lexer constructor - Create a new raw lexer object. This object is only 00130 /// suitable for calls to 'LexRawToken'. This lexer assumes that the text 00131 /// range will outlive it, so it doesn't take ownership of it. 00132 Lexer::Lexer(SourceLocation fileloc, const LangOptions &langOpts, 00133 const char *BufStart, const char *BufPtr, const char *BufEnd) 00134 : FileLoc(fileloc), LangOpts(langOpts) { 00135 00136 InitLexer(BufStart, BufPtr, BufEnd); 00137 00138 // We *are* in raw mode. 00139 LexingRawMode = true; 00140 } 00141 00142 /// Lexer constructor - Create a new raw lexer object. This object is only 00143 /// suitable for calls to 'LexRawToken'. This lexer assumes that the text 00144 /// range will outlive it, so it doesn't take ownership of it. 00145 Lexer::Lexer(FileID FID, const llvm::MemoryBuffer *FromFile, 00146 const SourceManager &SM, const LangOptions &langOpts) 00147 : FileLoc(SM.getLocForStartOfFile(FID)), LangOpts(langOpts) { 00148 00149 InitLexer(FromFile->getBufferStart(), FromFile->getBufferStart(), 00150 FromFile->getBufferEnd()); 00151 00152 // We *are* in raw mode. 00153 LexingRawMode = true; 00154 } 00155 00156 /// Create_PragmaLexer: Lexer constructor - Create a new lexer object for 00157 /// _Pragma expansion. This has a variety of magic semantics that this method 00158 /// sets up. It returns a new'd Lexer that must be delete'd when done. 00159 /// 00160 /// On entrance to this routine, TokStartLoc is a macro location which has a 00161 /// spelling loc that indicates the bytes to be lexed for the token and an 00162 /// expansion location that indicates where all lexed tokens should be 00163 /// "expanded from". 00164 /// 00165 /// FIXME: It would really be nice to make _Pragma just be a wrapper around a 00166 /// normal lexer that remaps tokens as they fly by. This would require making 00167 /// Preprocessor::Lex virtual. Given that, we could just dump in a magic lexer 00168 /// interface that could handle this stuff. This would pull GetMappedTokenLoc 00169 /// out of the critical path of the lexer! 00170 /// 00171 Lexer *Lexer::Create_PragmaLexer(SourceLocation SpellingLoc, 00172 SourceLocation ExpansionLocStart, 00173 SourceLocation ExpansionLocEnd, 00174 unsigned TokLen, Preprocessor &PP) { 00175 SourceManager &SM = PP.getSourceManager(); 00176 00177 // Create the lexer as if we were going to lex the file normally. 00178 FileID SpellingFID = SM.getFileID(SpellingLoc); 00179 const llvm::MemoryBuffer *InputFile = SM.getBuffer(SpellingFID); 00180 Lexer *L = new Lexer(SpellingFID, InputFile, PP); 00181 00182 // Now that the lexer is created, change the start/end locations so that we 00183 // just lex the subsection of the file that we want. This is lexing from a 00184 // scratch buffer. 00185 const char *StrData = SM.getCharacterData(SpellingLoc); 00186 00187 L->BufferPtr = StrData; 00188 L->BufferEnd = StrData+TokLen; 00189 assert(L->BufferEnd[0] == 0 && "Buffer is not nul terminated!"); 00190 00191 // Set the SourceLocation with the remapping information. This ensures that 00192 // GetMappedTokenLoc will remap the tokens as they are lexed. 00193 L->FileLoc = SM.createExpansionLoc(SM.getLocForStartOfFile(SpellingFID), 00194 ExpansionLocStart, 00195 ExpansionLocEnd, TokLen); 00196 00197 // Ensure that the lexer thinks it is inside a directive, so that end \n will 00198 // return an EOD token. 00199 L->ParsingPreprocessorDirective = true; 00200 00201 // This lexer really is for _Pragma. 00202 L->Is_PragmaLexer = true; 00203 return L; 00204 } 00205 00206 00207 /// Stringify - Convert the specified string into a C string, with surrounding 00208 /// ""'s, and with escaped \ and " characters. 00209 std::string Lexer::Stringify(const std::string &Str, bool Charify) { 00210 std::string Result = Str; 00211 char Quote = Charify ? '\'' : '"'; 00212 for (unsigned i = 0, e = Result.size(); i != e; ++i) { 00213 if (Result[i] == '\\' || Result[i] == Quote) { 00214 Result.insert(Result.begin()+i, '\\'); 00215 ++i; ++e; 00216 } 00217 } 00218 return Result; 00219 } 00220 00221 /// Stringify - Convert the specified string into a C string by escaping '\' 00222 /// and " characters. This does not add surrounding ""'s to the string. 00223 void Lexer::Stringify(SmallVectorImpl<char> &Str) { 00224 for (unsigned i = 0, e = Str.size(); i != e; ++i) { 00225 if (Str[i] == '\\' || Str[i] == '"') { 00226 Str.insert(Str.begin()+i, '\\'); 00227 ++i; ++e; 00228 } 00229 } 00230 } 00231 00232 //===----------------------------------------------------------------------===// 00233 // Token Spelling 00234 //===----------------------------------------------------------------------===// 00235 00236 /// getSpelling() - Return the 'spelling' of this token. The spelling of a 00237 /// token are the characters used to represent the token in the source file 00238 /// after trigraph expansion and escaped-newline folding. In particular, this 00239 /// wants to get the true, uncanonicalized, spelling of things like digraphs 00240 /// UCNs, etc. 00241 StringRef Lexer::getSpelling(SourceLocation loc, 00242 SmallVectorImpl<char> &buffer, 00243 const SourceManager &SM, 00244 const LangOptions &options, 00245 bool *invalid) { 00246 // Break down the source location. 00247 std::pair<FileID, unsigned> locInfo = SM.getDecomposedLoc(loc); 00248 00249 // Try to the load the file buffer. 00250 bool invalidTemp = false; 00251 StringRef file = SM.getBufferData(locInfo.first, &invalidTemp); 00252 if (invalidTemp) { 00253 if (invalid) *invalid = true; 00254 return StringRef(); 00255 } 00256 00257 const char *tokenBegin = file.data() + locInfo.second; 00258 00259 // Lex from the start of the given location. 00260 Lexer lexer(SM.getLocForStartOfFile(locInfo.first), options, 00261 file.begin(), tokenBegin, file.end()); 00262 Token token; 00263 lexer.LexFromRawLexer(token); 00264 00265 unsigned length = token.getLength(); 00266 00267 // Common case: no need for cleaning. 00268 if (!token.needsCleaning()) 00269 return StringRef(tokenBegin, length); 00270 00271 // Hard case, we need to relex the characters into the string. 00272 buffer.clear(); 00273 buffer.reserve(length); 00274 00275 for (const char *ti = tokenBegin, *te = ti + length; ti != te; ) { 00276 unsigned charSize; 00277 buffer.push_back(Lexer::getCharAndSizeNoWarn(ti, charSize, options)); 00278 ti += charSize; 00279 } 00280 00281 return StringRef(buffer.data(), buffer.size()); 00282 } 00283 00284 /// getSpelling() - Return the 'spelling' of this token. The spelling of a 00285 /// token are the characters used to represent the token in the source file 00286 /// after trigraph expansion and escaped-newline folding. In particular, this 00287 /// wants to get the true, uncanonicalized, spelling of things like digraphs 00288 /// UCNs, etc. 00289 std::string Lexer::getSpelling(const Token &Tok, const SourceManager &SourceMgr, 00290 const LangOptions &LangOpts, bool *Invalid) { 00291 assert((int)Tok.getLength() >= 0 && "Token character range is bogus!"); 00292 00293 // If this token contains nothing interesting, return it directly. 00294 bool CharDataInvalid = false; 00295 const char* TokStart = SourceMgr.getCharacterData(Tok.getLocation(), 00296 &CharDataInvalid); 00297 if (Invalid) 00298 *Invalid = CharDataInvalid; 00299 if (CharDataInvalid) 00300 return std::string(); 00301 00302 if (!Tok.needsCleaning()) 00303 return std::string(TokStart, TokStart+Tok.getLength()); 00304 00305 std::string Result; 00306 Result.reserve(Tok.getLength()); 00307 00308 // Otherwise, hard case, relex the characters into the string. 00309 for (const char *Ptr = TokStart, *End = TokStart+Tok.getLength(); 00310 Ptr != End; ) { 00311 unsigned CharSize; 00312 Result.push_back(Lexer::getCharAndSizeNoWarn(Ptr, CharSize, LangOpts)); 00313 Ptr += CharSize; 00314 } 00315 assert(Result.size() != unsigned(Tok.getLength()) && 00316 "NeedsCleaning flag set on something that didn't need cleaning!"); 00317 return Result; 00318 } 00319 00320 /// getSpelling - This method is used to get the spelling of a token into a 00321 /// preallocated buffer, instead of as an std::string. The caller is required 00322 /// to allocate enough space for the token, which is guaranteed to be at least 00323 /// Tok.getLength() bytes long. The actual length of the token is returned. 00324 /// 00325 /// Note that this method may do two possible things: it may either fill in 00326 /// the buffer specified with characters, or it may *change the input pointer* 00327 /// to point to a constant buffer with the data already in it (avoiding a 00328 /// copy). The caller is not allowed to modify the returned buffer pointer 00329 /// if an internal buffer is returned. 00330 unsigned Lexer::getSpelling(const Token &Tok, const char *&Buffer, 00331 const SourceManager &SourceMgr, 00332 const LangOptions &LangOpts, bool *Invalid) { 00333 assert((int)Tok.getLength() >= 0 && "Token character range is bogus!"); 00334 00335 const char *TokStart = 0; 00336 // NOTE: this has to be checked *before* testing for an IdentifierInfo. 00337 if (Tok.is(tok::raw_identifier)) 00338 TokStart = Tok.getRawIdentifierData(); 00339 else if (const IdentifierInfo *II = Tok.getIdentifierInfo()) { 00340 // Just return the string from the identifier table, which is very quick. 00341 Buffer = II->getNameStart(); 00342 return II->getLength(); 00343 } 00344 00345 // NOTE: this can be checked even after testing for an IdentifierInfo. 00346 if (Tok.isLiteral()) 00347 TokStart = Tok.getLiteralData(); 00348 00349 if (TokStart == 0) { 00350 // Compute the start of the token in the input lexer buffer. 00351 bool CharDataInvalid = false; 00352 TokStart = SourceMgr.getCharacterData(Tok.getLocation(), &CharDataInvalid); 00353 if (Invalid) 00354 *Invalid = CharDataInvalid; 00355 if (CharDataInvalid) { 00356 Buffer = ""; 00357 return 0; 00358 } 00359 } 00360 00361 // If this token contains nothing interesting, return it directly. 00362 if (!Tok.needsCleaning()) { 00363 Buffer = TokStart; 00364 return Tok.getLength(); 00365 } 00366 00367 // Otherwise, hard case, relex the characters into the string. 00368 char *OutBuf = const_cast<char*>(Buffer); 00369 for (const char *Ptr = TokStart, *End = TokStart+Tok.getLength(); 00370 Ptr != End; ) { 00371 unsigned CharSize; 00372 *OutBuf++ = Lexer::getCharAndSizeNoWarn(Ptr, CharSize, LangOpts); 00373 Ptr += CharSize; 00374 } 00375 assert(unsigned(OutBuf-Buffer) != Tok.getLength() && 00376 "NeedsCleaning flag set on something that didn't need cleaning!"); 00377 00378 return OutBuf-Buffer; 00379 } 00380 00381 00382 00383 static bool isWhitespace(unsigned char c); 00384 00385 /// MeasureTokenLength - Relex the token at the specified location and return 00386 /// its length in bytes in the input file. If the token needs cleaning (e.g. 00387 /// includes a trigraph or an escaped newline) then this count includes bytes 00388 /// that are part of that. 00389 unsigned Lexer::MeasureTokenLength(SourceLocation Loc, 00390 const SourceManager &SM, 00391 const LangOptions &LangOpts) { 00392 // TODO: this could be special cased for common tokens like identifiers, ')', 00393 // etc to make this faster, if it mattered. Just look at StrData[0] to handle 00394 // all obviously single-char tokens. This could use 00395 // Lexer::isObviouslySimpleCharacter for example to handle identifiers or 00396 // something. 00397 00398 // If this comes from a macro expansion, we really do want the macro name, not 00399 // the token this macro expanded to. 00400 Loc = SM.getExpansionLoc(Loc); 00401 std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc); 00402 bool Invalid = false; 00403 StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid); 00404 if (Invalid) 00405 return 0; 00406 00407 const char *StrData = Buffer.data()+LocInfo.second; 00408 00409 if (isWhitespace(StrData[0])) 00410 return 0; 00411 00412 // Create a lexer starting at the beginning of this token. 00413 Lexer TheLexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts, 00414 Buffer.begin(), StrData, Buffer.end()); 00415 TheLexer.SetCommentRetentionState(true); 00416 Token TheTok; 00417 TheLexer.LexFromRawLexer(TheTok); 00418 return TheTok.getLength(); 00419 } 00420 00421 static SourceLocation getBeginningOfFileToken(SourceLocation Loc, 00422 const SourceManager &SM, 00423 const LangOptions &LangOpts) { 00424 assert(Loc.isFileID()); 00425 std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc); 00426 if (LocInfo.first.isInvalid()) 00427 return Loc; 00428 00429 bool Invalid = false; 00430 StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid); 00431 if (Invalid) 00432 return Loc; 00433 00434 // Back up from the current location until we hit the beginning of a line 00435 // (or the buffer). We'll relex from that point. 00436 const char *BufStart = Buffer.data(); 00437 if (LocInfo.second >= Buffer.size()) 00438 return Loc; 00439 00440 const char *StrData = BufStart+LocInfo.second; 00441 if (StrData[0] == '\n' || StrData[0] == '\r') 00442 return Loc; 00443 00444 const char *LexStart = StrData; 00445 while (LexStart != BufStart) { 00446 if (LexStart[0] == '\n' || LexStart[0] == '\r') { 00447 ++LexStart; 00448 break; 00449 } 00450 00451 --LexStart; 00452 } 00453 00454 // Create a lexer starting at the beginning of this token. 00455 SourceLocation LexerStartLoc = Loc.getLocWithOffset(-LocInfo.second); 00456 Lexer TheLexer(LexerStartLoc, LangOpts, BufStart, LexStart, Buffer.end()); 00457 TheLexer.SetCommentRetentionState(true); 00458 00459 // Lex tokens until we find the token that contains the source location. 00460 Token TheTok; 00461 do { 00462 TheLexer.LexFromRawLexer(TheTok); 00463 00464 if (TheLexer.getBufferLocation() > StrData) { 00465 // Lexing this token has taken the lexer past the source location we're 00466 // looking for. If the current token encompasses our source location, 00467 // return the beginning of that token. 00468 if (TheLexer.getBufferLocation() - TheTok.getLength() <= StrData) 00469 return TheTok.getLocation(); 00470 00471 // We ended up skipping over the source location entirely, which means 00472 // that it points into whitespace. We're done here. 00473 break; 00474 } 00475 } while (TheTok.getKind() != tok::eof); 00476 00477 // We've passed our source location; just return the original source location. 00478 return Loc; 00479 } 00480 00481 SourceLocation Lexer::GetBeginningOfToken(SourceLocation Loc, 00482 const SourceManager &SM, 00483 const LangOptions &LangOpts) { 00484 if (Loc.isFileID()) 00485 return getBeginningOfFileToken(Loc, SM, LangOpts); 00486 00487 if (!SM.isMacroArgExpansion(Loc)) 00488 return Loc; 00489 00490 SourceLocation FileLoc = SM.getSpellingLoc(Loc); 00491 SourceLocation BeginFileLoc = getBeginningOfFileToken(FileLoc, SM, LangOpts); 00492 std::pair<FileID, unsigned> FileLocInfo = SM.getDecomposedLoc(FileLoc); 00493 std::pair<FileID, unsigned> BeginFileLocInfo 00494 = SM.getDecomposedLoc(BeginFileLoc); 00495 assert(FileLocInfo.first == BeginFileLocInfo.first && 00496 FileLocInfo.second >= BeginFileLocInfo.second); 00497 return Loc.getLocWithOffset(BeginFileLocInfo.second - FileLocInfo.second); 00498 } 00499 00500 namespace { 00501 enum PreambleDirectiveKind { 00502 PDK_Skipped, 00503 PDK_StartIf, 00504 PDK_EndIf, 00505 PDK_Unknown 00506 }; 00507 } 00508 00509 std::pair<unsigned, bool> 00510 Lexer::ComputePreamble(const llvm::MemoryBuffer *Buffer, 00511 const LangOptions &LangOpts, unsigned MaxLines) { 00512 // Create a lexer starting at the beginning of the file. Note that we use a 00513 // "fake" file source location at offset 1 so that the lexer will track our 00514 // position within the file. 00515 const unsigned StartOffset = 1; 00516 SourceLocation StartLoc = SourceLocation::getFromRawEncoding(StartOffset); 00517 Lexer TheLexer(StartLoc, LangOpts, Buffer->getBufferStart(), 00518 Buffer->getBufferStart(), Buffer->getBufferEnd()); 00519 00520 bool InPreprocessorDirective = false; 00521 Token TheTok; 00522 Token IfStartTok; 00523 unsigned IfCount = 0; 00524 00525 unsigned MaxLineOffset = 0; 00526 if (MaxLines) { 00527 const char *CurPtr = Buffer->getBufferStart(); 00528 unsigned CurLine = 0; 00529 while (CurPtr != Buffer->getBufferEnd()) { 00530 char ch = *CurPtr++; 00531 if (ch == '\n') { 00532 ++CurLine; 00533 if (CurLine == MaxLines) 00534 break; 00535 } 00536 } 00537 if (CurPtr != Buffer->getBufferEnd()) 00538 MaxLineOffset = CurPtr - Buffer->getBufferStart(); 00539 } 00540 00541 do { 00542 TheLexer.LexFromRawLexer(TheTok); 00543 00544 if (InPreprocessorDirective) { 00545 // If we've hit the end of the file, we're done. 00546 if (TheTok.getKind() == tok::eof) { 00547 InPreprocessorDirective = false; 00548 break; 00549 } 00550 00551 // If we haven't hit the end of the preprocessor directive, skip this 00552 // token. 00553 if (!TheTok.isAtStartOfLine()) 00554 continue; 00555 00556 // We've passed the end of the preprocessor directive, and will look 00557 // at this token again below. 00558 InPreprocessorDirective = false; 00559 } 00560 00561 // Keep track of the # of lines in the preamble. 00562 if (TheTok.isAtStartOfLine()) { 00563 unsigned TokOffset = TheTok.getLocation().getRawEncoding() - StartOffset; 00564 00565 // If we were asked to limit the number of lines in the preamble, 00566 // and we're about to exceed that limit, we're done. 00567 if (MaxLineOffset && TokOffset >= MaxLineOffset) 00568 break; 00569 } 00570 00571 // Comments are okay; skip over them. 00572 if (TheTok.getKind() == tok::comment) 00573 continue; 00574 00575 if (TheTok.isAtStartOfLine() && TheTok.getKind() == tok::hash) { 00576 // This is the start of a preprocessor directive. 00577 Token HashTok = TheTok; 00578 InPreprocessorDirective = true; 00579 00580 // Figure out which directive this is. Since we're lexing raw tokens, 00581 // we don't have an identifier table available. Instead, just look at 00582 // the raw identifier to recognize and categorize preprocessor directives. 00583 TheLexer.LexFromRawLexer(TheTok); 00584 if (TheTok.getKind() == tok::raw_identifier && !TheTok.needsCleaning()) { 00585 StringRef Keyword(TheTok.getRawIdentifierData(), 00586 TheTok.getLength()); 00587 PreambleDirectiveKind PDK 00588 = llvm::StringSwitch<PreambleDirectiveKind>(Keyword) 00589 .Case("include", PDK_Skipped) 00590 .Case("__include_macros", PDK_Skipped) 00591 .Case("define", PDK_Skipped) 00592 .Case("undef", PDK_Skipped) 00593 .Case("line", PDK_Skipped) 00594 .Case("error", PDK_Skipped) 00595 .Case("pragma", PDK_Skipped) 00596 .Case("import", PDK_Skipped) 00597 .Case("include_next", PDK_Skipped) 00598 .Case("warning", PDK_Skipped) 00599 .Case("ident", PDK_Skipped) 00600 .Case("sccs", PDK_Skipped) 00601 .Case("assert", PDK_Skipped) 00602 .Case("unassert", PDK_Skipped) 00603 .Case("if", PDK_StartIf) 00604 .Case("ifdef", PDK_StartIf) 00605 .Case("ifndef", PDK_StartIf) 00606 .Case("elif", PDK_Skipped) 00607 .Case("else", PDK_Skipped) 00608 .Case("endif", PDK_EndIf) 00609 .Default(PDK_Unknown); 00610 00611 switch (PDK) { 00612 case PDK_Skipped: 00613 continue; 00614 00615 case PDK_StartIf: 00616 if (IfCount == 0) 00617 IfStartTok = HashTok; 00618 00619 ++IfCount; 00620 continue; 00621 00622 case PDK_EndIf: 00623 // Mismatched #endif. The preamble ends here. 00624 if (IfCount == 0) 00625 break; 00626 00627 --IfCount; 00628 continue; 00629 00630 case PDK_Unknown: 00631 // We don't know what this directive is; stop at the '#'. 00632 break; 00633 } 00634 } 00635 00636 // We only end up here if we didn't recognize the preprocessor 00637 // directive or it was one that can't occur in the preamble at this 00638 // point. Roll back the current token to the location of the '#'. 00639 InPreprocessorDirective = false; 00640 TheTok = HashTok; 00641 } 00642 00643 // We hit a token that we don't recognize as being in the 00644 // "preprocessing only" part of the file, so we're no longer in 00645 // the preamble. 00646 break; 00647 } while (true); 00648 00649 SourceLocation End = IfCount? IfStartTok.getLocation() : TheTok.getLocation(); 00650 return std::make_pair(End.getRawEncoding() - StartLoc.getRawEncoding(), 00651 IfCount? IfStartTok.isAtStartOfLine() 00652 : TheTok.isAtStartOfLine()); 00653 } 00654 00655 00656 /// AdvanceToTokenCharacter - Given a location that specifies the start of a 00657 /// token, return a new location that specifies a character within the token. 00658 SourceLocation Lexer::AdvanceToTokenCharacter(SourceLocation TokStart, 00659 unsigned CharNo, 00660 const SourceManager &SM, 00661 const LangOptions &LangOpts) { 00662 // Figure out how many physical characters away the specified expansion 00663 // character is. This needs to take into consideration newlines and 00664 // trigraphs. 00665 bool Invalid = false; 00666 const char *TokPtr = SM.getCharacterData(TokStart, &Invalid); 00667 00668 // If they request the first char of the token, we're trivially done. 00669 if (Invalid || (CharNo == 0 && Lexer::isObviouslySimpleCharacter(*TokPtr))) 00670 return TokStart; 00671 00672 unsigned PhysOffset = 0; 00673 00674 // The usual case is that tokens don't contain anything interesting. Skip 00675 // over the uninteresting characters. If a token only consists of simple 00676 // chars, this method is extremely fast. 00677 while (Lexer::isObviouslySimpleCharacter(*TokPtr)) { 00678 if (CharNo == 0) 00679 return TokStart.getLocWithOffset(PhysOffset); 00680 ++TokPtr, --CharNo, ++PhysOffset; 00681 } 00682 00683 // If we have a character that may be a trigraph or escaped newline, use a 00684 // lexer to parse it correctly. 00685 for (; CharNo; --CharNo) { 00686 unsigned Size; 00687 Lexer::getCharAndSizeNoWarn(TokPtr, Size, LangOpts); 00688 TokPtr += Size; 00689 PhysOffset += Size; 00690 } 00691 00692 // Final detail: if we end up on an escaped newline, we want to return the 00693 // location of the actual byte of the token. For example foo<newline>bar 00694 // advanced by 3 should return the location of b, not of \\. One compounding 00695 // detail of this is that the escape may be made by a trigraph. 00696 if (!Lexer::isObviouslySimpleCharacter(*TokPtr)) 00697 PhysOffset += Lexer::SkipEscapedNewLines(TokPtr)-TokPtr; 00698 00699 return TokStart.getLocWithOffset(PhysOffset); 00700 } 00701 00702 /// \brief Computes the source location just past the end of the 00703 /// token at this source location. 00704 /// 00705 /// This routine can be used to produce a source location that 00706 /// points just past the end of the token referenced by \p Loc, and 00707 /// is generally used when a diagnostic needs to point just after a 00708 /// token where it expected something different that it received. If 00709 /// the returned source location would not be meaningful (e.g., if 00710 /// it points into a macro), this routine returns an invalid 00711 /// source location. 00712 /// 00713 /// \param Offset an offset from the end of the token, where the source 00714 /// location should refer to. The default offset (0) produces a source 00715 /// location pointing just past the end of the token; an offset of 1 produces 00716 /// a source location pointing to the last character in the token, etc. 00717 SourceLocation Lexer::getLocForEndOfToken(SourceLocation Loc, unsigned Offset, 00718 const SourceManager &SM, 00719 const LangOptions &LangOpts) { 00720 if (Loc.isInvalid()) 00721 return SourceLocation(); 00722 00723 if (Loc.isMacroID()) { 00724 if (Offset > 0 || !isAtEndOfMacroExpansion(Loc, SM, LangOpts, &Loc)) 00725 return SourceLocation(); // Points inside the macro expansion. 00726 } 00727 00728 unsigned Len = Lexer::MeasureTokenLength(Loc, SM, LangOpts); 00729 if (Len > Offset) 00730 Len = Len - Offset; 00731 else 00732 return Loc; 00733 00734 return Loc.getLocWithOffset(Len); 00735 } 00736 00737 /// \brief Returns true if the given MacroID location points at the first 00738 /// token of the macro expansion. 00739 bool Lexer::isAtStartOfMacroExpansion(SourceLocation loc, 00740 const SourceManager &SM, 00741 const LangOptions &LangOpts, 00742 SourceLocation *MacroBegin) { 00743 assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc"); 00744 00745 std::pair<FileID, unsigned> infoLoc = SM.getDecomposedLoc(loc); 00746 // FIXME: If the token comes from the macro token paste operator ('##') 00747 // this function will always return false; 00748 if (infoLoc.second > 0) 00749 return false; // Does not point at the start of token. 00750 00751 SourceLocation expansionLoc = 00752 SM.getSLocEntry(infoLoc.first).getExpansion().getExpansionLocStart(); 00753 if (expansionLoc.isFileID()) { 00754 // No other macro expansions, this is the first. 00755 if (MacroBegin) 00756 *MacroBegin = expansionLoc; 00757 return true; 00758 } 00759 00760 return isAtStartOfMacroExpansion(expansionLoc, SM, LangOpts, MacroBegin); 00761 } 00762 00763 /// \brief Returns true if the given MacroID location points at the last 00764 /// token of the macro expansion. 00765 bool Lexer::isAtEndOfMacroExpansion(SourceLocation loc, 00766 const SourceManager &SM, 00767 const LangOptions &LangOpts, 00768 SourceLocation *MacroEnd) { 00769 assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc"); 00770 00771 SourceLocation spellLoc = SM.getSpellingLoc(loc); 00772 unsigned tokLen = MeasureTokenLength(spellLoc, SM, LangOpts); 00773 if (tokLen == 0) 00774 return false; 00775 00776 FileID FID = SM.getFileID(loc); 00777 SourceLocation afterLoc = loc.getLocWithOffset(tokLen+1); 00778 if (SM.isInFileID(afterLoc, FID)) 00779 return false; // Still in the same FileID, does not point to the last token. 00780 00781 // FIXME: If the token comes from the macro token paste operator ('##') 00782 // or the stringify operator ('#') this function will always return false; 00783 00784 SourceLocation expansionLoc = 00785 SM.getSLocEntry(FID).getExpansion().getExpansionLocEnd(); 00786 if (expansionLoc.isFileID()) { 00787 // No other macro expansions. 00788 if (MacroEnd) 00789 *MacroEnd = expansionLoc; 00790 return true; 00791 } 00792 00793 return isAtEndOfMacroExpansion(expansionLoc, SM, LangOpts, MacroEnd); 00794 } 00795 00796 static CharSourceRange makeRangeFromFileLocs(CharSourceRange Range, 00797 const SourceManager &SM, 00798 const LangOptions &LangOpts) { 00799 SourceLocation Begin = Range.getBegin(); 00800 SourceLocation End = Range.getEnd(); 00801 assert(Begin.isFileID() && End.isFileID()); 00802 if (Range.isTokenRange()) { 00803 End = Lexer::getLocForEndOfToken(End, 0, SM,LangOpts); 00804 if (End.isInvalid()) 00805 return CharSourceRange(); 00806 } 00807 00808 // Break down the source locations. 00809 FileID FID; 00810 unsigned BeginOffs; 00811 llvm::tie(FID, BeginOffs) = SM.getDecomposedLoc(Begin); 00812 if (FID.isInvalid()) 00813 return CharSourceRange(); 00814 00815 unsigned EndOffs; 00816 if (!SM.isInFileID(End, FID, &EndOffs) || 00817 BeginOffs > EndOffs) 00818 return CharSourceRange(); 00819 00820 return CharSourceRange::getCharRange(Begin, End); 00821 } 00822 00823 /// \brief Accepts a range and returns a character range with file locations. 00824 /// 00825 /// Returns a null range if a part of the range resides inside a macro 00826 /// expansion or the range does not reside on the same FileID. 00827 CharSourceRange Lexer::makeFileCharRange(CharSourceRange Range, 00828 const SourceManager &SM, 00829 const LangOptions &LangOpts) { 00830 SourceLocation Begin = Range.getBegin(); 00831 SourceLocation End = Range.getEnd(); 00832 if (Begin.isInvalid() || End.isInvalid()) 00833 return CharSourceRange(); 00834 00835 if (Begin.isFileID() && End.isFileID()) 00836 return makeRangeFromFileLocs(Range, SM, LangOpts); 00837 00838 if (Begin.isMacroID() && End.isFileID()) { 00839 if (!isAtStartOfMacroExpansion(Begin, SM, LangOpts, &Begin)) 00840 return CharSourceRange(); 00841 Range.setBegin(Begin); 00842 return makeRangeFromFileLocs(Range, SM, LangOpts); 00843 } 00844 00845 if (Begin.isFileID() && End.isMacroID()) { 00846 if ((Range.isTokenRange() && !isAtEndOfMacroExpansion(End, SM, LangOpts, 00847 &End)) || 00848 (Range.isCharRange() && !isAtStartOfMacroExpansion(End, SM, LangOpts, 00849 &End))) 00850 return CharSourceRange(); 00851 Range.setEnd(End); 00852 return makeRangeFromFileLocs(Range, SM, LangOpts); 00853 } 00854 00855 assert(Begin.isMacroID() && End.isMacroID()); 00856 SourceLocation MacroBegin, MacroEnd; 00857 if (isAtStartOfMacroExpansion(Begin, SM, LangOpts, &MacroBegin) && 00858 ((Range.isTokenRange() && isAtEndOfMacroExpansion(End, SM, LangOpts, 00859 &MacroEnd)) || 00860 (Range.isCharRange() && isAtStartOfMacroExpansion(End, SM, LangOpts, 00861 &MacroEnd)))) { 00862 Range.setBegin(MacroBegin); 00863 Range.setEnd(MacroEnd); 00864 return makeRangeFromFileLocs(Range, SM, LangOpts); 00865 } 00866 00867 FileID FID; 00868 unsigned BeginOffs; 00869 llvm::tie(FID, BeginOffs) = SM.getDecomposedLoc(Begin); 00870 if (FID.isInvalid()) 00871 return CharSourceRange(); 00872 00873 unsigned EndOffs; 00874 if (!SM.isInFileID(End, FID, &EndOffs) || 00875 BeginOffs > EndOffs) 00876 return CharSourceRange(); 00877 00878 const SrcMgr::SLocEntry *E = &SM.getSLocEntry(FID); 00879 const SrcMgr::ExpansionInfo &Expansion = E->getExpansion(); 00880 if (Expansion.isMacroArgExpansion() && 00881 Expansion.getSpellingLoc().isFileID()) { 00882 SourceLocation SpellLoc = Expansion.getSpellingLoc(); 00883 Range.setBegin(SpellLoc.getLocWithOffset(BeginOffs)); 00884 Range.setEnd(SpellLoc.getLocWithOffset(EndOffs)); 00885 return makeRangeFromFileLocs(Range, SM, LangOpts); 00886 } 00887 00888 return CharSourceRange(); 00889 } 00890 00891 StringRef Lexer::getSourceText(CharSourceRange Range, 00892 const SourceManager &SM, 00893 const LangOptions &LangOpts, 00894 bool *Invalid) { 00895 Range = makeFileCharRange(Range, SM, LangOpts); 00896 if (Range.isInvalid()) { 00897 if (Invalid) *Invalid = true; 00898 return StringRef(); 00899 } 00900 00901 // Break down the source location. 00902 std::pair<FileID, unsigned> beginInfo = SM.getDecomposedLoc(Range.getBegin()); 00903 if (beginInfo.first.isInvalid()) { 00904 if (Invalid) *Invalid = true; 00905 return StringRef(); 00906 } 00907 00908 unsigned EndOffs; 00909 if (!SM.isInFileID(Range.getEnd(), beginInfo.first, &EndOffs) || 00910 beginInfo.second > EndOffs) { 00911 if (Invalid) *Invalid = true; 00912 return StringRef(); 00913 } 00914 00915 // Try to the load the file buffer. 00916 bool invalidTemp = false; 00917 StringRef file = SM.getBufferData(beginInfo.first, &invalidTemp); 00918 if (invalidTemp) { 00919 if (Invalid) *Invalid = true; 00920 return StringRef(); 00921 } 00922 00923 if (Invalid) *Invalid = false; 00924 return file.substr(beginInfo.second, EndOffs - beginInfo.second); 00925 } 00926 00927 StringRef Lexer::getImmediateMacroName(SourceLocation Loc, 00928 const SourceManager &SM, 00929 const LangOptions &LangOpts) { 00930 assert(Loc.isMacroID() && "Only reasonble to call this on macros"); 00931 00932 // Find the location of the immediate macro expansion. 00933 while (1) { 00934 FileID FID = SM.getFileID(Loc); 00935 const SrcMgr::SLocEntry *E = &SM.getSLocEntry(FID); 00936 const SrcMgr::ExpansionInfo &Expansion = E->getExpansion(); 00937 Loc = Expansion.getExpansionLocStart(); 00938 if (!Expansion.isMacroArgExpansion()) 00939 break; 00940 00941 // For macro arguments we need to check that the argument did not come 00942 // from an inner macro, e.g: "MAC1( MAC2(foo) )" 00943 00944 // Loc points to the argument id of the macro definition, move to the 00945 // macro expansion. 00946 Loc = SM.getImmediateExpansionRange(Loc).first; 00947 SourceLocation SpellLoc = Expansion.getSpellingLoc(); 00948 if (SpellLoc.isFileID()) 00949 break; // No inner macro. 00950 00951 // If spelling location resides in the same FileID as macro expansion 00952 // location, it means there is no inner macro. 00953 FileID MacroFID = SM.getFileID(Loc); 00954 if (SM.isInFileID(SpellLoc, MacroFID)) 00955 break; 00956 00957 // Argument came from inner macro. 00958 Loc = SpellLoc; 00959 } 00960 00961 // Find the spelling location of the start of the non-argument expansion 00962 // range. This is where the macro name was spelled in order to begin 00963 // expanding this macro. 00964 Loc = SM.getSpellingLoc(Loc); 00965 00966 // Dig out the buffer where the macro name was spelled and the extents of the 00967 // name so that we can render it into the expansion note. 00968 std::pair<FileID, unsigned> ExpansionInfo = SM.getDecomposedLoc(Loc); 00969 unsigned MacroTokenLength = Lexer::MeasureTokenLength(Loc, SM, LangOpts); 00970 StringRef ExpansionBuffer = SM.getBufferData(ExpansionInfo.first); 00971 return ExpansionBuffer.substr(ExpansionInfo.second, MacroTokenLength); 00972 } 00973 00974 //===----------------------------------------------------------------------===// 00975 // Character information. 00976 //===----------------------------------------------------------------------===// 00977 00978 enum { 00979 CHAR_HORZ_WS = 0x01, // ' ', '\t', '\f', '\v'. Note, no '\0' 00980 CHAR_VERT_WS = 0x02, // '\r', '\n' 00981 CHAR_LETTER = 0x04, // a-z,A-Z 00982 CHAR_NUMBER = 0x08, // 0-9 00983 CHAR_UNDER = 0x10, // _ 00984 CHAR_PERIOD = 0x20, // . 00985 CHAR_RAWDEL = 0x40 // {}[]#<>%:;?*+-/^&|~!=,"' 00986 }; 00987 00988 // Statically initialize CharInfo table based on ASCII character set 00989 // Reference: FreeBSD 7.2 /usr/share/misc/ascii 00990 static const unsigned char CharInfo[256] = 00991 { 00992 // 0 NUL 1 SOH 2 STX 3 ETX 00993 // 4 EOT 5 ENQ 6 ACK 7 BEL 00994 0 , 0 , 0 , 0 , 00995 0 , 0 , 0 , 0 , 00996 // 8 BS 9 HT 10 NL 11 VT 00997 //12 NP 13 CR 14 SO 15 SI 00998 0 , CHAR_HORZ_WS, CHAR_VERT_WS, CHAR_HORZ_WS, 00999 CHAR_HORZ_WS, CHAR_VERT_WS, 0 , 0 , 01000 //16 DLE 17 DC1 18 DC2 19 DC3 01001 //20 DC4 21 NAK 22 SYN 23 ETB 01002 0 , 0 , 0 , 0 , 01003 0 , 0 , 0 , 0 , 01004 //24 CAN 25 EM 26 SUB 27 ESC 01005 //28 FS 29 GS 30 RS 31 US 01006 0 , 0 , 0 , 0 , 01007 0 , 0 , 0 , 0 , 01008 //32 SP 33 ! 34 " 35 # 01009 //36 $ 37 % 38 & 39 ' 01010 CHAR_HORZ_WS, CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL , 01011 0 , CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL , 01012 //40 ( 41 ) 42 * 43 + 01013 //44 , 45 - 46 . 47 / 01014 0 , 0 , CHAR_RAWDEL , CHAR_RAWDEL , 01015 CHAR_RAWDEL , CHAR_RAWDEL , CHAR_PERIOD , CHAR_RAWDEL , 01016 //48 0 49 1 50 2 51 3 01017 //52 4 53 5 54 6 55 7 01018 CHAR_NUMBER , CHAR_NUMBER , CHAR_NUMBER , CHAR_NUMBER , 01019 CHAR_NUMBER , CHAR_NUMBER , CHAR_NUMBER , CHAR_NUMBER , 01020 //56 8 57 9 58 : 59 ; 01021 //60 < 61 = 62 > 63 ? 01022 CHAR_NUMBER , CHAR_NUMBER , CHAR_RAWDEL , CHAR_RAWDEL , 01023 CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL , 01024 //64 @ 65 A 66 B 67 C 01025 //68 D 69 E 70 F 71 G 01026 0 , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , 01027 CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , 01028 //72 H 73 I 74 J 75 K 01029 //76 L 77 M 78 N 79 O 01030 CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , 01031 CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , 01032 //80 P 81 Q 82 R 83 S 01033 //84 T 85 U 86 V 87 W 01034 CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , 01035 CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , 01036 //88 X 89 Y 90 Z 91 [ 01037 //92 \ 93 ] 94 ^ 95 _ 01038 CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_RAWDEL , 01039 0 , CHAR_RAWDEL , CHAR_RAWDEL , CHAR_UNDER , 01040 //96 ` 97 a 98 b 99 c 01041 //100 d 101 e 102 f 103 g 01042 0 , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , 01043 CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , 01044 //104 h 105 i 106 j 107 k 01045 //108 l 109 m 110 n 111 o 01046 CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , 01047 CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , 01048 //112 p 113 q 114 r 115 s 01049 //116 t 117 u 118 v 119 w 01050 CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , 01051 CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , 01052 //120 x 121 y 122 z 123 { 01053 //124 | 125 } 126 ~ 127 DEL 01054 CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_RAWDEL , 01055 CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL , 0 01056 }; 01057 01058 static void InitCharacterInfo() { 01059 static bool isInited = false; 01060 if (isInited) return; 01061 // check the statically-initialized CharInfo table 01062 assert(CHAR_HORZ_WS == CharInfo[(int)' ']); 01063 assert(CHAR_HORZ_WS == CharInfo[(int)'\t']); 01064 assert(CHAR_HORZ_WS == CharInfo[(int)'\f']); 01065 assert(CHAR_HORZ_WS == CharInfo[(int)'\v']); 01066 assert(CHAR_VERT_WS == CharInfo[(int)'\n']); 01067 assert(CHAR_VERT_WS == CharInfo[(int)'\r']); 01068 assert(CHAR_UNDER == CharInfo[(int)'_']); 01069 assert(CHAR_PERIOD == CharInfo[(int)'.']); 01070 for (unsigned i = 'a'; i <= 'z'; ++i) { 01071 assert(CHAR_LETTER == CharInfo[i]); 01072 assert(CHAR_LETTER == CharInfo[i+'A'-'a']); 01073 } 01074 for (unsigned i = '0'; i <= '9'; ++i) 01075 assert(CHAR_NUMBER == CharInfo[i]); 01076 01077 isInited = true; 01078 } 01079 01080 01081 /// isIdentifierHead - Return true if this is the first character of an 01082 /// identifier, which is [a-zA-Z_]. 01083 static inline bool isIdentifierHead(unsigned char c) { 01084 return (CharInfo[c] & (CHAR_LETTER|CHAR_UNDER)) ? true : false; 01085 } 01086 01087 /// isIdentifierBody - Return true if this is the body character of an 01088 /// identifier, which is [a-zA-Z0-9_]. 01089 static inline bool isIdentifierBody(unsigned char c) { 01090 return (CharInfo[c] & (CHAR_LETTER|CHAR_NUMBER|CHAR_UNDER)) ? true : false; 01091 } 01092 01093 /// isHorizontalWhitespace - Return true if this character is horizontal 01094 /// whitespace: ' ', '\t', '\f', '\v'. Note that this returns false for '\0'. 01095 static inline bool isHorizontalWhitespace(unsigned char c) { 01096 return (CharInfo[c] & CHAR_HORZ_WS) ? true : false; 01097 } 01098 01099 /// isVerticalWhitespace - Return true if this character is vertical 01100 /// whitespace: '\n', '\r'. Note that this returns false for '\0'. 01101 static inline bool isVerticalWhitespace(unsigned char c) { 01102 return (CharInfo[c] & CHAR_VERT_WS) ? true : false; 01103 } 01104 01105 /// isWhitespace - Return true if this character is horizontal or vertical 01106 /// whitespace: ' ', '\t', '\f', '\v', '\n', '\r'. Note that this returns false 01107 /// for '\0'. 01108 static inline bool isWhitespace(unsigned char c) { 01109 return (CharInfo[c] & (CHAR_HORZ_WS|CHAR_VERT_WS)) ? true : false; 01110 } 01111 01112 /// isNumberBody - Return true if this is the body character of an 01113 /// preprocessing number, which is [a-zA-Z0-9_.]. 01114 static inline bool isNumberBody(unsigned char c) { 01115 return (CharInfo[c] & (CHAR_LETTER|CHAR_NUMBER|CHAR_UNDER|CHAR_PERIOD)) ? 01116 true : false; 01117 } 01118 01119 /// isRawStringDelimBody - Return true if this is the body character of a 01120 /// raw string delimiter. 01121 static inline bool isRawStringDelimBody(unsigned char c) { 01122 return (CharInfo[c] & 01123 (CHAR_LETTER|CHAR_NUMBER|CHAR_UNDER|CHAR_PERIOD|CHAR_RAWDEL)) ? 01124 true : false; 01125 } 01126 01127 01128 //===----------------------------------------------------------------------===// 01129 // Diagnostics forwarding code. 01130 //===----------------------------------------------------------------------===// 01131 01132 /// GetMappedTokenLoc - If lexing out of a 'mapped buffer', where we pretend the 01133 /// lexer buffer was all expanded at a single point, perform the mapping. 01134 /// This is currently only used for _Pragma implementation, so it is the slow 01135 /// path of the hot getSourceLocation method. Do not allow it to be inlined. 01136 static LLVM_ATTRIBUTE_NOINLINE SourceLocation GetMappedTokenLoc( 01137 Preprocessor &PP, SourceLocation FileLoc, unsigned CharNo, unsigned TokLen); 01138 static SourceLocation GetMappedTokenLoc(Preprocessor &PP, 01139 SourceLocation FileLoc, 01140 unsigned CharNo, unsigned TokLen) { 01141 assert(FileLoc.isMacroID() && "Must be a macro expansion"); 01142 01143 // Otherwise, we're lexing "mapped tokens". This is used for things like 01144 // _Pragma handling. Combine the expansion location of FileLoc with the 01145 // spelling location. 01146 SourceManager &SM = PP.getSourceManager(); 01147 01148 // Create a new SLoc which is expanded from Expansion(FileLoc) but whose 01149 // characters come from spelling(FileLoc)+Offset. 01150 SourceLocation SpellingLoc = SM.getSpellingLoc(FileLoc); 01151 SpellingLoc = SpellingLoc.getLocWithOffset(CharNo); 01152 01153 // Figure out the expansion loc range, which is the range covered by the 01154 // original _Pragma(...) sequence. 01155 std::pair<SourceLocation,SourceLocation> II = 01156 SM.getImmediateExpansionRange(FileLoc); 01157 01158 return SM.createExpansionLoc(SpellingLoc, II.first, II.second, TokLen); 01159 } 01160 01161 /// getSourceLocation - Return a source location identifier for the specified 01162 /// offset in the current file. 01163 SourceLocation Lexer::getSourceLocation(const char *Loc, 01164 unsigned TokLen) const { 01165 assert(Loc >= BufferStart && Loc <= BufferEnd && 01166 "Location out of range for this buffer!"); 01167 01168 // In the normal case, we're just lexing from a simple file buffer, return 01169 // the file id from FileLoc with the offset specified. 01170 unsigned CharNo = Loc-BufferStart; 01171 if (FileLoc.isFileID()) 01172 return FileLoc.getLocWithOffset(CharNo); 01173 01174 // Otherwise, this is the _Pragma lexer case, which pretends that all of the 01175 // tokens are lexed from where the _Pragma was defined. 01176 assert(PP && "This doesn't work on raw lexers"); 01177 return GetMappedTokenLoc(*PP, FileLoc, CharNo, TokLen); 01178 } 01179 01180 /// Diag - Forwarding function for diagnostics. This translate a source 01181 /// position in the current buffer into a SourceLocation object for rendering. 01182 DiagnosticBuilder Lexer::Diag(const char *Loc, unsigned DiagID) const { 01183 return PP->Diag(getSourceLocation(Loc), DiagID); 01184 } 01185 01186 //===----------------------------------------------------------------------===// 01187 // Trigraph and Escaped Newline Handling Code. 01188 //===----------------------------------------------------------------------===// 01189 01190 /// GetTrigraphCharForLetter - Given a character that occurs after a ?? pair, 01191 /// return the decoded trigraph letter it corresponds to, or '\0' if nothing. 01192 static char GetTrigraphCharForLetter(char Letter) { 01193 switch (Letter) { 01194 default: return 0; 01195 case '=': return '#'; 01196 case ')': return ']'; 01197 case '(': return '['; 01198 case '!': return '|'; 01199 case '\'': return '^'; 01200 case '>': return '}'; 01201 case '/': return '\\'; 01202 case '<': return '{'; 01203 case '-': return '~'; 01204 } 01205 } 01206 01207 /// DecodeTrigraphChar - If the specified character is a legal trigraph when 01208 /// prefixed with ??, emit a trigraph warning. If trigraphs are enabled, 01209 /// return the result character. Finally, emit a warning about trigraph use 01210 /// whether trigraphs are enabled or not. 01211 static char DecodeTrigraphChar(const char *CP, Lexer *L) { 01212 char Res = GetTrigraphCharForLetter(*CP); 01213 if (!Res || !L) return Res; 01214 01215 if (!L->getLangOpts().Trigraphs) { 01216 if (!L->isLexingRawMode()) 01217 L->Diag(CP-2, diag::trigraph_ignored); 01218 return 0; 01219 } 01220 01221 if (!L->isLexingRawMode()) 01222 L->Diag(CP-2, diag::trigraph_converted) << StringRef(&Res, 1); 01223 return Res; 01224 } 01225 01226 /// getEscapedNewLineSize - Return the size of the specified escaped newline, 01227 /// or 0 if it is not an escaped newline. P[-1] is known to be a "\" or a 01228 /// trigraph equivalent on entry to this function. 01229 unsigned Lexer::getEscapedNewLineSize(const char *Ptr) { 01230 unsigned Size = 0; 01231 while (isWhitespace(Ptr[Size])) { 01232 ++Size; 01233 01234 if (Ptr[Size-1] != '\n' && Ptr[Size-1] != '\r') 01235 continue; 01236 01237 // If this is a \r\n or \n\r, skip the other half. 01238 if ((Ptr[Size] == '\r' || Ptr[Size] == '\n') && 01239 Ptr[Size-1] != Ptr[Size]) 01240 ++Size; 01241 01242 return Size; 01243 } 01244 01245 // Not an escaped newline, must be a \t or something else. 01246 return 0; 01247 } 01248 01249 /// SkipEscapedNewLines - If P points to an escaped newline (or a series of 01250 /// them), skip over them and return the first non-escaped-newline found, 01251 /// otherwise return P. 01252 const char *Lexer::SkipEscapedNewLines(const char *P) { 01253 while (1) { 01254 const char *AfterEscape; 01255 if (*P == '\\') { 01256 AfterEscape = P+1; 01257 } else if (*P == '?') { 01258 // If not a trigraph for escape, bail out. 01259 if (P[1] != '?' || P[2] != '/') 01260 return P; 01261 AfterEscape = P+3; 01262 } else { 01263 return P; 01264 } 01265 01266 unsigned NewLineSize = Lexer::getEscapedNewLineSize(AfterEscape); 01267 if (NewLineSize == 0) return P; 01268 P = AfterEscape+NewLineSize; 01269 } 01270 } 01271 01272 /// \brief Checks that the given token is the first token that occurs after the 01273 /// given location (this excludes comments and whitespace). Returns the location 01274 /// immediately after the specified token. If the token is not found or the 01275 /// location is inside a macro, the returned source location will be invalid. 01276 SourceLocation Lexer::findLocationAfterToken(SourceLocation Loc, 01277 tok::TokenKind TKind, 01278 const SourceManager &SM, 01279 const LangOptions &LangOpts, 01280 bool SkipTrailingWhitespaceAndNewLine) { 01281 if (Loc.isMacroID()) { 01282 if (!Lexer::isAtEndOfMacroExpansion(Loc, SM, LangOpts, &Loc)) 01283 return SourceLocation(); 01284 } 01285 Loc = Lexer::getLocForEndOfToken(Loc, 0, SM, LangOpts); 01286 01287 // Break down the source location. 01288 std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc); 01289 01290 // Try to load the file buffer. 01291 bool InvalidTemp = false; 01292 llvm::StringRef File = SM.getBufferData(LocInfo.first, &InvalidTemp); 01293 if (InvalidTemp) 01294 return SourceLocation(); 01295 01296 const char *TokenBegin = File.data() + LocInfo.second; 01297 01298 // Lex from the start of the given location. 01299 Lexer lexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts, File.begin(), 01300 TokenBegin, File.end()); 01301 // Find the token. 01302 Token Tok; 01303 lexer.LexFromRawLexer(Tok); 01304 if (Tok.isNot(TKind)) 01305 return SourceLocation(); 01306 SourceLocation TokenLoc = Tok.getLocation(); 01307 01308 // Calculate how much whitespace needs to be skipped if any. 01309 unsigned NumWhitespaceChars = 0; 01310 if (SkipTrailingWhitespaceAndNewLine) { 01311 const char *TokenEnd = SM.getCharacterData(TokenLoc) + 01312 Tok.getLength(); 01313 unsigned char C = *TokenEnd; 01314 while (isHorizontalWhitespace(C)) { 01315 C = *(++TokenEnd); 01316 NumWhitespaceChars++; 01317 } 01318 if (isVerticalWhitespace(C)) 01319 NumWhitespaceChars++; 01320 } 01321 01322 return TokenLoc.getLocWithOffset(Tok.getLength() + NumWhitespaceChars); 01323 } 01324 01325 /// getCharAndSizeSlow - Peek a single 'character' from the specified buffer, 01326 /// get its size, and return it. This is tricky in several cases: 01327 /// 1. If currently at the start of a trigraph, we warn about the trigraph, 01328 /// then either return the trigraph (skipping 3 chars) or the '?', 01329 /// depending on whether trigraphs are enabled or not. 01330 /// 2. If this is an escaped newline (potentially with whitespace between 01331 /// the backslash and newline), implicitly skip the newline and return 01332 /// the char after it. 01333 /// 3. If this is a UCN, return it. FIXME: C++ UCN's? 01334 /// 01335 /// This handles the slow/uncommon case of the getCharAndSize method. Here we 01336 /// know that we can accumulate into Size, and that we have already incremented 01337 /// Ptr by Size bytes. 01338 /// 01339 /// NOTE: When this method is updated, getCharAndSizeSlowNoWarn (below) should 01340 /// be updated to match. 01341 /// 01342 char Lexer::getCharAndSizeSlow(const char *Ptr, unsigned &Size, 01343 Token *Tok) { 01344 // If we have a slash, look for an escaped newline. 01345 if (Ptr[0] == '\\') { 01346 ++Size; 01347 ++Ptr; 01348 Slash: 01349 // Common case, backslash-char where the char is not whitespace. 01350 if (!isWhitespace(Ptr[0])) return '\\'; 01351 01352 // See if we have optional whitespace characters between the slash and 01353 // newline. 01354 if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) { 01355 // Remember that this token needs to be cleaned. 01356 if (Tok) Tok->setFlag(Token::NeedsCleaning); 01357 01358 // Warn if there was whitespace between the backslash and newline. 01359 if (Ptr[0] != '\n' && Ptr[0] != '\r' && Tok && !isLexingRawMode()) 01360 Diag(Ptr, diag::backslash_newline_space); 01361 01362 // Found backslash<whitespace><newline>. Parse the char after it. 01363 Size += EscapedNewLineSize; 01364 Ptr += EscapedNewLineSize; 01365 01366 // If the char that we finally got was a \n, then we must have had 01367 // something like <newline><newline>. We don't want to consume the 01368 // second newline. 01369 if (*Ptr == '\n' || *Ptr == '\r' || *Ptr == '\0') 01370 return ' '; 01371 01372 // Use slow version to accumulate a correct size field. 01373 return getCharAndSizeSlow(Ptr, Size, Tok); 01374 } 01375 01376 // Otherwise, this is not an escaped newline, just return the slash. 01377 return '\\'; 01378 } 01379 01380 // If this is a trigraph, process it. 01381 if (Ptr[0] == '?' && Ptr[1] == '?') { 01382 // If this is actually a legal trigraph (not something like "??x"), emit 01383 // a trigraph warning. If so, and if trigraphs are enabled, return it. 01384 if (char C = DecodeTrigraphChar(Ptr+2, Tok ? this : 0)) { 01385 // Remember that this token needs to be cleaned. 01386 if (Tok) Tok->setFlag(Token::NeedsCleaning); 01387 01388 Ptr += 3; 01389 Size += 3; 01390 if (C == '\\') goto Slash; 01391 return C; 01392 } 01393 } 01394 01395 // If this is neither, return a single character. 01396 ++Size; 01397 return *Ptr; 01398 } 01399 01400 01401 /// getCharAndSizeSlowNoWarn - Handle the slow/uncommon case of the 01402 /// getCharAndSizeNoWarn method. Here we know that we can accumulate into Size, 01403 /// and that we have already incremented Ptr by Size bytes. 01404 /// 01405 /// NOTE: When this method is updated, getCharAndSizeSlow (above) should 01406 /// be updated to match. 01407 char Lexer::getCharAndSizeSlowNoWarn(const char *Ptr, unsigned &Size, 01408 const LangOptions &LangOpts) { 01409 // If we have a slash, look for an escaped newline. 01410 if (Ptr[0] == '\\') { 01411 ++Size; 01412 ++Ptr; 01413 Slash: 01414 // Common case, backslash-char where the char is not whitespace. 01415 if (!isWhitespace(Ptr[0])) return '\\'; 01416 01417 // See if we have optional whitespace characters followed by a newline. 01418 if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) { 01419 // Found backslash<whitespace><newline>. Parse the char after it. 01420 Size += EscapedNewLineSize; 01421 Ptr += EscapedNewLineSize; 01422 01423 // If the char that we finally got was a \n, then we must have had 01424 // something like <newline><newline>. We don't want to consume the 01425 // second newline. 01426 if (*Ptr == '\n' || *Ptr == '\r' || *Ptr == '\0') 01427 return ' '; 01428 01429 // Use slow version to accumulate a correct size field. 01430 return getCharAndSizeSlowNoWarn(Ptr, Size, LangOpts); 01431 } 01432 01433 // Otherwise, this is not an escaped newline, just return the slash. 01434 return '\\'; 01435 } 01436 01437 // If this is a trigraph, process it. 01438 if (LangOpts.Trigraphs && Ptr[0] == '?' && Ptr[1] == '?') { 01439 // If this is actually a legal trigraph (not something like "??x"), return 01440 // it. 01441 if (char C = GetTrigraphCharForLetter(Ptr[2])) { 01442 Ptr += 3; 01443 Size += 3; 01444 if (C == '\\') goto Slash; 01445 return C; 01446 } 01447 } 01448 01449 // If this is neither, return a single character. 01450 ++Size; 01451 return *Ptr; 01452 } 01453 01454 //===----------------------------------------------------------------------===// 01455 // Helper methods for lexing. 01456 //===----------------------------------------------------------------------===// 01457 01458 /// \brief Routine that indiscriminately skips bytes in the source file. 01459 void Lexer::SkipBytes(unsigned Bytes, bool StartOfLine) { 01460 BufferPtr += Bytes; 01461 if (BufferPtr > BufferEnd) 01462 BufferPtr = BufferEnd; 01463 IsAtStartOfLine = StartOfLine; 01464 } 01465 01466 void Lexer::LexIdentifier(Token &Result, const char *CurPtr) { 01467 // Match [_A-Za-z0-9]*, we have already matched [_A-Za-z$] 01468 unsigned Size; 01469 unsigned char C = *CurPtr++; 01470 while (isIdentifierBody(C)) 01471 C = *CurPtr++; 01472 01473 --CurPtr; // Back up over the skipped character. 01474 01475 // Fast path, no $,\,? in identifier found. '\' might be an escaped newline 01476 // or UCN, and ? might be a trigraph for '\', an escaped newline or UCN. 01477 // FIXME: UCNs. 01478 // 01479 // TODO: Could merge these checks into a CharInfo flag to make the comparison 01480 // cheaper 01481 if (C != '\\' && C != '?' && (C != '$' || !LangOpts.DollarIdents)) { 01482 FinishIdentifier: 01483 const char *IdStart = BufferPtr; 01484 FormTokenWithChars(Result, CurPtr, tok::raw_identifier); 01485 Result.setRawIdentifierData(IdStart); 01486 01487 // If we are in raw mode, return this identifier raw. There is no need to 01488 // look up identifier information or attempt to macro expand it. 01489 if (LexingRawMode) 01490 return; 01491 01492 // Fill in Result.IdentifierInfo and update the token kind, 01493 // looking up the identifier in the identifier table. 01494 IdentifierInfo *II = PP->LookUpIdentifierInfo(Result); 01495 01496 // Finally, now that we know we have an identifier, pass this off to the 01497 // preprocessor, which may macro expand it or something. 01498 if (II->isHandleIdentifierCase()) 01499 PP->HandleIdentifier(Result); 01500 01501 return; 01502 } 01503 01504 // Otherwise, $,\,? in identifier found. Enter slower path. 01505 01506 C = getCharAndSize(CurPtr, Size); 01507 while (1) { 01508 if (C == '$') { 01509 // If we hit a $ and they are not supported in identifiers, we are done. 01510 if (!LangOpts.DollarIdents) goto FinishIdentifier; 01511 01512 // Otherwise, emit a diagnostic and continue. 01513 if (!isLexingRawMode()) 01514 Diag(CurPtr, diag::ext_dollar_in_identifier); 01515 CurPtr = ConsumeChar(CurPtr, Size, Result); 01516 C = getCharAndSize(CurPtr, Size); 01517 continue; 01518 } else if (!isIdentifierBody(C)) { // FIXME: UCNs. 01519 // Found end of identifier. 01520 goto FinishIdentifier; 01521 } 01522 01523 // Otherwise, this character is good, consume it. 01524 CurPtr = ConsumeChar(CurPtr, Size, Result); 01525 01526 C = getCharAndSize(CurPtr, Size); 01527 while (isIdentifierBody(C)) { // FIXME: UCNs. 01528 CurPtr = ConsumeChar(CurPtr, Size, Result); 01529 C = getCharAndSize(CurPtr, Size); 01530 } 01531 } 01532 } 01533 01534 /// isHexaLiteral - Return true if Start points to a hex constant. 01535 /// in microsoft mode (where this is supposed to be several different tokens). 01536 static bool isHexaLiteral(const char *Start, const LangOptions &LangOpts) { 01537 unsigned Size; 01538 char C1 = Lexer::getCharAndSizeNoWarn(Start, Size, LangOpts); 01539 if (C1 != '0') 01540 return false; 01541 char C2 = Lexer::getCharAndSizeNoWarn(Start + Size, Size, LangOpts); 01542 return (C2 == 'x' || C2 == 'X'); 01543 } 01544 01545 /// LexNumericConstant - Lex the remainder of a integer or floating point 01546 /// constant. From[-1] is the first character lexed. Return the end of the 01547 /// constant. 01548 void Lexer::LexNumericConstant(Token &Result, const char *CurPtr) { 01549 unsigned Size; 01550 char C = getCharAndSize(CurPtr, Size); 01551 char PrevCh = 0; 01552 while (isNumberBody(C)) { // FIXME: UCNs. 01553 CurPtr = ConsumeChar(CurPtr, Size, Result); 01554 PrevCh = C; 01555 C = getCharAndSize(CurPtr, Size); 01556 } 01557 01558 // If we fell out, check for a sign, due to 1e+12. If we have one, continue. 01559 if ((C == '-' || C == '+') && (PrevCh == 'E' || PrevCh == 'e')) { 01560 // If we are in Microsoft mode, don't continue if the constant is hex. 01561 // For example, MSVC will accept the following as 3 tokens: 0x1234567e+1 01562 if (!LangOpts.MicrosoftExt || !isHexaLiteral(BufferPtr, LangOpts)) 01563 return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result)); 01564 } 01565 01566 // If we have a hex FP constant, continue. 01567 if ((C == '-' || C == '+') && (PrevCh == 'P' || PrevCh == 'p')) 01568 return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result)); 01569 01570 // Update the location of token as well as BufferPtr. 01571 const char *TokStart = BufferPtr; 01572 FormTokenWithChars(Result, CurPtr, tok::numeric_constant); 01573 Result.setLiteralData(TokStart); 01574 } 01575 01576 /// LexUDSuffix - Lex the ud-suffix production for user-defined literal suffixes 01577 /// in C++11, or warn on a ud-suffix in C++98. 01578 const char *Lexer::LexUDSuffix(Token &Result, const char *CurPtr) { 01579 assert(getLangOpts().CPlusPlus); 01580 01581 // Maximally munch an identifier. FIXME: UCNs. 01582 unsigned Size; 01583 char C = getCharAndSize(CurPtr, Size); 01584 if (isIdentifierHead(C)) { 01585 if (!getLangOpts().CPlusPlus0x) { 01586 if (!isLexingRawMode()) 01587 Diag(CurPtr, 01588 C == '_' ? diag::warn_cxx11_compat_user_defined_literal 01589 : diag::warn_cxx11_compat_reserved_user_defined_literal) 01590 << FixItHint::CreateInsertion(getSourceLocation(CurPtr), " "); 01591 return CurPtr; 01592 } 01593 01594 // C++11 [lex.ext]p10, [usrlit.suffix]p1: A program containing a ud-suffix 01595 // that does not start with an underscore is ill-formed. As a conforming 01596 // extension, we treat all such suffixes as if they had whitespace before 01597 // them. 01598 if (C != '_') { 01599 if (!isLexingRawMode()) 01600 Diag(CurPtr, getLangOpts().MicrosoftMode ? 01601 diag::ext_ms_reserved_user_defined_literal : 01602 diag::ext_reserved_user_defined_literal) 01603 << FixItHint::CreateInsertion(getSourceLocation(CurPtr), " "); 01604 return CurPtr; 01605 } 01606 01607 Result.setFlag(Token::HasUDSuffix); 01608 do { 01609 CurPtr = ConsumeChar(CurPtr, Size, Result); 01610 C = getCharAndSize(CurPtr, Size); 01611 } while (isIdentifierBody(C)); 01612 } 01613 return CurPtr; 01614 } 01615 01616 /// LexStringLiteral - Lex the remainder of a string literal, after having lexed 01617 /// either " or L" or u8" or u" or U". 01618 void Lexer::LexStringLiteral(Token &Result, const char *CurPtr, 01619 tok::TokenKind Kind) { 01620 const char *NulCharacter = 0; // Does this string contain the \0 character? 01621 01622 if (!isLexingRawMode() && 01623 (Kind == tok::utf8_string_literal || 01624 Kind == tok::utf16_string_literal || 01625 Kind == tok::utf32_string_literal)) 01626 Diag(BufferPtr, diag::warn_cxx98_compat_unicode_literal); 01627 01628 char C = getAndAdvanceChar(CurPtr, Result); 01629 while (C != '"') { 01630 // Skip escaped characters. Escaped newlines will already be processed by 01631 // getAndAdvanceChar. 01632 if (C == '\\') 01633 C = getAndAdvanceChar(CurPtr, Result); 01634 01635 if (C == '\n' || C == '\r' || // Newline. 01636 (C == 0 && CurPtr-1 == BufferEnd)) { // End of file. 01637 if (!isLexingRawMode() && !LangOpts.AsmPreprocessor) 01638 Diag(BufferPtr, diag::warn_unterminated_string); 01639 FormTokenWithChars(Result, CurPtr-1, tok::unknown); 01640 return; 01641 } 01642 01643 if (C == 0) { 01644 if (isCodeCompletionPoint(CurPtr-1)) { 01645 PP->CodeCompleteNaturalLanguage(); 01646 FormTokenWithChars(Result, CurPtr-1, tok::unknown); 01647 return cutOffLexing(); 01648 } 01649 01650 NulCharacter = CurPtr-1; 01651 } 01652 C = getAndAdvanceChar(CurPtr, Result); 01653 } 01654 01655 // If we are in C++11, lex the optional ud-suffix. 01656 if (getLangOpts().CPlusPlus) 01657 CurPtr = LexUDSuffix(Result, CurPtr); 01658 01659 // If a nul character existed in the string, warn about it. 01660 if (NulCharacter && !isLexingRawMode()) 01661 Diag(NulCharacter, diag::null_in_string); 01662 01663 // Update the location of the token as well as the BufferPtr instance var. 01664 const char *TokStart = BufferPtr; 01665 FormTokenWithChars(Result, CurPtr, Kind); 01666 Result.setLiteralData(TokStart); 01667 } 01668 01669 /// LexRawStringLiteral - Lex the remainder of a raw string literal, after 01670 /// having lexed R", LR", u8R", uR", or UR". 01671 void Lexer::LexRawStringLiteral(Token &Result, const char *CurPtr, 01672 tok::TokenKind Kind) { 01673 // This function doesn't use getAndAdvanceChar because C++0x [lex.pptoken]p3: 01674 // Between the initial and final double quote characters of the raw string, 01675 // any transformations performed in phases 1 and 2 (trigraphs, 01676 // universal-character-names, and line splicing) are reverted. 01677 01678 if (!isLexingRawMode()) 01679 Diag(BufferPtr, diag::warn_cxx98_compat_raw_string_literal); 01680 01681 unsigned PrefixLen = 0; 01682 01683 while (PrefixLen != 16 && isRawStringDelimBody(CurPtr[PrefixLen])) 01684 ++PrefixLen; 01685 01686 // If the last character was not a '(', then we didn't lex a valid delimiter. 01687 if (CurPtr[PrefixLen] != '(') { 01688 if (!isLexingRawMode()) { 01689 const char *PrefixEnd = &CurPtr[PrefixLen]; 01690 if (PrefixLen == 16) { 01691 Diag(PrefixEnd, diag::err_raw_delim_too_long); 01692 } else { 01693 Diag(PrefixEnd, diag::err_invalid_char_raw_delim) 01694 << StringRef(PrefixEnd, 1); 01695 } 01696 } 01697 01698 // Search for the next '"' in hopes of salvaging the lexer. Unfortunately, 01699 // it's possible the '"' was intended to be part of the raw string, but 01700 // there's not much we can do about that. 01701 while (1) { 01702 char C = *CurPtr++; 01703 01704 if (C == '"') 01705 break; 01706 if (C == 0 && CurPtr-1 == BufferEnd) { 01707 --CurPtr; 01708 break; 01709 } 01710 } 01711 01712 FormTokenWithChars(Result, CurPtr, tok::unknown); 01713 return; 01714 } 01715 01716 // Save prefix and move CurPtr past it 01717 const char *Prefix = CurPtr; 01718 CurPtr += PrefixLen + 1; // skip over prefix and '(' 01719 01720 while (1) { 01721 char C = *CurPtr++; 01722 01723 if (C == ')') { 01724 // Check for prefix match and closing quote. 01725 if (strncmp(CurPtr, Prefix, PrefixLen) == 0 && CurPtr[PrefixLen] == '"') { 01726 CurPtr += PrefixLen + 1; // skip over prefix and '"' 01727 break; 01728 } 01729 } else if (C == 0 && CurPtr-1 == BufferEnd) { // End of file. 01730 if (!isLexingRawMode()) 01731 Diag(BufferPtr, diag::err_unterminated_raw_string) 01732 << StringRef(Prefix, PrefixLen); 01733 FormTokenWithChars(Result, CurPtr-1, tok::unknown); 01734 return; 01735 } 01736 } 01737 01738 // If we are in C++11, lex the optional ud-suffix. 01739 if (getLangOpts().CPlusPlus) 01740 CurPtr = LexUDSuffix(Result, CurPtr); 01741 01742 // Update the location of token as well as BufferPtr. 01743 const char *TokStart = BufferPtr; 01744 FormTokenWithChars(Result, CurPtr, Kind); 01745 Result.setLiteralData(TokStart); 01746 } 01747 01748 /// LexAngledStringLiteral - Lex the remainder of an angled string literal, 01749 /// after having lexed the '<' character. This is used for #include filenames. 01750 void Lexer::LexAngledStringLiteral(Token &Result, const char *CurPtr) { 01751 const char *NulCharacter = 0; // Does this string contain the \0 character? 01752 const char *AfterLessPos = CurPtr; 01753 char C = getAndAdvanceChar(CurPtr, Result); 01754 while (C != '>') { 01755 // Skip escaped characters. 01756 if (C == '\\') { 01757 // Skip the escaped character. 01758 C = getAndAdvanceChar(CurPtr, Result); 01759 } else if (C == '\n' || C == '\r' || // Newline. 01760 (C == 0 && (CurPtr-1 == BufferEnd || // End of file. 01761 isCodeCompletionPoint(CurPtr-1)))) { 01762 // If the filename is unterminated, then it must just be a lone < 01763 // character. Return this as such. 01764 FormTokenWithChars(Result, AfterLessPos, tok::less); 01765 return; 01766 } else if (C == 0) { 01767 NulCharacter = CurPtr-1; 01768 } 01769 C = getAndAdvanceChar(CurPtr, Result); 01770 } 01771 01772 // If a nul character existed in the string, warn about it. 01773 if (NulCharacter && !isLexingRawMode()) 01774 Diag(NulCharacter, diag::null_in_string); 01775 01776 // Update the location of token as well as BufferPtr. 01777 const char *TokStart = BufferPtr; 01778 FormTokenWithChars(Result, CurPtr, tok::angle_string_literal); 01779 Result.setLiteralData(TokStart); 01780 } 01781 01782 01783 /// LexCharConstant - Lex the remainder of a character constant, after having 01784 /// lexed either ' or L' or u' or U'. 01785 void Lexer::LexCharConstant(Token &Result, const char *CurPtr, 01786 tok::TokenKind Kind) { 01787 const char *NulCharacter = 0; // Does this character contain the \0 character? 01788 01789 if (!isLexingRawMode() && 01790 (Kind == tok::utf16_char_constant || Kind == tok::utf32_char_constant)) 01791 Diag(BufferPtr, diag::warn_cxx98_compat_unicode_literal); 01792 01793 char C = getAndAdvanceChar(CurPtr, Result); 01794 if (C == '\'') { 01795 if (!isLexingRawMode() && !LangOpts.AsmPreprocessor) 01796 Diag(BufferPtr, diag::err_empty_character); 01797 FormTokenWithChars(Result, CurPtr, tok::unknown); 01798 return; 01799 } 01800 01801 while (C != '\'') { 01802 // Skip escaped characters. 01803 if (C == '\\') { 01804 // Skip the escaped character. 01805 // FIXME: UCN's 01806 C = getAndAdvanceChar(CurPtr, Result); 01807 } else if (C == '\n' || C == '\r' || // Newline. 01808 (C == 0 && CurPtr-1 == BufferEnd)) { // End of file. 01809 if (!isLexingRawMode() && !LangOpts.AsmPreprocessor) 01810 Diag(BufferPtr, diag::warn_unterminated_char); 01811 FormTokenWithChars(Result, CurPtr-1, tok::unknown); 01812 return; 01813 } else if (C == 0) { 01814 if (isCodeCompletionPoint(CurPtr-1)) { 01815 PP->CodeCompleteNaturalLanguage(); 01816 FormTokenWithChars(Result, CurPtr-1, tok::unknown); 01817 return cutOffLexing(); 01818 } 01819 01820 NulCharacter = CurPtr-1; 01821 } 01822 C = getAndAdvanceChar(CurPtr, Result); 01823 } 01824 01825 // If we are in C++11, lex the optional ud-suffix. 01826 if (getLangOpts().CPlusPlus) 01827 CurPtr = LexUDSuffix(Result, CurPtr); 01828 01829 // If a nul character existed in the character, warn about it. 01830 if (NulCharacter && !isLexingRawMode()) 01831 Diag(NulCharacter, diag::null_in_char); 01832 01833 // Update the location of token as well as BufferPtr. 01834 const char *TokStart = BufferPtr; 01835 FormTokenWithChars(Result, CurPtr, Kind); 01836 Result.setLiteralData(TokStart); 01837 } 01838 01839 /// SkipWhitespace - Efficiently skip over a series of whitespace characters. 01840 /// Update BufferPtr to point to the next non-whitespace character and return. 01841 /// 01842 /// This method forms a token and returns true if KeepWhitespaceMode is enabled. 01843 /// 01844 bool Lexer::SkipWhitespace(Token &Result, const char *CurPtr) { 01845 // Whitespace - Skip it, then return the token after the whitespace. 01846 unsigned char Char = *CurPtr; // Skip consequtive spaces efficiently. 01847 while (1) { 01848 // Skip horizontal whitespace very aggressively. 01849 while (isHorizontalWhitespace(Char)) 01850 Char = *++CurPtr; 01851 01852 // Otherwise if we have something other than whitespace, we're done. 01853 if (Char != '\n' && Char != '\r') 01854 break; 01855 01856 if (ParsingPreprocessorDirective) { 01857 // End of preprocessor directive line, let LexTokenInternal handle this. 01858 BufferPtr = CurPtr; 01859 return false; 01860 } 01861 01862 // ok, but handle newline. 01863 // The returned token is at the start of the line. 01864 Result.setFlag(Token::StartOfLine); 01865 // No leading whitespace seen so far. 01866 Result.clearFlag(Token::LeadingSpace); 01867 Char = *++CurPtr; 01868 } 01869 01870 // If this isn't immediately after a newline, there is leading space. 01871 char PrevChar = CurPtr[-1]; 01872 if (PrevChar != '\n' && PrevChar != '\r') 01873 Result.setFlag(Token::LeadingSpace); 01874 01875 // If the client wants us to return whitespace, return it now. 01876 if (isKeepWhitespaceMode()) { 01877 FormTokenWithChars(Result, CurPtr, tok::unknown); 01878 return true; 01879 } 01880 01881 BufferPtr = CurPtr; 01882 return false; 01883 } 01884 01885 // SkipBCPLComment - We have just read the // characters from input. Skip until 01886 // we find the newline character thats terminate the comment. Then update 01887 /// BufferPtr and return. 01888 /// 01889 /// If we're in KeepCommentMode or any CommentHandler has inserted 01890 /// some tokens, this will store the first token and return true. 01891 bool Lexer::SkipBCPLComment(Token &Result, const char *CurPtr) { 01892 // If BCPL comments aren't explicitly enabled for this language, emit an 01893 // extension warning. 01894 if (!LangOpts.BCPLComment && !isLexingRawMode()) { 01895 Diag(BufferPtr, diag::ext_bcpl_comment); 01896 01897 // Mark them enabled so we only emit one warning for this translation 01898 // unit. 01899 LangOpts.BCPLComment = true; 01900 } 01901 01902 // Scan over the body of the comment. The common case, when scanning, is that 01903 // the comment contains normal ascii characters with nothing interesting in 01904 // them. As such, optimize for this case with the inner loop. 01905 char C; 01906 do { 01907 C = *CurPtr; 01908 // Skip over characters in the fast loop. 01909 while (C != 0 && // Potentially EOF. 01910 C != '\n' && C != '\r') // Newline or DOS-style newline. 01911 C = *++CurPtr; 01912 01913 const char *NextLine = CurPtr; 01914 if (C != 0) { 01915 // We found a newline, see if it's escaped. 01916 const char *EscapePtr = CurPtr-1; 01917 while (isHorizontalWhitespace(*EscapePtr)) // Skip whitespace. 01918 --EscapePtr; 01919 01920 if (*EscapePtr == '\\') // Escaped newline. 01921 CurPtr = EscapePtr; 01922 else if (EscapePtr[0] == '/' && EscapePtr[-1] == '?' && 01923 EscapePtr[-2] == '?') // Trigraph-escaped newline. 01924 CurPtr = EscapePtr-2; 01925 else 01926 break; // This is a newline, we're done. 01927 01928 C = *CurPtr; 01929 } 01930 01931 // Otherwise, this is a hard case. Fall back on getAndAdvanceChar to 01932 // properly decode the character. Read it in raw mode to avoid emitting 01933 // diagnostics about things like trigraphs. If we see an escaped newline, 01934 // we'll handle it below. 01935 const char *OldPtr = CurPtr; 01936 bool OldRawMode = isLexingRawMode(); 01937 LexingRawMode = true; 01938 C = getAndAdvanceChar(CurPtr, Result); 01939 LexingRawMode = OldRawMode; 01940 01941 // If we only read only one character, then no special handling is needed. 01942 // We're done and can skip forward to the newline. 01943 if (C != 0 && CurPtr == OldPtr+1) { 01944 CurPtr = NextLine; 01945 break; 01946 } 01947 01948 // If we read multiple characters, and one of those characters was a \r or 01949 // \n, then we had an escaped newline within the comment. Emit diagnostic 01950 // unless the next line is also a // comment. 01951 if (CurPtr != OldPtr+1 && C != '/' && CurPtr[0] != '/') { 01952 for (; OldPtr != CurPtr; ++OldPtr) 01953 if (OldPtr[0] == '\n' || OldPtr[0] == '\r') { 01954 // Okay, we found a // comment that ends in a newline, if the next 01955 // line is also a // comment, but has spaces, don't emit a diagnostic. 01956 if (isWhitespace(C)) { 01957 const char *ForwardPtr = CurPtr; 01958 while (isWhitespace(*ForwardPtr)) // Skip whitespace. 01959 ++ForwardPtr; 01960 if (ForwardPtr[0] == '/' && ForwardPtr[1] == '/') 01961 break; 01962 } 01963 01964 if (!isLexingRawMode()) 01965 Diag(OldPtr-1, diag::ext_multi_line_bcpl_comment); 01966 break; 01967 } 01968 } 01969 01970 if (CurPtr == BufferEnd+1) { 01971 --CurPtr; 01972 break; 01973 } 01974 01975 if (C == '\0' && isCodeCompletionPoint(CurPtr-1)) { 01976 PP->CodeCompleteNaturalLanguage(); 01977 cutOffLexing(); 01978 return false; 01979 } 01980 01981 } while (C != '\n' && C != '\r'); 01982 01983 // Found but did not consume the newline. Notify comment handlers about the 01984 // comment unless we're in a #if 0 block. 01985 if (PP && !isLexingRawMode() && 01986 PP->HandleComment(Result, SourceRange(getSourceLocation(BufferPtr), 01987 getSourceLocation(CurPtr)))) { 01988 BufferPtr = CurPtr; 01989 return true; // A token has to be returned. 01990 } 01991 01992 // If we are returning comments as tokens, return this comment as a token. 01993 if (inKeepCommentMode()) 01994 return SaveBCPLComment(Result, CurPtr); 01995 01996 // If we are inside a preprocessor directive and we see the end of line, 01997 // return immediately, so that the lexer can return this as an EOD token. 01998 if (ParsingPreprocessorDirective || CurPtr == BufferEnd) { 01999 BufferPtr = CurPtr; 02000 return false; 02001 } 02002 02003 // Otherwise, eat the \n character. We don't care if this is a \n\r or 02004 // \r\n sequence. This is an efficiency hack (because we know the \n can't 02005 // contribute to another token), it isn't needed for correctness. Note that 02006 // this is ok even in KeepWhitespaceMode, because we would have returned the 02007 /// comment above in that mode. 02008 ++CurPtr; 02009 02010 // The next returned token is at the start of the line. 02011 Result.setFlag(Token::StartOfLine); 02012 // No leading whitespace seen so far. 02013 Result.clearFlag(Token::LeadingSpace); 02014 BufferPtr = CurPtr; 02015 return false; 02016 } 02017 02018 /// SaveBCPLComment - If in save-comment mode, package up this BCPL comment in 02019 /// an appropriate way and return it. 02020 bool Lexer::SaveBCPLComment(Token &Result, const char *CurPtr) { 02021 // If we're not in a preprocessor directive, just return the // comment 02022 // directly. 02023 FormTokenWithChars(Result, CurPtr, tok::comment); 02024 02025 if (!ParsingPreprocessorDirective) 02026 return true; 02027 02028 // If this BCPL-style comment is in a macro definition, transmogrify it into 02029 // a C-style block comment. 02030 bool Invalid = false; 02031 std::string Spelling = PP->getSpelling(Result, &Invalid); 02032 if (Invalid) 02033 return true; 02034 02035 assert(Spelling[0] == '/' && Spelling[1] == '/' && "Not bcpl comment?"); 02036 Spelling[1] = '*'; // Change prefix to "/*". 02037 Spelling += "*/"; // add suffix. 02038 02039 Result.setKind(tok::comment); 02040 PP->CreateString(&Spelling[0], Spelling.size(), Result, 02041 Result.getLocation(), Result.getLocation()); 02042 return true; 02043 } 02044 02045 /// isBlockCommentEndOfEscapedNewLine - Return true if the specified newline 02046 /// character (either \n or \r) is part of an escaped newline sequence. Issue a 02047 /// diagnostic if so. We know that the newline is inside of a block comment. 02048 static bool isEndOfBlockCommentWithEscapedNewLine(const char *CurPtr, 02049 Lexer *L) { 02050 assert(CurPtr[0] == '\n' || CurPtr[0] == '\r'); 02051 02052 // Back up off the newline. 02053 --CurPtr; 02054 02055 // If this is a two-character newline sequence, skip the other character. 02056 if (CurPtr[0] == '\n' || CurPtr[0] == '\r') { 02057 // \n\n or \r\r -> not escaped newline. 02058 if (CurPtr[0] == CurPtr[1]) 02059 return false; 02060 // \n\r or \r\n -> skip the newline. 02061 --CurPtr; 02062 } 02063 02064 // If we have horizontal whitespace, skip over it. We allow whitespace 02065 // between the slash and newline. 02066 bool HasSpace = false; 02067 while (isHorizontalWhitespace(*CurPtr) || *CurPtr == 0) { 02068 --CurPtr; 02069 HasSpace = true; 02070 } 02071 02072 // If we have a slash, we know this is an escaped newline. 02073 if (*CurPtr == '\\') { 02074 if (CurPtr[-1] != '*') return false; 02075 } else { 02076 // It isn't a slash, is it the ?? / trigraph? 02077 if (CurPtr[0] != '/' || CurPtr[-1] != '?' || CurPtr[-2] != '?' || 02078 CurPtr[-3] != '*') 02079 return false; 02080 02081 // This is the trigraph ending the comment. Emit a stern warning! 02082 CurPtr -= 2; 02083 02084 // If no trigraphs are enabled, warn that we ignored this trigraph and 02085 // ignore this * character. 02086 if (!L->getLangOpts().Trigraphs) { 02087 if (!L->isLexingRawMode()) 02088 L->Diag(CurPtr, diag::trigraph_ignored_block_comment); 02089 return false; 02090 } 02091 if (!L->isLexingRawMode()) 02092 L->Diag(CurPtr, diag::trigraph_ends_block_comment); 02093 } 02094 02095 // Warn about having an escaped newline between the */ characters. 02096 if (!L->isLexingRawMode()) 02097 L->Diag(CurPtr, diag::escaped_newline_block_comment_end); 02098 02099 // If there was space between the backslash and newline, warn about it. 02100 if (HasSpace && !L->isLexingRawMode()) 02101 L->Diag(CurPtr, diag::backslash_newline_space); 02102 02103 return true; 02104 } 02105 02106 #ifdef __SSE2__ 02107 #include <emmintrin.h> 02108 #elif __ALTIVEC__ 02109 #include <altivec.h> 02110 #undef bool 02111 #endif 02112 02113 /// SkipBlockComment - We have just read the /* characters from input. Read 02114 /// until we find the */ characters that terminate the comment. Note that we 02115 /// don't bother decoding trigraphs or escaped newlines in block comments, 02116 /// because they cannot cause the comment to end. The only thing that can 02117 /// happen is the comment could end with an escaped newline between the */ end 02118 /// of comment. 02119 /// 02120 /// If we're in KeepCommentMode or any CommentHandler has inserted 02121 /// some tokens, this will store the first token and return true. 02122 bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr) { 02123 // Scan one character past where we should, looking for a '/' character. Once 02124 // we find it, check to see if it was preceded by a *. This common 02125 // optimization helps people who like to put a lot of * characters in their 02126 // comments. 02127 02128 // The first character we get with newlines and trigraphs skipped to handle 02129 // the degenerate /*/ case below correctly if the * has an escaped newline 02130 // after it. 02131 unsigned CharSize; 02132 unsigned char C = getCharAndSize(CurPtr, CharSize); 02133 CurPtr += CharSize; 02134 if (C == 0 && CurPtr == BufferEnd+1) { 02135 if (!isLexingRawMode()) 02136 Diag(BufferPtr, diag::err_unterminated_block_comment); 02137 --CurPtr; 02138 02139 // KeepWhitespaceMode should return this broken comment as a token. Since 02140 // it isn't a well formed comment, just return it as an 'unknown' token. 02141 if (isKeepWhitespaceMode()) { 02142 FormTokenWithChars(Result, CurPtr, tok::unknown); 02143 return true; 02144 } 02145 02146 BufferPtr = CurPtr; 02147 return false; 02148 } 02149 02150 // Check to see if the first character after the '/*' is another /. If so, 02151 // then this slash does not end the block comment, it is part of it. 02152 if (C == '/') 02153 C = *CurPtr++; 02154 02155 while (1) { 02156 // Skip over all non-interesting characters until we find end of buffer or a 02157 // (probably ending) '/' character. 02158 if (CurPtr + 24 < BufferEnd && 02159 // If there is a code-completion point avoid the fast scan because it 02160 // doesn't check for '\0'. 02161 !(PP && PP->getCodeCompletionFileLoc() == FileLoc)) { 02162 // While not aligned to a 16-byte boundary. 02163 while (C != '/' && ((intptr_t)CurPtr & 0x0F) != 0) 02164 C = *CurPtr++; 02165 02166 if (C == '/') goto FoundSlash; 02167 02168 #ifdef __SSE2__ 02169 __m128i Slashes = _mm_set1_epi8('/'); 02170 while (CurPtr+16 <= BufferEnd) { 02171 int cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(*(__m128i*)CurPtr, Slashes)); 02172 if (cmp != 0) { 02173 // Adjust the pointer to point directly after the first slash. It's 02174 // not necessary to set C here, it will be overwritten at the end of 02175 // the outer loop. 02176 CurPtr += llvm::CountTrailingZeros_32(cmp) + 1; 02177 goto FoundSlash; 02178 } 02179 CurPtr += 16; 02180 } 02181 #elif __ALTIVEC__ 02182 __vector unsigned char Slashes = { 02183 '/', '/', '/', '/', '/', '/', '/', '/', 02184 '/', '/', '/', '/', '/', '/', '/', '/' 02185 }; 02186 while (CurPtr+16 <= BufferEnd && 02187 !vec_any_eq(*(vector unsigned char*)CurPtr, Slashes)) 02188 CurPtr += 16; 02189 #else 02190 // Scan for '/' quickly. Many block comments are very large. 02191 while (CurPtr[0] != '/' && 02192 CurPtr[1] != '/' && 02193 CurPtr[2] != '/' && 02194 CurPtr[3] != '/' && 02195 CurPtr+4 < BufferEnd) { 02196 CurPtr += 4; 02197 } 02198 #endif 02199 02200 // It has to be one of the bytes scanned, increment to it and read one. 02201 C = *CurPtr++; 02202 } 02203 02204 // Loop to scan the remainder. 02205 while (C != '/' && C != '\0') 02206 C = *CurPtr++; 02207 02208 if (C == '/') { 02209 FoundSlash: 02210 if (CurPtr[-2] == '*') // We found the final */. We're done! 02211 break; 02212 02213 if ((CurPtr[-2] == '\n' || CurPtr[-2] == '\r')) { 02214 if (isEndOfBlockCommentWithEscapedNewLine(CurPtr-2, this)) { 02215 // We found the final */, though it had an escaped newline between the 02216 // * and /. We're done! 02217 break; 02218 } 02219 } 02220 if (CurPtr[0] == '*' && CurPtr[1] != '/') { 02221 // If this is a /* inside of the comment, emit a warning. Don't do this 02222 // if this is a /*/, which will end the comment. This misses cases with 02223 // embedded escaped newlines, but oh well. 02224 if (!isLexingRawMode()) 02225 Diag(CurPtr-1, diag::warn_nested_block_comment); 02226 } 02227 } else if (C == 0 && CurPtr == BufferEnd+1) { 02228 if (!isLexingRawMode()) 02229 Diag(BufferPtr, diag::err_unterminated_block_comment); 02230 // Note: the user probably forgot a */. We could continue immediately 02231 // after the /*, but this would involve lexing a lot of what really is the 02232 // comment, which surely would confuse the parser. 02233 --CurPtr; 02234 02235 // KeepWhitespaceMode should return this broken comment as a token. Since 02236 // it isn't a well formed comment, just return it as an 'unknown' token. 02237 if (isKeepWhitespaceMode()) { 02238 FormTokenWithChars(Result, CurPtr, tok::unknown); 02239 return true; 02240 } 02241 02242 BufferPtr = CurPtr; 02243 return false; 02244 } else if (C == '\0' && isCodeCompletionPoint(CurPtr-1)) { 02245 PP->CodeCompleteNaturalLanguage(); 02246 cutOffLexing(); 02247 return false; 02248 } 02249 02250 C = *CurPtr++; 02251 } 02252 02253 // Notify comment handlers about the comment unless we're in a #if 0 block. 02254 if (PP && !isLexingRawMode() && 02255 PP->HandleComment(Result, SourceRange(getSourceLocation(BufferPtr), 02256 getSourceLocation(CurPtr)))) { 02257 BufferPtr = CurPtr; 02258 return true; // A token has to be returned. 02259 } 02260 02261 // If we are returning comments as tokens, return this comment as a token. 02262 if (inKeepCommentMode()) { 02263 FormTokenWithChars(Result, CurPtr, tok::comment); 02264 return true; 02265 } 02266 02267 // It is common for the tokens immediately after a /**/ comment to be 02268 // whitespace. Instead of going through the big switch, handle it 02269 // efficiently now. This is safe even in KeepWhitespaceMode because we would 02270 // have already returned above with the comment as a token. 02271 if (isHorizontalWhitespace(*CurPtr)) { 02272 Result.setFlag(Token::LeadingSpace); 02273 SkipWhitespace(Result, CurPtr+1); 02274 return false; 02275 } 02276 02277 // Otherwise, just return so that the next character will be lexed as a token. 02278 BufferPtr = CurPtr; 02279 Result.setFlag(Token::LeadingSpace); 02280 return false; 02281 } 02282 02283 //===----------------------------------------------------------------------===// 02284 // Primary Lexing Entry Points 02285 //===----------------------------------------------------------------------===// 02286 02287 /// ReadToEndOfLine - Read the rest of the current preprocessor line as an 02288 /// uninterpreted string. This switches the lexer out of directive mode. 02289 void Lexer::ReadToEndOfLine(SmallVectorImpl<char> *Result) { 02290 assert(ParsingPreprocessorDirective && ParsingFilename == false && 02291 "Must be in a preprocessing directive!"); 02292 Token Tmp; 02293 02294 // CurPtr - Cache BufferPtr in an automatic variable. 02295 const char *CurPtr = BufferPtr; 02296 while (1) { 02297 char Char = getAndAdvanceChar(CurPtr, Tmp); 02298 switch (Char) { 02299 default: 02300 if (Result) 02301 Result->push_back(Char); 02302 break; 02303 case 0: // Null. 02304 // Found end of file? 02305 if (CurPtr-1 != BufferEnd) { 02306 if (isCodeCompletionPoint(CurPtr-1)) { 02307 PP->CodeCompleteNaturalLanguage(); 02308 cutOffLexing(); 02309 return; 02310 } 02311 02312 // Nope, normal character, continue. 02313 if (Result) 02314 Result->push_back(Char); 02315 break; 02316 } 02317 // FALL THROUGH. 02318 case '\r': 02319 case '\n': 02320 // Okay, we found the end of the line. First, back up past the \0, \r, \n. 02321 assert(CurPtr[-1] == Char && "Trigraphs for newline?"); 02322 BufferPtr = CurPtr-1; 02323 02324 // Next, lex the character, which should handle the EOD transition. 02325 Lex(Tmp); 02326 if (Tmp.is(tok::code_completion)) { 02327 if (PP) 02328 PP->CodeCompleteNaturalLanguage(); 02329 Lex(Tmp); 02330 } 02331 assert(Tmp.is(tok::eod) && "Unexpected token!"); 02332 02333 // Finally, we're done; 02334 return; 02335 } 02336 } 02337 } 02338 02339 /// LexEndOfFile - CurPtr points to the end of this file. Handle this 02340 /// condition, reporting diagnostics and handling other edge cases as required. 02341 /// This returns true if Result contains a token, false if PP.Lex should be 02342 /// called again. 02343 bool Lexer::LexEndOfFile(Token &Result, const char *CurPtr) { 02344 // If we hit the end of the file while parsing a preprocessor directive, 02345 // end the preprocessor directive first. The next token returned will 02346 // then be the end of file. 02347 if (ParsingPreprocessorDirective) { 02348 // Done parsing the "line". 02349 ParsingPreprocessorDirective = false; 02350 // Update the location of token as well as BufferPtr. 02351 FormTokenWithChars(Result, CurPtr, tok::eod); 02352 02353 // Restore comment saving mode, in case it was disabled for directive. 02354 SetCommentRetentionState(PP->getCommentRetentionState()); 02355 return true; // Have a token. 02356 } 02357 02358 // If we are in raw mode, return this event as an EOF token. Let the caller 02359 // that put us in raw mode handle the event. 02360 if (isLexingRawMode()) { 02361 Result.startToken(); 02362 BufferPtr = BufferEnd; 02363 FormTokenWithChars(Result, BufferEnd, tok::eof); 02364 return true; 02365 } 02366 02367 // Issue diagnostics for unterminated #if and missing newline. 02368 02369 // If we are in a #if directive, emit an error. 02370 while (!ConditionalStack.empty()) { 02371 if (PP->getCodeCompletionFileLoc() != FileLoc) 02372 PP->Diag(ConditionalStack.back().IfLoc, 02373 diag::err_pp_unterminated_conditional); 02374 ConditionalStack.pop_back(); 02375 } 02376 02377 // C99 5.1.1.2p2: If the file is non-empty and didn't end in a newline, issue 02378 // a pedwarn. 02379 if (CurPtr != BufferStart && (CurPtr[-1] != '\n' && CurPtr[-1] != '\r')) 02380 Diag(BufferEnd, LangOpts.CPlusPlus0x ? // C++11 [lex.phases] 2.2 p2 02381 diag::warn_cxx98_compat_no_newline_eof : diag::ext_no_newline_eof) 02382 << FixItHint::CreateInsertion(getSourceLocation(BufferEnd), "\n"); 02383 02384 BufferPtr = CurPtr; 02385 02386 // Finally, let the preprocessor handle this. 02387 return PP->HandleEndOfFile(Result); 02388 } 02389 02390 /// isNextPPTokenLParen - Return 1 if the next unexpanded token lexed from 02391 /// the specified lexer will return a tok::l_paren token, 0 if it is something 02392 /// else and 2 if there are no more tokens in the buffer controlled by the 02393 /// lexer. 02394 unsigned Lexer::isNextPPTokenLParen() { 02395 assert(!LexingRawMode && "How can we expand a macro from a skipping buffer?"); 02396 02397 // Switch to 'skipping' mode. This will ensure that we can lex a token 02398 // without emitting diagnostics, disables macro expansion, and will cause EOF 02399 // to return an EOF token instead of popping the include stack. 02400 LexingRawMode = true; 02401 02402 // Save state that can be changed while lexing so that we can restore it. 02403 const char *TmpBufferPtr = BufferPtr; 02404 bool inPPDirectiveMode = ParsingPreprocessorDirective; 02405 02406 Token Tok; 02407 Tok.startToken(); 02408 LexTokenInternal(Tok); 02409 02410 // Restore state that may have changed. 02411 BufferPtr = TmpBufferPtr; 02412 ParsingPreprocessorDirective = inPPDirectiveMode; 02413 02414 // Restore the lexer back to non-skipping mode. 02415 LexingRawMode = false; 02416 02417 if (Tok.is(tok::eof)) 02418 return 2; 02419 return Tok.is(tok::l_paren); 02420 } 02421 02422 /// FindConflictEnd - Find the end of a version control conflict marker. 02423 static const char *FindConflictEnd(const char *CurPtr, const char *BufferEnd, 02424 ConflictMarkerKind CMK) { 02425 const char *Terminator = CMK == CMK_Perforce ? "<<<<\n" : ">>>>>>>"; 02426 size_t TermLen = CMK == CMK_Perforce ? 5 : 7; 02427 StringRef RestOfBuffer(CurPtr+TermLen, BufferEnd-CurPtr-TermLen); 02428 size_t Pos = RestOfBuffer.find(Terminator); 02429 while (Pos != StringRef::npos) { 02430 // Must occur at start of line. 02431 if (RestOfBuffer[Pos-1] != '\r' && 02432 RestOfBuffer[Pos-1] != '\n') { 02433 RestOfBuffer = RestOfBuffer.substr(Pos+TermLen); 02434 Pos = RestOfBuffer.find(Terminator); 02435 continue; 02436 } 02437 return RestOfBuffer.data()+Pos; 02438 } 02439 return 0; 02440 } 02441 02442 /// IsStartOfConflictMarker - If the specified pointer is the start of a version 02443 /// control conflict marker like '<<<<<<<', recognize it as such, emit an error 02444 /// and recover nicely. This returns true if it is a conflict marker and false 02445 /// if not. 02446 bool Lexer::IsStartOfConflictMarker(const char *CurPtr) { 02447 // Only a conflict marker if it starts at the beginning of a line. 02448 if (CurPtr != BufferStart && 02449 CurPtr[-1] != '\n' && CurPtr[-1] != '\r') 02450 return false; 02451 02452 // Check to see if we have <<<<<<< or >>>>. 02453 if ((BufferEnd-CurPtr < 8 || StringRef(CurPtr, 7) != "<<<<<<<") && 02454 (BufferEnd-CurPtr < 6 || StringRef(CurPtr, 5) != ">>>> ")) 02455 return false; 02456 02457 // If we have a situation where we don't care about conflict markers, ignore 02458 // it. 02459 if (CurrentConflictMarkerState || isLexingRawMode()) 02460 return false; 02461 02462 ConflictMarkerKind Kind = *CurPtr == '<' ? CMK_Normal : CMK_Perforce; 02463 02464 // Check to see if there is an ending marker somewhere in the buffer at the 02465 // start of a line to terminate this conflict marker. 02466 if (FindConflictEnd(CurPtr, BufferEnd, Kind)) { 02467 // We found a match. We are really in a conflict marker. 02468 // Diagnose this, and ignore to the end of line. 02469 Diag(CurPtr, diag::err_conflict_marker); 02470 CurrentConflictMarkerState = Kind; 02471 02472 // Skip ahead to the end of line. We know this exists because the 02473 // end-of-conflict marker starts with \r or \n. 02474 while (*CurPtr != '\r' && *CurPtr != '\n') { 02475 assert(CurPtr != BufferEnd && "Didn't find end of line"); 02476 ++CurPtr; 02477 } 02478 BufferPtr = CurPtr; 02479 return true; 02480 } 02481 02482 // No end of conflict marker found. 02483 return false; 02484 } 02485 02486 02487 /// HandleEndOfConflictMarker - If this is a '====' or '||||' or '>>>>', or if 02488 /// it is '<<<<' and the conflict marker started with a '>>>>' marker, then it 02489 /// is the end of a conflict marker. Handle it by ignoring up until the end of 02490 /// the line. This returns true if it is a conflict marker and false if not. 02491 bool Lexer::HandleEndOfConflictMarker(const char *CurPtr) { 02492 // Only a conflict marker if it starts at the beginning of a line. 02493 if (CurPtr != BufferStart && 02494 CurPtr[-1] != '\n' && CurPtr[-1] != '\r') 02495 return false; 02496 02497 // If we have a situation where we don't care about conflict markers, ignore 02498 // it. 02499 if (!CurrentConflictMarkerState || isLexingRawMode()) 02500 return false; 02501 02502 // Check to see if we have the marker (4 characters in a row). 02503 for (unsigned i = 1; i != 4; ++i) 02504 if (CurPtr[i] != CurPtr[0]) 02505 return false; 02506 02507 // If we do have it, search for the end of the conflict marker. This could 02508 // fail if it got skipped with a '#if 0' or something. Note that CurPtr might 02509 // be the end of conflict marker. 02510 if (const char *End = FindConflictEnd(CurPtr, BufferEnd, 02511 CurrentConflictMarkerState)) { 02512 CurPtr = End; 02513 02514 // Skip ahead to the end of line. 02515 while (CurPtr != BufferEnd && *CurPtr != '\r' && *CurPtr != '\n') 02516 ++CurPtr; 02517 02518 BufferPtr = CurPtr; 02519 02520 // No longer in the conflict marker. 02521 CurrentConflictMarkerState = CMK_None; 02522 return true; 02523 } 02524 02525 return false; 02526 } 02527 02528 bool Lexer::isCodeCompletionPoint(const char *CurPtr) const { 02529 if (PP && PP->isCodeCompletionEnabled()) { 02530 SourceLocation Loc = FileLoc.getLocWithOffset(CurPtr-BufferStart); 02531 return Loc == PP->getCodeCompletionLoc(); 02532 } 02533 02534 return false; 02535 } 02536 02537 02538 /// LexTokenInternal - This implements a simple C family lexer. It is an 02539 /// extremely performance critical piece of code. This assumes that the buffer 02540 /// has a null character at the end of the file. This returns a preprocessing 02541 /// token, not a normal token, as such, it is an internal interface. It assumes 02542 /// that the Flags of result have been cleared before calling this. 02543 void Lexer::LexTokenInternal(Token &Result) { 02544 LexNextToken: 02545 // New token, can't need cleaning yet. 02546 Result.clearFlag(Token::NeedsCleaning); 02547 Result.setIdentifierInfo(0); 02548 02549 // CurPtr - Cache BufferPtr in an automatic variable. 02550 const char *CurPtr = BufferPtr; 02551 02552 // Small amounts of horizontal whitespace is very common between tokens. 02553 if ((*CurPtr == ' ') || (*CurPtr == '\t')) { 02554 ++CurPtr; 02555 while ((*CurPtr == ' ') || (*CurPtr == '\t')) 02556 ++CurPtr; 02557 02558 // If we are keeping whitespace and other tokens, just return what we just 02559 // skipped. The next lexer invocation will return the token after the 02560 // whitespace. 02561 if (isKeepWhitespaceMode()) { 02562 FormTokenWithChars(Result, CurPtr, tok::unknown); 02563 return; 02564 } 02565 02566 BufferPtr = CurPtr; 02567 Result.setFlag(Token::LeadingSpace); 02568 } 02569 02570 unsigned SizeTmp, SizeTmp2; // Temporaries for use in cases below. 02571 02572 // Read a character, advancing over it. 02573 char Char = getAndAdvanceChar(CurPtr, Result); 02574 tok::TokenKind Kind; 02575 02576 switch (Char) { 02577 case 0: // Null. 02578 // Found end of file? 02579 if (CurPtr-1 == BufferEnd) { 02580 // Read the PP instance variable into an automatic variable, because 02581 // LexEndOfFile will often delete 'this'. 02582 Preprocessor *PPCache = PP; 02583 if (LexEndOfFile(Result, CurPtr-1)) // Retreat back into the file. 02584 return; // Got a token to return. 02585 assert(PPCache && "Raw buffer::LexEndOfFile should return a token"); 02586 return PPCache->Lex(Result); 02587 } 02588 02589 // Check if we are performing code completion. 02590 if (isCodeCompletionPoint(CurPtr-1)) { 02591 // Return the code-completion token. 02592 Result.startToken(); 02593 FormTokenWithChars(Result, CurPtr, tok::code_completion); 02594 return; 02595 } 02596 02597 if (!isLexingRawMode()) 02598 Diag(CurPtr-1, diag::null_in_file); 02599 Result.setFlag(Token::LeadingSpace); 02600 if (SkipWhitespace(Result, CurPtr)) 02601 return; // KeepWhitespaceMode 02602 02603 goto LexNextToken; // GCC isn't tail call eliminating. 02604 02605 case 26: // DOS & CP/M EOF: "^Z". 02606 // If we're in Microsoft extensions mode, treat this as end of file. 02607 if (LangOpts.MicrosoftExt) { 02608 // Read the PP instance variable into an automatic variable, because 02609 // LexEndOfFile will often delete 'this'. 02610 Preprocessor *PPCache = PP; 02611 if (LexEndOfFile(Result, CurPtr-1)) // Retreat back into the file. 02612 return; // Got a token to return. 02613 assert(PPCache && "Raw buffer::LexEndOfFile should return a token"); 02614 return PPCache->Lex(Result); 02615 } 02616 // If Microsoft extensions are disabled, this is just random garbage. 02617 Kind = tok::unknown; 02618 break; 02619 02620 case '\n': 02621 case '\r': 02622 // If we are inside a preprocessor directive and we see the end of line, 02623 // we know we are done with the directive, so return an EOD token. 02624 if (ParsingPreprocessorDirective) { 02625 // Done parsing the "line". 02626 ParsingPreprocessorDirective = false; 02627 02628 // Restore comment saving mode, in case it was disabled for directive. 02629 SetCommentRetentionState(PP->getCommentRetentionState()); 02630 02631 // Since we consumed a newline, we are back at the start of a line. 02632 IsAtStartOfLine = true; 02633 02634 Kind = tok::eod; 02635 break; 02636 } 02637 // The returned token is at the start of the line. 02638 Result.setFlag(Token::StartOfLine); 02639 // No leading whitespace seen so far. 02640 Result.clearFlag(Token::LeadingSpace); 02641 02642 if (SkipWhitespace(Result, CurPtr)) 02643 return; // KeepWhitespaceMode 02644 goto LexNextToken; // GCC isn't tail call eliminating. 02645 case ' ': 02646 case '\t': 02647 case '\f': 02648 case '\v': 02649 SkipHorizontalWhitespace: 02650 Result.setFlag(Token::LeadingSpace); 02651 if (SkipWhitespace(Result, CurPtr)) 02652 return; // KeepWhitespaceMode 02653 02654 SkipIgnoredUnits: 02655 CurPtr = BufferPtr; 02656 02657 // If the next token is obviously a // or /* */ comment, skip it efficiently 02658 // too (without going through the big switch stmt). 02659 if (CurPtr[0] == '/' && CurPtr[1] == '/' && !inKeepCommentMode() && 02660 LangOpts.BCPLComment && !LangOpts.TraditionalCPP) { 02661 if (SkipBCPLComment(Result, CurPtr+2)) 02662 return; // There is a token to return. 02663 goto SkipIgnoredUnits; 02664 } else if (CurPtr[0] == '/' && CurPtr[1] == '*' && !inKeepCommentMode()) { 02665 if (SkipBlockComment(Result, CurPtr+2)) 02666 return; // There is a token to return. 02667 goto SkipIgnoredUnits; 02668 } else if (isHorizontalWhitespace(*CurPtr)) { 02669 goto SkipHorizontalWhitespace; 02670 } 02671 goto LexNextToken; // GCC isn't tail call eliminating. 02672 02673 // C99 6.4.4.1: Integer Constants. 02674 // C99 6.4.4.2: Floating Constants. 02675 case '0': case '1': case '2': case '3': case '4': 02676 case '5': case '6': case '7': case '8': case '9': 02677 // Notify MIOpt that we read a non-whitespace/non-comment token. 02678 MIOpt.ReadToken(); 02679 return LexNumericConstant(Result, CurPtr); 02680 02681 case 'u': // Identifier (uber) or C++0x UTF-8 or UTF-16 string literal 02682 // Notify MIOpt that we read a non-whitespace/non-comment token. 02683 MIOpt.ReadToken(); 02684 02685 if (LangOpts.CPlusPlus0x) { 02686 Char = getCharAndSize(CurPtr, SizeTmp); 02687 02688 // UTF-16 string literal 02689 if (Char == '"') 02690 return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result), 02691 tok::utf16_string_literal); 02692 02693 // UTF-16 character constant 02694 if (Char == '\'') 02695 return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result), 02696 tok::utf16_char_constant); 02697 02698 // UTF-16 raw string literal 02699 if (Char == 'R' && getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"') 02700 return LexRawStringLiteral(Result, 02701 ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 02702 SizeTmp2, Result), 02703 tok::utf16_string_literal); 02704 02705 if (Char == '8') { 02706 char Char2 = getCharAndSize(CurPtr + SizeTmp, SizeTmp2); 02707 02708 // UTF-8 string literal 02709 if (Char2 == '"') 02710 return LexStringLiteral(Result, 02711 ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 02712 SizeTmp2, Result), 02713 tok::utf8_string_literal); 02714 02715 if (Char2 == 'R') { 02716 unsigned SizeTmp3; 02717 char Char3 = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3); 02718 // UTF-8 raw string literal 02719 if (Char3 == '"') { 02720 return LexRawStringLiteral(Result, 02721 ConsumeChar(ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 02722 SizeTmp2, Result), 02723 SizeTmp3, Result), 02724 tok::utf8_string_literal); 02725 } 02726 } 02727 } 02728 } 02729 02730 // treat u like the start of an identifier. 02731 return LexIdentifier(Result, CurPtr); 02732 02733 case 'U': // Identifier (Uber) or C++0x UTF-32 string literal 02734 // Notify MIOpt that we read a non-whitespace/non-comment token. 02735 MIOpt.ReadToken(); 02736 02737 if (LangOpts.CPlusPlus0x) { 02738 Char = getCharAndSize(CurPtr, SizeTmp); 02739 02740 // UTF-32 string literal 02741 if (Char == '"') 02742 return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result), 02743 tok::utf32_string_literal); 02744 02745 // UTF-32 character constant 02746 if (Char == '\'') 02747 return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result), 02748 tok::utf32_char_constant); 02749 02750 // UTF-32 raw string literal 02751 if (Char == 'R' && getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"') 02752 return LexRawStringLiteral(Result, 02753 ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 02754 SizeTmp2, Result), 02755 tok::utf32_string_literal); 02756 } 02757 02758 // treat U like the start of an identifier. 02759 return LexIdentifier(Result, CurPtr); 02760 02761 case 'R': // Identifier or C++0x raw string literal 02762 // Notify MIOpt that we read a non-whitespace/non-comment token. 02763 MIOpt.ReadToken(); 02764 02765 if (LangOpts.CPlusPlus0x) { 02766 Char = getCharAndSize(CurPtr, SizeTmp); 02767 02768 if (Char == '"') 02769 return LexRawStringLiteral(Result, 02770 ConsumeChar(CurPtr, SizeTmp, Result), 02771 tok::string_literal); 02772 } 02773 02774 // treat R like the start of an identifier. 02775 return LexIdentifier(Result, CurPtr); 02776 02777 case 'L': // Identifier (Loony) or wide literal (L'x' or L"xyz"). 02778 // Notify MIOpt that we read a non-whitespace/non-comment token. 02779 MIOpt.ReadToken(); 02780 Char = getCharAndSize(CurPtr, SizeTmp); 02781 02782 // Wide string literal. 02783 if (Char == '"') 02784 return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result), 02785 tok::wide_string_literal); 02786 02787 // Wide raw string literal. 02788 if (LangOpts.CPlusPlus0x && Char == 'R' && 02789 getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"') 02790 return LexRawStringLiteral(Result, 02791 ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 02792 SizeTmp2, Result), 02793 tok::wide_string_literal); 02794 02795 // Wide character constant. 02796 if (Char == '\'') 02797 return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result), 02798 tok::wide_char_constant); 02799 // FALL THROUGH, treating L like the start of an identifier. 02800 02801 // C99 6.4.2: Identifiers. 02802 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G': 02803 case 'H': case 'I': case 'J': case 'K': /*'L'*/case 'M': case 'N': 02804 case 'O': case 'P': case 'Q': /*'R'*/case 'S': case 'T': /*'U'*/ 02805 case 'V': case 'W': case 'X': case 'Y': case 'Z': 02806 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g': 02807 case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n': 02808 case 'o': case 'p': case 'q': case 'r': case 's': case 't': /*'u'*/ 02809 case 'v': case 'w': case 'x': case 'y': case 'z': 02810 case '_': 02811 // Notify MIOpt that we read a non-whitespace/non-comment token. 02812 MIOpt.ReadToken(); 02813 return LexIdentifier(Result, CurPtr); 02814 02815 case '$': // $ in identifiers. 02816 if (LangOpts.DollarIdents) { 02817 if (!isLexingRawMode()) 02818 Diag(CurPtr-1, diag::ext_dollar_in_identifier); 02819 // Notify MIOpt that we read a non-whitespace/non-comment token. 02820 MIOpt.ReadToken(); 02821 return LexIdentifier(Result, CurPtr); 02822 } 02823 02824 Kind = tok::unknown; 02825 break; 02826 02827 // C99 6.4.4: Character Constants. 02828 case '\'': 02829 // Notify MIOpt that we read a non-whitespace/non-comment token. 02830 MIOpt.ReadToken(); 02831 return LexCharConstant(Result, CurPtr, tok::char_constant); 02832 02833 // C99 6.4.5: String Literals. 02834 case '"': 02835 // Notify MIOpt that we read a non-whitespace/non-comment token. 02836 MIOpt.ReadToken(); 02837 return LexStringLiteral(Result, CurPtr, tok::string_literal); 02838 02839 // C99 6.4.6: Punctuators. 02840 case '?': 02841 Kind = tok::question; 02842 break; 02843 case '[': 02844 Kind = tok::l_square; 02845 break; 02846 case ']': 02847 Kind = tok::r_square; 02848 break; 02849 case '(': 02850 Kind = tok::l_paren; 02851 break; 02852 case ')': 02853 Kind = tok::r_paren; 02854 break; 02855 case '{': 02856 Kind = tok::l_brace; 02857 break; 02858 case '}': 02859 Kind = tok::r_brace; 02860 break; 02861 case '.': 02862 Char = getCharAndSize(CurPtr, SizeTmp); 02863 if (Char >= '0' && Char <= '9') { 02864 // Notify MIOpt that we read a non-whitespace/non-comment token. 02865 MIOpt.ReadToken(); 02866 02867 return LexNumericConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result)); 02868 } else if (LangOpts.CPlusPlus && Char == '*') { 02869 Kind = tok::periodstar; 02870 CurPtr += SizeTmp; 02871 } else if (Char == '.' && 02872 getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '.') { 02873 Kind = tok::ellipsis; 02874 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 02875 SizeTmp2, Result); 02876 } else { 02877 Kind = tok::period; 02878 } 02879 break; 02880 case '&': 02881 Char = getCharAndSize(CurPtr, SizeTmp); 02882 if (Char == '&') { 02883 Kind = tok::ampamp; 02884 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 02885 } else if (Char == '=') { 02886 Kind = tok::ampequal; 02887 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 02888 } else { 02889 Kind = tok::amp; 02890 } 02891 break; 02892 case '*': 02893 if (getCharAndSize(CurPtr, SizeTmp) == '=') { 02894 Kind = tok::starequal; 02895 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 02896 } else { 02897 Kind = tok::star; 02898 } 02899 break; 02900 case '+': 02901 Char = getCharAndSize(CurPtr, SizeTmp); 02902 if (Char == '+') { 02903 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 02904 Kind = tok::plusplus; 02905 } else if (Char == '=') { 02906 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 02907 Kind = tok::plusequal; 02908 } else { 02909 Kind = tok::plus; 02910 } 02911 break; 02912 case '-': 02913 Char = getCharAndSize(CurPtr, SizeTmp); 02914 if (Char == '-') { // -- 02915 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 02916 Kind = tok::minusminus; 02917 } else if (Char == '>' && LangOpts.CPlusPlus && 02918 getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '*') { // C++ ->* 02919 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 02920 SizeTmp2, Result); 02921 Kind = tok::arrowstar; 02922 } else if (Char == '>') { // -> 02923 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 02924 Kind = tok::arrow; 02925 } else if (Char == '=') { // -= 02926 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 02927 Kind = tok::minusequal; 02928 } else { 02929 Kind = tok::minus; 02930 } 02931 break; 02932 case '~': 02933 Kind = tok::tilde; 02934 break; 02935 case '!': 02936 if (getCharAndSize(CurPtr, SizeTmp) == '=') { 02937 Kind = tok::exclaimequal; 02938 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 02939 } else { 02940 Kind = tok::exclaim; 02941 } 02942 break; 02943 case '/': 02944 // 6.4.9: Comments 02945 Char = getCharAndSize(CurPtr, SizeTmp); 02946 if (Char == '/') { // BCPL comment. 02947 // Even if BCPL comments are disabled (e.g. in C89 mode), we generally 02948 // want to lex this as a comment. There is one problem with this though, 02949 // that in one particular corner case, this can change the behavior of the 02950 // resultant program. For example, In "foo //**/ bar", C89 would lex 02951 // this as "foo / bar" and langauges with BCPL comments would lex it as 02952 // "foo". Check to see if the character after the second slash is a '*'. 02953 // If so, we will lex that as a "/" instead of the start of a comment. 02954 // However, we never do this in -traditional-cpp mode. 02955 if ((LangOpts.BCPLComment || 02956 getCharAndSize(CurPtr+SizeTmp, SizeTmp2) != '*') && 02957 !LangOpts.TraditionalCPP) { 02958 if (SkipBCPLComment(Result, ConsumeChar(CurPtr, SizeTmp, Result))) 02959 return; // There is a token to return. 02960 02961 // It is common for the tokens immediately after a // comment to be 02962 // whitespace (indentation for the next line). Instead of going through 02963 // the big switch, handle it efficiently now. 02964 goto SkipIgnoredUnits; 02965 } 02966 } 02967 02968 if (Char == '*') { // /**/ comment. 02969 if (SkipBlockComment(Result, ConsumeChar(CurPtr, SizeTmp, Result))) 02970 return; // There is a token to return. 02971 goto LexNextToken; // GCC isn't tail call eliminating. 02972 } 02973 02974 if (Char == '=') { 02975 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 02976 Kind = tok::slashequal; 02977 } else { 02978 Kind = tok::slash; 02979 } 02980 break; 02981 case '%': 02982 Char = getCharAndSize(CurPtr, SizeTmp); 02983 if (Char == '=') { 02984 Kind = tok::percentequal; 02985 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 02986 } else if (LangOpts.Digraphs && Char == '>') { 02987 Kind = tok::r_brace; // '%>' -> '}' 02988 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 02989 } else if (LangOpts.Digraphs && Char == ':') { 02990 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 02991 Char = getCharAndSize(CurPtr, SizeTmp); 02992 if (Char == '%' && getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == ':') { 02993 Kind = tok::hashhash; // '%:%:' -> '##' 02994 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 02995 SizeTmp2, Result); 02996 } else if (Char == '@' && LangOpts.MicrosoftExt) {// %:@ -> #@ -> Charize 02997 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 02998 if (!isLexingRawMode()) 02999 Diag(BufferPtr, diag::ext_charize_microsoft); 03000 Kind = tok::hashat; 03001 } else { // '%:' -> '#' 03002 // We parsed a # character. If this occurs at the start of the line, 03003 // it's actually the start of a preprocessing directive. Callback to 03004 // the preprocessor to handle it. 03005 // FIXME: -fpreprocessed mode?? 03006 if (Result.isAtStartOfLine() && !LexingRawMode && !Is_PragmaLexer) { 03007 FormTokenWithChars(Result, CurPtr, tok::hash); 03008 PP->HandleDirective(Result); 03009 03010 // As an optimization, if the preprocessor didn't switch lexers, tail 03011 // recurse. 03012 if (PP->isCurrentLexer(this)) { 03013 // Start a new token. If this is a #include or something, the PP may 03014 // want us starting at the beginning of the line again. If so, set 03015 // the StartOfLine flag and clear LeadingSpace. 03016 if (IsAtStartOfLine) { 03017 Result.setFlag(Token::StartOfLine); 03018 Result.clearFlag(Token::LeadingSpace); 03019 IsAtStartOfLine = false; 03020 } 03021 goto LexNextToken; // GCC isn't tail call eliminating. 03022 } 03023 03024 return PP->Lex(Result); 03025 } 03026 03027 Kind = tok::hash; 03028 } 03029 } else { 03030 Kind = tok::percent; 03031 } 03032 break; 03033 case '<': 03034 Char = getCharAndSize(CurPtr, SizeTmp); 03035 if (ParsingFilename) { 03036 return LexAngledStringLiteral(Result, CurPtr); 03037 } else if (Char == '<') { 03038 char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2); 03039 if (After == '=') { 03040 Kind = tok::lesslessequal; 03041 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 03042 SizeTmp2, Result); 03043 } else if (After == '<' && IsStartOfConflictMarker(CurPtr-1)) { 03044 // If this is actually a '<<<<<<<' version control conflict marker, 03045 // recognize it as such and recover nicely. 03046 goto LexNextToken; 03047 } else if (After == '<' && HandleEndOfConflictMarker(CurPtr-1)) { 03048 // If this is '<<<<' and we're in a Perforce-style conflict marker, 03049 // ignore it. 03050 goto LexNextToken; 03051 } else if (LangOpts.CUDA && After == '<') { 03052 Kind = tok::lesslessless; 03053 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 03054 SizeTmp2, Result); 03055 } else { 03056 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 03057 Kind = tok::lessless; 03058 } 03059 } else if (Char == '=') { 03060 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 03061 Kind = tok::lessequal; 03062 } else if (LangOpts.Digraphs && Char == ':') { // '<:' -> '[' 03063 if (LangOpts.CPlusPlus0x && 03064 getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == ':') { 03065 // C++0x [lex.pptoken]p3: 03066 // Otherwise, if the next three characters are <:: and the subsequent 03067 // character is neither : nor >, the < is treated as a preprocessor 03068 // token by itself and not as the first character of the alternative 03069 // token <:. 03070 unsigned SizeTmp3; 03071 char After = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3); 03072 if (After != ':' && After != '>') { 03073 Kind = tok::less; 03074 if (!isLexingRawMode()) 03075 Diag(BufferPtr, diag::warn_cxx98_compat_less_colon_colon); 03076 break; 03077 } 03078 } 03079 03080 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 03081 Kind = tok::l_square; 03082 } else if (LangOpts.Digraphs && Char == '%') { // '<%' -> '{' 03083 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 03084 Kind = tok::l_brace; 03085 } else { 03086 Kind = tok::less; 03087 } 03088 break; 03089 case '>': 03090 Char = getCharAndSize(CurPtr, SizeTmp); 03091 if (Char == '=') { 03092 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 03093 Kind = tok::greaterequal; 03094 } else if (Char == '>') { 03095 char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2); 03096 if (After == '=') { 03097 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 03098 SizeTmp2, Result); 03099 Kind = tok::greatergreaterequal; 03100 } else if (After == '>' && IsStartOfConflictMarker(CurPtr-1)) { 03101 // If this is actually a '>>>>' conflict marker, recognize it as such 03102 // and recover nicely. 03103 goto LexNextToken; 03104 } else if (After == '>' && HandleEndOfConflictMarker(CurPtr-1)) { 03105 // If this is '>>>>>>>' and we're in a conflict marker, ignore it. 03106 goto LexNextToken; 03107 } else if (LangOpts.CUDA && After == '>') { 03108 Kind = tok::greatergreatergreater; 03109 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 03110 SizeTmp2, Result); 03111 } else { 03112 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 03113 Kind = tok::greatergreater; 03114 } 03115 03116 } else { 03117 Kind = tok::greater; 03118 } 03119 break; 03120 case '^': 03121 Char = getCharAndSize(CurPtr, SizeTmp); 03122 if (Char == '=') { 03123 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 03124 Kind = tok::caretequal; 03125 } else { 03126 Kind = tok::caret; 03127 } 03128 break; 03129 case '|': 03130 Char = getCharAndSize(CurPtr, SizeTmp); 03131 if (Char == '=') { 03132 Kind = tok::pipeequal; 03133 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 03134 } else if (Char == '|') { 03135 // If this is '|||||||' and we're in a conflict marker, ignore it. 03136 if (CurPtr[1] == '|' && HandleEndOfConflictMarker(CurPtr-1)) 03137 goto LexNextToken; 03138 Kind = tok::pipepipe; 03139 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 03140 } else { 03141 Kind = tok::pipe; 03142 } 03143 break; 03144 case ':': 03145 Char = getCharAndSize(CurPtr, SizeTmp); 03146 if (LangOpts.Digraphs && Char == '>') { 03147 Kind = tok::r_square; // ':>' -> ']' 03148 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 03149 } else if (LangOpts.CPlusPlus && Char == ':') { 03150 Kind = tok::coloncolon; 03151 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 03152 } else { 03153 Kind = tok::colon; 03154 } 03155 break; 03156 case ';': 03157 Kind = tok::semi; 03158 break; 03159 case '=': 03160 Char = getCharAndSize(CurPtr, SizeTmp); 03161 if (Char == '=') { 03162 // If this is '====' and we're in a conflict marker, ignore it. 03163 if (CurPtr[1] == '=' && HandleEndOfConflictMarker(CurPtr-1)) 03164 goto LexNextToken; 03165 03166 Kind = tok::equalequal; 03167 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 03168 } else { 03169 Kind = tok::equal; 03170 } 03171 break; 03172 case ',': 03173 Kind = tok::comma; 03174 break; 03175 case '#': 03176 Char = getCharAndSize(CurPtr, SizeTmp); 03177 if (Char == '#') { 03178 Kind = tok::hashhash; 03179 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 03180 } else if (Char == '@' && LangOpts.MicrosoftExt) { // #@ -> Charize 03181 Kind = tok::hashat; 03182 if (!isLexingRawMode()) 03183 Diag(BufferPtr, diag::ext_charize_microsoft); 03184 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 03185 } else { 03186 // We parsed a # character. If this occurs at the start of the line, 03187 // it's actually the start of a preprocessing directive. Callback to 03188 // the preprocessor to handle it. 03189 // FIXME: -fpreprocessed mode?? 03190 if (Result.isAtStartOfLine() && !LexingRawMode && !Is_PragmaLexer) { 03191 FormTokenWithChars(Result, CurPtr, tok::hash); 03192 PP->HandleDirective(Result); 03193 03194 // As an optimization, if the preprocessor didn't switch lexers, tail 03195 // recurse. 03196 if (PP->isCurrentLexer(this)) { 03197 // Start a new token. If this is a #include or something, the PP may 03198 // want us starting at the beginning of the line again. If so, set 03199 // the StartOfLine flag and clear LeadingSpace. 03200 if (IsAtStartOfLine) { 03201 Result.setFlag(Token::StartOfLine); 03202 Result.clearFlag(Token::LeadingSpace); 03203 IsAtStartOfLine = false; 03204 } 03205 goto LexNextToken; // GCC isn't tail call eliminating. 03206 } 03207 return PP->Lex(Result); 03208 } 03209 03210 Kind = tok::hash; 03211 } 03212 break; 03213 03214 case '@': 03215 // Objective C support. 03216 if (CurPtr[-1] == '@' && LangOpts.ObjC1) 03217 Kind = tok::at; 03218 else 03219 Kind = tok::unknown; 03220 break; 03221 03222 case '\\': 03223 // FIXME: UCN's. 03224 // FALL THROUGH. 03225 default: 03226 Kind = tok::unknown; 03227 break; 03228 } 03229 03230 // Notify MIOpt that we read a non-whitespace/non-comment token. 03231 MIOpt.ReadToken(); 03232 03233 // Update the location of token as well as BufferPtr. 03234 FormTokenWithChars(Result, CurPtr, Kind); 03235 }