clang API Documentation
00001 //===--- Lexer.cpp - C Language Family Lexer ------------------------------===// 00002 // 00003 // The LLVM Compiler Infrastructure 00004 // 00005 // This file is distributed under the University of Illinois Open Source 00006 // License. See LICENSE.TXT for details. 00007 // 00008 //===----------------------------------------------------------------------===// 00009 // 00010 // This file implements the Lexer and Token interfaces. 00011 // 00012 //===----------------------------------------------------------------------===// 00013 // 00014 // TODO: GCC Diagnostics emitted by the lexer: 00015 // PEDWARN: (form feed|vertical tab) in preprocessing directive 00016 // 00017 // Universal characters, unicode, char mapping: 00018 // WARNING: `%.*s' is not in NFKC 00019 // WARNING: `%.*s' is not in NFC 00020 // 00021 // Other: 00022 // TODO: Options to support: 00023 // -fexec-charset,-fwide-exec-charset 00024 // 00025 //===----------------------------------------------------------------------===// 00026 00027 #include "clang/Lex/Lexer.h" 00028 #include "clang/Lex/Preprocessor.h" 00029 #include "clang/Lex/LexDiagnostic.h" 00030 #include "clang/Basic/SourceManager.h" 00031 #include "llvm/Support/Compiler.h" 00032 #include "llvm/Support/MemoryBuffer.h" 00033 #include <cctype> 00034 using namespace clang; 00035 00036 static void InitCharacterInfo(); 00037 00038 //===----------------------------------------------------------------------===// 00039 // Token Class Implementation 00040 //===----------------------------------------------------------------------===// 00041 00042 /// isObjCAtKeyword - Return true if we have an ObjC keyword identifier. 00043 bool Token::isObjCAtKeyword(tok::ObjCKeywordKind objcKey) const { 00044 if (IdentifierInfo *II = getIdentifierInfo()) 00045 return II->getObjCKeywordID() == objcKey; 00046 return false; 00047 } 00048 00049 /// getObjCKeywordID - Return the ObjC keyword kind. 
00050 tok::ObjCKeywordKind Token::getObjCKeywordID() const { 00051 IdentifierInfo *specId = getIdentifierInfo(); 00052 return specId ? specId->getObjCKeywordID() : tok::objc_not_keyword; 00053 } 00054 00055 00056 //===----------------------------------------------------------------------===// 00057 // Lexer Class Implementation 00058 //===----------------------------------------------------------------------===// 00059 00060 void Lexer::InitLexer(const char *BufStart, const char *BufPtr, 00061 const char *BufEnd) { 00062 InitCharacterInfo(); 00063 00064 BufferStart = BufStart; 00065 BufferPtr = BufPtr; 00066 BufferEnd = BufEnd; 00067 00068 assert(BufEnd[0] == 0 && 00069 "We assume that the input buffer has a null character at the end" 00070 " to simplify lexing!"); 00071 00072 Is_PragmaLexer = false; 00073 IsInConflictMarker = false; 00074 00075 // Start of the file is a start of line. 00076 IsAtStartOfLine = true; 00077 00078 // We are not after parsing a #. 00079 ParsingPreprocessorDirective = false; 00080 00081 // We are not after parsing #include. 00082 ParsingFilename = false; 00083 00084 // We are not in raw mode. Raw mode disables diagnostics and interpretation 00085 // of tokens (e.g. identifiers, thus disabling macro expansion). It is used 00086 // to quickly lex the tokens of the buffer, e.g. when handling a "#if 0" block 00087 // or otherwise skipping over tokens. 00088 LexingRawMode = false; 00089 00090 // Default to not keeping comments. 00091 ExtendedTokenMode = 0; 00092 } 00093 00094 /// Lexer constructor - Create a new lexer object for the specified buffer 00095 /// with the specified preprocessor managing the lexing process. This lexer 00096 /// assumes that the associated file buffer and Preprocessor objects will 00097 /// outlive it, so it doesn't take ownership of either of them. 
00098 Lexer::Lexer(FileID FID, const llvm::MemoryBuffer *InputFile, Preprocessor &PP) 00099 : PreprocessorLexer(&PP, FID), 00100 FileLoc(PP.getSourceManager().getLocForStartOfFile(FID)), 00101 Features(PP.getLangOptions()) { 00102 00103 InitLexer(InputFile->getBufferStart(), InputFile->getBufferStart(), 00104 InputFile->getBufferEnd()); 00105 00106 // Default to keeping comments if the preprocessor wants them. 00107 SetCommentRetentionState(PP.getCommentRetentionState()); 00108 } 00109 00110 /// Lexer constructor - Create a new raw lexer object. This object is only 00111 /// suitable for calls to 'LexRawToken'. This lexer assumes that the text 00112 /// range will outlive it, so it doesn't take ownership of it. 00113 Lexer::Lexer(SourceLocation fileloc, const LangOptions &features, 00114 const char *BufStart, const char *BufPtr, const char *BufEnd) 00115 : FileLoc(fileloc), Features(features) { 00116 00117 InitLexer(BufStart, BufPtr, BufEnd); 00118 00119 // We *are* in raw mode. 00120 LexingRawMode = true; 00121 } 00122 00123 /// Lexer constructor - Create a new raw lexer object. This object is only 00124 /// suitable for calls to 'LexRawToken'. This lexer assumes that the text 00125 /// range will outlive it, so it doesn't take ownership of it. 00126 Lexer::Lexer(FileID FID, const llvm::MemoryBuffer *FromFile, 00127 const SourceManager &SM, const LangOptions &features) 00128 : FileLoc(SM.getLocForStartOfFile(FID)), Features(features) { 00129 00130 InitLexer(FromFile->getBufferStart(), FromFile->getBufferStart(), 00131 FromFile->getBufferEnd()); 00132 00133 // We *are* in raw mode. 00134 LexingRawMode = true; 00135 } 00136 00137 /// Create_PragmaLexer: Lexer constructor - Create a new lexer object for 00138 /// _Pragma expansion. This has a variety of magic semantics that this method 00139 /// sets up. It returns a new'd Lexer that must be delete'd when done. 
00140 /// 00141 /// On entrance to this routine, TokStartLoc is a macro location which has a 00142 /// spelling loc that indicates the bytes to be lexed for the token and an 00143 /// instantiation location that indicates where all lexed tokens should be 00144 /// "expanded from". 00145 /// 00146 /// FIXME: It would really be nice to make _Pragma just be a wrapper around a 00147 /// normal lexer that remaps tokens as they fly by. This would require making 00148 /// Preprocessor::Lex virtual. Given that, we could just dump in a magic lexer 00149 /// interface that could handle this stuff. This would pull GetMappedTokenLoc 00150 /// out of the critical path of the lexer! 00151 /// 00152 Lexer *Lexer::Create_PragmaLexer(SourceLocation SpellingLoc, 00153 SourceLocation InstantiationLocStart, 00154 SourceLocation InstantiationLocEnd, 00155 unsigned TokLen, Preprocessor &PP) { 00156 SourceManager &SM = PP.getSourceManager(); 00157 00158 // Create the lexer as if we were going to lex the file normally. 00159 FileID SpellingFID = SM.getFileID(SpellingLoc); 00160 const llvm::MemoryBuffer *InputFile = SM.getBuffer(SpellingFID); 00161 Lexer *L = new Lexer(SpellingFID, InputFile, PP); 00162 00163 // Now that the lexer is created, change the start/end locations so that we 00164 // just lex the subsection of the file that we want. This is lexing from a 00165 // scratch buffer. 00166 const char *StrData = SM.getCharacterData(SpellingLoc); 00167 00168 L->BufferPtr = StrData; 00169 L->BufferEnd = StrData+TokLen; 00170 assert(L->BufferEnd[0] == 0 && "Buffer is not nul terminated!"); 00171 00172 // Set the SourceLocation with the remapping information. This ensures that 00173 // GetMappedTokenLoc will remap the tokens as they are lexed. 
00174 L->FileLoc = SM.createInstantiationLoc(SM.getLocForStartOfFile(SpellingFID), 00175 InstantiationLocStart, 00176 InstantiationLocEnd, TokLen); 00177 00178 // Ensure that the lexer thinks it is inside a directive, so that end \n will 00179 // return an EOM token. 00180 L->ParsingPreprocessorDirective = true; 00181 00182 // This lexer really is for _Pragma. 00183 L->Is_PragmaLexer = true; 00184 return L; 00185 } 00186 00187 00188 /// Stringify - Convert the specified string into a C string, with surrounding 00189 /// ""'s, and with escaped \ and " characters. 00190 std::string Lexer::Stringify(const std::string &Str, bool Charify) { 00191 std::string Result = Str; 00192 char Quote = Charify ? '\'' : '"'; 00193 for (unsigned i = 0, e = Result.size(); i != e; ++i) { 00194 if (Result[i] == '\\' || Result[i] == Quote) { 00195 Result.insert(Result.begin()+i, '\\'); 00196 ++i; ++e; 00197 } 00198 } 00199 return Result; 00200 } 00201 00202 /// Stringify - Convert the specified string into a C string by escaping '\' 00203 /// and " characters. This does not add surrounding ""'s to the string. 00204 void Lexer::Stringify(llvm::SmallVectorImpl<char> &Str) { 00205 for (unsigned i = 0, e = Str.size(); i != e; ++i) { 00206 if (Str[i] == '\\' || Str[i] == '"') { 00207 Str.insert(Str.begin()+i, '\\'); 00208 ++i; ++e; 00209 } 00210 } 00211 } 00212 00213 static bool isWhitespace(unsigned char c); 00214 00215 /// MeasureTokenLength - Relex the token at the specified location and return 00216 /// its length in bytes in the input file. If the token needs cleaning (e.g. 00217 /// includes a trigraph or an escaped newline) then this count includes bytes 00218 /// that are part of that. 00219 unsigned Lexer::MeasureTokenLength(SourceLocation Loc, 00220 const SourceManager &SM, 00221 const LangOptions &LangOpts) { 00222 // TODO: this could be special cased for common tokens like identifiers, ')', 00223 // etc to make this faster, if it mattered. 
Just look at StrData[0] to handle 00224 // all obviously single-char tokens. This could use 00225 // Lexer::isObviouslySimpleCharacter for example to handle identifiers or 00226 // something. 00227 00228 // If this comes from a macro expansion, we really do want the macro name, not 00229 // the token this macro expanded to. 00230 Loc = SM.getInstantiationLoc(Loc); 00231 std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc); 00232 bool Invalid = false; 00233 llvm::StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid); 00234 if (Invalid) 00235 return 0; 00236 00237 const char *StrData = Buffer.data()+LocInfo.second; 00238 00239 if (isWhitespace(StrData[0])) 00240 return 0; 00241 00242 // Create a lexer starting at the beginning of this token. 00243 Lexer TheLexer(Loc, LangOpts, Buffer.begin(), StrData, Buffer.end()); 00244 TheLexer.SetCommentRetentionState(true); 00245 Token TheTok; 00246 TheLexer.LexFromRawLexer(TheTok); 00247 return TheTok.getLength(); 00248 } 00249 00250 //===----------------------------------------------------------------------===// 00251 // Character information. 00252 //===----------------------------------------------------------------------===// 00253 00254 enum { 00255 CHAR_HORZ_WS = 0x01, // ' ', '\t', '\f', '\v'. Note, no '\0' 00256 CHAR_VERT_WS = 0x02, // '\r', '\n' 00257 CHAR_LETTER = 0x04, // a-z,A-Z 00258 CHAR_NUMBER = 0x08, // 0-9 00259 CHAR_UNDER = 0x10, // _ 00260 CHAR_PERIOD = 0x20 // . 
00261 }; 00262 00263 // Statically initialize CharInfo table based on ASCII character set 00264 // Reference: FreeBSD 7.2 /usr/share/misc/ascii 00265 static const unsigned char CharInfo[256] = 00266 { 00267 // 0 NUL 1 SOH 2 STX 3 ETX 00268 // 4 EOT 5 ENQ 6 ACK 7 BEL 00269 0 , 0 , 0 , 0 , 00270 0 , 0 , 0 , 0 , 00271 // 8 BS 9 HT 10 NL 11 VT 00272 //12 NP 13 CR 14 SO 15 SI 00273 0 , CHAR_HORZ_WS, CHAR_VERT_WS, CHAR_HORZ_WS, 00274 CHAR_HORZ_WS, CHAR_VERT_WS, 0 , 0 , 00275 //16 DLE 17 DC1 18 DC2 19 DC3 00276 //20 DC4 21 NAK 22 SYN 23 ETB 00277 0 , 0 , 0 , 0 , 00278 0 , 0 , 0 , 0 , 00279 //24 CAN 25 EM 26 SUB 27 ESC 00280 //28 FS 29 GS 30 RS 31 US 00281 0 , 0 , 0 , 0 , 00282 0 , 0 , 0 , 0 , 00283 //32 SP 33 ! 34 " 35 # 00284 //36 $ 37 % 38 & 39 ' 00285 CHAR_HORZ_WS, 0 , 0 , 0 , 00286 0 , 0 , 0 , 0 , 00287 //40 ( 41 ) 42 * 43 + 00288 //44 , 45 - 46 . 47 / 00289 0 , 0 , 0 , 0 , 00290 0 , 0 , CHAR_PERIOD , 0 , 00291 //48 0 49 1 50 2 51 3 00292 //52 4 53 5 54 6 55 7 00293 CHAR_NUMBER , CHAR_NUMBER , CHAR_NUMBER , CHAR_NUMBER , 00294 CHAR_NUMBER , CHAR_NUMBER , CHAR_NUMBER , CHAR_NUMBER , 00295 //56 8 57 9 58 : 59 ; 00296 //60 < 61 = 62 > 63 ? 
00297 CHAR_NUMBER , CHAR_NUMBER , 0 , 0 , 00298 0 , 0 , 0 , 0 , 00299 //64 @ 65 A 66 B 67 C 00300 //68 D 69 E 70 F 71 G 00301 0 , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , 00302 CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , 00303 //72 H 73 I 74 J 75 K 00304 //76 L 77 M 78 N 79 O 00305 CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , 00306 CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , 00307 //80 P 81 Q 82 R 83 S 00308 //84 T 85 U 86 V 87 W 00309 CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , 00310 CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , 00311 //88 X 89 Y 90 Z 91 [ 00312 //92 \ 93 ] 94 ^ 95 _ 00313 CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , 0 , 00314 0 , 0 , 0 , CHAR_UNDER , 00315 //96 ` 97 a 98 b 99 c 00316 //100 d 101 e 102 f 103 g 00317 0 , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , 00318 CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , 00319 //104 h 105 i 106 j 107 k 00320 //108 l 109 m 110 n 111 o 00321 CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , 00322 CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , 00323 //112 p 113 q 114 r 115 s 00324 //116 t 117 u 118 v 119 w 00325 CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , 00326 CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , 00327 //120 x 121 y 122 z 123 { 00328 //124 | 125 } 126 ~ 127 DEL 00329 CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , 0 , 00330 0 , 0 , 0 , 0 00331 }; 00332 00333 static void InitCharacterInfo() { 00334 static bool isInited = false; 00335 if (isInited) return; 00336 // check the statically-initialized CharInfo table 00337 assert(CHAR_HORZ_WS == CharInfo[(int)' ']); 00338 assert(CHAR_HORZ_WS == CharInfo[(int)'\t']); 00339 assert(CHAR_HORZ_WS == CharInfo[(int)'\f']); 00340 assert(CHAR_HORZ_WS == CharInfo[(int)'\v']); 00341 assert(CHAR_VERT_WS == CharInfo[(int)'\n']); 00342 assert(CHAR_VERT_WS == CharInfo[(int)'\r']); 00343 assert(CHAR_UNDER == CharInfo[(int)'_']); 00344 assert(CHAR_PERIOD == CharInfo[(int)'.']); 
00345 for (unsigned i = 'a'; i <= 'z'; ++i) { 00346 assert(CHAR_LETTER == CharInfo[i]); 00347 assert(CHAR_LETTER == CharInfo[i+'A'-'a']); 00348 } 00349 for (unsigned i = '0'; i <= '9'; ++i) 00350 assert(CHAR_NUMBER == CharInfo[i]); 00351 00352 isInited = true; 00353 } 00354 00355 00356 /// isIdentifierBody - Return true if this is the body character of an 00357 /// identifier, which is [a-zA-Z0-9_]. 00358 static inline bool isIdentifierBody(unsigned char c) { 00359 return (CharInfo[c] & (CHAR_LETTER|CHAR_NUMBER|CHAR_UNDER)) ? true : false; 00360 } 00361 00362 /// isHorizontalWhitespace - Return true if this character is horizontal 00363 /// whitespace: ' ', '\t', '\f', '\v'. Note that this returns false for '\0'. 00364 static inline bool isHorizontalWhitespace(unsigned char c) { 00365 return (CharInfo[c] & CHAR_HORZ_WS) ? true : false; 00366 } 00367 00368 /// isWhitespace - Return true if this character is horizontal or vertical 00369 /// whitespace: ' ', '\t', '\f', '\v', '\n', '\r'. Note that this returns false 00370 /// for '\0'. 00371 static inline bool isWhitespace(unsigned char c) { 00372 return (CharInfo[c] & (CHAR_HORZ_WS|CHAR_VERT_WS)) ? true : false; 00373 } 00374 00375 /// isNumberBody - Return true if this is the body character of an 00376 /// preprocessing number, which is [a-zA-Z0-9_.]. 00377 static inline bool isNumberBody(unsigned char c) { 00378 return (CharInfo[c] & (CHAR_LETTER|CHAR_NUMBER|CHAR_UNDER|CHAR_PERIOD)) ? 00379 true : false; 00380 } 00381 00382 00383 //===----------------------------------------------------------------------===// 00384 // Diagnostics forwarding code. 00385 //===----------------------------------------------------------------------===// 00386 00387 /// GetMappedTokenLoc - If lexing out of a 'mapped buffer', where we pretend the 00388 /// lexer buffer was all instantiated at a single point, perform the mapping. 
00389 /// This is currently only used for _Pragma implementation, so it is the slow 00390 /// path of the hot getSourceLocation method. Do not allow it to be inlined. 00391 static DISABLE_INLINE SourceLocation GetMappedTokenLoc(Preprocessor &PP, 00392 SourceLocation FileLoc, 00393 unsigned CharNo, 00394 unsigned TokLen); 00395 static SourceLocation GetMappedTokenLoc(Preprocessor &PP, 00396 SourceLocation FileLoc, 00397 unsigned CharNo, unsigned TokLen) { 00398 assert(FileLoc.isMacroID() && "Must be an instantiation"); 00399 00400 // Otherwise, we're lexing "mapped tokens". This is used for things like 00401 // _Pragma handling. Combine the instantiation location of FileLoc with the 00402 // spelling location. 00403 SourceManager &SM = PP.getSourceManager(); 00404 00405 // Create a new SLoc which is expanded from Instantiation(FileLoc) but whose 00406 // characters come from spelling(FileLoc)+Offset. 00407 SourceLocation SpellingLoc = SM.getSpellingLoc(FileLoc); 00408 SpellingLoc = SpellingLoc.getFileLocWithOffset(CharNo); 00409 00410 // Figure out the expansion loc range, which is the range covered by the 00411 // original _Pragma(...) sequence. 00412 std::pair<SourceLocation,SourceLocation> II = 00413 SM.getImmediateInstantiationRange(FileLoc); 00414 00415 return SM.createInstantiationLoc(SpellingLoc, II.first, II.second, TokLen); 00416 } 00417 00418 /// getSourceLocation - Return a source location identifier for the specified 00419 /// offset in the current file. 00420 SourceLocation Lexer::getSourceLocation(const char *Loc, 00421 unsigned TokLen) const { 00422 assert(Loc >= BufferStart && Loc <= BufferEnd && 00423 "Location out of range for this buffer!"); 00424 00425 // In the normal case, we're just lexing from a simple file buffer, return 00426 // the file id from FileLoc with the offset specified. 
00427 unsigned CharNo = Loc-BufferStart; 00428 if (FileLoc.isFileID()) 00429 return FileLoc.getFileLocWithOffset(CharNo); 00430 00431 // Otherwise, this is the _Pragma lexer case, which pretends that all of the 00432 // tokens are lexed from where the _Pragma was defined. 00433 assert(PP && "This doesn't work on raw lexers"); 00434 return GetMappedTokenLoc(*PP, FileLoc, CharNo, TokLen); 00435 } 00436 00437 /// Diag - Forwarding function for diagnostics. This translate a source 00438 /// position in the current buffer into a SourceLocation object for rendering. 00439 DiagnosticBuilder Lexer::Diag(const char *Loc, unsigned DiagID) const { 00440 return PP->Diag(getSourceLocation(Loc), DiagID); 00441 } 00442 00443 //===----------------------------------------------------------------------===// 00444 // Trigraph and Escaped Newline Handling Code. 00445 //===----------------------------------------------------------------------===// 00446 00447 /// GetTrigraphCharForLetter - Given a character that occurs after a ?? pair, 00448 /// return the decoded trigraph letter it corresponds to, or '\0' if nothing. 00449 static char GetTrigraphCharForLetter(char Letter) { 00450 switch (Letter) { 00451 default: return 0; 00452 case '=': return '#'; 00453 case ')': return ']'; 00454 case '(': return '['; 00455 case '!': return '|'; 00456 case '\'': return '^'; 00457 case '>': return '}'; 00458 case '/': return '\\'; 00459 case '<': return '{'; 00460 case '-': return '~'; 00461 } 00462 } 00463 00464 /// DecodeTrigraphChar - If the specified character is a legal trigraph when 00465 /// prefixed with ??, emit a trigraph warning. If trigraphs are enabled, 00466 /// return the result character. Finally, emit a warning about trigraph use 00467 /// whether trigraphs are enabled or not. 
00468 static char DecodeTrigraphChar(const char *CP, Lexer *L) { 00469 char Res = GetTrigraphCharForLetter(*CP); 00470 if (!Res || !L) return Res; 00471 00472 if (!L->getFeatures().Trigraphs) { 00473 if (!L->isLexingRawMode()) 00474 L->Diag(CP-2, diag::trigraph_ignored); 00475 return 0; 00476 } 00477 00478 if (!L->isLexingRawMode()) 00479 L->Diag(CP-2, diag::trigraph_converted) << std::string()+Res; 00480 return Res; 00481 } 00482 00483 /// getEscapedNewLineSize - Return the size of the specified escaped newline, 00484 /// or 0 if it is not an escaped newline. P[-1] is known to be a "\" or a 00485 /// trigraph equivalent on entry to this function. 00486 unsigned Lexer::getEscapedNewLineSize(const char *Ptr) { 00487 unsigned Size = 0; 00488 while (isWhitespace(Ptr[Size])) { 00489 ++Size; 00490 00491 if (Ptr[Size-1] != '\n' && Ptr[Size-1] != '\r') 00492 continue; 00493 00494 // If this is a \r\n or \n\r, skip the other half. 00495 if ((Ptr[Size] == '\r' || Ptr[Size] == '\n') && 00496 Ptr[Size-1] != Ptr[Size]) 00497 ++Size; 00498 00499 return Size; 00500 } 00501 00502 // Not an escaped newline, must be a \t or something else. 00503 return 0; 00504 } 00505 00506 /// SkipEscapedNewLines - If P points to an escaped newline (or a series of 00507 /// them), skip over them and return the first non-escaped-newline found, 00508 /// otherwise return P. 00509 const char *Lexer::SkipEscapedNewLines(const char *P) { 00510 while (1) { 00511 const char *AfterEscape; 00512 if (*P == '\\') { 00513 AfterEscape = P+1; 00514 } else if (*P == '?') { 00515 // If not a trigraph for escape, bail out. 00516 if (P[1] != '?' 
|| P[2] != '/') 00517 return P; 00518 AfterEscape = P+3; 00519 } else { 00520 return P; 00521 } 00522 00523 unsigned NewLineSize = Lexer::getEscapedNewLineSize(AfterEscape); 00524 if (NewLineSize == 0) return P; 00525 P = AfterEscape+NewLineSize; 00526 } 00527 } 00528 00529 00530 /// getCharAndSizeSlow - Peek a single 'character' from the specified buffer, 00531 /// get its size, and return it. This is tricky in several cases: 00532 /// 1. If currently at the start of a trigraph, we warn about the trigraph, 00533 /// then either return the trigraph (skipping 3 chars) or the '?', 00534 /// depending on whether trigraphs are enabled or not. 00535 /// 2. If this is an escaped newline (potentially with whitespace between 00536 /// the backslash and newline), implicitly skip the newline and return 00537 /// the char after it. 00538 /// 3. If this is a UCN, return it. FIXME: C++ UCN's? 00539 /// 00540 /// This handles the slow/uncommon case of the getCharAndSize method. Here we 00541 /// know that we can accumulate into Size, and that we have already incremented 00542 /// Ptr by Size bytes. 00543 /// 00544 /// NOTE: When this method is updated, getCharAndSizeSlowNoWarn (below) should 00545 /// be updated to match. 00546 /// 00547 char Lexer::getCharAndSizeSlow(const char *Ptr, unsigned &Size, 00548 Token *Tok) { 00549 // If we have a slash, look for an escaped newline. 00550 if (Ptr[0] == '\\') { 00551 ++Size; 00552 ++Ptr; 00553 Slash: 00554 // Common case, backslash-char where the char is not whitespace. 00555 if (!isWhitespace(Ptr[0])) return '\\'; 00556 00557 // See if we have optional whitespace characters between the slash and 00558 // newline. 00559 if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) { 00560 // Remember that this token needs to be cleaned. 00561 if (Tok) Tok->setFlag(Token::NeedsCleaning); 00562 00563 // Warn if there was whitespace between the backslash and newline. 
00564 if (Ptr[0] != '\n' && Ptr[0] != '\r' && Tok && !isLexingRawMode()) 00565 Diag(Ptr, diag::backslash_newline_space); 00566 00567 // Found backslash<whitespace><newline>. Parse the char after it. 00568 Size += EscapedNewLineSize; 00569 Ptr += EscapedNewLineSize; 00570 // Use slow version to accumulate a correct size field. 00571 return getCharAndSizeSlow(Ptr, Size, Tok); 00572 } 00573 00574 // Otherwise, this is not an escaped newline, just return the slash. 00575 return '\\'; 00576 } 00577 00578 // If this is a trigraph, process it. 00579 if (Ptr[0] == '?' && Ptr[1] == '?') { 00580 // If this is actually a legal trigraph (not something like "??x"), emit 00581 // a trigraph warning. If so, and if trigraphs are enabled, return it. 00582 if (char C = DecodeTrigraphChar(Ptr+2, Tok ? this : 0)) { 00583 // Remember that this token needs to be cleaned. 00584 if (Tok) Tok->setFlag(Token::NeedsCleaning); 00585 00586 Ptr += 3; 00587 Size += 3; 00588 if (C == '\\') goto Slash; 00589 return C; 00590 } 00591 } 00592 00593 // If this is neither, return a single character. 00594 ++Size; 00595 return *Ptr; 00596 } 00597 00598 00599 /// getCharAndSizeSlowNoWarn - Handle the slow/uncommon case of the 00600 /// getCharAndSizeNoWarn method. Here we know that we can accumulate into Size, 00601 /// and that we have already incremented Ptr by Size bytes. 00602 /// 00603 /// NOTE: When this method is updated, getCharAndSizeSlow (above) should 00604 /// be updated to match. 00605 char Lexer::getCharAndSizeSlowNoWarn(const char *Ptr, unsigned &Size, 00606 const LangOptions &Features) { 00607 // If we have a slash, look for an escaped newline. 00608 if (Ptr[0] == '\\') { 00609 ++Size; 00610 ++Ptr; 00611 Slash: 00612 // Common case, backslash-char where the char is not whitespace. 00613 if (!isWhitespace(Ptr[0])) return '\\'; 00614 00615 // See if we have optional whitespace characters followed by a newline. 
00616 if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) { 00617 // Found backslash<whitespace><newline>. Parse the char after it. 00618 Size += EscapedNewLineSize; 00619 Ptr += EscapedNewLineSize; 00620 00621 // Use slow version to accumulate a correct size field. 00622 return getCharAndSizeSlowNoWarn(Ptr, Size, Features); 00623 } 00624 00625 // Otherwise, this is not an escaped newline, just return the slash. 00626 return '\\'; 00627 } 00628 00629 // If this is a trigraph, process it. 00630 if (Features.Trigraphs && Ptr[0] == '?' && Ptr[1] == '?') { 00631 // If this is actually a legal trigraph (not something like "??x"), return 00632 // it. 00633 if (char C = GetTrigraphCharForLetter(Ptr[2])) { 00634 Ptr += 3; 00635 Size += 3; 00636 if (C == '\\') goto Slash; 00637 return C; 00638 } 00639 } 00640 00641 // If this is neither, return a single character. 00642 ++Size; 00643 return *Ptr; 00644 } 00645 00646 //===----------------------------------------------------------------------===// 00647 // Helper methods for lexing. 00648 //===----------------------------------------------------------------------===// 00649 00650 void Lexer::LexIdentifier(Token &Result, const char *CurPtr) { 00651 // Match [_A-Za-z0-9]*, we have already matched [_A-Za-z$] 00652 unsigned Size; 00653 unsigned char C = *CurPtr++; 00654 while (isIdentifierBody(C)) 00655 C = *CurPtr++; 00656 00657 --CurPtr; // Back up over the skipped character. 00658 00659 // Fast path, no $,\,? in identifier found. '\' might be an escaped newline 00660 // or UCN, and ? might be a trigraph for '\', an escaped newline or UCN. 00661 // FIXME: UCNs. 00662 // 00663 // TODO: Could merge these checks into a CharInfo flag to make the comparison 00664 // cheaper 00665 if (C != '\\' && C != '?' 
&& (C != '$' || !Features.DollarIdents)) { 00666 FinishIdentifier: 00667 const char *IdStart = BufferPtr; 00668 FormTokenWithChars(Result, CurPtr, tok::identifier); 00669 00670 // If we are in raw mode, return this identifier raw. There is no need to 00671 // look up identifier information or attempt to macro expand it. 00672 if (LexingRawMode) return; 00673 00674 // Fill in Result.IdentifierInfo, looking up the identifier in the 00675 // identifier table. 00676 IdentifierInfo *II = PP->LookUpIdentifierInfo(Result, IdStart); 00677 00678 // Change the kind of this identifier to the appropriate token kind, e.g. 00679 // turning "for" into a keyword. 00680 Result.setKind(II->getTokenID()); 00681 00682 // Finally, now that we know we have an identifier, pass this off to the 00683 // preprocessor, which may macro expand it or something. 00684 if (II->isHandleIdentifierCase()) 00685 PP->HandleIdentifier(Result); 00686 return; 00687 } 00688 00689 // Otherwise, $,\,? in identifier found. Enter slower path. 00690 00691 C = getCharAndSize(CurPtr, Size); 00692 while (1) { 00693 if (C == '$') { 00694 // If we hit a $ and they are not supported in identifiers, we are done. 00695 if (!Features.DollarIdents) goto FinishIdentifier; 00696 00697 // Otherwise, emit a diagnostic and continue. 00698 if (!isLexingRawMode()) 00699 Diag(CurPtr, diag::ext_dollar_in_identifier); 00700 CurPtr = ConsumeChar(CurPtr, Size, Result); 00701 C = getCharAndSize(CurPtr, Size); 00702 continue; 00703 } else if (!isIdentifierBody(C)) { // FIXME: UCNs. 00704 // Found end of identifier. 00705 goto FinishIdentifier; 00706 } 00707 00708 // Otherwise, this character is good, consume it. 00709 CurPtr = ConsumeChar(CurPtr, Size, Result); 00710 00711 C = getCharAndSize(CurPtr, Size); 00712 while (isIdentifierBody(C)) { // FIXME: UCNs. 
CurPtr = ConsumeChar(CurPtr, Size, Result);
      C = getCharAndSize(CurPtr, Size);
    }
  }
}


/// LexNumericConstant - Lex the remainder of an integer or floating point
/// constant.  CurPtr[-1] is the first character lexed.  Nothing is returned;
/// Result is formed as a tok::numeric_constant token ending at the end of the
/// constant, and its literal data is set to the start of the token.
void Lexer::LexNumericConstant(Token &Result, const char *CurPtr) {
  unsigned Size;
  char C = getCharAndSize(CurPtr, Size);
  char PrevCh = 0;
  while (isNumberBody(C)) { // FIXME: UCNs?
    CurPtr = ConsumeChar(CurPtr, Size, Result);
    PrevCh = C;
    C = getCharAndSize(CurPtr, Size);
  }

  // If we fell out, check for a sign, due to 1e+12.  If we have one, continue.
  if ((C == '-' || C == '+') && (PrevCh == 'E' || PrevCh == 'e'))
    return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));

  // If we have a hex FP constant, continue.  The sign after 'p'/'P' is only
  // absorbed when there is no preprocessor or C++0x mode is off.
  if ((C == '-' || C == '+') && (PrevCh == 'P' || PrevCh == 'p') &&
      (!PP || !PP->getLangOptions().CPlusPlus0x))
    return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));

  // Update the location of token as well as BufferPtr.
  const char *TokStart = BufferPtr;
  FormTokenWithChars(Result, CurPtr, tok::numeric_constant);
  Result.setLiteralData(TokStart);
}

/// LexStringLiteral - Lex the remainder of a string literal, after having lexed
/// either " or L".  An unterminated literal (newline or end of file before the
/// closing quote) is formed as a tok::unknown token instead.
void Lexer::LexStringLiteral(Token &Result, const char *CurPtr, bool Wide) {
  const char *NulCharacter = 0; // Does this string contain the \0 character?

  char C = getAndAdvanceChar(CurPtr, Result);
  while (C != '"') {
    // Skip escaped characters.
    if (C == '\\') {
      // Skip the escaped character.
      C = getAndAdvanceChar(CurPtr, Result);
    } else if (C == '\n' || C == '\r' ||             // Newline.
               (C == 0 && CurPtr-1 == BufferEnd)) {  // End of file.
      if (!isLexingRawMode() && !Features.AsmPreprocessor)
        Diag(BufferPtr, diag::err_unterminated_string);
      FormTokenWithChars(Result, CurPtr-1, tok::unknown);
      return;
    } else if (C == 0) {
      // A nul that is not the end-of-buffer sentinel: remember where it was.
      NulCharacter = CurPtr-1;
    }
    C = getAndAdvanceChar(CurPtr, Result);
  }

  // If a nul character existed in the string, warn about it.
  if (NulCharacter && !isLexingRawMode())
    Diag(NulCharacter, diag::null_in_string);

  // Update the location of the token as well as the BufferPtr instance var.
  const char *TokStart = BufferPtr;
  FormTokenWithChars(Result, CurPtr,
                     Wide ? tok::wide_string_literal : tok::string_literal);
  Result.setLiteralData(TokStart);
}

/// LexAngledStringLiteral - Lex the remainder of an angled string literal,
/// after having lexed the '<' character.  This is used for #include filenames.
/// If no closing '>' is found before a newline or end of file, only the '<'
/// is returned, as a tok::less token.
void Lexer::LexAngledStringLiteral(Token &Result, const char *CurPtr) {
  const char *NulCharacter = 0; // Does this string contain the \0 character?
  const char *AfterLessPos = CurPtr;
  char C = getAndAdvanceChar(CurPtr, Result);
  while (C != '>') {
    // Skip escaped characters.
    if (C == '\\') {
      // Skip the escaped character.
      C = getAndAdvanceChar(CurPtr, Result);
    } else if (C == '\n' || C == '\r' ||             // Newline.
               (C == 0 && CurPtr-1 == BufferEnd)) {  // End of file.
      // If the filename is unterminated, then it must just be a lone <
      // character.  Return this as such.
      FormTokenWithChars(Result, AfterLessPos, tok::less);
      return;
    } else if (C == 0) {
      NulCharacter = CurPtr-1;
    }
    C = getAndAdvanceChar(CurPtr, Result);
  }

  // If a nul character existed in the string, warn about it.
  if (NulCharacter && !isLexingRawMode())
    Diag(NulCharacter, diag::null_in_string);

  // Update the location of token as well as BufferPtr.
  const char *TokStart = BufferPtr;
  FormTokenWithChars(Result, CurPtr, tok::angle_string_literal);
  Result.setLiteralData(TokStart);
}


/// LexCharConstant - Lex the remainder of a character constant, after having
/// lexed either ' or L'.  Empty ('') and unterminated constants are formed as
/// tok::unknown tokens.
void Lexer::LexCharConstant(Token &Result, const char *CurPtr) {
  const char *NulCharacter = 0; // Does this character contain the \0 character?

  // Handle the common case of 'x' and '\y' efficiently.
  char C = getAndAdvanceChar(CurPtr, Result);
  if (C == '\'') {
    if (!isLexingRawMode() && !Features.AsmPreprocessor)
      Diag(BufferPtr, diag::err_empty_character);
    FormTokenWithChars(Result, CurPtr, tok::unknown);
    return;
  } else if (C == '\\') {
    // Skip the escaped character.
    // FIXME: UCN's.
    C = getAndAdvanceChar(CurPtr, Result);
  }

  if (C && C != '\n' && C != '\r' && CurPtr[0] == '\'') {
    ++CurPtr;
  } else {
    // Fall back on generic code for embedded nulls, newlines, wide chars.
    do {
      // Skip escaped characters.
      if (C == '\\') {
        // Skip the escaped character.
        C = getAndAdvanceChar(CurPtr, Result);
      } else if (C == '\n' || C == '\r' ||               // Newline.
                 (C == 0 && CurPtr-1 == BufferEnd)) {    // End of file.
        if (!isLexingRawMode() && !Features.AsmPreprocessor)
          Diag(BufferPtr, diag::err_unterminated_char);
        FormTokenWithChars(Result, CurPtr-1, tok::unknown);
        return;
      } else if (C == 0) {
        NulCharacter = CurPtr-1;
      }
      C = getAndAdvanceChar(CurPtr, Result);
    } while (C != '\'');
  }

  if (NulCharacter && !isLexingRawMode())
    Diag(NulCharacter, diag::null_in_char);

  // Update the location of token as well as BufferPtr.
  const char *TokStart = BufferPtr;
  FormTokenWithChars(Result, CurPtr, tok::char_constant);
  Result.setLiteralData(TokStart);
}

/// SkipWhitespace - Efficiently skip over a series of whitespace characters.
/// Update BufferPtr to point to the next non-whitespace character and return.
///
/// This method forms a token and returns true if KeepWhitespaceMode is enabled.
/// Inside a preprocessor directive it stops at the first newline and returns
/// false, leaving the newline for the caller to handle.
///
bool Lexer::SkipWhitespace(Token &Result, const char *CurPtr) {
  // Whitespace - Skip it, then return the token after the whitespace.
  unsigned char Char = *CurPtr;  // Skip consecutive spaces efficiently.
  while (1) {
    // Skip horizontal whitespace very aggressively.
    while (isHorizontalWhitespace(Char))
      Char = *++CurPtr;

    // Otherwise if we have something other than whitespace, we're done.
    if (Char != '\n' && Char != '\r')
      break;

    if (ParsingPreprocessorDirective) {
      // End of preprocessor directive line, let LexTokenInternal handle this.
      BufferPtr = CurPtr;
      return false;
    }

    // ok, but handle newline.
    // The returned token is at the start of the line.
    Result.setFlag(Token::StartOfLine);
    // No leading whitespace seen so far.
    Result.clearFlag(Token::LeadingSpace);
    Char = *++CurPtr;
  }

  // If this isn't immediately after a newline, there is leading space.
  char PrevChar = CurPtr[-1];
  if (PrevChar != '\n' && PrevChar != '\r')
    Result.setFlag(Token::LeadingSpace);

  // If the client wants us to return whitespace, return it now.
  if (isKeepWhitespaceMode()) {
    FormTokenWithChars(Result, CurPtr, tok::unknown);
    return true;
  }

  BufferPtr = CurPtr;
  return false;
}

/// SkipBCPLComment - We have just read the // characters from input.  Skip until
/// we find the newline character that terminates the comment.
Then update 00913 /// BufferPtr and return. 00914 /// 00915 /// If we're in KeepCommentMode or any CommentHandler has inserted 00916 /// some tokens, this will store the first token and return true. 00917 bool Lexer::SkipBCPLComment(Token &Result, const char *CurPtr) { 00918 // If BCPL comments aren't explicitly enabled for this language, emit an 00919 // extension warning. 00920 if (!Features.BCPLComment && !isLexingRawMode()) { 00921 Diag(BufferPtr, diag::ext_bcpl_comment); 00922 00923 // Mark them enabled so we only emit one warning for this translation 00924 // unit. 00925 Features.BCPLComment = true; 00926 } 00927 00928 // Scan over the body of the comment. The common case, when scanning, is that 00929 // the comment contains normal ascii characters with nothing interesting in 00930 // them. As such, optimize for this case with the inner loop. 00931 char C; 00932 do { 00933 C = *CurPtr; 00934 // FIXME: Speedup BCPL comment lexing. Just scan for a \n or \r character. 00935 // If we find a \n character, scan backwards, checking to see if it's an 00936 // escaped newline, like we do for block comments. 00937 00938 // Skip over characters in the fast loop. 00939 while (C != 0 && // Potentially EOF. 00940 C != '\\' && // Potentially escaped newline. 00941 C != '?' && // Potentially trigraph. 00942 C != '\n' && C != '\r') // Newline or DOS-style newline. 00943 C = *++CurPtr; 00944 00945 // If this is a newline, we're done. 00946 if (C == '\n' || C == '\r') 00947 break; // Found the newline? Break out! 00948 00949 // Otherwise, this is a hard case. Fall back on getAndAdvanceChar to 00950 // properly decode the character. Read it in raw mode to avoid emitting 00951 // diagnostics about things like trigraphs. If we see an escaped newline, 00952 // we'll handle it below. 
00953 const char *OldPtr = CurPtr; 00954 bool OldRawMode = isLexingRawMode(); 00955 LexingRawMode = true; 00956 C = getAndAdvanceChar(CurPtr, Result); 00957 LexingRawMode = OldRawMode; 00958 00959 // If the char that we finally got was a \n, then we must have had something 00960 // like <newline><newline>. We don't want to have consumed the second 00961 // newline, we want CurPtr, to end up pointing to it down below. 00962 if (C == '\n' || C == '\r') { 00963 --CurPtr; 00964 C = 'x'; // doesn't matter what this is. 00965 } 00966 00967 // If we read multiple characters, and one of those characters was a \r or 00968 // \n, then we had an escaped newline within the comment. Emit diagnostic 00969 // unless the next line is also a // comment. 00970 if (CurPtr != OldPtr+1 && C != '/' && CurPtr[0] != '/') { 00971 for (; OldPtr != CurPtr; ++OldPtr) 00972 if (OldPtr[0] == '\n' || OldPtr[0] == '\r') { 00973 // Okay, we found a // comment that ends in a newline, if the next 00974 // line is also a // comment, but has spaces, don't emit a diagnostic. 00975 if (isspace(C)) { 00976 const char *ForwardPtr = CurPtr; 00977 while (isspace(*ForwardPtr)) // Skip whitespace. 00978 ++ForwardPtr; 00979 if (ForwardPtr[0] == '/' && ForwardPtr[1] == '/') 00980 break; 00981 } 00982 00983 if (!isLexingRawMode()) 00984 Diag(OldPtr-1, diag::ext_multi_line_bcpl_comment); 00985 break; 00986 } 00987 } 00988 00989 if (CurPtr == BufferEnd+1) { --CurPtr; break; } 00990 } while (C != '\n' && C != '\r'); 00991 00992 // Found but did not consume the newline. Notify comment handlers about the 00993 // comment unless we're in a #if 0 block. 00994 if (PP && !isLexingRawMode() && 00995 PP->HandleComment(Result, SourceRange(getSourceLocation(BufferPtr), 00996 getSourceLocation(CurPtr)))) { 00997 BufferPtr = CurPtr; 00998 return true; // A token has to be returned. 00999 } 01000 01001 // If we are returning comments as tokens, return this comment as a token. 
01002 if (inKeepCommentMode()) 01003 return SaveBCPLComment(Result, CurPtr); 01004 01005 // If we are inside a preprocessor directive and we see the end of line, 01006 // return immediately, so that the lexer can return this as an EOM token. 01007 if (ParsingPreprocessorDirective || CurPtr == BufferEnd) { 01008 BufferPtr = CurPtr; 01009 return false; 01010 } 01011 01012 // Otherwise, eat the \n character. We don't care if this is a \n\r or 01013 // \r\n sequence. This is an efficiency hack (because we know the \n can't 01014 // contribute to another token), it isn't needed for correctness. Note that 01015 // this is ok even in KeepWhitespaceMode, because we would have returned the 01016 /// comment above in that mode. 01017 ++CurPtr; 01018 01019 // The next returned token is at the start of the line. 01020 Result.setFlag(Token::StartOfLine); 01021 // No leading whitespace seen so far. 01022 Result.clearFlag(Token::LeadingSpace); 01023 BufferPtr = CurPtr; 01024 return false; 01025 } 01026 01027 /// SaveBCPLComment - If in save-comment mode, package up this BCPL comment in 01028 /// an appropriate way and return it. 01029 bool Lexer::SaveBCPLComment(Token &Result, const char *CurPtr) { 01030 // If we're not in a preprocessor directive, just return the // comment 01031 // directly. 01032 FormTokenWithChars(Result, CurPtr, tok::comment); 01033 01034 if (!ParsingPreprocessorDirective) 01035 return true; 01036 01037 // If this BCPL-style comment is in a macro definition, transmogrify it into 01038 // a C-style block comment. 01039 bool Invalid = false; 01040 std::string Spelling = PP->getSpelling(Result, &Invalid); 01041 if (Invalid) 01042 return true; 01043 01044 assert(Spelling[0] == '/' && Spelling[1] == '/' && "Not bcpl comment?"); 01045 Spelling[1] = '*'; // Change prefix to "/*". 01046 Spelling += "*/"; // add suffix. 
Result.setKind(tok::comment);
  PP->CreateString(&Spelling[0], Spelling.size(), Result,
                   Result.getLocation());
  return true;
}

/// isEndOfBlockCommentWithEscapedNewLine - CurPtr points at a newline character
/// (either \n or \r) that immediately precedes a '/' inside a block comment.
/// Return true if that newline is an escaped newline sitting between the '*'
/// and the '/' of the comment terminator, i.e. the comment really ends here.
/// Issue diagnostics if so (unless L is in raw mode).
static bool isEndOfBlockCommentWithEscapedNewLine(const char *CurPtr,
                                                  Lexer *L) {
  assert(CurPtr[0] == '\n' || CurPtr[0] == '\r');

  // Back up off the newline.
  --CurPtr;

  // If this is a two-character newline sequence, skip the other character.
  if (CurPtr[0] == '\n' || CurPtr[0] == '\r') {
    // \n\n or \r\r -> not escaped newline.
    if (CurPtr[0] == CurPtr[1])
      return false;
    // \n\r or \r\n -> skip the newline.
    --CurPtr;
  }

  // If we have horizontal whitespace, skip over it.  We allow whitespace
  // between the backslash and newline.
  bool HasSpace = false;
  while (isHorizontalWhitespace(*CurPtr) || *CurPtr == 0) {
    --CurPtr;
    HasSpace = true;
  }

  // If we have a backslash, we know this is an escaped newline.
  if (*CurPtr == '\\') {
    // Only interesting if the escaped newline directly follows the '*'.
    if (CurPtr[-1] != '*') return false;
  } else {
    // It isn't a backslash, is it the ??/ trigraph?
    if (CurPtr[0] != '/' || CurPtr[-1] != '?' || CurPtr[-2] != '?' ||
        CurPtr[-3] != '*')
      return false;

    // This is the trigraph ending the comment.  Emit a stern warning!
    CurPtr -= 2;

    // If no trigraphs are enabled, warn that we ignored this trigraph and
    // ignore this * character.
    if (!L->getFeatures().Trigraphs) {
      if (!L->isLexingRawMode())
        L->Diag(CurPtr, diag::trigraph_ignored_block_comment);
      return false;
    }
    if (!L->isLexingRawMode())
      L->Diag(CurPtr, diag::trigraph_ends_block_comment);
  }

  // Warn about having an escaped newline between the */ characters.
  if (!L->isLexingRawMode())
    L->Diag(CurPtr, diag::escaped_newline_block_comment_end);

  // If there was space between the backslash and newline, warn about it.
  if (HasSpace && !L->isLexingRawMode())
    L->Diag(CurPtr, diag::backslash_newline_space);

  return true;
}

#ifdef __SSE2__
#include <emmintrin.h>
#elif __ALTIVEC__
#include <altivec.h>
#undef bool
#endif

/// SkipBlockComment - We have just read the /* characters from input.  Read
/// until we find the */ characters that terminate the comment.  Note that we
/// don't bother decoding trigraphs or escaped newlines in block comments,
/// because they cannot cause the comment to end.  The only thing that can
/// happen is the comment could end with an escaped newline between the */ end
/// of comment.
///
/// If we're in KeepCommentMode or any CommentHandler has inserted
/// some tokens, this will store the first token and return true.
/// An unterminated comment is diagnosed; it is returned as a tok::unknown
/// token (true) only in KeepWhitespaceMode, otherwise false is returned.
bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr) {
  // Scan one character past where we should, looking for a '/' character.  Once
  // we find it, check to see if it was preceded by a *.  This common
  // optimization helps people who like to put a lot of * characters in their
  // comments.

  // The first character we get with newlines and trigraphs skipped to handle
  // the degenerate /*/ case below correctly if the * has an escaped newline
  // after it.
  unsigned CharSize;
  unsigned char C = getCharAndSize(CurPtr, CharSize);
  CurPtr += CharSize;
  if (C == 0 && CurPtr == BufferEnd+1) {
    if (!isLexingRawMode())
      Diag(BufferPtr, diag::err_unterminated_block_comment);
    --CurPtr;

    // KeepWhitespaceMode should return this broken comment as a token.  Since
    // it isn't a well formed comment, just return it as an 'unknown' token.
    if (isKeepWhitespaceMode()) {
      FormTokenWithChars(Result, CurPtr, tok::unknown);
      return true;
    }

    BufferPtr = CurPtr;
    return false;
  }

  // Check to see if the first character after the '/*' is another /.  If so,
  // then this slash does not end the block comment, it is part of it.
  if (C == '/')
    C = *CurPtr++;

  while (1) {
    // Skip over all non-interesting characters until we find end of buffer or a
    // (probably ending) '/' character.
    if (CurPtr + 24 < BufferEnd) {
      // While not aligned to a 16-byte boundary.
      while (C != '/' && ((intptr_t)CurPtr & 0x0F) != 0)
        C = *CurPtr++;

      if (C == '/') goto FoundSlash;

#ifdef __SSE2__
      // Compare 16 bytes at a time against '/'; stop at the first chunk that
      // contains one.
      __m128i Slashes = _mm_set_epi8('/', '/', '/', '/', '/', '/', '/', '/',
                                     '/', '/', '/', '/', '/', '/', '/', '/');
      while (CurPtr+16 <= BufferEnd &&
             _mm_movemask_epi8(_mm_cmpeq_epi8(*(__m128i*)CurPtr, Slashes)) == 0)
        CurPtr += 16;
#elif __ALTIVEC__
      __vector unsigned char Slashes = {
        '/', '/', '/', '/',  '/', '/', '/', '/',
        '/', '/', '/', '/',  '/', '/', '/', '/'
      };
      while (CurPtr+16 <= BufferEnd &&
             !vec_any_eq(*(vector unsigned char*)CurPtr, Slashes))
        CurPtr += 16;
#else
      // Scan for '/' quickly.  Many block comments are very large.
      while (CurPtr[0] != '/' &&
             CurPtr[1] != '/' &&
             CurPtr[2] != '/' &&
             CurPtr[3] != '/' &&
             CurPtr+4 < BufferEnd) {
        CurPtr += 4;
      }
#endif

      // It has to be one of the bytes scanned, increment to it and read one.
      C = *CurPtr++;
    }

    // Loop to scan the remainder.
    while (C != '/' && C != '\0')
      C = *CurPtr++;

  FoundSlash:
    if (C == '/') {
      if (CurPtr[-2] == '*')  // We found the final */.  We're done!
        break;

      if ((CurPtr[-2] == '\n' || CurPtr[-2] == '\r')) {
        if (isEndOfBlockCommentWithEscapedNewLine(CurPtr-2, this)) {
          // We found the final */, though it had an escaped newline between the
          // * and /.  We're done!
          break;
        }
      }
      if (CurPtr[0] == '*' && CurPtr[1] != '/') {
        // If this is a /* inside of the comment, emit a warning.  Don't do this
        // if this is a /*/, which will end the comment.  This misses cases with
        // embedded escaped newlines, but oh well.
        if (!isLexingRawMode())
          Diag(CurPtr-1, diag::warn_nested_block_comment);
      }
    } else if (C == 0 && CurPtr == BufferEnd+1) {
      if (!isLexingRawMode())
        Diag(BufferPtr, diag::err_unterminated_block_comment);
      // Note: the user probably forgot a */.  We could continue immediately
      // after the /*, but this would involve lexing a lot of what really is the
      // comment, which surely would confuse the parser.
      --CurPtr;

      // KeepWhitespaceMode should return this broken comment as a token.  Since
      // it isn't a well formed comment, just return it as an 'unknown' token.
      if (isKeepWhitespaceMode()) {
        FormTokenWithChars(Result, CurPtr, tok::unknown);
        return true;
      }

      BufferPtr = CurPtr;
      return false;
    }
    C = *CurPtr++;
  }

  // Notify comment handlers about the comment unless we're in a #if 0 block.
  if (PP && !isLexingRawMode() &&
      PP->HandleComment(Result, SourceRange(getSourceLocation(BufferPtr),
                                            getSourceLocation(CurPtr)))) {
    BufferPtr = CurPtr;
    return true; // A token has to be returned.
  }

  // If we are returning comments as tokens, return this comment as a token.
  if (inKeepCommentMode()) {
    FormTokenWithChars(Result, CurPtr, tok::comment);
    return true;
  }

  // It is common for the tokens immediately after a /**/ comment to be
  // whitespace.  Instead of going through the big switch, handle it
  // efficiently now.  This is safe even in KeepWhitespaceMode because we would
  // have already returned above with the comment as a token.
  if (isHorizontalWhitespace(*CurPtr)) {
    Result.setFlag(Token::LeadingSpace);
    SkipWhitespace(Result, CurPtr+1);
    return false;
  }

  // Otherwise, just return so that the next character will be lexed as a token.
  BufferPtr = CurPtr;
  Result.setFlag(Token::LeadingSpace);
  return false;
}

//===----------------------------------------------------------------------===//
// Primary Lexing Entry Points
//===----------------------------------------------------------------------===//

/// ReadToEndOfLine - Read the rest of the current preprocessor line as an
/// uninterpreted string.  This switches the lexer out of directive mode.
std::string Lexer::ReadToEndOfLine() {
  assert(ParsingPreprocessorDirective && ParsingFilename == false &&
         "Must be in a preprocessing directive!");
  std::string Result;
  Token Tmp;

  // CurPtr - Cache BufferPtr in an automatic variable.
  const char *CurPtr = BufferPtr;
  while (1) {
    char Char = getAndAdvanceChar(CurPtr, Tmp);
    switch (Char) {
    default:
      Result += Char;
      break;
    case 0:  // Null.
      // Found end of file?
01299 if (CurPtr-1 != BufferEnd) { 01300 // Nope, normal character, continue. 01301 Result += Char; 01302 break; 01303 } 01304 // FALL THROUGH. 01305 case '\r': 01306 case '\n': 01307 // Okay, we found the end of the line. First, back up past the \0, \r, \n. 01308 assert(CurPtr[-1] == Char && "Trigraphs for newline?"); 01309 BufferPtr = CurPtr-1; 01310 01311 // Next, lex the character, which should handle the EOM transition. 01312 Lex(Tmp); 01313 assert(Tmp.is(tok::eom) && "Unexpected token!"); 01314 01315 // Finally, we're done, return the string we found. 01316 return Result; 01317 } 01318 } 01319 } 01320 01321 /// LexEndOfFile - CurPtr points to the end of this file. Handle this 01322 /// condition, reporting diagnostics and handling other edge cases as required. 01323 /// This returns true if Result contains a token, false if PP.Lex should be 01324 /// called again. 01325 bool Lexer::LexEndOfFile(Token &Result, const char *CurPtr) { 01326 // If we hit the end of the file while parsing a preprocessor directive, 01327 // end the preprocessor directive first. The next token returned will 01328 // then be the end of file. 01329 if (ParsingPreprocessorDirective) { 01330 // Done parsing the "line". 01331 ParsingPreprocessorDirective = false; 01332 // Update the location of token as well as BufferPtr. 01333 FormTokenWithChars(Result, CurPtr, tok::eom); 01334 01335 // Restore comment saving mode, in case it was disabled for directive. 01336 SetCommentRetentionState(PP->getCommentRetentionState()); 01337 return true; // Have a token. 01338 } 01339 01340 // If we are in raw mode, return this event as an EOF token. Let the caller 01341 // that put us in raw mode handle the event. 
01342 if (isLexingRawMode()) { 01343 Result.startToken(); 01344 BufferPtr = BufferEnd; 01345 FormTokenWithChars(Result, BufferEnd, tok::eof); 01346 return true; 01347 } 01348 01349 // Otherwise, check if we are code-completing, then issue diagnostics for 01350 // unterminated #if and missing newline. 01351 01352 if (PP && PP->isCodeCompletionFile(FileLoc)) { 01353 // We're at the end of the file, but we've been asked to consider the 01354 // end of the file to be a code-completion token. Return the 01355 // code-completion token. 01356 Result.startToken(); 01357 FormTokenWithChars(Result, CurPtr, tok::code_completion); 01358 01359 // Only do the eof -> code_completion translation once. 01360 PP->SetCodeCompletionPoint(0, 0, 0); 01361 return true; 01362 } 01363 01364 // If we are in a #if directive, emit an error. 01365 while (!ConditionalStack.empty()) { 01366 PP->Diag(ConditionalStack.back().IfLoc, 01367 diag::err_pp_unterminated_conditional); 01368 ConditionalStack.pop_back(); 01369 } 01370 01371 // C99 5.1.1.2p2: If the file is non-empty and didn't end in a newline, issue 01372 // a pedwarn. 01373 if (CurPtr != BufferStart && (CurPtr[-1] != '\n' && CurPtr[-1] != '\r')) 01374 Diag(BufferEnd, diag::ext_no_newline_eof) 01375 << FixItHint::CreateInsertion(getSourceLocation(BufferEnd), "\n"); 01376 01377 BufferPtr = CurPtr; 01378 01379 // Finally, let the preprocessor handle this. 01380 return PP->HandleEndOfFile(Result); 01381 } 01382 01383 /// isNextPPTokenLParen - Return 1 if the next unexpanded token lexed from 01384 /// the specified lexer will return a tok::l_paren token, 0 if it is something 01385 /// else and 2 if there are no more tokens in the buffer controlled by the 01386 /// lexer. 01387 unsigned Lexer::isNextPPTokenLParen() { 01388 assert(!LexingRawMode && "How can we expand a macro from a skipping buffer?"); 01389 01390 // Switch to 'skipping' mode. 
This will ensure that we can lex a token 01391 // without emitting diagnostics, disables macro expansion, and will cause EOF 01392 // to return an EOF token instead of popping the include stack. 01393 LexingRawMode = true; 01394 01395 // Save state that can be changed while lexing so that we can restore it. 01396 const char *TmpBufferPtr = BufferPtr; 01397 bool inPPDirectiveMode = ParsingPreprocessorDirective; 01398 01399 Token Tok; 01400 Tok.startToken(); 01401 LexTokenInternal(Tok); 01402 01403 // Restore state that may have changed. 01404 BufferPtr = TmpBufferPtr; 01405 ParsingPreprocessorDirective = inPPDirectiveMode; 01406 01407 // Restore the lexer back to non-skipping mode. 01408 LexingRawMode = false; 01409 01410 if (Tok.is(tok::eof)) 01411 return 2; 01412 return Tok.is(tok::l_paren); 01413 } 01414 01415 /// FindConflictEnd - Find the end of a version control conflict marker. 01416 static const char *FindConflictEnd(const char *CurPtr, const char *BufferEnd) { 01417 llvm::StringRef RestOfBuffer(CurPtr+7, BufferEnd-CurPtr-7); 01418 size_t Pos = RestOfBuffer.find(">>>>>>>"); 01419 while (Pos != llvm::StringRef::npos) { 01420 // Must occur at start of line. 01421 if (RestOfBuffer[Pos-1] != '\r' && 01422 RestOfBuffer[Pos-1] != '\n') { 01423 RestOfBuffer = RestOfBuffer.substr(Pos+7); 01424 continue; 01425 } 01426 return RestOfBuffer.data()+Pos; 01427 } 01428 return 0; 01429 } 01430 01431 /// IsStartOfConflictMarker - If the specified pointer is the start of a version 01432 /// control conflict marker like '<<<<<<<', recognize it as such, emit an error 01433 /// and recover nicely. This returns true if it is a conflict marker and false 01434 /// if not. 01435 bool Lexer::IsStartOfConflictMarker(const char *CurPtr) { 01436 // Only a conflict marker if it starts at the beginning of a line. 01437 if (CurPtr != BufferStart && 01438 CurPtr[-1] != '\n' && CurPtr[-1] != '\r') 01439 return false; 01440 01441 // Check to see if we have <<<<<<<. 
01442 if (BufferEnd-CurPtr < 8 || 01443 llvm::StringRef(CurPtr, 7) != "<<<<<<<") 01444 return false; 01445 01446 // If we have a situation where we don't care about conflict markers, ignore 01447 // it. 01448 if (IsInConflictMarker || isLexingRawMode()) 01449 return false; 01450 01451 // Check to see if there is a >>>>>>> somewhere in the buffer at the start of 01452 // a line to terminate this conflict marker. 01453 if (FindConflictEnd(CurPtr+7, BufferEnd)) { 01454 // We found a match. We are really in a conflict marker. 01455 // Diagnose this, and ignore to the end of line. 01456 Diag(CurPtr, diag::err_conflict_marker); 01457 IsInConflictMarker = true; 01458 01459 // Skip ahead to the end of line. We know this exists because the 01460 // end-of-conflict marker starts with \r or \n. 01461 while (*CurPtr != '\r' && *CurPtr != '\n') { 01462 assert(CurPtr != BufferEnd && "Didn't find end of line"); 01463 ++CurPtr; 01464 } 01465 BufferPtr = CurPtr; 01466 return true; 01467 } 01468 01469 // No end of conflict marker found. 01470 return false; 01471 } 01472 01473 01474 /// HandleEndOfConflictMarker - If this is a '=======' or '|||||||' or '>>>>>>>' 01475 /// marker, then it is the end of a conflict marker. Handle it by ignoring up 01476 /// until the end of the line. This returns true if it is a conflict marker and 01477 /// false if not. 01478 bool Lexer::HandleEndOfConflictMarker(const char *CurPtr) { 01479 // Only a conflict marker if it starts at the beginning of a line. 01480 if (CurPtr != BufferStart && 01481 CurPtr[-1] != '\n' && CurPtr[-1] != '\r') 01482 return false; 01483 01484 // If we have a situation where we don't care about conflict markers, ignore 01485 // it. 01486 if (!IsInConflictMarker || isLexingRawMode()) 01487 return false; 01488 01489 // Check to see if we have the marker (7 characters in a row). 
01490 for (unsigned i = 1; i != 7; ++i) 01491 if (CurPtr[i] != CurPtr[0]) 01492 return false; 01493 01494 // If we do have it, search for the end of the conflict marker. This could 01495 // fail if it got skipped with a '#if 0' or something. Note that CurPtr might 01496 // be the end of conflict marker. 01497 if (const char *End = FindConflictEnd(CurPtr, BufferEnd)) { 01498 CurPtr = End; 01499 01500 // Skip ahead to the end of line. 01501 while (CurPtr != BufferEnd && *CurPtr != '\r' && *CurPtr != '\n') 01502 ++CurPtr; 01503 01504 BufferPtr = CurPtr; 01505 01506 // No longer in the conflict marker. 01507 IsInConflictMarker = false; 01508 return true; 01509 } 01510 01511 return false; 01512 } 01513 01514 01515 /// LexTokenInternal - This implements a simple C family lexer. It is an 01516 /// extremely performance critical piece of code. This assumes that the buffer 01517 /// has a null character at the end of the file. This returns a preprocessing 01518 /// token, not a normal token, as such, it is an internal interface. It assumes 01519 /// that the Flags of result have been cleared before calling this. 01520 void Lexer::LexTokenInternal(Token &Result) { 01521 LexNextToken: 01522 // New token, can't need cleaning yet. 01523 Result.clearFlag(Token::NeedsCleaning); 01524 Result.setIdentifierInfo(0); 01525 01526 // CurPtr - Cache BufferPtr in an automatic variable. 01527 const char *CurPtr = BufferPtr; 01528 01529 // Small amounts of horizontal whitespace is very common between tokens. 01530 if ((*CurPtr == ' ') || (*CurPtr == '\t')) { 01531 ++CurPtr; 01532 while ((*CurPtr == ' ') || (*CurPtr == '\t')) 01533 ++CurPtr; 01534 01535 // If we are keeping whitespace and other tokens, just return what we just 01536 // skipped. The next lexer invocation will return the token after the 01537 // whitespace. 
01538 if (isKeepWhitespaceMode()) { 01539 FormTokenWithChars(Result, CurPtr, tok::unknown); 01540 return; 01541 } 01542 01543 BufferPtr = CurPtr; 01544 Result.setFlag(Token::LeadingSpace); 01545 } 01546 01547 unsigned SizeTmp, SizeTmp2; // Temporaries for use in cases below. 01548 01549 // Read a character, advancing over it. 01550 char Char = getAndAdvanceChar(CurPtr, Result); 01551 tok::TokenKind Kind; 01552 01553 switch (Char) { 01554 case 0: // Null. 01555 // Found end of file? 01556 if (CurPtr-1 == BufferEnd) { 01557 // Read the PP instance variable into an automatic variable, because 01558 // LexEndOfFile will often delete 'this'. 01559 Preprocessor *PPCache = PP; 01560 if (LexEndOfFile(Result, CurPtr-1)) // Retreat back into the file. 01561 return; // Got a token to return. 01562 assert(PPCache && "Raw buffer::LexEndOfFile should return a token"); 01563 return PPCache->Lex(Result); 01564 } 01565 01566 if (!isLexingRawMode()) 01567 Diag(CurPtr-1, diag::null_in_file); 01568 Result.setFlag(Token::LeadingSpace); 01569 if (SkipWhitespace(Result, CurPtr)) 01570 return; // KeepWhitespaceMode 01571 01572 goto LexNextToken; // GCC isn't tail call eliminating. 01573 01574 case 26: // DOS & CP/M EOF: "^Z". 01575 // If we're in Microsoft extensions mode, treat this as end of file. 01576 if (Features.Microsoft) { 01577 // Read the PP instance variable into an automatic variable, because 01578 // LexEndOfFile will often delete 'this'. 01579 Preprocessor *PPCache = PP; 01580 if (LexEndOfFile(Result, CurPtr-1)) // Retreat back into the file. 01581 return; // Got a token to return. 01582 assert(PPCache && "Raw buffer::LexEndOfFile should return a token"); 01583 return PPCache->Lex(Result); 01584 } 01585 // If Microsoft extensions are disabled, this is just random garbage. 
01586 Kind = tok::unknown; 01587 break; 01588 01589 case '\n': 01590 case '\r': 01591 // If we are inside a preprocessor directive and we see the end of line, 01592 // we know we are done with the directive, so return an EOM token. 01593 if (ParsingPreprocessorDirective) { 01594 // Done parsing the "line". 01595 ParsingPreprocessorDirective = false; 01596 01597 // Restore comment saving mode, in case it was disabled for directive. 01598 SetCommentRetentionState(PP->getCommentRetentionState()); 01599 01600 // Since we consumed a newline, we are back at the start of a line. 01601 IsAtStartOfLine = true; 01602 01603 Kind = tok::eom; 01604 break; 01605 } 01606 // The returned token is at the start of the line. 01607 Result.setFlag(Token::StartOfLine); 01608 // No leading whitespace seen so far. 01609 Result.clearFlag(Token::LeadingSpace); 01610 01611 if (SkipWhitespace(Result, CurPtr)) 01612 return; // KeepWhitespaceMode 01613 goto LexNextToken; // GCC isn't tail call eliminating. 01614 case ' ': 01615 case '\t': 01616 case '\f': 01617 case '\v': 01618 SkipHorizontalWhitespace: 01619 Result.setFlag(Token::LeadingSpace); 01620 if (SkipWhitespace(Result, CurPtr)) 01621 return; // KeepWhitespaceMode 01622 01623 SkipIgnoredUnits: 01624 CurPtr = BufferPtr; 01625 01626 // If the next token is obviously a // or /* */ comment, skip it efficiently 01627 // too (without going through the big switch stmt). 01628 if (CurPtr[0] == '/' && CurPtr[1] == '/' && !inKeepCommentMode() && 01629 Features.BCPLComment) { 01630 if (SkipBCPLComment(Result, CurPtr+2)) 01631 return; // There is a token to return. 01632 goto SkipIgnoredUnits; 01633 } else if (CurPtr[0] == '/' && CurPtr[1] == '*' && !inKeepCommentMode()) { 01634 if (SkipBlockComment(Result, CurPtr+2)) 01635 return; // There is a token to return. 01636 goto SkipIgnoredUnits; 01637 } else if (isHorizontalWhitespace(*CurPtr)) { 01638 goto SkipHorizontalWhitespace; 01639 } 01640 goto LexNextToken; // GCC isn't tail call eliminating. 
01641 01642 // C99 6.4.4.1: Integer Constants. 01643 // C99 6.4.4.2: Floating Constants. 01644 case '0': case '1': case '2': case '3': case '4': 01645 case '5': case '6': case '7': case '8': case '9': 01646 // Notify MIOpt that we read a non-whitespace/non-comment token. 01647 MIOpt.ReadToken(); 01648 return LexNumericConstant(Result, CurPtr); 01649 01650 case 'L': // Identifier (Loony) or wide literal (L'x' or L"xyz"). 01651 // Notify MIOpt that we read a non-whitespace/non-comment token. 01652 MIOpt.ReadToken(); 01653 Char = getCharAndSize(CurPtr, SizeTmp); 01654 01655 // Wide string literal. 01656 if (Char == '"') 01657 return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result), 01658 true); 01659 01660 // Wide character constant. 01661 if (Char == '\'') 01662 return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result)); 01663 // FALL THROUGH, treating L like the start of an identifier. 01664 01665 // C99 6.4.2: Identifiers. 01666 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G': 01667 case 'H': case 'I': case 'J': case 'K': /*'L'*/case 'M': case 'N': 01668 case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U': 01669 case 'V': case 'W': case 'X': case 'Y': case 'Z': 01670 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g': 01671 case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n': 01672 case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u': 01673 case 'v': case 'w': case 'x': case 'y': case 'z': 01674 case '_': 01675 // Notify MIOpt that we read a non-whitespace/non-comment token. 01676 MIOpt.ReadToken(); 01677 return LexIdentifier(Result, CurPtr); 01678 01679 case '$': // $ in identifiers. 01680 if (Features.DollarIdents) { 01681 if (!isLexingRawMode()) 01682 Diag(CurPtr-1, diag::ext_dollar_in_identifier); 01683 // Notify MIOpt that we read a non-whitespace/non-comment token. 
01684 MIOpt.ReadToken(); 01685 return LexIdentifier(Result, CurPtr); 01686 } 01687 01688 Kind = tok::unknown; 01689 break; 01690 01691 // C99 6.4.4: Character Constants. 01692 case '\'': 01693 // Notify MIOpt that we read a non-whitespace/non-comment token. 01694 MIOpt.ReadToken(); 01695 return LexCharConstant(Result, CurPtr); 01696 01697 // C99 6.4.5: String Literals. 01698 case '"': 01699 // Notify MIOpt that we read a non-whitespace/non-comment token. 01700 MIOpt.ReadToken(); 01701 return LexStringLiteral(Result, CurPtr, false); 01702 01703 // C99 6.4.6: Punctuators. 01704 case '?': 01705 Kind = tok::question; 01706 break; 01707 case '[': 01708 Kind = tok::l_square; 01709 break; 01710 case ']': 01711 Kind = tok::r_square; 01712 break; 01713 case '(': 01714 Kind = tok::l_paren; 01715 break; 01716 case ')': 01717 Kind = tok::r_paren; 01718 break; 01719 case '{': 01720 Kind = tok::l_brace; 01721 break; 01722 case '}': 01723 Kind = tok::r_brace; 01724 break; 01725 case '.': 01726 Char = getCharAndSize(CurPtr, SizeTmp); 01727 if (Char >= '0' && Char <= '9') { 01728 // Notify MIOpt that we read a non-whitespace/non-comment token. 01729 MIOpt.ReadToken(); 01730 01731 return LexNumericConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result)); 01732 } else if (Features.CPlusPlus && Char == '*') { 01733 Kind = tok::periodstar; 01734 CurPtr += SizeTmp; 01735 } else if (Char == '.' 
&& 01736 getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '.') { 01737 Kind = tok::ellipsis; 01738 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 01739 SizeTmp2, Result); 01740 } else { 01741 Kind = tok::period; 01742 } 01743 break; 01744 case '&': 01745 Char = getCharAndSize(CurPtr, SizeTmp); 01746 if (Char == '&') { 01747 Kind = tok::ampamp; 01748 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 01749 } else if (Char == '=') { 01750 Kind = tok::ampequal; 01751 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 01752 } else { 01753 Kind = tok::amp; 01754 } 01755 break; 01756 case '*': 01757 if (getCharAndSize(CurPtr, SizeTmp) == '=') { 01758 Kind = tok::starequal; 01759 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 01760 } else { 01761 Kind = tok::star; 01762 } 01763 break; 01764 case '+': 01765 Char = getCharAndSize(CurPtr, SizeTmp); 01766 if (Char == '+') { 01767 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 01768 Kind = tok::plusplus; 01769 } else if (Char == '=') { 01770 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 01771 Kind = tok::plusequal; 01772 } else { 01773 Kind = tok::plus; 01774 } 01775 break; 01776 case '-': 01777 Char = getCharAndSize(CurPtr, SizeTmp); 01778 if (Char == '-') { // -- 01779 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 01780 Kind = tok::minusminus; 01781 } else if (Char == '>' && Features.CPlusPlus && 01782 getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '*') { // C++ ->* 01783 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 01784 SizeTmp2, Result); 01785 Kind = tok::arrowstar; 01786 } else if (Char == '>') { // -> 01787 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 01788 Kind = tok::arrow; 01789 } else if (Char == '=') { // -= 01790 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 01791 Kind = tok::minusequal; 01792 } else { 01793 Kind = tok::minus; 01794 } 01795 break; 01796 case '~': 01797 Kind = tok::tilde; 01798 break; 01799 case '!': 01800 if (getCharAndSize(CurPtr, SizeTmp) == '=') { 01801 Kind = tok::exclaimequal; 
01802 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 01803 } else { 01804 Kind = tok::exclaim; 01805 } 01806 break; 01807 case '/': 01808 // 6.4.9: Comments 01809 Char = getCharAndSize(CurPtr, SizeTmp); 01810 if (Char == '/') { // BCPL comment. 01811 // Even if BCPL comments are disabled (e.g. in C89 mode), we generally 01812 // want to lex this as a comment. There is one problem with this though, 01813 // that in one particular corner case, this can change the behavior of the 01814 // resultant program. For example, in "foo //**/ bar", C89 would lex 01815 // this as "foo / bar" and languages with BCPL comments would lex it as 01816 // "foo". Check to see if the character after the second slash is a '*'. 01817 // If so, we will lex that as a "/" instead of the start of a comment. 01818 if (Features.BCPLComment || 01819 getCharAndSize(CurPtr+SizeTmp, SizeTmp2) != '*') { 01820 if (SkipBCPLComment(Result, ConsumeChar(CurPtr, SizeTmp, Result))) 01821 return; // There is a token to return. 01822 01823 // It is common for the tokens immediately after a // comment to be 01824 // whitespace (indentation for the next line). Instead of going through 01825 // the big switch, handle it efficiently now. 01826 goto SkipIgnoredUnits; 01827 } 01828 } 01829 01830 if (Char == '*') { // /**/ comment. 01831 if (SkipBlockComment(Result, ConsumeChar(CurPtr, SizeTmp, Result))) 01832 return; // There is a token to return. 01833 goto LexNextToken; // GCC isn't tail call eliminating. 
01834 } 01835 01836 if (Char == '=') { 01837 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 01838 Kind = tok::slashequal; 01839 } else { 01840 Kind = tok::slash; 01841 } 01842 break; 01843 case '%': 01844 Char = getCharAndSize(CurPtr, SizeTmp); 01845 if (Char == '=') { 01846 Kind = tok::percentequal; 01847 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 01848 } else if (Features.Digraphs && Char == '>') { 01849 Kind = tok::r_brace; // '%>' -> '}' 01850 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 01851 } else if (Features.Digraphs && Char == ':') { 01852 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 01853 Char = getCharAndSize(CurPtr, SizeTmp); 01854 if (Char == '%' && getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == ':') { 01855 Kind = tok::hashhash; // '%:%:' -> '##' 01856 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 01857 SizeTmp2, Result); 01858 } else if (Char == '@' && Features.Microsoft) { // %:@ -> #@ -> Charize 01859 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 01860 if (!isLexingRawMode()) 01861 Diag(BufferPtr, diag::charize_microsoft_ext); 01862 Kind = tok::hashat; 01863 } else { // '%:' -> '#' 01864 // We parsed a # character. If this occurs at the start of the line, 01865 // it's actually the start of a preprocessing directive. Callback to 01866 // the preprocessor to handle it. 01867 // FIXME: -fpreprocessed mode?? 01868 if (Result.isAtStartOfLine() && !LexingRawMode && !Is_PragmaLexer) { 01869 FormTokenWithChars(Result, CurPtr, tok::hash); 01870 PP->HandleDirective(Result); 01871 01872 // As an optimization, if the preprocessor didn't switch lexers, tail 01873 // recurse. 01874 if (PP->isCurrentLexer(this)) { 01875 // Start a new token. If this is a #include or something, the PP may 01876 // want us starting at the beginning of the line again. If so, set 01877 // the StartOfLine flag and clear LeadingSpace. 
01878 if (IsAtStartOfLine) { 01879 Result.setFlag(Token::StartOfLine); 01880 Result.clearFlag(Token::LeadingSpace); 01881 IsAtStartOfLine = false; 01882 } 01883 goto LexNextToken; // GCC isn't tail call eliminating. 01884 } 01885 01886 return PP->Lex(Result); 01887 } 01888 01889 Kind = tok::hash; 01890 } 01891 } else { 01892 Kind = tok::percent; 01893 } 01894 break; 01895 case '<': 01896 Char = getCharAndSize(CurPtr, SizeTmp); 01897 if (ParsingFilename) { 01898 return LexAngledStringLiteral(Result, CurPtr); 01899 } else if (Char == '<') { 01900 char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2); 01901 if (After == '=') { 01902 Kind = tok::lesslessequal; 01903 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 01904 SizeTmp2, Result); 01905 } else if (After == '<' && IsStartOfConflictMarker(CurPtr-1)) { 01906 // If this is actually a '<<<<<<<' version control conflict marker, 01907 // recognize it as such and recover nicely. 01908 goto LexNextToken; 01909 } else { 01910 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 01911 Kind = tok::lessless; 01912 } 01913 } else if (Char == '=') { 01914 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 01915 Kind = tok::lessequal; 01916 } else if (Features.Digraphs && Char == ':') { // '<:' -> '[' 01917 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 01918 Kind = tok::l_square; 01919 } else if (Features.Digraphs && Char == '%') { // '<%' -> '{' 01920 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 01921 Kind = tok::l_brace; 01922 } else { 01923 Kind = tok::less; 01924 } 01925 break; 01926 case '>': 01927 Char = getCharAndSize(CurPtr, SizeTmp); 01928 if (Char == '=') { 01929 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 01930 Kind = tok::greaterequal; 01931 } else if (Char == '>') { 01932 char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2); 01933 if (After == '=') { 01934 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 01935 SizeTmp2, Result); 01936 Kind = tok::greatergreaterequal; 01937 } else if (After == '>' 
&& HandleEndOfConflictMarker(CurPtr-1)) { 01938 // If this is '>>>>>>>' and we're in a conflict marker, ignore it. 01939 goto LexNextToken; 01940 } else { 01941 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 01942 Kind = tok::greatergreater; 01943 } 01944 01945 } else { 01946 Kind = tok::greater; 01947 } 01948 break; 01949 case '^': 01950 Char = getCharAndSize(CurPtr, SizeTmp); 01951 if (Char == '=') { 01952 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 01953 Kind = tok::caretequal; 01954 } else { 01955 Kind = tok::caret; 01956 } 01957 break; 01958 case '|': 01959 Char = getCharAndSize(CurPtr, SizeTmp); 01960 if (Char == '=') { 01961 Kind = tok::pipeequal; 01962 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 01963 } else if (Char == '|') { 01964 // If this is '|||||||' and we're in a conflict marker, ignore it. 01965 if (CurPtr[1] == '|' && HandleEndOfConflictMarker(CurPtr-1)) 01966 goto LexNextToken; 01967 Kind = tok::pipepipe; 01968 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 01969 } else { 01970 Kind = tok::pipe; 01971 } 01972 break; 01973 case ':': 01974 Char = getCharAndSize(CurPtr, SizeTmp); 01975 if (Features.Digraphs && Char == '>') { 01976 Kind = tok::r_square; // ':>' -> ']' 01977 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 01978 } else if (Features.CPlusPlus && Char == ':') { 01979 Kind = tok::coloncolon; 01980 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 01981 } else { 01982 Kind = tok::colon; 01983 } 01984 break; 01985 case ';': 01986 Kind = tok::semi; 01987 break; 01988 case '=': 01989 Char = getCharAndSize(CurPtr, SizeTmp); 01990 if (Char == '=') { 01991 // If this is '=======' and we're in a conflict marker, ignore it. 
01992 if (CurPtr[1] == '=' && HandleEndOfConflictMarker(CurPtr-1)) 01993 goto LexNextToken; 01994 01995 Kind = tok::equalequal; 01996 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 01997 } else { 01998 Kind = tok::equal; 01999 } 02000 break; 02001 case ',': 02002 Kind = tok::comma; 02003 break; 02004 case '#': 02005 Char = getCharAndSize(CurPtr, SizeTmp); 02006 if (Char == '#') { 02007 Kind = tok::hashhash; 02008 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 02009 } else if (Char == '@' && Features.Microsoft) { // #@ -> Charize 02010 Kind = tok::hashat; 02011 if (!isLexingRawMode()) 02012 Diag(BufferPtr, diag::charize_microsoft_ext); 02013 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 02014 } else { 02015 // We parsed a # character. If this occurs at the start of the line, 02016 // it's actually the start of a preprocessing directive. Callback to 02017 // the preprocessor to handle it. 02018 // FIXME: -fpreprocessed mode?? 02019 if (Result.isAtStartOfLine() && !LexingRawMode && !Is_PragmaLexer) { 02020 FormTokenWithChars(Result, CurPtr, tok::hash); 02021 PP->HandleDirective(Result); 02022 02023 // As an optimization, if the preprocessor didn't switch lexers, tail 02024 // recurse. 02025 if (PP->isCurrentLexer(this)) { 02026 // Start a new token. If this is a #include or something, the PP may 02027 // want us starting at the beginning of the line again. If so, set 02028 // the StartOfLine flag and clear LeadingSpace. 02029 if (IsAtStartOfLine) { 02030 Result.setFlag(Token::StartOfLine); 02031 Result.clearFlag(Token::LeadingSpace); 02032 IsAtStartOfLine = false; 02033 } 02034 goto LexNextToken; // GCC isn't tail call eliminating. 02035 } 02036 return PP->Lex(Result); 02037 } 02038 02039 Kind = tok::hash; 02040 } 02041 break; 02042 02043 case '@': 02044 // Objective C support. 02045 if (CurPtr[-1] == '@' && Features.ObjC1) 02046 Kind = tok::at; 02047 else 02048 Kind = tok::unknown; 02049 break; 02050 02051 case '\\': 02052 // FIXME: UCN's. 02053 // FALL THROUGH. 
02054 default: 02055 Kind = tok::unknown; 02056 break; 02057 } 02058 02059 // Notify MIOpt that we read a non-whitespace/non-comment token. 02060 MIOpt.ReadToken(); 02061 02062 // Update the location of token as well as BufferPtr. 02063 FormTokenWithChars(Result, CurPtr, Kind); 02064 }