clang API Documentation
00001 //===--- Lexer.cpp - C Language Family Lexer ------------------------------===// 00002 // 00003 // The LLVM Compiler Infrastructure 00004 // 00005 // This file is distributed under the University of Illinois Open Source 00006 // License. See LICENSE.TXT for details. 00007 // 00008 //===----------------------------------------------------------------------===// 00009 // 00010 // This file implements the Lexer and Token interfaces. 00011 // 00012 //===----------------------------------------------------------------------===// 00013 // 00014 // TODO: GCC Diagnostics emitted by the lexer: 00015 // PEDWARN: (form feed|vertical tab) in preprocessing directive 00016 // 00017 // Universal characters, unicode, char mapping: 00018 // WARNING: `%.*s' is not in NFKC 00019 // WARNING: `%.*s' is not in NFC 00020 // 00021 // Other: 00022 // TODO: Options to support: 00023 // -fexec-charset,-fwide-exec-charset 00024 // 00025 //===----------------------------------------------------------------------===// 00026 00027 #include "clang/Lex/Lexer.h" 00028 #include "clang/Lex/Preprocessor.h" 00029 #include "clang/Lex/LexDiagnostic.h" 00030 #include "clang/Basic/SourceManager.h" 00031 #include "llvm/Support/Compiler.h" 00032 #include "llvm/Support/MemoryBuffer.h" 00033 #include <cctype> 00034 using namespace clang; 00035 00036 static void InitCharacterInfo(); 00037 00038 //===----------------------------------------------------------------------===// 00039 // Token Class Implementation 00040 //===----------------------------------------------------------------------===// 00041 00042 /// isObjCAtKeyword - Return true if we have an ObjC keyword identifier. 00043 bool Token::isObjCAtKeyword(tok::ObjCKeywordKind objcKey) const { 00044 if (IdentifierInfo *II = getIdentifierInfo()) 00045 return II->getObjCKeywordID() == objcKey; 00046 return false; 00047 } 00048 00049 /// getObjCKeywordID - Return the ObjC keyword kind. 
00050 tok::ObjCKeywordKind Token::getObjCKeywordID() const { 00051 IdentifierInfo *specId = getIdentifierInfo(); 00052 return specId ? specId->getObjCKeywordID() : tok::objc_not_keyword; 00053 } 00054 00055 00056 //===----------------------------------------------------------------------===// 00057 // Lexer Class Implementation 00058 //===----------------------------------------------------------------------===// 00059 00060 void Lexer::InitLexer(const char *BufStart, const char *BufPtr, 00061 const char *BufEnd) { 00062 InitCharacterInfo(); 00063 00064 BufferStart = BufStart; 00065 BufferPtr = BufPtr; 00066 BufferEnd = BufEnd; 00067 00068 assert(BufEnd[0] == 0 && 00069 "We assume that the input buffer has a null character at the end" 00070 " to simplify lexing!"); 00071 00072 Is_PragmaLexer = false; 00073 IsInConflictMarker = false; 00074 00075 // Start of the file is a start of line. 00076 IsAtStartOfLine = true; 00077 00078 // We are not after parsing a #. 00079 ParsingPreprocessorDirective = false; 00080 00081 // We are not after parsing #include. 00082 ParsingFilename = false; 00083 00084 // We are not in raw mode. Raw mode disables diagnostics and interpretation 00085 // of tokens (e.g. identifiers, thus disabling macro expansion). It is used 00086 // to quickly lex the tokens of the buffer, e.g. when handling a "#if 0" block 00087 // or otherwise skipping over tokens. 00088 LexingRawMode = false; 00089 00090 // Default to not keeping comments. 00091 ExtendedTokenMode = 0; 00092 } 00093 00094 /// Lexer constructor - Create a new lexer object for the specified buffer 00095 /// with the specified preprocessor managing the lexing process. This lexer 00096 /// assumes that the associated file buffer and Preprocessor objects will 00097 /// outlive it, so it doesn't take ownership of either of them. 
00098 Lexer::Lexer(FileID FID, const llvm::MemoryBuffer *InputFile, Preprocessor &PP) 00099 : PreprocessorLexer(&PP, FID), 00100 FileLoc(PP.getSourceManager().getLocForStartOfFile(FID)), 00101 Features(PP.getLangOptions()) { 00102 00103 InitLexer(InputFile->getBufferStart(), InputFile->getBufferStart(), 00104 InputFile->getBufferEnd()); 00105 00106 // Default to keeping comments if the preprocessor wants them. 00107 SetCommentRetentionState(PP.getCommentRetentionState()); 00108 } 00109 00110 /// Lexer constructor - Create a new raw lexer object. This object is only 00111 /// suitable for calls to 'LexRawToken'. This lexer assumes that the text 00112 /// range will outlive it, so it doesn't take ownership of it. 00113 Lexer::Lexer(SourceLocation fileloc, const LangOptions &features, 00114 const char *BufStart, const char *BufPtr, const char *BufEnd) 00115 : FileLoc(fileloc), Features(features) { 00116 00117 InitLexer(BufStart, BufPtr, BufEnd); 00118 00119 // We *are* in raw mode. 00120 LexingRawMode = true; 00121 } 00122 00123 /// Lexer constructor - Create a new raw lexer object. This object is only 00124 /// suitable for calls to 'LexRawToken'. This lexer assumes that the text 00125 /// range will outlive it, so it doesn't take ownership of it. 00126 Lexer::Lexer(FileID FID, const llvm::MemoryBuffer *FromFile, 00127 const SourceManager &SM, const LangOptions &features) 00128 : FileLoc(SM.getLocForStartOfFile(FID)), Features(features) { 00129 00130 InitLexer(FromFile->getBufferStart(), FromFile->getBufferStart(), 00131 FromFile->getBufferEnd()); 00132 00133 // We *are* in raw mode. 00134 LexingRawMode = true; 00135 } 00136 00137 /// Create_PragmaLexer: Lexer constructor - Create a new lexer object for 00138 /// _Pragma expansion. This has a variety of magic semantics that this method 00139 /// sets up. It returns a new'd Lexer that must be delete'd when done. 
00140 /// 00141 /// On entrance to this routine, TokStartLoc is a macro location which has a 00142 /// spelling loc that indicates the bytes to be lexed for the token and an 00143 /// instantiation location that indicates where all lexed tokens should be 00144 /// "expanded from". 00145 /// 00146 /// FIXME: It would really be nice to make _Pragma just be a wrapper around a 00147 /// normal lexer that remaps tokens as they fly by. This would require making 00148 /// Preprocessor::Lex virtual. Given that, we could just dump in a magic lexer 00149 /// interface that could handle this stuff. This would pull GetMappedTokenLoc 00150 /// out of the critical path of the lexer! 00151 /// 00152 Lexer *Lexer::Create_PragmaLexer(SourceLocation SpellingLoc, 00153 SourceLocation InstantiationLocStart, 00154 SourceLocation InstantiationLocEnd, 00155 unsigned TokLen, Preprocessor &PP) { 00156 SourceManager &SM = PP.getSourceManager(); 00157 00158 // Create the lexer as if we were going to lex the file normally. 00159 FileID SpellingFID = SM.getFileID(SpellingLoc); 00160 const llvm::MemoryBuffer *InputFile = SM.getBuffer(SpellingFID); 00161 Lexer *L = new Lexer(SpellingFID, InputFile, PP); 00162 00163 // Now that the lexer is created, change the start/end locations so that we 00164 // just lex the subsection of the file that we want. This is lexing from a 00165 // scratch buffer. 00166 const char *StrData = SM.getCharacterData(SpellingLoc); 00167 00168 L->BufferPtr = StrData; 00169 L->BufferEnd = StrData+TokLen; 00170 assert(L->BufferEnd[0] == 0 && "Buffer is not nul terminated!"); 00171 00172 // Set the SourceLocation with the remapping information. This ensures that 00173 // GetMappedTokenLoc will remap the tokens as they are lexed. 
00174 L->FileLoc = SM.createInstantiationLoc(SM.getLocForStartOfFile(SpellingFID), 00175 InstantiationLocStart, 00176 InstantiationLocEnd, TokLen); 00177 00178 // Ensure that the lexer thinks it is inside a directive, so that end \n will 00179 // return an EOM token. 00180 L->ParsingPreprocessorDirective = true; 00181 00182 // This lexer really is for _Pragma. 00183 L->Is_PragmaLexer = true; 00184 return L; 00185 } 00186 00187 00188 /// Stringify - Convert the specified string into a C string, with surrounding 00189 /// ""'s, and with escaped \ and " characters. 00190 std::string Lexer::Stringify(const std::string &Str, bool Charify) { 00191 std::string Result = Str; 00192 char Quote = Charify ? '\'' : '"'; 00193 for (unsigned i = 0, e = Result.size(); i != e; ++i) { 00194 if (Result[i] == '\\' || Result[i] == Quote) { 00195 Result.insert(Result.begin()+i, '\\'); 00196 ++i; ++e; 00197 } 00198 } 00199 return Result; 00200 } 00201 00202 /// Stringify - Convert the specified string into a C string by escaping '\' 00203 /// and " characters. This does not add surrounding ""'s to the string. 00204 void Lexer::Stringify(llvm::SmallVectorImpl<char> &Str) { 00205 for (unsigned i = 0, e = Str.size(); i != e; ++i) { 00206 if (Str[i] == '\\' || Str[i] == '"') { 00207 Str.insert(Str.begin()+i, '\\'); 00208 ++i; ++e; 00209 } 00210 } 00211 } 00212 00213 static bool isWhitespace(unsigned char c); 00214 00215 /// MeasureTokenLength - Relex the token at the specified location and return 00216 /// its length in bytes in the input file. If the token needs cleaning (e.g. 00217 /// includes a trigraph or an escaped newline) then this count includes bytes 00218 /// that are part of that. 00219 unsigned Lexer::MeasureTokenLength(SourceLocation Loc, 00220 const SourceManager &SM, 00221 const LangOptions &LangOpts) { 00222 // TODO: this could be special cased for common tokens like identifiers, ')', 00223 // etc to make this faster, if it mattered. 
Just look at StrData[0] to handle 00224 // all obviously single-char tokens. This could use 00225 // Lexer::isObviouslySimpleCharacter for example to handle identifiers or 00226 // something. 00227 00228 // If this comes from a macro expansion, we really do want the macro name, not 00229 // the token this macro expanded to. 00230 Loc = SM.getInstantiationLoc(Loc); 00231 std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc); 00232 bool Invalid = false; 00233 std::pair<const char *,const char *> Buffer = SM.getBufferData(LocInfo.first, 00234 &Invalid); 00235 if (Invalid) 00236 return 0; 00237 00238 const char *StrData = Buffer.first+LocInfo.second; 00239 00240 if (isWhitespace(StrData[0])) 00241 return 0; 00242 00243 // Create a lexer starting at the beginning of this token. 00244 Lexer TheLexer(Loc, LangOpts, Buffer.first, StrData, Buffer.second); 00245 TheLexer.SetCommentRetentionState(true); 00246 Token TheTok; 00247 TheLexer.LexFromRawLexer(TheTok); 00248 return TheTok.getLength(); 00249 } 00250 00251 //===----------------------------------------------------------------------===// 00252 // Character information. 00253 //===----------------------------------------------------------------------===// 00254 00255 enum { 00256 CHAR_HORZ_WS = 0x01, // ' ', '\t', '\f', '\v'. Note, no '\0' 00257 CHAR_VERT_WS = 0x02, // '\r', '\n' 00258 CHAR_LETTER = 0x04, // a-z,A-Z 00259 CHAR_NUMBER = 0x08, // 0-9 00260 CHAR_UNDER = 0x10, // _ 00261 CHAR_PERIOD = 0x20 // . 
00262 }; 00263 00264 // Statically initialize CharInfo table based on ASCII character set 00265 // Reference: FreeBSD 7.2 /usr/share/misc/ascii 00266 static const unsigned char CharInfo[256] = 00267 { 00268 // 0 NUL 1 SOH 2 STX 3 ETX 00269 // 4 EOT 5 ENQ 6 ACK 7 BEL 00270 0 , 0 , 0 , 0 , 00271 0 , 0 , 0 , 0 , 00272 // 8 BS 9 HT 10 NL 11 VT 00273 //12 NP 13 CR 14 SO 15 SI 00274 0 , CHAR_HORZ_WS, CHAR_VERT_WS, CHAR_HORZ_WS, 00275 CHAR_HORZ_WS, CHAR_VERT_WS, 0 , 0 , 00276 //16 DLE 17 DC1 18 DC2 19 DC3 00277 //20 DC4 21 NAK 22 SYN 23 ETB 00278 0 , 0 , 0 , 0 , 00279 0 , 0 , 0 , 0 , 00280 //24 CAN 25 EM 26 SUB 27 ESC 00281 //28 FS 29 GS 30 RS 31 US 00282 0 , 0 , 0 , 0 , 00283 0 , 0 , 0 , 0 , 00284 //32 SP 33 ! 34 " 35 # 00285 //36 $ 37 % 38 & 39 ' 00286 CHAR_HORZ_WS, 0 , 0 , 0 , 00287 0 , 0 , 0 , 0 , 00288 //40 ( 41 ) 42 * 43 + 00289 //44 , 45 - 46 . 47 / 00290 0 , 0 , 0 , 0 , 00291 0 , 0 , CHAR_PERIOD , 0 , 00292 //48 0 49 1 50 2 51 3 00293 //52 4 53 5 54 6 55 7 00294 CHAR_NUMBER , CHAR_NUMBER , CHAR_NUMBER , CHAR_NUMBER , 00295 CHAR_NUMBER , CHAR_NUMBER , CHAR_NUMBER , CHAR_NUMBER , 00296 //56 8 57 9 58 : 59 ; 00297 //60 < 61 = 62 > 63 ? 
00298 CHAR_NUMBER , CHAR_NUMBER , 0 , 0 , 00299 0 , 0 , 0 , 0 , 00300 //64 @ 65 A 66 B 67 C 00301 //68 D 69 E 70 F 71 G 00302 0 , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , 00303 CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , 00304 //72 H 73 I 74 J 75 K 00305 //76 L 77 M 78 N 79 O 00306 CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , 00307 CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , 00308 //80 P 81 Q 82 R 83 S 00309 //84 T 85 U 86 V 87 W 00310 CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , 00311 CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , 00312 //88 X 89 Y 90 Z 91 [ 00313 //92 \ 93 ] 94 ^ 95 _ 00314 CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , 0 , 00315 0 , 0 , 0 , CHAR_UNDER , 00316 //96 ` 97 a 98 b 99 c 00317 //100 d 101 e 102 f 103 g 00318 0 , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , 00319 CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , 00320 //104 h 105 i 106 j 107 k 00321 //108 l 109 m 110 n 111 o 00322 CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , 00323 CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , 00324 //112 p 113 q 114 r 115 s 00325 //116 t 117 u 118 v 119 w 00326 CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , 00327 CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , 00328 //120 x 121 y 122 z 123 { 00329 //124 | 125 } 126 ~ 127 DEL 00330 CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , 0 , 00331 0 , 0 , 0 , 0 00332 }; 00333 00334 static void InitCharacterInfo() { 00335 static bool isInited = false; 00336 if (isInited) return; 00337 // check the statically-initialized CharInfo table 00338 assert(CHAR_HORZ_WS == CharInfo[(int)' ']); 00339 assert(CHAR_HORZ_WS == CharInfo[(int)'\t']); 00340 assert(CHAR_HORZ_WS == CharInfo[(int)'\f']); 00341 assert(CHAR_HORZ_WS == CharInfo[(int)'\v']); 00342 assert(CHAR_VERT_WS == CharInfo[(int)'\n']); 00343 assert(CHAR_VERT_WS == CharInfo[(int)'\r']); 00344 assert(CHAR_UNDER == CharInfo[(int)'_']); 00345 assert(CHAR_PERIOD == CharInfo[(int)'.']); 
00346 for (unsigned i = 'a'; i <= 'z'; ++i) { 00347 assert(CHAR_LETTER == CharInfo[i]); 00348 assert(CHAR_LETTER == CharInfo[i+'A'-'a']); 00349 } 00350 for (unsigned i = '0'; i <= '9'; ++i) 00351 assert(CHAR_NUMBER == CharInfo[i]); 00352 00353 isInited = true; 00354 } 00355 00356 00357 /// isIdentifierBody - Return true if this is the body character of an 00358 /// identifier, which is [a-zA-Z0-9_]. 00359 static inline bool isIdentifierBody(unsigned char c) { 00360 return (CharInfo[c] & (CHAR_LETTER|CHAR_NUMBER|CHAR_UNDER)) ? true : false; 00361 } 00362 00363 /// isHorizontalWhitespace - Return true if this character is horizontal 00364 /// whitespace: ' ', '\t', '\f', '\v'. Note that this returns false for '\0'. 00365 static inline bool isHorizontalWhitespace(unsigned char c) { 00366 return (CharInfo[c] & CHAR_HORZ_WS) ? true : false; 00367 } 00368 00369 /// isWhitespace - Return true if this character is horizontal or vertical 00370 /// whitespace: ' ', '\t', '\f', '\v', '\n', '\r'. Note that this returns false 00371 /// for '\0'. 00372 static inline bool isWhitespace(unsigned char c) { 00373 return (CharInfo[c] & (CHAR_HORZ_WS|CHAR_VERT_WS)) ? true : false; 00374 } 00375 00376 /// isNumberBody - Return true if this is the body character of an 00377 /// preprocessing number, which is [a-zA-Z0-9_.]. 00378 static inline bool isNumberBody(unsigned char c) { 00379 return (CharInfo[c] & (CHAR_LETTER|CHAR_NUMBER|CHAR_UNDER|CHAR_PERIOD)) ? 00380 true : false; 00381 } 00382 00383 00384 //===----------------------------------------------------------------------===// 00385 // Diagnostics forwarding code. 00386 //===----------------------------------------------------------------------===// 00387 00388 /// GetMappedTokenLoc - If lexing out of a 'mapped buffer', where we pretend the 00389 /// lexer buffer was all instantiated at a single point, perform the mapping. 
00390 /// This is currently only used for _Pragma implementation, so it is the slow 00391 /// path of the hot getSourceLocation method. Do not allow it to be inlined. 00392 static DISABLE_INLINE SourceLocation GetMappedTokenLoc(Preprocessor &PP, 00393 SourceLocation FileLoc, 00394 unsigned CharNo, 00395 unsigned TokLen); 00396 static SourceLocation GetMappedTokenLoc(Preprocessor &PP, 00397 SourceLocation FileLoc, 00398 unsigned CharNo, unsigned TokLen) { 00399 assert(FileLoc.isMacroID() && "Must be an instantiation"); 00400 00401 // Otherwise, we're lexing "mapped tokens". This is used for things like 00402 // _Pragma handling. Combine the instantiation location of FileLoc with the 00403 // spelling location. 00404 SourceManager &SM = PP.getSourceManager(); 00405 00406 // Create a new SLoc which is expanded from Instantiation(FileLoc) but whose 00407 // characters come from spelling(FileLoc)+Offset. 00408 SourceLocation SpellingLoc = SM.getSpellingLoc(FileLoc); 00409 SpellingLoc = SpellingLoc.getFileLocWithOffset(CharNo); 00410 00411 // Figure out the expansion loc range, which is the range covered by the 00412 // original _Pragma(...) sequence. 00413 std::pair<SourceLocation,SourceLocation> II = 00414 SM.getImmediateInstantiationRange(FileLoc); 00415 00416 return SM.createInstantiationLoc(SpellingLoc, II.first, II.second, TokLen); 00417 } 00418 00419 /// getSourceLocation - Return a source location identifier for the specified 00420 /// offset in the current file. 00421 SourceLocation Lexer::getSourceLocation(const char *Loc, 00422 unsigned TokLen) const { 00423 assert(Loc >= BufferStart && Loc <= BufferEnd && 00424 "Location out of range for this buffer!"); 00425 00426 // In the normal case, we're just lexing from a simple file buffer, return 00427 // the file id from FileLoc with the offset specified. 
00428 unsigned CharNo = Loc-BufferStart; 00429 if (FileLoc.isFileID()) 00430 return FileLoc.getFileLocWithOffset(CharNo); 00431 00432 // Otherwise, this is the _Pragma lexer case, which pretends that all of the 00433 // tokens are lexed from where the _Pragma was defined. 00434 assert(PP && "This doesn't work on raw lexers"); 00435 return GetMappedTokenLoc(*PP, FileLoc, CharNo, TokLen); 00436 } 00437 00438 /// Diag - Forwarding function for diagnostics. This translate a source 00439 /// position in the current buffer into a SourceLocation object for rendering. 00440 DiagnosticBuilder Lexer::Diag(const char *Loc, unsigned DiagID) const { 00441 return PP->Diag(getSourceLocation(Loc), DiagID); 00442 } 00443 00444 //===----------------------------------------------------------------------===// 00445 // Trigraph and Escaped Newline Handling Code. 00446 //===----------------------------------------------------------------------===// 00447 00448 /// GetTrigraphCharForLetter - Given a character that occurs after a ?? pair, 00449 /// return the decoded trigraph letter it corresponds to, or '\0' if nothing. 00450 static char GetTrigraphCharForLetter(char Letter) { 00451 switch (Letter) { 00452 default: return 0; 00453 case '=': return '#'; 00454 case ')': return ']'; 00455 case '(': return '['; 00456 case '!': return '|'; 00457 case '\'': return '^'; 00458 case '>': return '}'; 00459 case '/': return '\\'; 00460 case '<': return '{'; 00461 case '-': return '~'; 00462 } 00463 } 00464 00465 /// DecodeTrigraphChar - If the specified character is a legal trigraph when 00466 /// prefixed with ??, emit a trigraph warning. If trigraphs are enabled, 00467 /// return the result character. Finally, emit a warning about trigraph use 00468 /// whether trigraphs are enabled or not. 
00469 static char DecodeTrigraphChar(const char *CP, Lexer *L) { 00470 char Res = GetTrigraphCharForLetter(*CP); 00471 if (!Res || !L) return Res; 00472 00473 if (!L->getFeatures().Trigraphs) { 00474 if (!L->isLexingRawMode()) 00475 L->Diag(CP-2, diag::trigraph_ignored); 00476 return 0; 00477 } 00478 00479 if (!L->isLexingRawMode()) 00480 L->Diag(CP-2, diag::trigraph_converted) << std::string()+Res; 00481 return Res; 00482 } 00483 00484 /// getEscapedNewLineSize - Return the size of the specified escaped newline, 00485 /// or 0 if it is not an escaped newline. P[-1] is known to be a "\" or a 00486 /// trigraph equivalent on entry to this function. 00487 unsigned Lexer::getEscapedNewLineSize(const char *Ptr) { 00488 unsigned Size = 0; 00489 while (isWhitespace(Ptr[Size])) { 00490 ++Size; 00491 00492 if (Ptr[Size-1] != '\n' && Ptr[Size-1] != '\r') 00493 continue; 00494 00495 // If this is a \r\n or \n\r, skip the other half. 00496 if ((Ptr[Size] == '\r' || Ptr[Size] == '\n') && 00497 Ptr[Size-1] != Ptr[Size]) 00498 ++Size; 00499 00500 return Size; 00501 } 00502 00503 // Not an escaped newline, must be a \t or something else. 00504 return 0; 00505 } 00506 00507 /// SkipEscapedNewLines - If P points to an escaped newline (or a series of 00508 /// them), skip over them and return the first non-escaped-newline found, 00509 /// otherwise return P. 00510 const char *Lexer::SkipEscapedNewLines(const char *P) { 00511 while (1) { 00512 const char *AfterEscape; 00513 if (*P == '\\') { 00514 AfterEscape = P+1; 00515 } else if (*P == '?') { 00516 // If not a trigraph for escape, bail out. 00517 if (P[1] != '?' 
|| P[2] != '/') 00518 return P; 00519 AfterEscape = P+3; 00520 } else { 00521 return P; 00522 } 00523 00524 unsigned NewLineSize = Lexer::getEscapedNewLineSize(AfterEscape); 00525 if (NewLineSize == 0) return P; 00526 P = AfterEscape+NewLineSize; 00527 } 00528 } 00529 00530 00531 /// getCharAndSizeSlow - Peek a single 'character' from the specified buffer, 00532 /// get its size, and return it. This is tricky in several cases: 00533 /// 1. If currently at the start of a trigraph, we warn about the trigraph, 00534 /// then either return the trigraph (skipping 3 chars) or the '?', 00535 /// depending on whether trigraphs are enabled or not. 00536 /// 2. If this is an escaped newline (potentially with whitespace between 00537 /// the backslash and newline), implicitly skip the newline and return 00538 /// the char after it. 00539 /// 3. If this is a UCN, return it. FIXME: C++ UCN's? 00540 /// 00541 /// This handles the slow/uncommon case of the getCharAndSize method. Here we 00542 /// know that we can accumulate into Size, and that we have already incremented 00543 /// Ptr by Size bytes. 00544 /// 00545 /// NOTE: When this method is updated, getCharAndSizeSlowNoWarn (below) should 00546 /// be updated to match. 00547 /// 00548 char Lexer::getCharAndSizeSlow(const char *Ptr, unsigned &Size, 00549 Token *Tok) { 00550 // If we have a slash, look for an escaped newline. 00551 if (Ptr[0] == '\\') { 00552 ++Size; 00553 ++Ptr; 00554 Slash: 00555 // Common case, backslash-char where the char is not whitespace. 00556 if (!isWhitespace(Ptr[0])) return '\\'; 00557 00558 // See if we have optional whitespace characters between the slash and 00559 // newline. 00560 if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) { 00561 // Remember that this token needs to be cleaned. 00562 if (Tok) Tok->setFlag(Token::NeedsCleaning); 00563 00564 // Warn if there was whitespace between the backslash and newline. 
00565 if (Ptr[0] != '\n' && Ptr[0] != '\r' && Tok && !isLexingRawMode()) 00566 Diag(Ptr, diag::backslash_newline_space); 00567 00568 // Found backslash<whitespace><newline>. Parse the char after it. 00569 Size += EscapedNewLineSize; 00570 Ptr += EscapedNewLineSize; 00571 // Use slow version to accumulate a correct size field. 00572 return getCharAndSizeSlow(Ptr, Size, Tok); 00573 } 00574 00575 // Otherwise, this is not an escaped newline, just return the slash. 00576 return '\\'; 00577 } 00578 00579 // If this is a trigraph, process it. 00580 if (Ptr[0] == '?' && Ptr[1] == '?') { 00581 // If this is actually a legal trigraph (not something like "??x"), emit 00582 // a trigraph warning. If so, and if trigraphs are enabled, return it. 00583 if (char C = DecodeTrigraphChar(Ptr+2, Tok ? this : 0)) { 00584 // Remember that this token needs to be cleaned. 00585 if (Tok) Tok->setFlag(Token::NeedsCleaning); 00586 00587 Ptr += 3; 00588 Size += 3; 00589 if (C == '\\') goto Slash; 00590 return C; 00591 } 00592 } 00593 00594 // If this is neither, return a single character. 00595 ++Size; 00596 return *Ptr; 00597 } 00598 00599 00600 /// getCharAndSizeSlowNoWarn - Handle the slow/uncommon case of the 00601 /// getCharAndSizeNoWarn method. Here we know that we can accumulate into Size, 00602 /// and that we have already incremented Ptr by Size bytes. 00603 /// 00604 /// NOTE: When this method is updated, getCharAndSizeSlow (above) should 00605 /// be updated to match. 00606 char Lexer::getCharAndSizeSlowNoWarn(const char *Ptr, unsigned &Size, 00607 const LangOptions &Features) { 00608 // If we have a slash, look for an escaped newline. 00609 if (Ptr[0] == '\\') { 00610 ++Size; 00611 ++Ptr; 00612 Slash: 00613 // Common case, backslash-char where the char is not whitespace. 00614 if (!isWhitespace(Ptr[0])) return '\\'; 00615 00616 // See if we have optional whitespace characters followed by a newline. 
00617 if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) { 00618 // Found backslash<whitespace><newline>. Parse the char after it. 00619 Size += EscapedNewLineSize; 00620 Ptr += EscapedNewLineSize; 00621 00622 // Use slow version to accumulate a correct size field. 00623 return getCharAndSizeSlowNoWarn(Ptr, Size, Features); 00624 } 00625 00626 // Otherwise, this is not an escaped newline, just return the slash. 00627 return '\\'; 00628 } 00629 00630 // If this is a trigraph, process it. 00631 if (Features.Trigraphs && Ptr[0] == '?' && Ptr[1] == '?') { 00632 // If this is actually a legal trigraph (not something like "??x"), return 00633 // it. 00634 if (char C = GetTrigraphCharForLetter(Ptr[2])) { 00635 Ptr += 3; 00636 Size += 3; 00637 if (C == '\\') goto Slash; 00638 return C; 00639 } 00640 } 00641 00642 // If this is neither, return a single character. 00643 ++Size; 00644 return *Ptr; 00645 } 00646 00647 //===----------------------------------------------------------------------===// 00648 // Helper methods for lexing. 00649 //===----------------------------------------------------------------------===// 00650 00651 void Lexer::LexIdentifier(Token &Result, const char *CurPtr) { 00652 // Match [_A-Za-z0-9]*, we have already matched [_A-Za-z$] 00653 unsigned Size; 00654 unsigned char C = *CurPtr++; 00655 while (isIdentifierBody(C)) 00656 C = *CurPtr++; 00657 00658 --CurPtr; // Back up over the skipped character. 00659 00660 // Fast path, no $,\,? in identifier found. '\' might be an escaped newline 00661 // or UCN, and ? might be a trigraph for '\', an escaped newline or UCN. 00662 // FIXME: UCNs. 00663 // 00664 // TODO: Could merge these checks into a CharInfo flag to make the comparison 00665 // cheaper 00666 if (C != '\\' && C != '?' 
&& (C != '$' || !Features.DollarIdents)) { 00667 FinishIdentifier: 00668 const char *IdStart = BufferPtr; 00669 FormTokenWithChars(Result, CurPtr, tok::identifier); 00670 00671 // If we are in raw mode, return this identifier raw. There is no need to 00672 // look up identifier information or attempt to macro expand it. 00673 if (LexingRawMode) return; 00674 00675 // Fill in Result.IdentifierInfo, looking up the identifier in the 00676 // identifier table. 00677 IdentifierInfo *II = PP->LookUpIdentifierInfo(Result, IdStart); 00678 00679 // Change the kind of this identifier to the appropriate token kind, e.g. 00680 // turning "for" into a keyword. 00681 Result.setKind(II->getTokenID()); 00682 00683 // Finally, now that we know we have an identifier, pass this off to the 00684 // preprocessor, which may macro expand it or something. 00685 if (II->isHandleIdentifierCase()) 00686 PP->HandleIdentifier(Result); 00687 return; 00688 } 00689 00690 // Otherwise, $,\,? in identifier found. Enter slower path. 00691 00692 C = getCharAndSize(CurPtr, Size); 00693 while (1) { 00694 if (C == '$') { 00695 // If we hit a $ and they are not supported in identifiers, we are done. 00696 if (!Features.DollarIdents) goto FinishIdentifier; 00697 00698 // Otherwise, emit a diagnostic and continue. 00699 if (!isLexingRawMode()) 00700 Diag(CurPtr, diag::ext_dollar_in_identifier); 00701 CurPtr = ConsumeChar(CurPtr, Size, Result); 00702 C = getCharAndSize(CurPtr, Size); 00703 continue; 00704 } else if (!isIdentifierBody(C)) { // FIXME: UCNs. 00705 // Found end of identifier. 00706 goto FinishIdentifier; 00707 } 00708 00709 // Otherwise, this character is good, consume it. 00710 CurPtr = ConsumeChar(CurPtr, Size, Result); 00711 00712 C = getCharAndSize(CurPtr, Size); 00713 while (isIdentifierBody(C)) { // FIXME: UCNs. 
00714 CurPtr = ConsumeChar(CurPtr, Size, Result); 00715 C = getCharAndSize(CurPtr, Size); 00716 } 00717 } 00718 } 00719 00720 00721 /// LexNumericConstant - Lex the remainder of a integer or floating point 00722 /// constant. From[-1] is the first character lexed. Return the end of the 00723 /// constant. 00724 void Lexer::LexNumericConstant(Token &Result, const char *CurPtr) { 00725 unsigned Size; 00726 char C = getCharAndSize(CurPtr, Size); 00727 char PrevCh = 0; 00728 while (isNumberBody(C)) { // FIXME: UCNs? 00729 CurPtr = ConsumeChar(CurPtr, Size, Result); 00730 PrevCh = C; 00731 C = getCharAndSize(CurPtr, Size); 00732 } 00733 00734 // If we fell out, check for a sign, due to 1e+12. If we have one, continue. 00735 if ((C == '-' || C == '+') && (PrevCh == 'E' || PrevCh == 'e')) 00736 return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result)); 00737 00738 // If we have a hex FP constant, continue. 00739 if ((C == '-' || C == '+') && (PrevCh == 'P' || PrevCh == 'p') && 00740 (!PP || !PP->getLangOptions().CPlusPlus0x)) 00741 return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result)); 00742 00743 // Update the location of token as well as BufferPtr. 00744 const char *TokStart = BufferPtr; 00745 FormTokenWithChars(Result, CurPtr, tok::numeric_constant); 00746 Result.setLiteralData(TokStart); 00747 } 00748 00749 /// LexStringLiteral - Lex the remainder of a string literal, after having lexed 00750 /// either " or L". 00751 void Lexer::LexStringLiteral(Token &Result, const char *CurPtr, bool Wide) { 00752 const char *NulCharacter = 0; // Does this string contain the \0 character? 00753 00754 char C = getAndAdvanceChar(CurPtr, Result); 00755 while (C != '"') { 00756 // Skip escaped characters. 00757 if (C == '\\') { 00758 // Skip the escaped character. 00759 C = getAndAdvanceChar(CurPtr, Result); 00760 } else if (C == '\n' || C == '\r' || // Newline. 00761 (C == 0 && CurPtr-1 == BufferEnd)) { // End of file. 
00762 if (!isLexingRawMode() && !Features.AsmPreprocessor) 00763 Diag(BufferPtr, diag::err_unterminated_string); 00764 FormTokenWithChars(Result, CurPtr-1, tok::unknown); 00765 return; 00766 } else if (C == 0) { 00767 NulCharacter = CurPtr-1; 00768 } 00769 C = getAndAdvanceChar(CurPtr, Result); 00770 } 00771 00772 // If a nul character existed in the string, warn about it. 00773 if (NulCharacter && !isLexingRawMode()) 00774 Diag(NulCharacter, diag::null_in_string); 00775 00776 // Update the location of the token as well as the BufferPtr instance var. 00777 const char *TokStart = BufferPtr; 00778 FormTokenWithChars(Result, CurPtr, 00779 Wide ? tok::wide_string_literal : tok::string_literal); 00780 Result.setLiteralData(TokStart); 00781 } 00782 00783 /// LexAngledStringLiteral - Lex the remainder of an angled string literal, 00784 /// after having lexed the '<' character. This is used for #include filenames. 00785 void Lexer::LexAngledStringLiteral(Token &Result, const char *CurPtr) { 00786 const char *NulCharacter = 0; // Does this string contain the \0 character? 00787 const char *AfterLessPos = CurPtr; 00788 char C = getAndAdvanceChar(CurPtr, Result); 00789 while (C != '>') { 00790 // Skip escaped characters. 00791 if (C == '\\') { 00792 // Skip the escaped character. 00793 C = getAndAdvanceChar(CurPtr, Result); 00794 } else if (C == '\n' || C == '\r' || // Newline. 00795 (C == 0 && CurPtr-1 == BufferEnd)) { // End of file. 00796 // If the filename is unterminated, then it must just be a lone < 00797 // character. Return this as such. 00798 FormTokenWithChars(Result, AfterLessPos, tok::less); 00799 return; 00800 } else if (C == 0) { 00801 NulCharacter = CurPtr-1; 00802 } 00803 C = getAndAdvanceChar(CurPtr, Result); 00804 } 00805 00806 // If a nul character existed in the string, warn about it. 00807 if (NulCharacter && !isLexingRawMode()) 00808 Diag(NulCharacter, diag::null_in_string); 00809 00810 // Update the location of token as well as BufferPtr. 
00811 const char *TokStart = BufferPtr; 00812 FormTokenWithChars(Result, CurPtr, tok::angle_string_literal); 00813 Result.setLiteralData(TokStart); 00814 } 00815 00816 00817 /// LexCharConstant - Lex the remainder of a character constant, after having 00818 /// lexed either ' or L'. 00819 void Lexer::LexCharConstant(Token &Result, const char *CurPtr) { 00820 const char *NulCharacter = 0; // Does this character contain the \0 character? 00821 00822 // Handle the common case of 'x' and '\y' efficiently. 00823 char C = getAndAdvanceChar(CurPtr, Result); 00824 if (C == '\'') { 00825 if (!isLexingRawMode() && !Features.AsmPreprocessor) 00826 Diag(BufferPtr, diag::err_empty_character); 00827 FormTokenWithChars(Result, CurPtr, tok::unknown); 00828 return; 00829 } else if (C == '\\') { 00830 // Skip the escaped character. 00831 // FIXME: UCN's. 00832 C = getAndAdvanceChar(CurPtr, Result); 00833 } 00834 00835 if (C && C != '\n' && C != '\r' && CurPtr[0] == '\'') { 00836 ++CurPtr; 00837 } else { 00838 // Fall back on generic code for embedded nulls, newlines, wide chars. 00839 do { 00840 // Skip escaped characters. 00841 if (C == '\\') { 00842 // Skip the escaped character. 00843 C = getAndAdvanceChar(CurPtr, Result); 00844 } else if (C == '\n' || C == '\r' || // Newline. 00845 (C == 0 && CurPtr-1 == BufferEnd)) { // End of file. 00846 if (!isLexingRawMode() && !Features.AsmPreprocessor) 00847 Diag(BufferPtr, diag::err_unterminated_char); 00848 FormTokenWithChars(Result, CurPtr-1, tok::unknown); 00849 return; 00850 } else if (C == 0) { 00851 NulCharacter = CurPtr-1; 00852 } 00853 C = getAndAdvanceChar(CurPtr, Result); 00854 } while (C != '\''); 00855 } 00856 00857 if (NulCharacter && !isLexingRawMode()) 00858 Diag(NulCharacter, diag::null_in_char); 00859 00860 // Update the location of token as well as BufferPtr. 
00861 const char *TokStart = BufferPtr; 00862 FormTokenWithChars(Result, CurPtr, tok::char_constant); 00863 Result.setLiteralData(TokStart); 00864 } 00865 00866 /// SkipWhitespace - Efficiently skip over a series of whitespace characters. 00867 /// Update BufferPtr to point to the next non-whitespace character and return. 00868 /// 00869 /// This method forms a token and returns true if KeepWhitespaceMode is enabled. 00870 /// 00871 bool Lexer::SkipWhitespace(Token &Result, const char *CurPtr) { 00872 // Whitespace - Skip it, then return the token after the whitespace. 00873 unsigned char Char = *CurPtr; // Skip consequtive spaces efficiently. 00874 while (1) { 00875 // Skip horizontal whitespace very aggressively. 00876 while (isHorizontalWhitespace(Char)) 00877 Char = *++CurPtr; 00878 00879 // Otherwise if we have something other than whitespace, we're done. 00880 if (Char != '\n' && Char != '\r') 00881 break; 00882 00883 if (ParsingPreprocessorDirective) { 00884 // End of preprocessor directive line, let LexTokenInternal handle this. 00885 BufferPtr = CurPtr; 00886 return false; 00887 } 00888 00889 // ok, but handle newline. 00890 // The returned token is at the start of the line. 00891 Result.setFlag(Token::StartOfLine); 00892 // No leading whitespace seen so far. 00893 Result.clearFlag(Token::LeadingSpace); 00894 Char = *++CurPtr; 00895 } 00896 00897 // If this isn't immediately after a newline, there is leading space. 00898 char PrevChar = CurPtr[-1]; 00899 if (PrevChar != '\n' && PrevChar != '\r') 00900 Result.setFlag(Token::LeadingSpace); 00901 00902 // If the client wants us to return whitespace, return it now. 00903 if (isKeepWhitespaceMode()) { 00904 FormTokenWithChars(Result, CurPtr, tok::unknown); 00905 return true; 00906 } 00907 00908 BufferPtr = CurPtr; 00909 return false; 00910 } 00911 00912 // SkipBCPLComment - We have just read the // characters from input. Skip until 00913 // we find the newline character thats terminate the comment. 
Then update 00914 /// BufferPtr and return. 00915 /// 00916 /// If we're in KeepCommentMode or any CommentHandler has inserted 00917 /// some tokens, this will store the first token and return true. 00918 bool Lexer::SkipBCPLComment(Token &Result, const char *CurPtr) { 00919 // If BCPL comments aren't explicitly enabled for this language, emit an 00920 // extension warning. 00921 if (!Features.BCPLComment && !isLexingRawMode()) { 00922 Diag(BufferPtr, diag::ext_bcpl_comment); 00923 00924 // Mark them enabled so we only emit one warning for this translation 00925 // unit. 00926 Features.BCPLComment = true; 00927 } 00928 00929 // Scan over the body of the comment. The common case, when scanning, is that 00930 // the comment contains normal ascii characters with nothing interesting in 00931 // them. As such, optimize for this case with the inner loop. 00932 char C; 00933 do { 00934 C = *CurPtr; 00935 // FIXME: Speedup BCPL comment lexing. Just scan for a \n or \r character. 00936 // If we find a \n character, scan backwards, checking to see if it's an 00937 // escaped newline, like we do for block comments. 00938 00939 // Skip over characters in the fast loop. 00940 while (C != 0 && // Potentially EOF. 00941 C != '\\' && // Potentially escaped newline. 00942 C != '?' && // Potentially trigraph. 00943 C != '\n' && C != '\r') // Newline or DOS-style newline. 00944 C = *++CurPtr; 00945 00946 // If this is a newline, we're done. 00947 if (C == '\n' || C == '\r') 00948 break; // Found the newline? Break out! 00949 00950 // Otherwise, this is a hard case. Fall back on getAndAdvanceChar to 00951 // properly decode the character. Read it in raw mode to avoid emitting 00952 // diagnostics about things like trigraphs. If we see an escaped newline, 00953 // we'll handle it below. 
00954 const char *OldPtr = CurPtr; 00955 bool OldRawMode = isLexingRawMode(); 00956 LexingRawMode = true; 00957 C = getAndAdvanceChar(CurPtr, Result); 00958 LexingRawMode = OldRawMode; 00959 00960 // If the char that we finally got was a \n, then we must have had something 00961 // like <newline><newline>. We don't want to have consumed the second 00962 // newline, we want CurPtr, to end up pointing to it down below. 00963 if (C == '\n' || C == '\r') { 00964 --CurPtr; 00965 C = 'x'; // doesn't matter what this is. 00966 } 00967 00968 // If we read multiple characters, and one of those characters was a \r or 00969 // \n, then we had an escaped newline within the comment. Emit diagnostic 00970 // unless the next line is also a // comment. 00971 if (CurPtr != OldPtr+1 && C != '/' && CurPtr[0] != '/') { 00972 for (; OldPtr != CurPtr; ++OldPtr) 00973 if (OldPtr[0] == '\n' || OldPtr[0] == '\r') { 00974 // Okay, we found a // comment that ends in a newline, if the next 00975 // line is also a // comment, but has spaces, don't emit a diagnostic. 00976 if (isspace(C)) { 00977 const char *ForwardPtr = CurPtr; 00978 while (isspace(*ForwardPtr)) // Skip whitespace. 00979 ++ForwardPtr; 00980 if (ForwardPtr[0] == '/' && ForwardPtr[1] == '/') 00981 break; 00982 } 00983 00984 if (!isLexingRawMode()) 00985 Diag(OldPtr-1, diag::ext_multi_line_bcpl_comment); 00986 break; 00987 } 00988 } 00989 00990 if (CurPtr == BufferEnd+1) { --CurPtr; break; } 00991 } while (C != '\n' && C != '\r'); 00992 00993 // Found but did not consume the newline. Notify comment handlers about the 00994 // comment unless we're in a #if 0 block. 00995 if (PP && !isLexingRawMode() && 00996 PP->HandleComment(Result, SourceRange(getSourceLocation(BufferPtr), 00997 getSourceLocation(CurPtr)))) { 00998 BufferPtr = CurPtr; 00999 return true; // A token has to be returned. 01000 } 01001 01002 // If we are returning comments as tokens, return this comment as a token. 
01003 if (inKeepCommentMode()) 01004 return SaveBCPLComment(Result, CurPtr); 01005 01006 // If we are inside a preprocessor directive and we see the end of line, 01007 // return immediately, so that the lexer can return this as an EOM token. 01008 if (ParsingPreprocessorDirective || CurPtr == BufferEnd) { 01009 BufferPtr = CurPtr; 01010 return false; 01011 } 01012 01013 // Otherwise, eat the \n character. We don't care if this is a \n\r or 01014 // \r\n sequence. This is an efficiency hack (because we know the \n can't 01015 // contribute to another token), it isn't needed for correctness. Note that 01016 // this is ok even in KeepWhitespaceMode, because we would have returned the 01017 /// comment above in that mode. 01018 ++CurPtr; 01019 01020 // The next returned token is at the start of the line. 01021 Result.setFlag(Token::StartOfLine); 01022 // No leading whitespace seen so far. 01023 Result.clearFlag(Token::LeadingSpace); 01024 BufferPtr = CurPtr; 01025 return false; 01026 } 01027 01028 /// SaveBCPLComment - If in save-comment mode, package up this BCPL comment in 01029 /// an appropriate way and return it. 01030 bool Lexer::SaveBCPLComment(Token &Result, const char *CurPtr) { 01031 // If we're not in a preprocessor directive, just return the // comment 01032 // directly. 01033 FormTokenWithChars(Result, CurPtr, tok::comment); 01034 01035 if (!ParsingPreprocessorDirective) 01036 return true; 01037 01038 // If this BCPL-style comment is in a macro definition, transmogrify it into 01039 // a C-style block comment. 01040 std::string Spelling = PP->getSpelling(Result); 01041 assert(Spelling[0] == '/' && Spelling[1] == '/' && "Not bcpl comment?"); 01042 Spelling[1] = '*'; // Change prefix to "/*". 01043 Spelling += "*/"; // add suffix. 
01044 01045 Result.setKind(tok::comment); 01046 PP->CreateString(&Spelling[0], Spelling.size(), Result, 01047 Result.getLocation()); 01048 return true; 01049 } 01050 01051 /// isBlockCommentEndOfEscapedNewLine - Return true if the specified newline 01052 /// character (either \n or \r) is part of an escaped newline sequence. Issue a 01053 /// diagnostic if so. We know that the newline is inside of a block comment. 01054 static bool isEndOfBlockCommentWithEscapedNewLine(const char *CurPtr, 01055 Lexer *L) { 01056 assert(CurPtr[0] == '\n' || CurPtr[0] == '\r'); 01057 01058 // Back up off the newline. 01059 --CurPtr; 01060 01061 // If this is a two-character newline sequence, skip the other character. 01062 if (CurPtr[0] == '\n' || CurPtr[0] == '\r') { 01063 // \n\n or \r\r -> not escaped newline. 01064 if (CurPtr[0] == CurPtr[1]) 01065 return false; 01066 // \n\r or \r\n -> skip the newline. 01067 --CurPtr; 01068 } 01069 01070 // If we have horizontal whitespace, skip over it. We allow whitespace 01071 // between the slash and newline. 01072 bool HasSpace = false; 01073 while (isHorizontalWhitespace(*CurPtr) || *CurPtr == 0) { 01074 --CurPtr; 01075 HasSpace = true; 01076 } 01077 01078 // If we have a slash, we know this is an escaped newline. 01079 if (*CurPtr == '\\') { 01080 if (CurPtr[-1] != '*') return false; 01081 } else { 01082 // It isn't a slash, is it the ?? / trigraph? 01083 if (CurPtr[0] != '/' || CurPtr[-1] != '?' || CurPtr[-2] != '?' || 01084 CurPtr[-3] != '*') 01085 return false; 01086 01087 // This is the trigraph ending the comment. Emit a stern warning! 01088 CurPtr -= 2; 01089 01090 // If no trigraphs are enabled, warn that we ignored this trigraph and 01091 // ignore this * character. 
01092 if (!L->getFeatures().Trigraphs) { 01093 if (!L->isLexingRawMode()) 01094 L->Diag(CurPtr, diag::trigraph_ignored_block_comment); 01095 return false; 01096 } 01097 if (!L->isLexingRawMode()) 01098 L->Diag(CurPtr, diag::trigraph_ends_block_comment); 01099 } 01100 01101 // Warn about having an escaped newline between the */ characters. 01102 if (!L->isLexingRawMode()) 01103 L->Diag(CurPtr, diag::escaped_newline_block_comment_end); 01104 01105 // If there was space between the backslash and newline, warn about it. 01106 if (HasSpace && !L->isLexingRawMode()) 01107 L->Diag(CurPtr, diag::backslash_newline_space); 01108 01109 return true; 01110 } 01111 01112 #ifdef __SSE2__ 01113 #include <emmintrin.h> 01114 #elif __ALTIVEC__ 01115 #include <altivec.h> 01116 #undef bool 01117 #endif 01118 01119 /// SkipBlockComment - We have just read the /* characters from input. Read 01120 /// until we find the */ characters that terminate the comment. Note that we 01121 /// don't bother decoding trigraphs or escaped newlines in block comments, 01122 /// because they cannot cause the comment to end. The only thing that can 01123 /// happen is the comment could end with an escaped newline between the */ end 01124 /// of comment. 01125 /// 01126 /// If we're in KeepCommentMode or any CommentHandler has inserted 01127 /// some tokens, this will store the first token and return true. 01128 bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr) { 01129 // Scan one character past where we should, looking for a '/' character. Once 01130 // we find it, check to see if it was preceeded by a *. This common 01131 // optimization helps people who like to put a lot of * characters in their 01132 // comments. 01133 01134 // The first character we get with newlines and trigraphs skipped to handle 01135 // the degenerate /*/ case below correctly if the * has an escaped newline 01136 // after it. 
01137 unsigned CharSize; 01138 unsigned char C = getCharAndSize(CurPtr, CharSize); 01139 CurPtr += CharSize; 01140 if (C == 0 && CurPtr == BufferEnd+1) { 01141 if (!isLexingRawMode()) 01142 Diag(BufferPtr, diag::err_unterminated_block_comment); 01143 --CurPtr; 01144 01145 // KeepWhitespaceMode should return this broken comment as a token. Since 01146 // it isn't a well formed comment, just return it as an 'unknown' token. 01147 if (isKeepWhitespaceMode()) { 01148 FormTokenWithChars(Result, CurPtr, tok::unknown); 01149 return true; 01150 } 01151 01152 BufferPtr = CurPtr; 01153 return false; 01154 } 01155 01156 // Check to see if the first character after the '/*' is another /. If so, 01157 // then this slash does not end the block comment, it is part of it. 01158 if (C == '/') 01159 C = *CurPtr++; 01160 01161 while (1) { 01162 // Skip over all non-interesting characters until we find end of buffer or a 01163 // (probably ending) '/' character. 01164 if (CurPtr + 24 < BufferEnd) { 01165 // While not aligned to a 16-byte boundary. 01166 while (C != '/' && ((intptr_t)CurPtr & 0x0F) != 0) 01167 C = *CurPtr++; 01168 01169 if (C == '/') goto FoundSlash; 01170 01171 #ifdef __SSE2__ 01172 __m128i Slashes = _mm_set_epi8('/', '/', '/', '/', '/', '/', '/', '/', 01173 '/', '/', '/', '/', '/', '/', '/', '/'); 01174 while (CurPtr+16 <= BufferEnd && 01175 _mm_movemask_epi8(_mm_cmpeq_epi8(*(__m128i*)CurPtr, Slashes)) == 0) 01176 CurPtr += 16; 01177 #elif __ALTIVEC__ 01178 __vector unsigned char Slashes = { 01179 '/', '/', '/', '/', '/', '/', '/', '/', 01180 '/', '/', '/', '/', '/', '/', '/', '/' 01181 }; 01182 while (CurPtr+16 <= BufferEnd && 01183 !vec_any_eq(*(vector unsigned char*)CurPtr, Slashes)) 01184 CurPtr += 16; 01185 #else 01186 // Scan for '/' quickly. Many block comments are very large. 
01187 while (CurPtr[0] != '/' && 01188 CurPtr[1] != '/' && 01189 CurPtr[2] != '/' && 01190 CurPtr[3] != '/' && 01191 CurPtr+4 < BufferEnd) { 01192 CurPtr += 4; 01193 } 01194 #endif 01195 01196 // It has to be one of the bytes scanned, increment to it and read one. 01197 C = *CurPtr++; 01198 } 01199 01200 // Loop to scan the remainder. 01201 while (C != '/' && C != '\0') 01202 C = *CurPtr++; 01203 01204 FoundSlash: 01205 if (C == '/') { 01206 if (CurPtr[-2] == '*') // We found the final */. We're done! 01207 break; 01208 01209 if ((CurPtr[-2] == '\n' || CurPtr[-2] == '\r')) { 01210 if (isEndOfBlockCommentWithEscapedNewLine(CurPtr-2, this)) { 01211 // We found the final */, though it had an escaped newline between the 01212 // * and /. We're done! 01213 break; 01214 } 01215 } 01216 if (CurPtr[0] == '*' && CurPtr[1] != '/') { 01217 // If this is a /* inside of the comment, emit a warning. Don't do this 01218 // if this is a /*/, which will end the comment. This misses cases with 01219 // embedded escaped newlines, but oh well. 01220 if (!isLexingRawMode()) 01221 Diag(CurPtr-1, diag::warn_nested_block_comment); 01222 } 01223 } else if (C == 0 && CurPtr == BufferEnd+1) { 01224 if (!isLexingRawMode()) 01225 Diag(BufferPtr, diag::err_unterminated_block_comment); 01226 // Note: the user probably forgot a */. We could continue immediately 01227 // after the /*, but this would involve lexing a lot of what really is the 01228 // comment, which surely would confuse the parser. 01229 --CurPtr; 01230 01231 // KeepWhitespaceMode should return this broken comment as a token. Since 01232 // it isn't a well formed comment, just return it as an 'unknown' token. 01233 if (isKeepWhitespaceMode()) { 01234 FormTokenWithChars(Result, CurPtr, tok::unknown); 01235 return true; 01236 } 01237 01238 BufferPtr = CurPtr; 01239 return false; 01240 } 01241 C = *CurPtr++; 01242 } 01243 01244 // Notify comment handlers about the comment unless we're in a #if 0 block. 
01245 if (PP && !isLexingRawMode() && 01246 PP->HandleComment(Result, SourceRange(getSourceLocation(BufferPtr), 01247 getSourceLocation(CurPtr)))) { 01248 BufferPtr = CurPtr; 01249 return true; // A token has to be returned. 01250 } 01251 01252 // If we are returning comments as tokens, return this comment as a token. 01253 if (inKeepCommentMode()) { 01254 FormTokenWithChars(Result, CurPtr, tok::comment); 01255 return true; 01256 } 01257 01258 // It is common for the tokens immediately after a /**/ comment to be 01259 // whitespace. Instead of going through the big switch, handle it 01260 // efficiently now. This is safe even in KeepWhitespaceMode because we would 01261 // have already returned above with the comment as a token. 01262 if (isHorizontalWhitespace(*CurPtr)) { 01263 Result.setFlag(Token::LeadingSpace); 01264 SkipWhitespace(Result, CurPtr+1); 01265 return false; 01266 } 01267 01268 // Otherwise, just return so that the next character will be lexed as a token. 01269 BufferPtr = CurPtr; 01270 Result.setFlag(Token::LeadingSpace); 01271 return false; 01272 } 01273 01274 //===----------------------------------------------------------------------===// 01275 // Primary Lexing Entry Points 01276 //===----------------------------------------------------------------------===// 01277 01278 /// ReadToEndOfLine - Read the rest of the current preprocessor line as an 01279 /// uninterpreted string. This switches the lexer out of directive mode. 01280 std::string Lexer::ReadToEndOfLine() { 01281 assert(ParsingPreprocessorDirective && ParsingFilename == false && 01282 "Must be in a preprocessing directive!"); 01283 std::string Result; 01284 Token Tmp; 01285 01286 // CurPtr - Cache BufferPtr in an automatic variable. 01287 const char *CurPtr = BufferPtr; 01288 while (1) { 01289 char Char = getAndAdvanceChar(CurPtr, Tmp); 01290 switch (Char) { 01291 default: 01292 Result += Char; 01293 break; 01294 case 0: // Null. 01295 // Found end of file? 
01296 if (CurPtr-1 != BufferEnd) { 01297 // Nope, normal character, continue. 01298 Result += Char; 01299 break; 01300 } 01301 // FALL THROUGH. 01302 case '\r': 01303 case '\n': 01304 // Okay, we found the end of the line. First, back up past the \0, \r, \n. 01305 assert(CurPtr[-1] == Char && "Trigraphs for newline?"); 01306 BufferPtr = CurPtr-1; 01307 01308 // Next, lex the character, which should handle the EOM transition. 01309 Lex(Tmp); 01310 assert(Tmp.is(tok::eom) && "Unexpected token!"); 01311 01312 // Finally, we're done, return the string we found. 01313 return Result; 01314 } 01315 } 01316 } 01317 01318 /// LexEndOfFile - CurPtr points to the end of this file. Handle this 01319 /// condition, reporting diagnostics and handling other edge cases as required. 01320 /// This returns true if Result contains a token, false if PP.Lex should be 01321 /// called again. 01322 bool Lexer::LexEndOfFile(Token &Result, const char *CurPtr) { 01323 // If we hit the end of the file while parsing a preprocessor directive, 01324 // end the preprocessor directive first. The next token returned will 01325 // then be the end of file. 01326 if (ParsingPreprocessorDirective) { 01327 // Done parsing the "line". 01328 ParsingPreprocessorDirective = false; 01329 // Update the location of token as well as BufferPtr. 01330 FormTokenWithChars(Result, CurPtr, tok::eom); 01331 01332 // Restore comment saving mode, in case it was disabled for directive. 01333 SetCommentRetentionState(PP->getCommentRetentionState()); 01334 return true; // Have a token. 01335 } 01336 01337 // If we are in raw mode, return this event as an EOF token. Let the caller 01338 // that put us in raw mode handle the event. 
01339 if (isLexingRawMode()) { 01340 Result.startToken(); 01341 BufferPtr = BufferEnd; 01342 FormTokenWithChars(Result, BufferEnd, tok::eof); 01343 return true; 01344 } 01345 01346 // Otherwise, check if we are code-completing, then issue diagnostics for 01347 // unterminated #if and missing newline. 01348 01349 if (PP && PP->isCodeCompletionFile(FileLoc)) { 01350 // We're at the end of the file, but we've been asked to consider the 01351 // end of the file to be a code-completion token. Return the 01352 // code-completion token. 01353 Result.startToken(); 01354 FormTokenWithChars(Result, CurPtr, tok::code_completion); 01355 01356 // Only do the eof -> code_completion translation once. 01357 PP->SetCodeCompletionPoint(0, 0, 0); 01358 return true; 01359 } 01360 01361 // If we are in a #if directive, emit an error. 01362 while (!ConditionalStack.empty()) { 01363 PP->Diag(ConditionalStack.back().IfLoc, 01364 diag::err_pp_unterminated_conditional); 01365 ConditionalStack.pop_back(); 01366 } 01367 01368 // C99 5.1.1.2p2: If the file is non-empty and didn't end in a newline, issue 01369 // a pedwarn. 01370 if (CurPtr != BufferStart && (CurPtr[-1] != '\n' && CurPtr[-1] != '\r')) 01371 Diag(BufferEnd, diag::ext_no_newline_eof) 01372 << CodeModificationHint::CreateInsertion(getSourceLocation(BufferEnd), 01373 "\n"); 01374 01375 BufferPtr = CurPtr; 01376 01377 // Finally, let the preprocessor handle this. 01378 return PP->HandleEndOfFile(Result); 01379 } 01380 01381 /// isNextPPTokenLParen - Return 1 if the next unexpanded token lexed from 01382 /// the specified lexer will return a tok::l_paren token, 0 if it is something 01383 /// else and 2 if there are no more tokens in the buffer controlled by the 01384 /// lexer. 01385 unsigned Lexer::isNextPPTokenLParen() { 01386 assert(!LexingRawMode && "How can we expand a macro from a skipping buffer?"); 01387 01388 // Switch to 'skipping' mode. 
This will ensure that we can lex a token 01389 // without emitting diagnostics, disables macro expansion, and will cause EOF 01390 // to return an EOF token instead of popping the include stack. 01391 LexingRawMode = true; 01392 01393 // Save state that can be changed while lexing so that we can restore it. 01394 const char *TmpBufferPtr = BufferPtr; 01395 bool inPPDirectiveMode = ParsingPreprocessorDirective; 01396 01397 Token Tok; 01398 Tok.startToken(); 01399 LexTokenInternal(Tok); 01400 01401 // Restore state that may have changed. 01402 BufferPtr = TmpBufferPtr; 01403 ParsingPreprocessorDirective = inPPDirectiveMode; 01404 01405 // Restore the lexer back to non-skipping mode. 01406 LexingRawMode = false; 01407 01408 if (Tok.is(tok::eof)) 01409 return 2; 01410 return Tok.is(tok::l_paren); 01411 } 01412 01413 /// FindConflictEnd - Find the end of a version control conflict marker. 01414 static const char *FindConflictEnd(const char *CurPtr, const char *BufferEnd) { 01415 llvm::StringRef RestOfBuffer(CurPtr+7, BufferEnd-CurPtr-7); 01416 size_t Pos = RestOfBuffer.find(">>>>>>>"); 01417 while (Pos != llvm::StringRef::npos) { 01418 // Must occur at start of line. 01419 if (RestOfBuffer[Pos-1] != '\r' && 01420 RestOfBuffer[Pos-1] != '\n') { 01421 RestOfBuffer = RestOfBuffer.substr(Pos+7); 01422 continue; 01423 } 01424 return RestOfBuffer.data()+Pos; 01425 } 01426 return 0; 01427 } 01428 01429 /// IsStartOfConflictMarker - If the specified pointer is the start of a version 01430 /// control conflict marker like '<<<<<<<', recognize it as such, emit an error 01431 /// and recover nicely. This returns true if it is a conflict marker and false 01432 /// if not. 01433 bool Lexer::IsStartOfConflictMarker(const char *CurPtr) { 01434 // Only a conflict marker if it starts at the beginning of a line. 01435 if (CurPtr != BufferStart && 01436 CurPtr[-1] != '\n' && CurPtr[-1] != '\r') 01437 return false; 01438 01439 // Check to see if we have <<<<<<<. 
01440 if (BufferEnd-CurPtr < 8 || 01441 llvm::StringRef(CurPtr, 7) != "<<<<<<<") 01442 return false; 01443 01444 // If we have a situation where we don't care about conflict markers, ignore 01445 // it. 01446 if (IsInConflictMarker || isLexingRawMode()) 01447 return false; 01448 01449 // Check to see if there is a >>>>>>> somewhere in the buffer at the start of 01450 // a line to terminate this conflict marker. 01451 if (FindConflictEnd(CurPtr+7, BufferEnd)) { 01452 // We found a match. We are really in a conflict marker. 01453 // Diagnose this, and ignore to the end of line. 01454 Diag(CurPtr, diag::err_conflict_marker); 01455 IsInConflictMarker = true; 01456 01457 // Skip ahead to the end of line. We know this exists because the 01458 // end-of-conflict marker starts with \r or \n. 01459 while (*CurPtr != '\r' && *CurPtr != '\n') { 01460 assert(CurPtr != BufferEnd && "Didn't find end of line"); 01461 ++CurPtr; 01462 } 01463 BufferPtr = CurPtr; 01464 return true; 01465 } 01466 01467 // No end of conflict marker found. 01468 return false; 01469 } 01470 01471 01472 /// HandleEndOfConflictMarker - If this is a '=======' or '|||||||' or '>>>>>>>' 01473 /// marker, then it is the end of a conflict marker. Handle it by ignoring up 01474 /// until the end of the line. This returns true if it is a conflict marker and 01475 /// false if not. 01476 bool Lexer::HandleEndOfConflictMarker(const char *CurPtr) { 01477 // Only a conflict marker if it starts at the beginning of a line. 01478 if (CurPtr != BufferStart && 01479 CurPtr[-1] != '\n' && CurPtr[-1] != '\r') 01480 return false; 01481 01482 // If we have a situation where we don't care about conflict markers, ignore 01483 // it. 01484 if (!IsInConflictMarker || isLexingRawMode()) 01485 return false; 01486 01487 // Check to see if we have the marker (7 characters in a row). 
01488 for (unsigned i = 1; i != 7; ++i) 01489 if (CurPtr[i] != CurPtr[0]) 01490 return false; 01491 01492 // If we do have it, search for the end of the conflict marker. This could 01493 // fail if it got skipped with a '#if 0' or something. Note that CurPtr might 01494 // be the end of conflict marker. 01495 if (const char *End = FindConflictEnd(CurPtr, BufferEnd)) { 01496 CurPtr = End; 01497 01498 // Skip ahead to the end of line. 01499 while (CurPtr != BufferEnd && *CurPtr != '\r' && *CurPtr != '\n') 01500 ++CurPtr; 01501 01502 BufferPtr = CurPtr; 01503 01504 // No longer in the conflict marker. 01505 IsInConflictMarker = false; 01506 return true; 01507 } 01508 01509 return false; 01510 } 01511 01512 01513 /// LexTokenInternal - This implements a simple C family lexer. It is an 01514 /// extremely performance critical piece of code. This assumes that the buffer 01515 /// has a null character at the end of the file. This returns a preprocessing 01516 /// token, not a normal token, as such, it is an internal interface. It assumes 01517 /// that the Flags of result have been cleared before calling this. 01518 void Lexer::LexTokenInternal(Token &Result) { 01519 LexNextToken: 01520 // New token, can't need cleaning yet. 01521 Result.clearFlag(Token::NeedsCleaning); 01522 Result.setIdentifierInfo(0); 01523 01524 // CurPtr - Cache BufferPtr in an automatic variable. 01525 const char *CurPtr = BufferPtr; 01526 01527 // Small amounts of horizontal whitespace is very common between tokens. 01528 if ((*CurPtr == ' ') || (*CurPtr == '\t')) { 01529 ++CurPtr; 01530 while ((*CurPtr == ' ') || (*CurPtr == '\t')) 01531 ++CurPtr; 01532 01533 // If we are keeping whitespace and other tokens, just return what we just 01534 // skipped. The next lexer invocation will return the token after the 01535 // whitespace. 
01536 if (isKeepWhitespaceMode()) { 01537 FormTokenWithChars(Result, CurPtr, tok::unknown); 01538 return; 01539 } 01540 01541 BufferPtr = CurPtr; 01542 Result.setFlag(Token::LeadingSpace); 01543 } 01544 01545 unsigned SizeTmp, SizeTmp2; // Temporaries for use in cases below. 01546 01547 // Read a character, advancing over it. 01548 char Char = getAndAdvanceChar(CurPtr, Result); 01549 tok::TokenKind Kind; 01550 01551 switch (Char) { 01552 case 0: // Null. 01553 // Found end of file? 01554 if (CurPtr-1 == BufferEnd) { 01555 // Read the PP instance variable into an automatic variable, because 01556 // LexEndOfFile will often delete 'this'. 01557 Preprocessor *PPCache = PP; 01558 if (LexEndOfFile(Result, CurPtr-1)) // Retreat back into the file. 01559 return; // Got a token to return. 01560 assert(PPCache && "Raw buffer::LexEndOfFile should return a token"); 01561 return PPCache->Lex(Result); 01562 } 01563 01564 if (!isLexingRawMode()) 01565 Diag(CurPtr-1, diag::null_in_file); 01566 Result.setFlag(Token::LeadingSpace); 01567 if (SkipWhitespace(Result, CurPtr)) 01568 return; // KeepWhitespaceMode 01569 01570 goto LexNextToken; // GCC isn't tail call eliminating. 01571 01572 case 26: // DOS & CP/M EOF: "^Z". 01573 // If we're in Microsoft extensions mode, treat this as end of file. 01574 if (Features.Microsoft) { 01575 // Read the PP instance variable into an automatic variable, because 01576 // LexEndOfFile will often delete 'this'. 01577 Preprocessor *PPCache = PP; 01578 if (LexEndOfFile(Result, CurPtr-1)) // Retreat back into the file. 01579 return; // Got a token to return. 01580 assert(PPCache && "Raw buffer::LexEndOfFile should return a token"); 01581 return PPCache->Lex(Result); 01582 } 01583 // If Microsoft extensions are disabled, this is just random garbage. 
01584 Kind = tok::unknown; 01585 break; 01586 01587 case '\n': 01588 case '\r': 01589 // If we are inside a preprocessor directive and we see the end of line, 01590 // we know we are done with the directive, so return an EOM token. 01591 if (ParsingPreprocessorDirective) { 01592 // Done parsing the "line". 01593 ParsingPreprocessorDirective = false; 01594 01595 // Restore comment saving mode, in case it was disabled for directive. 01596 SetCommentRetentionState(PP->getCommentRetentionState()); 01597 01598 // Since we consumed a newline, we are back at the start of a line. 01599 IsAtStartOfLine = true; 01600 01601 Kind = tok::eom; 01602 break; 01603 } 01604 // The returned token is at the start of the line. 01605 Result.setFlag(Token::StartOfLine); 01606 // No leading whitespace seen so far. 01607 Result.clearFlag(Token::LeadingSpace); 01608 01609 if (SkipWhitespace(Result, CurPtr)) 01610 return; // KeepWhitespaceMode 01611 goto LexNextToken; // GCC isn't tail call eliminating. 01612 case ' ': 01613 case '\t': 01614 case '\f': 01615 case '\v': 01616 SkipHorizontalWhitespace: 01617 Result.setFlag(Token::LeadingSpace); 01618 if (SkipWhitespace(Result, CurPtr)) 01619 return; // KeepWhitespaceMode 01620 01621 SkipIgnoredUnits: 01622 CurPtr = BufferPtr; 01623 01624 // If the next token is obviously a // or /* */ comment, skip it efficiently 01625 // too (without going through the big switch stmt). 01626 if (CurPtr[0] == '/' && CurPtr[1] == '/' && !inKeepCommentMode() && 01627 Features.BCPLComment) { 01628 if (SkipBCPLComment(Result, CurPtr+2)) 01629 return; // There is a token to return. 01630 goto SkipIgnoredUnits; 01631 } else if (CurPtr[0] == '/' && CurPtr[1] == '*' && !inKeepCommentMode()) { 01632 if (SkipBlockComment(Result, CurPtr+2)) 01633 return; // There is a token to return. 01634 goto SkipIgnoredUnits; 01635 } else if (isHorizontalWhitespace(*CurPtr)) { 01636 goto SkipHorizontalWhitespace; 01637 } 01638 goto LexNextToken; // GCC isn't tail call eliminating. 
01639 01640 // C99 6.4.4.1: Integer Constants. 01641 // C99 6.4.4.2: Floating Constants. 01642 case '0': case '1': case '2': case '3': case '4': 01643 case '5': case '6': case '7': case '8': case '9': 01644 // Notify MIOpt that we read a non-whitespace/non-comment token. 01645 MIOpt.ReadToken(); 01646 return LexNumericConstant(Result, CurPtr); 01647 01648 case 'L': // Identifier (Loony) or wide literal (L'x' or L"xyz"). 01649 // Notify MIOpt that we read a non-whitespace/non-comment token. 01650 MIOpt.ReadToken(); 01651 Char = getCharAndSize(CurPtr, SizeTmp); 01652 01653 // Wide string literal. 01654 if (Char == '"') 01655 return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result), 01656 true); 01657 01658 // Wide character constant. 01659 if (Char == '\'') 01660 return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result)); 01661 // FALL THROUGH, treating L like the start of an identifier. 01662 01663 // C99 6.4.2: Identifiers. 01664 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G': 01665 case 'H': case 'I': case 'J': case 'K': /*'L'*/case 'M': case 'N': 01666 case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U': 01667 case 'V': case 'W': case 'X': case 'Y': case 'Z': 01668 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g': 01669 case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n': 01670 case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u': 01671 case 'v': case 'w': case 'x': case 'y': case 'z': 01672 case '_': 01673 // Notify MIOpt that we read a non-whitespace/non-comment token. 01674 MIOpt.ReadToken(); 01675 return LexIdentifier(Result, CurPtr); 01676 01677 case '$': // $ in identifiers. 01678 if (Features.DollarIdents) { 01679 if (!isLexingRawMode()) 01680 Diag(CurPtr-1, diag::ext_dollar_in_identifier); 01681 // Notify MIOpt that we read a non-whitespace/non-comment token. 
01682 MIOpt.ReadToken(); 01683 return LexIdentifier(Result, CurPtr); 01684 } 01685 01686 Kind = tok::unknown; 01687 break; 01688 01689 // C99 6.4.4: Character Constants. 01690 case '\'': 01691 // Notify MIOpt that we read a non-whitespace/non-comment token. 01692 MIOpt.ReadToken(); 01693 return LexCharConstant(Result, CurPtr); 01694 01695 // C99 6.4.5: String Literals. 01696 case '"': 01697 // Notify MIOpt that we read a non-whitespace/non-comment token. 01698 MIOpt.ReadToken(); 01699 return LexStringLiteral(Result, CurPtr, false); 01700 01701 // C99 6.4.6: Punctuators. 01702 case '?': 01703 Kind = tok::question; 01704 break; 01705 case '[': 01706 Kind = tok::l_square; 01707 break; 01708 case ']': 01709 Kind = tok::r_square; 01710 break; 01711 case '(': 01712 Kind = tok::l_paren; 01713 break; 01714 case ')': 01715 Kind = tok::r_paren; 01716 break; 01717 case '{': 01718 Kind = tok::l_brace; 01719 break; 01720 case '}': 01721 Kind = tok::r_brace; 01722 break; 01723 case '.': 01724 Char = getCharAndSize(CurPtr, SizeTmp); 01725 if (Char >= '0' && Char <= '9') { 01726 // Notify MIOpt that we read a non-whitespace/non-comment token. 01727 MIOpt.ReadToken(); 01728 01729 return LexNumericConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result)); 01730 } else if (Features.CPlusPlus && Char == '*') { 01731 Kind = tok::periodstar; 01732 CurPtr += SizeTmp; 01733 } else if (Char == '.' 
&& 01734 getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '.') { 01735 Kind = tok::ellipsis; 01736 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 01737 SizeTmp2, Result); 01738 } else { 01739 Kind = tok::period; 01740 } 01741 break; 01742 case '&': 01743 Char = getCharAndSize(CurPtr, SizeTmp); 01744 if (Char == '&') { 01745 Kind = tok::ampamp; 01746 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 01747 } else if (Char == '=') { 01748 Kind = tok::ampequal; 01749 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 01750 } else { 01751 Kind = tok::amp; 01752 } 01753 break; 01754 case '*': 01755 if (getCharAndSize(CurPtr, SizeTmp) == '=') { 01756 Kind = tok::starequal; 01757 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 01758 } else { 01759 Kind = tok::star; 01760 } 01761 break; 01762 case '+': 01763 Char = getCharAndSize(CurPtr, SizeTmp); 01764 if (Char == '+') { 01765 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 01766 Kind = tok::plusplus; 01767 } else if (Char == '=') { 01768 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 01769 Kind = tok::plusequal; 01770 } else { 01771 Kind = tok::plus; 01772 } 01773 break; 01774 case '-': 01775 Char = getCharAndSize(CurPtr, SizeTmp); 01776 if (Char == '-') { // -- 01777 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 01778 Kind = tok::minusminus; 01779 } else if (Char == '>' && Features.CPlusPlus && 01780 getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '*') { // C++ ->* 01781 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 01782 SizeTmp2, Result); 01783 Kind = tok::arrowstar; 01784 } else if (Char == '>') { // -> 01785 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 01786 Kind = tok::arrow; 01787 } else if (Char == '=') { // -= 01788 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 01789 Kind = tok::minusequal; 01790 } else { 01791 Kind = tok::minus; 01792 } 01793 break; 01794 case '~': 01795 Kind = tok::tilde; 01796 break; 01797 case '!': 01798 if (getCharAndSize(CurPtr, SizeTmp) == '=') { 01799 Kind = tok::exclaimequal; 
01800 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 01801 } else { 01802 Kind = tok::exclaim; 01803 } 01804 break; 01805 case '/': 01806 // 6.4.9: Comments 01807 Char = getCharAndSize(CurPtr, SizeTmp); 01808 if (Char == '/') { // BCPL comment. 01809 // Even if BCPL comments are disabled (e.g. in C89 mode), we generally 01810 // want to lex this as a comment. There is one problem with this though, 01811 // that in one particular corner case, this can change the behavior of the 01812 // resultant program. For example, in "foo //**/ bar", C89 would lex 01813 // this as "foo / bar" and languages with BCPL comments would lex it as 01814 // "foo". Check to see if the character after the second slash is a '*'. 01815 // If so, we will lex that as a "/" instead of the start of a comment. 01816 if (Features.BCPLComment || 01817 getCharAndSize(CurPtr+SizeTmp, SizeTmp2) != '*') { 01818 if (SkipBCPLComment(Result, ConsumeChar(CurPtr, SizeTmp, Result))) 01819 return; // There is a token to return. 01820 01821 // It is common for the tokens immediately after a // comment to be 01822 // whitespace (indentation for the next line). Instead of going through 01823 // the big switch, handle it efficiently now. 01824 goto SkipIgnoredUnits; 01825 } 01826 } 01827 01828 if (Char == '*') { // /**/ comment. 01829 if (SkipBlockComment(Result, ConsumeChar(CurPtr, SizeTmp, Result))) 01830 return; // There is a token to return. 01831 goto LexNextToken; // GCC isn't tail call eliminating. 
01832 } 01833 01834 if (Char == '=') { 01835 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 01836 Kind = tok::slashequal; 01837 } else { 01838 Kind = tok::slash; 01839 } 01840 break; 01841 case '%': 01842 Char = getCharAndSize(CurPtr, SizeTmp); 01843 if (Char == '=') { 01844 Kind = tok::percentequal; 01845 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 01846 } else if (Features.Digraphs && Char == '>') { 01847 Kind = tok::r_brace; // '%>' -> '}' 01848 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 01849 } else if (Features.Digraphs && Char == ':') { 01850 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 01851 Char = getCharAndSize(CurPtr, SizeTmp); 01852 if (Char == '%' && getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == ':') { 01853 Kind = tok::hashhash; // '%:%:' -> '##' 01854 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 01855 SizeTmp2, Result); 01856 } else if (Char == '@' && Features.Microsoft) { // %:@ -> #@ -> Charize 01857 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 01858 if (!isLexingRawMode()) 01859 Diag(BufferPtr, diag::charize_microsoft_ext); 01860 Kind = tok::hashat; 01861 } else { // '%:' -> '#' 01862 // We parsed a # character. If this occurs at the start of the line, 01863 // it's actually the start of a preprocessing directive. Callback to 01864 // the preprocessor to handle it. 01865 // FIXME: -fpreprocessed mode?? 01866 if (Result.isAtStartOfLine() && !LexingRawMode && !Is_PragmaLexer) { 01867 FormTokenWithChars(Result, CurPtr, tok::hash); 01868 PP->HandleDirective(Result); 01869 01870 // As an optimization, if the preprocessor didn't switch lexers, tail 01871 // recurse. 01872 if (PP->isCurrentLexer(this)) { 01873 // Start a new token. If this is a #include or something, the PP may 01874 // want us starting at the beginning of the line again. If so, set 01875 // the StartOfLine flag. 
01876 if (IsAtStartOfLine) { 01877 Result.setFlag(Token::StartOfLine); 01878 IsAtStartOfLine = false; 01879 } 01880 goto LexNextToken; // GCC isn't tail call eliminating. 01881 } 01882 01883 return PP->Lex(Result); 01884 } 01885 01886 Kind = tok::hash; 01887 } 01888 } else { 01889 Kind = tok::percent; 01890 } 01891 break; 01892 case '<': 01893 Char = getCharAndSize(CurPtr, SizeTmp); 01894 if (ParsingFilename) { 01895 return LexAngledStringLiteral(Result, CurPtr); 01896 } else if (Char == '<') { 01897 char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2); 01898 if (After == '=') { 01899 Kind = tok::lesslessequal; 01900 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 01901 SizeTmp2, Result); 01902 } else if (After == '<' && IsStartOfConflictMarker(CurPtr-1)) { 01903 // If this is actually a '<<<<<<<' version control conflict marker, 01904 // recognize it as such and recover nicely. 01905 goto LexNextToken; 01906 } else { 01907 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 01908 Kind = tok::lessless; 01909 } 01910 } else if (Char == '=') { 01911 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 01912 Kind = tok::lessequal; 01913 } else if (Features.Digraphs && Char == ':') { // '<:' -> '[' 01914 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 01915 Kind = tok::l_square; 01916 } else if (Features.Digraphs && Char == '%') { // '<%' -> '{' 01917 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 01918 Kind = tok::l_brace; 01919 } else { 01920 Kind = tok::less; 01921 } 01922 break; 01923 case '>': 01924 Char = getCharAndSize(CurPtr, SizeTmp); 01925 if (Char == '=') { 01926 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 01927 Kind = tok::greaterequal; 01928 } else if (Char == '>') { 01929 char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2); 01930 if (After == '=') { 01931 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 01932 SizeTmp2, Result); 01933 Kind = tok::greatergreaterequal; 01934 } else if (After == '>' && HandleEndOfConflictMarker(CurPtr-1)) { 
01935 // If this is '>>>>>>>' and we're in a conflict marker, ignore it. 01936 goto LexNextToken; 01937 } else { 01938 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 01939 Kind = tok::greatergreater; 01940 } 01941 01942 } else { 01943 Kind = tok::greater; 01944 } 01945 break; 01946 case '^': 01947 Char = getCharAndSize(CurPtr, SizeTmp); 01948 if (Char == '=') { 01949 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 01950 Kind = tok::caretequal; 01951 } else { 01952 Kind = tok::caret; 01953 } 01954 break; 01955 case '|': 01956 Char = getCharAndSize(CurPtr, SizeTmp); 01957 if (Char == '=') { 01958 Kind = tok::pipeequal; 01959 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 01960 } else if (Char == '|') { 01961 // If this is '|||||||' and we're in a conflict marker, ignore it. 01962 if (CurPtr[1] == '|' && HandleEndOfConflictMarker(CurPtr-1)) 01963 goto LexNextToken; 01964 Kind = tok::pipepipe; 01965 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 01966 } else { 01967 Kind = tok::pipe; 01968 } 01969 break; 01970 case ':': 01971 Char = getCharAndSize(CurPtr, SizeTmp); 01972 if (Features.Digraphs && Char == '>') { 01973 Kind = tok::r_square; // ':>' -> ']' 01974 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 01975 } else if (Features.CPlusPlus && Char == ':') { 01976 Kind = tok::coloncolon; 01977 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 01978 } else { 01979 Kind = tok::colon; 01980 } 01981 break; 01982 case ';': 01983 Kind = tok::semi; 01984 break; 01985 case '=': 01986 Char = getCharAndSize(CurPtr, SizeTmp); 01987 if (Char == '=') { 01988 // If this is '=======' and we're in a conflict marker, ignore it. 
01989 if (CurPtr[1] == '=' && HandleEndOfConflictMarker(CurPtr-1)) 01990 goto LexNextToken; 01991 01992 Kind = tok::equalequal; 01993 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 01994 } else { 01995 Kind = tok::equal; 01996 } 01997 break; 01998 case ',': 01999 Kind = tok::comma; 02000 break; 02001 case '#': 02002 Char = getCharAndSize(CurPtr, SizeTmp); 02003 if (Char == '#') { 02004 Kind = tok::hashhash; 02005 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 02006 } else if (Char == '@' && Features.Microsoft) { // #@ -> Charize 02007 Kind = tok::hashat; 02008 if (!isLexingRawMode()) 02009 Diag(BufferPtr, diag::charize_microsoft_ext); 02010 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 02011 } else { 02012 // We parsed a # character. If this occurs at the start of the line, 02013 // it's actually the start of a preprocessing directive. Callback to 02014 // the preprocessor to handle it. 02015 // FIXME: -fpreprocessed mode?? 02016 if (Result.isAtStartOfLine() && !LexingRawMode && !Is_PragmaLexer) { 02017 FormTokenWithChars(Result, CurPtr, tok::hash); 02018 PP->HandleDirective(Result); 02019 02020 // As an optimization, if the preprocessor didn't switch lexers, tail 02021 // recurse. 02022 if (PP->isCurrentLexer(this)) { 02023 // Start a new token. If this is a #include or something, the PP may 02024 // want us starting at the beginning of the line again. If so, set 02025 // the StartOfLine flag. 02026 if (IsAtStartOfLine) { 02027 Result.setFlag(Token::StartOfLine); 02028 IsAtStartOfLine = false; 02029 } 02030 goto LexNextToken; // GCC isn't tail call eliminating. 02031 } 02032 return PP->Lex(Result); 02033 } 02034 02035 Kind = tok::hash; 02036 } 02037 break; 02038 02039 case '@': 02040 // Objective C support. 02041 if (CurPtr[-1] == '@' && Features.ObjC1) 02042 Kind = tok::at; 02043 else 02044 Kind = tok::unknown; 02045 break; 02046 02047 case '\\': 02048 // FIXME: UCN's. 02049 // FALL THROUGH. 
02050 default: 02051 Kind = tok::unknown; 02052 break; 02053 } 02054 02055 // Notify MIOpt that we read a non-whitespace/non-comment token. 02056 MIOpt.ReadToken(); 02057 02058 // Update the location of token as well as BufferPtr. 02059 FormTokenWithChars(Result, CurPtr, Kind); 02060 }