clang API Documentation
00001 //===--- OnDiskHashTable.h - On-Disk Hash Table Implementation --*- C++ -*-===// 00002 // 00003 // The LLVM Compiler Infrastructure 00004 // 00005 // This file is distributed under the University of Illinois Open Source 00006 // License. See LICENSE.TXT for details. 00007 // 00008 //===----------------------------------------------------------------------===// 00009 // 00010 // This file defines facilities for reading and writing on-disk hash 00011 // tables. 00012 // 00013 //===----------------------------------------------------------------------===// 00014 #ifndef LLVM_CLANG_BASIC_ON_DISK_HASH_TABLE_H 00015 #define LLVM_CLANG_BASIC_ON_DISK_HASH_TABLE_H 00016 00017 #include "llvm/Support/Allocator.h" 00018 #include "llvm/Support/DataTypes.h" 00019 #include "llvm/Support/MathExtras.h" 00020 #include "llvm/Support/raw_ostream.h" 00021 #include "llvm/Support/Host.h" 00022 #include <cassert> 00023 #include <cstdlib> 00024 00025 namespace clang { 00026 00027 namespace io { 00028 00029 typedef uint32_t Offset; 00030 00031 inline void Emit8(raw_ostream& Out, uint32_t V) { 00032 Out << (unsigned char)(V); 00033 } 00034 00035 inline void Emit16(raw_ostream& Out, uint32_t V) { 00036 Out << (unsigned char)(V); 00037 Out << (unsigned char)(V >> 8); 00038 assert((V >> 16) == 0); 00039 } 00040 00041 inline void Emit24(raw_ostream& Out, uint32_t V) { 00042 Out << (unsigned char)(V); 00043 Out << (unsigned char)(V >> 8); 00044 Out << (unsigned char)(V >> 16); 00045 assert((V >> 24) == 0); 00046 } 00047 00048 inline void Emit32(raw_ostream& Out, uint32_t V) { 00049 Out << (unsigned char)(V); 00050 Out << (unsigned char)(V >> 8); 00051 Out << (unsigned char)(V >> 16); 00052 Out << (unsigned char)(V >> 24); 00053 } 00054 00055 inline void Emit64(raw_ostream& Out, uint64_t V) { 00056 Out << (unsigned char)(V); 00057 Out << (unsigned char)(V >> 8); 00058 Out << (unsigned char)(V >> 16); 00059 Out << (unsigned char)(V >> 24); 00060 Out << (unsigned char)(V >> 32); 00061 Out << (unsigned char)(V >> 40); 00062 Out << (unsigned char)(V >> 48); 00063 Out << (unsigned char)(V >> 56); 00064 } 00065 00066 inline void Pad(raw_ostream& Out, unsigned A) { 00067 Offset off = (Offset) Out.tell(); 00068 uint32_t n = ((uintptr_t)(off+A-1) & ~(uintptr_t)(A-1)) - off; 00069 for (; n ; --n) 00070 Emit8(Out, 0); 00071 } 00072 00073 inline uint16_t ReadUnalignedLE16(const unsigned char *&Data) { 00074 uint16_t V = ((uint16_t)Data[0]) | 00075 ((uint16_t)Data[1] << 8); 00076 Data += 2; 00077 return V; 00078 } 00079 00080 inline uint32_t ReadUnalignedLE32(const unsigned char *&Data) { 00081 uint32_t V = ((uint32_t)Data[0]) | 00082 ((uint32_t)Data[1] << 8) | 00083 ((uint32_t)Data[2] << 16) | 00084 ((uint32_t)Data[3] << 24); 00085 Data += 4; 00086 return V; 00087 } 00088 00089 inline uint64_t ReadUnalignedLE64(const unsigned char *&Data) { 00090 uint64_t V = ((uint64_t)Data[0]) | 00091 ((uint64_t)Data[1] << 8) | 00092 ((uint64_t)Data[2] << 16) | 00093 ((uint64_t)Data[3] << 24) | 00094 ((uint64_t)Data[4] << 32) | 00095 ((uint64_t)Data[5] << 40) | 00096 ((uint64_t)Data[6] << 48) | 00097 ((uint64_t)Data[7] << 56); 00098 Data += 8; 00099 return V; 00100 } 00101 00102 inline uint32_t ReadLE32(const unsigned char *&Data) { 00103 // Hosts that directly support little-endian 32-bit loads can just 00104 // use them. Big-endian hosts need a bswap. 00105 uint32_t V = *((uint32_t*)Data); 00106 if (llvm::sys::isBigEndianHost()) 00107 V = llvm::ByteSwap_32(V); 00108 Data += 4; 00109 return V; 00110 } 00111 00112 } // end namespace io 00113 00114 template<typename Info> 00115 class OnDiskChainedHashTableGenerator { 00116 unsigned NumBuckets; 00117 unsigned NumEntries; 00118 llvm::BumpPtrAllocator BA; 00119 00120 class Item { 00121 public: 00122 typename Info::key_type key; 00123 typename Info::data_type data; 00124 Item *next; 00125 const uint32_t hash; 00126 00127 Item(typename Info::key_type_ref k, typename Info::data_type_ref d, 00128 Info &InfoObj) 00129 : key(k), data(d), next(0), hash(InfoObj.ComputeHash(k)) {} 00130 }; 00131 00132 class Bucket { 00133 public: 00134 io::Offset off; 00135 Item* head; 00136 unsigned length; 00137 00138 Bucket() {} 00139 }; 00140 00141 Bucket* Buckets; 00142 00143 private: 00144 void insert(Bucket* b, size_t size, Item* E) { 00145 unsigned idx = E->hash & (size - 1); 00146 Bucket& B = b[idx]; 00147 E->next = B.head; 00148 ++B.length; 00149 B.head = E; 00150 } 00151 00152 void resize(size_t newsize) { 00153 Bucket* newBuckets = (Bucket*) std::calloc(newsize, sizeof(Bucket)); 00154 // Populate newBuckets with the old entries. 00155 for (unsigned i = 0; i < NumBuckets; ++i) 00156 for (Item* E = Buckets[i].head; E ; ) { 00157 Item* N = E->next; 00158 E->next = 0; 00159 insert(newBuckets, newsize, E); 00160 E = N; 00161 } 00162 00163 free(Buckets); 00164 NumBuckets = newsize; 00165 Buckets = newBuckets; 00166 } 00167 00168 public: 00169 00170 void insert(typename Info::key_type_ref key, 00171 typename Info::data_type_ref data) { 00172 Info InfoObj; 00173 insert(key, data, InfoObj); 00174 } 00175 00176 void insert(typename Info::key_type_ref key, 00177 typename Info::data_type_ref data, Info &InfoObj) { 00178 00179 ++NumEntries; 00180 if (4*NumEntries >= 3*NumBuckets) resize(NumBuckets*2); 00181 insert(Buckets, NumBuckets, new (BA.Allocate<Item>()) Item(key, data, 00182 InfoObj)); 00183 } 00184 00185 io::Offset Emit(raw_ostream &out) { 00186 Info InfoObj; 00187 return Emit(out, InfoObj); 00188 } 00189 00190 io::Offset Emit(raw_ostream &out, Info &InfoObj) { 00191 using namespace clang::io; 00192 00193 // Emit the payload of the table. 00194 for (unsigned i = 0; i < NumBuckets; ++i) { 00195 Bucket& B = Buckets[i]; 00196 if (!B.head) continue; 00197 00198 // Store the offset for the data of this bucket. 00199 B.off = out.tell(); 00200 assert(B.off && "Cannot write a bucket at offset 0. Please add padding."); 00201 00202 // Write out the number of items in the bucket. 00203 Emit16(out, B.length); 00204 assert(B.length != 0 && "Bucket has a head but zero length?"); 00205 00206 // Write out the entries in the bucket. 00207 for (Item *I = B.head; I ; I = I->next) { 00208 Emit32(out, I->hash); 00209 const std::pair<unsigned, unsigned>& Len = 00210 InfoObj.EmitKeyDataLength(out, I->key, I->data); 00211 InfoObj.EmitKey(out, I->key, Len.first); 00212 InfoObj.EmitData(out, I->key, I->data, Len.second); 00213 } 00214 } 00215 00216 // Emit the hashtable itself. 00217 Pad(out, 4); 00218 io::Offset TableOff = out.tell(); 00219 Emit32(out, NumBuckets); 00220 Emit32(out, NumEntries); 00221 for (unsigned i = 0; i < NumBuckets; ++i) Emit32(out, Buckets[i].off); 00222 00223 return TableOff; 00224 } 00225 00226 OnDiskChainedHashTableGenerator() { 00227 NumEntries = 0; 00228 NumBuckets = 64; 00229 // Note that we do not need to run the constructors of the individual 00230 // Bucket objects since 'calloc' returns bytes that are all 0. 00231 Buckets = (Bucket*) std::calloc(NumBuckets, sizeof(Bucket)); 00232 } 00233 00234 ~OnDiskChainedHashTableGenerator() { 00235 std::free(Buckets); 00236 } 00237 }; 00238 00239 template<typename Info> 00240 class OnDiskChainedHashTable { 00241 const unsigned NumBuckets; 00242 const unsigned NumEntries; 00243 const unsigned char* const Buckets; 00244 const unsigned char* const Base; 00245 Info InfoObj; 00246 00247 public: 00248 typedef typename Info::internal_key_type internal_key_type; 00249 typedef typename Info::external_key_type external_key_type; 00250 typedef typename Info::data_type data_type; 00251 00252 OnDiskChainedHashTable(unsigned numBuckets, unsigned numEntries, 00253 const unsigned char* buckets, 00254 const unsigned char* base, 00255 const Info &InfoObj = Info()) 00256 : NumBuckets(numBuckets), NumEntries(numEntries), 00257 Buckets(buckets), Base(base), InfoObj(InfoObj) { 00258 assert((reinterpret_cast<uintptr_t>(buckets) & 0x3) == 0 && 00259 "'buckets' must have a 4-byte alignment"); 00260 } 00261 00262 unsigned getNumBuckets() const { return NumBuckets; } 00263 unsigned getNumEntries() const { return NumEntries; } 00264 const unsigned char* getBase() const { return Base; } 00265 const unsigned char* getBuckets() const { return Buckets; } 00266 00267 bool isEmpty() const { return NumEntries == 0; } 00268 00269 class iterator { 00270 internal_key_type key; 00271 const unsigned char* const data; 00272 const unsigned len; 00273 Info *InfoObj; 00274 public: 00275 iterator() : data(0), len(0) {} 00276 iterator(const internal_key_type k, const unsigned char* d, unsigned l, 00277 Info *InfoObj) 00278 : key(k), data(d), len(l), InfoObj(InfoObj) {} 00279 00280 data_type operator*() const { return InfoObj->ReadData(key, data, len); } 00281 bool operator==(const iterator& X) const { return X.data == data; } 00282 bool operator!=(const iterator& X) const { return X.data != data; } 00283 }; 00284 00285 iterator find(const external_key_type& eKey, Info *InfoPtr = 0) { 00286 if (!InfoPtr) 00287 InfoPtr = &InfoObj; 00288 00289 using namespace io; 00290 const internal_key_type& iKey = InfoObj.GetInternalKey(eKey); 00291 unsigned key_hash = InfoObj.ComputeHash(iKey); 00292 00293 // Each bucket is just a 32-bit offset into the hash table file. 00294 unsigned idx = key_hash & (NumBuckets - 1); 00295 const unsigned char* Bucket = Buckets + sizeof(uint32_t)*idx; 00296 00297 unsigned offset = ReadLE32(Bucket); 00298 if (offset == 0) return iterator(); // Empty bucket. 00299 const unsigned char* Items = Base + offset; 00300 00301 // 'Items' starts with a 16-bit unsigned integer representing the 00302 // number of items in this bucket. 00303 unsigned len = ReadUnalignedLE16(Items); 00304 00305 for (unsigned i = 0; i < len; ++i) { 00306 // Read the hash. 00307 uint32_t item_hash = ReadUnalignedLE32(Items); 00308 00309 // Determine the length of the key and the data. 00310 const std::pair<unsigned, unsigned>& L = Info::ReadKeyDataLength(Items); 00311 unsigned item_len = L.first + L.second; 00312 00313 // Compare the hashes. If they are not the same, skip the entry entirely. 00314 if (item_hash != key_hash) { 00315 Items += item_len; 00316 continue; 00317 } 00318 00319 // Read the key. 00320 const internal_key_type& X = 00321 InfoPtr->ReadKey((const unsigned char* const) Items, L.first); 00322 00323 // If the key doesn't match just skip reading the value. 00324 if (!InfoPtr->EqualKey(X, iKey)) { 00325 Items += item_len; 00326 continue; 00327 } 00328 00329 // The key matches! 00330 return iterator(X, Items + L.first, L.second, InfoPtr); 00331 } 00332 00333 return iterator(); 00334 } 00335 00336 iterator end() const { return iterator(); } 00337 00338 /// \brief Iterates over all of the keys in the table. 00339 class key_iterator { 00340 const unsigned char* Ptr; 00341 unsigned NumItemsInBucketLeft; 00342 unsigned NumEntriesLeft; 00343 Info *InfoObj; 00344 public: 00345 typedef external_key_type value_type; 00346 00347 key_iterator(const unsigned char* const Ptr, unsigned NumEntries, 00348 Info *InfoObj) 00349 : Ptr(Ptr), NumItemsInBucketLeft(0), NumEntriesLeft(NumEntries), 00350 InfoObj(InfoObj) { } 00351 key_iterator() 00352 : Ptr(0), NumItemsInBucketLeft(0), NumEntriesLeft(0), InfoObj(0) { } 00353 00354 friend bool operator==(const key_iterator &X, const key_iterator &Y) { 00355 return X.NumEntriesLeft == Y.NumEntriesLeft; 00356 } 00357 friend bool operator!=(const key_iterator& X, const key_iterator &Y) { 00358 return X.NumEntriesLeft != Y.NumEntriesLeft; 00359 } 00360 00361 key_iterator& operator++() { // Preincrement 00362 if (!NumItemsInBucketLeft) { 00363 // 'Items' starts with a 16-bit unsigned integer representing the 00364 // number of items in this bucket. 00365 NumItemsInBucketLeft = io::ReadUnalignedLE16(Ptr); 00366 } 00367 Ptr += 4; // Skip the hash. 00368 // Determine the length of the key and the data. 00369 const std::pair<unsigned, unsigned>& L = Info::ReadKeyDataLength(Ptr); 00370 Ptr += L.first + L.second; 00371 assert(NumItemsInBucketLeft); 00372 --NumItemsInBucketLeft; 00373 assert(NumEntriesLeft); 00374 --NumEntriesLeft; 00375 return *this; 00376 } 00377 key_iterator operator++(int) { // Postincrement 00378 key_iterator tmp = *this; ++*this; return tmp; 00379 } 00380 00381 value_type operator*() const { 00382 const unsigned char* LocalPtr = Ptr; 00383 if (!NumItemsInBucketLeft) 00384 LocalPtr += 2; // number of items in bucket 00385 LocalPtr += 4; // Skip the hash. 00386 00387 // Determine the length of the key and the data. 00388 const std::pair<unsigned, unsigned>& L 00389 = Info::ReadKeyDataLength(LocalPtr); 00390 00391 // Read the key. 00392 const internal_key_type& Key = InfoObj->ReadKey(LocalPtr, L.first); 00393 return InfoObj->GetExternalKey(Key); 00394 } 00395 }; 00396 00397 key_iterator key_begin() { 00398 return key_iterator(Base + 4, getNumEntries(), &InfoObj); 00399 } 00400 key_iterator key_end() { return key_iterator(); } 00401 00402 /// \brief Iterates over all the entries in the table, returning the data. 00403 class data_iterator { 00404 const unsigned char* Ptr; 00405 unsigned NumItemsInBucketLeft; 00406 unsigned NumEntriesLeft; 00407 Info *InfoObj; 00408 public: 00409 typedef data_type value_type; 00410 00411 data_iterator(const unsigned char* const Ptr, unsigned NumEntries, 00412 Info *InfoObj) 00413 : Ptr(Ptr), NumItemsInBucketLeft(0), NumEntriesLeft(NumEntries), 00414 InfoObj(InfoObj) { } 00415 data_iterator() 00416 : Ptr(0), NumItemsInBucketLeft(0), NumEntriesLeft(0), InfoObj(0) { } 00417 00418 bool operator==(const data_iterator& X) const { 00419 return X.NumEntriesLeft == NumEntriesLeft; 00420 } 00421 bool operator!=(const data_iterator& X) const { 00422 return X.NumEntriesLeft != NumEntriesLeft; 00423 } 00424 00425 data_iterator& operator++() { // Preincrement 00426 if (!NumItemsInBucketLeft) { 00427 // 'Items' starts with a 16-bit unsigned integer representing the 00428 // number of items in this bucket. 00429 NumItemsInBucketLeft = io::ReadUnalignedLE16(Ptr); 00430 } 00431 Ptr += 4; // Skip the hash. 00432 // Determine the length of the key and the data. 00433 const std::pair<unsigned, unsigned>& L = Info::ReadKeyDataLength(Ptr); 00434 Ptr += L.first + L.second; 00435 assert(NumItemsInBucketLeft); 00436 --NumItemsInBucketLeft; 00437 assert(NumEntriesLeft); 00438 --NumEntriesLeft; 00439 return *this; 00440 } 00441 data_iterator operator++(int) { // Postincrement 00442 data_iterator tmp = *this; ++*this; return tmp; 00443 } 00444 00445 value_type operator*() const { 00446 const unsigned char* LocalPtr = Ptr; 00447 if (!NumItemsInBucketLeft) 00448 LocalPtr += 2; // number of items in bucket 00449 LocalPtr += 4; // Skip the hash. 00450 00451 // Determine the length of the key and the data. 00452 const std::pair<unsigned, unsigned>& L =Info::ReadKeyDataLength(LocalPtr); 00453 00454 // Read the key. 00455 const internal_key_type& Key = 00456 InfoObj->ReadKey(LocalPtr, L.first); 00457 return InfoObj->ReadData(Key, LocalPtr + L.first, L.second); 00458 } 00459 }; 00460 00461 data_iterator data_begin() { 00462 return data_iterator(Base + 4, getNumEntries(), &InfoObj); 00463 } 00464 data_iterator data_end() { return data_iterator(); } 00465 00466 Info &getInfoObj() { return InfoObj; } 00467 00468 static OnDiskChainedHashTable* Create(const unsigned char* buckets, 00469 const unsigned char* const base, 00470 const Info &InfoObj = Info()) { 00471 using namespace io; 00472 assert(buckets > base); 00473 assert((reinterpret_cast<uintptr_t>(buckets) & 0x3) == 0 && 00474 "buckets should be 4-byte aligned."); 00475 00476 unsigned numBuckets = ReadLE32(buckets); 00477 unsigned numEntries = ReadLE32(buckets); 00478 return new OnDiskChainedHashTable<Info>(numBuckets, numEntries, buckets, 00479 base, InfoObj); 00480 } 00481 }; 00482 00483 } // end namespace clang 00484 00485 #endif