clang API Documentation

OnDiskHashTable.h
Go to the documentation of this file.
00001 //===--- OnDiskHashTable.h - On-Disk Hash Table Implementation --*- C++ -*-===//
00002 //
00003 //                     The LLVM Compiler Infrastructure
00004 //
00005 // This file is distributed under the University of Illinois Open Source
00006 // License. See LICENSE.TXT for details.
00007 //
00008 //===----------------------------------------------------------------------===//
00009 //
00010 //  This file defines facilities for reading and writing on-disk hash
00011 //  tables.
00012 //
00013 //===----------------------------------------------------------------------===//
00014 #ifndef LLVM_CLANG_BASIC_ON_DISK_HASH_TABLE_H
00015 #define LLVM_CLANG_BASIC_ON_DISK_HASH_TABLE_H
00016 
00017 #include "llvm/Support/Allocator.h"
00018 #include "llvm/Support/DataTypes.h"
00019 #include "llvm/Support/MathExtras.h"
00020 #include "llvm/Support/raw_ostream.h"
00021 #include "llvm/Support/Host.h"
00022 #include <cassert>
00023 #include <cstdlib>
00024 
00025 namespace clang {
00026 
00027 namespace io {
00028 
00029 typedef uint32_t Offset;
00030 
00031 inline void Emit8(raw_ostream& Out, uint32_t V) {
00032   Out << (unsigned char)(V);
00033 }
00034 
00035 inline void Emit16(raw_ostream& Out, uint32_t V) {
00036   Out << (unsigned char)(V);
00037   Out << (unsigned char)(V >>  8);
00038   assert((V >> 16) == 0);
00039 }
00040 
00041 inline void Emit24(raw_ostream& Out, uint32_t V) {
00042   Out << (unsigned char)(V);
00043   Out << (unsigned char)(V >>  8);
00044   Out << (unsigned char)(V >> 16);
00045   assert((V >> 24) == 0);
00046 }
00047 
00048 inline void Emit32(raw_ostream& Out, uint32_t V) {
00049   Out << (unsigned char)(V);
00050   Out << (unsigned char)(V >>  8);
00051   Out << (unsigned char)(V >> 16);
00052   Out << (unsigned char)(V >> 24);
00053 }
00054 
00055 inline void Emit64(raw_ostream& Out, uint64_t V) {
00056   Out << (unsigned char)(V);
00057   Out << (unsigned char)(V >>  8);
00058   Out << (unsigned char)(V >> 16);
00059   Out << (unsigned char)(V >> 24);
00060   Out << (unsigned char)(V >> 32);
00061   Out << (unsigned char)(V >> 40);
00062   Out << (unsigned char)(V >> 48);
00063   Out << (unsigned char)(V >> 56);
00064 }
00065 
00066 inline void Pad(raw_ostream& Out, unsigned A) {
00067   Offset off = (Offset) Out.tell();
00068   uint32_t n = ((uintptr_t)(off+A-1) & ~(uintptr_t)(A-1)) - off;
00069   for (; n ; --n)
00070     Emit8(Out, 0);
00071 }
00072 
00073 inline uint16_t ReadUnalignedLE16(const unsigned char *&Data) {
00074   uint16_t V = ((uint16_t)Data[0]) |
00075                ((uint16_t)Data[1] <<  8);
00076   Data += 2;
00077   return V;
00078 }
00079 
00080 inline uint32_t ReadUnalignedLE32(const unsigned char *&Data) {
00081   uint32_t V = ((uint32_t)Data[0])  |
00082                ((uint32_t)Data[1] << 8)  |
00083                ((uint32_t)Data[2] << 16) |
00084                ((uint32_t)Data[3] << 24);
00085   Data += 4;
00086   return V;
00087 }
00088 
00089 inline uint64_t ReadUnalignedLE64(const unsigned char *&Data) {
00090   uint64_t V = ((uint64_t)Data[0])  |
00091     ((uint64_t)Data[1] << 8)  |
00092     ((uint64_t)Data[2] << 16) |
00093     ((uint64_t)Data[3] << 24) |
00094     ((uint64_t)Data[4] << 32) |
00095     ((uint64_t)Data[5] << 40) |
00096     ((uint64_t)Data[6] << 48) |
00097     ((uint64_t)Data[7] << 56);
00098   Data += 8;
00099   return V;
00100 }
00101 
00102 inline uint32_t ReadLE32(const unsigned char *&Data) {
00103   // Hosts that directly support little-endian 32-bit loads can just
00104   // use them.  Big-endian hosts need a bswap.
00105   uint32_t V = *((uint32_t*)Data);
00106   if (llvm::sys::isBigEndianHost())
00107     V = llvm::ByteSwap_32(V);
00108   Data += 4;
00109   return V;
00110 }
00111 
00112 } // end namespace io
00113 
00114 template<typename Info>
00115 class OnDiskChainedHashTableGenerator {
00116   unsigned NumBuckets;
00117   unsigned NumEntries;
00118   llvm::BumpPtrAllocator BA;
00119 
00120   class Item {
00121   public:
00122     typename Info::key_type key;
00123     typename Info::data_type data;
00124     Item *next;
00125     const uint32_t hash;
00126 
00127     Item(typename Info::key_type_ref k, typename Info::data_type_ref d,
00128          Info &InfoObj)
00129     : key(k), data(d), next(0), hash(InfoObj.ComputeHash(k)) {}
00130   };
00131 
00132   class Bucket {
00133   public:
00134     io::Offset off;
00135     Item* head;
00136     unsigned length;
00137 
00138     Bucket() {}
00139   };
00140 
00141   Bucket* Buckets;
00142 
00143 private:
00144   void insert(Bucket* b, size_t size, Item* E) {
00145     unsigned idx = E->hash & (size - 1);
00146     Bucket& B = b[idx];
00147     E->next = B.head;
00148     ++B.length;
00149     B.head = E;
00150   }
00151 
00152   void resize(size_t newsize) {
00153     Bucket* newBuckets = (Bucket*) std::calloc(newsize, sizeof(Bucket));
00154     // Populate newBuckets with the old entries.
00155     for (unsigned i = 0; i < NumBuckets; ++i)
00156       for (Item* E = Buckets[i].head; E ; ) {
00157         Item* N = E->next;
00158         E->next = 0;
00159         insert(newBuckets, newsize, E);
00160         E = N;
00161       }
00162 
00163     free(Buckets);
00164     NumBuckets = newsize;
00165     Buckets = newBuckets;
00166   }
00167 
00168 public:
00169 
00170   void insert(typename Info::key_type_ref key,
00171               typename Info::data_type_ref data) {
00172     Info InfoObj;
00173     insert(key, data, InfoObj);
00174   }
00175 
00176   void insert(typename Info::key_type_ref key,
00177               typename Info::data_type_ref data, Info &InfoObj) {
00178 
00179     ++NumEntries;
00180     if (4*NumEntries >= 3*NumBuckets) resize(NumBuckets*2);
00181     insert(Buckets, NumBuckets, new (BA.Allocate<Item>()) Item(key, data,
00182                                                                InfoObj));
00183   }
00184 
00185   io::Offset Emit(raw_ostream &out) {
00186     Info InfoObj;
00187     return Emit(out, InfoObj);
00188   }
00189 
00190   io::Offset Emit(raw_ostream &out, Info &InfoObj) {
00191     using namespace clang::io;
00192 
00193     // Emit the payload of the table.
00194     for (unsigned i = 0; i < NumBuckets; ++i) {
00195       Bucket& B = Buckets[i];
00196       if (!B.head) continue;
00197 
00198       // Store the offset for the data of this bucket.
00199       B.off = out.tell();
00200       assert(B.off && "Cannot write a bucket at offset 0. Please add padding.");
00201 
00202       // Write out the number of items in the bucket.
00203       Emit16(out, B.length);
00204       assert(B.length != 0  && "Bucket has a head but zero length?");
00205 
00206       // Write out the entries in the bucket.
00207       for (Item *I = B.head; I ; I = I->next) {
00208         Emit32(out, I->hash);
00209         const std::pair<unsigned, unsigned>& Len =
00210           InfoObj.EmitKeyDataLength(out, I->key, I->data);
00211         InfoObj.EmitKey(out, I->key, Len.first);
00212         InfoObj.EmitData(out, I->key, I->data, Len.second);
00213       }
00214     }
00215 
00216     // Emit the hashtable itself.
00217     Pad(out, 4);
00218     io::Offset TableOff = out.tell();
00219     Emit32(out, NumBuckets);
00220     Emit32(out, NumEntries);
00221     for (unsigned i = 0; i < NumBuckets; ++i) Emit32(out, Buckets[i].off);
00222 
00223     return TableOff;
00224   }
00225 
00226   OnDiskChainedHashTableGenerator() {
00227     NumEntries = 0;
00228     NumBuckets = 64;
00229     // Note that we do not need to run the constructors of the individual
00230     // Bucket objects since 'calloc' returns bytes that are all 0.
00231     Buckets = (Bucket*) std::calloc(NumBuckets, sizeof(Bucket));
00232   }
00233 
00234   ~OnDiskChainedHashTableGenerator() {
00235     std::free(Buckets);
00236   }
00237 };
00238 
00239 template<typename Info>
00240 class OnDiskChainedHashTable {
00241   const unsigned NumBuckets;
00242   const unsigned NumEntries;
00243   const unsigned char* const Buckets;
00244   const unsigned char* const Base;
00245   Info InfoObj;
00246 
00247 public:
00248   typedef typename Info::internal_key_type internal_key_type;
00249   typedef typename Info::external_key_type external_key_type;
00250   typedef typename Info::data_type         data_type;
00251 
00252   OnDiskChainedHashTable(unsigned numBuckets, unsigned numEntries,
00253                          const unsigned char* buckets,
00254                          const unsigned char* base,
00255                          const Info &InfoObj = Info())
00256     : NumBuckets(numBuckets), NumEntries(numEntries),
00257       Buckets(buckets), Base(base), InfoObj(InfoObj) {
00258         assert((reinterpret_cast<uintptr_t>(buckets) & 0x3) == 0 &&
00259                "'buckets' must have a 4-byte alignment");
00260       }
00261 
00262   unsigned getNumBuckets() const { return NumBuckets; }
00263   unsigned getNumEntries() const { return NumEntries; }
00264   const unsigned char* getBase() const { return Base; }
00265   const unsigned char* getBuckets() const { return Buckets; }
00266 
00267   bool isEmpty() const { return NumEntries == 0; }
00268 
00269   class iterator {
00270     internal_key_type key;
00271     const unsigned char* const data;
00272     const unsigned len;
00273     Info *InfoObj;
00274   public:
00275     iterator() : data(0), len(0) {}
00276     iterator(const internal_key_type k, const unsigned char* d, unsigned l,
00277              Info *InfoObj)
00278       : key(k), data(d), len(l), InfoObj(InfoObj) {}
00279 
00280     data_type operator*() const { return InfoObj->ReadData(key, data, len); }
00281     bool operator==(const iterator& X) const { return X.data == data; }
00282     bool operator!=(const iterator& X) const { return X.data != data; }
00283   };
00284 
00285   iterator find(const external_key_type& eKey, Info *InfoPtr = 0) {
00286     if (!InfoPtr)
00287       InfoPtr = &InfoObj;
00288 
00289     using namespace io;
00290     const internal_key_type& iKey = InfoObj.GetInternalKey(eKey);
00291     unsigned key_hash = InfoObj.ComputeHash(iKey);
00292 
00293     // Each bucket is just a 32-bit offset into the hash table file.
00294     unsigned idx = key_hash & (NumBuckets - 1);
00295     const unsigned char* Bucket = Buckets + sizeof(uint32_t)*idx;
00296 
00297     unsigned offset = ReadLE32(Bucket);
00298     if (offset == 0) return iterator(); // Empty bucket.
00299     const unsigned char* Items = Base + offset;
00300 
00301     // 'Items' starts with a 16-bit unsigned integer representing the
00302     // number of items in this bucket.
00303     unsigned len = ReadUnalignedLE16(Items);
00304 
00305     for (unsigned i = 0; i < len; ++i) {
00306       // Read the hash.
00307       uint32_t item_hash = ReadUnalignedLE32(Items);
00308 
00309       // Determine the length of the key and the data.
00310       const std::pair<unsigned, unsigned>& L = Info::ReadKeyDataLength(Items);
00311       unsigned item_len = L.first + L.second;
00312 
00313       // Compare the hashes.  If they are not the same, skip the entry entirely.
00314       if (item_hash != key_hash) {
00315         Items += item_len;
00316         continue;
00317       }
00318 
00319       // Read the key.
00320       const internal_key_type& X =
00321         InfoPtr->ReadKey((const unsigned char* const) Items, L.first);
00322 
00323       // If the key doesn't match just skip reading the value.
00324       if (!InfoPtr->EqualKey(X, iKey)) {
00325         Items += item_len;
00326         continue;
00327       }
00328 
00329       // The key matches!
00330       return iterator(X, Items + L.first, L.second, InfoPtr);
00331     }
00332 
00333     return iterator();
00334   }
00335 
00336   iterator end() const { return iterator(); }
00337 
00338   /// \brief Iterates over all of the keys in the table.
00339   class key_iterator {
00340     const unsigned char* Ptr;
00341     unsigned NumItemsInBucketLeft;
00342     unsigned NumEntriesLeft;
00343     Info *InfoObj;
00344   public:
00345     typedef external_key_type value_type;
00346 
00347     key_iterator(const unsigned char* const Ptr, unsigned NumEntries,
00348                   Info *InfoObj)
00349       : Ptr(Ptr), NumItemsInBucketLeft(0), NumEntriesLeft(NumEntries),
00350         InfoObj(InfoObj) { }
00351     key_iterator()
00352       : Ptr(0), NumItemsInBucketLeft(0), NumEntriesLeft(0), InfoObj(0) { }
00353 
00354     friend bool operator==(const key_iterator &X, const key_iterator &Y) {
00355       return X.NumEntriesLeft == Y.NumEntriesLeft;
00356     }
00357     friend bool operator!=(const key_iterator& X, const key_iterator &Y) {
00358       return X.NumEntriesLeft != Y.NumEntriesLeft;
00359     }
00360 
00361     key_iterator& operator++() {  // Preincrement
00362       if (!NumItemsInBucketLeft) {
00363         // 'Items' starts with a 16-bit unsigned integer representing the
00364         // number of items in this bucket.
00365         NumItemsInBucketLeft = io::ReadUnalignedLE16(Ptr);
00366       }
00367       Ptr += 4; // Skip the hash.
00368       // Determine the length of the key and the data.
00369       const std::pair<unsigned, unsigned>& L = Info::ReadKeyDataLength(Ptr);
00370       Ptr += L.first + L.second;
00371       assert(NumItemsInBucketLeft);
00372       --NumItemsInBucketLeft;
00373       assert(NumEntriesLeft);
00374       --NumEntriesLeft;
00375       return *this;
00376     }
00377     key_iterator operator++(int) {  // Postincrement
00378       key_iterator tmp = *this; ++*this; return tmp;
00379     }
00380 
00381     value_type operator*() const {
00382       const unsigned char* LocalPtr = Ptr;
00383       if (!NumItemsInBucketLeft)
00384         LocalPtr += 2; // number of items in bucket
00385       LocalPtr += 4; // Skip the hash.
00386 
00387       // Determine the length of the key and the data.
00388       const std::pair<unsigned, unsigned>& L
00389         = Info::ReadKeyDataLength(LocalPtr);
00390 
00391       // Read the key.
00392       const internal_key_type& Key = InfoObj->ReadKey(LocalPtr, L.first);
00393       return InfoObj->GetExternalKey(Key);
00394     }
00395   };
00396 
00397   key_iterator key_begin() {
00398     return key_iterator(Base + 4, getNumEntries(), &InfoObj);
00399   }
00400   key_iterator key_end() { return key_iterator(); }
00401 
00402   /// \brief Iterates over all the entries in the table, returning the data.
00403   class data_iterator {
00404     const unsigned char* Ptr;
00405     unsigned NumItemsInBucketLeft;
00406     unsigned NumEntriesLeft;
00407     Info *InfoObj;
00408   public:
00409     typedef data_type value_type;
00410 
00411     data_iterator(const unsigned char* const Ptr, unsigned NumEntries,
00412                   Info *InfoObj)
00413       : Ptr(Ptr), NumItemsInBucketLeft(0), NumEntriesLeft(NumEntries),
00414         InfoObj(InfoObj) { }
00415     data_iterator()
00416       : Ptr(0), NumItemsInBucketLeft(0), NumEntriesLeft(0), InfoObj(0) { }
00417 
00418     bool operator==(const data_iterator& X) const {
00419       return X.NumEntriesLeft == NumEntriesLeft;
00420     }
00421     bool operator!=(const data_iterator& X) const {
00422       return X.NumEntriesLeft != NumEntriesLeft;
00423     }
00424 
00425     data_iterator& operator++() {  // Preincrement
00426       if (!NumItemsInBucketLeft) {
00427         // 'Items' starts with a 16-bit unsigned integer representing the
00428         // number of items in this bucket.
00429         NumItemsInBucketLeft = io::ReadUnalignedLE16(Ptr);
00430       }
00431       Ptr += 4; // Skip the hash.
00432       // Determine the length of the key and the data.
00433       const std::pair<unsigned, unsigned>& L = Info::ReadKeyDataLength(Ptr);
00434       Ptr += L.first + L.second;
00435       assert(NumItemsInBucketLeft);
00436       --NumItemsInBucketLeft;
00437       assert(NumEntriesLeft);
00438       --NumEntriesLeft;
00439       return *this;
00440     }
00441     data_iterator operator++(int) {  // Postincrement
00442       data_iterator tmp = *this; ++*this; return tmp;
00443     }
00444 
00445     value_type operator*() const {
00446       const unsigned char* LocalPtr = Ptr;
00447       if (!NumItemsInBucketLeft)
00448         LocalPtr += 2; // number of items in bucket
00449       LocalPtr += 4; // Skip the hash.
00450 
00451       // Determine the length of the key and the data.
00452       const std::pair<unsigned, unsigned>& L =Info::ReadKeyDataLength(LocalPtr);
00453 
00454       // Read the key.
00455       const internal_key_type& Key =
00456         InfoObj->ReadKey(LocalPtr, L.first);
00457       return InfoObj->ReadData(Key, LocalPtr + L.first, L.second);
00458     }
00459   };
00460 
00461   data_iterator data_begin() {
00462     return data_iterator(Base + 4, getNumEntries(), &InfoObj);
00463   }
00464   data_iterator data_end() { return data_iterator(); }
00465 
00466   Info &getInfoObj() { return InfoObj; }
00467 
00468   static OnDiskChainedHashTable* Create(const unsigned char* buckets,
00469                                         const unsigned char* const base,
00470                                         const Info &InfoObj = Info()) {
00471     using namespace io;
00472     assert(buckets > base);
00473     assert((reinterpret_cast<uintptr_t>(buckets) & 0x3) == 0 &&
00474            "buckets should be 4-byte aligned.");
00475 
00476     unsigned numBuckets = ReadLE32(buckets);
00477     unsigned numEntries = ReadLE32(buckets);
00478     return new OnDiskChainedHashTable<Info>(numBuckets, numEntries, buckets,
00479                                             base, InfoObj);
00480   }
00481 };
00482 
00483 } // end namespace clang
00484 
00485 #endif