Skip to content

Commit

Permalink
bloom filter added, Page size exception added.
Browse files Browse the repository at this point in the history
  • Loading branch information
kkli08 committed Oct 30, 2024
1 parent 329cc7e commit 67a7677
Show file tree
Hide file tree
Showing 14 changed files with 815 additions and 190 deletions.
6 changes: 6 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,10 @@ add_library(veloxdb_lib STATIC
VeloxDB/VeloxDB.h
VeloxDB/VeloxDB.tpp

# LSMTree
LSMTree/LSMTree.cpp
LSMTree/LSMTree.h

# Generated Protobuf source files
${PROTO_SRCS}
)
Expand All @@ -108,6 +112,7 @@ target_include_directories(veloxdb_lib PUBLIC
${PROJECT_SOURCE_DIR}/Tree/RedBlackTree
${PROJECT_SOURCE_DIR}/Tree/TreeNode
${PROJECT_SOURCE_DIR}/VeloxDB
${PROJECT_SOURCE_DIR}/LSMTree
${CMAKE_CURRENT_BINARY_DIR}
${PROJECT_SOURCE_DIR}
${GENERATED_PROTO_DIR}
Expand Down Expand Up @@ -144,6 +149,7 @@ add_executable(runTests
tests/SST_File_Manager_unittest.cpp
tests/VeloxDB_api_unittest.cpp
tests/veloxdb_GET_benchmark.cpp
tests/bloom_filter_unittests.cpp
)

# Include directories for runTests
Expand Down
9 changes: 9 additions & 0 deletions LSMTree/LSMTree.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
//
// Created by Damian Li on 2024-10-10.
//

//
// LSMTree.cpp
//

#include "LSMTree.h"
17 changes: 17 additions & 0 deletions LSMTree/LSMTree.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
//
// Created by Damian Li on 2024-10-10.
//

//
// LSMTree.h
//

#ifndef LSM_TREE_H
#define LSM_TREE_H

// Placeholder type for the LSM tree; it currently declares no members.
class LSMTree {};

#endif // LSM_TREE_H

7 changes: 6 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -169,7 +169,12 @@ db->Close();
```

### Benchmark

> _Hardware Resources_
```text
System: macOS Sonoma Version 14.3.1
Chip: Apple M3 Max
Memory: 48 GB
```
#### `VeloxDB::Put` throughput with different `Memtable` size
```text
B Tree Degree = 3
Expand Down
135 changes: 135 additions & 0 deletions Storage/BloomFilter/BloomFilter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,138 @@
//

#include "BloomFilter.h"
#include <functional>
#include <cstring>
#include <stdexcept>

/// Builds an empty filter with `m` bits sized for `n` expected elements.
///
/// The number of hash functions is k = round((m / n) * ln 2), clamped to a
/// minimum of 1.
///
/// @param m Number of bits in the filter; must be non-zero.
/// @param n Expected number of inserted elements; must be non-zero.
/// @throws std::invalid_argument if `m` or `n` is zero.
BloomFilter::BloomFilter(size_t m, size_t n)
    : numBits(m), numHashFuncs(1), expectedElements(n) {
    if (m == 0) {
        throw std::invalid_argument("Number of bits (m) must be greater than 0");
    }
    if (n == 0) {
        throw std::invalid_argument("Expected number of elements (n) must be greater than 0");
    }

    // One byte holds eight bits; round up so the last partial byte exists.
    bitArray.assign((m + 7) / 8, 0);

    const double bitsPerElement = static_cast<double>(m) / n;
    const size_t roundedK = static_cast<size_t>(std::round(bitsPerElement * std::log(2.0)));

    // numHashFuncs already defaults to 1; only overwrite with a usable value.
    if (roundedK != 0) {
        numHashFuncs = roundedK;
    }
}

BloomFilter::BloomFilter() : numBits(0), numHashFuncs(0), expectedElements(0) {
// Default constructor for deserialization
}

/// Records the key of `kv` in the filter by setting every bit selected by
/// hash().
void BloomFilter::add(const KeyValueWrapper& kv) {
    for (const size_t rawIndex : hash(kv)) {
        const size_t bit = rawIndex % numBits;
        const size_t byteOffset = bit / 8;
        const int mask = 1 << (bit % 8);
        bitArray[byteOffset] |= mask;
    }
}

/// Tests whether the key of `kv` may have been added.
///
/// @return false as soon as any bit selected by hash() is clear; true when
///         all selected bits are set.
bool BloomFilter::possiblyContains(const KeyValueWrapper& kv) const {
    for (const size_t rawIndex : hash(kv)) {
        const size_t bit = rawIndex % numBits;
        const size_t byteOffset = bit / 8;
        const int mask = 1 << (bit % 8);
        const bool bitIsSet = (bitArray[byteOffset] & mask) != 0;
        if (!bitIsSet) {
            return false;
        }
    }
    return true;
}

/// Encodes the filter as raw bytes.
///
/// Layout: numBits, numHashFuncs, expectedElements (each copied verbatim in
/// the host's in-memory representation), followed by the bit array bytes.
///
/// @return The encoded byte buffer.
std::vector<char> BloomFilter::serialize() const {
    std::vector<char> buffer(sizeof(numBits) + sizeof(numHashFuncs) + sizeof(expectedElements));

    // Write each header field at the cursor and advance past it.
    char* cursor = buffer.data();
    const auto writeField = [&cursor](const size_t& field) {
        std::memcpy(cursor, &field, sizeof(field));
        cursor += sizeof(field);
    };
    writeField(numBits);
    writeField(numHashFuncs);
    writeField(expectedElements);

    // Payload: the bit array follows the fixed-size header.
    buffer.insert(buffer.end(), bitArray.begin(), bitArray.end());
    return buffer;
}

/// Restores the filter from bytes produced by serialize().
///
/// The header (numBits, numHashFuncs, expectedElements) is parsed into
/// temporaries and validated before any member is modified, so a rejected
/// buffer leaves the filter unchanged.
///
/// @param data Header followed by the bit array bytes.
/// @throws std::runtime_error if `data` is shorter than the header, if the
///         header declares hash functions but zero bits, or if fewer bit
///         array bytes are present than numBits requires.
void BloomFilter::deserialize(const std::vector<char>& data) {
    constexpr size_t kHeaderSize =
        sizeof(numBits) + sizeof(numHashFuncs) + sizeof(expectedElements);
    if (data.size() < kHeaderSize) {
        throw std::runtime_error("Invalid Bloom filter data");
    }

    size_t parsedNumBits = 0;
    size_t parsedNumHashFuncs = 0;
    size_t parsedExpectedElements = 0;

    size_t offset = 0;
    std::memcpy(&parsedNumBits, data.data() + offset, sizeof(parsedNumBits));
    offset += sizeof(parsedNumBits);
    std::memcpy(&parsedNumHashFuncs, data.data() + offset, sizeof(parsedNumHashFuncs));
    offset += sizeof(parsedNumHashFuncs);
    std::memcpy(&parsedExpectedElements, data.data() + offset, sizeof(parsedExpectedElements));
    offset += sizeof(parsedExpectedElements);

    // hash() reduces indices modulo numBits; zero bits with at least one hash
    // function would be a modulo-by-zero on the first lookup.
    if (parsedNumBits == 0 && parsedNumHashFuncs != 0) {
        throw std::runtime_error("Invalid Bloom filter data: zero bits with non-zero hash count");
    }

    // add()/possiblyContains() index bytes up to (numBits - 1) / 8, so the
    // payload must hold at least ceil(numBits / 8) bytes. Computed without
    // `numBits + 7` to avoid wrap-around on corrupted headers.
    const size_t requiredBytes = parsedNumBits / 8 + (parsedNumBits % 8 != 0 ? 1 : 0);
    if (data.size() - offset < requiredBytes) {
        throw std::runtime_error("Invalid Bloom filter data: bit array is truncated");
    }

    // Build the new bit array first so an allocation failure cannot leave the
    // object half-updated, then commit everything.
    std::vector<uint8_t> parsedBitArray(data.begin() + offset, data.end());

    numBits = parsedNumBits;
    numHashFuncs = parsedNumHashFuncs;
    expectedElements = parsedExpectedElements;
    bitArray.swap(parsedBitArray);
}

/// Computes the numHashFuncs bit positions for the key of `kv`.
///
/// Only the key participates; the value is ignored. The key is rendered as a
/// string, hashed once, and the positions are derived by double hashing:
/// position_i = (primary + i * step) % numBits.
///
/// @return One position per hash function, each already reduced modulo numBits.
std::vector<size_t> BloomFilter::hash(const KeyValueWrapper& kv) const {
    const auto& entry = kv.kv;

    // Textual form of whichever key field is populated; stays empty when
    // none of the known key fields is set.
    std::string keyText;
    if (entry.has_int_key()) {
        keyText = std::to_string(entry.int_key());
    } else if (entry.has_long_key()) {
        keyText = std::to_string(entry.long_key());
    } else if (entry.has_double_key()) {
        keyText = std::to_string(entry.double_key());
    } else if (entry.has_string_key()) {
        keyText = entry.string_key();
    } else if (entry.has_char_key()) {
        keyText = entry.char_key();
    } else {
        keyText = "";
    }

    const size_t primary = std::hash<std::string>{}(keyText);

    // NOTE(review): std::hash<size_t> may be the identity on some standard
    // libraries, in which case step == primary — confirm whether stronger
    // mixing is wanted before changing it (stored filters depend on it).
    size_t step = std::hash<size_t>{}(primary);
    if (step == 0) {
        step = 0x27d4eb2d; // a zero step would map every round to the same bit
    }

    std::vector<size_t> positions;
    positions.reserve(numHashFuncs);
    for (size_t round = 0; round < numHashFuncs; ++round) {
        positions.push_back((primary + round * step) % numBits);
    }
    return positions;
}

/// Number of bytes serialize() produces for the current state: the three
/// header fields plus one byte per bit-array element.
size_t BloomFilter::getSerializedSize() const {
    const size_t headerBytes =
        sizeof(numBits) + sizeof(numHashFuncs) + sizeof(expectedElements);
    return headerBytes + bitArray.size();
}

49 changes: 46 additions & 3 deletions Storage/BloomFilter/BloomFilter.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,58 @@
// Created by Damian Li on 2024-10-08.
//

#ifndef BLOOMFILTER_H
#define BLOOMFILTER_H
//
// BloomFilter.h
//

#ifndef BLOOM_FILTER_H
#define BLOOM_FILTER_H

#include "KeyValue.h"
#include <vector>
#include <cstdint>
#include <string>
#include <cmath>
#include <stdexcept>

// Bloom filter over the keys of KeyValueWrapper entries.
//
// Membership answers may be false positives but a clear bit always means the
// key was never added to this filter. The filter can be converted to and from
// a flat byte buffer via serialize() / deserialize().
class BloomFilter {
public:
// Builds an empty filter with m bits, sized for n expected elements.
// The number of hash functions is round((m / n) * ln 2), with a minimum of 1.
// Throws std::invalid_argument when m or n is 0.
BloomFilter(size_t m, size_t n);

// Creates a zero-sized filter (0 bits, 0 hash functions); meant to be filled
// in afterwards by deserialize().
BloomFilter();

// Adds the key of kv to the filter (the value is not hashed).
void add(const KeyValueWrapper& kv);

// Returns false if the key of kv was definitely not added, true if it may
// have been added.
bool possiblyContains(const KeyValueWrapper& kv) const;

// serialize() returns the three size fields followed by the bit array bytes.
// deserialize() restores the filter from such a buffer and throws
// std::runtime_error when the buffer is shorter than the header.
std::vector<char> serialize() const;
void deserialize(const std::vector<char>& data);

// Getters for testing and internal use
size_t getNumBits() const { return numBits; }
size_t getNumHashFuncs() const { return numHashFuncs; }

// Size in bytes of the buffer serialize() would return for the current state.
size_t getSerializedSize() const;

private:
size_t numBits; // m - number of bits in the filter
size_t numHashFuncs; // k - number of hash functions
size_t expectedElements; // n - expected number of elements

std::vector<uint8_t> bitArray; // Bit array representing the filter, 8 bits per element

// Returns numHashFuncs bit positions derived from the key of kv only;
// each position is already reduced modulo numBits.
std::vector<size_t> hash(const KeyValueWrapper& kv) const;
};

#endif // BLOOM_FILTER_H




#endif //BLOOMFILTER_H
Loading

0 comments on commit 67a7677

Please sign in to comment.