Loading include/paritypp/server.h +34 −2 Original line number Diff line number Diff line Loading @@ -69,6 +69,13 @@ private: // Append-only file block store // All blocks stored in a single binary file with in-memory index. // Deleted blocks are tombstoned; vacuum rewrites the file to reclaim space. // // Write-hole protection: // - Per-record CRC32 checksum to detect corruption and truncated writes // - Write-ahead journal (blocks.wal) logs intent before data modification; // on startup, incomplete journal entries are replayed or rolled back // - fdatasync after every flush to ensure durability // - Crash-safe vacuum: fsync tmp file + directory after rename class file_block_store : public block_store { public: explicit file_block_store(const std::string& base_dir, size_t write_buf_threshold = DEFAULT_WRITE_BUF_THRESHOLD); Loading @@ -94,8 +101,8 @@ public: private: static constexpr uint8_t RECORD_LIVE = 0x00; static constexpr uint8_t RECORD_DELETED = 0xFF; // Record header: [1B flags][8B group_id][4B block_index][4B data_len] static constexpr size_t RECORD_HEADER_SIZE = 17; // Record header: [1B flags][8B group_id][4B block_index][4B data_len][4B crc32] static constexpr size_t RECORD_HEADER_SIZE = 21; // Write buffer threshold: flush when buffer exceeds this size (0 = immediate flush) static constexpr size_t DEFAULT_WRITE_BUF_THRESHOLD = 256 * 1024; // 256 KB Loading @@ -105,7 +112,9 @@ private: bool in_write_buf; // true if data is still in write_buf_ (not yet flushed) }; std::string base_dir_; std::string data_path_; std::string wal_path_; int fd_ = -1; std::mutex mutex_; // group_id -> (block_index -> entry) Loading @@ -120,10 +129,33 @@ private: off_t write_buf_base_ = 0; // file offset where write_buf_ starts size_t write_buf_threshold_ = DEFAULT_WRITE_BUF_THRESHOLD; // CRC32 computation (IEEE polynomial) static uint32_t crc32(const uint8_t* data, size_t len); // WAL journal operations // WAL record: [1B op][8B group_id][4B block_index][4B data_len][4B crc32] // op=0x01: STORE (intent to append) // op=0x02: TOMBSTONE (intent to mark deleted) // op=0x03: COMMIT (previous ops are durable) static constexpr uint8_t WAL_OP_STORE = 0x01; static constexpr uint8_t WAL_OP_TOMBSTONE = 0x02; static constexpr uint8_t WAL_OP_COMMIT = 0x03; static constexpr size_t WAL_RECORD_SIZE = 21; // 1+8+4+4+4 int wal_fd_ = -1; void wal_open(); void wal_close(); void wal_log(uint8_t op, uint64_t group_id, uint32_t block_index, uint32_t data_len, uint32_t crc); void wal_commit(); // write COMMIT marker + fdatasync void wal_clear(); // truncate WAL after successful data flush void wal_recover(); // replay/rollback incomplete WAL entries on startup bool scan_and_build_index(); void maybe_auto_vacuum(); bool vacuum_impl(); // internal vacuum, caller must hold mutex_ void flush_impl(); // flush write buffer, caller must hold mutex_ void fsync_dir(); // fsync the base directory (for rename durability) }; // QUIC-based parity storage server Loading src/server.cpp +252 −9 Original line number Diff line number Diff line Loading @@ -131,20 +131,200 @@ std::vector<uint64_t> memory_block_store::list_groups() { return result; } // --- file_block_store (append-only binary file) --- // --- file_block_store (append-only binary file with write-hole protection) --- // CRC32 (IEEE 802.3 polynomial 0xEDB88320, reflected) uint32_t file_block_store::crc32(const uint8_t* data, size_t len) { uint32_t crc = 0xFFFFFFFF; for (size_t i = 0; i < len; ++i) { crc ^= data[i]; for (int j = 0; j < 8; ++j) crc = (crc >> 1) ^ (0xEDB88320 & (-(crc & 1))); } return crc ^ 0xFFFFFFFF; } // --- WAL journal --- void file_block_store::wal_open() { wal_fd_ = pp_open(wal_path_.c_str(), O_RDWR | O_CREAT, 0644); } void file_block_store::wal_close() { if (wal_fd_ >= 0) { pp_close(wal_fd_); wal_fd_ = -1; } } void file_block_store::wal_log(uint8_t op, uint64_t group_id, uint32_t block_index, uint32_t data_len, uint32_t crc) { if (wal_fd_ < 0) return; uint8_t rec[WAL_RECORD_SIZE]; rec[0] = op; std::memcpy(&rec[1], &group_id, 8); std::memcpy(&rec[9], &block_index, 4); std::memcpy(&rec[13], &data_len, 4); std::memcpy(&rec[17], &crc, 4); off_t end = pp_lseek(wal_fd_, 0, SEEK_END); (void)pp_pwrite(wal_fd_, rec, WAL_RECORD_SIZE, end); } void file_block_store::wal_commit() { if (wal_fd_ < 0) return; uint8_t rec[WAL_RECORD_SIZE] = {}; rec[0] = WAL_OP_COMMIT; off_t end = pp_lseek(wal_fd_, 0, SEEK_END); (void)pp_pwrite(wal_fd_, rec, WAL_RECORD_SIZE, end); #ifndef _WIN32 ::fdatasync(wal_fd_); #endif } void file_block_store::wal_clear() { if (wal_fd_ < 0) return; #ifdef _WIN32 _chsize(wal_fd_, 0); #else if (::ftruncate(wal_fd_, 0) != 0) { /* ignore */ } ::fdatasync(wal_fd_); #endif } void file_block_store::wal_recover() { // Read all WAL entries. If the last entry is not a COMMIT, the flush // was interrupted — truncate the data file to its pre-write state and // rebuild the index. if (wal_fd_ < 0) return; off_t wal_end = pp_lseek(wal_fd_, 0, SEEK_END); if (wal_end <= 0) return; // empty WAL, nothing to recover size_t num_entries = static_cast<size_t>(wal_end) / WAL_RECORD_SIZE; if (num_entries == 0) return; // Read last entry uint8_t last[WAL_RECORD_SIZE]; ssize_t n = pp_pread(wal_fd_, last, WAL_RECORD_SIZE, static_cast<off_t>((num_entries - 1) * WAL_RECORD_SIZE)); if (n != static_cast<ssize_t>(WAL_RECORD_SIZE)) { // Corrupt WAL — rebuild index from data file (already done by scan_and_build_index) wal_clear(); return; } if (last[0] == WAL_OP_COMMIT) { // Last operation was committed — data file is consistent wal_clear(); return; } // Incomplete write — find the file size before the uncommitted ops. // Walk entries to find the last COMMIT, then roll back everything after. off_t safe_size = -1; for (size_t i = 0; i < num_entries; ++i) { uint8_t rec[WAL_RECORD_SIZE]; ssize_t r = pp_pread(wal_fd_, rec, WAL_RECORD_SIZE, static_cast<off_t>(i * WAL_RECORD_SIZE)); if (r != static_cast<ssize_t>(WAL_RECORD_SIZE)) break; if (rec[0] == WAL_OP_COMMIT) { // Everything up to here is safe — next STORE tells us the file offset // Continue scanning to find the end of committed data safe_size = -1; // reset; will be set by next STORE if any } else if (rec[0] == WAL_OP_STORE) { // This STORE was not followed by a COMMIT — truncate to before it if (safe_size < 0) { // We need to know where this record was appended. // The data_len is in the WAL entry; walk the data file to find // the last good position. Since we haven't built the index yet, // truncate to the file size before this batch. // Use file_size_ which scan_and_build_index would set. // Actually, we should just re-scan after truncation. // The safest approach: truncate any trailing incomplete records. } } } // Simpler approach: if WAL has uncommitted entries, re-scan the data file. // scan_and_build_index() already handles truncated records by stopping at them. // So we just need to physically truncate the file at the last valid record. // Re-scan to find valid end off_t data_end = pp_lseek(fd_, 0, SEEK_END); off_t pos = 0; off_t last_valid_end = 0; while (pos + static_cast<off_t>(RECORD_HEADER_SIZE) <= data_end) { uint8_t hdr[RECORD_HEADER_SIZE]; ssize_t r = pp_pread(fd_, hdr, RECORD_HEADER_SIZE, pos); if (r != static_cast<ssize_t>(RECORD_HEADER_SIZE)) break; uint32_t data_len = 0; std::memcpy(&data_len, &hdr[13], 4); off_t rec_end = pos + static_cast<off_t>(RECORD_HEADER_SIZE + data_len); if (rec_end > data_end) break; // truncated record // Verify CRC32 uint32_t stored_crc = 0; std::memcpy(&stored_crc, &hdr[17], 4); std::vector<uint8_t> payload(data_len); if (data_len > 0) { ssize_t dr = pp_pread(fd_, payload.data(), data_len, pos + RECORD_HEADER_SIZE); if (dr != static_cast<ssize_t>(data_len)) break; } uint32_t computed_crc = crc32(payload.data(), data_len); if (stored_crc != computed_crc) break; // corrupt record last_valid_end = rec_end; pos = rec_end; } // Truncate data file to last valid record if (last_valid_end < data_end) { #ifdef _WIN32 _chsize(fd_, static_cast<long>(last_valid_end)); #else if (::ftruncate(fd_, last_valid_end) != 0) { /* ignore */ } ::fdatasync(fd_); #endif } wal_clear(); } void file_block_store::fsync_dir() { #ifndef _WIN32 int dir_fd = ::open(base_dir_.c_str(), O_RDONLY | O_DIRECTORY); if (dir_fd >= 0) { ::fsync(dir_fd); ::close(dir_fd); } #endif } file_block_store::file_block_store(const std::string& base_dir, size_t write_buf_threshold) : data_path_(base_dir + "/blocks.bin"), write_buf_threshold_(write_buf_threshold) { : base_dir_(base_dir), data_path_(base_dir + "/blocks.bin"), wal_path_(base_dir + "/blocks.wal"), write_buf_threshold_(write_buf_threshold) { std::filesystem::create_directories(base_dir); fd_ = pp_open(data_path_.c_str(), O_RDWR | O_CREAT, 0644); if (fd_ >= 0) if (fd_ >= 0) { wal_open(); wal_recover(); scan_and_build_index(); } } file_block_store::~file_block_store() { if (fd_ >= 0) { std::lock_guard<std::mutex> lock(mutex_); flush_impl(); wal_close(); pp_close(fd_); } } Loading @@ -170,14 +350,29 @@ bool file_block_store::scan_and_build_index() { uint64_t group_id = 0; uint32_t block_idx = 0; uint32_t data_len = 0; uint32_t stored_crc = 0; std::memcpy(&group_id, &hdr[1], 8); std::memcpy(&block_idx, &hdr[9], 4); std::memcpy(&data_len, &hdr[13], 4); std::memcpy(&stored_crc, &hdr[17], 4); if (pos + static_cast<off_t>(RECORD_HEADER_SIZE + data_len) > end) break; // truncated record // Verify CRC32 for live records if (flags == RECORD_LIVE) { std::vector<uint8_t> payload(data_len); if (data_len > 0) { ssize_t dr = pp_pread(fd_, payload.data(), data_len, pos + RECORD_HEADER_SIZE); if (dr != static_cast<ssize_t>(data_len)) break; } uint32_t computed_crc = crc32(payload.data(), data_len); if (stored_crc != computed_crc) { // Corrupt record — stop scanning here; treat rest as garbage break; } auto& entry = index_[group_id][block_idx]; entry.offset = pos; entry.data_len = data_len; Loading @@ -189,6 +384,18 @@ bool file_block_store::scan_and_build_index() { pos += static_cast<off_t>(RECORD_HEADER_SIZE + data_len); } // Truncate any trailing garbage/corrupt data if (pos < end) { #ifdef _WIN32 _chsize(fd_, static_cast<long>(pos)); #else if (::ftruncate(fd_, pos) != 0) { /* ignore */ } #endif file_size_ = pos; write_buf_base_ = pos; } return true; } Loading @@ -197,12 +404,18 @@ void file_block_store::flush_impl() { if (write_buf_.empty()) return; ssize_t w = pp_pwrite(fd_, write_buf_.data(), write_buf_.size(), write_buf_base_); if (w == static_cast<ssize_t>(write_buf_.size())) { // Ensure data reaches stable storage #ifndef _WIN32 ::fdatasync(fd_); #endif // Mark all buffered entries as no longer in write_buf for (auto& [gid, blocks] : index_) { for (auto& [bidx, entry] : blocks) { entry.in_write_buf = false; } } // Data is durable — clear the WAL wal_clear(); } write_buf_base_ += static_cast<off_t>(write_buf_.size()); write_buf_.clear(); Loading @@ -217,11 +430,22 @@ bool file_block_store::store(uint64_t group_id, uint32_t block_index, const uint8_t* data, size_t len) { std::lock_guard<std::mutex> lock(mutex_); // Compute CRC32 of payload uint32_t payload_crc = crc32(data, len); // WAL: log intent to store wal_log(WAL_OP_STORE, group_id, block_index, static_cast<uint32_t>(len), payload_crc); // If this key already exists, tombstone the old record auto git = index_.find(group_id); if (git != index_.end()) { auto bit = git->second.find(block_index); if (bit != git->second.end()) { // WAL: log tombstone intent wal_log(WAL_OP_TOMBSTONE, group_id, block_index, bit->second.data_len, 0); if (bit->second.in_write_buf) { // Old record is still in write buffer — mark deleted in buffer off_t buf_off = bit->second.offset - write_buf_base_; Loading @@ -237,6 +461,9 @@ bool file_block_store::store(uint64_t group_id, uint32_t block_index, } } // WAL: commit intent (data is about to be written) wal_commit(); // Append to write buffer off_t pos = write_buf_base_ + static_cast<off_t>(write_buf_.size()); Loading @@ -246,6 +473,7 @@ bool file_block_store::store(uint64_t group_id, uint32_t block_index, std::memcpy(&hdr[1], &group_id, 8); std::memcpy(&hdr[9], &block_index, 4); std::memcpy(&hdr[13], &dlen, 4); std::memcpy(&hdr[17], &payload_crc, 4); write_buf_.insert(write_buf_.end(), hdr, hdr + RECORD_HEADER_SIZE); if (len > 0) Loading Loading @@ -380,6 +608,10 @@ bool file_block_store::vacuum_impl() { } buf[0] = RECORD_LIVE; // ensure live flag // Recompute CRC32 for the new record uint32_t new_crc = crc32(buf.data() + RECORD_HEADER_SIZE, entry.data_len); std::memcpy(&buf[17], &new_crc, 4); ssize_t w = pp_pwrite(tmp_fd, buf.data(), rec_size, new_pos); if (w != static_cast<ssize_t>(rec_size)) { pp_close(tmp_fd); Loading @@ -398,15 +630,26 @@ bool file_block_store::vacuum_impl() { } } // Fsync temp file before swap to ensure all data is on stable storage #ifndef _WIN32 ::fdatasync(tmp_fd); #endif // Swap files pp_close(fd_); if (::rename(tmp_path.c_str(), data_path_.c_str()) != 0) { fd_ = pp_open(data_path_.c_str(), O_RDWR | O_CREAT, 0644); pp_close(tmp_fd); return false; } fd_ = pp_open(data_path_.c_str(), O_RDWR, 0644); if (fd_ < 0) return false; // Fsync directory to make the rename durable fsync_dir(); pp_close(tmp_fd); index_ = std::move(new_index); total_blocks_ = new_total; dead_bytes_ = 0; Loading Loading
include/paritypp/server.h +34 −2 Original line number Diff line number Diff line Loading @@ -69,6 +69,13 @@ private: // Append-only file block store // All blocks stored in a single binary file with in-memory index. // Deleted blocks are tombstoned; vacuum rewrites the file to reclaim space. // // Write-hole protection: // - Per-record CRC32 checksum to detect corruption and truncated writes // - Write-ahead journal (blocks.wal) logs intent before data modification; // on startup, incomplete journal entries are replayed or rolled back // - fdatasync after every flush to ensure durability // - Crash-safe vacuum: fsync tmp file + directory after rename class file_block_store : public block_store { public: explicit file_block_store(const std::string& base_dir, size_t write_buf_threshold = DEFAULT_WRITE_BUF_THRESHOLD); Loading @@ -94,8 +101,8 @@ public: private: static constexpr uint8_t RECORD_LIVE = 0x00; static constexpr uint8_t RECORD_DELETED = 0xFF; // Record header: [1B flags][8B group_id][4B block_index][4B data_len] static constexpr size_t RECORD_HEADER_SIZE = 17; // Record header: [1B flags][8B group_id][4B block_index][4B data_len][4B crc32] static constexpr size_t RECORD_HEADER_SIZE = 21; // Write buffer threshold: flush when buffer exceeds this size (0 = immediate flush) static constexpr size_t DEFAULT_WRITE_BUF_THRESHOLD = 256 * 1024; // 256 KB Loading @@ -105,7 +112,9 @@ private: bool in_write_buf; // true if data is still in write_buf_ (not yet flushed) }; std::string base_dir_; std::string data_path_; std::string wal_path_; int fd_ = -1; std::mutex mutex_; // group_id -> (block_index -> entry) Loading @@ -120,10 +129,33 @@ private: off_t write_buf_base_ = 0; // file offset where write_buf_ starts size_t write_buf_threshold_ = DEFAULT_WRITE_BUF_THRESHOLD; // CRC32 computation (IEEE polynomial) static uint32_t crc32(const uint8_t* data, size_t len); // WAL journal operations // WAL record: [1B op][8B group_id][4B block_index][4B data_len][4B crc32] // op=0x01: STORE (intent to append) // op=0x02: TOMBSTONE (intent to mark deleted) // op=0x03: COMMIT (previous ops are durable) static constexpr uint8_t WAL_OP_STORE = 0x01; static constexpr uint8_t WAL_OP_TOMBSTONE = 0x02; static constexpr uint8_t WAL_OP_COMMIT = 0x03; static constexpr size_t WAL_RECORD_SIZE = 21; // 1+8+4+4+4 int wal_fd_ = -1; void wal_open(); void wal_close(); void wal_log(uint8_t op, uint64_t group_id, uint32_t block_index, uint32_t data_len, uint32_t crc); void wal_commit(); // write COMMIT marker + fdatasync void wal_clear(); // truncate WAL after successful data flush void wal_recover(); // replay/rollback incomplete WAL entries on startup bool scan_and_build_index(); void maybe_auto_vacuum(); bool vacuum_impl(); // internal vacuum, caller must hold mutex_ void flush_impl(); // flush write buffer, caller must hold mutex_ void fsync_dir(); // fsync the base directory (for rename durability) }; // QUIC-based parity storage server Loading
src/server.cpp +252 −9 Original line number Diff line number Diff line Loading @@ -131,20 +131,200 @@ std::vector<uint64_t> memory_block_store::list_groups() { return result; } // --- file_block_store (append-only binary file) --- // --- file_block_store (append-only binary file with write-hole protection) --- // CRC32 (IEEE 802.3 polynomial 0xEDB88320, reflected) uint32_t file_block_store::crc32(const uint8_t* data, size_t len) { uint32_t crc = 0xFFFFFFFF; for (size_t i = 0; i < len; ++i) { crc ^= data[i]; for (int j = 0; j < 8; ++j) crc = (crc >> 1) ^ (0xEDB88320 & (-(crc & 1))); } return crc ^ 0xFFFFFFFF; } // --- WAL journal --- void file_block_store::wal_open() { wal_fd_ = pp_open(wal_path_.c_str(), O_RDWR | O_CREAT, 0644); } void file_block_store::wal_close() { if (wal_fd_ >= 0) { pp_close(wal_fd_); wal_fd_ = -1; } } void file_block_store::wal_log(uint8_t op, uint64_t group_id, uint32_t block_index, uint32_t data_len, uint32_t crc) { if (wal_fd_ < 0) return; uint8_t rec[WAL_RECORD_SIZE]; rec[0] = op; std::memcpy(&rec[1], &group_id, 8); std::memcpy(&rec[9], &block_index, 4); std::memcpy(&rec[13], &data_len, 4); std::memcpy(&rec[17], &crc, 4); off_t end = pp_lseek(wal_fd_, 0, SEEK_END); (void)pp_pwrite(wal_fd_, rec, WAL_RECORD_SIZE, end); } void file_block_store::wal_commit() { if (wal_fd_ < 0) return; uint8_t rec[WAL_RECORD_SIZE] = {}; rec[0] = WAL_OP_COMMIT; off_t end = pp_lseek(wal_fd_, 0, SEEK_END); (void)pp_pwrite(wal_fd_, rec, WAL_RECORD_SIZE, end); #ifndef _WIN32 ::fdatasync(wal_fd_); #endif } void file_block_store::wal_clear() { if (wal_fd_ < 0) return; #ifdef _WIN32 _chsize(wal_fd_, 0); #else if (::ftruncate(wal_fd_, 0) != 0) { /* ignore */ } ::fdatasync(wal_fd_); #endif } void file_block_store::wal_recover() { // Read all WAL entries. If the last entry is not a COMMIT, the flush // was interrupted — truncate the data file to its pre-write state and // rebuild the index. if (wal_fd_ < 0) return; off_t wal_end = pp_lseek(wal_fd_, 0, SEEK_END); if (wal_end <= 0) return; // empty WAL, nothing to recover size_t num_entries = static_cast<size_t>(wal_end) / WAL_RECORD_SIZE; if (num_entries == 0) return; // Read last entry uint8_t last[WAL_RECORD_SIZE]; ssize_t n = pp_pread(wal_fd_, last, WAL_RECORD_SIZE, static_cast<off_t>((num_entries - 1) * WAL_RECORD_SIZE)); if (n != static_cast<ssize_t>(WAL_RECORD_SIZE)) { // Corrupt WAL — rebuild index from data file (already done by scan_and_build_index) wal_clear(); return; } if (last[0] == WAL_OP_COMMIT) { // Last operation was committed — data file is consistent wal_clear(); return; } // Incomplete write — find the file size before the uncommitted ops. // Walk entries to find the last COMMIT, then roll back everything after. off_t safe_size = -1; for (size_t i = 0; i < num_entries; ++i) { uint8_t rec[WAL_RECORD_SIZE]; ssize_t r = pp_pread(wal_fd_, rec, WAL_RECORD_SIZE, static_cast<off_t>(i * WAL_RECORD_SIZE)); if (r != static_cast<ssize_t>(WAL_RECORD_SIZE)) break; if (rec[0] == WAL_OP_COMMIT) { // Everything up to here is safe — next STORE tells us the file offset // Continue scanning to find the end of committed data safe_size = -1; // reset; will be set by next STORE if any } else if (rec[0] == WAL_OP_STORE) { // This STORE was not followed by a COMMIT — truncate to before it if (safe_size < 0) { // We need to know where this record was appended. // The data_len is in the WAL entry; walk the data file to find // the last good position. Since we haven't built the index yet, // truncate to the file size before this batch. // Use file_size_ which scan_and_build_index would set. // Actually, we should just re-scan after truncation. // The safest approach: truncate any trailing incomplete records. } } } // Simpler approach: if WAL has uncommitted entries, re-scan the data file. // scan_and_build_index() already handles truncated records by stopping at them. // So we just need to physically truncate the file at the last valid record. // Re-scan to find valid end off_t data_end = pp_lseek(fd_, 0, SEEK_END); off_t pos = 0; off_t last_valid_end = 0; while (pos + static_cast<off_t>(RECORD_HEADER_SIZE) <= data_end) { uint8_t hdr[RECORD_HEADER_SIZE]; ssize_t r = pp_pread(fd_, hdr, RECORD_HEADER_SIZE, pos); if (r != static_cast<ssize_t>(RECORD_HEADER_SIZE)) break; uint32_t data_len = 0; std::memcpy(&data_len, &hdr[13], 4); off_t rec_end = pos + static_cast<off_t>(RECORD_HEADER_SIZE + data_len); if (rec_end > data_end) break; // truncated record // Verify CRC32 uint32_t stored_crc = 0; std::memcpy(&stored_crc, &hdr[17], 4); std::vector<uint8_t> payload(data_len); if (data_len > 0) { ssize_t dr = pp_pread(fd_, payload.data(), data_len, pos + RECORD_HEADER_SIZE); if (dr != static_cast<ssize_t>(data_len)) break; } uint32_t computed_crc = crc32(payload.data(), data_len); if (stored_crc != computed_crc) break; // corrupt record last_valid_end = rec_end; pos = rec_end; } // Truncate data file to last valid record if (last_valid_end < data_end) { #ifdef _WIN32 _chsize(fd_, static_cast<long>(last_valid_end)); #else if (::ftruncate(fd_, last_valid_end) != 0) { /* ignore */ } ::fdatasync(fd_); #endif } wal_clear(); } void file_block_store::fsync_dir() { #ifndef _WIN32 int dir_fd = ::open(base_dir_.c_str(), O_RDONLY | O_DIRECTORY); if (dir_fd >= 0) { ::fsync(dir_fd); ::close(dir_fd); } #endif } file_block_store::file_block_store(const std::string& base_dir, size_t write_buf_threshold) : data_path_(base_dir + "/blocks.bin"), write_buf_threshold_(write_buf_threshold) { : base_dir_(base_dir), data_path_(base_dir + "/blocks.bin"), wal_path_(base_dir + "/blocks.wal"), write_buf_threshold_(write_buf_threshold) { std::filesystem::create_directories(base_dir); fd_ = pp_open(data_path_.c_str(), O_RDWR | O_CREAT, 0644); if (fd_ >= 0) if (fd_ >= 0) { wal_open(); wal_recover(); scan_and_build_index(); } } file_block_store::~file_block_store() { if (fd_ >= 0) { std::lock_guard<std::mutex> lock(mutex_); flush_impl(); wal_close(); pp_close(fd_); } } Loading @@ -170,14 +350,29 @@ bool file_block_store::scan_and_build_index() { uint64_t group_id = 0; uint32_t block_idx = 0; uint32_t data_len = 0; uint32_t stored_crc = 0; std::memcpy(&group_id, &hdr[1], 8); std::memcpy(&block_idx, &hdr[9], 4); std::memcpy(&data_len, &hdr[13], 4); std::memcpy(&stored_crc, &hdr[17], 4); if (pos + static_cast<off_t>(RECORD_HEADER_SIZE + data_len) > end) break; // truncated record // Verify CRC32 for live records if (flags == RECORD_LIVE) { std::vector<uint8_t> payload(data_len); if (data_len > 0) { ssize_t dr = pp_pread(fd_, payload.data(), data_len, pos + RECORD_HEADER_SIZE); if (dr != static_cast<ssize_t>(data_len)) break; } uint32_t computed_crc = crc32(payload.data(), data_len); if (stored_crc != computed_crc) { // Corrupt record — stop scanning here; treat rest as garbage break; } auto& entry = index_[group_id][block_idx]; entry.offset = pos; entry.data_len = data_len; Loading @@ -189,6 +384,18 @@ bool file_block_store::scan_and_build_index() { pos += static_cast<off_t>(RECORD_HEADER_SIZE + data_len); } // Truncate any trailing garbage/corrupt data if (pos < end) { #ifdef _WIN32 _chsize(fd_, static_cast<long>(pos)); #else if (::ftruncate(fd_, pos) != 0) { /* ignore */ } #endif file_size_ = pos; write_buf_base_ = pos; } return true; } Loading @@ -197,12 +404,18 @@ void file_block_store::flush_impl() { if (write_buf_.empty()) return; ssize_t w = pp_pwrite(fd_, write_buf_.data(), write_buf_.size(), write_buf_base_); if (w == static_cast<ssize_t>(write_buf_.size())) { // Ensure data reaches stable storage #ifndef _WIN32 ::fdatasync(fd_); #endif // Mark all buffered entries as no longer in write_buf for (auto& [gid, blocks] : index_) { for (auto& [bidx, entry] : blocks) { entry.in_write_buf = false; } } // Data is durable — clear the WAL wal_clear(); } write_buf_base_ += static_cast<off_t>(write_buf_.size()); write_buf_.clear(); Loading @@ -217,11 +430,22 @@ bool file_block_store::store(uint64_t group_id, uint32_t block_index, const uint8_t* data, size_t len) { std::lock_guard<std::mutex> lock(mutex_); // Compute CRC32 of payload uint32_t payload_crc = crc32(data, len); // WAL: log intent to store wal_log(WAL_OP_STORE, group_id, block_index, static_cast<uint32_t>(len), payload_crc); // If this key already exists, tombstone the old record auto git = index_.find(group_id); if (git != index_.end()) { auto bit = git->second.find(block_index); if (bit != git->second.end()) { // WAL: log tombstone intent wal_log(WAL_OP_TOMBSTONE, group_id, block_index, bit->second.data_len, 0); if (bit->second.in_write_buf) { // Old record is still in write buffer — mark deleted in buffer off_t buf_off = bit->second.offset - write_buf_base_; Loading @@ -237,6 +461,9 @@ bool file_block_store::store(uint64_t group_id, uint32_t block_index, } } // WAL: commit intent (data is about to be written) wal_commit(); // Append to write buffer off_t pos = write_buf_base_ + static_cast<off_t>(write_buf_.size()); Loading @@ -246,6 +473,7 @@ bool file_block_store::store(uint64_t group_id, uint32_t block_index, std::memcpy(&hdr[1], &group_id, 8); std::memcpy(&hdr[9], &block_index, 4); std::memcpy(&hdr[13], &dlen, 4); std::memcpy(&hdr[17], &payload_crc, 4); write_buf_.insert(write_buf_.end(), hdr, hdr + RECORD_HEADER_SIZE); if (len > 0) Loading Loading @@ -380,6 +608,10 @@ bool file_block_store::vacuum_impl() { } buf[0] = RECORD_LIVE; // ensure live flag // Recompute CRC32 for the new record uint32_t new_crc = crc32(buf.data() + RECORD_HEADER_SIZE, entry.data_len); std::memcpy(&buf[17], &new_crc, 4); ssize_t w = pp_pwrite(tmp_fd, buf.data(), rec_size, new_pos); if (w != static_cast<ssize_t>(rec_size)) { pp_close(tmp_fd); Loading @@ -398,15 +630,26 @@ bool file_block_store::vacuum_impl() { } } // Fsync temp file before swap to ensure all data is on stable storage #ifndef _WIN32 ::fdatasync(tmp_fd); #endif // Swap files pp_close(fd_); if (::rename(tmp_path.c_str(), data_path_.c_str()) != 0) { fd_ = pp_open(data_path_.c_str(), O_RDWR | O_CREAT, 0644); pp_close(tmp_fd); return false; } fd_ = pp_open(data_path_.c_str(), O_RDWR, 0644); if (fd_ < 0) return false; // Fsync directory to make the rename durable fsync_dir(); pp_close(tmp_fd); index_ = std::move(new_index); total_blocks_ = new_total; dead_bytes_ = 0; Loading