Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion conanfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

class HomestoreConan(ConanFile):
name = "homestore"
version = "7.5.9"
version = "7.5.10"

homepage = "https://github.com/eBay/Homestore"
description = "HomeStore Storage Engine"
Expand Down
21 changes: 21 additions & 0 deletions src/lib/common/homestore_config.fbs
Original file line number Diff line number Diff line change
Expand Up @@ -324,6 +324,27 @@ table Consensus {
// Log frequency for gc_repl_reqs messages (log every N times)
// 60 * repl_dev_cleanup_interval_sec (60s) means every 1 hour we will log the gc repl reqs info.
gc_repl_reqs_log_frequency: uint32 = 60 (hotswap);

// Controls whether senders compute and attach a CRC32 checksum to pushdata/fetchdata payloads.
// The receiver always skips CRC verification when the checksum field is zero, so this flag only
// affects the send side.
//
// ROLLING UPGRADE SAFETY — two directions to consider:
// Safe: old sender → new receiver. Old nodes send raw block data with no FlatBuffer header;
// new receivers detect the absence of the "FDRS" identifier and fall back to legacy
// processing. checksum=0 (FlatBuffer field default) causes CRC verification to be
// skipped automatically.
// UNSAFE: new sender (data_checksum_enabled=true) → old receiver. An old follower has no
// try-and-fallback logic; it will treat the prepended FlatBuffer header bytes as raw
// block data and write them to disk — SILENT DATA CORRUPTION with no error logged.
//
// DEPLOYMENT RULE: do NOT enable this flag via hotswap until every node in the cluster has been
// upgraded to a version that contains the try-and-fallback FetchData receiver. Enable only
// after the rolling upgrade is complete.
//
// Not enabled by default: TCP already detects partial-transfer corruption; CRC is opt-in for
// environments requiring additional durability guarantees.
data_checksum_enabled: bool = false (hotswap);
}

table HomeStoreSettings {
Expand Down
1 change: 1 addition & 0 deletions src/lib/replication/fetch_data_rpc.fbs
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ table ResponseEntry {
dsn : uint64; // Data Sequence number
raft_term : uint64; // Raft term number
data_size : uint32; // Size of the data which is sent as separate non flatbuffer
checksum: uint32; // CRC32 over the data for this entry; 0 when checksum is disabled
}

table FetchDataResponse {
Expand Down
1 change: 1 addition & 0 deletions src/lib/replication/push_data_rpc.fbs
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ table PushDataRequest {
user_key : [ubyte]; // User key data
data_size : uint32; // Data size, actual data is sent as separate blob not by flatbuffer
time_ms: uint64; // time point when originator pushed this request;
checksum: uint32; // CRC32 over the data payload; 0 when checksum is disabled
}

root_type PushDataRequest;
227 changes: 214 additions & 13 deletions src/lib/replication/repl_dev/raft_repl_dev.cpp

Large diffs are not rendered by default.

4 changes: 4 additions & 0 deletions src/lib/replication/repl_dev/raft_repl_dev.h
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,10 @@ class RaftReplDevMetrics : public sisl::MetricsGroup {
REGISTER_COUNTER(read_err_cnt, "total read error count", "read_err_cnt", {"op", "read"});
REGISTER_COUNTER(write_err_cnt, "total write error count", "write_err_cnt", {"op", "write"});
REGISTER_COUNTER(fetch_err_cnt, "total fetch data error count", "fetch_err_cnt", {"op", "fetch"});
REGISTER_COUNTER(push_data_checksum_mismatch_cnt, "CRC32 mismatches on push data channel",
"push_data_checksum_mismatch_cnt", {"op", "checksum_push"});
REGISTER_COUNTER(fetch_data_checksum_mismatch_cnt, "CRC32 mismatches on fetch data channel",
"fetch_data_checksum_mismatch_cnt", {"op", "checksum_fetch"});

REGISTER_COUNTER(fetch_rreq_cnt, "total fetch data count", "fetch_data_req_cnt", {"op", "fetch"});
REGISTER_COUNTER(fetch_total_blk_size, "total fetch data blocks size", "fetch_total_blk_size", {"op", "fetch"});
Expand Down
120 changes: 120 additions & 0 deletions src/tests/test_raft_repl_dev.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,127 @@ TEST_F(RaftReplDevTest, Follower_Fetch_OnActive_ReplicaGroup) {
g_helper->sync_for_cleanup_start();
if (g_helper->replica_num() != 0) { g_helper->remove_flip("drop_push_data_request"); }
}
// Also validates the try-and-fallback receive path: with data_checksum_enabled=false (default),
// no FetchDataResponse FlatBuffer header is emitted by the sender. The new receiver code must
// treat the raw block bytes as old-format data — exactly what an old sender would produce during
// a rolling upgrade. Passes = old-format fallback is correct.
#endif

// Verifies the happy path with checksums explicitly enabled: writes should commit correctly
// and all replicas should hold the same data.
TEST_F(RaftReplDevTest, Checksum_Enabled_PushData_Path) {
LOGINFO("Homestore replica={} setup completed", g_helper->replica_num());
g_helper->sync_for_test_start();

LOGINFO("Enabling data_checksum_enabled");
HS_SETTINGS_FACTORY().modifiable_settings([](auto& s) { s.consensus.data_checksum_enabled = true; });
HS_SETTINGS_FACTORY().save();

this->write_on_leader(20, true /* wait_for_commit */);

g_helper->sync_for_verify_start();
LOGINFO("Validate all data written so far by reading them");
this->validate_data();

HS_SETTINGS_FACTORY().modifiable_settings([](auto& s) { s.consensus.data_checksum_enabled = false; });
HS_SETTINGS_FACTORY().save();
g_helper->sync_for_cleanup_start();
}

#ifdef _PRERELEASE
// Verifies that the fetch path works correctly with checksums enabled.
// Drops all push-data on non-leader replicas so they are forced to fetch, then checks that
// the framing header is correctly parsed and data arrives intact.
TEST_F(RaftReplDevTest, Checksum_Enabled_FetchData_Path) {
LOGINFO("Homestore replica={} setup completed", g_helper->replica_num());
g_helper->sync_for_test_start();

LOGINFO("Enabling data_checksum_enabled");
HS_SETTINGS_FACTORY().modifiable_settings([](auto& s) { s.consensus.data_checksum_enabled = true; });
HS_SETTINGS_FACTORY().save();

if (g_helper->replica_num() != 0) {
LOGINFO("Drop all push-data so follower {} must fetch with checksum header", g_helper->replica_num());
g_helper->set_basic_flip("drop_push_data_request");
}

this->write_on_leader(20, true /* wait_for_commit */);

g_helper->sync_for_verify_start();
LOGINFO("Validate all data written so far by reading them");
this->validate_data();

HS_SETTINGS_FACTORY().modifiable_settings([](auto& s) { s.consensus.data_checksum_enabled = false; });
HS_SETTINGS_FACTORY().save();
g_helper->sync_for_cleanup_start();
if (g_helper->replica_num() != 0) { g_helper->remove_flip("drop_push_data_request"); }
}

// Verifies that a PushData checksum mismatch is detected and the follower recovers correctly.
// The corrupt_push_data_checksum flip causes the receiver to treat a valid CRC as wrong,
// simulating bit corruption in transit. Followers drop the packet and fall back to fetch;
// data must still commit correctly on all replicas.
TEST_F(RaftReplDevTest, Checksum_Mismatch_PushData_Path) {
LOGINFO("Homestore replica={} setup completed", g_helper->replica_num());
g_helper->sync_for_test_start();

HS_SETTINGS_FACTORY().modifiable_settings([](auto& s) { s.consensus.data_checksum_enabled = true; });
HS_SETTINGS_FACTORY().save();

if (g_helper->replica_num() != 0) {
LOGINFO("Enabling corrupt_push_data_checksum on follower {} to simulate CRC mismatch",
g_helper->replica_num());
g_helper->set_basic_flip("corrupt_push_data_checksum");
}

this->write_on_leader(10, true /* wait_for_commit */);

g_helper->sync_for_verify_start();
LOGINFO("Validate data: follower must have recovered via fetch despite push checksum mismatch");
this->validate_data();

HS_SETTINGS_FACTORY().modifiable_settings([](auto& s) { s.consensus.data_checksum_enabled = false; });
HS_SETTINGS_FACTORY().save();
g_helper->sync_for_cleanup_start();
if (g_helper->replica_num() != 0) { g_helper->remove_flip("corrupt_push_data_checksum"); }
}

// Verifies that a FetchData checksum mismatch triggers an immediate re-fetch and the follower
// recovers correctly. Drops push-data to force the fetch path, then fires
// corrupt_fetch_data_checksum once per entry so the first fetch response looks corrupted.
// The immediate re-fetch (check_and_fetch_remote_data) must succeed and data must commit.
TEST_F(RaftReplDevTest, Checksum_Mismatch_FetchData_Path) {
LOGINFO("Homestore replica={} setup completed", g_helper->replica_num());
g_helper->sync_for_test_start();

HS_SETTINGS_FACTORY().modifiable_settings([](auto& s) { s.consensus.data_checksum_enabled = true; });
HS_SETTINGS_FACTORY().save();

if (g_helper->replica_num() != 0) {
LOGINFO("Follower {}: dropping push-data and injecting one-shot fetch checksum corruption",
g_helper->replica_num());
g_helper->set_basic_flip("drop_push_data_request");
// Fire once: first fetch response is checksum-corrupted; the immediate re-fetch succeeds.
g_helper->set_basic_flip("corrupt_fetch_data_checksum", 1, 100);
}

this->write_on_leader(10, true /* wait_for_commit */);

g_helper->sync_for_verify_start();
LOGINFO("Validate data: follower must have recovered via re-fetch after fetch checksum mismatch");
this->validate_data();

HS_SETTINGS_FACTORY().modifiable_settings([](auto& s) { s.consensus.data_checksum_enabled = false; });
HS_SETTINGS_FACTORY().save();
g_helper->sync_for_cleanup_start();
if (g_helper->replica_num() != 0) {
g_helper->remove_flip("drop_push_data_request");
g_helper->remove_flip("corrupt_fetch_data_checksum");
}
}
#endif

#ifdef _PRERELEASE
TEST_F(RaftReplDevTest, Write_With_Diabled_Leader_Push_Data) {
g_helper->set_basic_flip("disable_leader_push_data", std::numeric_limits< int >::max(), 100);
LOGINFO("Homestore replica={} setup completed, all the push_data from leader are disabled",
Expand Down
Loading