From: Peter Iannucci Date: Tue, 8 Oct 2013 14:13:49 +0000 (-0400) Subject: Reduced timeouts by 10x X-Git-Url: http://xvm.mit.edu/gitweb/invirt/third/libt4.git/commitdiff_plain/603bac8fcb3697f283e6537d81b4a92e457ebbad Reduced timeouts by 10x --- diff --git a/Makefile.osx b/Makefile.osx index 91ce5ee..7b2b74a 100644 --- a/Makefile.osx +++ b/Makefile.osx @@ -1,5 +1,7 @@ -PEDANTRY = -Weverything -pedantic-errors -Werror -Wno-c++98-compat -Wno-c++98-compat-pedantic -Wno-padded -Wno-missing-prototypes -Wmissing-declarations -PEDANTRY += -Wno-weak-vtables -Wno-global-constructors -Wno-exit-time-destructors +PEDANTRY = -Weverything -pedantic-errors -Werror -Wno-c++98-compat \ + -Wno-c++98-compat-pedantic -Wno-padded -Wno-missing-prototypes \ + -Wmissing-declarations -Wno-weak-vtables -Wno-global-constructors \ + -Wno-exit-time-destructors CXXFLAGS = -ggdb -MMD -I. -std=c++11 -stdlib=libc++ $(PEDANTRY) LDFLAGS = -stdlib=libc++ CXX = clang++ diff --git a/config.cc b/config.cc index 727277e..5373007 100644 --- a/config.cc +++ b/config.cc @@ -149,7 +149,7 @@ void config::heartbeater() [[noreturn]] { lock cfg_mutex_lock(cfg_mutex); while (1) { - auto next_timeout = steady_clock::now() + seconds(3); + auto next_timeout = steady_clock::now() + milliseconds(300); LOG("go to sleep"); config_cond.wait_until(cfg_mutex_lock, next_timeout); @@ -204,7 +204,7 @@ config::heartbeat_t config::doheartbeat(const string &m, lock &cfg_mutex_lock) { cfg_mutex_lock.unlock(); int r = 0, ret = rpc_const::bind_failure; if (rpcc *cl = h.safebind()) - ret = cl->call_timeout(paxos_protocol::heartbeat, rpcc::to(1000), r, me, vid); + ret = cl->call_timeout(paxos_protocol::heartbeat, rpcc::to(100), r, me, vid); cfg_mutex_lock.lock(); heartbeat_t res = OK; diff --git a/lock_tester.cc b/lock_tester.cc index f8e2196..84ac5cc 100644 --- a/lock_tester.cc +++ b/lock_tester.cc @@ -71,7 +71,7 @@ void test2(int i) { lc[i]->acquire(a); LOG_NONMEMBER("test2: client " << i << " acquire done"); check_grant(a); - sleep(1); + usleep(100000); LOG_NONMEMBER("test2: client " << i << " release"); check_release(a); lc[i]->release(a); diff --git a/paxos.cc b/paxos.cc index 85507d5..b83a044 100644 --- a/paxos.cc +++ b/paxos.cc @@ -94,7 +94,7 @@ bool proposer_acceptor::prepare(unsigned instance, nodes_t & accepts, if (!r) continue; auto status = (paxos_protocol::status)r->call_timeout( - paxos_protocol::preparereq, rpcc::to(1000), res, me, instance, proposal); + paxos_protocol::preparereq, rpcc::to(100), res, me, instance, proposal); if (status == paxos_protocol::OK) { if (res.oldinstance) { LOG("commiting old instance!"); @@ -125,7 +125,7 @@ void proposer_acceptor::accept(unsigned instance, nodes_t & accepts, continue; bool accept = false; int status = r->call_timeout( - paxos_protocol::acceptreq, rpcc::to(1000), accept, me, instance, proposal, v); + paxos_protocol::acceptreq, rpcc::to(100), accept, me, instance, proposal, v); if (status == paxos_protocol::OK && accept) accepts.push_back(i); } @@ -138,7 +138,7 @@ void proposer_acceptor::decide(unsigned instance, const nodes_t & accepts, const if (!r) continue; int res = 0; - r->call_timeout(paxos_protocol::decidereq, rpcc::to(1000), res, me, instance, v); + r->call_timeout(paxos_protocol::decidereq, rpcc::to(100), res, me, instance, v); } } diff --git a/rpc/rpctest.cc b/rpc/rpctest.cc index 47d5bce..2f58e5d 100644 --- a/rpc/rpctest.cc +++ b/rpc/rpctest.cc @@ -52,7 +52,7 @@ srv::handle_fast(int &r, const int a) int srv::handle_slow(int &r, const int a) { - usleep(random() % 5000); + usleep(random() % 500); r = a + 2; return 0; } @@ -166,7 +166,7 @@ client3(void *xx) for(int i = 0; i < 4; i++){ int rep = 0; - int ret = c->call_timeout(24, rpcc::to(3000), rep, i); + int ret = c->call_timeout(24, rpcc::to(300), rep, i); VERIFY(ret == rpc_const::timeout_failure || rep == i+2); } } @@ -187,14 +187,14 @@ simple_tests(rpcc *c) cout << " -- string concat RPC .. ok" << endl; // small request, big reply (perhaps req via UDP, reply via TCP) - intret = c->call_timeout(25, rpcc::to(200000), rep, 70000); + intret = c->call_timeout(25, rpcc::to(20000), rep, 70000); VERIFY(intret == 0); VERIFY(rep.size() == 70000); cout << " -- small request, big reply .. ok" << endl; // specify a timeout value to an RPC that should succeed (udp) int xx = 0; - intret = c->call_timeout(23, rpcc::to(3000), xx, 77); + intret = c->call_timeout(23, rpcc::to(300), xx, 77); VERIFY(intret == 0 && xx == 78); cout << " -- no spurious timeout .. ok" << endl; @@ -202,7 +202,7 @@ simple_tests(rpcc *c) { string arg(1000, 'x'); string rep2; - c->call_timeout(22, rpcc::to(3000), rep2, arg, (string)"x"); + c->call_timeout(22, rpcc::to(300), rep2, arg, (string)"x"); VERIFY(rep2.size() == 1001); cout << " -- no spurious timeout .. ok" << endl; } @@ -217,7 +217,7 @@ simple_tests(rpcc *c) string non_existent = "127.0.0.1:7661"; rpcc *c1 = new rpcc(non_existent); time_t t0 = time(0); - intret = c1->bind(rpcc::to(3000)); + intret = c1->bind(rpcc::to(300)); time_t t1 = time(0); VERIFY(intret < 0 && (t1 - t0) <= 4); cout << " -- rpc timeout .. ok" << endl; @@ -425,6 +425,6 @@ main(int argc, char *argv[]) } while (1) { - sleep(1); + usleep(100000); } } diff --git a/rsm.cc b/rsm.cc index 255d281..81f0e4c 100644 --- a/rsm.cc +++ b/rsm.cc @@ -135,7 +135,7 @@ void rsm::recovery() [[noreturn]] { commit_change(cfg->view_id(), ml); } else { ml.unlock(); - this_thread::sleep_for(seconds(30)); // XXX make another node in cfg primary? + this_thread::sleep_for(seconds(3)); // XXX make another node in cfg primary? ml.lock(); } } @@ -214,7 +214,7 @@ bool rsm::statetransfer(const string & m, lock & rsm_mutex_lock) rsm_mutex_lock.unlock(); cl = h.safebind(); if (cl) { - ret = cl->call_timeout(rsm_protocol::transferreq, rpcc::to(1000), + ret = cl->call_timeout(rsm_protocol::transferreq, rpcc::to(100), r, cfg->myaddr(), last_myvs, vid_insync); } rsm_mutex_lock.lock(); @@ -257,7 +257,7 @@ bool rsm::join(const string & m, lock & rsm_mutex_lock) { rsm_mutex_lock.unlock(); cl = h.safebind(); if (cl != 0) { - ret = cl->call_timeout(rsm_protocol::joinreq, rpcc::to(120000), log, + ret = cl->call_timeout(rsm_protocol::joinreq, rpcc::to(12000), log, cfg->myaddr(), last_myvs); } rsm_mutex_lock.lock(); @@ -347,7 +347,7 @@ rsm_client_protocol::status rsm::client_invoke(string & r, int procno, const str if (!cl) return rsm_client_protocol::BUSY; int ignored_rval; - auto ret = (rsm_protocol::status)cl->call_timeout(rsm_protocol::invoke, rpcc::to(1000), ignored_rval, procno, vs, req); + auto ret = (rsm_protocol::status)cl->call_timeout(rsm_protocol::invoke, rpcc::to(100), ignored_rval, procno, vs, req); LOG("Invoke returned " << ret); if (ret != rsm_protocol::OK) return rsm_client_protocol::BUSY; diff --git a/rsm_client.cc b/rsm_client.cc index e5e4c2b..01098d6 100644 --- a/rsm_client.cc +++ b/rsm_client.cc @@ -26,7 +26,7 @@ rsm_protocol::status rsm_client::invoke(unsigned int proc, string &rep, const st rpcc *cl = h.safebind(); auto ret = rsm_client_protocol::OK; if (cl) - ret = (rsm_client_protocol::status)cl->call_timeout(rsm_client_protocol::invoke, rpcc::to(5000), rep, proc, req); + ret = (rsm_client_protocol::status)cl->call_timeout(rsm_client_protocol::invoke, rpcc::to(500), rep, proc, req); ml.lock(); if (!cl) @@ -37,7 +37,7 @@ rsm_protocol::status rsm_client::invoke(unsigned int proc, string &rep, const st return rsm_protocol::OK; if (ret == rsm_client_protocol::BUSY) { LOG("rsm is busy " << primary); - sleep(3); + usleep(300000); continue; } if (ret == rsm_client_protocol::NOTPRIMARY) { @@ -61,7 +61,7 @@ bool rsm_client::init_members(lock & rsm_client_mutex_lock) { rsm_client_mutex_lock.unlock(); cl = h.safebind(); if (cl) - ret = cl->call_timeout(rsm_client_protocol::members, rpcc::to(1000), known_mems, 0); + ret = cl->call_timeout(rsm_client_protocol::members, rpcc::to(100), known_mems, 0); rsm_client_mutex_lock.lock(); } if (cl == 0 || ret != rsm_protocol::OK) diff --git a/rsm_tester.pl b/rsm_tester.pl index 3d94744..43a0fb4 100755 --- a/rsm_tester.pl +++ b/rsm_tester.pl @@ -2,6 +2,7 @@ use POSIX ":sys_wait_h"; use Getopt::Std; +use Time::HiRes (usleep); use strict; @@ -34,7 +35,7 @@ sub killprocess { sub cleanup { kill 9, @pid; unlink(@logs); - sleep 2; + usleep 200000; } sub spawn { @@ -162,7 +163,7 @@ sub wait_for_view_change { my $lastv = `grep done $log | tail -n 1`; chomp $lastv; print " Waiting for $including to be present in >=$num_views views in $log (Last view: $lastv)\n"; - sleep 1; + usleep 100000; } if( get_num_views( $log, $including ) < $num_views) { @@ -179,7 +180,7 @@ sub waitpid_to { my $start = time(); my $done_pid; do { - sleep 1; + usleep 100000; $done_pid = waitpid($pid, POSIX::WNOHANG); } while( $done_pid <= 0 and (time() - $start) < $to ); @@ -224,7 +225,7 @@ sub start_nodes ($$){ @pid = (@pid, spawn_ls($p[0],$p[$i])); print "Start lock_server on $p[$i]\n"; } - sleep 1; + usleep 100000; my @vv = @p[0..$i]; wait_and_check_expected_view(\@vv); @@ -266,7 +267,7 @@ if ($do_run[0]) { print "test0: start 3-process lock server\n"; start_nodes(3,"ls"); cleanup(); - sleep 2; + usleep 200000; } if ($do_run[1]) { @@ -276,14 +277,14 @@ if ($do_run[1]) { print "Kill third server (PID: $pid[2]) on port $p[2]\n"; kill "TERM", $pid[2]; - sleep 5; + usleep 500000; # it should go through 4 views my @v4 = ($p[0], $p[1]); wait_and_check_expected_view(\@v4); cleanup(); - sleep 2; + usleep 200000; } if ($do_run[2]) { @@ -293,14 +294,14 @@ if ($do_run[2]) { print "Kill first (PID: $pid[0]) on port $p[0]\n"; kill "TERM", $pid[0]; - sleep 5; + usleep 500000; # it should go through 4 views my @v4 = ($p[1], $p[2]); wait_and_check_expected_view(\@v4); cleanup(); - sleep 2; + usleep 200000; } @@ -312,7 +313,7 @@ if ($do_run[3]) { print "Kill server (PID: $pid[2]) on port $p[2]\n"; kill "TERM", $pid[2]; - sleep 5; + usleep 500000; my @v4 = ($p[0], $p[1]); wait_and_check_expected_view(\@v4); @@ -320,13 +321,13 @@ if ($do_run[3]) { print "Restart killed server on port $p[2]\n"; $pid[2] = spawn_ls ($p[0], $p[2]); - sleep 5; + usleep 500000; my @v5 = ($p[0], $p[1], $p[2]); wait_and_check_expected_view(\@v5); cleanup(); - sleep 2; + usleep 200000; } if ($do_run[4]) { @@ -336,20 +337,20 @@ if ($do_run[4]) { print "Kill server (PID: $pid[2]) on port $p[2]\n"; kill "TERM", $pid[2]; - sleep 5; + usleep 500000; my @v4 = ($p[0], $p[1]); wait_and_check_expected_view(\@v4); print "Kill server (PID: $pid[1]) on port $p[1]\n"; kill "TERM", $pid[1]; - sleep 5; + usleep 500000; #no view change can happen because of a lack of majority print "Restarting server on port $p[2]\n"; $pid[2] = spawn_ls($p[0], $p[2]); - sleep 5; + usleep 500000; #no view change can happen because of a lack of majority foreach my $port (@p[0..2]) { @@ -361,13 +362,13 @@ if ($do_run[4]) { print "Kill server (PID: $pid[2]) on port $p[2]\n"; kill "TERM", $pid[2]; - sleep 5; + usleep 500000; print "Restarting server on port $p[1]\n"; $pid[1] = spawn_ls($p[0], $p[1]); - sleep 7; + usleep 700000; foreach my $port (@p[0..1]) { $in_views{$port} = get_num_views( paxos_log($port), $port ); @@ -400,12 +401,12 @@ if ($do_run[5]) { print "Sending paxos breakpoint 1 to first server on port $p[0]\n"; spawn("./rsm_tester", $p[0]+1, "breakpoint", 3); - sleep 1; + usleep 100000; print "Kill third server (PID: $pid[2]) on port $p[2]\n"; kill "TERM", $pid[2]; - sleep 5; + usleep 500000; foreach my $port (@p[0..2]) { my $num_v = get_num_views( paxos_log($port), $port ); die "$num_v views in ", paxos_log($port), " : no new views should be formed due to the lack of majority\n" if ($num_v != $in_views{$port}); @@ -417,7 +418,7 @@ if ($do_run[5]) { foreach my $port (@lastv) { wait_for_view_change(paxos_log($port), $in_views{$port}+1, $port, 20); } - sleep 10; + usleep 1000000; # now check the paxos logs and make sure the logs go through the right # views @@ -436,24 +437,24 @@ if ($do_run[6]) { print "Sending paxos breakpoint 2 to first server on port $p[0]\n"; spawn("./rsm_tester", $p[0]+1, "breakpoint", 4); - sleep 1; + usleep 100000; print "Kill fourth server (PID: $pid[3]) on port $p[3]\n"; kill "TERM", $pid[3]; - sleep 5; + usleep 500000; foreach my $port ($p[1],$p[2]) { my $num_v = get_num_views( paxos_log($port), $port ); die "$num_v views in ", paxos_log($port), " : no new views should be formed due to the lack of majority\n" if ($num_v != $in_views{$port}); } - sleep 5; + usleep 500000; print "Restarting fourth server on port $p[3]\n"; $pid[3] = spawn_ls($p[1], $p[3]); - sleep 5; + usleep 500000; my @v5 = ($p[0],$p[1],$p[2]); foreach my $port (@v5) { @@ -461,7 +462,7 @@ if ($do_run[6]) { } push @views, \@v5; - sleep 10; + usleep 1000000; # the 6th view will be (2,3) or (1,2,3,4) my @v6 = ($p[1],$p[2]); @@ -489,12 +490,12 @@ if ($do_run[7]) { start_nodes(4,"ls"); print "Sending paxos breakpoint 2 to first server on port $p[0]\n"; spawn("./rsm_tester", $p[0]+1, "breakpoint", 4); - sleep 3; + usleep 300000; print "Kill fourth server (PID: $pid[3]) on port $p[3]\n"; kill "TERM", $pid[3]; - sleep 5; + usleep 500000; print "Kill third server (PID: $pid[2]) on port $p[2]\n"; kill "TERM", $pid[2]; @@ -502,17 +503,17 @@ if ($do_run[7]) { print "Kill second server (PID: $pid[1]) on port $p[1]\n"; kill "TERM", $pid[1]; - sleep 5; + usleep 500000; print "Restarting second server on port $p[1]\n"; $pid[1] = spawn_ls($p[0], $p[1]); - sleep 5; + usleep 500000; print "Restarting third server on port $p[2]\n"; $pid[2] = spawn_ls($p[0], $p[2]); - sleep 5; + usleep 500000; #no view change is possible by now because there is no majority foreach my $port ($p[1],$p[2]) { @@ -523,7 +524,7 @@ if ($do_run[7]) { print "Restarting fourth server on port $p[3]\n"; $pid[3] = spawn_ls($p[1], $p[3]); - sleep 5; + usleep 500000; my @v5 = ($p[0], $p[1], $p[2]); push @views, \@v5; @@ -531,7 +532,7 @@ if ($do_run[7]) { $in_views{$port}++; } - sleep 15; + usleep 1500000; my @lastv = ($p[1],$p[2],$p[3]); foreach my $port (@lastv) { wait_for_view_change(paxos_log($port), $in_views{$port}+1, $port, 20); @@ -560,7 +561,7 @@ if ($do_run[8]) { } cleanup(); - sleep 2; + usleep 200000; } if ($do_run[9]) { @@ -576,7 +577,7 @@ if ($do_run[9]) { print "Kill slave (PID: $pid[2]) on port $p[2]\n"; kill "TERM", $pid[2]; - sleep 3; + usleep 300000; # it should go through 4 views my @v4 = ($p[0], $p[1]); @@ -590,7 +591,7 @@ if ($do_run[9]) { } cleanup(); - sleep 2; + usleep 200000; } if ($do_run[10]) { @@ -606,13 +607,13 @@ if ($do_run[10]) { print "Kill slave (PID: $pid[2]) on port $p[2]\n"; kill "TERM", $pid[2]; - sleep 3; + usleep 300000; # it should go through 4 views my @v4 = ($p[0], $p[1]); wait_and_check_expected_view(\@v4); - sleep 3; + usleep 300000; print "Restarting killed lock_server on port $p[2]\n"; $pid[2] = spawn_ls($p[0], $p[2]); @@ -627,7 +628,7 @@ if ($do_run[10]) { } cleanup(); - sleep 2; + usleep 200000; } @@ -644,7 +645,7 @@ if ($do_run[11]) { print "Kill primary (PID: $pid[0]) on port $p[0]\n"; kill "TERM", $pid[0]; - sleep 3; + usleep 300000; # it should go through 4 views my @v4 = ($p[1], $p[2]); @@ -658,7 +659,7 @@ if ($do_run[11]) { } cleanup(); - sleep 2; + usleep 200000; } if ($do_run[12]) { @@ -670,13 +671,13 @@ if ($do_run[12]) { print "Start lock_tester $p[0]\n"; $t = spawn("./lock_tester", $p[0]); - sleep 1; + usleep 100000; print "Kill master (PID: $pid[0]) on port $p[0] at breakpoint 1\n"; spawn("./rsm_tester", $p[0]+1, "breakpoint", 1); - sleep 1; + usleep 100000; # it should go through 5 views my @v4 = ($p[1], $p[2]); @@ -685,7 +686,7 @@ if ($do_run[12]) { print "Restarting killed lock_server on port $p[0]\n"; $pid[0] = spawn_ls($p[1], $p[0]); - sleep 3; + usleep 300000; # the last view should include all nodes my @lastv = ($p[0],$p[1],$p[2]); @@ -705,7 +706,7 @@ if ($do_run[12]) { } cleanup(); - sleep 2; + usleep 200000; } if ($do_run[13]) { @@ -717,12 +718,12 @@ if ($do_run[13]) { print "Start lock_tester $p[0]\n"; $t = spawn("./lock_tester", $p[0]); - sleep 1; + usleep 100000; print "Kill slave (PID: $pid[2]) on port $p[2] at breakpoint 1\n"; spawn("./rsm_tester", $p[2]+1, "breakpoint", 1); - sleep 1; + usleep 100000; # it should go through 4 views my @v4 = ($p[0], $p[1]); @@ -731,7 +732,7 @@ if ($do_run[13]) { print "Restarting killed lock_server on port $p[2]\n"; $pid[2] = spawn_ls($p[0], $p[2]); - sleep 3; + usleep 300000; # the last view should include all nodes my @lastv = ($p[0],$p[1],$p[2]); @@ -751,7 +752,7 @@ if ($do_run[13]) { } cleanup(); - sleep 2; + usleep 200000; } if ($do_run[14]) { @@ -763,7 +764,7 @@ if ($do_run[14]) { print "Start lock_tester $p[0]\n"; $t = spawn("./lock_tester", $p[0]); - sleep 1; + usleep 100000; print "Kill slave (PID: $pid[4]) on port $p[4] at breakpoint 1\n"; spawn("./rsm_tester", $p[4]+1, "breakpoint", 1); @@ -773,7 +774,7 @@ if ($do_run[14]) { spawn("./rsm_tester", $p[3]+1, "breakpoint", 2); - sleep 1; + usleep 100000; # two view changes: @@ -798,7 +799,7 @@ if ($do_run[14]) { } cleanup(); - sleep 2; + usleep 200000; } if ($do_run[15]) { @@ -810,7 +811,7 @@ if ($do_run[15]) { print "Start lock_tester $p[0]\n"; $t = spawn("./lock_tester", $p[0]); - sleep 1; + usleep 100000; print "Kill slave (PID: $pid[4]) on port $p[4] at breakpoint 1\n"; spawn("./rsm_tester", $p[4]+1, "breakpoint", 1); @@ -819,7 +820,7 @@ if ($do_run[15]) { print "Kill primary (PID: $pid[0]) on port $p[0] at breakpoint 2\n"; spawn("./rsm_tester", $p[0]+1, "breakpoint", 2); - sleep 1; + usleep 100000; # two view changes: @@ -844,7 +845,7 @@ if ($do_run[15]) { } cleanup(); - sleep 2; + usleep 200000; } if ($do_run[16]) { @@ -856,13 +857,13 @@ if ($do_run[16]) { print "Start lock_tester $p[0]\n"; $t = spawn("./lock_tester", $p[0]); - sleep 1; + usleep 100000; print "Partition primary (PID: $pid[0]) on port $p[0] at breakpoint\n"; spawn("./rsm_tester", $p[0]+1, "partition", 0); - sleep 3; + usleep 300000; print "first view change wait\n"; my @lastv = ($p[1],$p[2]); @@ -870,12 +871,12 @@ if ($do_run[16]) { wait_for_view_change(paxos_log($port), $in_views{$port}+1, $port, 20); } - sleep 1; + usleep 100000; print "Heal partition primary (PID: $pid[0]) on port $p[0] at breakpoint\n"; spawn("./rsm_tester", $p[0]+1, "partition", 1); - sleep 1; + usleep 100000; # xxx it should test that this is the 5th view! print "second view change wait\n"; @@ -892,7 +893,7 @@ if ($do_run[16]) { } cleanup(); - sleep 2; + usleep 200000; } print "tests done OK\n";