3 import subprocess as sp
13 views = [] # expected views
14 in_views = {} # the number of views a node is expected to be present
20 def killprocess(num, frame):
21 print "killprocess: forcestop all spawned processes...%s" % (str(pid),)
25 os.kill(p, signal.SIGKILL)
27 for sig in ['HUP', 'INT', 'ABRT', 'QUIT', 'TERM']:
28 num = getattr(signal, 'SIG'+sig)
29 signal.signal(num, killprocess)
32 return "paxos-%d.log" % port
35 print >>sys.stderr, ''.join(s)
48 os.kill(p, signal.SIGKILL)
62 mydie("Cannot fork: %s" % (repr(e),))
65 logs.append("%s-%s.log" % (p, aa))
66 if 'lock_server' in p:
67 logs.append(paxos_log(a[1]))
72 sys.stdout = open("%s-%s.log" % (p, aa), 'w')
75 sys.stderr = sys.stdout
76 print "%s %s" % (p, ' '.join(sa))
80 mydie("Cannot start new %s %s %s", (p, repr(sa), repr(e)))
83 return sorted([random.randint(0, 54000/2)*2+10000 for i in xrange(num)])
85 def print_config(ports):
87 config = open("config", 'w')
89 mydie("Couldn't open config for writing")
91 print >>config, "%05d" % (p,)
94 def spawn_ls(master, port):
95 return spawn("./lock_server", master, port)
97 def check_views(l, vs, last_v=None):
103 mydie("Failed: couldn't read %s" % (l,))
107 if not line.startswith('done'):
109 words = line.split(' ')
111 view = map(int, words[2:])
114 # let there be extra views
117 if tuple(expected) != tuple(view):
118 mydie("Failed: In log %s at view %s is (%s), but expected %s (%s)" %
119 (l, str(num), repr(view), str(i), repr(expected)))
122 mydie("Failed: In log %s, not enough views seen!" % (l,))
123 if last_v is not None and tuple(last_v) != tuple(last_view):
124 mydie("Failed: In log %s last view didn't match, got view %s, but expected %s" %
125 (l, repr(last_view), repr(last_v)))
127 def get_num_views(log, including):
134 return len([x for x in log if 'done ' in x and str(including) in x])
136 def wait_for_view_change(log, num_views, including, timeout):
138 while get_num_views(log, including) < num_views and (start + timeout > time.time()) and not quit:
141 loglines = f.readlines()
143 lastv = [x for x in loglines if 'done' in x][-1].strip()
144 print " Waiting for %s to be present in >=%s views in %s (Last view: %s)" % \
145 (including, str(num_views), log, lastv)
150 if get_num_views(log, including) < num_views:
151 mydie("Failed: Timed out waiting for %s to be in >=%s in log %s" %
152 (including, str(num_views), log))
154 print " Done: %s is in >=%s views in %s" % (including, str(num_views), log)
156 def waitpid_to(pid, to):
159 while done_pid <= 0 and (time.time() - start) < to:
161 done_pid = os.waitpid(pid, os.WNOHANG)
164 os.kill(pid, signal.SIGKILL)
165 mydie("Failed: Timed out waiting for process %s" % (str(pid),))
169 def wait_and_check_expected_view(v):
174 wait_for_view_change(paxos_log(port), in_views[port], port, 20)
176 log = paxos_log(port)
177 check_views(log, views)
179 def start_nodes(n, command):
180 global pid, logs, views
189 pid.append(spawn_ls(p[0],p[i]))
190 print "Start lock_server on %s" % (str(p[i]),)
193 wait_and_check_expected_view(p[:i+1])
195 options, arguments = getopt.getopt(sys.argv[1:], "s:k")
196 options = dict(options)
199 random.seed(options[s])
204 # get a sorted list of random ports
209 do_run = [0] * NUM_TESTS
211 # see which tests are set
215 if t < NUM_TESTS and t >= 0:
219 for i in xrange(NUM_TESTS):
223 print "test0: start 3-process lock server"
229 print "test1: start 3-process lock server, kill third server"
231 print "Kill third server (PID: %s) on port %s" % (str(pid[2]), str(p[2]))
232 os.kill(pid[2], signal.SIGTERM)
234 # it should go through 4 views
236 wait_and_check_expected_view(v4)
241 print "test2: start 3-process lock server, kill first server"
243 print "Kill first (PID: $pid[0]) on port $p[0]"
244 os.kill(pid[0], signal.SIGTERM)
246 # it should go through 4 views
248 wait_and_check_expected_view(v4)
253 print "test3: start 3-process lock_server, kill a server, restart a server"
255 print "Kill server (PID: $pid[2]) on port $p[2]"
256 os.kill(pid[2], signal.SIGTERM)
259 wait_and_check_expected_view(v4)
260 print "Restart killed server on port $p[2]"
261 pid[2] = spawn_ls (p[0], p[2])
263 v5 = (p[0], p[1], p[2])
264 wait_and_check_expected_view(v5)
269 print "test4: 3-process lock_server, kill third server, kill second server, restart third server, kill third server again, restart second server, re-restart third server, check logs"
271 print "Kill server (PID: $pid[2]) on port $p[2]"
272 os.kill(pid[2], signal.SIGTERM)
275 wait_and_check_expected_view(v4)
276 print "Kill server (PID: $pid[1]) on port $p[1]"
277 os.kill(pid[1], signal.SIGTERM)
279 #no view change can happen because of a lack of majority
280 print "Restarting server on port $p[2]"
281 pid[2] = spawn_ls(p[0], p[2])
283 #no view change can happen because of a lack of majority
284 for port in p[0:1+2]:
285 num_v = get_num_views(paxos_log(port), port)
286 if num_v != in_views[port]:
287 die("$num_v views in ", paxos_log(port), " : no new views should be formed due to the lack of majority")
289 print "Kill server (PID: $pid[2]) on port $p[2]"
290 os.kill(pid[2], signal.SIGTERM)
292 print "Restarting server on port $p[1]"
293 pid[1] = spawn_ls(p[0], p[1])
295 for port in p[0:1+1]:
296 in_views[port] = get_num_views(paxos_log(port), port)
297 print " Node $port is present in ", in_views[port], " views in ", paxos_log(port), ""
298 print "Restarting server on port $p[2]"
299 pid[2] = spawn_ls(p[0], p[2])
300 lastv = (p[0],p[1],p[2])
302 wait_for_view_change(paxos_log(port), in_views[port]+1, port, 20)
303 # now check the paxos logs and make sure the logs go through the right
306 check_views(paxos_log(port), views, lastv)
310 print "test5: 3-process lock_server, send signal 1 to first server, kill third server, restart third server, check logs"
312 print "Sending paxos breakpoint 1 to first server on port $p[0]"
313 spawn("./rsm_tester", p[0]+1, "breakpoint", 3)
315 print "Kill third server (PID: $pid[2]) on port $p[2]"
316 os.kill(pid[2], signal.SIGTERM)
318 for port in p[0:1+2]:
319 num_v = get_num_views(paxos_log(port), port)
320 if num_v != in_views[port]:
321 die("$num_v views in ", paxos_log(port), " : no new views should be formed due to the lack of majority")
322 print "Restarting third server on port $p[2]"
323 pid[2]= spawn_ls(p[0], p[2])
326 wait_for_view_change(paxos_log(port), in_views[port]+1, port, 20)
328 # now check the paxos logs and make sure the logs go through the right
331 check_views(paxos_log(port), views, lastv)
335 print "test6: 4-process lock_server, send signal 2 to first server, kill fourth server, restart fourth server, check logs"
337 print "Sending paxos breakpoint 2 to first server on port $p[0]"
338 spawn("./rsm_tester", p[0]+1, "breakpoint", 4)
340 print "Kill fourth server (PID: $pid[3]) on port $p[3]"
341 os.kill(pid[3], signal.SIGTERM)
343 for port in (p[1],p[2]):
344 num_v = get_num_views(paxos_log(port), port)
345 if num_v != in_views[port]:
346 die("$num_v views in ", paxos_log(port), " : no new views should be formed due to the lack of majority")
348 print "Restarting fourth server on port $p[3]"
349 pid[3] = spawn_ls(p[1], p[3])
351 v5 = (p[0],p[1],p[2])
356 # the 6th view will be (2,3) or (1,2,3,4)
361 wait_for_view_change(paxos_log(port), in_views[port]+1, port, 30)
362 # final will be (2,3,4)
363 lastv = (p[1],p[2],p[3])
365 wait_for_view_change(paxos_log(port), in_views[port]+1, port, 20)
367 check_views(paxos_log(port), views, lastv)
371 print "test7: 4-process lock_server, send signal 2 to first server, kill fourth server, kill other servers, restart other servers, restart fourth server, check logs"
373 print "Sending paxos breakpoint 2 to first server on port $p[0]"
374 spawn("./rsm_tester", p[0]+1, "breakpoint", 4)
376 print "Kill fourth server (PID: $pid[3]) on port $p[3]"
377 os.kill(pid[3], signal.SIGTERM)
379 print "Kill third server (PID: $pid[2]) on port $p[2]"
380 os.kill(pid[2], signal.SIGTERM)
381 print "Kill second server (PID: $pid[1]) on port $p[1]"
382 os.kill(pid[1], signal.SIGTERM)
384 print "Restarting second server on port $p[1]"
385 pid[1] = spawn_ls(p[0], p[1])
387 print "Restarting third server on port $p[2]"
388 pid[2] = spawn_ls(p[0], p[2])
390 #no view change is possible by now because there is no majority
391 for port in (p[1],p[2]):
392 num_v = get_num_views(paxos_log(port), port)
393 if num_v != in_views[port]:
394 die("$num_v views in ", paxos_log(port), " : no new views should be formed due to the lack of majority")
395 print "Restarting fourth server on port $p[3]"
396 pid[3] = spawn_ls(p[1], p[3])
398 v5 = (p[0], p[1], p[2])
403 lastv = (p[1],p[2],p[3])
405 wait_for_view_change(paxos_log(port), in_views[port]+1, port, 20)
407 check_views(paxos_log(port), views, lastv)
411 print "test8: start 3-process lock service"
413 print "Start lock_tester $p[0]"
414 t = spawn("./lock_tester", p[0])
415 print " Wait for lock_tester to finish (waitpid $t)"
417 if os.system("grep \"passed all tests successfully\" lock_tester-$p[0].log"):
418 mydie("Failed lock tester for test 8")
423 print "test9: start 3-process rsm, kill second slave while lock_tester is running"
425 print "Start lock_tester $p[0]"
426 t = spawn("./lock_tester", p[0])
427 usleep(random.randint(1,1000000))
428 print "Kill slave (PID: $pid[2]) on port $p[2]"
429 os.kill(pid[2], signal.SIGTERM)
431 # it should go through 4 views
433 wait_and_check_expected_view(v4)
434 print " Wait for lock_tester to finish (waitpid $t)"
436 if os.system("grep \"passed all tests successfully\" lock_tester-$p[0].log"):
437 mydie("Failed lock tester for test 9")
442 print "test10: start 3-process rsm, kill second slave and restarts it later while lock_tester is running"
444 print "Start lock_tester $p[0]"
445 t = spawn("./lock_tester", p[0])
446 usleep(random.randint(1,1000000))
447 print "Kill slave (PID: $pid[2]) on port $p[2]"
448 os.kill(pid[2], signal.SIGTERM)
450 # it should go through 4 views
452 wait_and_check_expected_view(v4)
454 print "Restarting killed lock_server on port $p[2]"
455 pid[2] = spawn_ls(p[0], p[2])
456 v5 = (p[0],p[1],p[2])
457 wait_and_check_expected_view(v5)
458 print " Wait for lock_tester to finish (waitpid $t)"
460 if os.system("grep \"passed all tests successfully\" lock_tester-$p[0].log"):
461 mydie("Failed lock tester for test 10")
466 print "test11: start 3-process rsm, kill primary while lock_tester is running"
468 print "Start lock_tester $p[0]"
469 t = spawn("./lock_tester", p[0])
470 usleep(random.randint(1,1000000))
471 print "Kill primary (PID: $pid[0]) on port $p[0]"
472 os.kill(pid[0], signal.SIGTERM)
474 # it should go through 4 views
476 wait_and_check_expected_view(v4)
477 print " Wait for lock_tester to finish (waitpid $t)"
479 if os.system("grep \"passed all tests successfully\" lock_tester-$p[0].log"):
480 mydie("Failed lock tester for test 11")
485 print "test12: start 3-process rsm, kill master at break1 and restart it while lock_tester is running"
487 print "Start lock_tester $p[0]"
488 t = spawn("./lock_tester", p[0])
490 print "Kill master (PID: $pid[0]) on port $p[0] at breakpoint 1"
491 spawn("./rsm_tester", p[0]+1, "breakpoint", 1)
493 # it should go through 5 views
495 wait_and_check_expected_view(v4)
496 print "Restarting killed lock_server on port $p[0]"
497 pid[0] = spawn_ls(p[1], p[0])
499 # the last view should include all nodes
500 lastv = (p[0],p[1],p[2])
502 wait_for_view_change(paxos_log(port), in_views[port]+1, port, 20)
504 check_views(paxos_log(port), views, lastv)
505 print " Wait for lock_tester to finish (waitpid $t)"
507 if os.system("grep \"passed all tests successfully\" lock_tester-$p[0].log"):
508 mydie("Failed lock tester for test 12")
513 print "test13: start 3-process rsm, kill slave at break1 and restart it while lock_tester is running"
515 print "Start lock_tester $p[0]"
516 t = spawn("./lock_tester", p[0])
518 print "Kill slave (PID: $pid[2]) on port $p[2] at breakpoint 1"
519 spawn("./rsm_tester", p[2]+1, "breakpoint", 1)
521 # it should go through 4 views
523 wait_and_check_expected_view(v4)
524 print "Restarting killed lock_server on port $p[2]"
525 pid[2] = spawn_ls(p[0], p[2])
527 # the last view should include all nodes
528 lastv = (p[0],p[1],p[2])
530 wait_for_view_change(paxos_log(port), in_views[port]+1, port, 20)
532 check_views(paxos_log(port), views, lastv)
533 print " Wait for lock_tester to finish (waitpid $t)"
535 if os.system("grep \"passed all tests successfully\" lock_tester-$p[0].log"):
536 mydie("Failed lock tester for test 13")
541 print "test14: start 5-process rsm, kill slave break1, kill slave break2"
543 print "Start lock_tester $p[0]"
544 t = spawn("./lock_tester", p[0])
546 print "Kill slave (PID: $pid[4]) on port $p[4] at breakpoint 1"
547 spawn("./rsm_tester", p[4]+1, "breakpoint", 1)
548 print "Kill slave (PID: $pid[3]) on port $p[3] at breakpoint 2"
549 spawn("./rsm_tester", p[3]+1, "breakpoint", 2)
552 print "first view change wait"
553 lastv = (p[0],p[1],p[2],p[3])
555 wait_for_view_change(paxos_log(port), in_views[port]+1, port, 20)
556 print "second view change wait"
557 lastv = (p[0],p[1],p[2])
559 wait_for_view_change(paxos_log(port), in_views[port]+1, port, 20)
560 print " Wait for lock_tester to finish (waitpid $t)"
562 if os.system("grep \"passed all tests successfully\" lock_tester-$p[0].log"):
563 mydie("Failed lock tester for test 14")
568 print "test15: start 5-process rsm, kill slave break1, kill primary break2"
570 print "Start lock_tester $p[0]"
571 t = spawn("./lock_tester", p[0])
573 print "Kill slave (PID: $pid[4]) on port $p[4] at breakpoint 1"
574 spawn("./rsm_tester", p[4]+1, "breakpoint", 1)
575 print "Kill primary (PID: $pid[0]) on port $p[0] at breakpoint 2"
576 spawn("./rsm_tester", p[0]+1, "breakpoint", 2)
579 print "first view change wait"
580 lastv = (p[0],p[1],p[2],p[3])
582 wait_for_view_change(paxos_log(port), in_views[port]+1, port, 20)
583 print "second view change wait"
584 lastv = (p[1],p[2],p[3])
586 wait_for_view_change(paxos_log(port), in_views[port]+1, port, 20)
587 print " Wait for lock_tester to finish (waitpid $t)"
589 if os.system("grep \"passed all tests successfully\" lock_tester-$p[0].log"):
590 mydie("Failed lock tester for test 15")
595 print "test16: start 3-process rsm, partition primary, heal it"
597 print "Start lock_tester $p[0]"
598 t = spawn("./lock_tester", p[0])
600 print "Partition primary (PID: $pid[0]) on port $p[0] at breakpoint"
601 spawn("./rsm_tester", p[0]+1, "partition", 0)
603 print "first view change wait"
606 wait_for_view_change(paxos_log(port), in_views[port]+1, port, 20)
608 print "Heal partition primary (PID: $pid[0]) on port $p[0] at breakpoint"
609 spawn("./rsm_tester", p[0]+1, "partition", 1)
611 # xxx it should test that this is the 5th view!
612 print "second view change wait"
613 lastv = (p[0], p[1],p[2])
615 wait_for_view_change(paxos_log(port), in_views[port]+1, port, 20)
616 print " Wait for lock_tester to finish (waitpid $t)"
618 if os.system("grep \"passed all tests successfully\" lock_tester-$p[0].log"):
619 mydie("Failed lock tester for test 16")
623 print "tests done OK"