diff --git a/internal/script/script.go b/internal/script/script.go index 1fe759ad..171741fb 100644 --- a/internal/script/script.go +++ b/internal/script/script.go @@ -259,6 +259,7 @@ declare -A pids=() declare -A exitcodes=() declare -A orig_names=() current_seq_pid="" +current_seq_script="" continue_on_script_error={{if .ContinueOnScriptError}}1{{else}}0{{end}} @@ -290,6 +291,7 @@ start_concurrent_scripts() { run_sequential_scripts() { for s in "${sequential_scripts[@]}"; do current_seq_pid="" + current_seq_script="$s" setsid bash "$script_dir/${s}.sh" > "$script_dir/${s}.stdout" 2> "$script_dir/${s}.stderr" & current_seq_pid=$! pids[$s]=$current_seq_pid @@ -304,6 +306,7 @@ run_sequential_scripts() { fi fi current_seq_pid="" + current_seq_script="" done } @@ -314,10 +317,17 @@ kill_script() { if ! ps -p "$pid" > /dev/null 2>&1; then return 0; fi # Send signal to the process group (negative PID) kill -SIGINT -"$pid" 2>/dev/null || true - # Give it a moment to clean up - sleep 0.1 + # Give it time to clean up so child script can finalize + # Wait up to 5 seconds in 0.5s intervals + local waited=0 + while ps -p "$pid" > /dev/null 2>&1 && [ "$waited" -lt 10 ]; do + sleep 0.5 + waited=$((waited + 1)) + done # Force kill the process group if still alive - kill -SIGKILL -"$pid" 2>/dev/null || true + if ps -p "$pid" > /dev/null 2>&1; then + kill -SIGKILL -"$pid" 2>/dev/null || true + fi wait "$pid" 2>/dev/null || true if [[ -z "${exitcodes[$s]:-}" ]]; then exitcodes[$s]=130; fi } @@ -351,11 +361,8 @@ handle_sigint() { kill_script "$s" done # kill current sequential script if running - if [[ -n "$current_seq_pid" ]]; then - kill -SIGINT -"$current_seq_pid" 2>/dev/null || true - sleep 0.1 - kill -SIGKILL -"$current_seq_pid" 2>/dev/null || true - wait "$current_seq_pid" 2>/dev/null || true + if [[ -n "$current_seq_script" ]]; then + kill_script "$current_seq_script" fi print_summary rm -f {{.ControllerPIDFile}} diff --git a/internal/script/scripts.go b/internal/script/scripts.go index 2848c53d..ab42faa9 100644 --- a/internal/script/scripts.go +++ b/internal/script/scripts.go @@ -1462,11 +1462,11 @@ fi stop_profiling() { if [ -n "$perf_fp_pid" ]; then kill -0 "$perf_fp_pid" 2>/dev/null && kill -INT "$perf_fp_pid" - wait "$perf_fp_pid" + wait "$perf_fp_pid" || true fi if [ -n "$perf_dwarf_pid" ]; then kill -0 "$perf_dwarf_pid" 2>/dev/null && kill -INT "$perf_dwarf_pid" - wait "$perf_dwarf_pid" + wait "$perf_dwarf_pid" || true fi for pid in "${java_pids[@]}"; do async-profiler/bin/asprof stop -o collapsed -f ap_folded_"$pid" "$pid" @@ -1518,8 +1518,14 @@ print_results() { done } +_finalize=0 # flag to indicate if finalize has been called + # Function to finalize profiling and output finalize() { + if [ $_finalize -eq 1 ]; then + return + fi + _finalize=1 stop_profiling collapse_perf_data print_results @@ -1584,16 +1590,18 @@ for pid in "${java_pids[@]}"; do done # profiling has been started, set up trap to finalize on interrupt -trap finalize INT TERM +trap finalize INT TERM EXIT -# Wait for the specified duration (seconds), then wrap it up by calling finalize +# wait if [ "$duration" -gt 0 ]; then + # wait for the specified duration (seconds), then wrap it up by calling finalize sleep "$duration" - finalize +else + # wait indefinitely until child processes are killed or interrupted + wait fi -# Wait indefinitely until interrupted, trap will call finalize -wait +finalize `, Superuser: true, Sequential: true, diff --git a/internal/workflow/signals.go b/internal/workflow/signals.go index 58fb935e..4e34aff2 100644 --- a/internal/workflow/signals.go +++ b/internal/workflow/signals.go @@ -5,6 +5,7 @@ package workflow import ( "context" + "fmt" "log/slog" "os" "os/exec" @@ -31,7 +32,10 @@ func signalProcessOnTarget(t target.Target, pidStr string, sigStr string) error } else { cmd = exec.Command("kill", sigStr, pidStr) } - _, _, _, err := t.RunCommandEx(cmd, 5, false, true) // #nosec G204 + // waitTime must exceed the controller script's kill_script graceful shutdown period (5s) + // plus buffer for network latency when working with remote targets + waitTime := 15 + _, _, _, err := t.RunCommandEx(cmd, waitTime, false, true) // #nosec G204 return err } @@ -55,9 +59,6 @@ func configureSignalHandler(myTargets []target.Target, statusFunc progress.Multi go func() { sig := <-sigChannel slog.Debug("received signal", slog.String("signal", sig.String())) - // The controller.sh script is run in its own process group, so we need to send the signal - // directly to the PID of the controller. For every target, look for the primary_collection_script - // PID file and send SIGINT to it. // The controller script is run in its own process group, so we need to send the signal // directly to the PID of the controller. For every target, look for the controller // PID file and send SIGINT to it. @@ -69,6 +70,7 @@ func configureSignalHandler(myTargets []target.Target, statusFunc progress.Multi stdout, _, exitcode, err := t.RunCommandEx(exec.Command("cat", pidFilePath), 5, false, true) // #nosec G204 if err != nil { slog.Error("error retrieving target controller PID", slog.String("target", t.GetName()), slog.String("error", err.Error())) + continue } if exitcode == 0 { pidStr := strings.TrimSpace(stdout) @@ -86,18 +88,20 @@ func configureSignalHandler(myTargets []target.Target, statusFunc progress.Multi ctx, cancel := context.WithTimeout(context.Background(), targetTimeout) timedOut := false pidFilePath := filepath.Join(t.GetTempDirectory(), script.ControllerPIDFileName) + // read the pid file + stdout, _, exitcode, err := t.RunCommandEx(exec.Command("cat", pidFilePath), 5, false, true) // #nosec G204 + if err != nil || exitcode != 0 { + slog.Debug("target controller PID file no longer exists, assuming script has exited or is in the process of exiting", slog.String("target", t.GetName())) + cancel() + continue + } + pidStr := strings.TrimSpace(stdout) for { - // read the pid file - stdout, _, exitcode, err := t.RunCommandEx(exec.Command("cat", pidFilePath), 5, false, true) // #nosec G204 - if err != nil || exitcode != 0 { - // pid file doesn't exist - break - } - pidStr := strings.TrimSpace(stdout) // determine if the process still exists _, _, exitcode, err = t.RunCommandEx(exec.Command("ps", "-p", pidStr), 5, false, true) // #nosec G204 if err != nil || exitcode != 0 { - break // process no longer exists, script has exited + slog.Debug("target controller process no longer exists", slog.String("target", t.GetName())) + break } // check for timeout select { @@ -122,8 +126,26 @@ func configureSignalHandler(myTargets []target.Target, statusFunc progress.Multi cancel() } - // send SIGINT to perfspect's children - err := util.SignalChildren(syscall.SIGINT) + // Race condition between the controller script deleting its PID file and it truly exiting. + // Future work: reconsider decision to have the controller script delete its own PID file. + // When working with a remote target we want to give our local SSH command time to exit cleanly + // before we send SIGINT to it. If we interrupt the SSH command unnecessarily, the controller output + // will be lost. + time.Sleep(500 * time.Millisecond) + + // send SIGINT to perfspect's remaining children, if any + myPid := os.Getpid() + children, err := util.GetChildren(myPid) + if err != nil { + slog.Error("error retrieving child processes", slog.String("error", err.Error())) + return + } + if len(children) == 0 { + slog.Debug("no child processes to signal") + return + } + slog.Debug("signaling child processes", slog.String("child PIDs", fmt.Sprintf("%v", children))) + err = util.SignalChildren(syscall.SIGINT) if err != nil { slog.Error("error sending signal to children", slog.String("error", err.Error())) }