Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 15 additions & 8 deletions internal/script/script.go
Original file line number Diff line number Diff line change
Expand Up @@ -259,6 +259,7 @@ declare -A pids=()
declare -A exitcodes=()
declare -A orig_names=()
current_seq_pid=""
current_seq_script=""

continue_on_script_error={{if .ContinueOnScriptError}}1{{else}}0{{end}}

Expand Down Expand Up @@ -290,6 +291,7 @@ start_concurrent_scripts() {
run_sequential_scripts() {
for s in "${sequential_scripts[@]}"; do
current_seq_pid=""
current_seq_script="$s"
setsid bash "$script_dir/${s}.sh" > "$script_dir/${s}.stdout" 2> "$script_dir/${s}.stderr" &
current_seq_pid=$!
pids[$s]=$current_seq_pid
Expand All @@ -304,6 +306,7 @@ run_sequential_scripts() {
fi
fi
current_seq_pid=""
current_seq_script=""
done
}

Expand All @@ -314,10 +317,17 @@ kill_script() {
if ! ps -p "$pid" > /dev/null 2>&1; then return 0; fi
# Send signal to the process group (negative PID)
kill -SIGINT -"$pid" 2>/dev/null || true
# Give it a moment to clean up
sleep 0.1
# Give it time to clean up so child script can finalize
# Wait up to 5 seconds in 0.5s intervals
local waited=0
while ps -p "$pid" > /dev/null 2>&1 && [ "$waited" -lt 10 ]; do
sleep 0.5
waited=$((waited + 1))
done
# Force kill the process group if still alive
kill -SIGKILL -"$pid" 2>/dev/null || true
if ps -p "$pid" > /dev/null 2>&1; then
kill -SIGKILL -"$pid" 2>/dev/null || true
fi
wait "$pid" 2>/dev/null || true
if [[ -z "${exitcodes[$s]:-}" ]]; then exitcodes[$s]=130; fi
}
Expand Down Expand Up @@ -351,11 +361,8 @@ handle_sigint() {
kill_script "$s"
done
# kill current sequential script if running
if [[ -n "$current_seq_pid" ]]; then
kill -SIGINT -"$current_seq_pid" 2>/dev/null || true
sleep 0.1
kill -SIGKILL -"$current_seq_pid" 2>/dev/null || true
wait "$current_seq_pid" 2>/dev/null || true
if [[ -n "$current_seq_script" ]]; then
kill_script "$current_seq_script"
fi
print_summary
rm -f {{.ControllerPIDFile}}
Expand Down
22 changes: 15 additions & 7 deletions internal/script/scripts.go
Original file line number Diff line number Diff line change
Expand Up @@ -1462,11 +1462,11 @@ fi
stop_profiling() {
if [ -n "$perf_fp_pid" ]; then
kill -0 "$perf_fp_pid" 2>/dev/null && kill -INT "$perf_fp_pid"
wait "$perf_fp_pid"
wait "$perf_fp_pid" || true
fi
if [ -n "$perf_dwarf_pid" ]; then
kill -0 "$perf_dwarf_pid" 2>/dev/null && kill -INT "$perf_dwarf_pid"
wait "$perf_dwarf_pid"
wait "$perf_dwarf_pid" || true
fi
for pid in "${java_pids[@]}"; do
async-profiler/bin/asprof stop -o collapsed -f ap_folded_"$pid" "$pid"
Expand Down Expand Up @@ -1518,8 +1518,14 @@ print_results() {
done
}

_finalize=0 # flag to indicate if finalize has been called

# Function to finalize profiling and output
finalize() {
if [ $_finalize -eq 1 ]; then
return
fi
_finalize=1
stop_profiling
collapse_perf_data
print_results
Expand Down Expand Up @@ -1584,16 +1590,18 @@ for pid in "${java_pids[@]}"; do
done

# profiling has been started, set up trap to finalize on interrupt
trap finalize INT TERM
trap finalize INT TERM EXIT

# Wait for the specified duration (seconds), then wrap it up by calling finalize
# wait
if [ "$duration" -gt 0 ]; then
# wait for the specified duration (seconds), then wrap it up by calling finalize
sleep "$duration"
finalize
else
# wait indefinitely until child processes are killed or interrupted
wait
fi

# Wait indefinitely until interrupted, trap will call finalize
wait
finalize
`,
Superuser: true,
Sequential: true,
Expand Down
50 changes: 36 additions & 14 deletions internal/workflow/signals.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ package workflow

import (
"context"
"fmt"
"log/slog"
"os"
"os/exec"
Expand All @@ -31,7 +32,10 @@ func signalProcessOnTarget(t target.Target, pidStr string, sigStr string) error
} else {
cmd = exec.Command("kill", sigStr, pidStr)
}
_, _, _, err := t.RunCommandEx(cmd, 5, false, true) // #nosec G204
// waitTime must exceed the controller script's kill_script graceful shutdown period (5s)
// plus buffer for network latency when working with remote targets
waitTime := 15
_, _, _, err := t.RunCommandEx(cmd, waitTime, false, true) // #nosec G204
return err
}

Expand All @@ -55,9 +59,6 @@ func configureSignalHandler(myTargets []target.Target, statusFunc progress.Multi
go func() {
sig := <-sigChannel
slog.Debug("received signal", slog.String("signal", sig.String()))
// The controller.sh script is run in its own process group, so we need to send the signal
// directly to the PID of the controller. For every target, look for the primary_collection_script
// PID file and send SIGINT to it.
// The controller script is run in its own process group, so we need to send the signal
// directly to the PID of the controller. For every target, look for the controller
// PID file and send SIGINT to it.
Expand All @@ -69,6 +70,7 @@ func configureSignalHandler(myTargets []target.Target, statusFunc progress.Multi
stdout, _, exitcode, err := t.RunCommandEx(exec.Command("cat", pidFilePath), 5, false, true) // #nosec G204
if err != nil {
slog.Error("error retrieving target controller PID", slog.String("target", t.GetName()), slog.String("error", err.Error()))
continue
}
if exitcode == 0 {
pidStr := strings.TrimSpace(stdout)
Expand All @@ -86,18 +88,20 @@ func configureSignalHandler(myTargets []target.Target, statusFunc progress.Multi
ctx, cancel := context.WithTimeout(context.Background(), targetTimeout)
timedOut := false
pidFilePath := filepath.Join(t.GetTempDirectory(), script.ControllerPIDFileName)
// read the pid file
stdout, _, exitcode, err := t.RunCommandEx(exec.Command("cat", pidFilePath), 5, false, true) // #nosec G204
if err != nil || exitcode != 0 {
slog.Debug("target controller PID file no longer exists, assuming script has exited or is in the process of exiting", slog.String("target", t.GetName()))
cancel()
continue
}
pidStr := strings.TrimSpace(stdout)
for {
// read the pid file
stdout, _, exitcode, err := t.RunCommandEx(exec.Command("cat", pidFilePath), 5, false, true) // #nosec G204
if err != nil || exitcode != 0 {
// pid file doesn't exist
break
}
pidStr := strings.TrimSpace(stdout)
// determine if the process still exists
_, _, exitcode, err = t.RunCommandEx(exec.Command("ps", "-p", pidStr), 5, false, true) // #nosec G204
if err != nil || exitcode != 0 {
break // process no longer exists, script has exited
slog.Debug("target controller process no longer exists", slog.String("target", t.GetName()))
break
}
// check for timeout
select {
Expand All @@ -122,8 +126,26 @@ func configureSignalHandler(myTargets []target.Target, statusFunc progress.Multi
cancel()
}

// send SIGINT to perfspect's children
err := util.SignalChildren(syscall.SIGINT)
// Race condition between the controller script deleting its PID file and it truly exiting.
// Future work: reconsider decision to have the controller script delete its own PID file.
// When working with a remote target we want to give our local SSH command time to exit cleanly
// before we send SIGINT to it. If we interrupt the SSH command unnecessarily, the controller output
// will be lost.
time.Sleep(500 * time.Millisecond)

// send SIGINT to perfspect's remaining children, if any
myPid := os.Getpid()
children, err := util.GetChildren(myPid)
if err != nil {
slog.Error("error retrieving child processes", slog.String("error", err.Error()))
return
}
if len(children) == 0 {
slog.Debug("no child processes to signal")
return
}
slog.Debug("signaling child processes", slog.String("child PIDs", fmt.Sprintf("%v", children)))
err = util.SignalChildren(syscall.SIGINT)
if err != nil {
slog.Error("error sending signal to children", slog.String("error", err.Error()))
}
Expand Down