Project

General

Profile

patch_21682_20190204.txt

Marco Mambelli, 02/04/2019 07:43 PM

 
1
diff --git a/creation/web_base/condor_startup.sh b/creation/web_base/condor_startup.sh
2
index cbff7f0e..dfccd5e2 100644
3
--- a/creation/web_base/condor_startup.sh
4
+++ b/creation/web_base/condor_startup.sh
5
@@ -9,10 +9,31 @@
6
 # This script starts the condor daemons expects a config file as a parameter
7
 #
8
 
9
+function trap_with_arg {
10
+    func="$1" ; shift
11
+    for sig ; do
12
+        trap "$func $sig" "$sig"
13
+    done
14
+}
15
+
16
 #function to handle passing signals to the child processes
17
+# No need to re-raise SIGINT: the caller does an unconditional exit (see https://www.cons.org/cracauer/sigint.html)
18
+# "condor_master -k <file>" sends a SIGTERM to the PID stored in <file>. This results in a graceful shutdown,
19
+# where daemons get a chance to do orderly cleanup. To do a fast shutdown, you would send a SIGQUIT to the
20
+# condor_master process, something like this:
21
+#  /bin/kill -s SIGQUIT `cat condor_master2.pid`
22
+# In either case, when the master receives the signal, it will immediately write a message to the log, then signal
23
+# all of its children. When each child exits, the master will send a SIGKILL to any remaining descendants.
24
+# Once all of the children exit, the master then exits.
25
 function on_die {
26
-    echo "Condor startup received kill signal... shutting down condor processes"
27
-    $CONDOR_DIR/sbin/condor_master -k $PWD/condor_master2.pid
28
+    condor_signal=$1
29
+    # on_die can be triggered by SIGTERM, SIGINT or SIGQUIT; condor understands SIGTERM and SIGQUIT. Forward SIGQUIT as SIGQUIT, anything else as SIGTERM
30
+    [[ "$condor_signal" != SIGQUIT ]] && condor_signal=SIGTERM
31
+    condor_pid_tokill=$condor_pid
32
+    [[ -z "$condor_pid_tokill" ]] && condor_pid_tokill=`cat $PWD/condor_master2.pid 2> /dev/null`
33
+    echo "Condor startup received $1 signal ... shutting down condor processes (forwarding $condor_signal to $condor_pid_tokill)"
34
+    [[ -n "$condor_pid_tokill" ]] && kill -s $condor_signal $condor_pid_tokill
35
+    # $CONDOR_DIR/sbin/condor_master -k $PWD/condor_master2.pid
36
     ON_DIE=1
37
 }
38
 
39
@@ -1010,27 +1031,41 @@ fi
40
 start_time=`date +%s`
41
 echo "=== Condor starting `date` (`date +%s`) ==="
42
 ON_DIE=0
43
-trap 'ignore_signal' HUP
44
-trap 'on_die' TERM
45
-trap 'on_die' INT
46
-
47
+condor_pid=
48
+trap 'ignore_signal' SIGHUP
49
+trap_with_arg on_die SIGTERM SIGINT SIGQUIT
50
+#trap 'on_die' TERM
51
+#trap 'on_die' INT
52
 
53
 #### STARTS CONDOR ####
54
-if [ "$check_only" == "1" ]; then
55
+if [[ "$check_only" == "1" ]]; then
56
     echo "=== Condor started in test mode ==="
57
     $CONDOR_DIR/sbin/condor_master -pidfile $PWD/condor_master.pid
58
 else
59
     $CONDOR_DIR/sbin/condor_master -f -pidfile $PWD/condor_master2.pid &
60
+    condor_pid=$!
61
     # Wait for a few seconds to make sure the pid file is created,
62
+    sleep 5 & wait $!
63
+    # Wait more if the pid file was not created and the Glidein was not killed, see [#9639]
64
+    if [[ ! -e "$PWD/condor_master2.pid" ]] && [[ "$ON_DIE" -eq 0 ]]; then
65
+        echo "=== Condor started in background but the pid file is still missing, waiting 200 sec more ==="
66
+        sleep 200 & wait $!
67
+    fi
68
     # then wait on it for completion
69
-    sleep 5
70
-    if [ -e "$PWD/condor_master2.pid" ]; then
71
-        echo "=== Condor started in background, now waiting on process `cat $PWD/condor_master2.pid` ==="
72
-        wait `cat $PWD/condor_master2.pid`
73
+    if [[ -e "$PWD/condor_master2.pid" ]]; then
74
+        [[ "$condor_pid" -ne `cat "$PWD/condor_master2.pid"` ]] && echo "Background PID $condor_pid is different from PID file content `cat "$PWD/condor_master2.pid"`"
75
+        echo "=== Condor started in background, now waiting on process $condor_pid ==="
76
+        wait $condor_pid
77
+    else
78
+        # If ON_DIE == 1, on_die has already forwarded a shutdown signal to condor, so no further kill is needed here
79
+        if [[ "$ON_DIE" -eq 0 ]]; then
80
+            echo "=== Condor was started but the PID file is missing, killing process $condor_pid ==="
81
+            kill -s SIGQUIT $condor_pid
82
+        fi
83
     fi
84
 fi
85
-
86
 condor_ret=$?
87
+condor_pid=
88
 
89
 if [ ${condor_ret} -eq 99 ]; then
90
     echo "Normal DAEMON_SHUTDOWN encountered" 1>&2
91
diff --git a/creation/web_base/glidein_startup.sh b/creation/web_base/glidein_startup.sh
92
index 9c26a0e3..abac0b74 100644
93
--- a/creation/web_base/glidein_startup.sh
94
+++ b/creation/web_base/glidein_startup.sh
95
@@ -13,18 +13,27 @@ global_args="$@"
96
 
97
 export LANG=C
98
 
99
+function trap_with_arg {
100
+    func="$1" ; shift
101
+    for sig ; do
102
+        trap "$func $sig" "$sig"
103
+    done
104
+}
105
+
106
+#function to handle passing signals to the child processes
107
+# No need to re-raise SIGINT: the caller does an unconditional exit (see https://www.cons.org/cracauer/sigint.html)
108
 function on_die {
109
-        echo "Received kill signal... shutting down child processes" 1>&2
110
-        ON_DIE=1
111
-        kill %1
112
+    echo "Received kill signal... shutting down child processes (forwarding $1 signal)" 1>&2
113
+    ON_DIE=1
114
+    kill -s $1 %1
115
 }
116
 
117
 function ignore_signal {
118
-        echo "Ignoring SIGHUP signal... Use SIGTERM or SIGINT to kill processes" 1>&2
119
+    echo "Ignoring SIGHUP signal... Use SIGTERM or SIGQUIT to kill processes" 1>&2
120
 }
121
 
122
 function warn {
123
- echo `date` "$@" 1>&2
124
+    echo `date` "$@" 1>&2
125
 }
126
 
127
 function usage {
128
@@ -1860,9 +1869,10 @@ let validation_time=$last_startup_time-$startup_time
129
 echo "=== Last script starting `date` ($last_startup_time) after validating for $validation_time ==="
130
 echo
131
 ON_DIE=0
132
-trap 'ignore_signal' HUP
133
-trap 'on_die' TERM
134
-trap 'on_die' INT
135
+trap 'ignore_signal' SIGHUP
136
+trap_with_arg 'on_die' SIGTERM SIGINT SIGQUIT
137
+#trap 'on_die' TERM
138
+#trap 'on_die' INT
139
 gs_id_work_dir=`get_work_dir main`
140
 $main_dir/error_augment.sh -init
141
 "${gs_id_work_dir}/$last_script" glidein_config &