Project

General

Profile

condor_startup.sh

Marco Mambelli, 02/04/2019 07:43 PM

 
1
#!/bin/bash
2
#
3
# Project:
4
#   glideinWMS
5
#
6
# File Version:
7
#
8
# Description:
9
# This script starts the condor daemons; it expects a config file as a parameter
10
#
11

    
12
trap_with_arg() {
    # Install a trap for every listed signal; the handler receives the
    # signal name as its first argument.
    #   $1     handler command (e.g. a function name)
    #   $2...  signal names
    # func and sig are not declared local, so they stay set after the call.
    func="$1"
    shift
    while [ "$#" -gt 0 ]; do
        sig="$1"
        shift
        trap "$func $sig" "$sig"
    done
}
18

    
19
#function to handle passing signals to the child processes
20
# no need to re-raise sigint, caller does unconditional exit (https://www.cons.org/cracauer/sigint.html)
21
#  The condor_master -k <file> sends a SIGTERM to the pid named in the file. This results in a graceful shutdown,
22
# where daemons get a chance to do orderly cleanup. To do a fast shutdown, you would send a SIGQUIT to the
23
# condor_master process, something like this:
24
#  /bin/kill -s SIGQUIT `cat condor_master2.pid`
25
# In either case, when the master receives the signal, it will immediately write a message to the log, then signal
26
# all of its children. When each child exits, the master will send a SIGKILL to any remaining descendants.
27
# Once all of the children exit, the master then exits.
28
function on_die {
    # Signal handler: forward a shutdown signal to the condor_master.
    #   $1  name of the signal received (as passed by trap_with_arg)
    # Globals read:    condor_pid, PWD
    # Globals written: condor_signal, condor_pid_tokill, ON_DIE (set to 1)
    condor_signal=$1
    # Can receive SIGTERM SIGINT SIGQUIT, condor understands SIGTERM SIGQUIT.
    # Send SIGQUIT for SIGQUIT, SIGTERM otherwise
    [[ "$condor_signal" != SIGQUIT ]] && condor_signal=SIGTERM
    condor_pid_tokill=$condor_pid
    if [[ -z "$condor_pid_tokill" ]]; then
        # Fall back to the pid file; the path is quoted so that a working
        # directory containing whitespace still resolves.
        condor_pid_tokill=$(cat "$PWD/condor_master2.pid" 2> /dev/null)
    fi
    echo "Condor startup received $1 signal ... shutting down condor processes (forwarding $condor_signal to $condor_pid_tokill)"
    # The pid is left unquoted on purpose: stray whitespace read from the
    # pid file is dropped by word splitting.
    [[ -n "$condor_pid_tokill" ]] && kill -s "$condor_signal" $condor_pid_tokill
    # $CONDOR_DIR/sbin/condor_master -k $PWD/condor_master2.pid
    ON_DIE=1
}
39

    
40
ignore_signal() {
    # Handler that only reports the signal; no further action is taken.
    printf '%s\n' "Condor startup received SIGHUP signal, ignoring..."
}
43

    
44
metrics=""

# Reasonable default for the number of CPUs
GLIDEIN_CPUS=1

# Start from a clean slate: drop every _CONDOR_* variable (any letter case)
# found in the environment. The name is the text before the first '=',
# space or tab of each matching line printed by env.
condor_vars=$(env | awk -F'[= \t]' '/^_[Cc][Oo][Nn][Dd][Oo][Rr]_/ {print $1}')
for v in $condor_vars; do
    unset "$v"
done
echo "Removed condor variables $condor_vars" 1>&2
55

    
56
# Condor 7.5.6 and above will use the system's gsi-authz.conf.  We don't want that.
export GSI_AUTHZ_CONF=/dev/null
# Specifically for the cloud:  If we want Condor to run as a specific user on the VM,
# set GLIDEIN_Condor_IDS in the environment.
if [ -n "$GLIDEIN_Condor_IDS" ]; then
    export _CONDOR_CONDOR_IDS="$GLIDEIN_Condor_IDS"
    # The message names the variable that is actually read above.
    echo "Created _CONDOR_CONDOR_IDS variable based on GLIDEIN_Condor_IDS" 1>&2
fi
64

    
65
# pstr = the double-quote character; set_var wraps string ("S") values in it
pstr='"'

# The glidein config file is the first script argument; the settings below
# are "NAME value" lines looked up in it.
config_file="$1"

error_gen=$(grep '^ERROR_GEN_PATH ' "$config_file" | cut -d ' ' -f 2-)

glidein_startup_pid=$(grep -i "^GLIDEIN_STARTUP_PID " "$config_file" | cut -d ' ' -f 2-)
# DO NOT USE THE PID ALONE FOR DAEMON NAMES
# If the site's batch system is HTCondor and USE_PID_NAMESPACES is set, PIDs
# do not play well with HTCondor daemon name creation.
# $RANDOM is in range(0, 32K). Add extra safeguards
random_name_str=$(( (RANDOM + 1000) * (RANDOM + 2000) ))

# find out whether user wants to run job or run test
debug_mode=$(grep -i "^DEBUG_MODE " "$config_file" | cut -d ' ' -f 2-)

# debug_mode: non-zero enables debug printing, 2 additionally selects
# check-only mode. A missing DEBUG_MODE is treated as 0 so that the numeric
# test does not fail with "integer expression expected".
print_debug=0
check_only=0
if [ "${debug_mode:-0}" -ne 0 ]; then
    print_debug=1
    if [ "$debug_mode" -eq 2 ]; then
        check_only=1
    fi
fi
90

    
91
adv_only=$(grep -i "^GLIDEIN_ADVERTISE_ONLY " "$config_file" | cut -d ' ' -f 2-)
# A missing GLIDEIN_ADVERTISE_ONLY means "not advertise-only". Normalizing it
# to 0 keeps every numeric test on adv_only well-formed.
adv_only=${adv_only:-0}

if [ "$adv_only" -eq 1 ]; then
    adv_destination=$(grep -i "^GLIDEIN_ADVERTISE_DESTINATION " "$config_file" | cut -d ' ' -f 2-)
    if [ -z "${adv_destination}" ]; then
        adv_destination=VO
    fi

    # no point in printing out debug info about config
    print_debug=0
    if [ "$adv_destination" = "VO" ]; then
        echo "Advertising failure to the VO collector"  1>&2
    else
        echo "Advertising failure to the Factory collector"  1>&2
    fi
fi

# In debug mode dump the whole config file to stderr
if [ "${print_debug:-0}" -ne 0 ]; then
    echo "-------- $config_file in condor_startup.sh ----------" 1>&2
    cat "$config_file" 1>&2
    echo "-----------------------------------------------------" 1>&2
fi
113

    
114
main_stage_dir=$(grep -i "^GLIDEIN_WORK_DIR " "$config_file" | cut -d ' ' -f 2-)

description_file=$(grep -i "^DESCRIPTION_FILE " "$config_file" | cut -d ' ' -f 2-)

# The file name is taken with a tab-delimited cut, so the key is matched when
# followed by any whitespace (space or tab).
in_condor_config="${main_stage_dir}/$(grep -i '^condor_config[[:space:]]' "${main_stage_dir}/${description_file}" | cut -s -f 2-)"

export CONDOR_CONFIG="${PWD}/condor_config"

# Start the local condor_config from the staged template; everything this
# script generates is appended after the marker line.
cp "$in_condor_config" "$CONDOR_CONFIG"

echo "# ---- start of condor_startup generated part ----" >> "$CONDOR_CONFIG"

wrapper_list=$(grep -i "^WRAPPER_LIST " "$config_file" | cut -d ' ' -f 2-)

#
# Create the job wrapper
#
# TODO: should it skip the wrapper if WRAPPER_LIST is empty?
condor_job_wrapper="condor_job_wrapper.sh"
cat > "$condor_job_wrapper" <<EOF
#!/bin/bash

# This script is started just before the user job
# It is referenced by the USER_JOB_WRAPPER

EOF

# Append every file named in the wrapper list (whitespace-separated names).
# Without a WRAPPER_LIST there is nothing to read.
if [ -n "$wrapper_list" ]; then
    for fname in $(cat "$wrapper_list"); do
        cat "$fname" >> "$condor_job_wrapper"
    done
fi


echo "USER_JOB_WRAPPER = \$(LOCAL_DIR)/$condor_job_wrapper" >> "$CONDOR_CONFIG"
148

    
149

    
150
# glidein_variables = comma separated list of additional variables the startd
# is to publish (filled in by set_var)
glidein_variables=

# job_env = environment to pass to the job
# Make sure we do not leak LD_LIBRARY_PATH to the job incorrectly
job_env="LD_LIBRARY_PATH=${LD_LIBRARY_PATH}"
156

    
157

    
158
#
# Set a variable read from the config file
#
# Arguments (the fields of one condor_vars line):
#   $1 var_name          name looked up in $config_file ("NAME value" lines)
#   $2 var_type          "S": the value is wrapped in $pstr in condor_config
#   $3 var_def           default used when no value is found; "-" = no default
#   $4 var_condor        name written to condor_config; "+" = same as var_name
#   $5 var_req           "Y": a missing value is a fatal error
#   $6 var_exportcondor  "Y": add the condor name to $glidein_variables
#   $7 var_user          name in the job environment: "-" = do not export,
#                        "+" = var_name, "@" = var_condor
# Globals read:    config_file, CONDOR_CONFIG, pstr, error_gen
# Globals written: glidein_variables, job_env, the shell variable named by $1,
#                  plus the working variables var_*, condor_env_entry and STR
#                  (kept global: code outside this function may read them)
# Returns 0; exits the script with 1 when a required value is missing.
#
function set_var {
    var_name=$1
    var_type=$2
    var_def=$3
    var_condor=$4
    var_req=$5
    var_exportcondor=$6
    var_user=$7

    # helper characters for the environment escaping below
    local _sv_dq='"'
    local _sv_sq="'"

    if [ -z "$var_name" ]; then
        # empty line
        return 0
    fi

    # Value = everything from the second field on, inner spacing preserved.
    # Lines with a name but no value produce nothing, so they are handled as
    # "missing" instead of yielding the line itself.
    var_val=$(grep "^$var_name " "$config_file" | awk 'NF>1 {v=substr($0, length($1)+1); print substr(v, index(v, $2))}')
    if [ -z "$var_val" ]; then
        if [ "$var_req" == "Y" ]; then
            # needed var, exit with error
            STR="Cannot extract $var_name from '$config_file'"
            "$error_gen" -error "condor_startup.sh" "Config" "$STR" "MissingAttribute" "$var_name"
            exit 1
        elif [ "$var_def" == "-" ]; then
            # no default, do not set
            return 0
        else
            # NOTE(review): the default is evaluated by the shell, so it may
            # contain expansions; this is only safe while the condor_vars
            # files are trusted input.
            eval var_val=$var_def
        fi
    fi

    if [ "$var_condor" == "+" ]; then
        var_condor=$var_name
    fi
    if [ "$var_type" == "S" ]; then
        var_val_str="${pstr}${var_val}${pstr}"
    else
        var_val_str="$var_val"
    fi

    # insert into condor_config
    echo "$var_condor=$var_val_str" >> "$CONDOR_CONFIG"

    if [ "$var_exportcondor" == "Y" ]; then
        # register var_condor for export
        if [ -z "$glidein_variables" ]; then
           glidein_variables="$var_condor"
        else
           glidein_variables="$glidein_variables,$var_condor"
        fi
    fi

    if [ "$var_user" != "-" ]; then
        # - means do not export
        if [ "$var_user" == "+" ]; then
            var_user=$var_name
        elif [ "$var_user" == "@" ]; then
            var_user=$var_condor
        fi

        # Double every double quote and every single quote, then add the
        # entry to job_env wrapped in single quotes.
        condor_env_entry="$var_user=$var_val"
        condor_env_entry=${condor_env_entry//"$_sv_dq"/$_sv_dq$_sv_dq}
        condor_env_entry=${condor_env_entry//"$_sv_sq"/$_sv_sq$_sv_sq}
        if [ -z "$job_env" ]; then
           job_env="'$condor_env_entry'"
        else
           job_env="$job_env '$condor_env_entry'"
        fi
    fi

    # define it for future use; printf -v stores the value verbatim, so
    # quotes inside the value cannot break (or inject into) the assignment
    printf -v "$var_name" '%s' "$var_val"
    return 0
}
234

    
235
function python_b64uuencode {
    # Encode stdin in "uuencode -m" (base64) layout using Python: a
    # "begin-base64 644 -" header, base64 lines of 57 input bytes each, and a
    # "====" trailer. Empty input still produces one (empty) data line.
    # Works with Python 2 and Python 3; returns 1 if no interpreter is found.
    local py
    if ! py=$(command -v python || command -v python3); then
        echo "python_b64uuencode: no python interpreter found" 1>&2
        return 1
    fi
    echo "begin-base64 644 -"
    # Use the binary streams where they exist (Python 3); plain stdin/stdout
    # otherwise (Python 2).
    "$py" -c '
import binascii, sys
src = getattr(sys.stdin, "buffer", sys.stdin)
dst = getattr(sys.stdout, "buffer", sys.stdout)
data = src.read()
for idx in range(0, max(len(data), 1), 57):
    dst.write(binascii.b2a_base64(data[idx:idx + 57]))
'
    echo "===="
}
245

    
246
base64_b64uuencode() {
    # Wrap the output of the base64 tool (reading stdin) between the
    # "begin-base64" header and the "====" trailer.
    printf '%s\n' "begin-base64 644 -"
    base64 -
    printf '%s\n' "===="
}
251

    
252
# not all WNs have all the tools installed
# Encode stdin with the first available encoder: uuencode, then base64,
# then the Python fallback. Availability is probed with the builtin
# "command -v", so the check itself does not depend on an external tool.
function b64uuencode {
    if command -v uuencode >/dev/null 2>&1; then
        uuencode -m -
    elif command -v base64 >/dev/null 2>&1; then
        base64_b64uuencode
    else
        python_b64uuencode
    fi
}
266

    
267
function cond_print_log {
    # Print a compressed, base64-encoded copy of log files on stderr.
    #   $1     log name, printed as the title
    #   $2...  candidate file paths (typically the result of wildcards
    #          expanded by the caller); paths that are not existing files
    #          are skipped
    # Nothing is printed when none of the paths exists. The files are kept
    # in an array so that paths containing whitespace are passed intact.
    local logname=$1
    shift
    local -a files_to_zip=()
    local fpath
    for fpath in "$@"; do
        [[ -f "$fpath" ]] && files_to_zip+=("$fpath")
    done

    if [ "${#files_to_zip[@]}" -gt 0 ]; then
        echo "$logname" 1>&2
        echo "======== gzip | uuencode =============" 1>&2
        gzip --stdout -- "${files_to_zip[@]}" | b64uuencode 1>&2
        # NOTE(review): this blank line goes to stdout, unlike the rest
        echo
    fi
}
283

    
284

    
285
function fix_param () {
    # Fix a parameter list with positional and dictionary parameters
    # 1. parameters, comma separated, parameter or name=value, positional parameters must come before all dictionary ones
    # 2. parameter names (all), comma separated, in the correct order (no extra comma at beginning or end)
    # return on stdout the expanded list, comma separated
    # exit code: 0=ok 1=error (echo on stderr error conditions)
    # Unknown names are reported on stderr and ignored.
    # e.g. fix_param 11,q4=44,q3=33 q1,q2,q3,q4   ->   11,,33,44

    if [[ -z "$2" || ! "$1" == *=* ]]; then
        # no names given or no dictionary parameters: nothing to reorder
        echo "$1"
        return
    fi
    local varnames
    local varnames_len
    local param key prefix commas idx res
    local -a PARLIST
    local -a RESLIST
    local dict_start=
    local res_ctr=0
    IFS=',' read -ra PARLIST <<< "$1"
    # one comma per name: the string length is the number of names
    varnames_len="${2//[^,]/},"
    if [ ${#PARLIST[@]} -gt ${#varnames_len} ]; then
        echo "Parameter list ($1) longer than possible parameters ($2). Aborting." 1>&2
        return 1
    fi
    varnames=",$2,"
    for param in "${PARLIST[@]}"; do
        if [[ "$param" == *=* ]]; then
            dict_start=yes
            key=${param%%=*}
            if [[ "$varnames" == *",$key,"* ]]; then
                # position of the name = number of commas before it
                prefix=${varnames%,"$key",*}
                commas=${prefix//[^,]/}
                RESLIST[${#commas}]=${param#*=}
            else
                echo "Unknown parameter name ($key) in ($1), ignoring it." 1>&2
            fi
        else
            if [ -n "$dict_start" ]; then
                echo "Positional parameter after dictionary in ($1). Aborting." 1>&2
                return 1
            fi
            RESLIST[res_ctr]=$param
        fi
        (( res_ctr += 1 ))
    done
    # join one slot per known name, empty when not provided
    res="${RESLIST[0]}"
    for (( idx = 1; idx < ${#varnames_len}; idx++ )); do
        res="$res,${RESLIST[$idx]}"
    done
    printf '%s\n' "$res"
}
342

    
343

    
344
function unit_division {
    # Divide the number and preserve the unit (integer division)
    # 1 dividend (integer w/ unit), 2 divisor (integer)
    # Dividend can be a fraction w/o units: .N (N is divided by the divisor), N/M (M is multiplied by the divisor)
    # Numbers are always read as decimal (a leading 0 does not mean octal) and
    # the .N form keeps its number of digits (.10 / 2 -> .05).
    # If the divisor is not a positive integer, or the dividend has no leading
    # number, a message goes to stderr and the dividend is returned unchanged.
    local number_only
    local digits
    local res_num
    local res
    local divisor
    if [[ ! "$2" =~ ^[0-9]+$ ]] || (( 10#$2 == 0 )); then
        echo "Invalid divisor $2 for $1. Skipping division, returning $1." 1>&2
        printf '%s\n' "$1"
        return
    fi
    divisor=$(( 10#$2 ))
    if [[ "$1" =~ ^\.[0-9]+$ ]]; then
        digits=${1:1}
        res_num=$(( 10#$digits / divisor ))
        # zero-pad to the original width so the decimal position is kept
        printf -v res '.%0*d' "${#digits}" "$res_num"
    elif [[ "$1" =~ ^[0-9]+/[0-9]+$ ]]; then
        number_only=${1#*/}
        res_num=$(( 10#$number_only * divisor ))
        res="${1%/*}/$res_num"
    else
        number_only=${1%%[!0-9]*}
        if [ -n "$number_only" ]; then
            res_num=$(( 10#$number_only / divisor ))
        else
            echo "Invalid format for $1. Skipping division by $2, returning $1." 1>&2
            res_num=
        fi
        # append whatever followed the leading number (the unit)
        res="$res_num${1:${#number_only}}"
    fi
    printf '%s\n' "$res"
}
369

    
370

    
371
find_gpus_num() {
    # Count the GPUs reported by condor_gpu_discovery.
    # Prints the count on stdout, diagnostics on stderr.
    # Returns 1 if the tool is missing, the tool's exit code if it fails,
    # 0 otherwise.
    local discovery="$CONDOR_DIR/libexec/condor_gpu_discovery"
    if [ ! -f "$discovery" ]; then
        echo "WARNING: condor_gpu_discovery not found" 1>&2
        return 1
    fi
    local tmp1
    tmp1="$("$discovery")"
    local ec=$?
    if [ $ec -ne 0 ]; then
        echo "WARNING: condor_gpu_discovery failed (exit code: $ec)" 1>&2
        return $ec
    fi
    # keep only the DetectedGPUs line(s) of the tool output
    local tmp
    tmp="$(echo "$tmp1" | grep "^DetectedGPUs=")"
    # ${tmp:13} is the text after "DetectedGPUs="
    if [ "${tmp:13}" = 0 ]; then
        echo "No GPUs found with condor_gpu_discovery, setting them to 0" 1>&2
        echo 0
        return
    fi
    # one whitespace-separated word per detected GPU
    local -a gpu_words=($tmp)
    echo "condor_gpu_discovery found ${#gpu_words[@]} GPUs: $tmp" 1>&2
    echo ${#gpu_words[@]}
}
394

    
395

    
396
# interpret the variables
# Collect the non-comment lines of every condor_vars file named in the config
# file, then hand each line, split into words, to set_var.
rm -f condor_vars.lst.tmp
touch condor_vars.lst.tmp
for vid in GLIDECLIENT_GROUP_CONDOR_VARS_FILE GLIDECLIENT_CONDOR_VARS_FILE ENTRY_CONDOR_VARS_FILE CONDOR_VARS_FILE
do
    condor_vars=$(grep -i "^$vid " "$config_file" | cut -d ' ' -f 2-)
    if [ -n "$condor_vars" ]; then
        grep -v "^#" "$condor_vars" >> condor_vars.lst.tmp
    fi
done

while read line
do
    # Split the line into fields with pathname expansion switched off, so a
    # field such as * or ? is passed on literally instead of being replaced
    # by matching file names. The noglob setting is restored afterwards.
    case $- in
        *f*)
            condor_var_fields=($line)
            ;;
        *)
            set -f
            condor_var_fields=($line)
            set +f
            ;;
    esac
    set_var "${condor_var_fields[@]}"
done < condor_vars.lst.tmp
411

    
412

    
413
# Close the job wrapper: its last command replaces the wrapper process with
# the command held in GLIDEIN_WRAPPER_EXEC (expanded now, at generation time).
{
    printf '\n'
    printf '%s\n' "# Condor job wrappers must replace its own image"
    printf 'exec "%s"\n' "$GLIDEIN_WRAPPER_EXEC"
} >> "$condor_job_wrapper"
chmod a+x "$condor_job_wrapper"
419

    
420

    
421
now=$(date +%s)
# If not an integer reset to 0 (a string could cause errors [#7899])
[ "$X509_EXPIRE" -eq "$X509_EXPIRE" ] 2>/dev/null || X509_EXPIRE=0

# remaining proxy lifetime, minus a 300 s safety margin
(( x509_duration = $X509_EXPIRE - $now - 300 ))

# Get relevant attributes from glidein_config if they exist
# if they do not, check condor config from vars population above
max_walltime=$(grep -i "^GLIDEIN_Max_Walltime " "$config_file" | cut -d ' ' -f 2-)
job_maxtime=$(grep -i "^GLIDEIN_Job_Max_Time " "$config_file" | cut -d ' ' -f 2-)
graceful_shutdown=$(grep -i "^GLIDEIN_Graceful_Shutdown " "$config_file" | cut -d ' ' -f 2-)
# randomize the retire time, to smooth starts and terminations
retire_spread=$(grep -i "^GLIDEIN_Retire_Time_Spread " "$config_file" | cut -d ' ' -f 2-)
expose_x509=$(grep -i "^GLIDEIN_Expose_X509 " "$config_file" | cut -d ' ' -f 2-)

# expose_x509: glidein_config, then condor_config, then "false"; lower-cased
[ -n "$expose_x509" ] || expose_x509=$(grep -i "^GLIDEIN_Expose_X509=" "$CONDOR_CONFIG" | awk -F"=" '{print $2}')
: "${expose_x509:=false}"
expose_x509=$(echo $expose_x509 | tr '[:upper:]' '[:lower:]')

# graceful_shutdown: glidein_config, then condor_config, then 120
if [ -z "$graceful_shutdown" ]; then
    graceful_shutdown=$(grep -i "^GLIDEIN_Graceful_Shutdown=" "$CONDOR_CONFIG" | awk -F"=" '{print $2}')
fi
if [ -z "$graceful_shutdown" ]; then
    echo "WARNING: graceful shutdown not defined in vars or glidein_config, using 120!" 1>&2
    graceful_shutdown=120
fi

# job_maxtime: glidein_config, then condor_config, then 192600
if [ -z "$job_maxtime" ]; then
    job_maxtime=$(grep -i "^GLIDEIN_Job_Max_Time=" "$CONDOR_CONFIG" | awk -F"=" '{print $2}')
fi
if [ -z "$job_maxtime" ]; then
    echo "WARNING: job max time not defined in vars or glidein_config, using 192600!" 1>&2
    job_maxtime=192600
fi
459

    
460
# At this point, we need to define two times:
#  die_time = time that glidein will enter graceful shutdown
#  retire_time = time that glidein will stop accepting jobs

# DAEMON_SHUTDOWN is only updated when the classad is sent to the Collector
# Since update interval is randomized, hardcode a grace period here to
# make sure max_walltime is respected
update_interval=370

# Minimum amount retire time can be
min_glidein=600

# Take into account GLIDEIN_Max_Walltime
# GLIDEIN_Max_Walltime = Max allowed time for the glidein.
#   If you specify this variable, then Condor startup scripts will calculate the
#   GLIDEIN_Retire_Time for the glidein as
#    (GLIDEIN_MAX_Walltime - GLIDEIN_Job_Max_Time)
#   If GLIDEIN_Retire_Time is also specified,
#   it will be ignored and only the calculated value is used.
if [ -n "$max_walltime" ]; then
    echo "max wall time, $max_walltime" 1>&2

    if [ -n "$retire_spread" ]; then
        (( default_spread = $retire_spread ))
    else
        # Make sure that the default spread is enough so that we
        # dont drop below min_glidein (ie 600 seconds)
        (( default_spread = ($min_glidein * 11) / 100 ))
    fi

    # Make sure retire time is not set to less than 300 plus default spread
    # (since job max default is set to 36hours, this can happen)
    # total_grace=max total time to end glidein after DAEMON_SHUTDOWN occurs
    (( total_grace = $graceful_shutdown + $default_spread + $update_interval ))
    (( total_job_allotment = $total_grace + $job_maxtime+$min_glidein ))
    if [ "$total_job_allotment" -gt "$max_walltime" ]; then
        (( job_maxtime = $max_walltime - $total_grace - $min_glidein ))
        if [ "$job_maxtime" -lt "0" ]; then
            (( job_maxtime = 0 ))
        fi
        echo "WARNING: job max time is bigger than max_walltime, lowering it.  " 1>&2
    fi
    echo "job max time, $job_maxtime" 1>&2

    (( die_time = $max_walltime - $update_interval - $graceful_shutdown ))
    (( retire_time = $die_time - $job_maxtime ))
    GLIDEIN_Retire_Time=$retire_time
    echo "calculated retire time, $retire_time" 1>&2
else
    # No max walltime: retire time comes from the config file (default 21600)
    retire_time=$(grep -i "^GLIDEIN_Retire_Time " "$config_file" | cut -d ' ' -f 2-)
    if [ -z "$retire_time" ]; then
        retire_time=21600
        echo "used default retire time, $retire_time" 1>&2
    else
        echo "used param defined retire time, $retire_time" 1>&2
    fi
    (( die_time = $retire_time + $job_maxtime ))
fi
518

    
519
# make sure the glidein goes away before the proxy expires
# Unless GLIDEIN_Ignore_X509_Duration is false (or fewer than 900 s of proxy
# are left), a die time beyond the proxy lifetime is only reported.
if [ "$die_time" -gt "$x509_duration" ]; then
    ignore_x509=$(grep -i "^GLIDEIN_Ignore_X509_Duration " "$config_file" | awk '{print $2}' | tr '[:upper:]' '[:lower:]')
    if [ "$x509_duration" -lt 900 ]; then
        # diagnostics go to stderr like every other message here
        echo "Remaining proxy duration is less than 15min. Shortening the Glidein lifetime." 1>&2
        ignore_x509=false
    fi
    if [ "x$ignore_x509" == "xfalse" ]; then
        # Subtract both die time and retire time by the difference
        (( reduce_time = $die_time-$x509_duration ))
        (( die_time = $x509_duration ))
        (( retire_time = $retire_time - $reduce_time ))
        echo "Proxy not long lived enough ($x509_duration s left), shortened retire time to $retire_time" 1>&2
    else
        # report the value that was compared above: the die time
        echo "GLIDEIN_Ignore_X509_Duration is true (default). Ignoring glidein die time ($die_time s) longer than remaining proxy duration ($x509_duration s)" 1>&2
    fi
fi
536

    
537

    
538
# Retire spread: taken from the config, otherwise a tenth of the retire time
if [ -z "$retire_spread" ]; then
    (( retire_spread = $retire_time / 10 ))
    echo "using default retire spread, $retire_spread" 1>&2
else
    echo "used param retire spread, $retire_spread" 1>&2
fi


# Move both times earlier by a random share (0-99%) of the spread
(( random100 = $RANDOM%100 ))
(( retire_time = $retire_time - $retire_spread * $random100 / 100 ))
(( die_time = $die_time - $retire_spread * $random100 / 100 ))

# but protect from going too low
if [ "$retire_time" -lt "$min_glidein" ]; then
    echo "Retire time after spread too low ($retire_time), remove spread" 1>&2
    # With the various calculations going on now with walltime
    # Safer to add spread rather than to revert to previous value
    (( retire_time = $retire_time + $retire_spread * $random100 / 100 ))
    (( die_time = $die_time + $retire_spread * $random100 / 100 ))
fi
# A still too low retire time is fatal, except in advertise-only mode.
# An unset adv_only counts as 0: with an empty operand the numeric test would
# fail and the abort would never trigger.
if [ "$retire_time" -lt "$min_glidein" ] && [ "${adv_only:-0}" -ne "1" ]; then
    STR="Retire time still too low ($retire_time), aborting"
    "$error_gen" -error "condor_startup.sh" "Config" "$STR" "retire_time" "$retire_time" "min_retire_time" "$min_glidein"
    exit 1
fi
echo "Retire time set to $retire_time" 1>&2
echo "Die time set to $die_time" 1>&2

# absolute (epoch) times derived from the relative ones
(( glidein_toretire = $now + $retire_time ))
(( glidein_todie = $now + $die_time ))

# minimize re-authentications, by asking for a session length to be the same as proxy lifetime, if possible
(( session_duration = $x509_duration ))
572

    
573
# if in test mode, don't ever start any jobs
START_JOBS="TRUE"
if [ "$check_only" == "1" ]; then
    START_JOBS="FALSE"
    # need to know which startd to fetch against
    STARTD_NAME=glidein_${glidein_startup_pid}_${random_name_str}
fi

# Add release and distribution information (double quotes stripped);
# the values stay UNKNOWN when lsb_release is not available
LSB_RELEASE="UNKNOWN"
LSB_DISTRIBUTOR_ID="UNKNOWN"
LSB_DESCRIPTION="UNKNOWN"
if command -v lsb_release >/dev/null; then
    LSB_RELEASE=$(lsb_release -rs | tr -d '"')
    LSB_DISTRIBUTOR_ID=$(lsb_release -is | tr -d '"')
    LSB_DESCRIPTION=$(lsb_release -ds | tr -d '"')
fi
591

    
592

    
593
# Append the fixed part of the configuration; the values are expanded now.
# A failed write is reported through error_gen and ends the script.
if ! cat >> "$CONDOR_CONFIG" <<EOF
# ---- start of condor_startup fixed part ----
LSB_DISTRIBUTOR_ID = "$LSB_DISTRIBUTOR_ID"
LSB_RELEASE = "$LSB_RELEASE"
LSB_DESCRIPTION = "$LSB_DESCRIPTION"

SEC_DEFAULT_SESSION_DURATION = $session_duration

LOCAL_DIR = $PWD

#GLIDEIN_EXPIRE = $glidein_expire
GLIDEIN_TORETIRE = $glidein_toretire
GLIDEIN_ToDie = $glidein_todie
GLIDEIN_START_TIME = $now

STARTER_JOB_ENVIRONMENT = "$job_env"
GLIDEIN_VARIABLES = $glidein_variables

MASTER_NAME = glidein_${glidein_startup_pid}_${random_name_str}
STARTD_NAME = glidein_${glidein_startup_pid}_${random_name_str}

#This can be used for locating the proper PID for monitoring
GLIDEIN_PARENT_PID = $$

START = $START_JOBS && (SiteWMS_WN_Draining =?= False)

#Use the default grace time unless the job has to be preempted. In that case set the value to 20 minutes.
PREEMPT_GRACE_TIME = ifthenelse( (SiteWMS_WN_Preempt =?= True), 1200, $PREEMPT_GRACE_TIME)

EOF
then
    STR="Error customizing the condor_config"
    "$error_gen" -error "condor_startup.sh" "WN_Resource" "$STR" "file" "$CONDOR_CONFIG"
    exit 1
fi
630

    
631
monitor_mode=$(grep -i "^MONITOR_MODE " "$config_file" | cut -d ' ' -f 2-)

# use_multi_monitor is 1 only for the exact value MULTI
case "$monitor_mode" in
    MULTI) use_multi_monitor=1 ;;
    *) use_multi_monitor=0 ;;
esac

# get the periodic scripts configuration and append it, when defined
condor_config_startd_cron_include=$(grep -i "^GLIDEIN_condor_config_startd_cron_include " "$config_file" | cut -d ' ' -f 2-)
if [[ -n "$condor_config_startd_cron_include" ]]; then
    echo "adding periodic scripts (startd_cron) configuration from: $condor_config_startd_cron_include" 1>&2
    echo "# ---- start of startd_cron part ----" >> "$CONDOR_CONFIG"
    cat "$condor_config_startd_cron_include" >> "$CONDOR_CONFIG"
fi
646

    
647
# get check_include file for testing
648
if [ "$check_only" == "1" ]; then
649
    condor_config_check_include="${main_stage_dir}/`grep -i '^condor_config_check_include ' ${main_stage_dir}/${description_file} | awk '{print $2}'`"
650
    echo "# ---- start of include part ----" >> "$CONDOR_CONFIG"
651
    cat "$condor_config_check_include" >> "$CONDOR_CONFIG"
652
    if [ $? -ne 0 ]; then
653
        #echo "Error appending check_include to condor_config" 1>&2
654
        STR="Error appending check_include to condor_config"
655
        "$error_gen" -error "condor_startup.sh" "WN_Resource" "$STR" "file" "$CONDOR_CONFIG" "infile" "$condor_config_check_include"
656
        exit 1
657
    fi
658
    # fake a few variables, to make the rest work
659
    use_multi_monitor=0
660
    GLIDEIN_Monitoring_Enabled=False
661
else
662
    # NO check_only, run the actual glidein and accept jobs
663
    if [ "$use_multi_monitor" -eq 1 ]; then
664
        condor_config_multi_include="${main_stage_dir}/`grep -i '^condor_config_multi_include ' ${main_stage_dir}/${description_file} | awk '{print $2}'`"
665
        echo "# ---- start of include part ----" >> "$CONDOR_CONFIG"
666
        cat "$condor_config_multi_include" >> "$CONDOR_CONFIG"
667
        if [ $? -ne 0 ]; then
668
            #echo "Error appending multi_include to condor_config" 1>&2
669
            STR="Error appending multi_include to condor_config"
670
            "$error_gen" -error "condor_startup.sh" "WN_Resource" "$STR" "file" "$CONDOR_CONFIG" "infile" "$condor_config_multi_include"
671
            exit 1
672
        fi
673
    else
674
        condor_config_main_include="${main_stage_dir}/`grep -i '^condor_config_main_include ' ${main_stage_dir}/${description_file} | awk '{print $2}'`"
675
        echo "# ---- start of include part ----" >> "$CONDOR_CONFIG"
676

    
677
        # using two different configs... one for monitor and one for main
678
        # don't create the monitoring configs and dirs if monitoring is disabled
679
        if [ "$GLIDEIN_Monitoring_Enabled" == "True" ]; then
680
            condor_config_monitor_include="${main_stage_dir}/`grep -i '^condor_config_monitor_include ' ${main_stage_dir}/${description_file} | awk '{print $2}'`"
681
            condor_config_monitor=${CONDOR_CONFIG}.monitor
682
            cp "$CONDOR_CONFIG" "$condor_config_monitor"
683
            if [ $? -ne 0 ]; then
684
                #echo "Error copying condor_config into condor_config.monitor" 1>&2
685
                STR="Error copying condor_config into condor_config.monitor"
686
                "$error_gen" -error "condor_startup.sh" "WN_Resource" "$STR" "infile" "$condor_config_monitor" "file" "$CONDOR_CONFIG"
687
                exit 1
688
            fi
689
            cat "$condor_config_monitor_include" >> "$condor_config_monitor"
690
            if [ $? -ne 0 ]; then
691
                #echo "Error appending monitor_include to condor_config.monitor" 1>&2
692
                STR="Error appending monitor_include to condor_config.monitor"
693
                "$error_gen" -error "condor_startup.sh" "WN_Resource" "$STR" "infile" "$condor_config_monitor" "file" "$condor_config_monitor_include"
694
                exit 1
695
            fi
696

    
697
            cat >> "$condor_config_monitor" <<EOF
698
# use a different name for monitor
699
MASTER_NAME = monitor_$$
700
STARTD_NAME = monitor_$$
701

    
702
# use plural names, since there may be more than one if multiple job VMs
703
Monitored_Names = "glidein_$$@\$(FULL_HOSTNAME)"
704
EOF
705
        fi  # end of [ "$GLIDEIN_Monitoring_Enabled" == "True" ], still in else from use_multi_monitor==1
706

    
707
        # Set number of CPUs (otherwise the physical number is used)
708
        echo "NUM_CPUS = \$(GLIDEIN_CPUS)" >> "$CONDOR_CONFIG"
709
        # set up the slots based on the slots_layout entry parameter
710
        slots_layout=`grep -i "^SLOTS_LAYOUT " "$config_file" | cut -d ' ' -f 2-`
711
        if [ "X$slots_layout" = "Xpartitionable" ]; then
712
            echo "NUM_SLOTS = 1" >> "$CONDOR_CONFIG"
713
            echo "SLOT_TYPE_1 = cpus=\$(GLIDEIN_CPUS)" >> "$CONDOR_CONFIG"
714
            echo "NUM_SLOTS_TYPE_1 = 1" >> "$CONDOR_CONFIG"
715
            echo "SLOT_TYPE_1_PARTITIONABLE = True" >> "$CONDOR_CONFIG"
716
            num_slots_for_shutdown_expr=1
717
        else
718
            # fixed slot
719
            echo "SLOT_TYPE_1 = cpus=1" >> "$CONDOR_CONFIG"
720
            echo "NUM_SLOTS_TYPE_1 = \$(GLIDEIN_CPUS)" >> "$CONDOR_CONFIG"
721
            num_slots_for_shutdown_expr=$GLIDEIN_CPUS
722
        fi
723

    
724

    
725
        # check for resource slots
726
        condor_config_resource_slots="`grep -i "^GLIDEIN_Resource_Slots " "$config_file" | cut -d ' ' -f 2-`"
727
        if [ -n "$condor_config_resource_slots" ]; then
728
            echo "adding resource slots configuration: $condor_config_resource_slots" 1>&2
729
            cat >> "$CONDOR_CONFIG" <<EOF
730
# ---- start of resource slots part ($condor_config_resource_slots) ----
731
NEW_RESOURCES_LIST =
732
EXTRA_SLOTS_NUM = 0
733
EXTRA_CPUS_NUM = 0
734
EXTRA_SLOTS_START = True
735
NUM_CPUS = \$(GLIDEIN_CPUS)+\$(EXTRA_SLOTS_NUM)+\$(EXTRA_CPUS_NUM)
736

    
737
# Slot 1 definition done before (fixed/partitionable)
738
#SLOT_TYPE_1_PARTITIONABLE = FALSE
739
#SLOT_TYPE_1 = cpus=1, ioslot=0
740
#NUM_SLOTS_TYPE_1 = \$(GLIDEIN_CPUS)
741
#
742
#SLOT_TYPE_1_PARTITIONABLE = TRUE
743
#SLOT_TYPE_1 = ioslot=0
744
#NUM_SLOTS_TYPE_1 = 1
745
EOF
746
            # resource processing: res_name[,res_num[,res_total_ram[,res_opt]]]{;res_name[,res_num[,res_total_ram[,res_opt]]]}*
747
            # res_opt: static, partitionable, main
748
            IFS=';' read -ra RESOURCES <<< "$condor_config_resource_slots"
749
            # Slot Type Counter - Leave slot type 2 for monitoring
750
            slott_ctr=3
751
            for i in "${RESOURCES[@]}"; do
752
                resource_params="`fix_param "$i" "name,number,memory,type,disk"`"
753
                IFS=',' read res_name res_num res_ram res_opt res_disk <<< "$resource_params"
754
                if [ -z "$res_name" ]; then
755
                    continue
756
                fi
757
                if [ -z "$res_num" ]; then
758
                    if [ "`echo "$res_name" | tr -s '[:upper:]' '[:lower:]'`" = "gpus" ]; then
759
                        # GPUs auto-discovery: https://htcondor-wiki.cs.wisc.edu/index.cgi/wiki?p=HowToManageGpus
760
                        res_num=`find_gpus_num`
761
                        ec=$?
762
                        if [ $ec -eq 0 ]; then
763
                            echo "GPU autodiscovery (condor_gpu_discovery) found $res_num GPUs" 1>&2
764
                            AUTO_GPU=True
765
                        else
766
                            echo "GPU autodiscovery (condor_gpu_discovery) failed, disabling auto discovery, assuming 0 GPUs." 1>&2
767
                            res_num=0
768
                        fi
769
                    else
770
                        res_num=1
771
                    fi
772
                fi
773
                if [ -z "$res_ram" ]; then
774
                    # Will be ignored if res_opt=main
775
                    let res_ram=128*${res_num}
776
                fi
777
                if [ -n "$AUTO_GPU" ]; then
778
                    cat >> "$CONDOR_CONFIG" <<EOF
779
# Declare GPUs resource, auto-discovered: ${i}
780
use feature : GPUs
781
GPU_DISCOVERY_EXTRA = -extra
782
# Protect against no GPUs found
783
if defined MACHINE_RESOURCE_${res_name}
784
else
785
  MACHINE_RESOURCE_${res_name} = 0
786
endif
787
EOF
788
                else
789
                    cat >> "$CONDOR_CONFIG" <<EOF
790
# Declare resource: ${i}
791
MACHINE_RESOURCE_${res_name} = ${res_num}
792
EOF
793
                fi
794
                if [ "x$res_opt" == "xextra" ]; then
795
                    # Like main, but adds CPUs
796
                    res_opt=main
797
                    echo "EXTRA_CPUS_NUM = \$(EXTRA_CPUS_NUM)+\$(MACHINE_RESOURCE_${res_name})" >> "$CONDOR_CONFIG"
798
                fi
799
                if [ "x$res_opt" == "xmain" ]; then  # which is the default value? main or static?
800
                    res_opt=
801
                    # Resource allocated for only main slots (partitionable or static)
802
                    # Main slots are determined by CPUs. Let condor split the resource: if not enough some slot will have none
803
                    echo "SLOT_TYPE_1 = \$(SLOT_TYPE_1), ${res_name}=100%" >> "$CONDOR_CONFIG"
804
                    # Decided not to add type "mainextra" with resources added to main slot and CPUs incremented
805
                    # It can be obtained with more control by setting GLIDEIN_CPUS
806
                else
807
                    if [[ "$res_num" -eq 1 || "x$res_opt" == "xstatic" ]]; then
808
                        res_opt=static
809
                        res_ram="`unit_division "${res_ram}" ${res_num}`"
810
                        if [ -n "$res_disk" ]; then
811
                            res_disk="`unit_division "${res_disk}" ${res_num}`"
812
                        fi
813
                    else
814
                        res_opt=partitionable
815
                    fi
816
                fi
817
                if [ -z "$res_disk" ]; then
818
                    # Set default here. What to do if disk is not given? Empty string lets HTCondor handle it
819
                    res_disk_specification=''
820
                else
821
                    res_disk_specification=", disk=${res_disk}"
822
                fi
823
                if [ -n "$res_opt" ]; then
824
                    # no main, separate static or partitionable
825
                    cat >> "$CONDOR_CONFIG" <<EOF
826
EXTRA_SLOTS_NUM = \$(EXTRA_SLOTS_NUM)+\$(MACHINE_RESOURCE_${res_name})
827
EOF
828
                    if [ "x$res_opt" == "xpartitionable" ]; then
829
                        cat >> "$CONDOR_CONFIG" <<EOF
830
SLOT_TYPE_${slott_ctr} = cpus=\$(MACHINE_RESOURCE_${res_name}), ${res_name}=\$(MACHINE_RESOURCE_${res_name}), ram=${res_ram}${res_disk_specification}
831
SLOT_TYPE_${slott_ctr}_PARTITIONABLE = TRUE
832
NUM_SLOTS_TYPE_${slott_ctr} = 1
833
EOF
834
                    else
835
                        cat >> "$CONDOR_CONFIG" <<EOF
836
SLOT_TYPE_${slott_ctr} = cpus=1, ${res_name}=1, ram=${res_ram}${res_disk_specification}
837
SLOT_TYPE_${slott_ctr}_PARTITIONABLE = FALSE
838
NUM_SLOTS_TYPE_${slott_ctr} = \$(MACHINE_RESOURCE_${res_name})
839
EOF
840
                    fi
841
                    cat >> "$CONDOR_CONFIG" <<EOF
842
IS_SLOT_${res_name} = SlotTypeID==${slott_ctr}
843
EXTRA_SLOTS_START = ifThenElse((SlotTypeID==${slott_ctr}), TARGET.Request${res_name}>0, (\$(EXTRA_SLOTS_START)))
844
EOF
845
                    let slott_ctr+=1
846
                fi
847
                echo "NEW_RESOURCES_LIST = \$(NEW_RESOURCES_LIST) $res_name" >> "$CONDOR_CONFIG"
848

    
849
            done
850

    
851
            # Close the resource-slot section of the generated HTCondor config: extend
            # MACHINE_RESOURCE_NAMES (only when it is already defined) with NEW_RESOURCES_LIST
            # and AND the extra-slots condition into START.
            # Every macro is written as "\$(" so that bash leaves it alone and HTCondor expands it;
            # the former "$\(" produced a literal "$\(" in the config file.
            cat >> "$CONDOR_CONFIG" <<EOF
# Update machine_resource_names and start expression
if defined MACHINE_RESOURCE_NAMES
  MACHINE_RESOURCE_NAMES = \$(MACHINE_RESOURCE_NAMES) \$(NEW_RESOURCES_LIST)
endif
START = (\$(START)) && (\$(EXTRA_SLOTS_START))
EOF
858

    
859
        fi  # end of resource slot if
860

    
861
        # Set to shutdown if total idle exceeds max idle, or if the age
862
        # exceeds the retire time (and is idle) or is over the max walltime (todie)
863
        echo "STARTD_SLOT_ATTRS = State, Activity, TotalTimeUnclaimedIdle, TotalTimeClaimedBusy" >> "$CONDOR_CONFIG"
864
        echo "STARTD_SLOT_ATTRS = \$(STARTD_SLOT_ATTRS), SelfMonitorAge, JobStarts, ExpectedMachineGracefulDrainingCompletion" >> "$CONDOR_CONFIG"
865
        daemon_shutdown=""
866
        for I in `seq 1 $num_slots_for_shutdown_expr`; do
867
            cat >> "$CONDOR_CONFIG" <<EOF
868

    
869
DS${I}_TO_DIE = ((GLIDEIN_ToDie =!= UNDEFINED) && (CurrentTime > GLIDEIN_ToDie))
870

    
871
# The condition pre 8.2 is valid only for not partitionable slots
872
# Since the idle timer doesn't reset/stop when resources are reclaimed, 
873
# partitionable slots will get reaped sooner than non-partitionable.
874
DS${I}_NOT_PARTITIONABLE = ((PartitionableSlot =!= True) || (TotalSlots =?=1))
875
# The daemon shutdown expression for idle startds(glideins) depends on some conditions:
876
# If some jobs were sheduled on the startd (TAIL) or none at all (NOJOB)
877
# If using condor 8.2 or later (NEW) or previous versions (PRE82). JobStarts defined
878
# is used to discriminate
879
DS${I}_IS_HTCONDOR_NEW = (Slot${I}_JobStarts =!= UNDEFINED)
880
# No jobs started (using GLIDEIN_Max_Idle) 
881
DS${I}_IDLE_NOJOB_NEW = ((Slot${I}_JobStarts =!= UNDEFINED) && (Slot${I}_SelfMonitorAge =!= UNDEFINED) && (GLIDEIN_Max_Idle =!= UNDEFINED) && \\
882
                  (Slot${I}_JobStarts == 0) && \\
883
                  (Slot${I}_SelfMonitorAge > GLIDEIN_Max_Idle))
884
DS${I}_IDLE_NOJOB_PRE82 = ((Slot${I}_TotalTimeUnclaimedIdle =!= UNDEFINED) && (GLIDEIN_Max_Idle =!= UNDEFINED) && \\
885
        \$(DS${I}_NOT_PARTITIONABLE) && \\
886
        (Slot${I}_TotalTimeUnclaimedIdle > GLIDEIN_Max_Idle))
887
DS${I}_IDLE_NOJOB = ((GLIDEIN_Max_Idle =!= UNDEFINED) && \\
888
        ifThenElse(\$(DS${I}_IS_HTCONDOR_NEW), \$(DS${I}_IDLE_NOJOB_NEW), \$(DS${I}_IDLE_NOJOB_PRE82))) 
889
# Some jobs started (using GLIDEIN_Max_Tail)
890
DS${I}_IDLE_TAIL_NEW = ((Slot${I}_JobStarts =!= UNDEFINED) && (Slot${I}_ExpectedMachineGracefulDrainingCompletion =!= UNDEFINED) && (GLIDEIN_Max_Tail =!= UNDEFINED) && \\
891
        (Slot${I}_JobStarts > 0) && \\
892
        ((CurrentTime - Slot${I}_ExpectedMachineGracefulDrainingCompletion) > GLIDEIN_Max_Tail) )
893
DS${I}_IDLE_TAIL_PRE82 = ((Slot${I}_TotalTimeUnclaimedIdle =!= UNDEFINED) && (GLIDEIN_Max_Tail =!= UNDEFINED) && \\
894
        (Slot${I}_TotalTimeClaimedBusy =!= UNDEFINED) && \\
895
        \$(DS${I}_NOT_PARTITIONABLE) && \\
896
        (Slot${I}_TotalTimeUnclaimedIdle > GLIDEIN_Max_Tail))
897
DS${I}_IDLE_TAIL = ((GLIDEIN_Max_Tail =!= UNDEFINED) && \\
898
        ifThenElse(\$(DS${I}_IS_HTCONDOR_NEW), \$(DS${I}_IDLE_TAIL_NEW), \$(DS${I}_IDLE_TAIL_PRE82)))
899
DS${I}_IDLE_RETIRE = (\$(DS${I}_NOT_PARTITIONABLE) && (GLIDEIN_ToRetire =!= UNDEFINED) && \\
900
       (CurrentTime > GLIDEIN_ToRetire ))
901
DS${I}_IDLE = ( (Slot${I}_Activity == "Idle") && (Slot${I}_State =!= "Claimed") && \\
902
        (\$(DS${I}_IDLE_NOJOB) || \$(DS${I}_IDLE_TAIL) || \$(DS${I}_IDLE_RETIRE)) )
903

    
904
DS${I} = (\$(DS${I}_TO_DIE) || \\
905
          \$(DS${I}_IDLE))
906

    
907
# But don't enforce shutdowns for dynamic slots (aka "subslots")
908
DS${I} = (DynamicSlot =!= True) && (\$(DS${I}))
909

    
910
EOF
911
            if [ "X$daemon_shutdown" != "X" ]; then
912
                daemon_shutdown="$daemon_shutdown &&"
913
            fi
914
            daemon_shutdown="$daemon_shutdown \$(DS${I})"
915
        done
916
        echo "STARTD.DAEMON_SHUTDOWN = $daemon_shutdown" >> "$CONDOR_CONFIG"
917

    
918
        cat $condor_config_main_include >> "$CONDOR_CONFIG"
919
        if [ $? -ne 0 ]; then
920
            #echo "Error appending main_include to condor_config" 1>&2
921
            STR="Error appending main_include to condor_config"
922
            "$error_gen" -error "condor_startup.sh" "WN_Resource" "$STR" "file" "$CONDOR_CONFIG" "infile" "$condor_config_main_include"
923
            exit 1
924
        fi
925

    
926
        if [ "$GLIDEIN_Monitoring_Enabled" == "True" ]; then
927
            cat >> "$CONDOR_CONFIG" <<EOF
928

    
929
Monitoring_Name = "monitor_$$@\$(FULL_HOSTNAME)"
930
EOF
931

    
932
            # also needs to create "monitor" dir for log and execute dirs
933
            if [ -d monitor ] && [ -d monitor/log ] && [ -d monitor/execute ]; then
934
                echo "Monitoring dirs exist" 1>&2
935
            else
936
                mkdir monitor monitor/log monitor/execute 
937
                if [ $? -ne 0 ]; then
938
                    #echo "Error creating monitor dirs" 1>&2
939
                    STR="Error creating monitor dirs"
940
                    "$error_gen" -error "condor_startup.sh" "WN_Resource" "$STR" "directory" "$PWD/monitor_monitor/log_monitor/execute"
941
                    exit 1
942
                fi
943
            fi
944
        fi
945
    fi  # end else of [ "$use_multi_monitor" -eq 1 ]
946
fi  # end else of "get check_include file for testing" [ "$check_only" == "1" ]
947

    
948

    
949
# Make sure the HTCondor working directories (log and execute) exist in the current directory.
# mkdir -p also covers the case where only one of the two is already there.
# On failure the error is reported through $error_gen and the script exits with 1.
if [ -d log ] && [ -d execute ]; then
  echo "log and execute dirs exist" 1>&2
else
  if ! mkdir -p log execute; then
    # These are the condor dirs, not the monitor ones created earlier
    STR="Error creating condor dirs"
    "$error_gen" -error "condor_startup.sh" "WN_Resource" "$STR" "directory" "$PWD/log_execute"
    exit 1
  fi
fi
960

    
961
####################################
962

    
963
# When debugging is enabled (print_debug != 0), dump the generated HTCondor configuration
# and the current environment to stderr, separated by marker lines.
if [ "$print_debug" -ne "0" ]; then
  {
    echo "--- condor_config ---"
    # Quoted like every other use of CONDOR_CONFIG: an empty value must fail, not read stdin
    cat "$CONDOR_CONFIG"
    echo "--- ============= ---"
    env
    echo "--- ============= ---"
    echo
  } 1>&2
fi
972

    
973
# Set the LD_LIBRARY_PATH so condor uses dynamically linked libraries correctly.
# The previous value is appended only when it is non-empty: a trailing ':' would create an
# empty entry, which the dynamic loader interprets as the current working directory.
export LD_LIBRARY_PATH="$CONDOR_DIR/lib:$CONDOR_DIR/lib/condor${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
975

    
976
#
977
# The config is complete at this point
978
#
979

    
980
# Advertise-only mode: no daemon is started. The advertise type is read from the glidein
# config file, the advertise_failure helper is run, and the script exits with the helper's status.
if [ "$adv_only" -eq "1" ]; then
    adv_type=$(grep -i "^GLIDEIN_ADVERTISE_TYPE " "$config_file" | cut -d ' ' -f 2-)
    adv_helper="${main_stage_dir}/advertise_failure.helper"

    chmod u+rx "$adv_helper"
    "$adv_helper" "$CONDOR_DIR/sbin/condor_advertise" "${adv_type}" "${adv_destination}"
    # short circuit... do not even try to start the Condor daemons below
    exit $?
fi
988

    
989

    
990
# Remember the proxy location in X509_BACKUP, then leave X509_USER_PROXY set only when
# expose_x509 is exactly "true"; any other value unsets it.
X509_BACKUP=$X509_USER_PROXY
case "$expose_x509" in
    true)
        echo "Exposing X509_USER_PROXY $X509_USER_PROXY" 1>&2
        ;;
    *)
        echo "Unsetting X509_USER_PROXY" 1>&2
        unset X509_USER_PROXY
        ;;
esac
997

    
998
## start the monitoring condor master
# Besides (optionally) launching the monitoring master, this block selects the log file
# names (main_starter_log, main_condor_log, monitor_starter_log) used further down.
if [ "$use_multi_monitor" -ne 1 ]; then
    # don't start if monitoring is disabled
    if [ "$GLIDEIN_Monitoring_Enabled" == "True" ]; then
        # start monitoring startd
        # use the appropriate configuration file, only for the duration of the launch
        tmp_condor_config=$CONDOR_CONFIG
        export CONDOR_CONFIG=$condor_config_monitor

        monitor_start_time=$(date +%s)
        echo "Starting monitoring condor at $(date) ($(date +%s))" 1>&2

        # set the worst case limit
        # should never hit it, but let's be safe and shutdown automatically at some point
        let "monretmins=( $retire_time + $GLIDEIN_Job_Max_Time ) / 60 - 1"
        # A command started with '&' always reports status 0, so a failed launch cannot be
        # detected through $?. Check up front that the binary is executable instead; failures
        # happening after the fork remain undetected here.
        if [ -x "$CONDOR_DIR/sbin/condor_master" ]; then
            "$CONDOR_DIR/sbin/condor_master" -f -r $monretmins -pidfile "$PWD/monitor/condor_master.pid" >/dev/null 2>&1 </dev/null &
        else
            echo 'Failed to start monitoring condor... still going ahead' 1>&2
        fi

        # clean back
        export CONDOR_CONFIG=$tmp_condor_config

        monitor_starter_log='monitor/log/StarterLog'
    fi
    main_starter_log='log/StarterLog'
    main_condor_log='log/StartdLog'
else
    main_starter_log='log/StarterLog.vm2'
    monitor_starter_log='log/StarterLog.vm1'
fi
1030

    
1031
# Reset the signal bookkeeping read by on_die, record the start time, announce the start
# on stdout and install the signal handlers.
ON_DIE=0
condor_pid=
start_time=$(date +%s)
echo "=== Condor starting $(date) ($(date +%s)) ==="
# SIGHUP is only reported (ignore_signal); SIGTERM, SIGINT and SIGQUIT are handled by on_die
trap 'ignore_signal' SIGHUP
trap_with_arg on_die SIGTERM SIGINT SIGQUIT
1037
#trap 'on_die' TERM
1038
#trap 'on_die' INT
1039

    
1040
#### STARTS CONDOR ####
# Runs condor_master and stores in condor_ret the exit status of the last command executed
# in the branch taken (0 when no command runs because the glidein was already signalled).
master_pidfile="$PWD/condor_master2.pid"
if [[ "$check_only" == "1" ]]; then
    echo "=== Condor started in test mode ==="
    $CONDOR_DIR/sbin/condor_master -pidfile $PWD/condor_master.pid
    condor_ret=$?
else
    $CONDOR_DIR/sbin/condor_master -f -pidfile $PWD/condor_master2.pid &
    condor_pid=$!
    # Wait for a few seconds to make sure the pid file is created.
    # sleep is put in the background and waited on, presumably so that a trapped signal
    # interrupts the pause instead of being delayed until sleep ends.
    sleep 5 & wait $!
    # Wait more if the pid file was not created and the Glidein was not killed, see [#9639]
    if [[ ! -e "$master_pidfile" && "$ON_DIE" -eq 0 ]]; then
        echo "=== Condor started in background but the pid file is still missing, waiting 200 sec more ==="
        sleep 200 & wait $!
    fi
    if [[ -e "$master_pidfile" ]]; then
        # then wait on it for completion
        pid_in_file=$(cat "$master_pidfile")
        if [[ "$condor_pid" -ne "$pid_in_file" ]]; then
            echo "Background PID $condor_pid is different from PID file content $pid_in_file"
        fi
        echo "=== Condor started in background, now waiting on process $condor_pid ==="
        wait $condor_pid
        condor_ret=$?
    elif [[ "$ON_DIE" -eq 0 ]]; then
        echo "=== Condor was started but the PID file is missing, killing process $condor_pid ==="
        kill -s SIGQUIT $condor_pid
        # NOTE(review): this is the status of kill, not of condor_master - confirm it is intended
        condor_ret=$?
    else
        # If ON_DIE == 1, condor has already been killed by a signal
        condor_ret=0
    fi
fi
condor_pid=
1069

    
1070
# Exit code 99 is treated as a normal DAEMON_SHUTDOWN: it is turned into success and
# reported in the metrics as AutoShutdown True (False for any other exit code).
auto_shutdown=False
if [ ${condor_ret} -eq 99 ]; then
    echo "Normal DAEMON_SHUTDOWN encountered" 1>&2
    condor_ret=0
    auto_shutdown=True
fi
metrics+=" AutoShutdown ${auto_shutdown}"
1077

    
1078
# Compute how long condor ran (seconds since start_time), announce the end on stdout
# followed by an empty line, and record the duration in the metrics.
end_time=$(date +%s)
let "elapsed_time = $end_time - $start_time"
printf '=== Condor ended %s (%s) after %s ===\n\n' "$(date)" "$(date +%s)" "$elapsed_time"

metrics+=" CondorDuration $elapsed_time"
1084

    
1085

    
1086
## perform a condor_fetchlog against the condor_startd
##    if fetch fails, sleep for 'fetch_sleeptime' amount
##    of seconds, then try again.  Repeat until
##    'timeout' amount of time has been reached.
## Only done in test mode (check_only == 1); the test condor_master is killed afterwards.
if [ "$check_only" -eq 1 ]; then

    HOST=$(uname -n)

    fetch_sleeptime=30      # can be dynamically set
    fetch_timeout=500       # can be dynamically set
    fetch_curTime=0
    fetch_exit_code=1
    let fetch_attemptsLeft="$fetch_timeout / $fetch_sleeptime"
    until [ "$fetch_curTime" -ge "$fetch_timeout" ]; do
        # each attempt is preceded by a pause, which is also what advances the timeout clock
        sleep $fetch_sleeptime
        let "fetch_curTime  += $fetch_sleeptime"

        # grab user proxy so we can authenticate ourselves to run condor_fetchlog
        PROXY_FILE="$(grep -i "^X509_USER_PROXY " "$config_file" | cut -d ' ' -f 2-)"

        FETCH_RESULTS=$(X509_USER_PROXY=$PROXY_FILE $CONDOR_DIR/sbin/condor_fetchlog -startd $STARTD_NAME@$HOST STARTD)
        fetch_exit_code=$?
        [ $fetch_exit_code -eq 0 ] && break

        echo "fetch exit code=$fetch_exit_code" 1>&2
        echo "fetch failed in this iteration...will try $fetch_attemptsLeft more times."  >&2
        let "fetch_attemptsLeft -= 1"
    done

    # All the result messages go to stderr
    if [ $fetch_exit_code -eq 0 ]; then
        echo "Able to talk to startd? TRUE" 1>&2
        echo "Successfully talked to startd $STARTD_NAME on host $HOST" >&2
        echo "Fetch Results from condor_fetchlog: $FETCH_RESULTS" >&2
    else
        echo "Able to talk to startd? FALSE" 1>&2
        echo "Failed to talk to startd $STARTD_NAME on host $HOST" >&2
        echo "Reason for failing: Condor_fetchlog took too long to talk to host" >&2
        echo "time spent trying to fetch : $fetch_curTime" >&2
    fi

    ## KILL CONDOR
    KILL_RES=$($CONDOR_DIR/sbin/condor_master -k $PWD/condor_master.pid)
fi
1135

    
1136
# log dir is always different
# get the real name
log_dir='log'

# Print the job statistics extracted from the main StarterLog files and add them to the metrics
echo "Total jobs/goodZ jobs/goodNZ jobs/badSignal jobs/badOther jobs below are normalized to 1 Core"
echo "=== Stats of main ==="
if [ -f "${main_starter_log}" ]; then
    # Build the list of files to parse: a separator file first, then the main log and every
    # per-slot log, each followed by the separator file again
    echo "===NewFile===" > separator_log.txt
    listtoparse="separator_log.txt"
    slotlogs="$(ls -1 ${main_starter_log} ${main_starter_log}.slot* 2>/dev/null)"
    for slotlog in $slotlogs; do
        listtoparse+=" $slotlog separator_log.txt"
    done
    parsed_out=$(cat $listtoparse | awk -v parallelism=${GLIDEIN_CPUS} -f "${main_stage_dir}/parse_starterlog.awk")
    echo "$parsed_out"

    # Turn the "Total ..." lines that follow the first "====" line into "<type>JobsNr n <type>JobsTime m" pairs
    parsed_metrics=$(echo "$parsed_out" | awk 'BEGIN{p=0;}/^Total /{if (p==1) {if ($2=="jobs") {t="Total";n=$3;m=$5;} else {t=$2;n=$4;m=$7;} print t "JobsNr " n " " t "JobsTime " m;}}/^====/{p=1;}')
    # use echo to strip newlines
    metrics+=$(echo " " $parsed_metrics)
fi
echo "=== End Stats of main ==="
1158

    
1159
# Report how many "Got activate_claim" lines the main startd log contains, when the log exists
if [ -f "${main_condor_log}" ]; then
    numactivations=$(grep "Got activate_claim" "${main_condor_log}" 2>/dev/null | wc -l)
    printf 'Total number of activations/claims: %s\n' "$numactivations"
fi
1163

    
1164
# List the log directory on stderr, then hand every HTCondor log to cond_print_log
# (defined outside this section) as "<label> <path>".
ls -l log 1>&2
echo
for log_name in MasterLog StartdLog; do
    cond_print_log "$log_name" "log/$log_name"
done
cond_print_log StarterLog ${main_starter_log}
slotlogs="$(ls -1 ${main_starter_log}.slot* 2>/dev/null)"
for slotlog in $slotlogs; do
    # label is the part after "<main_starter_log>.", e.g. log/StarterLog.slot1 -> slot1
    slotname=${slotlog#"${main_starter_log}."}
    cond_print_log StarterLog.${slotname} $slotlog
done

# Monitoring logs: a separate monitor/log directory unless multi monitor is in use
if [ "$use_multi_monitor" -ne 1 ]; then
    if [ "$GLIDEIN_Monitoring_Enabled" == "True" ]; then
        for log_name in MasterLog StartdLog; do
            cond_print_log "${log_name}.monitor" "monitor/log/${log_name}"
        done
        cond_print_log StarterLog.monitor ${monitor_starter_log}
    fi
else
    cond_print_log StarterLog.monitor ${monitor_starter_log}
fi
cond_print_log StartdHistoryLog log/StartdHistoryLog
1186

    
1187
## kill the master (which will kill the startd)
# Only applies to the separate monitoring master (no multi monitor, monitoring enabled)
if [ "$use_multi_monitor" -ne 1 ] && [ "$GLIDEIN_Monitoring_Enabled" == "True" ]; then
    # use the monitoring configuration file only for the duration of the kill request
    tmp_condor_config=$CONDOR_CONFIG
    export CONDOR_CONFIG=$condor_config_monitor

    monitor_start_time=$(date +%s)
    echo "Terminating monitoring condor at $(date) ($(date +%s))" 1>&2

    #### KILL CONDOR ####
    $CONDOR_DIR/sbin/condor_master -k $PWD/monitor/condor_master.pid
    ret=$?
    # a failure here is only reported, the script goes on
    if [ "$ret" -ne 0 ]; then
        echo 'Failed to terminate monitoring condor... still going ahead' 1>&2
    fi

    # clean back
    export CONDOR_CONFIG=$tmp_condor_config
fi
1211

    
1212
# A glidein that was explicitly killed (ON_DIE set to 1 by the signal handler) reports
# exit code 0 whatever condor returned; the metrics record whether that happened.
condor_killed=False
if [ "$ON_DIE" -eq 1 ]; then
    echo "Explicitly killed, exiting with return code 0 instead of $condor_ret";
    condor_ret=0
    condor_killed=True
fi
metrics+=" CondorKilled ${condor_killed}"
1222

    
1223
##
1224
##########################################################
1225

    
1226
# Report the final outcome through $error_gen and exit with condor's (possibly adjusted)
# return code. $metrics is left unquoted so that every word becomes a separate argument.
report_args=(-error "condor_startup.sh" "Unknown" "See Condor logs for details")
if [ "$condor_ret" -eq "0" ]; then
    report_args=(-ok "condor_startup.sh")
fi
"$error_gen" "${report_args[@]}" $metrics

exit $condor_ret