#!/bin/bash
# check_dcache_pic_wrapper-1.sh
# Gerard Bernabeu Altayo, 05/26/2015 06:26 PM
#
# Checks that all dCache processes configured in the node_config file are
# running on the machine and that the default designated ports are listening.

#################Parameters#############################

# /opt/PICnodeInfo holds key=value lines; the value is whatever follows the
# first '='.
instance=$(grep instance /opt/PICnodeInfo | cut -d= -f2)
node_type=$(grep node_type /opt/PICnodeInfo | cut -d= -f2)
dCacheVersion=$(grep dCacheVersion /opt/PICnodeInfo | cut -d= -f2)
# The dot is escaped so that only the literal key "dcache.layout" matches.
dcachelayout=/etc/dcache/layouts/$(grep 'dcache\.layout' /etc/dcache/dcache.conf | cut -d= -f2).conf

# Each known instance has its own information provider (dcip) and core
# server address (dccore); anything else aborts the sensor with status 2.
case "$instance" in
  prod)
    dcip=dcip.pic.es # dCache information provider server
    dccore="193.109.174.39"
    ;;
  test)
    dcip=dcip-test.pic.es # dCache information provider server
    dccore="193.109.174.41"
    ;;
  disk)
    dcip=dccore-disk.pic.es # dCache information provider server
    dccore="193.109.175.249"
    ;;
  *)
    echo "Unknown instance $instance. Check puppet maintanied /opt/PICnodeInfo file"
    exit 2
    ;;
esac

# Scratch files for the cellInfo page dump and the netstat snapshot.
# NOTE(review): fixed names under /tmp are predictable; consider mktemp.
cellinfo=/tmp/cellInfo.html
netstat=/tmp/netstat.tmp

#################### Check timeout ##################
#######################################
# Watchdog: waits, then force-kills the given process and reports a timeout.
# Intended to be started in the background (timeout_check $$ &), so the
# "exit 1" below ends only the background subshell.
# Globals:   dccore, dcip (read, only used in the message)
# Arguments: $1 - PID of the process to kill
#            $2 - seconds to wait before killing (optional, default 15)
# Outputs:   the timeout message on stdout, only if the kill succeeded
# Returns:   0 when the process was already gone; exits 1 after a kill
#######################################
timeout_check() {
  local pid=$1
  local seconds=${2:-15}

  # sleep does not need our stdout/stderr; detaching it means a sleep that
  # outlives this watchdog cannot keep the caller's output pipe open.
  sleep "$seconds" >/dev/null 2>&1

  # Test kill's own status directly: checking $? afterwards is fragile, any
  # command slipped in between silently replaces the status being tested.
  # SIGKILL is used because the target may be blocked on a hung child.
  if kill -9 "$pid" >/dev/null 2>&1; then
    echo "Sensor timeout. This sensor contacts dccore ($dccore) and $dcip."
    exit 1
  fi
}

# Start the watchdog that kills this script if it runs for too long.
timeout_check $$ &
watchdog_pid=$!
# Stop the watchdog as soon as the check ends on its own. Left running, it
# would later "kill -9" our PID, which by then may belong to another process.
# (When the watchdog itself SIGKILLs us this trap does not run, as intended.)
trap 'kill "$watchdog_pid" >/dev/null 2>&1' EXIT
##################monitoring script#####################

# Some node types are not expected to run dCache: report that and finish OK.
case "$node_type" in
  srmdb)
    echo "This server only needs to run dCache in emergency cases, node_type=$node_type"
    exit 0
    ;;
  PGSQLstandby | dcmon)
    echo "This server does not need to run this check because its node_type=$node_type"
    exit 0
    ;;
esac

# Sensor state: message and return code, both updated by the checks below.
output=""
rc=0

# Take fresh snapshots of the cellInfo page, the java process count, the
# short host name and the netstat table.
rm -f "$cellinfo" "$netstat"
if [ "$(uname)" == "SunOS" ]; then
  # In Solaris to run this you need to install links, at PIC we've done it
  # this way: pkg-get upgrade common; pkg-get install links
  /opt/csw/bin/links -dump "http://$dcip:2288/cellInfo" > "$cellinfo"
  grepJava="$(ps -ef | grep java | grep -v '/usr/java/bin/java' | grep -cv grep)"
  hostname=$(hostname)
  netstat -n > "$netstat"
else
  links -dump "http://$dcip:2288/cellInfo" > "$cellinfo"
  grepJava="$(ps --no-headers -fC java | grep -c dcache)"
  netstat -putan > "$netstat"
  hostname=$(hostname -s)
  # For all linux servers we check that the system tuning is in place by
  # checking one of the tricky properties (open files limit) on the first
  # dCache java process.
  proc=$(ps --no-headers -fC java | grep dcache | awk '{print $2}' | head -1)
  # Only judge the limit when there is a process to inspect; a missing
  # process is reported by the java process count check instead.
  if [ -n "$proc" ] && [ "$(grep "Max open files" "/proc/$proc/limits" | grep -c 65535)" -ne 1 ]; then
    output="$output [WARNING] Tunning not applied"
    rc=1
  fi
fi

# Without the cellInfo dump none of the per-cell checks can work.
if [ ! -s "$cellinfo" ]; then
  echo "$cellinfo couldn't be read properly, maybe links is not installed"
  exit 1
fi

# All dCache nodes should have connections to their instance's dccore.
# The dots of the address are escaped so grep matches them literally.
dccore_re=$(printf '%s' "$dccore" | sed 's/\./\\./g')
if [ "$(grep -c "$dccore_re" "$netstat")" -lt 1 ]; then
  output="[CRITICAL] No connection with dccore for dCache instance=$instance ($dccore) detected. $output"
  rc=2
else
  # Quote the first matching netstat line as evidence. The file contents are
  # searched here; piping the file *name* through grep never matches.
  output="$output [OK] Connection with dccore established: $(grep "$dccore_re" "$netstat" | head -1)."
fi

# Lines of "dcache status" that report a running domain; used by the
# per-cell checks.
procStatus="$(dcache status | grep running)"

#######################################
# Folds the result of one cell check into the nagios sensor state.
# Globals read:
#   cell         - name of the cell being reported
#   noteval      - 1 for cells we have nothing to check
#   dCacheStatus - should be 1 if "dcache status" finds the cell (only when
#                  there is a 1 to 1 mapping between cell and java
#                  proc/dcache daemon)
#   grepNetstat  - should be 1 if the cell is listening on the right port
#   grepCellInfo - text found for the cell in dCache's info system; the cell
#                  is failed when it contains "Offline" (any case)
#   dcip         - info provider host, only used in the message
# Globals written:
#   output - nagios sensor message
#   rc     - nagios sensor return code (set to 2 on any failure)
# Arguments: none
#######################################
checkResults() {
  if [ "$noteval" = "1" ]; then
    output="$output [?] $cell."
  # An unset or empty status counts as 0 (failed): a check that could not be
  # evaluated must not be reported as OK.
  elif [ "${dCacheStatus:-0}" -ne 1 ]; then
    output="[CRITICAL] dCacheDomain for Cell $cell is down. $output"
    rc=2
  elif [ "${grepNetstat:-0}" -ne 1 ]; then
    output="[CRITICAL] Cell $cell is not listening for new connections, check netstat. $output"
    rc=2
  elif [ "$(echo "$grepCellInfo" | grep -ic Offline)" -ne 0 ]; then
    output="[CRITICAL] according to http://$dcip:2288/cellInfo $cell is Offline. $output"
    rc=2
  elif [ "$rc" -eq 0 ]; then
    # We sort so that if there is no issue, not evaluated procs go to the end.
    output="[OK] $cell. $output"
  else
    output="$output [OK] $cell."
  fi
}


# We count the number of pools running in the server and add 1 so that the
# countdown in the "pool" case fits the standard dCache numbering ... 3 2 1.
poolCount=$(($(grep -ci "${hostname}_" "$cellinfo") + 1))

# Prints 1 when the "dcache status" output mentions PATTERN, else 0.
# $procStatus is deliberately left unquoted: echo then joins all its lines
# into a single one, so grep -c can only count 0 or 1.
# NOTE(review): PATTERN is matched as a substring, so e.g. "dcap" is also
# found inside "gsidcap" and "host_1" inside "host_10".
domainRunning() {
  echo $procStatus | grep -c "$1"
}

# We get the list of services which should be on the server: the part after
# the '/' in every "[domain/cell]" header of the layout file.
for cell in $(grep '^\[..*/..*]' "$dcachelayout" | cut -d/ -f2 | cut -d']' -f1); do
  noteval=0
  standardcell="no"
  # Start every cell from "failed" so that a branch which cannot set a
  # result never inherits the value left behind by the previous cell.
  dCacheStatus=0
  grepNetstat=0
  grepCellInfo=""

  case "$cell" in
    # Standard cells: one domain per cell; status and cellInfo are looked up
    # by cell name after the case, only the port check differs.
    gsidcap)
      standardcell="yes"
      grepNetstat=$(grep -c ":::22128" "$netstat")
      ;;
    dcap)
      standardcell="yes"
      grepNetstat=$(grep -c ":::22125" "$netstat")
      ;;
    gridftp)
      standardcell="yes"
      # Formerly: grepNetstat=`cat $netstat | grep -c 0.0.0.0:2811`
      # Now the port must show up exactly once and the door must answer with
      # its banner; otherwise grepNetstat keeps the "failed" default.
      if [ "$(grep -c ":::2811" "$netstat")" -eq 1 ]; then
        grepNetstat=$(echo quit | nc -w 5 localhost 2811 | grep -ic "220 GSI FTP Door ready")
      fi
      ;;
    xrootd)
      standardcell="yes"
      grepNetstat=$(grep -c ":::1094" "$netstat")
      ;;
    webdav)
      standardcell="yes"
      # Done after RT4129: every configured webdav port must be listening.
      # Stop at the first port that is not OK so that a later healthy port
      # cannot hide the failure; only one failure is reported at a time.
      for port in $(grep 'webdav.net.port' /etc/dcache/layouts/door.conf | cut -d= -f2); do
        grepNetstat=$(grep -c ":::$port " "$netstat")
        [ "$grepNetstat" -eq 1 ] || break
      done
      ;;
    srm)
      standardcell="yes"
      grepNetstat=$(grep -c ":::8443" "$netstat")
      ;;
    pool)
      grepCellInfo=$(grep -i "${hostname}_" "$cellinfo")
      poolCount=$((poolCount - 1)) # This will not work if pools are not in sequence
      dCacheStatus=$(domainRunning "${hostname}_$poolCount")
      grepNetstat=1 # Pools should not listen on any port
      ;;
    # Cells with no dedicated java process in PIC default cell-proc setup,
    # thus they do not appear in the dcache status output.
    # The daemon checks reuse $hostname instead of calling "hostname -s"
    # again: it is the value computed for this platform at collection time.
    # NOTE(review): on Solaris "hostname -s" is reported to set the host
    # name rather than print it - confirm against the man page.
    poolmanager)
      dCacheStatus=$(domainRunning "dcache-${hostname}Domain") # We select this cell to check on the daemon.
      grepCellInfo=$(grep -i "${cell}" "$cellinfo")
      # The port might be from another service, but running in the dCacheDomain.
      grepNetstat=$(grep ":::11111" "$netstat" | grep -c LISTEN)
      ;;
    pnfsmanager)
      grepNetstat=1 # Nothing to be listening to
      dCacheStatus=$(domainRunning "namespace-${hostname}Domain") # We select this cell to check on the daemon.
      grepCellInfo=$(grep -i "${cell}-${hostname}" "$cellinfo")
      ;;
    loginbroker)
      grepNetstat=1 # Nothing to be listening to
      dCacheStatus=1
      grepCellInfo=$(grep -i "${cell}-${hostname}" "$cellinfo")
      ;;
    srm-loginbroker)
      grepNetstat=1 # Nothing to be listening to
      dCacheStatus=1
      grepCellInfo=$(grep -i "${cell}-${hostname}" "$cellinfo")
      ;;
    # Cells with no info in the dCache web
    info)
      grepNetstat=$(grep -c "127.0.0.1:22112" "$netstat")
      dCacheStatus=$(domainRunning "info-${hostname}Domain") # We select this cell to check on the daemon.
      grepCellInfo=1
      ;;
    nfsv3)
      grepNetstat=$(grep tcp "$netstat" | grep -c ":::2049")
      dCacheStatus=$(domainRunning "nfsv3-${hostname}Domain") # We select this cell to check on the daemon.
      grepCellInfo=1
      ;;
    httpd)
      dCacheStatus=1
      grepNetstat=$(grep -c ":::2288" "$netstat")
      grepCellInfo=1
      ;;
    admin)
      grepNetstat=$(grep -c ":::22224" "$netstat")
      dCacheStatus=1
      grepCellInfo=1
      ;;
    *)
      # Unknown cell: nothing to check, reported as [?].
      noteval=1
      ;;
  esac

  if [ "$standardcell" = "yes" ]; then
    dCacheStatus=$(domainRunning "$cell")
    grepCellInfo=$(grep -i "${cell}-${hostname}" "$cellinfo")
  fi

  checkResults
done

# We get the number of Java daemons which should be on the server: one per
# "[...Domain]" header in the layout file.
JavaProcs=$(grep -c '^\[..*Domain]' "$dcachelayout")

# Check how many java daemons are running in the host.
if [ "$grepJava" -gt "$JavaProcs" ]; then
  # Too many is only a warning and must not downgrade an earlier CRITICAL.
  if [ "$rc" -eq 0 ]; then rc=1; fi
  output="[WARNING] Too many java procs running ($grepJava -gt $JavaProcs). $output"
elif [ "$grepJava" -lt "$JavaProcs" ]; then
  output="[CRITICAL] Too few java procs running ($grepJava -lt $JavaProcs). $output"
  rc=2
fi

# Check JAVA version: only 1.6 and 1.7 are accepted without a warning.
jversion=$(java -version 2>&1 | grep version)
# Third word of the version line, without its quotes, cut to "major.minor".
jmajor=$(echo $jversion | awk '{print $3}' | tr -d '"' | cut -d. -f1,2)
if [ "$jmajor" == "1.6" ] || [ "$jmajor" == "1.7" ]; then
  output="$output $jversion"
else
  if [ "$rc" = "0" ]; then rc=1; fi
  output="[WARNING] $jversion. $output"
fi

# $output stays unquoted on purpose so the message is squeezed onto a single
# line, but globbing must be off: labels such as [OK] or [?] are valid glob
# patterns and would be replaced by matching file names in the current
# directory.
set -f
echo $output
exit $rc