Well, after I finished my first OCF agent back in October 2008, we have had it running in production for about ten months now. During that time, we found quite a few points where we’d like to improve how Linux-HA handles TSM.
- Shutdown TSM nicely if possible (Cancel client sessions, cancel running processes and dismount mounted volumes)
- Better error handling
So, after another week of writing and testing with a small instance, I present the new OCF agent for Tivoli Storage Manager. It still has one or two weak points, but they are negligible. I still need to write the documentation for it, but the script should just work …
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 |
#!/bin/sh
# Copyright 2009 christian.heim@barfoo.org
#
# OCF resource agent for Tivoli Storage Manager (TSM) server instances.
#
# Usage: TSM {start|stop|monitor|status|meta-data|validate-all}
#
# Configuration is read from the OCF_RESKEY_* environment variables that the
# cluster manager exports; see the output of the meta-data action for the
# list. Helper variables such as $AWK and $EGREP, the OCF_* return codes and
# ocf_log are expected to be provided by .ocf-shellfuncs.

. "${OCF_ROOT}/resource.d/heartbeat/.ocf-shellfuncs"

#######################################
# Validate the configuration and derive the instance variables.
# Globals:   OCF_RESKEY_* (read, defaults filled in),
#            single_instance, instance_name, instance_prefix,
#            instance_path (written)
# Arguments: $1 - "validate" to exit with $OCF_ERR_ARGS on a bad
#                 configuration; anything else exits with $OCF_NOT_RUNNING
# Returns:   $OCF_SUCCESS; EXITS the script on any validation failure
#######################################
tsm_check() {
  local exit_val=""
  local value=""

  case "$1" in
    validate) exit_val="$OCF_ERR_ARGS" ;;
    *)        exit_val="$OCF_NOT_RUNNING" ;;
  esac

  : "${OCF_RESKEY_dsmserv_dir:=/opt/tivoli/tsm/server/bin}"
  : "${OCF_RESKEY_dsmclient_dir:=/opt/tivoli/tsm/client/ba/bin}"
  : "${OCF_RESKEY_tsm_retries:=5}"
  : "${OCF_RESKEY_tsm_timeout:=5}"
  : "${OCF_RESKEY_tsm_logdir:=/var/log/tsm}"
  : "${OCF_RESKEY_tsm_gloves:=1}"

  # Normalise tsm_gloves BEFORE it is used anywhere, so that every later
  # test can rely on it being exactly 0 or 1.
  case "$OCF_RESKEY_tsm_gloves" in
    0|1) ;;
    true)  OCF_RESKEY_tsm_gloves=1 ;;
    false) OCF_RESKEY_tsm_gloves=0 ;;
    *)
      ocf_log err "You specified an invalid value for tsm_gloves."
      ocf_log err "tsm_gloves should be either 0 or 1, or not set at all."
      # Fall back to the default instead of carrying a value around that
      # breaks the numeric comparisons in tsm_stop.
      OCF_RESKEY_tsm_gloves=1
      ;;
  esac

  # Both values are used in numeric tests, arithmetic and as sleep
  # durations, so reject anything that is not a plain non-negative integer.
  for value in "$OCF_RESKEY_tsm_retries" "$OCF_RESKEY_tsm_timeout"; do
    case "$value" in
      ''|*[!0-9]*)
        ocf_log err "TSM: tsm_retries and tsm_timeout must be non-negative integers."
        ocf_log err "TSM: Please check your configuration."
        exit "$exit_val"
        ;;
    esac
  done

  case "$OCF_RESKEY_single_instance" in
    0|false)
      single_instance=0
      if [ -z "$OCF_RESKEY_instance_name" ] || [ -z "$OCF_RESKEY_instance_prefix" ]; then
        ocf_log err "TSM: You didn't specify an instance_name nor a prefix"
        ocf_log err "TSM: yet you specified that this isn't a single instance!"
        ocf_log err "TSM: Please check your configuration."
        exit "$exit_val"
      fi
      if [ ! -d "$OCF_RESKEY_instance_prefix" ] ||
         [ ! -d "$OCF_RESKEY_instance_prefix/$OCF_RESKEY_instance_name" ]; then
        ocf_log err "TSM: Either the directory specified as instance_prefix"
        ocf_log err "TSM: or as instance_name don't exist!"
        ocf_log err "TSM: Please check your system."
        exit "$exit_val"
      fi
      instance_name=$OCF_RESKEY_instance_name
      instance_prefix=$OCF_RESKEY_instance_prefix
      instance_path=$OCF_RESKEY_instance_prefix/$OCF_RESKEY_instance_name
      ;;
    1|true)
      single_instance=1
      instance_prefix=/opt/tivoli/tsm/server
      instance_name=tsm
      instance_path=$instance_prefix/$instance_name
      if [ ! -d "$instance_path" ]; then
        ocf_log err "TSM: The directory specified as TSM directory doesn't exist!"
        ocf_log err "TSM: Please check your system."
        exit "$exit_val"
      fi
      ;;
    *)
      ocf_log err "TSM: You didn't specify single_instance!"
      exit "$exit_val"
      ;;
  esac

  if [ ! -d "$OCF_RESKEY_dsmserv_dir" ] || [ ! -x "$OCF_RESKEY_dsmserv_dir/dsmserv" ]; then
    ocf_log err "TSM: Either the specified directory $OCF_RESKEY_dsmserv_dir or the"
    ocf_log err "TSM: dsmserv executeable doesn't exist!"
    ocf_log err "TSM: Please check your system."
    exit "$exit_val"
  fi

  if [ ! -d "$OCF_RESKEY_dsmclient_dir" ] || [ ! -x "$OCF_RESKEY_dsmclient_dir/dsmadmc" ]; then
    ocf_log err "TSM: Either the specified directory $OCF_RESKEY_dsmclient_dir or the"
    ocf_log err "TSM: dsmadmc executeable doesn't exist!"
    ocf_log err "TSM: Please check your system."
    exit "$exit_val"
  fi

  if [ ! -d "$OCF_RESKEY_tsm_logdir" ]; then
    ocf_log err "TSM: The logging dir specified ($OCF_RESKEY_tsm_logdir) doesn't exist!"
    ocf_log err "TSM: Please check your system."
    exit "$exit_val"
  fi

  if [ ! -f "$instance_path/dsmserv.opt" ] || [ ! -f "$instance_path/dsmserv.dsk" ]; then
    ocf_log err "TSM: Either $instance_path/dsmserv.opt"
    ocf_log err "TSM: or $instance_path/dsmserv.dsk don't exist!"
    ocf_log err "TSM: Please check your configuration."
    exit "$exit_val"
  fi

  # We need to test the dsm.opt/dsm.sys for correct information, since it's
  # needed for the graceful shutdown. Only relevant when the graceful
  # shutdown is actually enabled.
  if [ "$OCF_RESKEY_tsm_gloves" -eq 1 ] &&
     [ -f "$OCF_RESKEY_dsmclient_dir/dsm.sys" ] &&
     [ -f "$OCF_RESKEY_dsmclient_dir/dsm.opt" ]; then
    if [ -z "$OCF_RESKEY_cluster_dns" ] ||
       [ -z "$OCF_RESKEY_cluster_address" ] ||
       [ -z "$OCF_RESKEY_cluster_port" ]; then
      ocf_log err "TSM: You are missing a configuration value:"
      ocf_log err "TSM: either cluster_dns, cluster_address or cluster_port isn't set!"
      ocf_log err "TSM: Please recheck your configuration!"
      exit "$exit_val"
    fi

    # $EGREP is deliberately unquoted: it may expand to a command plus options.
    if ! $EGREP -q -- "^ServerName.*$OCF_RESKEY_cluster_dns" "$OCF_RESKEY_dsmclient_dir/dsm.sys"; then
      ocf_log err "TSM: You are lacking the proper entry for this TSM server."
      ocf_log err "TSM: You need to check your $OCF_RESKEY_dsmclient_dir/dsm.sys"
      ocf_log err "TSM: and set it up properly!"
      ocf_log info "TSM: If in doubt, copy & paste this server stanza:"
      ocf_log info "ServerName $OCF_RESKEY_cluster_dns"
      ocf_log info "TCPServerAddress $OCF_RESKEY_cluster_address"
      ocf_log info "TCPPORT $OCF_RESKEY_cluster_port"
      ocf_log info "CommMethod TCPIP"
      exit "$exit_val"
    fi
  fi

  return $OCF_SUCCESS
}

#######################################
# Determine whether the selected TSM instance is running, based on the pid
# recorded as the fourth field of $instance_path/dsmserv.lock.
# Globals:   instance_path (read),
#            OCF_RETURNVAL_PID, OCF_TSM_PID (exported / unset)
# Returns:   $OCF_SUCCESS      - process is alive (OCF_TSM_PID is set)
#            $OCF_ERR_GENERIC  - lock file present but no such process
#            $OCF_NOT_RUNNING  - no lock file
#######################################
tsm_pid() {
  local lockfile="$instance_path/dsmserv.lock"
  local lock_pid=""

  if [ ! -f "$lockfile" ]; then
    # Process is not running
    export OCF_RETURNVAL_PID=$OCF_NOT_RUNNING
    unset OCF_TSM_PID
    return $OCF_NOT_RUNNING
  fi

  # Plain awk program (no gawk-only --source); take the first line that
  # actually has a fourth field.
  lock_pid="$( $AWK 'NF >= 4 { print $4; exit }' "$lockfile" 2>/dev/null )"

  # An empty or non-numeric pid can never be a live process; checking it
  # here also keeps garbage out of the kill call below.
  case "$lock_pid" in
    ''|*[!0-9]*) ;;
    *)
      # POSIX redirection; '&>' would background the kill under /bin/sh.
      if kill -0 "$lock_pid" >/dev/null 2>&1; then
        # Process is up and running
        export OCF_RETURNVAL_PID=$OCF_SUCCESS
        export OCF_TSM_PID=$lock_pid
        return $OCF_SUCCESS
      fi
      ;;
  esac

  # Stale pid-file detected
  export OCF_RETURNVAL_PID=$OCF_ERR_GENERIC
  unset OCF_TSM_PID
  return $OCF_ERR_GENERIC
}

#######################################
# Poll tsm_pid once per second until the instance reaches the wanted state.
# Arguments: $1 - "up" (wait until running) or "down" (wait until not running)
#            $2 - maximum number of seconds to wait
# Returns:   0 once the state is reached, 1 on timeout
#######################################
tsm_wait_for() {
  local want="$1"
  local remaining="$2"

  while :; do
    tsm_pid
    case "$want" in
      up)   [ "$OCF_RETURNVAL_PID" -eq "$OCF_SUCCESS" ] && return 0 ;;
      down) [ "$OCF_RETURNVAL_PID" -ne "$OCF_SUCCESS" ] && return 0 ;;
    esac
    [ "$remaining" -le 0 ] && return 1
    sleep 1
    remaining=$((remaining - 1))
  done
}

#######################################
# Run dsmadmc against the clustered server with the configured credentials.
# Arguments: the administrative command to run, one word per argument
# Outputs:   whatever dsmadmc prints
# Returns:   exit status of dsmadmc
#######################################
tsm_admc() {
  "$OCF_RESKEY_dsmclient_dir/dsmadmc" -noconfirm -displaymode=list \
    "-id=$OCF_RESKEY_tsm_user" "-password=$OCF_RESKEY_tsm_password" \
    "-server=$OCF_RESKEY_cluster_dns" "$@"
}

#######################################
# Monitor action: validate the configuration, then report the process state.
# Returns:   the return value of tsm_pid (tsm_check exits on bad config)
#######################################
tsm_monitor() {
  tsm_check
  tsm_pid
}

#######################################
# Start action: launch dsmserv in the instance directory and wait until the
# lock file reports a live process.
# Globals:   OCF_RESKEY_* , instance_path, instance_name (read)
# Returns:   $OCF_SUCCESS if the instance is (already) running,
#            $OCF_ERR_GENERIC if it did not come up in
#            tsm_retries * tsm_timeout seconds
#######################################
tsm_start() {
  unset OCF_RETURNVAL_PID
  unset OCF_TSM_PID
  tsm_monitor

  if [ "$OCF_RETURNVAL_PID" -eq "$OCF_SUCCESS" ]; then
    ocf_log info "TSM: Instance $instance_name is already running (pid: $OCF_TSM_PID)."
    return $OCF_SUCCESS
  fi

  # A stale lock file means the server is NOT running, so it has to be
  # started as well; returning success without starting would be wrong.
  if [ "$OCF_RETURNVAL_PID" -ne "$OCF_NOT_RUNNING" ]; then
    ocf_log warn "TSM: Stale $instance_path/dsmserv.lock found, starting instance $instance_name anyway."
  fi

  # dsmserv is started from the directory holding dsmserv.opt.
  if ! cd "$instance_path"; then
    ocf_log err "TSM: Cannot change into $instance_path"
    return $OCF_ERR_GENERIC
  fi

  # The environment is passed for this one command only, so nothing has to
  # be unset afterwards.
  DSMSERV_DIR="$OCF_RESKEY_dsmserv_dir" \
  DSMSERV_CONFIG="$instance_path/dsmserv.opt" \
    "$OCF_RESKEY_dsmserv_dir/dsmserv" >> "${OCF_RESKEY_tsm_logdir}/$instance_name.log" 2>&1 &

  # The exit status of a background launch is always 0, so the only way to
  # know whether the server came up is to watch for its pid.
  if tsm_wait_for up "$((OCF_RESKEY_tsm_retries * OCF_RESKEY_tsm_timeout))"; then
    ocf_log info "TSM: Started instance $instance_name."
    return $OCF_SUCCESS
  fi

  ocf_log err "dsmserv failed to start up correctly, see ${OCF_RESKEY_tsm_logdir}/$instance_name.log"
  return $OCF_ERR_GENERIC
}

#######################################
# Gracefully shut the instance down via dsmadmc: cancel processes and
# sessions, dismount volumes, then issue 'halt' and wait for the exit.
# Globals:   OCF_RESKEY_* , instance_name (read)
# Arguments: $1 - pid of the running instance (for log messages only)
# Returns:   0 if the instance stopped, 1 if it is still running
#######################################
tsm_stop_graceful() {
  local pid="$1"
  local logfile="${OCF_RESKEY_tsm_logdir}/dsmadmc.log"
  local i=1
  local idle=""
  local item=""
  local process_list=""
  local session_list=""
  local mount_list=""

  # dsmadmc is kinda limited, since it only write the logfile to the current PWD
  if ! cd "$OCF_RESKEY_tsm_logdir"; then
    ocf_log err "TSM: Cannot change into $OCF_RESKEY_tsm_logdir"
    return 1
  fi
  date >> "$logfile" 2>/dev/null

  ocf_log info "TSM: Trying soft shutdown."

  while [ "$i" -le "$OCF_RESKEY_tsm_retries" ]; do
    idle=1

    process_list="$( tsm_admc query process | $EGREP 'Process Number: .*' | $AWK -F ': ' '{ print $2 }' )"
    ocf_log debug "TSM(tsm_stop): ($i) Process list during shutdown: $process_list"
    if [ -n "$process_list" ]; then
      idle=0
      # Unquoted on purpose: the list is whitespace separated.
      for item in $process_list; do
        ocf_log debug "TSM(tsm_stop): ($i) Cancelling TSM process $item"
        tsm_admc cancel process "$item" >> "$logfile" 2>&1
      done
    fi

    session_list="$( tsm_admc query sessions | $EGREP 'Sess Number: .*' | $AWK -F ': ' '{ print $2 }' | sed "s/,//" )"
    ocf_log debug "TSM(tsm_stop): ($i) Session list during shutdown: $session_list"
    if [ -n "$session_list" ]; then
      idle=0
      for item in $session_list; do
        ocf_log debug "TSM(tsm_stop): ($i) Cancelling TSM session $item"
        tsm_admc cancel session "$item" >> "$logfile" 2>&1
      done
    fi

    mount_list="$( tsm_admc query mount | $EGREP 'LTO volume .* is mounted' | $AWK -F ' ' '{ print $4 }' )"
    ocf_log debug "TSM(tsm_stop): ($i) Mount list during shutdown: $mount_list"
    if [ -n "$mount_list" ]; then
      idle=0
      for item in $mount_list; do
        ocf_log debug "TSM(tsm_stop): ($i) Cancelling TSM mount $item"
        tsm_admc dismount volume "$item" >> "$logfile" 2>&1
      done
    fi

    # Only stop early when ALL three lists were empty in this pass.
    if [ "$idle" -eq 1 ]; then
      ocf_log debug "TSM(tsm_stop): Skipping the remaining $((OCF_RESKEY_tsm_retries - i)) tries, no activity in instance $instance_name (pid: $pid)"
      break
    fi

    i=$((i + 1))
  done

  ocf_log info "TSM: Halting instance $instance_name (pid: $pid)"
  # Do not log the full command line: it contains the admin password.
  ocf_log info "TSM: issuing dsmadmc halt (server: $OCF_RESKEY_cluster_dns, id: $OCF_RESKEY_tsm_user)"
  tsm_admc halt >> "$logfile" 2>&1

  # Polling every second returns as soon as the server is gone instead of
  # always sleeping a full tsm_timeout.
  if tsm_wait_for down "$((OCF_RESKEY_tsm_retries * OCF_RESKEY_tsm_timeout))"; then
    ocf_log info "TSM: Graceful shutdown for instance $instance_name (pid: $pid) completed."
    return 0
  fi
  return 1
}

#######################################
# Stop action.
# In order to stop TSM there are two ways:
#  o Gracefully shutting it down by stopping running sessions, disconnecting
#    nodes and cancelling pending/running processes and issueing 'halt'
#  o Simply killing the process (SIGTERM first, SIGKILL as the last resort)
# If tsm_gloves is 1 and credentials are configured, the graceful way is
# tried first.
# Returns:   $OCF_SUCCESS if the instance is not running (any more),
#            $OCF_ERR_GENERIC if it survived SIGKILL
#######################################
tsm_stop() {
  local pid=""

  unset OCF_RETURNVAL_PID
  unset OCF_TSM_PID
  tsm_monitor

  # Not running (or only a stale lock file left): nothing to stop.
  if [ "$OCF_RETURNVAL_PID" -ne "$OCF_SUCCESS" ]; then
    return $OCF_SUCCESS
  fi

  # tsm_pid unsets OCF_TSM_PID once the process is gone, so keep a copy
  # for signalling and for the log messages.
  pid="$OCF_TSM_PID"

  if [ "$OCF_RESKEY_tsm_gloves" -eq 1 ]; then
    if [ -n "$OCF_RESKEY_tsm_user" ] && [ -n "$OCF_RESKEY_tsm_password" ] &&
       [ -n "$OCF_RESKEY_cluster_address" ] && [ -n "$OCF_RESKEY_cluster_port" ] &&
       [ -n "$OCF_RESKEY_cluster_dns" ]; then
      if tsm_stop_graceful "$pid"; then
        return $OCF_SUCCESS
      fi
      ocf_log info "TSM(tsm_stop): Graceful shutdown for instance $instance_name (pid: $pid) failed, thus continuing with not-so-graceful shutdown!"
    else
      ocf_log warn "TSM: Graceful shutdown is enabled, but tsm_user, tsm_password, cluster_address, cluster_port or cluster_dns isn't set."
    fi
  fi

  ocf_log info "TSM: Trying not-so-graceful shutdown."
  ocf_log debug "TSM(tsm_stop): issuing SIGTERM to instance $instance_name (pid: $pid)"
  kill -TERM "$pid" 2>/dev/null

  # The exit status of kill only says whether the signal was delivered, so
  # wait and look whether the process really went away.
  if ! tsm_wait_for down "$OCF_RESKEY_tsm_timeout"; then
    ocf_log info "TSM: Instance $instance_name (pid: $pid) failed to shutdown with SIGTERM."
    ocf_log debug "TSM(tsm_stop): issuing SIGKILL to instance $instance_name (pid: $pid)"
    kill -KILL "$pid" 2>/dev/null

    if ! tsm_wait_for down "$OCF_RESKEY_tsm_timeout"; then
      ocf_log err "TSM: Instance $instance_name (pid: $pid) failed to shutdown with SIGKILL."
      ocf_log err "TSM: There's nothing we can do, so die gracefully."
      ocf_log err "TSM: User interaction is required!"
      return $OCF_ERR_GENERIC
    fi
  fi

  ocf_log info "TSM: Successfully halted instance $instance_name (pid: $pid)"
  return $OCF_SUCCESS
}

#######################################
# Meta-data action: print the resource agent description.
# Outputs:   the OCF meta-data XML on stdout
# Returns:   $OCF_SUCCESS
#######################################
tsm_metadata() {
  cat <<END
<?xml version="1.0"?>
<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
<resource-agent name="TSM">
<version>1.0</version>

<longdesc lang="en">
This script manages a single/or multiple instances of Tivoli Storage Manager.
Please be aware, that in order to run your Tivoli Storage Manager server via
Heartbeat, you need to prepare each instance according to the Storage Manager
Installation handbook.
</longdesc>
<shortdesc lang="en">OCF Resource Agent compliant TSM script.</shortdesc>

<parameters>
<parameter name="single_instance" required="1" unique="0">
<longdesc lang="en">
Is your setup a single instance, or are you running multiple instances
</longdesc>
<shortdesc lang="en">Toggles changes for single/multiple instances</shortdesc>
<content type="boolean" />
</parameter>

<parameter name="instance_prefix" required="0" unique="0">
<longdesc lang="en">
Directory that contains the instance directories. Needs to be set if
single_instance is false.
</longdesc>
<shortdesc lang="en">Directory containing the instance directories</shortdesc>
<content type="string" />
</parameter>

<parameter name="instance_name" required="0" unique="0">
<longdesc lang="en">
Name of the instance directory below instance_prefix. Needs to be set if
single_instance is false.
</longdesc>
<shortdesc lang="en">Name of the instance directory</shortdesc>
<content type="string" />
</parameter>

<parameter name="dsmserv_dir" required="0" unique="0">
<longdesc lang="en">
Directory that contains the dsmserv executable.
</longdesc>
<shortdesc lang="en">Directory of the dsmserv executable</shortdesc>
<content type="string" default="/opt/tivoli/tsm/server/bin" />
</parameter>

<parameter name="dsmclient_dir" required="0" unique="0">
<longdesc lang="en">
Directory that contains the dsmadmc executable and the dsm.sys/dsm.opt files.
</longdesc>
<shortdesc lang="en">Directory of the dsmadmc executable</shortdesc>
<content type="string" default="/opt/tivoli/tsm/client/ba/bin" />
</parameter>

<parameter name="tsm_logdir" required="0" unique="0">
<longdesc lang="en">
Existing directory that receives the output of dsmserv and dsmadmc.
</longdesc>
<shortdesc lang="en">Logging directory</shortdesc>
<content type="string" default="/var/log/tsm" />
</parameter>

<parameter name="tsm_gloves" required="0" unique="0">
<longdesc lang="en">
If set to 1, stopping first tries a graceful shutdown via dsmadmc before
signals are sent to the server process. If set to 0, the server process is
signalled right away.
</longdesc>
<shortdesc lang="en">Try a graceful shutdown first</shortdesc>
<content type="boolean" default="1" />
</parameter>

<parameter name="tsm_retries" required="0" unique="0">
<longdesc lang="en">
Maximum number of passes for cancelling processes, sessions and mounts during
the graceful shutdown. Multiplied with tsm_timeout it is also the number of
seconds to wait for the server to start up or to halt.
</longdesc>
<shortdesc lang="en">Number of retries</shortdesc>
<content type="integer" default="5" />
</parameter>

<parameter name="tsm_timeout" required="0" unique="0">
<longdesc lang="en">
Number of seconds per retry, and the number of seconds to wait after each of
SIGTERM and SIGKILL.
</longdesc>
<shortdesc lang="en">Seconds per retry</shortdesc>
<content type="integer" default="5" />
</parameter>

<parameter name="cluster_dns" required="0" unique="0">
<longdesc lang="en">
ServerName of the stanza in dsm.sys that dsmadmc uses to reach this server.
</longdesc>
<shortdesc lang="en">ServerName in dsm.sys</shortdesc>
<content type="string" />
</parameter>

<parameter name="cluster_address" required="0" unique="0">
<longdesc lang="en">
TCPServerAddress belonging to the cluster_dns stanza in dsm.sys.
</longdesc>
<shortdesc lang="en">TCPServerAddress in dsm.sys</shortdesc>
<content type="string" />
</parameter>

<parameter name="cluster_port" required="0" unique="0">
<longdesc lang="en">
TCPPORT belonging to the cluster_dns stanza in dsm.sys.
</longdesc>
<shortdesc lang="en">TCPPORT in dsm.sys</shortdesc>
<content type="string" />
</parameter>

<parameter name="tsm_user" required="0" unique="0">
<longdesc lang="en">
Administrator id that dsmadmc uses for the graceful shutdown.
</longdesc>
<shortdesc lang="en">Administrator id</shortdesc>
<content type="string" />
</parameter>

<parameter name="tsm_password" required="0" unique="0">
<longdesc lang="en">
Password of the administrator id that dsmadmc uses for the graceful shutdown.
</longdesc>
<shortdesc lang="en">Administrator password</shortdesc>
<content type="string" />
</parameter>
</parameters>

<actions>
<action name="start" timeout="90s" />
<action name="stop" timeout="100s" />
<action name="monitor" depth="10" timeout="30s" interval="60s" start-delay="300s" />
<action name="meta-data" timeout="5s" />
<action name="validate-all" timeout="5s" />
<action name="status" timeout="30s" />
</actions>
</resource-agent>
END
  return $OCF_SUCCESS
}

# Dispatch the requested action; the script exits with the action's result.
case "$1" in
  start)          tsm_start ;;
  stop)           tsm_stop ;;
  # 'status' is advertised in the meta-data, so it has to be served too.
  monitor|status) tsm_monitor ;;
  meta-data)      tsm_metadata ;;
  validate-all)   tsm_check validate ;;
  *)              exit $OCF_ERR_UNIMPLEMENTED ;;
esac
exit $?

# vim: set tabstop=2 shiftwidth=2 softtabstop=2 expandtab :