diff --git a/.gitignore b/.gitignore index 5f4ef25..ef76462 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ osc +ibs misc *.tar.gz *.tgz diff --git a/man/.gitignore b/man/.gitignore new file mode 100644 index 0000000..7c5c5ba --- /dev/null +++ b/man/.gitignore @@ -0,0 +1,2 @@ +*.html +*.adoc diff --git a/man/SAPCMControlZone_basic_cluster.7 b/man/SAPCMControlZone_basic_cluster.7 index 8d7960f..2c0ab1e 100644 --- a/man/SAPCMControlZone_basic_cluster.7 +++ b/man/SAPCMControlZone_basic_cluster.7 @@ -12,61 +12,59 @@ SAPCMControlZone_basic_cluster \- basic settings to make SAPCMControlZone work. The Convergent Mediation (CM) component ControlZone needs a certain basic cluster configuration. Besides neccessary settings, additional configurations might match specific needs. - .PP \fB* Operating System Basics\fR - +.PP \fBUsers and groups\fR - +.PP Technical users and groups, such as "mzadmin" are defined locally in the Linux system. See man page passwd(5) and usermod(8. - +.PP \fBHostnames\fR - +.PP Name resolution of the cluster nodes and the virtual IP address must be done locally on all cluster nodes. See man page hosts(5). - +.PP \fBTime synchronization\fR - +.PP Strict time synchronization between the cluster nodes is mandatory, e.g. NTP. See man page chrony.conf(5). Further, the nodes should have configured the same timezone. - +.PP \fBNFS mounted filesystem\fR - +.PP A shared filesystem for ControlZone data can be statically mounted on both cluster nodes. This filesystem holds work directories, e.g. for batch processing. It must not be confused with the ControlZone application itself. The application is copied from NFS to both cluster nodes into local filesystems. Client-side write caching has to be disabled for the NFS shares containing customer data. See man page fstab(5) and example below. - .PP \fB* CRM Basics\fR - +.PP \fBstonith-enabled = true\fR - +.PP The cib bootstrap option stonith-enabled is crucial for any reliable pacemaker cluster. 
.br The value 'true' is one pre-requisite for having a cluster supported. - +.PP \fBmigration-threshold = 3\fR - +.PP The crm rsc_default parameter migration-threshold defines how many errors on a resource can be detected before this resource will be moved to another node. A value greater than 1 is needed for resource monitor option on-fail=restart. See also failure-timeout. - +.PP \fBrecord-pending = true\fR - +.PP The crm op_default record-pending defines, whether the intention of an action upon the resource is recorded in the Cluster Information Base (CIB). Setting this parameter to \'true\' allows the user to see pending actions like \'starting\' and \'stopping\' in crm_mon. - +.PP \fBfailure-timeout = 86400\fR - +.PP The crm op_default failure-timeout defines how long failed actions will be kept in the CIB. After that time the failure record will be deleted. Time unit is seconds. @@ -74,9 +72,9 @@ See also migration-threshold. .br The value '86400' means failure records will be cleaned automatically after one day. - +.PP \fBpriority-fencing-delay = 30\fP - +.PP The optional crm property priority-fencing-delay specified delay for the fencings that are targeting the lost nodes with the highest total resource priority in case we do not have the majority of the nodes in our cluster @@ -97,7 +95,7 @@ pcmk_delay_max. .\" TODO OS network tcp_retries2=8 (8..10) .\" \fB* CRM basic configuration.\fR - +.PP This example has been taken from a two-node cluster SLE-HA 15 SP4 with disk-based SBD. Priority fencing is configured and the SBD pcmk_delay_max has been reduced accordingly. The stonith-timeout is adjusted to SBD on-disk @@ -142,7 +140,7 @@ op_defaults op-options: \\ .RE .PP \fB* Statically mounted NFS share for ControlZone platform data.\fR - +.PP Below is an fstab example for a shared filesystem holding application data. The filesystem is statically mounted on all nodes of the cluster. The correct mount options are depending on the NFS server. 
@@ -154,10 +152,9 @@ nfs1:/s/c11/platform /mnt/platform nfs4 rw,noac,sync,default 0 0 .PP Note: The NFS share might be monitored, but not mounted/umounted by the HA cluster. See ocf_suse_SAPCMControlZone(7) for details. - .PP \fB* Ping cluster resource for checking connectivity.\fR - +.PP Below is an example of an optional ping resource for checking connectivity to the outer world. If the nodes have only one network interface, shared between HA cluster and application, this measure may not improve availability. diff --git a/man/ocf_suse_SAPCMControlZone.7 b/man/ocf_suse_SAPCMControlZone.7 index 4b721e4..5a0f9ae 100644 --- a/man/ocf_suse_SAPCMControlZone.7 +++ b/man/ocf_suse_SAPCMControlZone.7 @@ -1,6 +1,6 @@ .\" Version: 0.1 .\" -.TH ocf_suse_SAPCMControlZone 7 "18 Mar 2024" "" "SAPCMControlZone" +.TH ocf_suse_SAPCMControlZone 7 "15 Apr 2024" "" "SAPCMControlZone" .\" .SH NAME .\" @@ -14,36 +14,92 @@ SAPCMControlZone \- Manages Convergent Mediation ControlZone platform services f .\" .SH DESCRIPTION .\" -SAPCMControlZone is a resource agent for managing the Convergent Mediation (CM) -ControlZone platform and UI for a single instance as HA resources. +\fBOverview\fP +.PP +SAPCMControlZone is a resource agent (RA) for managing the Convergent Mediation +(CM) ControlZone platform and UI for a single instance as HA resources. .PP The CM central ControlZone platform is responsible for providing services to -other instances. Several platform containers may exist in a CM system, for high +other CM instances. Several platform containers may exist in a CM system, for high availability, but only one is active at a time. .\" see https://infozone.atlassian.net/wiki/spaces/MD9/pages/4863840/Terminology The CM central ControlZone UI is used to query, edit, import, and export data. .\" see https://infozone.atlassian.net/wiki/spaces/MD83/pages/5966420/3.+Web+UI -This ControlZone services can be handled as active/passive resources. 
+.PP +The SAPCMControlZone RA manages platform services as active/passive resources. +The RA relies on the mzsh command of ControlZone as interface. +These calls are used: +.PP +.RS 2 +- mzsh startup -f \fISERVICE\fP +.br +- mzsh status \fISERVICE\fP +.br +- mzsh shutdown \fISERVICE\fP +.RE +.PP +Currently supported services are "platform" and "ui". +.\" TODO output +Please see also the REQUIREMENTS section below. +.PP +\fBFilesystems\fP .PP NFS shares with work directories can be mounted statically on all nodes. The -HA cluster does not need to control that filesystems. +HA cluster does not need to control those filesystems. See also manual page +SAPCMControlZone_basic_cluster(7). .PP -The resource agent uses the following interface provided by the ControlZone -command shell: +The ControlZone software and Java runtime environment can be installed into a +central NFS share, or into the cluster node´s local filesystems, or both. Again, +the HA cluster does not need to control those filesystems. The SAPCMControlZone +RA offers three ways for managing ControlZone services: .PP -mzsh startup \fISERVICE\fP +.RS 2 +- calling mzsh always from central shared NFS +.br +- calling mzsh always from cluster node´s local filesystem +.br +- calling mzsh startup/shutdown centrally, but mzsh status locally +.RE .PP -mzsh status -.\" TODO mzsh status \fISERVICE\fP +Another option would be to install the ControlZone software into a cluster +managed filesystem on shared storage. This might fit for on-premise systems +backed by SAN storage infrastructure. From SAPCMControlZone RA´s perspective +this would look like node´s local filesystem. We do not discuss storage details +here. .PP -mzsh shutdown \fISERVICE\fP +\fBBest Practice\fP .PP -mzsh kill \fISERVICE\fP +* Use two independent corosync rings, at least one of them on bonded network. +Resulting in at least three physical links. Unicast is preferred. .PP -Currently supported services are "platform" and "ui". 
-.\" TODO output +* Use three Stonith Block Devices (SBD), shared LUNs across all nodes on all sites. +Of course, together with hardware watchdog. .PP -Please see also the REQUIREMENTS section below. +* Align all timeouts in the Linux cluster with the timeouts of the underlying +infrastructure - particularly network and storage. +.PP +* Prefer cluster node´s local filesystem over NFS, whenever possible. +.PP +* Check the installation of OS and Linux cluster on all nodes before doing any +functional tests. +.PP +* Carefully define, perform, and document tests for all scenarios that should be +covered, as well as all maintenance procedures. +.PP +* Test ControlZone features without Linux cluster before doing the overall +cluster tests. +.PP +* Test basic Linux cluster features without ControlZone before doing the overall +cluster tests. +.PP +* Be patient. For detecting the overall ControlZone status, the Linux cluster +needs a certain amount of time, depending on the ControlZone services and the +configured intervals and timeouts. +.PP +* Before doing anything, always check for the Linux cluster's idle status, +left-over migration constraints, and resource failures as well as the +ControlZone status. +Please see also manual page SAPCMControlZone_maintenance_examples(7). .PP .\" .SH SUPPORTED PARAMETERS @@ -52,7 +108,7 @@ This resource agent supports the following parameters: .PP \fBUSER\fP .RS 4 -OS user who calls mzsh, owner of $MZ_HOME. +OS user who calls mzsh, owner of $MZ_HOME (might be different from $HOME). .br Optional. Unique, string. Default value: "mzadmin". .RE @@ -68,7 +124,7 @@ Optional. Unique, [ platform | ui ]. Default value: "platform". .RS 4 Path to mzsh. Could be one or two full paths. If one path is given, that path is used for all actions. In case two paths are given, the first one is used for monitor -actions, the second one is used for start/stop/kill actions. If two paths are given, +actions, the second one is used for start/stop actions. 
If two paths are given, the first needs to be on local disk, the second needs to be on the central NFS share with the original CM ControlZone installation. Two paths are separated by a colon (:). The mzsh contains settings that need to be consistent with MZ_HOME and JAVA_HOME. @@ -78,49 +134,47 @@ Please refer to Convergent Mediation product documentation for details. Optional. Unique, string. Default value: "/usr/bin/mzsh". .RE .PP -\fBMZ_HOME\fP +\fBMZHOME\fP .RS 4 -Path to Convergent Mediation installation dirctory, owned by the mz-user. +Path to CM ControlZone installation directory, owned by the mz-user. Could be one or two full paths. If one path is given, that path is used for all actions. In case two paths are given, the first one is used for monitor actions, -the second one is used for start/stop/kill actions. If two paths are given, the +the second one is used for start/stop actions. If two paths are given, the first needs to be on local disk, the second needs to be on the central NFS share with the original CM ControlZone installation. Two paths are separated by a colon (:). -(Not yet implemented) .br .\" TODO default /opt/cm9/ ? Optional. Unique, string. Default value: "/opt/cm/". .RE .PP -\fBJAVA_HOME\fP +\fBJAVAHOME\fP .RS 4 -Path to Java virtual machine used by mzsh. +Path to Java virtual machine used for CM ControlZone. Could be one or two full paths. If one path is given, that path is used for all actions. In case two paths are given, the first one is used for monitor actions, -the second one is used for start/stop/kill actions. If two paths are given, the +the second one is used for start/stop actions. If two paths are given, the first needs to be on local disk, the second needs to be on the central NFS share with the original CM ControlZone installation. Two paths are separated by a colon (:). -(Not yet implemented) .br .\" TODO default /opt/cm9/sapmachine17 or OS $JAVA_HOME ? Optional. Unique, string. 
Default value: "/usr/lib64/jvm/jre-17-openjdk". .RE .PP -\fBMZ_PLATFORM\fP +\fBMZPLATFORM\fP .RS 4 -URL used by mzsh for connecting to Convergent Mediation components. +.\" TODO Could be one or two URLs. ... +URL used by mzsh for connecting to CM ControlZone services. Should usually not be changed. The service´s virtual hostname or virtual IP -address managed by the cluster must never be used here. -(Not yet implemented) +address managed by the cluster must never be used for RA monitor actions. .br Optional. Unique, string. Default value: "http://localhost:9000". .RE .PP \fBCALL_TIMEOUT\fP .RS 4 -Define timeout how long calls to the ControlZone platform for checking the +Define timeout how long calls to the ControlZone service for checking the status can take. If the timeout is reached, the return code will be 124. If you increase this timeout for ControlZone calls, you should also adjust the monitor operation timeout of your Linux cluster resources. @@ -154,23 +208,33 @@ This resource agent supports the following actions (operations): \fBstart\fR .RS 4 Starts the ControlZone platform resource. -.br +If the mzsh startup call fails, the RA tries twice. Timeout might be adapted to match expected application timing. +The RA start timeout relates to the ControlZone component property +term.default.startup.timeout, which defaults to 180 seconds. Suggested minimum timeout: 120\&. .RE .PP \fBstop\fR .RS 4 Stops the ControlZone platform resource. +If the mzsh shutdown call fails, the RA tries twice. Timeout might be adapted to match expected application timing. -Suggested minimum timeout: 120\&. +.\" TODO The RA stop timeout relates to the ControlZone component property +.\" TODO, which defaults to 180 seconds. +.\" For maximum patience, the RA stop timeout would be TODO +Suggested minimum timeout: 120\&, default/required action on-fail=fence\&. .RE .PP \fBmonitor\fR .RS 4 Regularly checks the ControlZone platform resource status. 
+If the mzsh status call fails, the RA tries twice. Timeout might be adapted to be greater than expected infrastructure timeouts. -Suggested minimum timeout: 120, suggested interval: 120, +The RA monitor timeout also relates to the ControlZone component property +pico.rcp.timeout, which defaults to 60 seconds. +For maximum patience with this component, the RA monitor timeout would be 140 +(60+10+60+10). Suggested minimum timeout: 120, suggested interval: 120, suggested action on-fail=restart\&. .RE .PP @@ -208,7 +272,10 @@ be logged if CALL_TIMEOUT has been exceeded. Also log entries are written, which can be scanned by using a pattern like "SAPCMControlZone.*RA.*rc=[1-7,9]" for errors. Regular operations might be found with "SAPHanaControlZone.*RA.*rc=0". See SUSE TID 7022678 for maximum RA tracing. -.RE +.PP +The RA also logs mzsh return codes. For those codes, please look for the respective +functions at +https://infozone.atlassian.net/wiki/spaces/MD91/pages/23375910/Always+Available .PP .\" .SH EXAMPLES @@ -216,6 +283,21 @@ See SUSE TID 7022678 for maximum RA tracing. Configuration and basic checks for ControlZone platform resources in Linux clusters. See also man page SAPCMControlZone_maintenance_examples(7). 
.PP +\fB* Example .bashrc\fR +.PP +TODO +MZ_HOME and JAVA_HOME are inherited from RA +.PP +.RS 2 +# MZ_PLATFORM, MZ_HOME, JAVA_HOME are set by HA RA +.br +export MZ_PLATFORM=${RA_MZ_PLATFORM:-"http://localhost:9000"} +.br +export MZ_HOME=${RA_MZ_HOME:-"/opt/cm9/c11"} +.br +export JAVA_HOME=${RA_JAVA_HOME:-"/opt/cm9/c11/sapmachine17"} +.RE +.PP \fB* Example configuration for resource group with ControlZone platform and IP address.\fR .PP A ControlZone platform resoure rsc_cz_C11 is configured, handled by OS user @@ -239,9 +321,9 @@ primitive rsc_cz_C11 ocf:suse:SAPCMControlZone \\ .br MZSHELL=/opt/cm9/c11/bin/mzsh:/usr/sap/c11/bin/mzsh \\ .br - MZ_HOME=/opt/cm9/c11/:/usr/sap/c11/ \\ + MZHOME=/opt/cm9/c11/:/usr/sap/c11/ \\ .br - JAVA_HOME=/opt/cm9/c11/sapmachine17:/usr/sap/c11/sapmachine17 \\ + JAVAHOME=/opt/cm9/c11/sapmachine17:/usr/sap/c11/sapmachine17 \\ .br op monitor interval=120 timeout=120 on-fail=restart \\ .br @@ -301,7 +383,34 @@ colocation col_with_cz 2000: rsc_ui_C11:Started grp_cz_C11:Started .RE .PP Note: Instead of defining order and colocation, the resource rsc_ui_C11 might be -just added to the resource group grp_cz_C11. This may impact the platform in some situations. +just added to the resource group grp_cz_C11. This may impact the platform in some +situations. +.PP +\fB* Optional loadbalancer resource for specific environments.\fR +.PP +In some environments a loadbalancer is used for managing access to the virtual +IP addres. Thus a respective resource agent might be needed. The resource might +be grouped with the IPaddr2 resoure, and starts just after the IPaddr2. +In the example at hand azure-lb is the loadbalancer RA, 47011 is the used port. +See also man page ocf_heartbeat_azure-lb(7). 
+.PP +.RS 2 +primitive rsc_lb_C11 azure-lb \\ +.br + params port=47011 \\ +.br + op monitor timeout=20 interval=10 \\ +.br + op_params depth=0 \\ +.br + op start timeout=20 \\ +.br + op stop timeout=20 +.PP +group grp_cz_C11 \\ +.br + rsc_ip_C11 rsc_lb_C11 rsc_cz_C11 +.RE .PP \fB* Optional Filesystem resource for monitoring NFS shares.\fR .PP @@ -311,11 +420,11 @@ ControlZone application itself. Client-side write caching has to be disabled. .PP A Filesystem resource is configured for a bind-mount of the real NFS share. This resource is grouped with the ControlZone platform and IP address. In case -of filesystem failures, the whole group gets restarted. +of filesystem failures, the node gets fenced. No mount or umount on the real NFS share is done. Example for the real NFS share is /mnt/platform/check/, example for the bind-mount is /mnt/check/. Both mount points have to be created before the -cluster resource is activated. +cluster resource is activated. See also man page SAPCMControlZone_basic_cluster(7), ocf_heartbeat_Filesystem(7) and nfs(5). .PP @@ -326,16 +435,14 @@ primitive rsc_fs_C11 ocf:heartbeat:Filesystem \\ .br fstype=nfs4 options=bind,rw,noac,sync,defaults \\ .br - op monitor interval=120 timeout=120 on-fail=restart \\ + op monitor interval=60 timeout=120 on-fail=fence \\ .br op_params OCF_CHECK_LEVEL=20 \\ .br op start timeout=120 \\ .br op stop timeout=120 -.RE .PP -.RS 2 group grp_cz_C11 \\ .br rsc_fs_C11 rsc_ip_C11 rsc_cz_C11 @@ -352,7 +459,15 @@ Resource is rsc_cz_C11, resource group is grp_C11. \fB* Search for log entries of SAPCMControlZone, show errors only.\fR .PP .RS 2 -# grep "SAPCMControlZone.*RA.*rc=[1-7,9]" /var/log/messages +# grep "SAPCMControlZone.*rc=[1-7,9]" /var/log/messages +.RE +.PP +\fB* Show log entry of one specific SAPCMControlZone run.\fR +.PP +PID of run is 8558. +.PP +.RS 2 +# grep "SAPCMControlZone.*\\[8558\\]" /var/log/messages .RE .PP \fB* Show and delete failcount for resource.\fR @@ -368,8 +483,20 @@ and for testing. 
.PP \fB* Manually trigger a SAPCMControlZone probe action.\fR .PP +USER is mzadmin, SERVICE is platform, MZSHELL is /usr/sap/c11/bin/mzsh . +.PP .RS 2 -# OCF_ROOT=/usr/lib/ocf/ \\ +# OCF_RESKEY_USER=mzadmin \\ +.br +OCF_RESKEY_SERVICE=platform \\ +.br +OCF_RESKEY_MZSHELL="/usr/sap/c11/bin/mzsh" \\ +.br +OCF_RESKEY_MZHOME="/usr/sap/c11" \\ +.br +OCF_RESKEY_JAVAHOME="/usr/sap/sapmachine17" \\ +.br +OCF_ROOT=/usr/lib/ocf/ \\ .br OCF_RESKEY_CRM_meta_interval=0 \\ .br @@ -418,15 +545,20 @@ Note: Understand the impact before trying. /usr/lib/ocf/resource.d/suse/SAPCMControlZone the resource agent .TP -.\" TODO two paths +$HOME/.bashrc, e.g. /home/mzadmin/.bashrc +the mzadmin´s .bashrc, defining JAVA_HOME and MZ_HOME +.TP $MZ_HOME, e.g. /opt/cm/ -the installation directory of a ControlZone service +the installation directory of a CM ControlZone service .TP $MZ_HOME/bin/mzsh -the default mzshell, used as API for managing ControlZone components, contains paths and URL +the default mzshell, used as API for managing CM ControlZone services, contains paths and URL +.TP +$MZ_HOME/log/ +path to logfiles of mzsh as well as platform and UI .TP $MZ_HOME/tmp/ -temporary files of a ControlZone service +temporary files and lock files of platform and UI .TP $JAVA_HOME the JAVA virtual machine, used by mzsh @@ -437,18 +569,24 @@ the JAVA virtual machine, used by mzsh .SH REQUIREMENTS .\" * Convergent Mediation ControlZone version 9.0.0.0 or higher is installed and -configured on both cluster nodes. The software is installed once into a shared -NFS filesystem. Then binaries and configuration are copied into both cluster -nodes´ local filesystems. Finally the local configuration has to be adjusted. -Please refer to Convergent Mediation documentation for details. +configured on both cluster nodes. Either the software is installed once into a +shared NFS filesystem and then binaries and configuration are copied into both +cluster nodes´ local filesystems. 
Or the software is installed per node directly. +However, finally the local configuration has to be adjusted. Please refer to +Convergent Mediation documentation for details. +.PP +* CM ControlZone is configured identically on both cluster nodes. User, path +names and environment settings are the same. .PP * Only one ControlZone instance per Linux cluster. .PP -* Linux shell of the mz-user (e.g. "mzadmin") is /bin/bash. +* Linux shell of the mzadmin user is /bin/bash. +.PP +* The mzadmin´s .bashrc inherits MZ_HOME and JAVA_HOME from SAPCMControlZone RA. .PP -* When called by resource agent, mzsh connnects to Convergent Mediation components -at localhost. The service´s virtual hostname or virtual IP address managed by -the cluster must never be used when called by RA. +* When called by the resource agent, mzsh connects to CM ControlZone services +via network. The service´s virtual hostname or virtual IP address managed by the +cluster must never be used when called by RA monitor actions. .PP * Technical users and groups are defined locally in the Linux system. If users are resolved by remote service, local caching is neccessary. Substitute user @@ -461,8 +599,9 @@ cluster nodes and services are resolved locally in the Linux system. * Strict time synchronization between the cluster nodes, e.g. NTP. All nodes of a cluster have configured the same timezone. .PP -* Needed NFS shares (e.g. /usr/sap/) are mounted statically or by automounter. No client-side write caching. File locking might be configured for -application needs. +* Needed NFS shares (e.g. /usr/sap/) are mounted statically or by automounter. +No client-side write caching. File locking might be configured for application +needs. .PP * The RA monitoring operations have to be active. .PP @@ -479,10 +618,10 @@ are done. The Linux cluster does not prevent from administrative mistakes. However, if the Linux cluster detects the application running at both sites in parallel, it will stop both and restart one. 
.PP -* Interface for the RA to the ControlZone platform is the command mzsh. The -mzsh should be accessed on the cluster nodes´ local filesystems. -The mzsh is called with the arguments startup, shutdown, status and kill. Its -output is parsed by the RA. Thus the command and its output needs to be stable. +* Interface for the RA to the ControlZone services is the command mzsh. Ideally, +the mzsh should be accessed on the cluster nodes´ local filesystems. +The mzsh is called with the arguments startup, shutdown and status. Its output +is parsed by the RA. Thus the command and its output needs to be stable. .PP * The mzsh is called on the active node with a defined interval for regular resource monitor operations. It also is called on the active or passive node in diff --git a/ra/SAPCMControlZone b/ra/SAPCMControlZone index 4fc5bb3..8f65a1c 100644 --- a/ra/SAPCMControlZone +++ b/ra/SAPCMControlZone @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/bash # # SAPCMControlZone # @@ -29,15 +29,38 @@ # OCF_RESKEY_VERBOSE_STATUS - optional parameter, call mz with "verbose output" (default no) # ####################################################################### -SAPCMControlZoneVersion="0.1.0" -logger --id -t "SAPCMControlZone" "SAPCMControlZone $SAPCMControlZoneVersion ($*)" +SAPCMControlZoneVersion="0.3.1" +export myBASH="/usr/bin/bash" +# +# DONE: PRIO1: get RA_MZ_PLATFORM, RA_JAVA_HOME AND RA_MZ_HOME from cluster config +export RA_MZ_PLATFORM="http://localhost:9000" +export RA_JAVA_HOME="/usr/lib64/jvm/jre-17-openjdk" +export RA_MZ_HOME="/opt/cm" + +export logger_pid="$$" +# +# all BASH-REGEX to be case insensitive (no case match) +shopt -s nocasematch # # Initialization: # raType="mz_" +# +# fallback, if OCF variables are not set (before sourcing the ocf-shellfuncts) +OCF_ROOT="${OCF_ROOT:-/usr/lib/ocf}" OCF_FUNCTIONS_DIR="${OCF_FUNCTIONS_DIR:-${OCF_ROOT}/lib/heartbeat}" +OCF_SUCCESS=0 +OCF_ERR_GENERIC=1 +OCF_ERR_ARGS=2 +OCF_ERR_UNIMPLEMENTED=3 +OCF_ERR_PERM=4 +# shellcheck 
disable=SC2034 +OCF_ERR_INSTALLED=5 +OCF_ERR_CONFIGURED=6 +OCF_NOT_RUNNING=7 # shellcheck source=/dev/null source "${OCF_FUNCTIONS_DIR}/ocf-shellfuncs" +ocf_log INFO "Version $SAPCMControlZoneVersion ($*)" # init a minimum set of variables, if the variable are not initialized so far OCF_SUCCESS="${OCF_SUCCESS:-0}" # @@ -52,49 +75,219 @@ OCF_SUCCESS="${OCF_SUCCESS:-0}" function mz_component_startup() { - local user="$1" component="$2" rc=0 - # TODO PRIO1: set env for variables JAVA_HOME and MZ_HOME - # su - v3zadm -c "JAVA_HOME=/opt/cm/V3Z/sapmachine17 MZ_HOME=/opt/cm/V3Z/CMP01/CM /opt/cm/V3Z/CMP01/CM/bin/mzsh startup platform" - # TODO PRIO1: out for already started component is "platform is already running" - # TODO PRIO1: Output like "FAILED\nStartable platform exited unexpectedly" + local user="$1" component="$2" rid="$3" lid="$4" rc=0 + # DONE PRIO1: set env for variables RA_JAVA_HOME and RA_MZ_HOME + # DONE PRIO1: output for already started component is "platform is already running" - implemented, checked with project + # DONE PRIO1: Output like "FAILED\nStartable platform exited unexpectedly"; interpreted as ERROR # TODO PRIO1: Tested, but not committed API: RC==0 OK - su - "$user" -c "$mz_shell_central startup $component" | grep -q "Starting $component\.*done."; rc="$?" 
+ while [[ "$#" -gt 0 ]]; do + case "$1" in + --user=* ) # user handling + user=${1#*=} + ;; + --component=* ) # component name + component=${1#*=} + ;; + --rid=* ) # resource id to log + rid=${1#*=} + ;; + --lid=* ) # logger id to be used + lid=${1#*=} + ;; + --mz_platform=* ) # set RA_MZ_PLATFORM for CM + RA_MZ_PLATFORM=${1#*=} + export RA_MZ_PLATFORM + ;; + --java_home=* ) # set RA_JAVA_HOME for CM + RA_JAVA_HOME=${1#*=} + export RA_JAVA_HOME + ;; + --mz_home=* ) # set mz_home for CM + RA_MZ_HOME=${1#*=} + export RA_MZ_HOME + ;; + esac + shift + done + # using option "-f" for startup following mail "Convergent Mediation SUSE HA Status 2024-04-15" from software vendor + result=$(su -w RA_MZ_HOME,RA_MZ_PLATFORM,RA_JAVA_HOME - "$user" -c "$mz_shell_central startup -f $component"); rc_call="$?" + # DONE: PRIO2: Try to get rid of "grep" (bash-regmap) + regex_starting="Starting $component\.*done." + regex_already="$component is already running" + case "$rc_call" in + 0 ) # success + rc=0 + ;; + 1 ) # failed + rc=1 + ;; + 102 ) # failed (timeout on callback) + rc=1 + ;; + 103 ) # interrupted + rc=1 + ;; + 104 ) # failed critical + rc=1 + ;; + * ) # unknown + rc=1 + ;; + esac + if [[ ( "$result" =~ $regex_starting ) || ( "$result" =~ $regex_already ) ]]; then + logger --id="$lid" -t "SAPCMControlZone($rid)" "INFO: mzsh ($mz_shell_central) got rc_call=$rc_call" + else + logger --id="$lid" -t "SAPCMControlZone($rid)" "INFO: mzsh ($mz_shell_central) got rc_call=$rc_call; result=$result" + fi return "$rc" } -#export -f mz_component_startup +export -f mz_component_startup function mz_component_shutdown() { - local user="$1" component="$2" rc=0 - # TODO PRIO1: set env for variables JAVA_HOME and MZ_HOME + local user="mzadmin" component="platform" rid="n/a" lid="$$" rc=0 RA_JAVA_HOME="" RA_MZ_HOME="" RA_MZ_PLATFORM="" + # DONE PRIO1: set env for variables RA_JAVA_HOME and RA_MZ_HOME # TODO PRIO1: How to treat output like "Shutting down platform...\nShutdown failed. 
Trying kill instead.\ndone." # DONE PRIO1: How to treat output with unequal 3 dots (Shutting down platform....done.) # DONE PRIO1: Output fror already stopped component is " is not running" # TODO PRIO1: Output like "Starting ui...FAILED" # TODO PRIO1: Output like "platfoam: no such server process" # TODO PRIO1: Tested, but not committed API: RC==0 OK - su - "$user" -c "$mz_shell_central shutdown $component" | grep -q -e "Shutting down $component\.*done." -e "$component is not running"; rc="$?" + while [[ "$#" -gt 0 ]]; do + case "$1" in + --user=* ) # user handling + user=${1#*=} + ;; + --component=* ) # component name + component=${1#*=} + ;; + --rid=* ) # resource id to log + rid=${1#*=} + ;; + --lid=* ) # logger id to be used + lid=${1#*=} + ;; + --mz_platform=* ) # set RA_MZ_PLATFORM for CM + RA_MZ_PLATFORM=${1#*=} + export RA_MZ_PLATFORM + ;; + --java_home=* ) # set RA_JAVA_HOME for CM + RA_JAVA_HOME=${1#*=} + export RA_JAVA_HOME + ;; + --mz_home=* ) # set mz_home for CM + RA_MZ_HOME=${1#*=} + export RA_MZ_HOME + ;; + esac + shift + done + result=$(su -w RA_MZ_HOME,RA_MZ_PLATFORM,RA_JAVA_HOME - "$user" -c "$mz_shell_central shutdown $component"); rc_call="$?" + regex_shuttingdown="Shutting down $component\.*done." + regex_notrunning="$component is not running"; rc="$?" 
+ case "$rc_call" in + 0 ) # success + rc=0 + ;; + 1 ) # not found + rc=1 + ;; + 3 ) # interrupted + rc=1 + ;; + 4 ) # failed + rc=1 + ;; + * ) # unknown + rc=1 + ;; + esac + if [[ ( "$result" =~ $regex_shuttingdown ) || ( "$result" =~ $regex_notrunning ) ]]; then + logger --id="$lid" -t "SAPCMControlZone($rid)" "INFO: mzsh ($mz_shell_central) got rc_call=$rc_call" + else + logger -p err --id="$lid" -t "SAPCMControlZone($rid)" "ERROR: mzsh ($mz_shell_central) got rc_call=$rc_call; result=$result" + fi return "$rc" } -#export -f mz_component_shutdown +export -f mz_component_shutdown -# shellcheck disable=SC2317 function mz_component_status() { - local user="$1" component="$2" rc=0 - # TODO PRIO1: How to treat output like "platform is running without ..." - # TODO PRIO1: set env for variables JAVA_HOME and MZ_HOME + # TODO DONE: improved and more detailled parameter handling + # TODO PRIO2: add --mz_shell in parameter handling + local user="mzadmin" component="platform" rid="n/a" lid="$$" rc=0 RA_JAVA_HOME="" RA_MZ_HOME="" RA_MZ_PLATFORM="" timeout=60 + while [[ "$#" -gt 0 ]]; do + case "$1" in + --user=* ) # user handling + user=${1#*=} + ;; + --component=* ) # component name + component=${1#*=} + ;; + --timeout=* ) # timeout + timeout=${1#*=} + ;; + --rid=* ) # resource id to log + rid=${1#*=} + ;; + --lid=* ) # logger id to be used + lid=${1#*=} + ;; + --mz_platform=* ) # set RA_MZ_PLATFORM for CM + RA_MZ_PLATFORM=${1#*=} + export RA_MZ_PLATFORM + ;; + --java_home=* ) # set RA_JAVA_HOME for CM + RA_JAVA_HOME=${1#*=} + export RA_JAVA_HOME + ;; + --mz_home=* ) # set mz_home for CM + RA_MZ_HOME=${1#*=} + export RA_MZ_HOME + ;; + esac + shift + done + # TODO PRIO1: How to treat output like "platform is running without ..." 
+ # DONE PRIO1: set env for variables RA_JAVA_HOME and RA_MZ_HOME # TODO PRIO1: Tested, but not committed API: status does also work - # TODO PRIO1: Tested, but not committed API: RC==0 UP, RC==1 DEGRADED; RC==2 DOWN (one or multiple), OTHER: ERROR (FATAL) - su - "$user" -c "$mz_shell_local status $component" | grep -q "$component is running"; rc="$?" + # DONE PRIO1: Tested, but not committed API: RC==0 UP, RC==1 DEGRADED; RC==2 DOWN (one or multiple), OTHER: ERROR (FATAL) + # have been committed in a meeting that we could use the RCs + result=$(timeout "$timeout" su -w RA_MZ_HOME,RA_MZ_PLATFORM,RA_JAVA_HOME - "$user" -c "$mz_shell_local status $component"); rc_call="$?" + regex_isrunning="$component is running" + regex_isnotrunning="$component is not running" + case "$rc_call" in + 0 ) # success + rc=0 + ;; + 1 ) # degraded + rc=1 + ;; + 2 ) # down + rc=1 + ;; + * ) # unknown + rc=1 + ;; + esac + if [[ ( "$result" =~ $regex_isrunning ) ]]; then + logger --id="$lid" -t "SAPCMControlZone($rid)" "INFO: mzsh ($mz_shell_local) component running; got rc_call=$rc_call" + else + if [[ ( "$result" =~ $regex_isnotrunning ) ]]; then + logger --id="$lid" -t "SAPCMControlZone($rid)" "INFO: mzsh ($mz_shell_local) component not running; got rc_call=$rc_call; result=$result" + else + logger --id="$lid" -t "SAPCMControlZone($rid)" "INFO: mzsh ($mz_shell_local) component status unclear; got rc_call=$rc_call; result=$result" + fi + fi return "$rc" } export -f mz_component_status + function mz_call() { # TODO: 'timeout' default value (non zero), see OCF_RESKEY_CALL_TIMEOUT? 
- local rc="$OCF_ERR_GENERIC" timeout=3000, user="$mz_user", action="status", component="platform" call_rc=0 + local rc="$OCF_ERR_GENERIC" timeout=60, user="$mz_user", action="status", component="platform" call_rc=0 while [ $# -gt 0 ]; do case "$1" in --user=* ) user=${1#*=} ;; @@ -104,7 +297,6 @@ function mz_call() esac shift done - case "$action" in status ) export mz_shell_local mz_shell_central component user @@ -115,14 +307,17 @@ function mz_call() # needs to switch-user to while [[ "$retry_count" -lt "$max_retry" ]]; do (( retry_count++ )) - timeout "$timeout" bash -c "mz_component_status $user $component"; call_rc="$?" + # TODO PRIO1: improved and more detailled parameter handling + # TODO PRIO1: add also mz_shell and java_home + mz_component_status --user="$user" --component="$component" --mz_platform="${mz_platform}" --mz_home="${mz_home_local}" --java_home="${java_home_local}" --rid="${OCF_RESOURCE_INSTANCE}" --lid="${logger_pid}" --timeout="$timeout"; call_rc="$?" case "$call_rc" in - 0 ) ocf_log info "mz_call: $component is running" + 0 ) ocf_log INFO "mz_call: $component is running" rc="$OCF_SUCCESS" break;; - 1 ) ocf_log info "mz_call: $component is NOT running" + 1 ) # also for clear "not running" we need to repeat the status call (have seen short outages with ui in the test cluster) + ocf_log INFO "mz_call: $component is NOT running" rc="$OCF_NOT_RUNNING";; - * ) ocf_log info "mz_call: $component is unclear (call_rc=$call_rc)" + * ) ocf_log INFO "mz_call: $component is unclear (call_rc=$call_rc)" rc="$OCF_ERR_GENERIC";; esac sleep "$retry_sleep" @@ -135,12 +330,12 @@ function mz_call() retry_count=0 while [[ "$retry_count" -lt "$max_retry" ]]; do (( retry_count++ )) - if mz_component_startup "$user" "$component"; then - ocf_log info "mz_call: $component is started" + if mz_component_startup --user="$user" --component="$component" --mz_platform="${mz_platform}" --mz_home="${mz_home_central}" --java_home="${java_home_central}" 
--rid="${OCF_RESOURCE_INSTANCE}" --lid="${logger_pid}"; then + ocf_log INFO "mz_call: $component is started" rc="$OCF_SUCCESS" break else - ocf_log info "mz_call: $component is NOT started" + ocf_log INFO "mz_call: $component is NOT started" rc="$OCF_NOT_RUNNING" fi sleep "$retry_sleep" @@ -154,38 +349,17 @@ function mz_call() while [[ "$retry_count" -lt "$max_retry" ]]; do (( retry_count++ )) # second try to stop mz component - if mz_component_shutdown "$user" "$component"; then - ocf_log info "mz_call: $component is stopped" + if mz_component_shutdown --user="$user" --component="$component" --mz_platform="${mz_platform}" --mz_home="${mz_home_central}" --java_home="${java_home_central}" --rid="${OCF_RESOURCE_INSTANCE}" --lid="${logger_pid}" ; then + ocf_log INFO "mz_call: $component is stopped" rc="$OCF_SUCCESS" break else - ocf_log info "mz_call: $component is still NOT stopped" + ocf_log INFO "mz_call: $component is still NOT stopped" rc="$OCF_ERR_GENERIC" fi sleep "$retry_sleep" done ;; - restart ) - # needs to switch-user to - if su - "$user" -c "$mz_shell_central restart $component" | grep -q "Starting $component\.*done."; then - rc="$OCF_SUCCESS" - else - rc="$OCF_ERR_GENERIC" - fi - ;; - kill ) - # needs to switch-user to - # TODO PRIO1: How to treat that kill outputs "platform not running" (not killing) - # This occures e.g. if the component is already down. - if su - "$user" -c "$mz_shell_central kill $component" | grep -qi "killing $component\.*done."; then - ocf_log info "mz_call: killing $component successful" - rc="$OCF_SUCCESS" - else - # TODO PRIO2: maybe killing processes (with OS command kill) directly as last try? 
- ocf_log info "mz_call: killing $component NOT successful" - rc="$OCF_ERR_GENERIC" - fi - ;; * ) # not implemented rc="$OCF_ERR_UNIMPLEMENTED" @@ -211,22 +385,44 @@ function mz_methods() function mz_init() { # TODO: init variables - declare -a mz_path_arr mz_user="${OCF_RESKEY_USER:-mzadmin}" mz_path_str="${OCF_RESKEY_MZSHELL:-/usr/bin/mzsh}" - # shellcheck disable=SC2206 - IFS=: mz_path_arr=( $mz_path_str ) - mz_shell_local="${mz_path_arr[0]}" - if [[ -n "${mz_path_arr[1]}" ]]; then - # first (0) path is local, second (1) is central - mz_shell_central="${mz_path_arr[1]}" + mz_home_str="${OCF_RESKEY_MZHOME:-/opt/cm/C11/CM}" + java_home_str="${OCF_RESKEY_JAVAHOME:-/opt/cm/C11/CM}" + mz_platform="${OCF_RESKEY_MZPLATFORM:-http://localhost:9000}" + regex_multivalue="^([^:]*):(.*)$" + ## mz_shell + # cut string into a first part and an optional second part. parts are separated by colon (':') + if [[ "$mz_path_str" =~ $regex_multivalue ]]; then + # found first:second + mz_shell_local="${BASH_REMATCH[1]}" + mz_shell_central="${BASH_REMATCH[2]}" + else + mz_shell_local="$mz_path_str" + mz_shell_central="$mz_path_str" + fi + ## mz_home + # cut string into a first part and an optional second part. parts are separated by colon (':') + if [[ "$mz_home_str" =~ $regex_multivalue ]]; then + # found first:second + mz_home_local="${BASH_REMATCH[1]}" + mz_home_central="${BASH_REMATCH[2]}" else - # first-and-only (0) path is local and central - mz_shell_central="$mz_shell_local" + mz_home_local="$mz_home_str" + mz_home_central="$mz_home_str" fi - # mz_shell="$mz_shell_local" - # shellcheck disable=SC2034 - mz_search_list="/usr/bin:/usr/sap/[A-Z][A-Z0-9][A-Z0-9]/CMP[0-9][0-9]/CM/bin/" + # DONE PRIO1: also handle arraxs for java_home + ## java_home + # cut string into a first part and an optional second part. 
parts are separated by colon (':') + if [[ "$java_home_str" =~ $regex_multivalue ]]; then + # found first:second + java_home_local="${BASH_REMATCH[1]}" + java_home_central="${BASH_REMATCH[2]}" + else + java_home_local="$java_home_str" + java_home_central="$java_home_str" + fi + # mz_search_list="/usr/bin:/usr/sap/[A-Z][A-Z0-9][A-Z0-9]/CMP[0-9][0-9]/CM/bin/" mz_component="${OCF_RESKEY_SERVICE:-platform}" # shellcheck disable=SC2034 # variable for future usage mz_shutdown_retries="${OCF_RESKEY_SHUTDOWN_RETRIES:+-r $OCF_RESKEY_SHUTDOWN_RETRIES}" # empty, if OCF_RESKEY_SHUTDOWN_RETRIES} is usnet, otherwise "-r $OCF_RESKEY_SHUTDOWN_RETRIES" @@ -249,6 +445,21 @@ function mz_meta_data() How long calls to the ControlZone platform for checking the status can take. If you increase this timeout, you should also adjust the operation timeouts of your Linux cluster resources. + + ControlZone specific JAVA_HOME directory + TBD + + + + ControlZone home directory + TBD + + + + ControlZone connection URL + ControlZone connection URL + + ControlZone shell Path to the CM ControlZone commandline shell to be called. Values: full path to the shell. The parameter could also take two different paths separarated by colon (:). In that case the first path must be the path of the "local" mzsh and the second must be the path of the "central/shared" mzsh. 
Format: PATH-LOCAL:PATH-CENTRAL @@ -291,41 +502,73 @@ function mz_meta_data() # shellcheck disable=SC2120 function mz_check_params() { - local rc="$OCF_SUCCESS" log_stderr="-s" - while [[ "$#" -gt 1 ]]; do - case "$1" in - --quiet ) log_stderr="" ;; - esac - done + local rc="$OCF_SUCCESS" if [[ -n "$OCF_RESKEY_USER" ]]; then if id "$OCF_RESKEY_USER" 1>/dev/null 2>/dev/null; then - logger --id "$log_stderr" -t "SAPCMControlZone" "User '$OCF_RESKEY_USER' exists" + ocf_log INFO "User '$OCF_RESKEY_USER' exists" else - logger --id "$log_stderr" -t "SAPCMControlZone" "User '$OCF_RESKEY_USER' does NOT exist" + ocf_log ERROR "User '$OCF_RESKEY_USER' does NOT exist" rc="$OCF_ERR_CONFIGURED" fi fi if [[ -n "$mz_shell_local" ]]; then if [[ -x "$mz_shell_local" ]]; then - logger --id "$log_stderr" -t "SAPCMControlZone" "Program '$mz_shell_local' exists and is executable" + ocf_log INFO "mz_shell_local '$mz_shell_local' exists and is executable" else - logger --id "$log_stderr" -t "SAPCMControlZone" "Program '$mz_shell_local' either does NOT exist or is NOT executable" + ocf_log ERROR "mz_shell_local '$mz_shell_local' either does NOT exist or is NOT executable" rc="$OCF_ERR_CONFIGURED" fi fi if [[ -n "$mz_shell_central" ]]; then if [[ -x "$mz_shell_central" ]]; then - logger --id "$log_stderr" -t "SAPCMControlZone" "Program '$mz_shell_central' exists and is executable" + ocf_log INFO "mz_shell_central '$mz_shell_central' exists and is executable" + else + ocf_log ERROR "mz_shell_central '$mz_shell_central' either does NOT exist or is NOT executable" + rc="$OCF_ERR_CONFIGURED" + fi + fi +# mz_home_local= + if [[ -n "$mz_home_local" ]]; then + if [[ -e "$mz_home_local" ]]; then + ocf_log INFO "mz_home_local '$mz_home_local' exists" + else + ocf_log ERROR "mz_home_local '$mz_home_local' does NOT exist" + rc="$OCF_ERR_CONFIGURED" + fi + fi + if [[ -n "$mz_home_central" ]]; then + if [[ -e "$mz_home_central" ]]; then + ocf_log INFO "mz_home_central '$mz_home_central' exists" else - 
logger --id "$log_stderr" -t "SAPCMControlZone" "Program '$mz_shell_central' either does NOT exist or is NOT executable" + ocf_log ERROR "mz_home_central '$mz_home_central' does NOT exist" rc="$OCF_ERR_CONFIGURED" fi fi +### +# java_home_local= + if [[ -n "$java_home_local" ]]; then + if [[ -e "$java_home_local" ]]; then + ocf_log INFO "java_home_local '$java_home_local' exists" + else + ocf_log ERROR "java_home_local '$java_home_local' does NOT exist" + rc="$OCF_ERR_CONFIGURED" + fi + fi + if [[ -n "$java_home_central" ]]; then + if [[ -e "$java_home_central" ]]; then + ocf_log INFO "java_home_central '$java_home_central' exists" + else + ocf_log ERROR "java_home_central '$java_home_central' does NOT exist" + rc="$OCF_ERR_CONFIGURED" + fi + fi +### + regex_platformorui="^(platform|ui)$" if [[ -n "$OCF_RESKEY_SERVICE" ]]; then - if [[ "$OCF_RESKEY_SERVICE" =~ ^(platform|ui)$ ]]; then - logger --id "$log_stderr" -t "SAPCMControlZone" "Service '$OCF_RESKEY_SERVICE' does match list ['platform', 'ui']" + if [[ "$OCF_RESKEY_SERVICE" =~ $regex_platformorui ]]; then + ocf_log INFO "Service '$OCF_RESKEY_SERVICE' matches list ['platform', 'ui']" else - logger --id "$log_stderr" -t "SAPCMControlZone" "Service '$OCF_RESKEY_SERVICE' does NOT match list ['platform', 'ui']" + ocf_log INFO "Service '$OCF_RESKEY_SERVICE' does NOT match list ['platform', 'ui']" rc="$OCF_ERR_CONFIGURED" fi fi @@ -376,6 +619,7 @@ function mz_monitor() # params: ACTION # globals: OCF_*(r), ra_rc(rw), $0(r) # +ra_rc="$OCF_SUCCESS" if [ "$#" != "1" ] then @@ -389,19 +633,24 @@ if [ "$ACTION" = "status" ]; then fi # These operations don't require OCF parameters to be set -logger --id -t "SAPCMControlZone" "SAPCMControlZone action=$ACTION" +ocf_log INFO "begin action $ACTION" case "$ACTION" in usage) mz_usage + ocf_log INFO "SAPCMControlZone end action $ACTION rc=${ra_rc}" exit "$OCF_SUCCESS";; methods) mz_methods + ocf_log INFO "SAPCMControlZone end action $ACTION rc=${ra_rc}" exit "$OCF_SUCCESS";; 
meta-data) mz_meta_data + ocf_log INFO "SAPCMControlZone end action $ACTION rc=${ra_rc}" exit "$OCF_SUCCESS";; notify) # just ignore + ocf_log INFO "SAPCMControlZone end action $ACTION rc=${ra_rc}" exit "$OCF_SUCCESS";; validate-all) mz_init mz_validate; ra_rc="$?" + ocf_log INFO "end action $ACTION rc=${ra_rc}" exit "$ra_rc" ;; esac @@ -414,12 +663,14 @@ then exit "$OCF_ERR_PERM" fi -# parameter check -"${raType}check_params" --quiet +# TODO PRIO2: Maybe add 'light' parameter check later ra_rc="$OCF_ERR_UNIMPLEMENTED" case "$ACTION" in start) - mz_start; ra_rc="$?";; + mz_check_params + mz_start; + ra_rc="$?" + ;; stop) mz_stop; ra_rc="$?";; monitor) @@ -439,5 +690,6 @@ case "$ACTION" in ra_rc="$OCF_ERR_UNIMPLEMENTED" ;; esac +ocf_log INFO "end action $ACTION rc=${ra_rc}" exit "${ra_rc}" # set ts=4 sw=4 sts=4 et diff --git a/samples/bashrc b/samples/bashrc new file mode 100644 index 0000000..8a0df4b --- /dev/null +++ b/samples/bashrc @@ -0,0 +1,7 @@ +# +# for JAVA_HOME, MZ_HOME and MZ_PLATFORM use the RA_-name-space variables +# with priority +# +export JAVA_HOME="${RA_JAVA_HOME:-/usr/sap/CM/java}" +export MZ_HOME="${RA_MZ_HOME:-/usr/sap/CM}" +export MZ_PLATFORM="${RA_MZ_PLATFORM:-http://localhost:9000}" diff --git a/sap-convergent-resource-agents.spec b/sap-convergent-resource-agents.spec index 13bd4af..991327b 100644 --- a/sap-convergent-resource-agents.spec +++ b/sap-convergent-resource-agents.spec @@ -16,7 +16,7 @@ # Name: sap-convergent-resource-agents -Version: 0.2.2 +Version: 0.3.0 Release: 0 Group: Productivity/Clustering/HA Summary: Resource agents to control the convergent mediation control zone diff --git a/test/bin/mzsh b/test/bin/mzsh index 153ffee..fe4969a 100755 --- a/test/bin/mzsh +++ b/test/bin/mzsh @@ -1,12 +1,36 @@ #!/usr/bin/bash #set -x -action="$1" -comp="$2" +# startup [ -e ] [ -f ] [ -q ] +# shutdown [ -q ] [ -f ] ... +# status [ -q ] [ -verbose ] ... 
-services="platform ui bla-bla blub-blub" +rc=99 -logger --id -t "mzsh" "mzsh version 0.0.3 - action=$action" +while [[ "$#" -gt 0 ]]; do + case "$1" in + startup | shutdown | status ) action="$1" ;; + -f ) # just be able to ingnore it + true + ;; + -q ) # just be able to ingnore it + true + ;; + -verbose ) # just be able to ingnore it + true + ;; + * ) comp="$1" ;; + esac + shift +done + +lid="$$" + +services="platform ui" + +logger --id "$lid" -t "mzsh" "mzsh version 0.2.6 - action=$action" +logger --id "$lid" -t "mzsh" "MZSH=$0 JAVA_HOME=$JAVA_HOME MZ_HOME=$MZ_HOME MZ_PLATFORM=$MZ_PLATFORM" +logger --id "$lid" -t "mzsh" "MZSH=$0 RA_JAVA_HOME=$JAVA_HOME RA_MZ_HOME=$MZ_HOME RA_MZ_PLATFORM=$MZ_PLATFORM" file_of_fail_base="/dev/shm/mzsh_fail_" file_of_status="/dev/shm/mzsh_status_" @@ -22,38 +46,59 @@ fi case "$action" in status ) - # TODO: also simulate anwer like ' is running without' + # TODO: also simulate answer like ' is running without' # TODO: rc-handling (still to be confirmed) # TODO: also take component as optional parameter (still to be confirmed) - # TODO: simuate already running: ' already running' (format still to be confirmed) - logger --id -t "mzsh" "mzsh process action=$action" + # TODO: simulate already running: ' already running' (format still to be confirmed) + logger --id "$lid" -t "mzsh" "mzsh process action=$action comp=$comp" if [[ "$is_fail" == "yes" ]]; then - echo "blala is not running" - echo "blubblub is not running" sleep 60 + rc=2 else - for srv in $services; do - if [ -e "${file_of_status}${srv}" ]; then - cat "${file_of_status}${srv}" - else - echo "$srv is not running" - fi - done + if [[ -n "$comp" ]]; then + if [ -e "${file_of_status}${comp}" ]; then + cat "${file_of_status}${comp}" + if grep -q "$comp is running" "${file_of_status}${comp}"; then + rc=0 + else + rc=2 + fi + else + echo "$comp is not running" + rc=2 + fi + else + for srv in $services; do + if [ -e "${file_of_status}${srv}" ]; then + cat "${file_of_status}${srv}" + 
if grep -q "$comp is running" "${file_of_status}${srv}"; then + rc=0 + else + rc=2 + fi + else + echo "$srv is not running" + rc=2 + fi + done + fi fi ;; startup ) # TODO: number of dots in 'starting %s...' can vary # TODO: simulate FAILURES (format of different output strings still to be confirmed) # TODO: rc-handling (still to be confirmed) - logger --id -t "mzsh" "mzsh process action=$action component=$comp" + logger --id "$lid" -t "mzsh" "mzsh process action=$action component=$comp" if [[ "$is_fail" == "no" ]]; then - printf "starting %s..." "$comp" + printf "Starting %s..." "$comp" sleep 10 echo "done." echo "$comp is running" > "${file_of_status}${comp}" + rc=0 else - printf "starting %s..." "$comp" + printf "Starting %s..." "$comp" sleep 30 + rc=1 fi ;; shutdown ) @@ -64,15 +109,17 @@ case "$action" in # Escalated to call kill # done. # TODO: rc-handling (still to be confirmed) - logger --id -t "mzsh" "mzsh process action=$action component=$comp" + logger --id "$lid" -t "mzsh" "mzsh process action=$action component=$comp" if [[ "$is_fail" == "no" ]]; then printf "Shutting down %s..." "$comp" sleep 5 echo "done." echo "$comp is not running" > "${file_of_status}${comp}" + rc=0 else printf "Shutting down %s..." "$comp" sleep 30 + rc=1 fi ;; esac @@ -80,5 +127,5 @@ esac # TODO: simulation of 'kill' -# meaningless rc -exit 99 +# meaningfull rc +exit "$rc"