diff --git a/redfish-exporter/.env b/redfish-exporter/.env
index 355b003..e98771f 100644
--- a/redfish-exporter/.env
+++ b/redfish-exporter/.env
@@ -6,14 +6,11 @@ METRICS_PORT="2112"
 USE_SSL="false"
 CERTFILE="path/to/certfile"
 KEYFILE="path/to/keyfile"
-SLURM_USER="slurm user here"
-SLURM_TOKEN="token string here, from secret when for real"
-SLURM_CONTROL_NODE="slurm control node IP:Port"
-
-TRIGGER_EVENTS="[\
-{\"Severity\":\"Fatal\",\"Action\":\"DrainNode\"},\
-{\"Severity\":\"Critical\",\"Action\":\"DrainNode\"}
-]"
+SLURM_CONTROL_NODE=""
+# List of regex patterns matched against a node's current drain reason; if any pattern matches, the node is not drained
+SLURM_DRAIN_EXCLUDE_REASON_LIST="AMD|Pensando|RebootNeeded"
+SLURM_SCONTROL_PATH="/usr/bin/scontrol"
+TLS_TIMEOUT="15"
 
 # Subscription (v1.5+)
 # SUBSCRIPTION_PAYLOAD="{\
@@ -28,11 +25,11 @@ TRIGGER_EVENTS="[\
 # Deprecated
 \",\"password\":\"\"}"
\ No newline at end of file
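SLURM_DRAIN_EXCLUDE_REASON_LIST is compiled as a single regular expression and matched against a node's current drain reason (see DrainNodeWithScontrol in slurm/slurm.go below). A minimal, self-contained sketch of that semantics; the sample reasons are fabricated:

```go
package main

import (
	"fmt"
	"regexp"
)

func main() {
	// The env value is one alternation; a hit on any pattern suppresses the drain.
	exclude := regexp.MustCompile("AMD|Pensando|RebootNeeded")
	for _, reason := range []string{
		"AMD: firmware update in progress",        // contains "AMD" -> matched, node left alone
		"Reboot needed: GPU Error Events: OAM CTF", // no pattern matches -> drain proceeds
	} {
		fmt.Printf("%-45q excluded=%v\n", reason, exclude.MatchString(reason))
	}
}
```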
diff --git a/redfish-exporter/config.go b/redfish-exporter/config.go
index 7e941d3..4c82f56 100644
--- a/redfish-exporter/config.go
+++ b/redfish-exporter/config.go
@@ -19,12 +19,15 @@ package main
 import (
 	"crypto/tls"
 	"encoding/json"
+	"fmt"
 	"log"
+	"net"
 	"os"
 	"strconv"
 	"strings"
 
 	"github.com/joho/godotenv"
+	"gopkg.in/yaml.v3"
 )
 
 const (
@@ -32,6 +35,7 @@
 	DefaultMetricsPort    = "2112"
 	DefaultUseSSL         = "false"
 	DefaultSeverityConfig = "Fatal,Critical,Informational"
+	NodeDrainPolicyFile   = "nodeDrainPolicy.json"
 )
 
 type Config struct {
@@ -49,28 +53,56 @@
 		CertFile string
 		KeyFile  string
 	}
-	SlurmToken          string
-	SlurmControlNode    string
-	SlurmUser           string
-	SubscriptionPayload SubscriptionPayload
-	RedfishServers      []RedfishServer
-	TriggerEvents       []TriggerEvent
-	PrometheusConfig    PrometheusConfig
-	context             *tls.Config
-	eventCount          int
-	dataBuffer          []byte
+	SlurmToken           string
+	SlurmControlNode     string
+	SlurmUser            string
+	SlurmScontrolPath    string
+	SlurmDrainExcludeStr string
+	SubscriptionPayload  SubscriptionPayload
+	RedfishServers       []RedfishServer
+	TriggerEvents        map[string]map[string][]EventInfo // map[Severity][MessageRegistry.MessageId][]EventInfo
+	PrometheusConfig     PrometheusConfig
+	context              *tls.Config
+	eventCount           int
+	dataBuffer           []byte
+	TlsTimeOut           string
+}
+
+type EventInfo struct {
+	UniqueString      string
+	Category          string
+	Subcategory       string
+	DrainReasonPrefix string
 }
 
 type TriggerEvent struct {
-	Severity string `json:"Severity"`
-	Action   string `json:"Action"`
+	Severity          string `json:"Severity"`
+	Action            string `json:"Action"`
+	Message           string `json:"Message"`
+	DrainReasonPrefix string `json:"DrainReasonPrefix"`
+}
+
+type TriggerEventsInfo struct {
+	Category          string `json:"Category"`
+	Subcategory       string `json:"Subcategory"`
+	MessageRegistry   string `json:"MessageRegistry"`
+	MessageId         string `json:"MessageId"`
+	UniqueString      string `json:"UniqueString"`
+	Severity          string `json:"Severity"`
+	DrainReasonPrefix string `json:"DrainReasonPrefix"`
+	Enable            bool   `json:"Enable"`
 }
 
 type PrometheusConfig struct {
 	Severity []string `json:"Severity"`
 }
 
-func setupConfig() Config {
+type target struct {
+	Targets []string          `yaml:"targets"`
+	Labels  map[string]string `yaml:"labels"`
+}
+
+func setupConfig(targetFile string) Config {
 	// Load .env file
 	err := godotenv.Load()
 	if err != nil {
@@ -119,20 +151,15 @@
 	AppConfig.SlurmToken = os.Getenv("SLURM_TOKEN")
 	AppConfig.SlurmControlNode = os.Getenv("SLURM_CONTROL_NODE")
 	AppConfig.SlurmUser = os.Getenv("SLURM_USER")
+	AppConfig.SlurmDrainExcludeStr = os.Getenv("SLURM_DRAIN_EXCLUDE_REASON_LIST")
+	AppConfig.SlurmScontrolPath = os.Getenv("SLURM_SCONTROL_PATH")
+	AppConfig.TlsTimeOut = os.Getenv("TLS_TIMEOUT")
 
 	subscriptionPayloadJSON := os.Getenv("SUBSCRIPTION_PAYLOAD")
 	if err := json.Unmarshal([]byte(subscriptionPayloadJSON), &AppConfig.SubscriptionPayload); err != nil {
 		log.Fatalf("Failed to parse SUBSCRIPTION_PAYLOAD: %v", err)
 	}
 
-	triggerEventsJSON := os.Getenv("TRIGGER_EVENTS")
-	if triggerEventsJSON != "" {
-		err = json.Unmarshal([]byte(triggerEventsJSON), &AppConfig.TriggerEvents)
-		if err != nil {
-			log.Fatalf("Failed to unmarshal TRIGGER_EVENTS: %v", err)
-		}
-	}
-
 	prometheusConfigJSON := os.Getenv("PROMETHEUS_CONFIG")
 	if prometheusConfigJSON != "" {
 		err = json.Unmarshal([]byte(prometheusConfigJSON), &AppConfig.PrometheusConfig)
@@ -148,10 +175,113 @@
 	redfishServersJSON := os.Getenv("REDFISH_SERVERS")
 	if redfishServersJSON == "" {
 		log.Println("REDFISH_SERVERS environment variable is not set or is empty")
+	} else {
+		if err := json.Unmarshal([]byte(redfishServersJSON), &AppConfig.RedfishServers); err != nil {
+			log.Fatalf("Failed to parse REDFISH_SERVERS: %v", err)
+		}
+	}
+
+	// Read the node drain policy config file
+	nodeDrainPolicyConfig, err := os.ReadFile(NodeDrainPolicyFile)
+
+	if err != nil {
+		log.Fatalf("Failed to read: %v", NodeDrainPolicyFile)
+	}
+
+	triggerEventsInfo := []TriggerEventsInfo{}
+	err = json.Unmarshal(nodeDrainPolicyConfig, &triggerEventsInfo)
+	if err != nil {
+		log.Fatalf("Failed to unmarshal file: %v | err: %v", NodeDrainPolicyFile, err)
+	}
+
+	tInfoMap := map[string]map[string][]EventInfo{}
+
+	for _, evt := range triggerEventsInfo {
+		log.Printf("Trigger Event: %+v\n", evt)
+		if !evt.Enable {
+			continue
+		}
+		eInfo := EventInfo{}
+		eInfo.Category = evt.Category
+		eInfo.Subcategory = evt.Subcategory
+		eInfo.DrainReasonPrefix = evt.DrainReasonPrefix
+		eInfo.UniqueString = evt.UniqueString
+		key := ""
+		if evt.MessageRegistry == "" {
+			key = evt.MessageId
+		} else {
+			key = evt.MessageRegistry + "." + evt.MessageId
+		}
+		if ee, ok := tInfoMap[evt.Severity]; !ok {
+			eInfoMap := map[string][]EventInfo{}
+			eInfoMap[key] = []EventInfo{eInfo}
+			tInfoMap[evt.Severity] = eInfoMap
+		} else {
+			ee[key] = append(ee[key], eInfo)
+		}
+	}
+
+	AppConfig.TriggerEvents = tInfoMap
+
+	for kk, tt := range AppConfig.TriggerEvents {
+		log.Println("Severity: ", kk)
+		for kkk, ttt := range tt {
+			log.Println("key: ", kkk)
+			log.Printf("event: %+v\n", ttt)
+		}
+	}
+
+	// Read and parse the REDFISH_SERVERS_COMMON_CONFIG environment variable
+	redfishServersCommonConfigJSON := os.Getenv("REDFISH_SERVERS_COMMON_CONFIG")
+	if redfishServersCommonConfigJSON == "" {
+		log.Println("REDFISH_SERVERS_COMMON_CONFIG environment variable is not set or is empty")
 		return AppConfig
 	}
-	if err := json.Unmarshal([]byte(redfishServersJSON), &AppConfig.RedfishServers); err != nil {
-		log.Fatalf("Failed to parse REDFISH_SERVERS: %v", err)
+	redfishServersCommonConfig := RedfishServersCommonConfig{}
+	if err := json.Unmarshal([]byte(redfishServersCommonConfigJSON), &redfishServersCommonConfig); err != nil {
+		log.Fatalf("Failed to parse REDFISH_SERVERS_COMMON_CONFIG: %v", err)
+	}
+
+	if targetFile == "" {
+		log.Println("No target file provided")
+		return AppConfig
+	}
+
+	targetYamlFile, err := os.ReadFile(targetFile)
+
+	if err != nil {
+		log.Fatalf("Failed to read file: %v", targetFile)
+	}
+
+	targets := []target{}
+
+	err = yaml.Unmarshal(targetYamlFile, &targets)
+
+	if err != nil {
+		log.Fatalf("Error parsing target file: %v | err: %v", targetFile, err)
+	}
+
+	for _, t := range targets {
+		log.Println("target: ", t.Targets)
+
+		for _, hostName := range t.Targets {
+			// add this target to Redfish servers
+			server := RedfishServer{}
+			bmcHost := fmt.Sprintf(hostName+".%v", redfishServersCommonConfig.HostSuffix)
+			ips, err := net.LookupIP(bmcHost)
+			if err != nil || len(ips) == 0 {
+				log.Printf("[error] Couldn't get the IP for host: %v | ips: %v | err: %v", bmcHost, ips, err)
+				continue
+			}
+			log.Println("IPs: ", ips)
+
+			server.IP = fmt.Sprintf("https://%v", ips[0])
+			server.LoginType = "Session"
+			server.Username = redfishServersCommonConfig.UserName
+			server.Password = redfishServersCommonConfig.Password
+			server.SlurmNode = hostName
+			AppConfig.RedfishServers = append(AppConfig.RedfishServers, server)
+		}
 	}
 
 	return AppConfig
diff --git a/redfish-exporter/go.mod b/redfish-exporter/go.mod
index 8c8370f..400dd6f 100644
--- a/redfish-exporter/go.mod
+++ b/redfish-exporter/go.mod
@@ -9,6 +9,7 @@
 	github.com/nod-ai/ADA/redfish-exporter v0.0.0-20241002210630-2ef2d1070d90
 	github.com/prometheus/client_golang v1.20.4
 	github.com/stmcginnis/gofish v0.19.0
+	gopkg.in/yaml.v3 v3.0.1
 )
 
 require (
diff --git a/redfish-exporter/go.sum b/redfish-exporter/go.sum
index 0b9f2ce..b729e1a 100644
--- a/redfish-exporter/go.sum
+++ b/redfish-exporter/go.sum
@@ -26,3 +26,6 @@ golang.org/x/sys v0.22.0 h1:RI27ohtqKCnwULzJLqkv897zojh5/DwS/ENaMzUOaWI=
 golang.org/x/sys v0.22.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
 google.golang.org/protobuf v1.34.2 h1:6xV6lTsCfpGD21XK49h7MhtcApnLqkfYgPcdHftf6hg=
 google.golang.org/protobuf v1.34.2/go.mod h1:qYOHts0dSfpeUzUFpOMr/WGzszTmLH+DiWniOlNbLDw=
+gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
+gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
+gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
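setupConfig now consumes the file passed via --target as YAML matching the unexported `target` struct. A sketch of the expected file shape; the host names, labels, and BMC suffix are all made up:

```go
package main

import (
	"fmt"
	"log"

	"gopkg.in/yaml.v3"
)

// Mirrors the unexported `target` struct in config.go.
type target struct {
	Targets []string          `yaml:"targets"`
	Labels  map[string]string `yaml:"labels"`
}

// A fabricated --target file: a list of entries, each with host names and optional labels.
const sample = `
- targets:
    - node001
    - node002
  labels:
    rack: r1
`

func main() {
	var targets []target
	if err := yaml.Unmarshal([]byte(sample), &targets); err != nil {
		log.Fatal(err)
	}
	// setupConfig appends "." plus the hostSuffix from REDFISH_SERVERS_COMMON_CONFIG
	// to each name and resolves that BMC hostname; "bmc.example.com" is a placeholder.
	for _, t := range targets {
		for _, h := range t.Targets {
			fmt.Printf("%s -> %s.bmc.example.com\n", h, h)
		}
	}
}
```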
diff --git a/redfish-exporter/listener.go b/redfish-exporter/listener.go
index c952682..e984d38 100644
--- a/redfish-exporter/listener.go
+++ b/redfish-exporter/listener.go
@@ -171,6 +171,37 @@ func (s *Server) handleConnection(AppConfig Config, conn net.Conn) {
 	}
 }
 
+func getDrainReasonPrefix(info EventInfo) string {
+	return info.DrainReasonPrefix + ": " + info.Category + ": " + info.Subcategory
+}
+
+func isTriggerEvent(evt Event, config Config) (bool, string) {
+	eInfoMap, ok := config.TriggerEvents[evt.Severity]
+	if !ok {
+		return false, ""
+	}
+	eInfo, ok := eInfoMap[evt.MessageId]
+	if !ok {
+		return false, ""
+	}
+	if len(eInfo) == 1 {
+		return true, getDrainReasonPrefix(eInfo[0])
+	}
+	for _, info := range eInfo {
+		for _, str := range strings.Split(info.UniqueString, "|") {
+			if strings.Contains(evt.Message, str) {
+				return true, getDrainReasonPrefix(info)
+			}
+		}
+	}
+	return false, ""
+}
+
 func (s *Server) processRequest(AppConfig Config, conn net.Conn, req *http.Request, eventCount *int, dataBuffer *[]byte) error {
 	// Extract method, headers, and payload
 	method := req.Method
@@ -217,19 +248,26 @@ func (s *Server) processRequest(AppConfig Config, conn net.Conn, req *http.Request, eventCount *int, dataBuffer *[]byte) error {
 			log.Printf("Message ID: %s", messageId)
 			log.Printf("Message Args: %v", messageArgs)
 			log.Printf("Origin Of Condition: %s", originOfCondition)
-			for _, triggerEvent := range AppConfig.TriggerEvents {
-				if severity == triggerEvent.Severity {
-					log.Printf("Matched Trigger Event: %s with action %s", triggerEvent.Severity, triggerEvent.Action)
-					// Sending event belongs to redfish_utils. Each server may have different slurm node associated, and redfish_servers has the info/map.
-					if s.slurmQueue != nil {
-						redfishServerInfo := getServerInfoByIP(AppConfig.RedfishServers, ip)
-						if len(strings.TrimSpace(redfishServerInfo.SlurmNode)) == 0 {
-							log.Printf("failed to get the slurm node name, cannot perform action: %v", triggerEvent.Action)
-							break
-						}
-						s.slurmQueue.Add(redfishServerInfo.IP, redfishServerInfo.SlurmNode, triggerEvent.Severity, triggerEvent.Action)
+
+			trigger, drainReason := isTriggerEvent(event, AppConfig)
+			if trigger {
+				log.Printf("Matched Trigger Event: %s | messageId: %s | message: %s", event.Severity, event.MessageId, event.Message)
+				// Sending the event is handled by redfish_utils. Each server may have a different slurm node associated with it; redfish_servers holds that mapping.
+				if s.slurmQueue != nil {
+					redfishServerInfo := getServerInfoByIP(AppConfig.RedfishServers, ip)
+					if len(strings.TrimSpace(redfishServerInfo.SlurmNode)) == 0 {
+						log.Println("failed to get the slurm node name, cannot perform drain action")
+						continue
 					}
-					break
+					evt := slurm.AddEventReq{
+						RedfishServerIP: redfishServerInfo.IP,
+						SlurmNodeName:   redfishServerInfo.SlurmNode,
+						Severity:        event.Severity,
+						DrainReason:     drainReason,
+						ExcludeStr:      AppConfig.SlurmDrainExcludeStr,
+						ScontrolPath:    AppConfig.SlurmScontrolPath,
+					}
+					s.slurmQueue.Add(evt)
 				}
 			}
 		}
diff --git a/redfish-exporter/main.go b/redfish-exporter/main.go
index 22704e7..cea722c 100644
--- a/redfish-exporter/main.go
+++ b/redfish-exporter/main.go
@@ -26,6 +26,7 @@ import (
 	"os/signal"
 	"strconv"
 	"strings"
+	"sync"
 	"syscall"
 	"time"
 
@@ -36,15 +37,18 @@ import (
 
 func main() {
 	var (
-		enableSlurm = flag.Bool("enable-slurm", false, "Enable slurm")
+		targetFile          string
+		subscriptionMapLock sync.Mutex // to guard access to the map
 	)
+
+	flag.StringVar(&targetFile, "target", "", "Path to the target file for host/slurm node names")
 	flag.Parse()
 
 	log.SetFlags(log.LstdFlags | log.Lshortfile)
 	log.Println("Starting Redfish Event Listener/Exporter")
 
 	// Setup configuration
-	AppConfig := setupConfig()
+	AppConfig := setupConfig(targetFile)
 
 	// Log the initialized config
 	log.Printf("Initialized Config: %+v", AppConfig)
@@ -52,24 +56,14 @@ func main() {
 	ctx, cancel := context.WithCancel(context.Background())
 	defer cancel()
 	var slurmQueue *slurm.SlurmQueue
-	if *enableSlurm {
-		if len(strings.TrimSpace(AppConfig.SlurmToken)) == 0 {
-			log.Fatalf("Provide slurm token to enable slurm")
-		}
-		if len(strings.TrimSpace(AppConfig.SlurmControlNode)) == 0 {
-			log.Fatalf("Provide slurm control node IP:Port to enable slurm")
-		}
-		_, err := slurm.NewClient(AppConfig.SlurmControlNode, AppConfig.SlurmUser, AppConfig.SlurmToken)
-		if err != nil {
-			log.Fatalf("failed to create slurm client, err: %+v", err)
-		}
-		slurmQueue = slurm.InitSlurmQueue(ctx)
-		go slurmQueue.ProcessEventActionQueue()
-	}
+	slurmQueue = slurm.InitSlurmQueue(ctx)
+	go slurmQueue.ProcessEventActionQueue()
+
+	subscriptionMap := make(map[string]string)
 
 	// Subscribe the listener to the event stream for all servers
-	subscriptionMap, err := CreateSubscriptionsForAllServers(AppConfig.RedfishServers, AppConfig.SubscriptionPayload)
+	err := CreateSubscriptionsForAllServers(AppConfig.RedfishServers, AppConfig.SubscriptionPayload, subscriptionMap, &subscriptionMapLock, AppConfig.TlsTimeOut)
 	if err != nil {
 		log.Fatal(err)
 	}
@@ -110,7 +104,9 @@ func main() {
 	time.Sleep(time.Second)
 
 	// Unsubscribe the listener from all servers
+	subscriptionMapLock.Lock()
 	DeleteSubscriptionsFromAllServers(AppConfig.RedfishServers, subscriptionMap)
+	subscriptionMapLock.Unlock()
 
 	cancel()
diff --git a/redfish-exporter/nodeDrainPolicy.json b/redfish-exporter/nodeDrainPolicy.json
new file mode 100644
index 0000000..759f86a
--- /dev/null
+++ b/redfish-exporter/nodeDrainPolicy.json
@@ -0,0 +1,612 @@
+[
+  {
+    "Category": "Firmware Update",
+    "SubCategory": "Verification Failed",
+    "MessageRegistry": "Update.1.0.1",
+    "MessageId": "VerificationFailed",
+    "UniqueString": "",
+    "Severity": "Critical",
+    "DrainReasonPrefix": "Reboot needed",
+    "Enable": true
+  },
+  {
+    "Category": "Firmware Update",
+    "SubCategory": "Transfer failed",
+    "MessageRegistry": "Update.1.0.1",
+    "MessageId": "TransferFailed",
+    "UniqueString": "",
+    "Severity": "Critical",
+    "DrainReasonPrefix": "Reboot needed",
+    "Enable": true
+  },
+  {
+    "Category": "Firmware Update",
+    "SubCategory": "Apply on component failed",
+    "MessageRegistry": "Update.1.0.1",
+    "MessageId": "ApplyFailed",
+    "UniqueString": "",
+    "Severity": "Critical",
+    "DrainReasonPrefix": "Reboot needed",
+    "Enable": true
+  },
+  {
+    "Category": "Firmware Update",
+    "SubCategory": "Activate Failed",
+    "MessageRegistry": "Update.1.0.1",
+    "MessageId": "ActivateFailed",
+    "UniqueString": "",
+    "Severity": "Critical",
+    "DrainReasonPrefix": "Reboot needed",
+    "Enable": true
+  },
+  {
+    "Category": "Firmware Update",
+    "SubCategory": "Bundle Check Failure",
+    "MessageRegistry": "OpenBMC.0.2.0",
+    "MessageId": "BundleCheckFailure",
+    "UniqueString": "",
+    "Severity": "Warning",
+    "DrainReasonPrefix": "Reboot needed",
+    "Enable": true
+  },
+  {
+    "Category": "Firmware Update",
+    "SubCategory": "Firmware Update Failed",
+    "MessageRegistry": "OpenBMC.0.2.0",
+    "MessageId": "FirmwareUpdateFailed",
+    "UniqueString": "",
+    "Severity": "Warning",
+    "DrainReasonPrefix": "Reboot needed",
+    "Enable": true
+  },
+  {
+    "Category": "Firmware Update",
+    "SubCategory": "Verification Failed at EROT",
+    "MessageRegistry": "Update.1.0.1",
+    "MessageId": "VerificationFailedOEM",
+    "UniqueString": "",
+    "Severity": "Warning",
+    "DrainReasonPrefix": "Reboot needed",
+    "Enable": true
+  },
+  {
+    "Category": "Temperature sensor threshold Alerts",
+    "SubCategory": "Upper Fatal",
+    "MessageRegistry": "SensorEvent.1.0.0",
+    "MessageId": "ReadingAboveUpperFatalThresholdOEM",
+    "UniqueString": "TEMP",
+    "Severity": "Critical",
+    "DrainReasonPrefix": "Reboot needed",
+    "Enable": true
+  },
+  {
+    "Category": "Temperature sensor threshold Alerts",
+    "SubCategory": "Upper Critical",
+    "MessageRegistry": "SensorEvent.1.0.0",
+    "MessageId": "ReadingAboveUpperCriticalThresholdOEM",
+    "UniqueString": "TEMP",
+    "Severity": "Critical",
+    "DrainReasonPrefix": "Reboot needed",
+    "Enable": true
+  },
+  {
+    "Category": "Temperature sensor threshold Alerts",
+    "SubCategory": "Upper Caution",
+    "MessageRegistry": "SensorEvent.1.0.0",
+    "MessageId": "ReadingAboveUpperCautionThresholdOEM",
+    "UniqueString": "TEMP",
+    "Severity": "Warning",
+    "DrainReasonPrefix": "Reboot needed",
+    "Enable": true
+  },
+  {
+    "Category": "Temperature sensor threshold Alerts",
+    "SubCategory": "Lower Fatal",
+    "MessageRegistry": "SensorEvent.1.0.0",
+    "MessageId": "ReadingBelowLowerFatalThresholdOEM",
+    "UniqueString": "TEMP",
+    "Severity": "Critical",
+    "DrainReasonPrefix": "Reboot needed",
+    "Enable": true
+  },
+  {
+    "Category": "Temperature sensor threshold Alerts",
+    "SubCategory": "Lower Caution",
+    "MessageRegistry": "SensorEvent.1.0.0",
+    "MessageId": "ReadingBelowLowerCautionThresholdOEM",
+    "UniqueString": "",
+    "Severity": "Warning",
+    "DrainReasonPrefix": "Reboot needed",
+    "Enable": true
+  },
+  {
+    "Category": "Voltage Sensor threshold Alerts",
+    "SubCategory": "Upper Fatal",
+    "MessageRegistry": "SensorEvent.1.0.0",
+    "MessageId": "ReadingAboveUpperFatalThresholdOEM",
+    "UniqueString": "VOLTAGE",
+    "Severity": "Critical",
+    "DrainReasonPrefix": "Reboot needed",
+    "Enable": true
+  },
+  {
+    "Category": "Voltage Sensor threshold Alerts",
+    "SubCategory": "Upper Critical",
+    "MessageRegistry": "SensorEvent.1.0.0",
+    "MessageId": "ReadingAboveUpperCriticalThresholdOEM",
+    "UniqueString": "VOLTAGE",
+    "Severity": "Critical",
+    "DrainReasonPrefix": "Reboot needed",
+    "Enable": true
+  },
+  {
+    "Category": "Voltage Sensor threshold Alerts",
+    "SubCategory": "Upper Caution",
+    "MessageRegistry": "SensorEvent.1.0.0",
+    "MessageId": "ReadingAboveUpperCautionThresholdOEM",
+    "UniqueString": "VOLTAGE",
+    "Severity": "Warning",
+    "DrainReasonPrefix": "Reboot needed",
+    "Enable": true
+  },
+  {
+    "Category": "Voltage Sensor threshold Alerts",
+    "SubCategory": "Lower Fatal",
+    "MessageRegistry": "SensorEvent.1.0.0",
+    "MessageId": "ReadingBelowLowerFatalThresholdOEM",
+    "UniqueString": "VOLTAGE",
+    "Severity": "Critical",
+    "DrainReasonPrefix": "Reboot needed",
+    "Enable": true
+  },
+  {
+    "Category": "Current Sensor threshold Alerts",
+    "SubCategory": "Upper Fatal",
+    "MessageRegistry": "SensorEvent.1.0.0",
+    "MessageId": "ReadingAboveUpperFatalThresholdOEM",
+    "UniqueString": "CURRENT",
+    "Severity": "Critical",
+    "DrainReasonPrefix": "Reboot needed",
+    "Enable": true
+  },
+  {
+    "Category": "Current Sensor threshold Alerts",
+    "SubCategory": "Upper Critical",
+    "MessageRegistry": "SensorEvent.1.0.0",
+    "MessageId": "ReadingAboveUpperCriticalThresholdOEM",
+    "UniqueString": "CURRENT",
+    "Severity": "Critical",
+    "DrainReasonPrefix": "Reboot needed",
+    "Enable": true
+  },
+  {
+    "Category": "Current Sensor threshold Alerts",
+    "SubCategory": "Upper Caution",
+    "MessageRegistry": "SensorEvent.1.0.0",
+    "MessageId": "ReadingAboveUpperCautionThresholdOEM",
+    "UniqueString": "CURRENT",
+    "Severity": "Warning",
+    "DrainReasonPrefix": "Reboot needed",
+    "Enable": true
+  },
+  {
+    "Category": "Current Sensor threshold Alerts",
+    "SubCategory": "Lower Fatal",
+    "MessageRegistry": "SensorEvent.1.0.0",
+    "MessageId": "ReadingBelowLowerFatalThresholdOEM",
+    "UniqueString": "CURRENT",
+    "Severity": "Critical",
+    "DrainReasonPrefix": "Reboot needed",
+    "Enable": true
+  },
+  {
+    "Category": "Memory CE threshold Alerts",
+    "SubCategory": "Upper Caution (HAS AdditionalDataURI, pointing to CPER LOG)",
+    "MessageRegistry": "SensorEvent.1.0.0",
+    "MessageId": "ReadingAboveUpperCautionThreshold",
+    "UniqueString": "",
+    "Severity": "Warning",
+    "DrainReasonPrefix": "Reboot needed",
+    "Enable": true
+  },
+  {
+    "Category": "I2C/SPI Comm Errors",
+    "SubCategory": "SPI Errors",
+    "MessageRegistry": "ResourceEvent.1.2.1",
+    "MessageId": "ResourceErrorsDetected",
+    "UniqueString": "SPI",
+    "Severity": "Warning",
+    "DrainReasonPrefix": "Reboot needed",
+    "Enable": true
+  },
+  {
+    "Category": "I2C/SPI Comm Errors",
+    "SubCategory": "I2C2-failure/ interface hang [SMC -> OAM GPU, Retimer]",
+    "MessageRegistry": "ResourceEvent.1.2.1",
+    "MessageId": "ResourceErrorsDetected",
+    "UniqueString": "I2C",
+    "Severity": "Warning",
+    "DrainReasonPrefix": "Reboot needed",
+    "Enable": true
+  },
+  {
+    "Category": "I2C/SPI Comm Errors",
+    "SubCategory": "EROT config failure",
+    "MessageRegistry": "ResourceEvent.1.2.1",
+    "MessageId": "ResourceSelfTestFailed",
+    "UniqueString": "Erot|eRoT",
+    "Severity": "Critical",
+    "DrainReasonPrefix": "Reboot needed",
+    "Enable": true
+  },
+  {
+    "Category": "I2C/SPI Comm Errors",
+    "SubCategory": "ROT Fatal failure after retries",
+    "MessageRegistry": "ResourceEvent.1.2.1",
+    "MessageId": "ResourceRetryFailOEM",
+    "UniqueString": "",
+    "Severity": "Critical",
+    "DrainReasonPrefix": "Reboot needed",
+    "Enable": true
+  },
+  {
+    "Category": "Component fault/failure",
+    "SubCategory": "OAM (HBM, XGMI) VR failure, GPU Module VR failure, UBB VR failure",
+    "MessageRegistry": "ResourceEvent.1.2.1",
+    "MessageId": "ResourceStatusChangedCriticalOEM",
+    "UniqueString": "_STATUS",
+    "Severity": "Critical",
+    "DrainReasonPrefix": "Reboot needed",
+    "Enable": true
+  },
+  {
+    "Category": "Component fault/failure",
+    "SubCategory": "SMC FPGA State, UBB FPGA state",
+    "MessageRegistry": "ResourceEvent.1.2.1",
+    "MessageId": "ResourceErrorsDetectedOEM",
+    "UniqueString": "FPGA",
+    "Severity": "Critical",
+    "DrainReasonPrefix": "Reboot needed",
+    "Enable": true
+  },
+  {
+    "Category": "Component fault/failure",
+    "SubCategory": "Retimer Link disabled",
+    "MessageRegistry": "ResourceEvent.1.2.1",
+    "MessageId": "ResourceStatusChangedCriticalOEM",
+    "UniqueString": "Disabled",
+    "Severity": "Critical",
+    "DrainReasonPrefix": "Reboot needed",
+    "Enable": true
+  },
+  {
+    "Category": "Component fault/failure",
+    "SubCategory": "EROT non-fatal error",
+    "MessageRegistry": "ResourceEvent.1.2.1",
+    "MessageId": "ResourceEROTNonFatalErrorsOEM",
+    "UniqueString": "",
+    "Severity": "Warning",
+    "DrainReasonPrefix": "Reboot needed",
+    "Enable": true
+  },
+  {
+    "Category": "Component fault/failure",
+    "SubCategory": "EROT fatal error",
+    "MessageRegistry": "ResourceEvent.1.2.1",
+    "MessageId": "ResourceEROTFatalErrorsOEM",
+    "UniqueString": "",
+    "Severity": "Critical",
+    "DrainReasonPrefix": "Reboot needed",
+    "Enable": true
+  },
+  {
+    "Category": "Component fault/failure",
+    "SubCategory": "HSC/UBB PGOOD",
+    "MessageRegistry": "ResourceEvent.1.2.1",
+    "MessageId": "ResourceStatusChangedCriticalOEM",
+    "UniqueString": "HSC|UBB",
+    "Severity": "Critical",
+    "DrainReasonPrefix": "Reboot needed",
+    "Enable": true
+  },
+  {
+    "Category": "GPU Error Events",
+    "SubCategory": "PCIe Link Width",
+    "MessageRegistry": "ResourceEvent.1.2.1",
+    "MessageId": "ResourcePerfChange",
+    "UniqueString": "Link Width",
+    "Severity": "Critical",
+    "DrainReasonPrefix": "Reboot needed",
+    "Enable": true
+  },
+  {
+    "Category": "GPU Error Events",
+    "SubCategory": "PCIe Link Speed",
+    "MessageRegistry": "ResourceEvent.1.2.1",
+    "MessageId": "ResourcePerfChange",
+    "UniqueString": "Link Speed",
+    "Severity": "Critical",
+    "DrainReasonPrefix": "Reboot needed",
+    "Enable": true
+  },
+  {
+    "Category": "GPU Error Events",
+    "SubCategory": "OAM PGOOD",
+    "MessageRegistry": "ResourceEvent.1.2.1",
+    "MessageId": "ResourceStatusChangedCriticalOEM",
+    "UniqueString": "OAM",
+    "Severity": "Critical",
+    "DrainReasonPrefix": "Reboot needed",
+    "Enable": true
+  },
+  {
+    "Category": "GPU Error Events",
+    "SubCategory": "OAM CTF",
+    "MessageRegistry": "ResourceEvent.1.2.1",
+    "MessageId": "TemperatureAssertCriticalFaultOEM",
+    "UniqueString": "",
+    "Severity": "Critical",
+    "DrainReasonPrefix": "Reboot needed",
+    "Enable": true
+  },
+  {
+    "Category": "GPU Error Events",
+    "SubCategory": "GPU Throttle state",
+    "MessageRegistry": "ResourceEvent.1.2.1",
+    "MessageId": "ResourceStatusChangedCritical",
+    "UniqueString": "",
+    "Severity": "Critical",
+    "DrainReasonPrefix": "Reboot needed",
+    "Enable": true
+  },
+  {
+    "Category": "GPU Error Events",
+    "SubCategory": "GPU Throttle state",
+    "MessageRegistry": "ResourceEvent.1.2.1",
+    "MessageId": "ResourceStatusChangedWarning",
+    "UniqueString": "",
+    "Severity": "Warning",
+    "DrainReasonPrefix": "Reboot needed",
+    "Enable": true
+  },
+  {
+    "Category": "GPU Error Events",
+    "SubCategory": "GPU boot failed (HAS AdditionalDataURI, pointing to CPER LOG file)",
+    "MessageRegistry": "ResourceEvent.1.2.1",
+    "MessageId": "ResourceSelfTestFailed",
+    "UniqueString": "OAM_",
+    "Severity": "Critical",
+    "DrainReasonPrefix": "Reboot needed",
+    "Enable": true
+  },
+  {
+    "Category": "GPU Error Events",
+    "SubCategory": "UC Not-Fatal (including HBM & XGMI) (HAS AdditionalDataURI, pointing to CPER LOG)",
+    "MessageRegistry": "ResourceEvent.1.2.1",
+    "MessageId": "ResourceErrorsDetected",
+    "UniqueString": "OAM",
+    "Severity": "Warning",
+    "DrainReasonPrefix": "Reboot needed",
+    "Enable": true
+  },
+  {
+    "Category": "GPU Error Events",
+    "SubCategory": "GPU Fatal Error (including HBM & XGMI) (HAS AdditionalDataURI, pointing to CPER LOG)",
+    "MessageRegistry": "ResourceEvent.1.2.1",
+    "MessageId": "ResourceErrorsDetectedOEM",
+    "UniqueString": "Fatal error",
+    "Severity": "Critical",
+    "DrainReasonPrefix": "Reboot needed",
+    "Enable": true
+  },
+  {
+    "Category": "GPU Error Events",
+    "SubCategory": "Misplaced OAM",
+    "MessageRegistry": "ResourceEvent.1.2.1",
+    "MessageId": "ResourceIncompatibleLocationOEM",
+    "UniqueString": "",
+    "Severity": "Critical",
+    "DrainReasonPrefix": "Reboot needed",
+    "Enable": true
+  },
+  {
+    "Category": "GPU Error Events",
+    "SubCategory": "OAM/GPU physically missing",
+    "MessageRegistry": "Base.1.8.1",
+    "MessageId": "ResourceNotFound",
+    "UniqueString": "",
+    "Severity": "Critical",
+    "DrainReasonPrefix": "Reboot needed",
+    "Enable": true
+  },
+  {
+    "Category": "GPU Error Events",
+    "SubCategory": "GPU Max Bad page count (HAS AdditionalDataURI, pointing to CPER LOG)",
+    "MessageRegistry": "ResourceEvent.1.2.1",
+    "MessageId": "ResourceErrorsDetectedOEM",
+    "UniqueString": "HBM",
+    "Severity": "Critical",
+    "DrainReasonPrefix": "Reboot needed",
+    "Enable": true
+  },
+  {
+    "Category": "GPU Error Events",
+    "SubCategory": "GPU Triggers CTF event",
+    "MessageRegistry": "OpenBMC.0.2.0",
+    "MessageId": "GPUTriggerCTF",
+    "UniqueString": "",
+    "Severity": "Critical",
+    "DrainReasonPrefix": "Reboot needed",
+    "Enable": true
+  },
+  {
+    "Category": "GPU Error Events",
+    "SubCategory": "OAM_X with PartNumber xxx is Unrecognized",
+    "MessageRegistry": "ResourceEvent.1.2.1",
+    "MessageId": "ResourceUnrecognizedOEM",
+    "UniqueString": "",
+    "Severity": "Critical",
+    "DrainReasonPrefix": "Reboot needed",
+    "Enable": true
+  },
+  {
+    "Category": "Secure Boot notifications",
+    "SubCategory": "Secure boot failure",
+    "MessageRegistry": "ResourceEvent.1.2.1",
+    "MessageId": "ResourceSelfTestFailed",
+    "UniqueString": "secure boot failure",
+    "Severity": "Critical",
+    "DrainReasonPrefix": "Reboot needed",
+    "Enable": true
+  },
+  {
+    "Category": "Secure Boot notifications",
+    "SubCategory": "Component Boot Failure",
+    "MessageRegistry": "OpenBMC.0.2.0",
+    "MessageId": "BootFailure",
+    "UniqueString": "",
+    "Severity": "Warning",
+    "DrainReasonPrefix": "Reboot needed",
+    "Enable": true
+  },
+  {
+    "Category": "Secure Boot notifications",
+    "SubCategory": "Revocation Failed",
+    "MessageRegistry": "ResourceEvent.1.2.1",
+    "MessageId": "RevocationFailedOEM",
+    "UniqueString": "",
+    "Severity": "Warning",
+    "DrainReasonPrefix": "Reboot needed",
+    "Enable": true
+  },
+  {
+    "Category": "Secure Boot notifications",
+    "SubCategory": "",
+    "MessageRegistry": "OpenBMC.0.2.0",
+    "MessageId": "SecureFWUnexpectedRespWarning",
+    "UniqueString": "",
+    "Severity": "Warning",
+    "DrainReasonPrefix": "Reboot needed",
+    "Enable": true
+  },
+  {
+    "Category": "Auto Recovery",
+    "SubCategory": "Auto Recovery related events",
+    "MessageRegistry": "OpenBMC.0.2.0",
+    "MessageId": "AutomaticRecoveryTriggered",
+    "UniqueString": "",
+    "Severity": "Warning",
+    "DrainReasonPrefix": "Reboot needed",
+    "Enable": true
+  },
+  {
+    "Category": "Auto Recovery",
+    "SubCategory": "Auto Recovery related events",
+    "MessageRegistry": "OpenBMC.0.2.0",
+    "MessageId": "AutoFWCopyFailure",
+    "UniqueString": "",
+    "Severity": "Critical",
+    "DrainReasonPrefix": "Reboot needed",
+    "Enable": true
+  },
+  {
+    "Category": "Auto Recovery",
+    "SubCategory": "Auto Recovery related events",
+    "MessageRegistry": "OpenBMC.0.2.0",
+    "MessageId": "AutomaticRecoveryFailureCritical",
+    "UniqueString": "",
+    "Severity": "Critical",
+    "DrainReasonPrefix": "Reboot needed",
+    "Enable": true
+  },
+  {
+    "Category": "Power Status",
+    "SubCategory": "UBB Power Status",
+    "MessageRegistry": "OpenBMC.0.2.0",
+    "MessageId": "UBBPowerStatusCritical",
+    "UniqueString": "",
+    "Severity": "Critical",
+    "DrainReasonPrefix": "Reboot needed",
+    "Enable": true
+  },
+  {
+    "Category": "Power Status",
+    "SubCategory": "OAM Power Status",
+    "MessageRegistry": "OpenBMC.0.2.0",
+    "MessageId": "OAMPowerStatusCritical",
+    "UniqueString": "",
+    "Severity": "Critical",
+    "DrainReasonPrefix": "Reboot needed",
+    "Enable": true
+  },
+  {
+    "Category": "Power Status",
+    "SubCategory": "PowerSequence Events",
+    "MessageRegistry": "OpenBMC.0.2.0",
+    "MessageId": "PowerSequenceUnexpected",
+    "UniqueString": "",
+    "Severity": "Warning",
+    "DrainReasonPrefix": "Reboot needed",
+    "Enable": true
+  },
+  {
+    "Category": "CPER Log Handling Events",
+    "SubCategory": "CPER Full",
+    "MessageRegistry": "ResourceEvent.1.2.1",
+    "MessageId": "ResourceErrorThresholdExceeded",
+    "UniqueString": "",
+    "Severity": "Critical",
+    "DrainReasonPrefix": "Reboot needed",
+    "Enable": true
+  },
+  {
+    "Category": "CPER Log Handling Events",
+    "SubCategory": "CPER Reach Threshold (HAS AdditionalDataURI, pointing to CPER LOG file)",
+    "MessageRegistry": "ResourceEvent.1.2.1",
+    "MessageId": "ResourceWarningThresholdExceeded",
+    "UniqueString": "",
+    "Severity": "Warning",
+    "DrainReasonPrefix": "Reboot needed",
+    "Enable": true
+  },
+  {
+    "Category": "LogRotate utility",
+    "SubCategory": "",
+    "MessageRegistry": "",
+    "MessageId": "LogAlmostFull",
+    "UniqueString": "",
+    "Severity": "Warning",
+    "DrainReasonPrefix": "Reboot needed",
+    "Enable": true
+  },
+  {
+    "Category": "LogRotate utility",
+    "SubCategory": "",
+    "MessageRegistry": "",
+    "MessageId": "LogFull",
+    "UniqueString": "",
+    "Severity": "Critical",
+    "DrainReasonPrefix": "Reboot needed",
+    "Enable": true
+  },
+  {
+    "Category": "Task Notifications",
+    "SubCategory": "Aborted",
+    "MessageRegistry": "TaskEvent.1.0.1",
+    "MessageId": "TaskAborted",
+    "UniqueString": "",
+    "Severity": "Warning",
+    "DrainReasonPrefix": "Reboot needed",
+    "Enable": true
+  },
+  {
+    "Category": "I2C/SPI Comm Errors",
+    "SubCategory": "TSI and RMI failures",
+    "MessageRegistry": "ResourceEvent.1.2.1",
+    "MessageId": "ResourceErrorsDetectedOEM",
+    "UniqueString": "TSI and RMI",
+    "Severity": "Critical",
+    "DrainReasonPrefix": "Reboot needed",
+    "Enable": true
+  }
+]
\ No newline at end of file
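setupConfig keys each enabled policy entry by Severity, then by "MessageRegistry.MessageId" (or the bare MessageId when the registry is empty); when several entries share a key, UniqueString — a "|"-separated substring list — disambiguates, exactly as isTriggerEvent does in listener.go (a single entry per key skips that check). A self-contained sketch of the lookup with a fabricated event message:

```go
package main

import (
	"fmt"
	"strings"
)

// EventInfo mirrors the struct added in config.go.
type EventInfo struct {
	UniqueString, Category, Subcategory, DrainReasonPrefix string
}

func main() {
	// Two entries from the policy above share Severity "Critical" and the key
	// "SensorEvent.1.0.0.ReadingAboveUpperFatalThresholdOEM".
	policy := map[string]map[string][]EventInfo{
		"Critical": {
			"SensorEvent.1.0.0.ReadingAboveUpperFatalThresholdOEM": {
				{UniqueString: "TEMP", Category: "Temperature sensor threshold Alerts", Subcategory: "Upper Fatal", DrainReasonPrefix: "Reboot needed"},
				{UniqueString: "VOLTAGE", Category: "Voltage Sensor threshold Alerts", Subcategory: "Upper Fatal", DrainReasonPrefix: "Reboot needed"},
			},
		},
	}
	message := "TEMP_GPU_0 reading crossed the upper fatal threshold" // fabricated
	for _, info := range policy["Critical"]["SensorEvent.1.0.0.ReadingAboveUpperFatalThresholdOEM"] {
		for _, s := range strings.Split(info.UniqueString, "|") {
			if strings.Contains(message, s) {
				// Same format as getDrainReasonPrefix in listener.go.
				fmt.Println("drain reason:", info.DrainReasonPrefix+": "+info.Category+": "+info.Subcategory)
				return
			}
		}
	}
	fmt.Println("no match; event ignored")
}
```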
diff --git a/redfish-exporter/redfish_utils.go b/redfish-exporter/redfish_utils.go
index 8d70b86..ffefab7 100644
--- a/redfish-exporter/redfish_utils.go
+++ b/redfish-exporter/redfish_utils.go
@@ -19,12 +19,18 @@ package main
 import (
 	"fmt"
 	"log"
+	"strconv"
 	"sync"
+	"time"
 
 	"github.com/stmcginnis/gofish"
 	"github.com/stmcginnis/gofish/redfish"
 )
 
+const (
+	PeriodicRetryTime = 30 // seconds between retry sweeps for failed subscriptions
+)
+
 type RedfishServer struct {
 	IP        string `json:"ip"`
 	Username  string `json:"username"`
@@ -33,6 +39,12 @@ type RedfishServer struct {
 	SlurmNode string `json:"slurmNode"`
 }
 
+type RedfishServersCommonConfig struct {
+	HostSuffix string `json:"hostSuffix"`
+	UserName   string `json:"username"`
+	Password   string `json:"password"`
+}
+
 type SubscriptionPayload struct {
 	Destination string              `json:"Destination,omitempty"`
 	EventTypes  []redfish.EventType `json:"EventTypes,omitempty"`
@@ -45,13 +57,27 @@ type SubscriptionPayload struct {
 	Context     string              `json:"Context,omitempty"`
 }
 
+type RedfishSubscriptionFailedData struct {
+	server  RedfishServer
+	payload SubscriptionPayload
+}
+
 // Create a new connection to a redfish server
-func getRedfishClient(server RedfishServer) (*gofish.APIClient, error) {
+func getRedfishClient(server RedfishServer, tlsTimeout string) (*gofish.APIClient, error) {
+	timeOut := 0
+	if tlsTimeout != "" {
+		t, err := strconv.Atoi(tlsTimeout)
+		if err == nil {
+			timeOut = t
+		}
+	}
+
 	clientConfig := gofish.ClientConfig{
-		Endpoint: server.IP,
-		Username: server.Username,
-		Password: server.Password,
-		Insecure: true, // TODO Set Based on login type
+		Endpoint:            server.IP,
+		Username:            server.Username,
+		Password:            server.Password,
+		Insecure:            true, // TODO Set Based on login type
+		TLSHandshakeTimeout: timeOut,
 	}
 
 	c, err := gofish.Connect(clientConfig)
@@ -129,55 +155,55 @@ func createLegacySubscription(eventService *redfish.EventService, SubscriptionPa
 
-// Create subscriptions for all servers and return their URIs
-// Rollback if any subscription attempt fails
-func CreateSubscriptionsForAllServers(redfishServers []RedfishServer, subscriptionPayload SubscriptionPayload) (map[string]string, error) {
-	var wg sync.WaitGroup
-	var mu sync.Mutex // to guard access to the map
-
-	subscriptionMap := make(map[string]string)
-
-	errChan := make(chan error, len(redfishServers))
-
+// Create subscriptions for all servers, recording each URI in subscriptionMap
+// Failed attempts are handed to periodicSubscriptionRetry and retried in the background
+func CreateSubscriptionsForAllServers(redfishServers []RedfishServer, subscriptionPayload SubscriptionPayload, subscriptionMap map[string]string, mu *sync.Mutex, tlsTimeout string) error {
+	failedSubsChan := make(chan RedfishSubscriptionFailedData)
 	for _, server := range redfishServers {
-		wg.Add(1)
-		go func(server RedfishServer) {
-			defer wg.Done()
+		go doSubscription(server, subscriptionPayload, subscriptionMap, mu, failedSubsChan, tlsTimeout)
+	}
 
-			c, err := getRedfishClient(server)
-			if err != nil {
-				errChan <- fmt.Errorf("failed to connect to server %s: %v", server.IP, err)
-				return
-			}
-			defer c.Logout()
+	go periodicSubscriptionRetry(failedSubsChan, subscriptionMap, mu, tlsTimeout)
+	return nil
+}
 
-			subscriptionURI, err := createSubscription(c, server, subscriptionPayload)
-			if err != nil {
-				errChan <- fmt.Errorf("subscription failed on server %s: %v", server.IP, err)
-				return
-			}
-			mu.Lock()
-			subscriptionMap[server.IP] = subscriptionURI
-			mu.Unlock()
-			log.Printf("Successfully created subscription on Redfish server %s: %s", server.IP, subscriptionURI)
-		}(server)
-	}
+func periodicSubscriptionRetry(failedSubsChan chan RedfishSubscriptionFailedData, subscriptionMap map[string]string, mu *sync.Mutex, tlsTimeout string) {
+	failedSubsMap := map[string]RedfishSubscriptionFailedData{}
 
-	wg.Wait()
-	close(errChan)
+	ticker := time.NewTicker(PeriodicRetryTime * time.Second)
+	defer ticker.Stop()
 
-	// Any error that occurred during the subscription process
-	var allErrors []string
-	for err := range errChan {
-		if err != nil {
-			allErrors = append(allErrors, err.Error())
+	for {
+		select {
+		case <-ticker.C:
+			for ip, data := range failedSubsMap {
+				log.Printf("Retrying subscription for: %v", ip)
+				go doSubscription(data.server, data.payload, subscriptionMap, mu, failedSubsChan, tlsTimeout)
+				delete(failedSubsMap, ip)
+			}
+		case data := <-failedSubsChan:
+			failedSubsMap[data.server.IP] = data
 		}
 	}
+}
 
-	if len(allErrors) > 0 {
-		DeleteSubscriptionsFromAllServers(redfishServers, subscriptionMap)
-		return nil, fmt.Errorf("subscription process encountered errors: %s", allErrors)
+func doSubscription(server RedfishServer, subscriptionPayload SubscriptionPayload, subscriptionMap map[string]string, mu *sync.Mutex, failedSubsChan chan RedfishSubscriptionFailedData, tlsTimeout string) {
+	c, err := getRedfishClient(server, tlsTimeout)
+	if err != nil {
+		log.Printf("[error] failed to connect to server %s: %v", server.IP, err)
+		failedSubsChan <- RedfishSubscriptionFailedData{server: server, payload: subscriptionPayload}
+		return
 	}
+	defer c.Logout()
 
-	return subscriptionMap, nil
+	subscriptionURI, err := createSubscription(c, server, subscriptionPayload)
+	if err != nil {
+		log.Printf("[error] subscription failed on server %s: %v", server.IP, err)
+		failedSubsChan <- RedfishSubscriptionFailedData{server: server, payload: subscriptionPayload}
+		return
+	}
+	mu.Lock()
+	subscriptionMap[server.IP] = subscriptionURI
+	mu.Unlock()
+	log.Printf("Successfully created subscription on Redfish server %s: %s", server.IP, subscriptionURI)
 }
 
 // Delete all event subscriptions stored in the map
@@ -192,7 +218,7 @@ func DeleteSubscriptionsFromAllServers(redfishServers []RedfishServer, subscript
 			defer wg.Done()
 
 			server := getServerInfo(redfishServers, serverIP)
-			c, err := getRedfishClient(server)
+			c, err := getRedfishClient(server, "")
 			if err != nil {
 				log.Printf("Failed to connect to server %s: %v", server.IP, err)
 				return
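The subscription workflow above no longer fails fast: failed attempts are parked in a map keyed by server IP and replayed on every ticker fire, and a retry that fails re-enters the map via the channel. The loop in miniature (interval shortened, types simplified to plain strings):

```go
package main

import (
	"fmt"
	"time"
)

func main() {
	failed := make(chan string)
	go func() {
		pending := map[string]bool{}
		ticker := time.NewTicker(200 * time.Millisecond) // stand-in for PeriodicRetryTime
		defer ticker.Stop()
		for {
			select {
			case <-ticker.C:
				for ip := range pending {
					fmt.Println("retrying subscription for", ip)
					delete(pending, ip) // a failed retry would re-enter via the channel
				}
			case ip := <-failed:
				pending[ip] = true
			}
		}
	}()
	failed <- "10.0.0.1" // fabricated BMC IP whose first attempt failed
	time.Sleep(500 * time.Millisecond)
}
```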
diff --git a/redfish-exporter/slurm/queue.go b/redfish-exporter/slurm/queue.go
index cd40594..209259b 100644
--- a/redfish-exporter/slurm/queue.go
+++ b/redfish-exporter/slurm/queue.go
@@ -2,6 +2,7 @@ package slurm
 
 import (
 	"context"
+	"fmt"
 	"log"
 	"strings"
@@ -9,14 +10,32 @@ import (
 )
 
 const (
-	Drain = "DrainNode"
+	Drain            = "DrainNode"
+	ExcludeReasonSet = "DRAIN_EXCLUDE_REASON_SET"
 )
 
+type AddEventReq struct {
+	RedfishServerIP string
+	SlurmNodeName   string
+	Severity        string
+	Action          string
+	DrainReason     string
+	MessageId       string
+	Message         string
+	ExcludeStr      string
+	ScontrolPath    string
+}
+
 type eventsActionReq struct {
 	redfishServerIP string
 	slurmNodeName   string
 	severity        string
 	action          string
+	drainReason     string
+	messageId       string
+	message         string
+	excludeStr      string
+	scontrolPath    string
 }
 
 type SlurmQueue struct {
@@ -28,12 +47,14 @@ func InitSlurmQueue(ctx context.Context) *SlurmQueue {
 	return &SlurmQueue{ctx: ctx, queue: make(chan *eventsActionReq)}
 }
 
-func (q *SlurmQueue) Add(redfishServerIP, slurmNodeName, severity, action string) {
+func (q *SlurmQueue) Add(evt AddEventReq) {
 	q.queue <- &eventsActionReq{
-		redfishServerIP: redfishServerIP,
-		slurmNodeName:   slurmNodeName,
-		severity:        severity,
-		action:          action,
+		redfishServerIP: evt.RedfishServerIP,
+		slurmNodeName:   evt.SlurmNodeName,
+		severity:        evt.Severity,
+		drainReason:     evt.DrainReason,
+		excludeStr:      evt.ExcludeStr,
+		scontrolPath:    evt.ScontrolPath,
 	}
 }
 
@@ -52,35 +73,37 @@ func (q *SlurmQueue) ProcessEventActionQueue() {
 				actionReq.redfishServerIP,
 				actionReq.slurmNodeName,
 				actionReq.severity,
-				actionReq.action).Inc()
-			return
+				"Drain").Inc()
+		} else {
+			metrics.SlurmAPISuccessMetric.
+				WithLabelValues(
+					actionReq.redfishServerIP,
+					actionReq.slurmNodeName,
+					actionReq.severity,
+					"Drain").Inc()
 		}
-		metrics.SlurmAPISuccessMetric.
-			WithLabelValues(
-				actionReq.redfishServerIP,
-				actionReq.slurmNodeName,
-				actionReq.severity,
-				actionReq.action).Inc()
 	}
 }
 
+func getDrainReasonString(prefix, msg, msgId, severity string) string {
+	ret := fmt.Sprintf("%s:redfishlistener:%s:%s:%s", prefix, severity, msgId, msg)
+	return ret
+}
+
 func (q *SlurmQueue) performEventAction(req *eventsActionReq) error {
 	if len(strings.TrimSpace(req.slurmNodeName)) == 0 {
 		return nil
 	}
 
-	slurmClient := GetClient()
-	if slurmClient == nil {
-		return nil
-	}
-
-	if req.action == Drain {
-		err := slurmClient.DrainNode(req.slurmNodeName)
-		if err != nil {
-			log.Printf("Error draining node: %v", err)
-			return err
+	err := DrainNodeWithScontrol(req.slurmNodeName, req.drainReason, req.excludeStr, req.scontrolPath)
+	if err != nil {
+		if strings.Contains(err.Error(), ExcludeReasonSet) {
+			log.Printf("Node not drained: %v", err.Error())
+			return nil
 		}
+		log.Printf("Error draining node: %v", err)
+		return err
 	}
 
-	log.Printf("Performed action: %v on slurm node: %v successfully", req.action, req.slurmNodeName)
+	log.Printf("Performed drain action on slurm node: %v successfully", req.slurmNodeName)
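How a caller now hands a drain request to the queue with the reworked API. The values below are fabricated; running this would actually invoke scontrol through the queue consumer, so treat it as a sketch of the call shape, not a test:

```go
package main

import (
	"context"
	"time"

	"github.com/nod-ai/ADA/redfish-exporter/slurm"
)

func main() {
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	queue := slurm.InitSlurmQueue(ctx)
	go queue.ProcessEventActionQueue()

	// Add blocks until the consumer goroutine receives the request
	// (the queue channel is unbuffered).
	queue.Add(slurm.AddEventReq{
		RedfishServerIP: "https://10.0.0.1",
		SlurmNodeName:   "node001",
		Severity:        "Critical",
		DrainReason:     "Reboot needed: GPU Error Events: OAM CTF",
		ExcludeStr:      "AMD|Pensando|RebootNeeded",
		ScontrolPath:    "/usr/bin/scontrol",
	})
	time.Sleep(time.Second) // give the drain attempt a moment before exiting
}
```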
diff --git a/redfish-exporter/slurm/slurm.go b/redfish-exporter/slurm/slurm.go
index abbfd08..cba3cb2 100644
--- a/redfish-exporter/slurm/slurm.go
+++ b/redfish-exporter/slurm/slurm.go
@@ -2,10 +2,14 @@ package slurm
 
 import (
 	"context"
+	"encoding/json"
 	"fmt"
 	"log"
 	"math"
 	"net/http"
+	"os/exec"
+	"regexp"
+	"strings"
 	"time"
 
 	"github.com/nod-ai/ADA/redfish-exporter/api/generated/slurmrestdapi"
@@ -67,6 +71,7 @@ func NewClient(slurmControlNode, slurmUser, slurmToken string) (*Client, error)
 	c := &Client{apiClient: cl}
 
 	log.Printf("[slurm] created slurm client for node: %v\n", slurmControlNode)
+
 	err := c.getConnectionStatus()
 	if err != nil {
 		log.Printf("[slurm] error in getting the connection status of the slurm node: %v, err: %+v\n", slurmControlNode, err)
@@ -105,12 +110,14 @@ func (c *Client) ResumeNode(nodeName string) error {
 	return nil
 }
 
-func (c *Client) DrainNode(nodeName string) error {
+func (c *Client) DrainNodeWithAPI(nodeName, reason, excludeStr, scontrolPath string) error {
 	apiCall := func() (interface{}, *http.Response, error) {
+		uid := "0"
 		ctx, cancel := context.WithTimeout(context.Background(), 1*time.Second)
 		//slurm_v0039_update_node
 		jreq := c.apiClient.SlurmAPI.SlurmV0039UpdateNode(ctx, nodeName)
-		req := slurmrestdapi.V0039UpdateNodeMsg{State: []string{"drain"}}
+		req := slurmrestdapi.V0039UpdateNodeMsg{State: []string{"drain"}, Reason: &reason, ReasonUid: &uid}
+
 		jreq = jreq.V0039UpdateNodeMsg(req)
 		res, resp, err := c.apiClient.SlurmAPI.SlurmV0039UpdateNodeExecute(jreq)
 		cancel()
@@ -122,6 +129,14 @@ func (c *Client) DrainNode(nodeName string) error {
 		return res, resp, nil
 	}
 
+	curReason, err := c.GetNodeReasonWithAPI(nodeName)
+	if err != nil {
+		return err
+	}
+	log.Printf("node: %v, Reason: %v", nodeName, curReason)
+	if strings.Contains(curReason, excludeStr) {
+		return fmt.Errorf("%s: not draining node: %s | current reason: %s", ExcludeReasonSet, nodeName, curReason)
+	}
 	_, resp, err := CallWithRetry(apiCall, maxRetries, baseDelay)
 	if err != nil {
 		return err
@@ -131,7 +146,33 @@ func (c *Client) DrainNode(nodeName string) error {
 	return nil
 }
 
-func (c *Client) GetNodes() ([]string, error) {
+func DrainNodeWithScontrol(nodeName, reason, excludeStr, scontrolPath string) error {
+
+	if excludeStr != "" {
+		curReason, err := GetNodeReasonWithScontrol(nodeName, scontrolPath)
+		if err != nil {
+			log.Printf("GetNodeReasonWithScontrol returned err: %v\n", err)
+			return err
+		}
+
+		if curReason != "" {
+			re := regexp.MustCompile(excludeStr)
+			match := re.FindAllString(curReason, -1)
+
+			if len(match) != 0 {
+				log.Printf("excludeStr: %v, curReason: %v", excludeStr, curReason)
+				log.Printf("match: %v | len: %v", match, len(match))
+				return fmt.Errorf("%s: not draining node: %s | current reason: %s", ExcludeReasonSet, nodeName, curReason)
+			}
+		}
+	}
+	cmd := fmt.Sprintf("%s update NodeName=%s State=DRAIN Reason=\"%s\"", scontrolPath, nodeName, reason)
+	res := LocalCommandOutput(cmd)
+	log.Printf("Drain node result: %s", res)
+	return nil
+}
+
+func (c *Client) GetNodesWithAPI() ([]string, error) {
 	var nodes []string
 	apiCall := func() (interface{}, *http.Response, error) {
 		ctx, cancel := context.WithTimeout(context.Background(), 1*time.Second)
@@ -160,6 +201,64 @@ func (c *Client) GetNodes() ([]string, error) {
 	return nodes, nil
 }
 
+func (c *Client) GetNodeReasonWithAPI(nodeName string) (string, error) {
+	var reason string
+	apiCall := func() (interface{}, *http.Response, error) {
+		ctx, cancel := context.WithTimeout(context.Background(), 1*time.Second)
+		jreq := c.apiClient.SlurmAPI.SlurmV0039GetNode(ctx, nodeName)
+		res, resp, err := c.apiClient.SlurmAPI.SlurmV0039GetNodeExecute(jreq)
+		cancel()
+		if err != nil {
+			return res, resp, err
+		} else if resp.StatusCode != 200 {
+			return res, resp, fmt.Errorf("invalid status code: %v", resp.StatusCode)
+		}
+		return res, resp, nil
+	}
+
+	res, resp, err := CallWithRetry(apiCall, maxRetries, baseDelay)
+	if err != nil {
+		return reason, err
+	}
+	defer resp.Body.Close()
+
+	temp := res.(*slurmrestdapi.V0039NodesResponse)
+	nodes := temp.GetNodes()
+	if len(nodes) != 1 {
+		return reason, fmt.Errorf("GetNodeReason failed")
+	}
+
+	reason = *nodes[0].Reason
+	log.Printf("[slurm] get node reasons(%s): %+v\n", nodeName, reason)
+	return reason, nil
+}
+
+func GetNodeReasonWithScontrol(nodeName, scontrolPath string) (string, error) {
+	type scontrolShowNode struct {
+		Nodes []struct {
+			Reason string `json:"reason"`
+		} `json:"nodes"`
+	}
+
+	cmd := fmt.Sprintf("%s show node %s --json", scontrolPath, nodeName)
+	ret := LocalCommandOutput(cmd)
+
+	if ret == "" {
+		return "", fmt.Errorf("failed to get current node reason")
+	}
+
+	res := scontrolShowNode{}
+	err := json.Unmarshal([]byte(ret), &res)
+	if err != nil {
+		return "", err
+	}
+	if len(res.Nodes) != 1 {
+		return "", fmt.Errorf("show node failed for %s", nodeName)
+	}
+	log.Printf("get node reasons(%s): %+v\n", nodeName, res.Nodes[0].Reason)
+	return res.Nodes[0].Reason, nil
+}
+
 func (c *Client) getConnectionStatus() error {
 	apiCall := func() (interface{}, *http.Response, error) {
 		ctx, cancel := context.WithTimeout(context.Background(), 1*time.Second)
@@ -196,3 +295,10 @@ func createRestClient(c *SlurmServerConfig) *slurmrestdapi.APIClient {
 	client := slurmrestdapi.NewAPIClient(cfg)
 	return client
 }
+
+// LocalCommandOutput runs a command locally and returns its combined output as a trimmed string
+func LocalCommandOutput(command string) string {
+	log.Printf("Running cmd: %s\n", command)
+	out, _ := exec.Command("bash", "-c", command).CombinedOutput()
+	return strings.TrimSpace(string(out))
+}
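GetNodeReasonWithScontrol only consumes the nodes[0].reason field of `scontrol show node <name> --json`. A minimal sketch of that parsing step with fabricated, heavily trimmed scontrol output:

```go
package main

import (
	"encoding/json"
	"fmt"
	"log"
)

// A fabricated, trimmed stand-in for `scontrol show node node001 --json` output.
const out = `{"nodes":[{"name":"node001","reason":"Reboot needed: GPU Error Events: OAM CTF"}]}`

func main() {
	// Same anonymous shape as the scontrolShowNode struct in slurm.go.
	var res struct {
		Nodes []struct {
			Reason string `json:"reason"`
		} `json:"nodes"`
	}
	if err := json.Unmarshal([]byte(out), &res); err != nil {
		log.Fatal(err)
	}
	if len(res.Nodes) != 1 {
		log.Fatal("unexpected node count")
	}
	// This reason is what the SLURM_DRAIN_EXCLUDE_REASON_LIST regex is matched against.
	fmt.Println("current reason:", res.Nodes[0].Reason)
}
```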