Update the code to use scontrol in place of slurm APIs to drain the #74

Open · wants to merge 2 commits into main
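This PR replaces Slurm REST API calls with direct scontrol invocations when draining a node. The slurm package changes that actually run scontrol are not part of the excerpt below; for orientation only, a drain via scontrol from Go could look like the following minimal sketch (the path, node name, and reason are placeholders, not the PR's actual helper):

package main

import (
	"fmt"
	"log"
	"os/exec"
)

// drainNode shells out to scontrol, roughly equivalent to:
//   scontrol update NodeName=<node> State=DRAIN Reason="<reason>"
func drainNode(scontrolPath, node, reason string) error {
	cmd := exec.Command(scontrolPath, "update", "NodeName="+node, "State=DRAIN", "Reason="+reason)
	out, err := cmd.CombinedOutput()
	if err != nil {
		return fmt.Errorf("scontrol drain failed: %v | output: %s", err, out)
	}
	log.Printf("drained node %s: %s", node, out)
	return nil
}

func main() {
	if err := drainNode("/usr/bin/scontrol", "node001", "RebootNeeded: example"); err != nil {
		log.Fatal(err)
	}
}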
29 changes: 18 additions & 11 deletions redfish-exporter/.env
@@ -1,18 +1,20 @@
UPDATED="2024-09-24"
DESCRIPTION="Redfish Event Listener/Exporter"
LISTENER_IP="0.0.0.0"
LISTENER_PORT="8080"
LISTENER_IP="10.11.18.55"
LISTENER_PORT="9003"
METRICS_PORT="2112"
USE_SSL="false"
CERTFILE="path/to/certfile"
KEYFILE="path/to/keyfile"
SLURM_USER="slurm user here"
SLURM_TOKEN="token string here, from secret when for real"
SLURM_CONTROL_NODE="slurm control node IP:Port"
SLURM_CONTROL_NODE="10.235.34.47"
SLURM_DRAIN_EXCLUDE_REASON_LIST="AMD|Pensando|RebootNeeded"
SLURM_SCONTROL_PATH="/usr/bin/scontrol"
TLS_TIMEOUT="15"

TRIGGER_EVENTS="[\
{\"Severity\":\"Fatal\",\"Action\":\"DrainNode\"},\
{\"Severity\":\"Critical\",\"Action\":\"DrainNode\"}
{\"Severity\":\"Critical\",\"Message\":\"Image 'UBB_FPGA' is being verified at 'ERoT'|This is an e2e critical test event\",\"Action\":\"DrainNode\", \"DrainReasonPrefix\":\"RebootNeeded\"},\
{\"Severity\":\"Info\",\"Message\":\"Image 'UBB_FPGA' is being verified at 'ERoT'\",\"Action\":\"DrainNode\", \"DrainReasonPrefix\":\"RebootNotNeeded\"},\
{\"Severity\":\"Warning\",\"Message\":\"Image 'UBB_FPGA' is being verified at 'ERoT'|This is an e2e test event message\",\"Action\":\"DrainNode\", \"DrainReasonPrefix\":\"RebootNotNeeded\"}
]"

# Subscription (v1.5+)
@@ -28,11 +30,11 @@ TRIGGER_EVENTS="[\

# Deprecated <v1.5
SUBSCRIPTION_PAYLOAD="{\
\"Destination\":\"http://host.docker.internal:8080\",\
\"EventTypes\":[\"Alert\",\"StatusChange\"],\
\"Destination\":\"http://10.11.18.55:9003\",\
\"EventTypes\":[\"Alert\"],\
\"Protocol\":\"Redfish\",\
\"Context\":\"YourContextData\",\
\"Oem\":{\"Supermicro\":{\"EnableSubscription\":true}}\
\"Oem\":{\"Supermicro\": {\"EnableSubscription\": true}}\
}"

# Config for setting default labels in Prometheus counter metrics.
@@ -41,5 +43,10 @@ PROMETHEUS_CONFIG="{\
}"

REDFISH_SERVERS="[\
{\"ip\":\"http://127.0.0.1:8000\",\"username\":\"Username1\",\"password\":\"Password1\",\"loginType\":\"Session\",\"slurmNode\":\"Node1\"}
{\"ip\":\"https://10.235.37.54\",\"username\":\"ADMIN\",\"password\":\"PHHCJZUHDV\",\"loginType\":\"Session\",\"slurmNode\":\"smc300x-ccs-aus-GPUFCE9\"},
{\"ip\":\"https://10.235.37.48\",\"username\":\"ADMIN\",\"password\":\"PHHCJZUHDV\",\"loginType\":\"Session\",\"slurmNode\":\"smc300x-ccs-aus-GPUFCE9\"}

]"

REDFISH_SERVERS_COMMON_CONFIG="{\
\"hostSuffix\":\"ipmi.cluster\",\"username\":\"<username>\",\"password\":\"<password>\"}"
176 changes: 153 additions & 23 deletions redfish-exporter/config.go
@@ -19,19 +19,23 @@ package main
import (
"crypto/tls"
"encoding/json"
"fmt"
"log"
"net"
"os"
"strconv"
"strings"

"github.com/joho/godotenv"
"gopkg.in/yaml.v3"
)

const (
DefaultListenerPort = "8080"
DefaultMetricsPort = "2112"
DefaultUseSSL = "false"
DefaultSeverityConfig = "Fatal,Critical,Informational"
NodeDrainPolicyFile = "nodeDrainPolicy.json"
)

type Config struct {
@@ -49,28 +53,56 @@ type Config struct {
CertFile string
KeyFile string
}
SlurmToken string
SlurmControlNode string
SlurmUser string
SubscriptionPayload SubscriptionPayload
RedfishServers []RedfishServer
TriggerEvents []TriggerEvent
PrometheusConfig PrometheusConfig
context *tls.Config
eventCount int
dataBuffer []byte
SlurmToken string
SlurmControlNode string
SlurmUser string
SlurmScontrolPath string
SlurmDrainExcludeStr string
SubscriptionPayload SubscriptionPayload
RedfishServers []RedfishServer
TriggerEvents map[string]map[string][]EventInfo //map[Severity][MessageRegistry.MessageId][]EventInfo
PrometheusConfig PrometheusConfig
context *tls.Config
eventCount int
dataBuffer []byte
TlsTimeOut string
}

type EventInfo struct {
UniqueString string
Category string
Subcategory string
DrainReasonPrefix string
}

type TriggerEvent struct {
Severity string `json:"Severity"`
Action string `json:"Action"`
Severity string `json:"Severity"`
Action string `json:"Action"`
Message string `json:"Message"`
DrainReasonPrefix string `json:"DrainReasonPrefix"`
}

type TriggerEventsInfo struct {
Category string `json:"Category"`
Subcategory string `json:"Subcategory"`
MessageRegistry string `json:"MessageRegistry"`
MessageId string `json:"MessageId"`
UniqueString string `json:"UniqueString"`
Severity string `json:"Severity"`
DrainReasonPrefix string `json:"DrainReasonPrefix"`
Enable bool `json:"Enable"`
}

type PrometheusConfig struct {
Severity []string `json:"Severity"`
}

func setupConfig() Config {
type target struct {
Targets []string `yaml:"targets"`
Labels map[string]string `yaml:"labels"`
}

func setupConfig(targetFile string) Config {
// Load .env file
err := godotenv.Load()
if err != nil {
@@ -119,20 +151,15 @@ func setupConfig() Config {
AppConfig.SlurmToken = os.Getenv("SLURM_TOKEN")
AppConfig.SlurmControlNode = os.Getenv("SLURM_CONTROL_NODE")
AppConfig.SlurmUser = os.Getenv("SLURM_USER")
AppConfig.SlurmDrainExcludeStr = os.Getenv("SLURM_DRAIN_EXCLUDE_REASON_LIST")
AppConfig.SlurmScontrolPath = os.Getenv("SLURM_SCONTROL_PATH")
AppConfig.TlsTimeOut = os.Getenv("TLS_TIMEOUT")

subscriptionPayloadJSON := os.Getenv("SUBSCRIPTION_PAYLOAD")
if err := json.Unmarshal([]byte(subscriptionPayloadJSON), &AppConfig.SubscriptionPayload); err != nil {
log.Fatalf("Failed to parse SUBSCRIPTION_PAYLOAD: %v", err)
}

triggerEventsJSON := os.Getenv("TRIGGER_EVENTS")
if triggerEventsJSON != "" {
err = json.Unmarshal([]byte(triggerEventsJSON), &AppConfig.TriggerEvents)
if err != nil {
log.Fatalf("Failed to unmarshal TRIGGER_EVENTS: %v", err)
}
}

prometheusConfigJSON := os.Getenv("PROMETHEUS_CONFIG")
if prometheusConfigJSON != "" {
err = json.Unmarshal([]byte(prometheusConfigJSON), &AppConfig.PrometheusConfig)
@@ -148,10 +175,113 @@ func setupConfig() Config {
redfishServersJSON := os.Getenv("REDFISH_SERVERS")
if redfishServersJSON == "" {
log.Println("REDFISH_SERVERS environment variable is not set or is empty")
} else {
if err := json.Unmarshal([]byte(redfishServersJSON), &AppConfig.RedfishServers); err != nil {
log.Fatalf("Failed to parse REDFISH_SERVERS: %v", err)
}
}

// Read the node drain policy config file
nodeDrainPolicyConfig, err := os.ReadFile(NodeDrainPolicyFile)

if err != nil {
log.Fatalf("Failed to read: %v", NodeDrainPolicyFile)
}

triggerEventsInfo := []TriggerEventsInfo{}
err = json.Unmarshal(nodeDrainPolicyConfig, &triggerEventsInfo)
if err != nil {
log.Fatalf("Failed to unmarshal file: %v | err: %v", NodeDrainPolicyFile, err)
}

tInfoMap := map[string]map[string][]EventInfo{}

for _, evt := range triggerEventsInfo {
fmt.Printf("Trigger Event: %+v\n", evt)
if evt.Enable != true {
continue
}
eInfo := EventInfo{}
eInfo.Category = evt.Category
eInfo.Subcategory = evt.Subcategory
eInfo.DrainReasonPrefix = evt.DrainReasonPrefix
eInfo.UniqueString = evt.UniqueString
key := ""
if evt.MessageRegistry == "" {
key = evt.MessageId
} else {
key = evt.MessageRegistry + "." + evt.MessageId
}
if ee, ok := tInfoMap[evt.Severity]; !ok {
eInfoMap := map[string][]EventInfo{}
eInfoMap[key] = []EventInfo{eInfo}
tInfoMap[evt.Severity] = eInfoMap
} else {
ee[key] = append(ee[key], eInfo)
}
}

AppConfig.TriggerEvents = tInfoMap

for kk, tt := range AppConfig.TriggerEvents {
fmt.Println("Severity: ", kk)
for kkk, ttt := range tt {
fmt.Println("key: ", kkk)
fmt.Printf("event: %+v\n", ttt)
}
}

// Read and parse the REDFISH_SERVERS_COMMON_CONFIG environment variable
redfishServersCommonConfigJSON := os.Getenv("REDFISH_SERVERS_COMMON_CONFIG")
if redfishServersCommonConfigJSON == "" {
log.Println("redfishServersCommonConfigJSON environment variable is not set or is empty")
return AppConfig
}
if err := json.Unmarshal([]byte(redfishServersJSON), &AppConfig.RedfishServers); err != nil {
log.Fatalf("Failed to parse REDFISH_SERVERS: %v", err)
redfishServersCommonConfig := RedfishServersCommongConfig{}
if err := json.Unmarshal([]byte(redfishServersCommonConfigJSON), &redfishServersCommonConfig); err != nil {
log.Fatalf("Failed to parse REDFISH_SERVERS_COMMON_CONFIG: %v", err)
}

if targetFile == "" {
log.Println("No target file provided")
return AppConfig
}

targetYamlFile, err := os.ReadFile(targetFile)

if err != nil {
log.Fatalf("Failed to read file: %v", targetFile)
}

targets := []target{}

err = yaml.Unmarshal(targetYamlFile, &targets)

if err != nil {
log.Fatalf("Error parsing target file: %v | err: %v", targetFile, err)
}

for _, t := range targets {
log.Println("target: ", t.Targets)

for _, hostName := range t.Targets {
// add this target to Redfish servers
server := RedfishServer{}
bmcHost := fmt.Sprintf(hostName+".%v", redfishServersCommonConfig.HostSuffix)
ips, err := net.LookupIP(bmcHost)
if err != nil || len(ips) == 0 {
log.Printf("[error] Couldn't get the IP for host: %v | ips: %v | err: %v", bmcHost, ips, err)
continue
}
log.Println("IPs: ", ips)

server.IP = fmt.Sprintf("https://%v", ips[0])
server.LoginType = "Session"
server.Username = redfishServersCommonConfig.UserName
server.Password = redfishServersCommonConfig.Password
server.SlurmNode = hostName
AppConfig.RedfishServers = append(AppConfig.RedfishServers, server)
}
}

return AppConfig
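config.go now reads the trigger-event policy from a nodeDrainPolicy.json file at startup. Based on the TriggerEventsInfo struct above, an entry would look roughly like the following (the field values here are illustrative, not taken from the PR):

[
  {
    "Category": "Firmware",
    "Subcategory": "UBB_FPGA",
    "MessageRegistry": "iLOEvents",
    "MessageId": "2.13",
    "UniqueString": "Image 'UBB_FPGA' is being verified at 'ERoT'|This is an e2e critical test event",
    "Severity": "Critical",
    "DrainReasonPrefix": "RebootNeeded",
    "Enable": true
  }
]

Entries with Enable set to false are skipped, and the in-memory lookup key is MessageRegistry.MessageId, or just MessageId when MessageRegistry is empty.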
1 change: 1 addition & 0 deletions redfish-exporter/go.mod
@@ -9,6 +9,7 @@ require (
github.com/nod-ai/ADA/redfish-exporter v0.0.0-20241002210630-2ef2d1070d90
github.com/prometheus/client_golang v1.20.4
github.com/stmcginnis/gofish v0.19.0
gopkg.in/yaml.v3 v3.0.1
)

require (
3 changes: 3 additions & 0 deletions redfish-exporter/go.sum
@@ -26,3 +26,6 @@ golang.org/x/sys v0.22.0 h1:RI27ohtqKCnwULzJLqkv897zojh5/DwS/ENaMzUOaWI=
golang.org/x/sys v0.22.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
google.golang.org/protobuf v1.34.2 h1:6xV6lTsCfpGD21XK49h7MhtcApnLqkfYgPcdHftf6hg=
google.golang.org/protobuf v1.34.2/go.mod h1:qYOHts0dSfpeUzUFpOMr/WGzszTmLH+DiWniOlNbLDw=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
62 changes: 50 additions & 12 deletions redfish-exporter/listener.go
@@ -171,6 +171,37 @@ func (s *Server) handleConnection(AppConfig Config, conn net.Conn) {
}
}

func getDrainReasonPrefix(info EventInfo) string {
return info.DrainReasonPrefix + ": " + info.Category + ": " + info.Subcategory
}

func isTriggerEvent(evt Event, config Config) (bool, string) {
tInfoMap := config.TriggerEvents

if eInfoMap, ok := tInfoMap[evt.Severity]; !ok {
return false, ""
} else {
if eInfo, ok1 := eInfoMap[evt.MessageId]; !ok1 {
return false, ""
} else {
if len(eInfo) == 1 {
return true, getDrainReasonPrefix(eInfo[0])
} else {
for _, info := range eInfo {
strs := strings.Split(info.UniqueString, "|")
for _, str := range strs {
if strings.Contains(evt.Message, str) == true {
return true, getDrainReasonPrefix(info)
}
}

}
}
}
}
return false, ""
}

func (s *Server) processRequest(AppConfig Config, conn net.Conn, req *http.Request, eventCount *int, dataBuffer *[]byte) error {
// Extract method, headers, and payload
method := req.Method
@@ -217,19 +248,26 @@ func (s *Server) processRequest(AppConfig Config, conn net.Conn, req *http.Reque
log.Printf("Message ID: %s", messageId)
log.Printf("Message Args: %v", messageArgs)
log.Printf("Origin Of Condition: %s", originOfCondition)
for _, triggerEvent := range AppConfig.TriggerEvents {
if severity == triggerEvent.Severity {
log.Printf("Matched Trigger Event: %s with action %s", triggerEvent.Severity, triggerEvent.Action)
// Sending event belongs to redfish_utils. Each server may have different slurm node associated, and redfish_servers has the info/map.
if s.slurmQueue != nil {
redfishServerInfo := getServerInfoByIP(AppConfig.RedfishServers, ip)
if len(strings.TrimSpace(redfishServerInfo.SlurmNode)) == 0 {
log.Printf("failed to get the slurm node name, cannot perform action: %v", triggerEvent.Action)
break
}
s.slurmQueue.Add(redfishServerInfo.IP, redfishServerInfo.SlurmNode, triggerEvent.Severity, triggerEvent.Action)

trigger, drainReason := isTriggerEvent(event, AppConfig)
if trigger == true {
log.Printf("Matched Trigger Event: %s | messageId: %s | message: %s", event.Severity, event.MessageId, event.Message)
// Sending event belongs to redfish_utils. Each server may have different slurm node associated, and redfish_servers has the info/map.
if s.slurmQueue != nil {
redfishServerInfo := getServerInfoByIP(AppConfig.RedfishServers, ip)
if len(strings.TrimSpace(redfishServerInfo.SlurmNode)) == 0 {
log.Println("failed to get the slurm node name, cannot perform drain action")
continue
}
break
evt := slurm.AddEventReq{
RedfishServerIP: redfishServerInfo.IP,
SlurmNodeName: redfishServerInfo.SlurmNode,
Severity: event.Severity,
DrainReason: drainReason,
ExcludeStr: AppConfig.SlurmDrainExcludeStr,
ScontrolPath: AppConfig.SlurmScontrolPath,
}
s.slurmQueue.Add(evt)
}
}
}
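Putting the pieces together: an incoming event is matched first by Severity, then by its MessageId key, and, when several policy entries share that key, by the |-separated UniqueString fragments against the event Message; the resulting drain reason has the form "<DrainReasonPrefix>: <Category>: <Subcategory>". A hypothetical walk-through, as a sketch inside the exporter package with made-up event values:

// Sketch only: exercises isTriggerEvent with illustrative event data.
// cfg is the Config returned by setupConfig.
func exampleMatch(cfg Config) {
	evt := Event{
		Severity:  "Critical",
		MessageId: "iLOEvents.2.13",                     // hypothetical MessageRegistry.MessageId key
		Message:   "This is an e2e critical test event", // must contain one UniqueString fragment
	}
	if ok, reason := isTriggerEvent(evt, cfg); ok {
		log.Printf("drain with reason: %s", reason) // e.g. "RebootNeeded: Firmware: UBB_FPGA"
	}
}

Note that when only one policy entry exists for a key, isTriggerEvent returns a match without checking the message content at all.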