Skip to content

Commit

Permalink
Merge pull request #80 from Comcast/cpu_storcontroller_updates
Browse files Browse the repository at this point in the history
updates to storage controller and memory metrics
  • Loading branch information
ibrahimkk-moideen authored Jun 20, 2024
2 parents 8b8651a + a081449 commit 3e60fc2
Show file tree
Hide file tree
Showing 8 changed files with 225 additions and 29 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,9 @@ log is based on the [Keep a CHANGELOG](http://keepachangelog.com/) project.
- Add HPE Proliant XL420 Support [#33](https://github.com/Comcast/fishymetrics/issues/33)
- consolidate exporters into a single generic exporter [#52](https://github.com/Comcast/fishymetrics/issues/52)
- update Dockerfile to comply with opensource packaging requirements [#61](https://github.com/Comcast/fishymetrics/issues/61)
- Storage controller status metric for HP servers [#79](https://github.com/Comcast/fishymetrics/issues/79)
- Ignore CPU metrics if Processor is Absent [#79](https://github.com/Comcast/fishymetrics/issues/79)
- Added support for metrics collection from Dell servers [#77](https://github.com/Comcast/fishymetrics/issues/77)

## Fixed

Expand Down
68 changes: 64 additions & 4 deletions exporter/exporter.go
Original file line number Diff line number Diff line change
Expand Up @@ -99,11 +99,14 @@ type SystemEndpoints struct {
systems []string
power []string
thermal []string
volumes []string
virtualDrives []string
}

type DriveEndpoints struct {
logicalDriveURLs []string
physicalDriveURLs []string
arrayControllerURLs []string
logicalDriveURLs []string
physicalDriveURLs []string
}

type Excludes map[string]interface{}
Expand Down Expand Up @@ -238,8 +241,47 @@ func NewExporter(ctx context.Context, target, uri, profile, model string, exclud
return nil, err
}

// newer servers have volumes endpoint in storage controller, these volumes hold virtual drives member urls
if len(sysEndpoints.storageController) > 0 {
var controllerOutput oem.System
for _, controller := range sysEndpoints.storageController {
controllerOutput, err = getSystemsMetadata(exp.url+controller, target, retryClient)
if err != nil {
log.Error("error when getting storage controller metadata", zap.Error(err), zap.Any("trace_id", ctx.Value("traceID")))
return nil, err
}
if controllerOutput.Volumes.URL != "" {
url := appendSlash(controllerOutput.Volumes.URL)
if checkUnique(sysEndpoints.volumes, url) {
sysEndpoints.volumes = append(sysEndpoints.volumes, url)
}
}
}
}
if len(sysEndpoints.volumes) > 0 {
for _, volume := range sysEndpoints.volumes {
virtualDrives, err := getMemberUrls(exp.url+volume, target, retryClient)
if err != nil {
log.Error("error when getting virtual drive member urls", zap.Error(err), zap.Any("trace_id", ctx.Value("traceID")))
return nil, err
}
if len(virtualDrives) > 0 {
for _, virtualDrive := range virtualDrives {
if strings.Contains(virtualDrive, "Virtual") {
url := appendSlash(virtualDrive)
if checkUnique(sysEndpoints.virtualDrives, url) {
sysEndpoints.virtualDrives = append(sysEndpoints.virtualDrives, url)
}
}
}
}
}
}

log.Debug("systems endpoints response", zap.Strings("systems_endpoints", sysEndpoints.systems),
zap.Strings("storage_ctrl_endpoints", sysEndpoints.storageController),
zap.Strings("volumes_endpoints", sysEndpoints.volumes),
zap.Strings("virtual_drives_endpoints", sysEndpoints.virtualDrives),
zap.Strings("drives_endpoints", sysEndpoints.drives),
zap.Strings("power_endpoints", sysEndpoints.power),
zap.Strings("thermal_endpoints", sysEndpoints.thermal),
Expand Down Expand Up @@ -305,11 +347,24 @@ func NewExporter(ctx context.Context, target, uri, profile, model string, exclud
}
}

log.Debug("drive endpoints response", zap.Strings("logical_drive_endpoints", driveEndpointsResp.logicalDriveURLs),
if len(sysEndpoints.storageController) == 0 && ss == "" {
driveEndpointsResp, err = getAllDriveEndpoints(ctx, exp.url, exp.url+sysEndpoints.systems[0]+"Storage/", target, retryClient)
if err != nil {
log.Error("error when getting drive endpoints", zap.Error(err), zap.Any("trace_id", ctx.Value("traceID")))
return nil, err
}
}

log.Debug("drive endpoints response", zap.Strings("array_controller_endpoints", driveEndpointsResp.arrayControllerURLs),
zap.Strings("logical_drive_endpoints", driveEndpointsResp.logicalDriveURLs),
zap.Strings("physical_drive_endpoints", driveEndpointsResp.physicalDriveURLs),
zap.Any("trace_id", ctx.Value("traceID")))

// Loop through logicalDriveURLs, physicalDriveURLs, and nvmeDriveURLs and append each URL to the tasks pool
// Loop through arrayControllerURLs, logicalDriveURLs, physicalDriveURLs, and nvmeDriveURLs and append each URL to the tasks pool
for _, url := range driveEndpointsResp.arrayControllerURLs {
tasks = append(tasks, pool.NewTask(common.Fetch(exp.url+url, target, profile, retryClient), exp.url+url, handle(&exp, STORAGE_CONTROLLER)))
}

for _, url := range driveEndpointsResp.logicalDriveURLs {
tasks = append(tasks, pool.NewTask(common.Fetch(exp.url+url, target, profile, retryClient), exp.url+url, handle(&exp, LOGICALDRIVE)))
}
Expand All @@ -328,6 +383,11 @@ func NewExporter(ctx context.Context, target, uri, profile, model string, exclud
tasks = append(tasks, pool.NewTask(common.Fetch(exp.url+url, target, profile, retryClient), exp.url+url, handle(&exp, STORAGE_CONTROLLER)))
}

// virtual drives
for _, url := range sysEndpoints.virtualDrives {
tasks = append(tasks, pool.NewTask(common.Fetch(exp.url+url, target, profile, retryClient), exp.url+url, handle(&exp, LOGICALDRIVE)))
}

// power
for _, url := range sysEndpoints.power {
tasks = append(tasks, pool.NewTask(common.Fetch(exp.url+url, target, profile, retryClient), exp.url+url, handle(&exp, POWER)))
Expand Down
69 changes: 64 additions & 5 deletions exporter/handlers.go
Original file line number Diff line number Diff line change
Expand Up @@ -113,13 +113,24 @@ func (e *Exporter) exportPowerMetrics(body []byte) error {
for _, pv := range pm.Voltages {
if pv.Status.State == "Enabled" {
var volts float64
var upperThresCrit float64
switch pv.ReadingVolts.(type) {
case float64:
volts = pv.ReadingVolts.(float64)
case string:
volts, _ = strconv.ParseFloat(pv.ReadingVolts.(string), 32)
}
(*pow)["voltageOutput"].WithLabelValues(pv.Name, e.ChassisSerialNumber, e.Model).Set(volts)
switch pv.UpperThresholdCritical.(type) {
case float64:
upperThresCrit = pv.UpperThresholdCritical.(float64)
case string:
upperThresCrit, _ = strconv.ParseFloat(pv.UpperThresholdCritical.(string), 32)
}
if volts == 0 && upperThresCrit == 0 {
continue
} else {
(*pow)["voltageOutput"].WithLabelValues(pv.Name, e.ChassisSerialNumber, e.Model).Set(volts)
}
if pv.Status.Health == "OK" {
state = OK
} else {
Expand Down Expand Up @@ -199,7 +210,11 @@ func (e *Exporter) exportThermalMetrics(body []byte) error {
}

if fan.FanName != "" {
(*therm)["fanSpeed"].WithLabelValues(fan.FanName, e.ChassisSerialNumber, e.Model).Set(float64(fan.CurrentReading))
if float64(fan.CurrentReading) != 0 {
(*therm)["fanSpeed"].WithLabelValues(fan.FanName, e.ChassisSerialNumber, e.Model).Set(float64(fan.CurrentReading))
} else {
(*therm)["fanSpeed"].WithLabelValues(fan.FanName, e.ChassisSerialNumber, e.Model).Set(fanSpeed)
}
} else {
(*therm)["fanSpeed"].WithLabelValues(fan.Name, e.ChassisSerialNumber, e.Model).Set(fanSpeed)
}
Expand Down Expand Up @@ -291,10 +306,24 @@ func (e *Exporter) exportLogicalDriveMetrics(body []byte) error {
var state float64
var dllogical oem.LogicalDriveMetrics
var dllogicaldrive = (*e.DeviceMetrics)["logicalDriveMetrics"]
var ldName string
var raidType string
var volIdentifier string
err := json.Unmarshal(body, &dllogical)
if err != nil {
return fmt.Errorf("Error Unmarshalling LogicalDriveMetrics - " + err.Error())
}
if dllogical.Raid == "" {
ldName = dllogical.DisplayName
raidType = dllogical.RaidType
if len(dllogical.Identifiers) > 0 {
volIdentifier = dllogical.Identifiers[0].DurableName
}
} else {
ldName = dllogical.LogicalDriveName
raidType = dllogical.Raid
volIdentifier = dllogical.VolumeUniqueIdentifier
}
// Check logical drive is enabled then check status and convert string to numeric values
if dllogical.Status.State == "Enabled" {
if dllogical.Status.Health == "OK" {
Expand All @@ -306,7 +335,7 @@ func (e *Exporter) exportLogicalDriveMetrics(body []byte) error {
state = DISABLED
}

(*dllogicaldrive)["raidStatus"].WithLabelValues(dllogical.Name, e.ChassisSerialNumber, e.Model, dllogical.LogicalDriveName, dllogical.VolumeUniqueIdentifier, dllogical.Raid).Set(state)
(*dllogicaldrive)["raidStatus"].WithLabelValues(dllogical.Name, e.ChassisSerialNumber, e.Model, ldName, volIdentifier, raidType).Set(state)
return nil
}

Expand Down Expand Up @@ -374,13 +403,26 @@ func (e *Exporter) exportStorageControllerMetrics(body []byte) error {
if sc.Status.State == "Enabled" {
if sc.Status.Health == "OK" {
state = OK
} else if sc.Status.Health == "" && sc.Status.HealthRollup == "" {
continue
} else {
state = BAD
}
(*drv)["storageControllerStatus"].WithLabelValues(scm.Name, e.ChassisSerialNumber, e.Model, sc.FirmwareVersion, sc.Model).Set(state)
}
}

if len(scm.StorageController.StorageController) == 0 {
if scm.Status.State == "Enabled" {
if scm.Status.Health == "OK" {
state = OK
} else {
state = BAD
}
(*drv)["storageControllerStatus"].WithLabelValues(scm.Name, e.ChassisSerialNumber, e.Model, scm.ControllerFirmware.FirmwareVersion, scm.Model).Set(state)
}
}

return nil
}

Expand All @@ -390,18 +432,29 @@ func (e *Exporter) exportMemorySummaryMetrics(body []byte) error {
var state float64
var dlm oem.System
var dlMemory = (*e.DeviceMetrics)["memoryMetrics"]
var totalSystemMemoryGiB string
err := json.Unmarshal(body, &dlm)
if err != nil {
return fmt.Errorf("Error Unmarshalling MemorySummaryMetrics - " + err.Error())
}
// Check memory status and convert string to numeric values
if dlm.MemorySummary.Status.HealthRollup == "OK" {
// Ignore memory summary if status is not present
if dlm.MemorySummary.Status.HealthRollup == "" {
return nil
} else if dlm.MemorySummary.Status.HealthRollup == "OK" {
state = OK
} else {
state = BAD
}

(*dlMemory)["memoryStatus"].WithLabelValues(e.ChassisSerialNumber, e.Model, strconv.Itoa(dlm.MemorySummary.TotalSystemMemoryGiB)).Set(state)
switch dlm.MemorySummary.TotalSystemMemoryGiB.(type) {
case int:
totalSystemMemoryGiB = strconv.Itoa(dlm.MemorySummary.TotalSystemMemoryGiB.(int))
case float64:
totalSystemMemoryGiB = strconv.FormatFloat(dlm.MemorySummary.TotalSystemMemoryGiB.(float64), 'f', -1, 64)
}

(*dlMemory)["memoryStatus"].WithLabelValues(e.ChassisSerialNumber, e.Model, totalSystemMemoryGiB).Set(state)

return nil
}
Expand Down Expand Up @@ -566,6 +619,12 @@ func (e *Exporter) exportProcessorMetrics(body []byte) error {
case int:
totCores = strconv.Itoa(pm.TotalCores.(int))
}

// Ignore metrics if processor is absent
if pm.Status.State == "Absent" {
return nil
}

if pm.Status.Health == "OK" {
state = OK
} else {
Expand Down
37 changes: 37 additions & 0 deletions exporter/helpers.go
Original file line number Diff line number Diff line change
Expand Up @@ -334,6 +334,43 @@ func getAllDriveEndpoints(ctx context.Context, fqdn, initialUrl, host string, cl
return driveEndpoints, err
}

// This if condition is for servers with iLO6. Gather metrics only from controllers with drives
// /redfish/v1/Systems/XXXX/Storage/XXXXX/
if len(arrayCtrlResp.StorageDrives) > 0 {
for _, member := range arrayCtrlResp.StorageDrives {
driveEndpoints.physicalDriveURLs = append(driveEndpoints.physicalDriveURLs, appendSlash(member.URL))
}

// If Volumes are present, parse volumes endpoint until all urls are found
if arrayCtrlResp.Volumes.URL != "" {
volumeOutput, err := getDriveEndpoint(fqdn+arrayCtrlResp.Volumes.URL, host, client)
if err != nil {
log.Error("api call "+fqdn+arrayCtrlResp.Volumes.URL+" failed", zap.Error(err), zap.Any("trace_id", ctx.Value("traceID")))
return driveEndpoints, err
}

for _, member := range volumeOutput.Members {
driveEndpoints.logicalDriveURLs = append(driveEndpoints.logicalDriveURLs, appendSlash(member.URL))
}
}

if arrayCtrlResp.Controllers.URL != "" {
controllerOutput, err := getDriveEndpoint(fqdn+arrayCtrlResp.Controllers.URL, host, client)
if err != nil {
log.Error("api call "+fqdn+arrayCtrlResp.Controllers.URL+" failed", zap.Error(err), zap.Any("trace_id", ctx.Value("traceID")))
return driveEndpoints, err
}

for _, member := range controllerOutput.Members {
driveEndpoints.arrayControllerURLs = append(driveEndpoints.arrayControllerURLs, appendSlash(member.URL))
}
}
} else if arrayCtrlResp.LinksUpper.PhysicalDrives.URL != "" || arrayCtrlResp.LinksLower.PhysicalDrives.URL != "" {
// /redfish/v1/Systems/XXXX/SmartStorage/ArrayControllers/X/
driveEndpoints.arrayControllerURLs = append(driveEndpoints.arrayControllerURLs, appendSlash(member.URL))
}

// all other servers apart from iLO6
// If LogicalDrives is present, parse logical drive endpoint until all urls are found
if arrayCtrlResp.LinksUpper.LogicalDrives.URL != "" {
logicalDriveOutput, err := getDriveEndpoint(fqdn+arrayCtrlResp.LinksUpper.LogicalDrives.URL, host, client)
Expand Down
41 changes: 26 additions & 15 deletions oem/drive.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,17 +44,24 @@ type NVMeDriveMetrics struct {
// Logical Drives
// /redfish/v1/Systems/X/SmartStorage/ArrayControllers/X/LogicalDrives/X/
type LogicalDriveMetrics struct {
Id string `json:"Id"`
CapacityMiB int `json:"CapacityMiB"`
Description string `json:"Description"`
InterfaceType string `json:"InterfaceType"`
LogicalDriveName string `json:"LogicalDriveName"`
LogicalDriveNumber int `json:"LogicalDriveNumber"`
Name string `json:"Name"`
Raid string `json:"Raid"`
Status Status `json:"Status"`
StripeSizebytes int `json:"StripeSizebytes"`
VolumeUniqueIdentifier string `json:"VolumeUniqueIdentifier"`
Id string `json:"Id"`
CapacityMiB int `json:"CapacityMiB"`
Description string `json:"Description"`
DisplayName string `json:"DisplayName"`
InterfaceType string `json:"InterfaceType"`
Identifiers []Identifiers `json:"Identifiers"`
LogicalDriveName string `json:"LogicalDriveName"`
LogicalDriveNumber int `json:"LogicalDriveNumber"`
Name string `json:"Name"`
Raid string `json:"Raid"`
RaidType string `json:"RAIDType"`
Status Status `json:"Status"`
StripeSizebytes int `json:"StripeSizebytes"`
VolumeUniqueIdentifier string `json:"VolumeUniqueIdentifier"`
}

// Identifiers is the element type of LogicalDriveMetrics.Identifiers (the
// "Identifiers" JSON array); only the DurableName key of each entry is decoded.
type Identifiers struct {
DurableName string `json:"DurableName"`
}

// Disk Drives
Expand Down Expand Up @@ -116,10 +123,14 @@ func (w *LocationWrapper) UnmarshalJSON(data []byte) error {
// /redfish/v1/Systems/X/SmartStorage/ArrayControllers/ for Logical and Physical Drives
// /redfish/v1/Chassis/X/Drives/ for NVMe Drive(s)
type GenericDrive struct {
Members []Members `json:"Members,omitempty"`
LinksUpper LinksUpper `json:"Links,omitempty"`
LinksLower LinksLower `json:"links,omitempty"`
MembersCount int `json:"Members@odata.count,omitempty"`
Members []Members `json:"Members,omitempty"`
LinksUpper LinksUpper `json:"Links,omitempty"`
LinksLower LinksLower `json:"links,omitempty"`
MembersCount int `json:"Members@odata.count,omitempty"`
DriveCount int `json:"Drives@odata.count,omitempty"`
StorageDrives []Link `json:"Drives,omitempty"`
Volumes Link `json:"Volumes,omitempty"`
Controllers Link `json:"Controllers,omitempty"`
}

type Members struct {
Expand Down
2 changes: 1 addition & 1 deletion oem/power.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ type PowerMetrics struct {
// PowerControl is the top level json object for metadata on power supply consumption
type PowerControl struct {
MemberID string `json:"MemberId"`
PowerCapacityWatts int `json:"PowerCapacityWatts,omitempty"`
PowerCapacityWatts interface{} `json:"PowerCapacityWatts,omitempty"`
PowerConsumedWatts interface{} `json:"PowerConsumedWatts"`
PowerMetrics PowerMetric `json:"PowerMetrics"`
}
Expand Down
Loading

0 comments on commit 3e60fc2

Please sign in to comment.