diff --git a/cmd/icingadb/main.go b/cmd/icingadb/main.go index de2cbdcfc..0da321320 100644 --- a/cmd/icingadb/main.go +++ b/cmd/icingadb/main.go @@ -167,10 +167,15 @@ func run() int { synctx, cancelSynctx := context.WithCancel(ha.Environment().NewContext(hactx)) g, synctx := errgroup.WithContext(synctx) // WaitGroups for initial synchronization. - // Runtime updates must wait for initial synchronization to complete. + // Runtime updates and history pipelines must wait for the initial synchronization to + // complete by draining the `initConfigSyncDone` channel. configInitSync := sync.WaitGroup{} stateInitSync := &sync.WaitGroup{} + // A channel used to notify both the runtime updates and history pipelines workers + // about the successful initial config sync completion including the SLA lifecycles. + initConfigSyncDone := make(chan struct{}) + // Clear the runtime update streams before starting anything else (rather than after the sync), // otherwise updates may be lost. runtimeConfigUpdateStreams, runtimeStateUpdateStreams, err := rt.ClearStreams(synctx) @@ -243,7 +248,18 @@ func run() int { }) g.Go(func() error { + // Unblock the runtime updates and history pipelines workers. + defer close(initConfigSyncDone) + + // Wait for the actual initial config sync to finish before syncing the SLA lifecycles. configInitSync.Wait() + + logger.Info("Syncing Host and Service initial SLA lifecycle") + + if err := icingadb.SyncCheckablesSlaLifecycle(synctx, db); err != nil { + return err + } + telemetry.OngoingSyncStartMilli.Store(0) syncEnd := time.Now() @@ -279,7 +295,8 @@ func run() int { }) g.Go(func() error { - configInitSync.Wait() + // Wait for the initial config sync including the SLA lifecycles to finish! + <-initConfigSyncDone if err := synctx.Err(); err != nil { return err @@ -304,7 +321,7 @@ func run() int { g.Go(func() error { // Wait for config and state sync to avoid putting additional pressure on the database. - configInitSync.Wait() + <-initConfigSyncDone stateInitSync.Wait() if err := synctx.Err(); err != nil { diff --git a/doc/10-Sla-Reporting.md b/doc/10-Sla-Reporting.md new file mode 100644 index 000000000..203256257 --- /dev/null +++ b/doc/10-Sla-Reporting.md @@ -0,0 +1,232 @@ +# SLA Reporting + +A Service Level Agreement (SLA) is a legally binding contract between a service provider and a customer. +Its purpose is to define the level of service that the supplier promises to deliver to the customer. + +Icinga DB is designed to automatically identify and record the most relevant checkable events exclusively in a separate +table. By default, these events are retained forever unless you have set the retention +[`sla-days` option](03-Configuration.md#retention). It is important to note that Icinga DB records the raw events in +the database without any interpretation. In order to generate and visualise SLA reports of specific hosts and services +based on the accumulated events over time, [Icinga Reporting](https://icinga.com/docs/icinga-reporting/latest/doc/02-Installation/) +is the optimal complement, facilitating comprehensive SLA report generation within a specific timeframe. + +## Technical Description + +!!! info + + This documentation provides a detailed technical explanation of how Icinga DB fulfils all the + necessary requirements for the generation of an accurate service level agreement (SLA). + +Icinga DB provides a built-in support for automatically storing the relevant events of your hosts and services without +a manual action. 
Generally, these events are every **hard** state change a particular checkable encounters and all the +downtimes scheduled for that checkable throughout its entire lifetime. It is important to note that the aforementioned +events are not analogous to those utilised by [Icinga DB Web](https://icinga.com/docs/icinga-db-web/latest/) for +visualising host and service states. + +!!! info + + [Acknowledgements](https://icinga.com/docs/icinga-2/latest/doc/08-advanced-topics/#acknowledgements) are special + events that mainly control the Icinga 2 notification logic behaviour, and acknowledging a host problem will not + have an impact on the SLA result for that host. + +In case of a hard state change of a monitored host or service, Icinga DB records the precise temporal occurrence of +that state change in milliseconds within the `sla_history_state` table. The following image serves as a visual +illustration of the relational and representational aspects of state change events. + +![SLA history state](images/sla_history_state.png) + +In contrast, two timestamps are retained for downtimes, one indicating the commencement of the downtime as +`downtime_start` and the other denoting its end time, designated as `downtime_end` within the `sla_history_downtime` +table. For the sake of completeness, the following image provides also a visual representation of the +`sla_history_downtime` table and its relations. + +![SLA history downtime](images/sla_history_downtime.png) + +In certain circumstances, namely when a checkable is created and subsequently never deleted, this approach has been +empirically [demonstrated](#computing-sla-ok-percent) to be sufficient. Nevertheless, in the case of a host being +deleted and then recreated a couple of days later, the generation of SLA reports in +[Icinga Reporting](https://icinga.com/docs/icinga-reporting/latest/doc/02-Installation/) for that host at +the end of the week may yield disparate results, depending on the host state prior to its deletion. + +In order to generate SLA reports with the greatest possible accuracy, we have decided to supplement the existing data +with information regarding the **creation** and **deletion** of hosts and services in a new `sla_lifecycle` table, +introduced in Icinga DB **1.3.0**. This new table has a composite primary key consisting of two columns: `id` and +`delete_time`. In this way, the `delete_time` is used to indicate whether a specific checkable has been deleted, +with a `0` value denoting non-deleted object. In a perfect world, we would use `NULL` instead of `0` but due to the +[primary key `constraints`](https://dev.mysql.com/doc/refman/8.4/en/create-table.html#create-table-indexes-keys), +this is not possible. The `id` column represents either the service or host ID for which that particular sla lifecycle +is being generated. + +![SLA lifecycle](images/sla_lifecycle.png) + +The upgrade script for `1.3.0` generates a `create_time` SLA lifecycle entry for all existing hosts and services. +However, since that script has no knowledge of the creation time of these existing objects, the timestamp for them is +produced in the following manner: As previously outlined, Icinga DB has the capability to store timestamps for both +hard state changes and downtimes since its first stable release. This enables the upgrade script to identify the least +event timestamp of a given checkable from the `sla_history_state` and `sla_history_downtime` tables. 
In cases where no +timestamps can be obtained from the aforementioned tables, it simply fallbacks to `now`, i.e. in such situations, +the creation time of the checkable in question is set to the current timestamp. + +### Events Processing + +It is noteworthy that Icinga DB does not record checkable **soft** state changes, in the regular `service_state` +and `host_state` tables. However, for the `state_history`, for instance, all state changes are retrieved from Redis® +and persisted to the database, irrespective of whether it is in High Availability (HA) or single-instance mode. +Each time Icinga DB processes a state change, it checks whether it is a hard state change and generates the +corresponding SLA state event in the `sla_history_state` table. Similarly, Icinga DB generates a corresponding SLA +history downtime (`sla_history_downtime`) each time it receives a downtime-triggered or ended/cancelled event from +Icinga 2. Notably, if Icinga DB is operating in HA mode, this process takes place in parallel, i.e. both instances +concurrently write their respective histories to the database, consequently, the aforementioned tables have also to +track the endpoint ids in their `endpoint_id` column. + +Though, it is not necessary to be concerned about the potential duplicate entries in the database, as long as the events +from both Icinga DB instances have the same timestamp, there will be no duplicates. For instance, downtimes have unique +and deterministic IDs, allowing the second instance to detect if that very same downtime has already been recorded by +the other instance. + +The checkable events of the types **created** and **deleted**, on the other hand, are special and represent the life +cycle of the chekables. These two events are always written by a single Icinga DB instance at a time (if in HA mode) to +the `sla_lifecycle` table, denoting the creation and deletion of a checkable in Icinga 2. Unfortunately, Icinga 2 lacks +the capability to accurately determine the deletion time of an object. It should also be noted that Icinga DB is also +incapable of identifying the precise timestamp of an object's deletion. Instead, it simply records the time at which +the deletion event for that particular checkable occurred and populates the `sla_lifecycle` table accordingly. +Consequently, if a checkable is deleted while Icinga DB is stopped or not in an operational state, the events that +Icinga DB would otherwise record once it is restarted will not reflect the actual deletion or creation time. + +#### Initial Config Sync + +Each time when either the Icinga DB or Icinga 2 service is reloaded, Icinga DB computes something called a config +`delta`. The config delta determines which objects need to be deleted from the database, to be updated, or are new and +need to be inserted. After successfully computing and dumping the config, Icinga DB then performs a simple SQL +`INSERT INTO sla_lifecycle` statements for all checkables that don't already have **created** SLA event with +`delete_time` set to `0` and sets their `create_time` to `now`. Additionally, it also updates the `delete_time` column +of each existing SLA lifecycle entries whose checkable IDs cannot be found in the `host/service` tables. 
+ +#### Runtime Updates + +When a host or service is created or deleted at runtime, either using +[Icinga Director](https://icinga.com/docs/icinga-director/latest/doc/01-Introduction/) or the plain `/v1/objects` API +endpoint, Icinga DB automatically generates an SLA lifecycle entry denoting the checkable creation or deletion time. +For all runtime *created* checkables, the SLA lifecycle entries are inserted using a slightly sophisticated SQL `INSERT` +statement with ignore on error mechanism, i.e. if it encounters a `duplicate key` error, it simply suppresses the error +and discards the query. In contrast, for runtime *deleted* checkables, it assumes that there is an SLA lifecycle +**created** event for these checkables, and uses a simple `UPDATE` statement setting their deletion time to now. +Consequently, should there be no corresponding **created** event for these checkables, the update statement becomes a +no-op, as the [initial config dump](#initial-config-sync) should have created the necessary entries for all existing +objects. + +### Computing SLA OK percent + +The following is a simplified explanation of the current (Icinga DB `1.3.0`) methodology behind the `get_sla_ok_percent` +SQL procedure, used to calculate the SLA OK percent. It is a fundamental characteristic of functional specifications +for Icinga Reporting to only generate reports covering a specific timeframe. Accordingly, the `get_sla_ok_percent` +SQL procedure necessitates the input of the start and end timeframes within which the SLA is to be calculated. + +First, it is necessary to identify the latest [`hard_state`](#hard-state-vs-previous-hard-state) of the service or host +that occurred at or prior to the timeline start date, and marking it as the initial one. In case the first query fails +to determine a `hard_state` entry, it proceeds to search for a [`previous_hard_state`](#hard-state-vs-previous-hard-state) +entry in the `sla_history_state` table that have been recorded after the start of the timeline. If this approach also +fails to retrieve the desired outcome, the regular non-historical `host_state` or `service_state` table is then +examined for the current state. Should this also produce no results, then it uses `OK` as its initial state. + +Next, we need to get the total time of the specified timeframe, expressed in milliseconds, for which we're going to +compute the SLA OK percent (`total_time = timeline_end - timeline_start`). + +Afterward, it traverses the entire state and downtime events within the provided timeframe, performing a series of +simple arithmetic operations. The complete algorithmic process is illustrated in the following pseudocode. + +``` +total_time := timeline_end - timeline_start + +// Mark the timeline start date as our last event time for now. +last_event_time := timeline_start + +// The problem time of a given host or service is initially set to zero. +problem_time := 0 + +// The previous_hard_state is determined dynamically as described above, however, +// for the purposes of this analysis, we'll just set it to 'OK'. +previous_hard_state := OK + +// Loop through all the state and downtime events within the provided timeframe ordered by their timestamp. +for event in (sla_history_state, sla_history_downtime) do + if (event.previous_hard_state is PENDING) then + // A PENDING state event indicates that the host or service in question has not yet had a check result that + // clearly identifies its state. 
Consequently, such events become irrelevant for the purposes of calculating + // the SLA and we must exclude the duration of that PENDING state from the total time. + total_time = total_time - (event.event_time - last_event_time) + else if (previous_hard_state is greater than OK/UP + AND previous_hard_state is not PENDING + AND checkable is not in DOWNTIME) then + // If the previous_hard_state is set to a non-OK state and the host or service in question was not in downtime, + // we consider that time slot to be problematic and add the duration to the problem time. + problem_time = problem_time + (event.event_time - last_event_time) + endif + + // Set the "last_event_time" to the timestamp of the event being currently processed. + last_event_time = event.event_time + + if (event.type is "state change event") then + // If the event being currently processed is a state change event, we mark its + // latest hard state as the previous one for the next iteration. + previous_hard_state = event.hard_state + endif +endloop +``` + +At this point, we now have computed the problem time of a particular host or service for a given time frame. The final +step is to determine the percentage of the remaining total time. In other words, we want to find out how much of the +total time is taken up by the problem time, so that we can obtain our final SLA OK percentage result. + +``` +sla_ok_percent := 100 * (total_time - problem_time) / total_time +``` + +The following example illustrates the practical implications of this concept. Suppose we have the following SLA events: +```json +{ + "state": [ + {"event_time": 1200, "hard_state": 2, "previous_hard_state": 0}, + {"event_time": 1500, "hard_state": 0, "previous_hard_state": 2} + ], + "downtime": [ + {"downtime_start": 1100, "downtime_end": 1300}, + {"downtime_start": 1400, "downtime_end": 1600} + ] +} +``` + +We would now like to calculate the SLA OK percent for the timeframe from `1000` to `2000`. + +``` +total_time := 2000 - 1000 +problem_time := 0 + +- 1000..1200 // in OK state (see the previous_hard_state of the first state event), so nothing to do. +- 1200..1300 // in Critical state, but was also set in downtime from (1100..1300), so here also nothing to do. +- 1300..1400 // still in Critical state and is not in downtime, so we count that time slot as a problem time. +problem_time = problem_time + 1400 - 1300 + +- 1400..1500 // still in critical state, but we have an active downtime during this period again, so nothing to do. +- 1500..2000 // in OK state + +// So, this indicates that our host was either not in a problem state or was set +// to a downtime for 90% of the period from 1000 to 2000. +sla_ok_percent := 100 * (total_time - problem_time) / total_time +``` + +## Appendix + +### Hard State vs. Previous Hard State + +The `hard_state` column denotes the most recent hard state of the host and service. +Conversely, the `previous_hard_state` column indicates the preceding hard state that was formerly stored in the +`hard_state` column prior to the host or service transitioning to a new hard state. Please refer to the tabular +representation below for a visual representation of this information. 
+ +| previous_hard_state | hard_state | +|-------------------------------|------------| +| PENDING (no check result yet) | OK | +| OK | Warning | +| Warning | Critical | +| Critical | OK | diff --git a/doc/images/sla_history_downtime.png b/doc/images/sla_history_downtime.png new file mode 100644 index 000000000..76e6929c4 Binary files /dev/null and b/doc/images/sla_history_downtime.png differ diff --git a/doc/images/sla_history_state.png b/doc/images/sla_history_state.png new file mode 100644 index 000000000..460dcb551 Binary files /dev/null and b/doc/images/sla_history_state.png differ diff --git a/doc/images/sla_lifecycle.png b/doc/images/sla_lifecycle.png new file mode 100644 index 000000000..e7c30231f Binary files /dev/null and b/doc/images/sla_lifecycle.png differ diff --git a/pkg/icingadb/runtime_updates.go b/pkg/icingadb/runtime_updates.go index 46d7bbe29..0a9bd1b99 100644 --- a/pkg/icingadb/runtime_updates.go +++ b/pkg/icingadb/runtime_updates.go @@ -10,6 +10,7 @@ import ( "github.com/icinga/icinga-go-library/redis" "github.com/icinga/icinga-go-library/strcase" "github.com/icinga/icinga-go-library/structify" + "github.com/icinga/icinga-go-library/types" "github.com/icinga/icingadb/pkg/common" "github.com/icinga/icingadb/pkg/contracts" v1 "github.com/icinga/icingadb/pkg/icingadb/v1" @@ -58,7 +59,7 @@ func (r *RuntimeUpdates) ClearStreams(ctx context.Context) (config, state redis. } // Sync synchronizes runtime update streams from s.redis to s.db and deletes the original data on success. -// Note that Sync must be only be called configuration synchronization has been completed. +// Note that Sync must only be called once configuration synchronization has been completed. // allowParallel allows synchronizing out of order (not FIFO). func (r *RuntimeUpdates) Sync( ctx context.Context, factoryFuncs []database.EntityFactoryFunc, streams redis.Streams, allowParallel bool, @@ -71,10 +72,22 @@ func (r *RuntimeUpdates) Sync( s := common.NewSyncSubject(factoryFunc) stat := getCounterForEntity(s.Entity()) + // Multiplexer channels used to distribute the Redis entities to several consumers. + upsertEntitiesMultiplexer := make(chan database.Entity, 1) + deleteIdsMultiplexer := make(chan any, 1) + updateMessages := make(chan redis.XMessage, r.redis.Options.XReadCount) upsertEntities := make(chan database.Entity, r.redis.Options.XReadCount) deleteIds := make(chan interface{}, r.redis.Options.XReadCount) + var insertSlaEntities chan database.Entity + var updateSlaEntities chan database.Entity + switch s.Entity().(type) { + case *v1.Host, *v1.Service: + insertSlaEntities = make(chan database.Entity, r.redis.Options.XReadCount) + updateSlaEntities = make(chan database.Entity, r.redis.Options.XReadCount) + } + var upsertedFifo chan database.Entity var deletedFifo chan interface{} var upsertCount int @@ -95,13 +108,47 @@ func (r *RuntimeUpdates) Sync( r.logger.Debugf("Syncing runtime updates of %s", s.Name()) g.Go(structifyStream( - ctx, updateMessages, upsertEntities, upsertedFifo, deleteIds, deletedFifo, + ctx, updateMessages, upsertEntitiesMultiplexer, upsertedFifo, deleteIdsMultiplexer, deletedFifo, structify.MakeMapStructifier( reflect.TypeOf(s.Entity()).Elem(), "json", contracts.SafeInit), )) + // This worker consumes the "upsert" event from Redis and redistributes the entities to the "upsertEntities" + // channel and for Host/Service entities also to the "insertSlaEntities" channel. 
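+		// A plain channel delivers each value to only one receiver, hence this explicit fan-out: both the regular
+		// config upsert worker and the SLA lifecycle insert worker need to see every entity.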
+ g.Go(func() error { + defer close(upsertEntities) + if insertSlaEntities != nil { + defer close(insertSlaEntities) + } + + for { + select { + case <-ctx.Done(): + return ctx.Err() + case entity, ok := <-upsertEntitiesMultiplexer: + if !ok { + return nil + } + + select { + case upsertEntities <- entity: + case <-ctx.Done(): + return ctx.Err() + } + + if insertSlaEntities != nil { + select { + case insertSlaEntities <- entity: + case <-ctx.Done(): + return ctx.Err() + } + } + } + } + }) + g.Go(func() error { var counter com.Counter defer periodic.Start(ctx, r.logger.Interval(), func(_ periodic.Tick) { @@ -125,6 +172,59 @@ func (r *RuntimeUpdates) Sync( ) }) + // Consumes from the "insertSlaEntities" channel and bulk inserts into the "sla_lifecycle" table. + g.Go(func() error { + var counter com.Counter + defer periodic.Start(ctx, r.logger.Interval(), func(_ periodic.Tick) { + if count := counter.Reset(); count > 0 { + r.logger.Infof("Inserted %d %s sla lifecycles", count, s.Name()) + } + }).Stop() + + stmt, _ := r.db.BuildInsertIgnoreStmt(v1.NewSlaLifecycle()) + return r.db.NamedBulkExec( + ctx, stmt, upsertCount, r.db.GetSemaphoreForTable(slaLifecycleTable), + CreateSlaLifecyclesFromCheckables(ctx, s.Entity(), g, insertSlaEntities, false), + com.NeverSplit[database.Entity], database.OnSuccessIncrement[database.Entity](&counter)) + }) + + // This worker consumes the "delete" event from Redis and redistributes the IDs to the "deleteIds" + // channel and for Host/Service entities also to the "updateSlaEntities" channel. + g.Go(func() error { + defer close(deleteIds) + if updateSlaEntities != nil { + defer close(updateSlaEntities) + } + + for { + select { + case <-ctx.Done(): + return ctx.Err() + case deleteId, ok := <-deleteIdsMultiplexer: + if !ok { + return nil + } + + select { + case deleteIds <- deleteId: + case <-ctx.Done(): + return ctx.Err() + } + + if updateSlaEntities != nil { + entity := factoryFunc() + entity.SetID(deleteId.(types.Binary)) + + select { + case updateSlaEntities <- entity: + case <-ctx.Done(): + return ctx.Err() + } + } + } + } + }) + g.Go(func() error { var counter com.Counter defer periodic.Start(ctx, r.logger.Interval(), func(_ periodic.Tick) { @@ -142,6 +242,23 @@ func (r *RuntimeUpdates) Sync( return r.db.BulkExec(ctx, r.db.BuildDeleteStmt(s.Entity()), deleteCount, sem, deleteIds, onSuccess...) }) + + // Consumes from the "updateSlaEntities" channel and updates the "delete_time" of each + // SLA lifecycle entry with "delete_time = 0" to now. + g.Go(func() error { + var counter com.Counter + defer periodic.Start(ctx, r.logger.Interval(), func(_ periodic.Tick) { + if count := counter.Reset(); count > 0 { + r.logger.Infof("Updated %d %s sla lifecycles", count, s.Name()) + } + }).Stop() + + stmt := fmt.Sprintf(`UPDATE %s SET delete_time = :delete_time WHERE "id" = :id AND "delete_time" = 0`, slaLifecycleTable) + return r.db.NamedBulkExec( + ctx, stmt, deleteCount, r.db.GetSemaphoreForTable(slaLifecycleTable), + CreateSlaLifecyclesFromCheckables(ctx, s.Entity(), g, updateSlaEntities, true), + com.NeverSplit[database.Entity], database.OnSuccessIncrement[database.Entity](&counter)) + }) } // customvar and customvar_flat sync. 
diff --git a/pkg/icingadb/sla_lifecycle.go b/pkg/icingadb/sla_lifecycle.go new file mode 100644 index 000000000..a27a25e3d --- /dev/null +++ b/pkg/icingadb/sla_lifecycle.go @@ -0,0 +1,123 @@ +package icingadb + +import ( + "context" + "fmt" + "github.com/icinga/icinga-go-library/backoff" + "github.com/icinga/icinga-go-library/database" + "github.com/icinga/icinga-go-library/retry" + "github.com/icinga/icinga-go-library/types" + v1 "github.com/icinga/icingadb/pkg/icingadb/v1" + "github.com/pkg/errors" + "golang.org/x/sync/errgroup" + "time" +) + +// slaLifecycleTable defines the table name of v1.SlaLifecycle type. +var slaLifecycleTable = database.TableName(v1.NewSlaLifecycle()) + +// CreateSlaLifecyclesFromCheckables transforms the given checkables to sla lifecycle struct +// and streams them into a returned channel. +func CreateSlaLifecyclesFromCheckables( + ctx context.Context, subject database.Entity, g *errgroup.Group, entities <-chan database.Entity, isDeleteEvent bool, +) <-chan database.Entity { + slaLifecycles := make(chan database.Entity, 1) + + g.Go(func() error { + defer close(slaLifecycles) + + env, ok := v1.EnvironmentFromContext(ctx) + if !ok { + return errors.New("can't get environment from context") + } + + for { + select { + case <-ctx.Done(): + return ctx.Err() + case checkable, ok := <-entities: + if !ok { + return nil + } + + sl := &v1.SlaLifecycle{ + EnvironmentMeta: v1.EnvironmentMeta{EnvironmentId: env.Id}, + CreateTime: types.UnixMilli(time.Now()), + DeleteTime: types.UnixMilli(time.Unix(0, 0)), + } + + if isDeleteEvent { + sl.DeleteTime = types.UnixMilli(time.Now()) + sl.CreateTime = types.UnixMilli(time.Unix(0, 0)) + } + + switch subject.(type) { + case *v1.Host: + sl.Id = checkable.ID().(types.Binary) + sl.HostId = sl.Id + case *v1.Service: + sl.Id = checkable.ID().(types.Binary) + sl.ServiceId = sl.Id + sl.HostId = checkable.(*v1.Service).HostId + default: + return errors.Errorf("sla lifecycle for type %T is not supported", checkable) + } + + select { + case slaLifecycles <- sl: + case <-ctx.Done(): + return ctx.Err() + } + } + } + }) + + return slaLifecycles +} + +// SyncCheckablesSlaLifecycle inserts one `create_time` sla lifecycle entry for each of the checkables from +// the `host` and `service` tables and updates the `delete_time` of each of the sla lifecycle entries whose +// host/service IDs cannot be found in the `host/service` tables. +// +// It's unlikely, but when a given Checkable doesn't already have a `create_time` entry in the database, the update +// query won't update anything. Likewise, the insert statements may also become a no-op if the Checkables already +// have a `create_time` entry with ´delete_time = 0`. +// +// This function retries any database errors for at least `5m` before giving up and failing with an error. 
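+//
+// On a retryable error the whole closure is re-executed with a fresh timestamp, which is safe: the inserts are
+// guarded by NOT EXISTS and the updates only touch rows whose delete_time is still 0.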
+func SyncCheckablesSlaLifecycle(ctx context.Context, db *database.DB) error { + hostInsertStmtFmt := ` +INSERT INTO %[1]s (id, environment_id, host_id, create_time) + SELECT id, environment_id, id, %[2]d AS create_time + FROM host WHERE NOT EXISTS(SELECT 1 FROM %[1]s WHERE service_id IS NULL AND delete_time = 0 AND host_id = host.id)` + + hostUpdateStmtFmt := ` +UPDATE %[1]s SET delete_time = %[2]d + WHERE service_id IS NULL AND delete_time = 0 AND NOT EXISTS(SELECT 1 FROM host WHERE host.id = %[1]s.id)` + + serviceInsertStmtFmt := ` +INSERT INTO %[1]s (id, environment_id, host_id, service_id, create_time) + SELECT id, environment_id, host_id, id, %[2]d AS create_time + FROM service WHERE NOT EXISTS(SELECT 1 FROM %[1]s WHERE delete_time = 0 AND service_id = service.id)` + + serviceUpdateStmtFmt := ` +UPDATE %[1]s SET delete_time = %[2]d + WHERE delete_time = 0 AND service_id IS NOT NULL AND NOT EXISTS(SELECT 1 FROM service WHERE service.id = %[1]s.id)` + + return retry.WithBackoff( + ctx, + func(context.Context) error { + eventTime := time.Now().UnixMilli() + for _, queryFmt := range []string{hostInsertStmtFmt, hostUpdateStmtFmt, serviceInsertStmtFmt, serviceUpdateStmtFmt} { + query := fmt.Sprintf(queryFmt, slaLifecycleTable, eventTime) + if _, err := db.ExecContext(ctx, query); err != nil { + return database.CantPerformQuery(err, query) + } + } + + return nil + }, + retry.Retryable, + backoff.NewExponentialWithJitter(1*time.Millisecond, 1*time.Second), + db.GetDefaultRetrySettings(), + ) +} diff --git a/pkg/icingadb/v1/sla_lifecycle.go b/pkg/icingadb/v1/sla_lifecycle.go new file mode 100644 index 000000000..3e8853a42 --- /dev/null +++ b/pkg/icingadb/v1/sla_lifecycle.go @@ -0,0 +1,24 @@ +package v1 + +import ( + "github.com/icinga/icinga-go-library/database" + "github.com/icinga/icinga-go-library/types" +) + +type SlaLifecycle struct { + EntityWithoutChecksum `json:",inline"` + EnvironmentMeta `json:",inline"` + HostId types.Binary `json:"host_id"` + ServiceId types.Binary `json:"service_id"` + CreateTime types.UnixMilli `json:"create_time"` + DeleteTime types.UnixMilli `json:"delete_time"` +} + +func NewSlaLifecycle() database.Entity { + return &SlaLifecycle{} +} + +// Assert interface compliance. +var ( + _ database.Entity = (*SlaLifecycle)(nil) +) diff --git a/schema/mysql/schema.sql b/schema/mysql/schema.sql index a1db99084..fb66638e0 100644 --- a/schema/mysql/schema.sql +++ b/schema/mysql/schema.sql @@ -1334,6 +1334,20 @@ CREATE TABLE sla_history_downtime ( INDEX idx_sla_history_downtime_env_downtime_end (environment_id, downtime_end) COMMENT 'Filter for sla history retention' ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_bin ROW_FORMAT=DYNAMIC; +CREATE TABLE sla_lifecycle ( + id binary(20) NOT NULL COMMENT 'host.id if service_id is NULL otherwise service.id', + environment_id binary(20) NOT NULL COMMENT 'environment.id', + host_id binary(20) NOT NULL COMMENT 'host.id (may reference already deleted hosts)', + service_id binary(20) DEFAULT NULL COMMENT 'service.id (may reference already deleted services)', + + -- These columns are nullable, but as we're using the delete_time to build the composed primary key, we have to set + -- this to `0` instead, since it's not allowed to use a nullable column as part of the primary key. 
+ create_time bigint unsigned NOT NULL DEFAULT 0 COMMENT 'unix timestamp the event occurred', + delete_time bigint unsigned NOT NULL DEFAULT 0 COMMENT 'unix timestamp the delete event occurred', + + PRIMARY KEY (id, delete_time) +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_bin ROW_FORMAT=DYNAMIC; + CREATE TABLE icingadb_schema ( id int unsigned NOT NULL AUTO_INCREMENT, version smallint unsigned NOT NULL, diff --git a/schema/mysql/upgrades/1.3.0.sql b/schema/mysql/upgrades/1.3.0.sql index 047903807..1cf613fd8 100644 --- a/schema/mysql/upgrades/1.3.0.sql +++ b/schema/mysql/upgrades/1.3.0.sql @@ -18,5 +18,91 @@ ALTER TABLE checkcommand_envvar MODIFY COLUMN envvar_key varchar(255) NOT NULL; ALTER TABLE eventcommand_envvar MODIFY COLUMN envvar_key varchar(255) NOT NULL; ALTER TABLE notificationcommand_envvar MODIFY COLUMN envvar_key varchar(255) NOT NULL; +CREATE TABLE sla_lifecycle ( + id binary(20) NOT NULL COMMENT 'host.id if service_id is NULL otherwise service.id', + environment_id binary(20) NOT NULL COMMENT 'environment.id', + host_id binary(20) NOT NULL COMMENT 'host.id (may reference already deleted hosts)', + service_id binary(20) DEFAULT NULL COMMENT 'service.id (may reference already deleted services)', + + -- These columns are nullable, but as we're using the delete_time to build the composed primary key, we have to set + -- this to `0` instead, since it's not allowed to use a nullable column as part of the primary key. + create_time bigint unsigned NOT NULL DEFAULT 0 COMMENT 'unix timestamp the event occurred', + delete_time bigint unsigned NOT NULL DEFAULT 0 COMMENT 'unix timestamp the delete event occurred', + + PRIMARY KEY (id, delete_time) +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_bin ROW_FORMAT=DYNAMIC; + +-- Insert a sla lifecycle create_time entry for all existing hosts with the LEAST timestamp found in either +-- the sla_history_state or sla_history_downtime table, otherwise fallback to the current Unix timestamp. +INSERT INTO sla_lifecycle (id, environment_id, host_id, create_time) + SELECT host.id, + host.environment_id, + host.id, + -- In MySQL/MariaDB, LEAST() returns NULL if either event_time or downtime_start is NULL, which is not + -- desirable for our use cases. So we need to work around this behaviour by nesting some COALESCE() calls. + COALESCE(LEAST(COALESCE(MIN(event_time), MIN(downtime_start)), COALESCE(MIN(downtime_start), MIN(event_time))), UNIX_TIMESTAMP() * 1000) AS create_time + FROM host + LEFT JOIN sla_history_state shs on host.id = shs.host_id AND shs.service_id IS NULL + LEFT JOIN sla_history_downtime shd on host.id = shd.host_id AND shd.service_id IS NULL + GROUP BY host.id, host.environment_id + ON DUPLICATE KEY UPDATE sla_lifecycle.id = sla_lifecycle.id; + +-- Insert a sla lifecycle deleted entry for all not existing hosts with the GREATEST timestamp +-- found in either the sla_history_state or sla_history_downtime table. 
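+-- As with the statement above, the trailing no-op ON DUPLICATE KEY UPDATE clause skips rows whose
+-- primary key already exists instead of aborting the INSERT.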
+INSERT INTO sla_lifecycle (id, environment_id, host_id, delete_time) + SELECT host_id AS id, + environment_id, + host_id, + MAX(event_time) AS delete_time + FROM (SELECT host_id, environment_id, MAX(event_time) AS event_time + FROM sla_history_state + WHERE service_id IS NULL AND NOT EXISTS(SELECT 1 FROM host WHERE id = host_id) + GROUP BY host_id, environment_id + UNION ALL + SELECT host_id, environment_id, MAX(downtime_end) AS event_time + FROM sla_history_downtime + WHERE service_id IS NULL AND NOT EXISTS(SELECT 1 FROM host WHERE id = host_id) + GROUP BY host_id, environment_id + ) AS deleted_hosts + GROUP BY host_id, environment_id HAVING MAX(event_time) IS NOT NULL + ON DUPLICATE KEY UPDATE sla_lifecycle.id = sla_lifecycle.id; + +-- Insert a sla lifecycle create_time entry for all existing services with the LEAST timestamp found in either +-- the sla_history_state or sla_history_downtime table, otherwise fallback to the current Unix timestamp. +INSERT INTO sla_lifecycle (id, environment_id, host_id, service_id, create_time) + SELECT service.id, + service.environment_id, + service.host_id, + service.id, + -- In MySQL/MariaDB, LEAST() returns NULL if either event_time or downtime_start is NULL, which is not + -- desirable for our use cases. So we need to work around this behaviour by nesting some COALESCE() calls. + COALESCE(LEAST(COALESCE(MIN(event_time), MIN(downtime_start)), COALESCE(MIN(downtime_start), MIN(event_time))), UNIX_TIMESTAMP() * 1000) AS create_time + FROM service + LEFT JOIN sla_history_state shs on service.id = shs.service_id + LEFT JOIN sla_history_downtime shd on service.id = shd.service_id + GROUP BY service.id, service.host_id, service.environment_id + ON DUPLICATE KEY UPDATE sla_lifecycle.id = sla_lifecycle.id; + +-- Insert a sla lifecycle deleted entry for all not existing hosts with the GREATEST timestamp +-- found in either the sla_history_state or sla_history_downtime table. 
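+-- This is the service counterpart of the deleted-hosts statement above.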
+INSERT INTO sla_lifecycle (id, environment_id, host_id, service_id, delete_time) + SELECT service_id AS id, + environment_id, + host_id, + service_id, + MAX(event_time) AS delete_time + FROM (SELECT service_id, environment_id, host_id, MAX(event_time) AS event_time + FROM sla_history_state + WHERE service_id IS NOT NULL AND NOT EXISTS(SELECT 1 FROM service WHERE id = service_id) + GROUP BY service_id, environment_id, host_id + UNION ALL + SELECT service_id, environment_id, host_id, MAX(downtime_end) AS event_time + FROM sla_history_downtime + WHERE service_id IS NOT NULL AND NOT EXISTS(SELECT 1 FROM service WHERE id = service_id) + GROUP BY service_id, environment_id, host_id + ) AS deleted_services + GROUP BY service_id, environment_id, host_id HAVING MAX(event_time) IS NOT NULL + ON DUPLICATE KEY UPDATE sla_lifecycle.id = sla_lifecycle.id; + INSERT INTO icingadb_schema (version, timestamp) VALUES (6, UNIX_TIMESTAMP() * 1000); diff --git a/schema/pgsql/schema.sql b/schema/pgsql/schema.sql index a68d41a1d..6def3e347 100644 --- a/schema/pgsql/schema.sql +++ b/schema/pgsql/schema.sql @@ -2171,6 +2171,27 @@ COMMENT ON COLUMN sla_history_downtime.downtime_id IS 'downtime.id (may referenc COMMENT ON COLUMN sla_history_downtime.downtime_start IS 'start time of the downtime'; COMMENT ON COLUMN sla_history_downtime.downtime_end IS 'end time of the downtime'; +CREATE TABLE sla_lifecycle ( + id bytea20 NOT NULL, + environment_id bytea20 NOT NULL, + host_id bytea20 NOT NULL, + service_id bytea20 DEFAULT NULL, + + -- These columns are nullable, but as we're using the delete_time to build the composed primary key, we have to set + -- this to `0` instead, since it's not allowed to use a nullable column as part of the primary key. + create_time biguint NOT NULL DEFAULT 0, + delete_time biguint NOT NULL DEFAULT 0, + + CONSTRAINT pk_sla_lifecycle PRIMARY KEY (id, delete_time) +); + +COMMENT ON COLUMN sla_lifecycle.id IS 'host.id if service_id is NULL otherwise service.id'; +COMMENT ON COLUMN sla_lifecycle.environment_id IS 'environment.id'; +COMMENT ON COLUMN sla_lifecycle.host_id IS 'host.id (may reference already deleted hosts)'; +COMMENT ON COLUMN sla_lifecycle.service_id IS 'service.id (may reference already deleted services)'; +COMMENT ON COLUMN sla_lifecycle.create_time IS 'unix timestamp the event occurred'; +COMMENT ON COLUMN sla_lifecycle.delete_time IS 'unix timestamp the delete event occurred'; + CREATE SEQUENCE icingadb_schema_id_seq; CREATE TABLE icingadb_schema ( diff --git a/schema/pgsql/upgrades/1.3.0.sql b/schema/pgsql/upgrades/1.3.0.sql index 80d75f5d3..5980b2885 100644 --- a/schema/pgsql/upgrades/1.3.0.sql +++ b/schema/pgsql/upgrades/1.3.0.sql @@ -15,5 +15,94 @@ ALTER TABLE checkcommand_envvar ALTER COLUMN envvar_key TYPE varchar(255); ALTER TABLE eventcommand_envvar ALTER COLUMN envvar_key TYPE varchar(255); ALTER TABLE notificationcommand_envvar ALTER COLUMN envvar_key TYPE varchar(255); +CREATE TABLE sla_lifecycle ( + id bytea20 NOT NULL, + environment_id bytea20 NOT NULL, + host_id bytea20 NOT NULL, + service_id bytea20 DEFAULT NULL, + + -- These columns are nullable, but as we're using the delete_time to build the composed primary key, we have to set + -- this to `0` instead, since it's not allowed to use a nullable column as part of the primary key. 
+ create_time biguint NOT NULL DEFAULT 0, + delete_time biguint NOT NULL DEFAULT 0, + + CONSTRAINT pk_sla_lifecycle PRIMARY KEY (id, delete_time) +); + +COMMENT ON COLUMN sla_lifecycle.id IS 'host.id if service_id is NULL otherwise service.id'; +COMMENT ON COLUMN sla_lifecycle.environment_id IS 'environment.id'; +COMMENT ON COLUMN sla_lifecycle.host_id IS 'host.id (may reference already deleted hosts)'; +COMMENT ON COLUMN sla_lifecycle.service_id IS 'service.id (may reference already deleted services)'; +COMMENT ON COLUMN sla_lifecycle.create_time IS 'unix timestamp the event occurred'; +COMMENT ON COLUMN sla_lifecycle.delete_time IS 'unix timestamp the delete event occurred'; + +-- Insert a sla lifecycle create_time entry for all existing hosts with the LEAST timestamp found in either +-- the sla_history_state or sla_history_downtime table, otherwise fallback to the current Unix timestamp. +INSERT INTO sla_lifecycle (id, environment_id, host_id, create_time) + SELECT host.id, + host.environment_id, + host.id, + COALESCE(LEAST(MIN(event_time), MIN(downtime_start)), EXTRACT(EPOCH FROM now()) * 1000) AS create_time + FROM host + LEFT JOIN sla_history_state shs on host.id = shs.host_id AND shs.service_id IS NULL + LEFT JOIN sla_history_downtime shd on host.id = shd.host_id AND shd.service_id IS NULL + GROUP BY host.id, host.environment_id + ON CONFLICT ON CONSTRAINT pk_sla_lifecycle DO NOTHING; + +-- Insert a sla lifecycle deleted entry for all not existing hosts with the GREATEST timestamp +-- found in either the sla_history_state or sla_history_downtime table. +INSERT INTO sla_lifecycle (id, environment_id, host_id, delete_time) + SELECT host_id AS id, + environment_id, + host_id, + MAX(event_time) AS delete_time + FROM (SELECT host_id, environment_id, MAX(event_time) AS event_time + FROM sla_history_state + WHERE service_id IS NULL AND NOT EXISTS(SELECT 1 FROM host WHERE id = host_id) + GROUP BY host_id, environment_id + UNION ALL + SELECT host_id, environment_id, MAX(downtime_end) AS event_time + FROM sla_history_downtime + WHERE service_id IS NULL AND NOT EXISTS(SELECT 1 FROM host WHERE id = host_id) + GROUP BY host_id, environment_id + ) AS deleted_hosts + GROUP BY host_id, environment_id HAVING MAX(event_time) IS NOT NULL + ON CONFLICT ON CONSTRAINT pk_sla_lifecycle DO NOTHING; + +-- Insert a sla lifecycle create_time entry for all existing services with the LEAST timestamp found in either +-- the sla_history_state or sla_history_downtime table, otherwise fallback to the current Unix timestamp. +INSERT INTO sla_lifecycle (id, environment_id, host_id, service_id, create_time) + SELECT service.id, + service.environment_id, + service.host_id, + service.id, + COALESCE(LEAST(MIN(event_time), MIN(downtime_start)), EXTRACT(EPOCH FROM now()) * 1000) AS create_time + FROM service + LEFT JOIN sla_history_state shs on service.id = shs.service_id + LEFT JOIN sla_history_downtime shd on service.id = shd.service_id + GROUP BY service.id, service.host_id, service.environment_id + ON CONFLICT ON CONSTRAINT pk_sla_lifecycle DO NOTHING; + +-- Insert a sla lifecycle deleted entry for all not existing hosts with the GREATEST timestamp +-- found in either the sla_history_state or sla_history_downtime table. 
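+-- This is the service counterpart of the deleted-hosts statement above.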
+INSERT INTO sla_lifecycle (id, environment_id, host_id, service_id, delete_time) + SELECT service_id AS id, + environment_id, + host_id, + service_id, + MAX(event_time) AS delete_time + FROM (SELECT service_id, environment_id, host_id, MAX(event_time) AS event_time + FROM sla_history_state + WHERE service_id IS NOT NULL AND NOT EXISTS(SELECT 1 FROM service WHERE id = service_id) + GROUP BY service_id, environment_id, host_id + UNION ALL + SELECT service_id, environment_id, host_id, MAX(downtime_end) AS event_time + FROM sla_history_downtime + WHERE service_id IS NOT NULL AND NOT EXISTS(SELECT 1 FROM service WHERE id = service_id) + GROUP BY service_id, environment_id, host_id + ) AS deleted_services + GROUP BY service_id, environment_id, host_id HAVING MAX(event_time) IS NOT NULL + ON CONFLICT ON CONSTRAINT pk_sla_lifecycle DO NOTHING; + INSERT INTO icingadb_schema (version, timestamp) VALUES (4, extract(epoch from now()) * 1000); diff --git a/tests/config_sync_delta_slalifecycle.conf b/tests/config_sync_delta_slalifecycle.conf new file mode 100644 index 000000000..e0c1e5ef4 --- /dev/null +++ b/tests/config_sync_delta_slalifecycle.conf @@ -0,0 +1,11 @@ +for (var index in range(5)) { + var hostName = "sla-lifecycle-host-" + index + object Host hostName { + check_command = "hostalive" + } + + object Service "sla-lifecycle-service" use (hostName) { + host_name = hostName + check_command = "dummy" + } +} diff --git a/tests/go.mod b/tests/go.mod index 981b4236c..f0000f5a7 100644 --- a/tests/go.mod +++ b/tests/go.mod @@ -6,13 +6,13 @@ require ( github.com/go-sql-driver/mysql v1.8.1 github.com/goccy/go-yaml v1.12.0 github.com/google/uuid v1.6.0 + github.com/icinga/icinga-go-library v0.3.0 github.com/icinga/icinga-testing v0.0.0-20240322142451-494ccd6d03e8 github.com/jmoiron/sqlx v1.4.0 github.com/lib/pq v1.10.9 github.com/redis/go-redis/v9 v9.7.0 github.com/stretchr/testify v1.9.0 go.uber.org/zap v1.27.0 - golang.org/x/exp v0.0.0-20221012112151-59b0eab1532e golang.org/x/sync v0.8.0 ) diff --git a/tests/go.sum b/tests/go.sum index 54a112ed1..aac658004 100644 --- a/tests/go.sum +++ b/tests/go.sum @@ -66,6 +66,8 @@ github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/grpc-ecosystem/grpc-gateway/v2 v2.20.0 h1:bkypFPDjIYGfCYD5mRBvpqxfYX1YCS1PXdKYWi8FsN0= github.com/grpc-ecosystem/grpc-gateway/v2 v2.20.0/go.mod h1:P+Lt/0by1T8bfcF3z737NnSbmxQAppXMRziHUxPOC8k= +github.com/icinga/icinga-go-library v0.3.0 h1:BeoomAiQC5RTRWCNqNkgbdTGxQ7ZFfkruR4HCSn5e0k= +github.com/icinga/icinga-go-library v0.3.0/go.mod h1:YN7XJN3W0FodD+j4kirO89zk2tgvanXWt1RMV8UgOLo= github.com/icinga/icinga-testing v0.0.0-20240322142451-494ccd6d03e8 h1:PI+39IY1BjN24JC3B6Jy0rhwm3hqC4SnQFxbZjXOaHk= github.com/icinga/icinga-testing v0.0.0-20240322142451-494ccd6d03e8/go.mod h1:xjNiwePgnSVKJWPG/iFG7pNOibU/OWp01Zdl08o+EeI= github.com/jmoiron/sqlx v1.4.0 h1:1PLqN7S1UYp5t4SrVVnt4nUVNemrDAtxlulVe+Qgm3o= @@ -171,8 +173,6 @@ golang.org/x/crypto v0.13.0/go.mod h1:y6Z2r+Rw4iayiXXAIxJIDAJ1zMW4yaTpebo8fPOliY golang.org/x/crypto v0.16.0/go.mod h1:gCAAfMLgwOJRpTjQ2zCCt2OcSfYMTeZVSRtQlPC7Nq4= golang.org/x/crypto v0.24.0 h1:mnl8DM0o513X8fdIkmyFE/5hTYxbwYOjDS/+rK6qpRI= golang.org/x/crypto v0.24.0/go.mod h1:Z1PMYSOR5nyMcyAVAIQSKCDwalqy85Aqn1x3Ws4L5DM= -golang.org/x/exp v0.0.0-20221012112151-59b0eab1532e h1:/SJUJZl3kz7J5GzAx5lgaKvqKGd4OfzshwDMr6YJCC4= -golang.org/x/exp v0.0.0-20221012112151-59b0eab1532e/go.mod 
h1:cyybsKvd6eL0RnXn6p/Grxp8F5bW7iYuBgsNCOHpMYE= golang.org/x/lint v0.0.0-20190930215403-16217165b5de/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= diff --git a/tests/object_sync_test.go b/tests/object_sync_test.go index 23d2c96a0..a6130b5f5 100644 --- a/tests/object_sync_test.go +++ b/tests/object_sync_test.go @@ -6,6 +6,7 @@ import ( "database/sql" _ "embed" "fmt" + "github.com/icinga/icinga-go-library/types" "github.com/icinga/icinga-testing/services" "github.com/icinga/icinga-testing/utils" "github.com/icinga/icinga-testing/utils/eventually" @@ -16,9 +17,9 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" "go.uber.org/zap" - "golang.org/x/exp/slices" "io" "reflect" + "slices" "sort" "strings" "testing" @@ -30,6 +31,9 @@ import ( var testSyncConfRaw string var testSyncConfTemplate = template.Must(template.New("testdata.conf").Parse(testSyncConfRaw)) +//go:embed config_sync_delta_slalifecycle.conf +var slaLifecycleConfigSync []byte + var usergroups = []string{ "testusergroup1", "testusergroup2", @@ -91,9 +95,9 @@ func TestObjectSync(t *testing.T) { require.NoError(t, err, "generate icinga2 notification config") } //logger.Sugar().Infof("config:\n\n%s\n\n", conf.String()) - i.WriteConfig("etc/icinga2/conf.d/testdata.conf", conf.Bytes()) + i.WriteConfig("etc/icinga2/conf.d/testdata.conf", slaLifecycleConfigSync) i.EnableIcingaDb(r) - i.Reload() + require.NoError(t, i.Reload(), "Icinga 2 reload") // Wait for Icinga 2 to signal a successful dump before starting // Icinga DB to ensure that we actually test the initial sync. @@ -104,15 +108,58 @@ func TestObjectSync(t *testing.T) { logger.Debug("starting icingadb") it.IcingaDbInstanceT(t, r, rdb) + // Wait some time to give Icinga DB a chance to finish syncing the 5 dummy hosts and services. + time.Sleep(2 * time.Second) + db, err := sqlx.Open(rdb.Driver(), rdb.DSN()) require.NoError(t, err, "connecting to SQL database shouldn't fail") t.Cleanup(func() { _ = db.Close() }) + dummyHostIdsMap := map[string]SlaLifecycle{} + dummyServiceIdsMap := map[string]SlaLifecycle{} + scanDummyObjFunc := func(typ string) { + var query string + if typ == "service" { + query = `SELECT service.name AS service_name, host.name as host_name, service.id AS service_id, host_id FROM service INNER JOIN host ON host.id=host_id` + } else { + query = `SELECT "name" AS host_name, id AS host_id FROM host` + } + + rows, err := db.Queryx(query) + assert.NoError(t, err, "select all dummy %s ids", typ) + defer rows.Close() + + for rows.Next() { + obj := &struct { + HostName string `db:"host_name"` + ServiceName string `db:"service_name"` + SlaLifecycle `db:",inline"` + }{ + SlaLifecycle: SlaLifecycle{CreateTime: types.UnixMilli(time.Now())}, + } + + require.NoError(t, rows.StructScan(obj), "scan dummy %s", typ) + + if typ == "host" { + dummyHostIdsMap[obj.HostName] = obj.SlaLifecycle + } else { + dummyServiceIdsMap[obj.HostName+"!"+obj.ServiceName] = obj.SlaLifecycle + } + } + } + + // Fetch the dummy host and service ids from the database before reloading Icinga 2 with the new config. + go scanDummyObjFunc("host") + go scanDummyObjFunc("service") + + // Write the regular conf bytes excluding the dummy bytes we wrote earlier and reload the Icinga 2 instance. 
+ i.WriteConfig("etc/icinga2/conf.d/testdata.conf", conf.Bytes()) + require.NoError(t, i.Reload(), "Icinga 2 reload") + t.Run("Host", func(t *testing.T) { t.Parallel() for _, host := range data.Hosts { - host := host t.Run("Verify-"+host.VariantInfoString(), func(t *testing.T) { t.Parallel() @@ -144,7 +191,6 @@ func TestObjectSync(t *testing.T) { t.Parallel() for _, service := range data.Services { - service := service t.Run("Verify-"+service.VariantInfoString(), func(t *testing.T) { t.Parallel() @@ -171,6 +217,49 @@ func TestObjectSync(t *testing.T) { } }) + t.Run("SlaLifeCycle", func(t *testing.T) { + t.Parallel() + + deleteTime := types.UnixMilli(time.Now()) + t.Run("Hosts", func(t *testing.T) { + t.Parallel() + + for i := 0; i < 5; i++ { + host := &Host{Name: "sla-lifecycle-host-" + fmt.Sprint(i)} + + t.Run("Verify-Host-"+fmt.Sprint(i), func(t *testing.T) { + t.Parallel() + + slinfo := dummyHostIdsMap[host.Name] + slinfo.DeleteTime = deleteTime + + eventually.Assert(t, func(t require.TestingT) { + verifySlaLifecycleRow(t, db, &slinfo, false) + }, 20*time.Second, 1*time.Second) + }) + } + }) + + t.Run("Services", func(t *testing.T) { + t.Parallel() + + for i := 0; i < 5; i++ { + service := &Service{Name: "sla-lifecycle-service", HostName: newString("sla-lifecycle-host-" + fmt.Sprint(i))} + + t.Run("Verify-Service-"+fmt.Sprint(i), func(t *testing.T) { + t.Parallel() + + slinfo := dummyServiceIdsMap[*service.HostName+"!"+service.Name] + slinfo.DeleteTime = deleteTime + + eventually.Assert(t, func(t require.TestingT) { + verifySlaLifecycleRow(t, db, &slinfo, false) + }, 20*time.Second, 1*time.Second) + }) + } + }) + }) + t.Run("HostGroup", func(t *testing.T) { t.Parallel() // TODO(jb): add tests @@ -324,8 +413,6 @@ func TestObjectSync(t *testing.T) { t.Parallel() for _, service := range makeTestSyncServices(t) { - service := service - t.Run("CreateAndDelete-"+service.VariantInfoString(), func(t *testing.T) { t.Parallel() @@ -418,6 +505,76 @@ func TestObjectSync(t *testing.T) { }) }) + t.Run("SlaLifeCycle", func(t *testing.T) { + t.Parallel() + + assertCheckableFunc := func(checkable any, objType string, objName string, host string, service string) { + client.CreateObject(t, objType, objName, map[string]any{ + "attrs": makeIcinga2ApiAttributes(checkable, false), + }) + + slinfo := &SlaLifecycle{CreateTime: types.UnixMilli(time.Now())} + eventually.Assert(t, func(t require.TestingT) { + // We can't join on the host/service tables, as the sla lifecycle entries may reference checkables + // that have already been deleted. So fetch the host/service id from DB before performing the actual test. + require.NoError(t, fetchCheckableId(db, slinfo, host, service)) + + verifySlaLifecycleRow(t, db, slinfo, false) + }, 20*time.Second, 1*time.Second) + + client.DeleteObject(t, objType, objName, false) + + slinfo.DeleteTime = types.UnixMilli(time.Now()) + eventually.Assert(t, func(t require.TestingT) { + verifySlaLifecycleRow(t, db, slinfo, false) + }, 20*time.Second, 1*time.Second) + + client.CreateObject(t, objType, objName, map[string]interface{}{ + "attrs": makeIcinga2ApiAttributes(checkable, false), + }) + + // We are recreating this checkable, so we only have to change the timestamps as the + // checkable id will remain the same. 
+ slinfo.CreateTime = types.UnixMilli(time.Now()) + slinfo.DeleteTime = types.UnixMilli(time.Time{}) + + eventually.Assert(t, func(t require.TestingT) { + verifySlaLifecycleRow(t, db, slinfo, true) + }, 20*time.Second, 1*time.Second) + + client.DeleteObject(t, objType, objName, false) + + slinfo.DeleteTime = types.UnixMilli(time.Now()) + eventually.Assert(t, func(t require.TestingT) { + verifySlaLifecycleRow(t, db, slinfo, true) + }, 20*time.Second, 1*time.Second) + } + + t.Run("Host", func(t *testing.T) { + t.Parallel() + + for hostId, host := range makeTestSyncHosts(t) { + t.Run("Verify-Host-"+fmt.Sprint(hostId), func(t *testing.T) { + t.Parallel() + + assertCheckableFunc(host, "hosts", host.Name, host.Name, "") + }) + } + }) + + t.Run("Service", func(t *testing.T) { + t.Parallel() + + for serviceId, service := range makeTestSyncServices(t) { + t.Run("Verify-Service-"+fmt.Sprint(serviceId), func(t *testing.T) { + t.Parallel() + + assertCheckableFunc(service, "services", *service.HostName+"!"+service.Name, *service.HostName, service.Name) + }) + } + }) + }) + t.Run("User", func(t *testing.T) { t.Parallel() @@ -1188,6 +1345,86 @@ func verifyIcingaDbRow(t require.TestingT, db *sqlx.DB, obj interface{}) { require.False(t, rows.Next(), "SQL query should return only one row: %s", query) } +// verifySlaLifecycleRow verifies the sla lifecycle entries matching the given host/service id. It checks the creation +// and deletion time of the specified checkable. When the provided checkable was recreated, it also additionally requires +// two sla lifecycle entries to exist that match the checkables id. +func verifySlaLifecycleRow(t require.TestingT, db *sqlx.DB, slinfo *SlaLifecycle, isRecreated bool) { + query := `SELECT "create_time", "delete_time" FROM "sla_lifecycle" WHERE "host_id" = ?` + args := []interface{}{slinfo.HostID} + if !slinfo.ServiceID.Valid() { + query += ` AND "service_id" IS NULL` + } else { + query += ` AND "service_id" = ?` + args = append(args, slinfo.ServiceID) + } + query += ` ORDER BY "create_time" ASC` + + var resultSet []SlaLifecycle + err := db.Select(&resultSet, db.Rebind(query), args...) 
+ require.NoError(t, err, "querying sla lifecycle should not fail: Query: %q", query) + + zerotimestamp := time.Unix(0, 0) + var result SlaLifecycle + + if isRecreated { + require.Len(t, resultSet, 2, "there should be two sla lifecycle entry") + + result = resultSet[1] + recreated := resultSet[0] + assert.NotEqual(t, zerotimestamp, recreated.CreateTime.Time()) + assert.NotEqual(t, zerotimestamp, recreated.DeleteTime.Time()) + + assert.Less(t, recreated.CreateTime.Time(), slinfo.CreateTime.Time()) + assert.Less(t, recreated.DeleteTime.Time(), slinfo.CreateTime.Time()) + + if !slinfo.DeleteTime.Time().IsZero() { + assert.Less(t, recreated.CreateTime.Time(), slinfo.DeleteTime.Time()) + assert.Less(t, recreated.DeleteTime.Time(), slinfo.DeleteTime.Time()) + } + } else { + require.Len(t, resultSet, 1, "there should be one sla lifecycle entry") + + result = resultSet[0] + } + + assert.NotEqual(t, zerotimestamp, result.CreateTime.Time()) + assert.WithinDuration(t, slinfo.CreateTime.Time(), result.CreateTime.Time(), time.Minute) + + if slinfo.DeleteTime.Time().IsZero() { + assert.Equal(t, zerotimestamp, result.DeleteTime.Time()) + } else { + assert.NotEqual(t, zerotimestamp, result.DeleteTime.Time()) + assert.Less(t, result.CreateTime.Time(), result.DeleteTime.Time()) + assert.WithinDuration(t, slinfo.DeleteTime.Time(), result.DeleteTime.Time(), time.Minute) + } +} + +// fetchCheckableId retrieves host/service id from the database matching the given host/service +// name and scans to the provided slinfo. Returns an error on any database failure. +func fetchCheckableId(db *sqlx.DB, slinfo *SlaLifecycle, host string, service string) error { + query := `SELECT "sla_lifecycle"."host_id", "sla_lifecycle"."service_id" + FROM "sla_lifecycle" INNER JOIN "host" ON "host"."id"="sla_lifecycle"."host_id"` + + where := ` WHERE "host"."name" = ?` + args := []interface{}{host} + if service == "" { + where += ` AND "service_id" IS NULL` + } else { + query += ` INNER JOIN "service" ON "service"."id"="sla_lifecycle"."service_id"` + where += ` AND "service"."name" = ?` + args = append(args, service) + } + + return db.QueryRowx(db.Rebind(query+where), args...).StructScan(slinfo) +} + +type SlaLifecycle struct { + CreateTime types.UnixMilli `db:"create_time"` + DeleteTime types.UnixMilli `db:"delete_time"` + HostID types.Binary `db:"host_id"` + ServiceID types.Binary `db:"service_id"` +} + // newString allocates a new *string and initializes it. This helper function exists as // there seems to be no way to achieve this within a single statement. func newString(s string) *string {