From e8c5d51e6c04cb9a61b5f4de33c9a629db2457e1 Mon Sep 17 00:00:00 2001 From: Rajesh Samala Date: Thu, 4 Jul 2024 23:58:54 +0530 Subject: [PATCH 01/19] changes for integration of otel (#64) * changes for integration of otel * changes for adding otel collector functional test * changes for incorporate suggestions adding functional tests * optimize the code related to cal logging * reading az values from envs --------- Co-authored-by: Rajesh S --- go.mod | 26 +- lib/config.go | 34 +- lib/main.go | 24 +- lib/statelog.go | 82 +++-- tests/unittest/otel_basic/main_test.go | 128 +++++++ .../otel_incorrect_endpoint/main_test.go | 117 ++++++ .../unittest/otel_with_skip_cal/main_test.go | 114 ++++++ tests/unittest/testutil/defs.go | 44 +++ tests/unittest/testutil/main.go | 7 + tests/unittest/testutil/setup.go | 102 +++++- utility/logger/otel/config/otelconfig.go | 64 ++++ utility/logger/otel/defs.go | 136 +++++++ utility/logger/otel/error_handler.go | 79 ++++ utility/logger/otel/logger.go | 308 ++++++++++++++++ utility/logger/otel/state_logger.go | 310 ++++++++++++++++ utility/logger/otel/test/mock_collector.go | 341 ++++++++++++++++++ utility/logger/otel/test/state_logger_test.go | 332 +++++++++++++++++ 17 files changed, 2204 insertions(+), 44 deletions(-) create mode 100644 tests/unittest/otel_basic/main_test.go create mode 100644 tests/unittest/otel_incorrect_endpoint/main_test.go create mode 100644 tests/unittest/otel_with_skip_cal/main_test.go create mode 100644 tests/unittest/testutil/defs.go create mode 100644 utility/logger/otel/config/otelconfig.go create mode 100644 utility/logger/otel/defs.go create mode 100644 utility/logger/otel/error_handler.go create mode 100644 utility/logger/otel/logger.go create mode 100644 utility/logger/otel/state_logger.go create mode 100644 utility/logger/otel/test/mock_collector.go create mode 100644 utility/logger/otel/test/state_logger_test.go diff --git a/go.mod b/go.mod index 372ef230..fbac4a03 100644 --- a/go.mod +++ b/go.mod @@ -1,14 +1,36 @@ module github.com/paypal/hera -go 1.18 +go 1.20 require ( github.com/go-sql-driver/mysql v1.7.1 github.com/godror/godror v0.26.3 github.com/lib/pq v1.10.3 + go.opentelemetry.io/otel v1.24.0 + go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.24.0 + go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp v1.24.0 + go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.24.0 + go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.24.0 + go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.24.0 + go.opentelemetry.io/otel/exporters/stdout/stdoutmetric v1.24.0 + go.opentelemetry.io/otel/metric v1.24.0 + go.opentelemetry.io/otel/sdk v1.24.0 + go.opentelemetry.io/otel/sdk/metric v1.24.0 + go.opentelemetry.io/proto/otlp v1.2.0 + google.golang.org/protobuf v1.34.1 ) require ( + github.com/cenkalti/backoff/v4 v4.3.0 // indirect github.com/go-logfmt/logfmt v0.5.0 // indirect - google.golang.org/protobuf v1.33.0 // indirect + github.com/go-logr/logr v1.4.1 // indirect + github.com/go-logr/stdr v1.2.2 // indirect + github.com/grpc-ecosystem/grpc-gateway/v2 v2.20.0 // indirect + go.opentelemetry.io/otel/trace v1.24.0 // indirect + golang.org/x/net v0.25.0 // indirect + golang.org/x/sys v0.20.0 // indirect + golang.org/x/text v0.15.0 // indirect + google.golang.org/genproto/googleapis/api v0.0.0-20240520151616-dc85e6b867a5 // indirect + google.golang.org/genproto/googleapis/rpc v0.0.0-20240515191416-fc5f0ca64291 // indirect + google.golang.org/grpc v1.64.0 // indirect ) diff 
--git a/lib/config.go b/lib/config.go index 4b8bdc66..6303c004 100644
--- a/lib/config.go
+++ b/lib/config.go
@@ -20,6 +20,7 @@ package lib
 import (
 	"errors"
 	"fmt"
+	otelconfig "github.com/paypal/hera/utility/logger/otel/config"
 	"os"
 	"path/filepath"
 	"strings"
@@ -29,7 +30,7 @@ import (
 	"github.com/paypal/hera/utility/logger"
 )
 
-//The Config contains all the static configuration
+// The Config contains all the static configuration
 type Config struct {
 	CertChainFile string
 	KeyFile       string // leave blank for no SSL
@@ -219,7 +220,7 @@ func parseMapStrStr(encoded string) map[string]string {
 }
 
 // InitConfig initializes the configuration, both the static configuration (from hera.txt) and the dynamic configuration
-func InitConfig() error {
+func InitConfig(poolName string) error {
 	currentDir, abserr := filepath.Abs(filepath.Dir(os.Args[0]))
 
 	if abserr != nil {
@@ -461,9 +462,38 @@ func InitConfig() error {
 		gAppConfig.MaxDesiredHealthyWorkerPct = 90
 	}
 
+	//Initialize OTEL configs
+	initializeOTELConfigs(cdb, poolName)
+	if logger.GetLogger().V(logger.Info) {
+		otelconfig.OTelConfigData.Dump()
+	}
 	return nil
 }
 
+// initializeOTELConfigs initializes the OTEL configuration from the static configuration
+func initializeOTELConfigs(cdb config.Config, poolName string) {
+	otelconfig.OTelConfigData = &otelconfig.OTelConfig{}
+	otelconfig.OTelConfigData.Enabled = cdb.GetOrDefaultBool("enable_otel", false)
+	otelconfig.OTelConfigData.SkipCalStateLog = cdb.GetOrDefaultBool("skip_cal_statelog", false)
+	otelconfig.OTelConfigData.MetricNamePrefix = cdb.GetOrDefaultString("otel_metric_prefix", "pp.occ")
+	otelconfig.OTelConfigData.Host = cdb.GetOrDefaultString("otel_agent_host", "localhost")
+	otelconfig.OTelConfigData.HttpPort = cdb.GetOrDefaultInt("otel_agent_http_port", 4318)
+	otelconfig.OTelConfigData.GRPCPort = cdb.GetOrDefaultInt("otel_agent_grpc_port", 4317)
+	otelconfig.OTelConfigData.UseOtelGRPC = cdb.GetOrDefaultBool("otel_agent_use_grpc", false)
+	otelconfig.OTelConfigData.MetricsURLPath = cdb.GetOrDefaultString("otel_agent_metrics_uri", "")
+	otelconfig.OTelConfigData.TraceURLPath = cdb.GetOrDefaultString("otel_agent_trace_uri", "")
+	otelconfig.OTelConfigData.PoolName = poolName
+	otelconfig.OTelConfigData.UseTls = cdb.GetOrDefaultBool("otel_use_tls", false)
+	otelconfig.OTelConfigData.TLSCertPath = cdb.GetOrDefaultString("otel_tls_cert_path", "")
+	otelconfig.OTelConfigData.ResolutionTimeInSec = cdb.GetOrDefaultInt("otel_resolution_time_in_sec", 1)
+	otelconfig.OTelConfigData.ExporterTimeout = cdb.GetOrDefaultInt("otel_exporter_time_in_sec", 30)
+	otelconfig.OTelConfigData.EnableRetry = cdb.GetOrDefaultBool("otel_enable_exporter_retry", false)
+	otelconfig.OTelConfigData.ResourceType = gAppConfig.StateLogPrefix
+	otelconfig.OTelConfigData.OTelErrorReportingInterval = cdb.GetOrDefaultInt("otel_error_reporting_interval_in_sec", 60)
+	otelconfig.SetOTelIngestToken(cdb.GetOrDefaultString("otel_ingest_token", ""))
+}
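+
+// As an illustration (not part of the original patch), a minimal hera.txt
+// snippet that exercises these settings; the key names are taken verbatim from
+// the GetOrDefault* calls above, and the values shown are only an example:
+//
+//	enable_otel=true
+//	otel_agent_host=localhost
+//	otel_agent_http_port=4318
+//	otel_resolution_time_in_sec=3
+//	skip_cal_statelog=false
+//
+// Any key that is omitted falls back to the default given in the
+// corresponding GetOrDefault* call.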
 
 // CheckOpsConfigChange checks if the ops config file needs to be reloaded and reloads it if necessary.
 // it is called every several seconds from a dedicated go-routine.
 func CheckOpsConfigChange() {
diff --git a/lib/main.go b/lib/main.go
index 7a6227fd..7040103b 100644
--- a/lib/main.go
+++ b/lib/main.go
@@ -18,8 +18,11 @@ package lib
 
 import (
+	"context"
 	"flag"
 	"fmt"
+	otellogger "github.com/paypal/hera/utility/logger/otel"
+	otelconfig "github.com/paypal/hera/utility/logger/otel/config"
 	"math/rand"
 	"os"
 	"os/signal"
@@ -50,7 +53,7 @@ func Run() {
 
 	rand.Seed(time.Now().Unix())
 
-	err := InitConfig()
+	err := InitConfig(*namePtr)
 	if err != nil {
 		if logger.GetLogger().V(logger.Alert) {
 			logger.GetLogger().Log(logger.Alert, "failed to initialize configuration:", err.Error())
@@ -106,7 +109,24 @@ func Run() {
 	caltxn.SetCorrelationID("runtxn")
 	caltxn.Completed()
 
-	GetStateLog().SetStartTime(time.Now())
+	//Initialize OTEL
+	if otelconfig.OTelConfigData.Enabled {
+		shutdownFunc, err := otellogger.Init(context.Background())
+		if err != nil {
+			logger.GetLogger().Log(logger.Alert, fmt.Sprintf("failed to initialize OTEL, err: %v", err))
+			evt := cal.NewCalEvent("OTEL_INIT", *namePtr, "2", fmt.Sprintf("error: %v", err))
+			evt.Completed()
+			if otelconfig.OTelConfigData.SkipCalStateLog {
+				logger.GetLogger().Log(logger.Alert, "OTEL initialization failed and it is the only state-log sink (skip_cal_statelog is enabled), so it is not safe to start the server")
+				FullShutdown()
+			}
+		}
+		GetStateLog().SetStartTime(time.Now())
+		defer otellogger.StopMetricCollection() //Stop sending metrics data
+		if shutdownFunc != nil {
+			defer shutdownFunc(context.Background()) //During exit from mux this will take care of OTEL provider clean-up
+		}
+	} else {
+		GetStateLog().SetStartTime(time.Now())
+	}
 
 	go func() {
 		sleep := time.Duration(GetConfig().ConfigReloadTimeMs)
diff --git a/lib/statelog.go b/lib/statelog.go
index 67a1e9ae..1c9ee2d1 100644
--- a/lib/statelog.go
+++ b/lib/statelog.go
@@ -21,6 +21,9 @@ import (
 	"bytes"
 	"errors"
 	"fmt"
+	otel_logger "github.com/paypal/hera/utility/logger/otel"
+	otelconfig "github.com/paypal/hera/utility/logger/otel/config"
+	"go.opentelemetry.io/otel"
 	"log"
 	"os"
 	"path/filepath"
@@ -71,11 +74,9 @@ type ConnStateInfo struct {
 	perStateCnt []int
 }
 
-//
 // StateLog is exposed as a singleton. all stateful resources are protected behind a
 // message channel that synchronizes incoming messages. user should not call any of
 // the internal functions that are not threadsafe.
-//
 type StateLog struct {
 	//
 	// array of maps for different workertypes with each value holding a two dimension
@@ -482,6 +483,10 @@ func (sl *StateLog) init() error {
 	//
 	// for each shard, initialize map
 	//
+	var totalWorkersCount int //Use this value to size the buffered channel for statelog metrics
 	for s := 0; s < sl.maxShardSize; s++ {
 		sl.mWorkerStates[s] = make(map[HeraWorkerType][][]*WorkerStateInfo, wtypeTotalCount)
 		sl.mConnStates[s] = make(map[HeraWorkerType][]*ConnStateInfo, wtypeTotalCount)
@@ -494,7 +499,7 @@ func (sl *StateLog) init() error {
 		for t := 0; t < int(wtypeTotalCount); t++ {
 			instCnt := workerpoolcfg[s][HeraWorkerType(t)].instCnt
 			workerCnt := workerpoolcfg[s][HeraWorkerType(t)].maxWorkerCnt
-
+			totalWorkersCount += workerCnt
 			sl.mWorkerStates[s][HeraWorkerType(t)] = make([][]*WorkerStateInfo, instCnt)
 			sl.mConnStates[s][HeraWorkerType(t)] = make([]*ConnStateInfo, instCnt)
 			sl.mTypeTitles[s][HeraWorkerType(t)] = make([]string, instCnt)
@@ -522,11 +527,10 @@ func (sl *StateLog) init() error {
 			}
 		}
 	}
-
 	//
 	// prepare horizontal (state) and vertical (workertype) titles.
// - var shardEnabled = (GetConfig().EnableSharding && (GetConfig().NumOfShards >= 1)) + var shardEnabled = GetConfig().EnableSharding && (GetConfig().NumOfShards >= 1) var buf bytes.Buffer buf.WriteString("-----------") for i := 0; i < (MaxWorkerState + MaxConnState - 1); i++ { @@ -565,6 +569,16 @@ func (sl *StateLog) init() error { sl.mEventChann = make(chan StateEvent, 3000) + if otelconfig.OTelConfigData.Enabled { + // Initialize statelog_metrics to send metrics information currently we are ignoring registration object returned from this call + stateStartErr := otel_logger.StartMetricsCollection(totalWorkersCount, + otel_logger.WithMetricProvider(otel.GetMeterProvider()), + otel_logger.WithAppName(otelconfig.OTelConfigData.PoolName)) + + if stateStartErr != nil { + logger.GetLogger().Log(logger.Alert, "failed to start metric collection agent for statelogs", stateStartErr) + } + } // // start periodical reporting // @@ -750,6 +764,14 @@ func (sl *StateLog) genReport() { if workerCnt == 0 { continue } + // Initialize statedata object + workerStatesData := otel_logger.WorkersStateData{ + ShardId: int(s), + WorkerType: int(t), + InstanceId: int(n), + StateData: make(map[string]int64), + } + // // count all request/response for all workers under the instance // @@ -790,35 +812,39 @@ func (sl *StateLog) genReport() { stateCnt[MaxWorkerState+c] = 0 } } - // - // write collection into calheartbeat(cased out) and log (oneline). - // - hb := cal.NewCalHeartBeat("STATE", sl.mTypeTitles[s][HeraWorkerType(t)][n], cal.TransOK, "") - for i := 0; i < (MaxWorkerState + MaxConnState - 1); i++ { - buf.WriteString(fmt.Sprintf("%6d", stateCnt[i])) - hb.AddDataInt(StateNames[i], int64(stateCnt[i])) - } - hb.AddDataInt("req", int64(reqCnt-sl.mLastReqCnt[s][HeraWorkerType(t)][n])) - hb.AddDataInt("resp", int64(respCnt-sl.mLastRspCnt[s][HeraWorkerType(t)][n])) - /* - buf.WriteString(fmt.Sprintf("%6d", totalConnections)) - if sl.HasActiveWorker() { - buf.WriteString(fmt.Sprintf("%6d", 1)) - } else { - buf.WriteString(fmt.Sprintf("%6d", 0)) + + //Send statelog data to OTEL statsdata channel + if otelconfig.OTelConfigData.Enabled { + for i := 0; i < (MaxWorkerState + MaxConnState - 1); i++ { + buf.WriteString(fmt.Sprintf("%6d", stateCnt[i])) + workerStatesData.StateData[StateNames[i]] = int64(stateCnt[i]) } - if sl.ProxyHasCapacity(GetConfig().BacklogLimit, GetConfig().ReadonlyBacklogLimit) { - buf.WriteString(fmt.Sprintf("%6d", 1)) - } else { - buf.WriteString(fmt.Sprintf("%6d", 0)) + //Adding req and response metrics to OTEL + workerStatesData.StateData["req"] = int64(reqCnt - sl.mLastReqCnt[s][HeraWorkerType(t)][n]) + workerStatesData.StateData["resp"] = int64(respCnt - sl.mLastRspCnt[s][HeraWorkerType(t)][n]) + otel_logger.AddDataPointToOTELStateDataChan(&workerStatesData) + } else { + for i := 0; i < (MaxWorkerState + MaxConnState - 1); i++ { + buf.WriteString(fmt.Sprintf("%6d", stateCnt[i])) } - */ - hb.Completed() + } + + if !otelconfig.OTelConfigData.Enabled || (otelconfig.OTelConfigData.Enabled && !otelconfig.OTelConfigData.SkipCalStateLog) { + // write collection into calheartbeat(cased out) and log (oneline). 
+				//If skip_cal_statelog is not enabled, also send the CAL heartbeat event; otherwise data goes only to the state-log file and the OTEL agent
+				hb := cal.NewCalHeartBeat("STATE", sl.mTypeTitles[s][HeraWorkerType(t)][n], cal.TransOK, "")
+				for i := 0; i < (MaxWorkerState + MaxConnState - 1); i++ {
+					hb.AddDataInt(StateNames[i], int64(stateCnt[i]))
+				}
+				hb.AddDataInt("req", int64(reqCnt-sl.mLastReqCnt[s][HeraWorkerType(t)][n]))
+				hb.AddDataInt("resp", int64(respCnt-sl.mLastRspCnt[s][HeraWorkerType(t)][n]))
+				hb.Completed()
+			}
 			sl.fileLogger.Println(getTime() + buf.String())
 			sl.mLastReqCnt[s][HeraWorkerType(t)][n] = reqCnt
 			sl.mLastRspCnt[s][HeraWorkerType(t)][n] = respCnt
 		} // instance
 	} // wtype
 	} // sharding
 }
diff --git a/tests/unittest/otel_basic/main_test.go b/tests/unittest/otel_basic/main_test.go
new file mode 100644
index 00000000..60a3b6b6
--- /dev/null
+++ b/tests/unittest/otel_basic/main_test.go
@@ -0,0 +1,128 @@
+package main
+
+import (
+	"context"
+	"database/sql"
+	"fmt"
+	"os"
+	"path/filepath"
+	"strings"
+	"testing"
+	"time"
+
+	"github.com/paypal/hera/tests/unittest/testutil"
+	"github.com/paypal/hera/utility/logger"
+)
+
+var mx testutil.Mux
+var tableName string
+
+func cfg() (map[string]string, map[string]string, testutil.WorkerType) {
+
+	appcfg := make(map[string]string)
+	// best to choose a "unique" port in case golang runs tests in parallel
+	appcfg["bind_port"] = "31002"
+	appcfg["log_level"] = "5"
+	appcfg["log_file"] = "hera.log"
+	appcfg["sharding_cfg_reload_interval"] = "0"
+	appcfg["rac_sql_interval"] = "0"
+	appcfg["child.executable"] = "mysqlworker"
+	appcfg["enable_otel"] = "true"
+	appcfg["otel_resolution_time_in_sec"] = "3"
+	opscfg := make(map[string]string)
+	opscfg["opscfg.default.server.max_connections"] = "3"
+	opscfg["opscfg.default.server.log_level"] = "5"
+	os.Setenv("AVAILABILITY_ZONE", "test-dev")
+	os.Setenv("ENVIRONMENT", "dev")
+	return appcfg, opscfg, testutil.MySQLWorker
+}
+
+func before() error {
+	tableName = os.Getenv("TABLE_NAME")
+	if tableName == "" {
+		tableName = "jdbc_hera_test"
+	}
+	if strings.HasPrefix(os.Getenv("TWO_TASK"), "tcp") {
+		// mysql
+		testutil.RunDML("create table jdbc_hera_test ( ID BIGINT, INT_VAL BIGINT, STR_VAL VARCHAR(500))")
+	}
+	return nil
+}
+
+func TestMain(m *testing.M) {
+	os.Exit(testutil.UtilMain(m, cfg, before))
+}
+
+func TestOTELMetricsBasic(t *testing.T) {
+	logger.GetLogger().Log(logger.Debug, "TestOTELMetricsBasic begin +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n")
+
+	shard := 0
+	db, err := sql.Open("heraloop", fmt.Sprintf("%d:0:0", shard))
+	if err != nil {
+		t.Fatal("Error starting Mux:", err)
+		return
+	}
+	db.SetMaxIdleConns(0)
+	defer db.Close()
+
+	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
+	// cleanup and insert one row in the table
+	conn, err := db.Conn(ctx)
+	if err != nil {
+		t.Fatalf("Error getting connection %s\n", err.Error())
+	}
+	tx, _ := conn.BeginTx(ctx, nil)
+	sqlTxt := "/*cmd*/delete from " + tableName
+	stmt, _ := tx.PrepareContext(ctx, sqlTxt)
+	_, err = stmt.Exec()
+	if err != nil {
+		t.Fatalf("Error preparing test (delete table) %s with %s ==== sql\n", err.Error(), sqlTxt)
+	}
+
+	stmt, _ = tx.PrepareContext(ctx, "/*cmd*/insert into "+tableName+" (id, int_val, str_val) VALUES(?, ?, ?)")
+	_, err = stmt.Exec(1, time.Now().Unix(), "val 1")
+	if err != nil {
+		t.Fatalf("Error preparing test (create row in table) %s\n", err.Error())
+	}
+	err = tx.Commit()
+	if err != nil {
+		t.Fatalf("Error commit %s\n",
err.Error())
+	}
+
+	stmt, _ = conn.PrepareContext(ctx, "/*cmd*/Select id, int_val from "+tableName+" where id=?")
+	rows, _ := stmt.Query(1)
+	if !rows.Next() {
+		t.Fatalf("Expected 1 row")
+	}
+
+	time.Sleep(10 * time.Second)
+	rows.Close()
+	stmt.Close()
+
+	cancel()
+	conn.Close()
+	//Read the OTEL log file for metrics validation
+	logFilePath := filepath.Join(testutil.GetOTELLogDirPath(), "otel_collector.log")
+	count := testutil.RegexCountFile("{\"key\":\"application\",\"value\":{\"stringValue\":\"hera-test\"}", logFilePath)
+	if count < 1 {
+		t.Fatalf("OTEL event should contain application as hera-test")
+	}
+	initCount := testutil.RegexCountFile("\"name\":\"pp.occ.init_connection.count\"", logFilePath)
+	if initCount < 1 {
+		t.Fatalf("OTEL event should contain metric name pp.occ.init_connection.count")
+	}
+	tagsCount := testutil.RegexCountFile("{\"key\":\"InstanceId\",\"value\":{\"intValue\":\"0\"}},{\"key\":\"ShardId\",\"value\":{\"intValue\":\"0\"}},{\"key\":\"WorkerType\",\"value\":{\"intValue\":\"0\"}",
+		logFilePath)
+	if tagsCount < 1 {
+		t.Fatalf("mandatory tags InstanceId, ShardId, WorkerType should be present")
+	}
+	azCount := testutil.RegexCountFile("{\"key\":\"az\",\"value\":{\"stringValue\":\"test-dev\"}", logFilePath)
+	if azCount < 1 {
+		t.Fatalf("az is configured as test-dev and its value should be present in the otel metric dimensions")
+	}
+	envCount := testutil.RegexCountFile("{\"key\":\"environment\",\"value\":{\"stringValue\":\"dev\"}", logFilePath)
+	if envCount < 1 {
+		t.Fatalf("environment is configured as dev and its value should be present in the otel metric dimensions")
+	}
+	logger.GetLogger().Log(logger.Debug, "TestOTELMetricsBasic done -------------------------------------------------------------")
+}
diff --git a/tests/unittest/otel_incorrect_endpoint/main_test.go b/tests/unittest/otel_incorrect_endpoint/main_test.go
new file mode 100644
index 00000000..bd96879f
--- /dev/null
+++ b/tests/unittest/otel_incorrect_endpoint/main_test.go
@@ -0,0 +1,117 @@
+package main
+
+import (
+	"context"
+	"database/sql"
+	"fmt"
+	"os"
+	"strings"
+	"testing"
+	"time"
+
+	"github.com/paypal/hera/tests/unittest/testutil"
+	"github.com/paypal/hera/utility/logger"
+)
+
+var mx testutil.Mux
+var tableName string
+
+func cfg() (map[string]string, map[string]string, testutil.WorkerType) {
+
+	appcfg := make(map[string]string)
+	// best to choose a "unique" port in case golang runs tests in parallel
+	appcfg["bind_port"] = "31002"
+	appcfg["log_level"] = "5"
+	appcfg["log_file"] = "hera.log"
+	appcfg["sharding_cfg_reload_interval"] = "0"
+	appcfg["rac_sql_interval"] = "0"
+	appcfg["child.executable"] = "mysqlworker"
+	appcfg["enable_otel"] = "true"
+	appcfg["otel_resolution_time_in_sec"] = "3"
+	appcfg["otel_agent_metrics_uri"] = "v2/metrics"
+	opscfg := make(map[string]string)
+	opscfg["opscfg.default.server.max_connections"] = "3"
+	opscfg["opscfg.default.server.log_level"] = "5"
+	os.Setenv("AVAILABILITY_ZONE", "test-dev")
+	os.Setenv("ENVIRONMENT", "dev")
+	return appcfg, opscfg, testutil.MySQLWorker
+}
+
+func before() error {
+	tableName = os.Getenv("TABLE_NAME")
+	if tableName == "" {
+		tableName = "jdbc_hera_test"
+	}
+	if strings.HasPrefix(os.Getenv("TWO_TASK"), "tcp") {
+		// mysql
+		testutil.RunDML("create table jdbc_hera_test ( ID BIGINT, INT_VAL BIGINT, STR_VAL VARCHAR(500))")
+	}
+	return nil
+}
+
+func TestMain(m *testing.M) {
+	os.Exit(testutil.UtilMain(m, cfg, before))
+}
+
+func TestOTELMetricsIncorrectEndPoint(t *testing.T) {
+	logger.GetLogger().Log(logger.Debug,
"TestOTELMetricsIncorrectEndPoint begin +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n") + + shard := 0 + db, err := sql.Open("heraloop", fmt.Sprintf("%d:0:0", shard)) + if err != nil { + t.Fatal("Error starting Mux:", err) + return + } + db.SetMaxIdleConns(0) + defer db.Close() + + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + // cleanup and insert one row in the table + conn, err := db.Conn(ctx) + if err != nil { + t.Fatalf("Error getting connection %s\n", err.Error()) + } + tx, _ := conn.BeginTx(ctx, nil) + sqlTxt := "/*cmd*/delete from " + tableName + stmt, _ := tx.PrepareContext(ctx, sqlTxt) + _, err = stmt.Exec() + if err != nil { + t.Fatalf("Error preparing test (delete table) %s with %s ==== sql\n", err.Error(), sqlTxt) + } + + stmt, _ = tx.PrepareContext(ctx, "/*cmd*/insert into "+tableName+" (id, int_val, str_val) VALUES(?, ?, ?)") + _, err = stmt.Exec(1, time.Now().Unix(), "val 1") + if err != nil { + t.Fatalf("Error preparing test (create row in table) %s\n", err.Error()) + } + err = tx.Commit() + if err != nil { + t.Fatalf("Error commit %s\n", err.Error()) + } + + stmt, _ = conn.PrepareContext(ctx, "/*cmd*/Select id, int_val from "+tableName+" where id=?") + rows, _ := stmt.Query(1) + if !rows.Next() { + t.Fatalf("Expected 1 row") + } + + time.Sleep(10 * time.Second) + rows.Close() + stmt.Close() + + publishingErrors := testutil.RegexCountFile("otel publishing error", "hera.log") + if publishingErrors < 2 { + t.Fatalf("otel publishing error should present in log because of in-correct OTEL port number") + } + + calPublishingErrors := testutil.RegexCountFile("otel publishing error", "cal.log") + if calPublishingErrors < 1 { + t.Fatalf("otel publishing error should present in CAL log because of in-correct OTEL port number") + } + if calPublishingErrors > 1 { + t.Fatalf("otel runtime errors logging interval is 60 secs, we should not see more than one error in CAL") + } + cancel() + conn.Close() + logger.GetLogger().Log(logger.Debug, "TestOTELMetricsIncorrectEndPoint done -------------------------------------------------------------") +} diff --git a/tests/unittest/otel_with_skip_cal/main_test.go b/tests/unittest/otel_with_skip_cal/main_test.go new file mode 100644 index 00000000..79b271c6 --- /dev/null +++ b/tests/unittest/otel_with_skip_cal/main_test.go @@ -0,0 +1,114 @@ +package main + +import ( + "context" + "database/sql" + "fmt" + "os" + "strings" + "testing" + "time" + + "github.com/paypal/hera/tests/unittest/testutil" + "github.com/paypal/hera/utility/logger" +) + +var mx testutil.Mux +var tableName string + +func cfg() (map[string]string, map[string]string, testutil.WorkerType) { + + appcfg := make(map[string]string) + // best to chose an "unique" port in case golang runs tests in paralel + appcfg["bind_port"] = "31002" + appcfg["log_level"] = "5" + appcfg["log_file"] = "hera.log" + appcfg["sharding_cfg_reload_interval"] = "0" + appcfg["rac_sql_interval"] = "0" + appcfg["child.executable"] = "mysqlworker" + appcfg["enable_otel"] = "true" + appcfg["otel_resolution_time_in_sec"] = "3" + appcfg["skip_cal_statelog"] = "true" + opscfg := make(map[string]string) + opscfg["opscfg.default.server.max_connections"] = "3" + opscfg["opscfg.default.server.log_level"] = "5" + os.Setenv("AVAILABILITY_ZONE", "test-dev") + os.Setenv("ENVIRONMENT", "dev") + return appcfg, opscfg, testutil.MySQLWorker +} + +func before() error { + tableName = os.Getenv("TABLE_NAME") + if tableName == "" { + tableName = "jdbc_hera_test" + } + if 
strings.HasPrefix(os.Getenv("TWO_TASK"), "tcp") { + // mysql + testutil.RunDML("create table jdbc_hera_test ( ID BIGINT, INT_VAL BIGINT, STR_VAL VARCHAR(500))") + } + return nil +} + +func TestMain(m *testing.M) { + os.Exit(testutil.UtilMain(m, cfg, before)) +} + +func TestOTELMetricsSkipCALEndPoint(t *testing.T) { + logger.GetLogger().Log(logger.Debug, "TestOTELMetricsSkipCALEndPoint begin +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n") + + shard := 0 + db, err := sql.Open("heraloop", fmt.Sprintf("%d:0:0", shard)) + if err != nil { + t.Fatal("Error starting Mux:", err) + return + } + db.SetMaxIdleConns(0) + defer db.Close() + + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + // cleanup and insert one row in the table + conn, err := db.Conn(ctx) + if err != nil { + t.Fatalf("Error getting connection %s\n", err.Error()) + } + tx, _ := conn.BeginTx(ctx, nil) + sqlTxt := "/*cmd*/delete from " + tableName + stmt, _ := tx.PrepareContext(ctx, sqlTxt) + _, err = stmt.Exec() + if err != nil { + t.Fatalf("Error preparing test (delete table) %s with %s ==== sql\n", err.Error(), sqlTxt) + } + + stmt, _ = tx.PrepareContext(ctx, "/*cmd*/insert into "+tableName+" (id, int_val, str_val) VALUES(?, ?, ?)") + _, err = stmt.Exec(1, time.Now().Unix(), "val 1") + if err != nil { + t.Fatalf("Error preparing test (create row in table) %s\n", err.Error()) + } + err = tx.Commit() + if err != nil { + t.Fatalf("Error commit %s\n", err.Error()) + } + + stmt, _ = conn.PrepareContext(ctx, "/*cmd*/Select id, int_val from "+tableName+" where id=?") + rows, _ := stmt.Query(1) + if !rows.Next() { + t.Fatalf("Expected 1 row") + } + + time.Sleep(10 * time.Second) + rows.Close() + stmt.Close() + + publishingErrors := testutil.RegexCountFile("otel publishing error", "hera.log") + if publishingErrors > 0 { + t.Fatalf("should not see otel publishing errors with correct end-points") + } + + stateLogs := testutil.RegexCountFile("STATE\thera\t0\tinit", "cal.log") + if stateLogs > 0 { + t.Fatalf("skip_cal_statelog enabled, so we should not see state logs in CAL logs.") + } + cancel() + conn.Close() + logger.GetLogger().Log(logger.Debug, "TestOTELMetricsSkipCALEndPoint done -------------------------------------------------------------") +} diff --git a/tests/unittest/testutil/defs.go b/tests/unittest/testutil/defs.go new file mode 100644 index 00000000..3d17f629 --- /dev/null +++ b/tests/unittest/testutil/defs.go @@ -0,0 +1,44 @@ +package testutil + +const otelConfigYamlData = `receivers: + otlp: + protocols: + grpc: + endpoint: "0.0.0.0:4317" + http: + endpoint: "0.0.0.0:4318" + +exporters: + logging: + loglevel: debug + file: + path: /var/log/otel/otel_collector.log + +service: + pipelines: + traces: + receivers: [otlp] + processors: [] + exporters: [logging, file] + + metrics: + receivers: [otlp] + processors: [] + exporters: [logging, file] +` + +const otelCollectorDockerDef = `version: '3.8' + +services: + otel_basic-collector: + container_name: otel_basic-collector + image: otel/opentelemetry-collector-contrib:latest + ports: + - "4317:4317" #gRPC port + - "4318:4318" #HTTP port + volumes: + - ./otel_config.yaml:/etc/otel/config.yaml + - ./otel_logs:/var/log/otel + + command: ["--config", "/etc/otel/config.yaml"] +` diff --git a/tests/unittest/testutil/main.go b/tests/unittest/testutil/main.go index 9096e1ce..3e08310e 100644 --- a/tests/unittest/testutil/main.go +++ b/tests/unittest/testutil/main.go @@ -20,12 +20,19 @@ func setup(cfg cfgFunc) error { if err != nil { return err } + if 
appcfg["enable_otel"] == "true" { + err = mx.StartOTelAgent() + } + if err != nil { + return err + } err = mx.StartServer() return err } func teardown() { mx.StopServer() + mx.StopOTelAgent() } func copyFile(src, dest string) error { diff --git a/tests/unittest/testutil/setup.go b/tests/unittest/testutil/setup.go index 795c78d0..bae0fc94 100644 --- a/tests/unittest/testutil/setup.go +++ b/tests/unittest/testutil/setup.go @@ -24,12 +24,20 @@ import ( type Mux interface { StartServer() error StopServer() + StartOTelAgent() error + StopOTelAgent() error } /** commons used by mux tests */ +const OTEL_AGENT_DOCKER_CONFIG_PATH = "docker_compose_otel_collector.yaml" +const OTEL_AGENT_CONFIG_FILE_PATH = "otel_config.yaml" +const OTEL_LOG_DIR = "otel_logs" + +var otelLogsDir string + type WorkerType int const ( @@ -43,7 +51,7 @@ type DBType int const ( Oracle DBType = iota MySQL - PostgreSQL + PostgreSQL ) type mux struct { @@ -94,7 +102,7 @@ func (m *mux) setupWorkdir() { func (m *mux) setupConfig() error { // opscfg - for k,v := range m.opscfg { + for k, v := range m.opscfg { m.appcfg[k] = v } if m.wType == MySQLWorker { @@ -149,7 +157,7 @@ func doBuildAndSymlink(binname string) { var err error _, err = os.Stat(binname) if err != nil { - binpath := os.Getenv("GOPATH")+"/bin/"+binname + binpath := os.Getenv("GOPATH") + "/bin/" + binname _, err = os.Stat(binpath) if err != nil { srcname := binname @@ -199,11 +207,11 @@ func MakeDB(dockerName string, dbName string, dbType DBType) (ip string) { os.Setenv("password", "1-testDb") waitLoop := 1 for { - err := DBDirect("select 1", "127.0.0.1", dbName/*"heratestdb"*/, MySQL) + err := DBDirect("select 1", "127.0.0.1", dbName /*"heratestdb"*/, MySQL) if err != nil { time.Sleep(1 * time.Second) logger.GetLogger().Log(logger.Debug, "waiting for mysql server to come up "+ipBuf.String()+" "+dockerName) - fmt.Printf("waiting for db to come up %d %s\n",waitLoop, err.Error()) + fmt.Printf("waiting for db to come up %d %s\n", waitLoop, err.Error()) waitLoop++ continue } else { @@ -211,7 +219,6 @@ func MakeDB(dockerName string, dbName string, dbType DBType) (ip string) { } } - q := "CREATE USER 'appuser'@'%' IDENTIFIED BY '1-testDb'" err := DBDirect(q, ipBuf.String(), dbName, MySQL) if err != nil { @@ -266,7 +273,7 @@ func MakeDB(dockerName string, dbName string, dbType DBType) (ip string) { os.Setenv("postgresql_ip", ipBuf.String()) return ipBuf.String() - } + } return "" } @@ -384,10 +391,10 @@ func (m *mux) StartServer() error { ip := MakeDB("postgres22", "heratestdb", PostgreSQL) os.Setenv("TWO_TASK", ip+"/heratestdb?connect_timeout=60&sslmode=disable") twoTask := os.Getenv("TWO_TASK") - os.Setenv ("TWO_TASK_0", twoTask) - os.Setenv ("TWO_TASK_1", twoTask) + os.Setenv("TWO_TASK_0", twoTask) + os.Setenv("TWO_TASK_1", twoTask) twoTask1 := os.Getenv("TWO_TASK") - fmt.Println ("TWO_TASK_1: ", twoTask1) + fmt.Println("TWO_TASK_1: ", twoTask1) } } @@ -452,3 +459,78 @@ func (m *mux) StopServer() { os.Chdir(m.origDir) logger.GetLogger().Log(logger.Info, "Exit StopServer time=", time.Now().Unix()) } + +func (m *mux) StartOTelAgent() error { + logger.GetLogger().Log(logger.Info, "starting OTEL agent locally at: ", time.Now()) + err := generateConfigData() + if err != nil { + return err + logger.GetLogger().Log(logger.Alert, "error while Generating configuration datax, error: ", err) + } + shutdownAgent := exec.Command("docker-compose", "-f", OTEL_AGENT_DOCKER_CONFIG_PATH, "down") + err = shutdownAgent.Run() + if err != nil { + logger.GetLogger().Log(logger.Alert, "error 
while stopping OTEL agent, error: ", err)
+	}
+	startCommand := exec.Command("docker-compose", "-f", OTEL_AGENT_DOCKER_CONFIG_PATH, "up", "-d")
+	err = startCommand.Run()
+	if err != nil {
+		logger.GetLogger().Log(logger.Alert, "failed to start OTEL agent, error: ", err)
+	}
+	return err
+}
+
+func (m *mux) StopOTelAgent() error {
+	logger.GetLogger().Log(logger.Info, "stopping OTEL agent locally at: ", time.Now())
+	shutdownAgent := exec.Command("docker-compose", "-f", OTEL_AGENT_DOCKER_CONFIG_PATH, "down")
+	err := shutdownAgent.Run()
+
+	if err != nil {
+		logger.GetLogger().Log(logger.Alert, "error while stopping OTEL agent, error: ", err)
+	}
+	return err
+}
+
+func generateConfigData() error {
+	workingDir, _ := os.Getwd()
+	otelLogsDir = filepath.Join(workingDir, OTEL_LOG_DIR)
+	_, err := os.Stat(otelLogsDir)
+	if !os.IsNotExist(err) {
+		os.RemoveAll(otelLogsDir)
+	}
+	err = os.MkdirAll(otelLogsDir, 0777)
+	if err != nil {
+		return err
+	}
+	configFilePath := filepath.Join(workingDir, OTEL_AGENT_CONFIG_FILE_PATH)
+	_, err = os.Stat(configFilePath)
+	if os.IsNotExist(err) {
+		configFile, err := os.OpenFile(configFilePath, os.O_CREATE|os.O_RDWR, 0644)
+		if err != nil {
+			return err
+		}
+		_, err = configFile.WriteString(otelConfigYamlData)
+		if err != nil {
+			return err
+		}
+		configFile.Close()
+	}
+	dockerDefinitionFile := filepath.Join(workingDir, OTEL_AGENT_DOCKER_CONFIG_PATH)
+	_, err = os.Stat(dockerDefinitionFile)
+	if os.IsNotExist(err) {
+		dockerFile, err := os.OpenFile(dockerDefinitionFile, os.O_CREATE|os.O_RDWR, 0777)
+		if err != nil {
+			return err
+		}
+		_, err = dockerFile.WriteString(otelCollectorDockerDef)
+		if err != nil {
+			return err
+		}
+		dockerFile.Close()
+	}
+	return nil
+}
+
+func GetOTELLogDirPath() string {
+	return otelLogsDir
+}
diff --git a/utility/logger/otel/config/otelconfig.go b/utility/logger/otel/config/otelconfig.go
new file mode 100644
index 00000000..5f2b0a0c
--- /dev/null
+++ b/utility/logger/otel/config/otelconfig.go
@@ -0,0 +1,64 @@
+package config
+
+import (
+	"errors"
+	"fmt"
+	"github.com/paypal/hera/utility/logger"
+	"sync/atomic"
+)
+
+var OTelConfigData *OTelConfig
+var OTelIngestTokenData atomic.Value
+
+// OTelConfig represents the configuration related to the OTEL collector used to export data
+type OTelConfig struct {
+	MetricNamePrefix           string
+	Host                       string
+	HttpPort                   int
+	GRPCPort                   int
+	MetricsURLPath             string
+	TraceURLPath               string
+	PoolName                   string
+	ResourceType               string
+	Enabled                    bool
+	SkipCalStateLog            bool
+	ResolutionTimeInSec        int
+	ExporterTimeout            int
+	UseTls                     bool
+	TLSCertPath                string
+	UseOtelGRPC                bool
+	OTelErrorReportingInterval int
+	EnableRetry                bool
+}
+
+// Validation function to check whether the pool name is configured
+func (config *OTelConfig) validate() error {
+	if len(config.PoolName) <= 0 {
+		logger.GetLogger().Log(logger.Alert, "OTEL configuration validation failed, PoolName not configured")
+		return errors.New("OTEL configuration validation failed, PoolName not configured")
+	}
+	return nil
+}
+
+func (config *OTelConfig) Dump() {
+	logger.GetLogger().Log(logger.Info, fmt.Sprintf("Host : %s", config.Host))
+	logger.GetLogger().Log(logger.Info, fmt.Sprintf("Http Port: %d", config.HttpPort))
+	logger.GetLogger().Log(logger.Info, fmt.Sprintf("GRPC Port: %d", config.GRPCPort))
+	logger.GetLogger().Log(logger.Info, fmt.Sprintf("Poolname: %s", config.PoolName))
+	logger.GetLogger().Log(logger.Info, fmt.Sprintf("ResolutionTimeInSec: %d", config.ResolutionTimeInSec))
+	logger.GetLogger().Log(logger.Info, fmt.Sprintf("UseTls: %t",
config.UseTls)) + logger.GetLogger().Log(logger.Info, fmt.Sprintf("UrlPath: %s", config.MetricsURLPath)) + logger.GetLogger().Log(logger.Info, fmt.Sprintf("UseOtelGRPC: %t", config.UseOtelGRPC)) +} + +func (config *OTelConfig) PopulateMetricNamePrefix(metricName string) string { + return fmt.Sprintf("%s.%s", config.MetricNamePrefix, metricName) +} + +func SetOTelIngestToken(value string) { + OTelIngestTokenData.Store(value) +} + +func GetOTelIngestToken() string { + return OTelIngestTokenData.Load().(string) +} diff --git a/utility/logger/otel/defs.go b/utility/logger/otel/defs.go new file mode 100644 index 00000000..5a7b49eb --- /dev/null +++ b/utility/logger/otel/defs.go @@ -0,0 +1,136 @@ +package otel + +import ( + "go.opentelemetry.io/otel/metric" + "sync" +) + +// "init", "acpt", "wait", "busy", "schd", "fnsh", "quce", "asgn", "idle", "bklg", "strd", "cls" +// Following Metric Names will get instrumented as part of StateLogMetrics +const ( + // Worker States + InitConnCountMetric = "init_connection.count" + AccptConnCountMetric = "accept_connection.count" + WaitConnCountMetric = "wait_connection.count" + BusyConnCountMetric = "busy_connection.count" + ScheduledConnCountMetric = "scheduled_connection.count" + FinishedConnCountMetric = "finished_connection.count" + QuiescedConnCountMetric = "quiesced_connection.count" + + // Connection States + AssignedConnCountMetric = "assigned_connection.count" + IdleConnCountMetric = "idle_connection.count" + BacklogConnCountMetric = "backlog_connection.count" + StrdConnCountMetric = "stranded_connection.count" +) + +const ( + Target = string("target") + Endpoint = string("target_ip_port") + TLS_version = string("tls_version") + Application = string("Application") + ShardId = string("ShardId") + WorkerType = string("WorkerType") + InstanceId = string("InstanceId") + Datapoints = string("datapoints") +) + +const OtelInstrumentationVersion string = "v1.0" + +// DEFAULT_OTEL_COLLECTOR_PROTOCOL default OTEL configurations point to QA collector +const DEFAULT_OTEL_COLLECTOR_PROTOCOL string = "grpc" +const DEFAULT_OTEL_COLLECTOR__IP string = "0.0.0.0" +const DEFAULT_GRPC_OTEL_COLLECTOR_PORT string = "4317" +const DEFAULT_HTTP_OTEL_COLLECTOR_PORT string = "4318" +const COLLECTOR_POLLING_INTERVAL_SECONDS int32 = 5 + +const StateLogMeterName = "occ-statelog-data" + +// LoggingOTELPublishingInterval This controls how frequently log OTEL publishing error +const LoggingOTELPublishingInterval = 15 + +//****************************** variables *************************** + +type Tags struct { + TagName string + TagValue string +} + +type WorkersStateData struct { + ShardId int + WorkerType int + InstanceId int + StateData map[string]int64 +} + +type ( + ServerType int +) + +// StateData Represents stats by a worker +type StateData struct { + Name string + Value float64 + Dimensions metric.MeasurementOption +} + +type DataPoint struct { + attr metric.MeasurementOption + data int64 +} + +// StateLogMetrics state_log_metrics reports workers states +type StateLogMetrics struct { + + //Statelog metrics configuration data + metricsConfig stateLogMetricsConfig + + meter metric.Meter + + //Channel to receive statelog data + mStateDataChan chan *WorkersStateData + + //Channel to close sending data + doneCh chan struct{} + + stateLock sync.Mutex + + registration metric.Registration + + initState metric.Int64ObservableGauge + acptState metric.Int64ObservableGauge + waitState metric.Int64ObservableGauge + busyState metric.Int64ObservableGauge + schdState 
metric.Int64ObservableGauge
+	fnshState metric.Int64ObservableGauge
+	quceState metric.Int64ObservableGauge
+	asgnState metric.Int64ObservableGauge
+	idleState metric.Int64ObservableGauge
+	bklgState metric.Int64ObservableGauge
+	strdState metric.Int64ObservableGauge
+}
+
+// stateLogMetricsConfig holds the configuration for state-log metrics: the meter
+// provider and the application (pool) name used when reporting.
+type stateLogMetricsConfig struct {
+	// MeterProvider sets the metric.MeterProvider. If nil, the global
+	// Provider will be used.
+	MeterProvider metric.MeterProvider
+	appName       string
+}
+
+// MetricProviderOption defines the configuration for the metric provider option
+type MetricProviderOption struct {
+	metric.MeterProvider
+}
+
+// StateLogOption defines configuration parameters for the statelog metrics agent
+type StateLogOption interface {
+	apply(*stateLogMetricsConfig)
+}
+
+// AppNameOption Define Option for OCCName
+type AppNameOption string
+
+// Headers
+const IngestTokenHeader = "X-Sf-Token"
diff --git a/utility/logger/otel/error_handler.go b/utility/logger/otel/error_handler.go
new file mode 100644
index 00000000..440e9341
--- /dev/null
+++ b/utility/logger/otel/error_handler.go
@@ -0,0 +1,79 @@
+package otel
+
+import (
+	"fmt"
+	"github.com/paypal/hera/cal"
+	"github.com/paypal/hera/utility/logger"
+	"reflect"
+	"sync"
+	"sync/atomic"
+	"time"
+)
+
+type OTelErrorHandler struct{}
+
+type OTelErrorData struct {
+	err          error
+	occurredTime int64
+}
+
+var (
+	oTelErrorLoggingLock sync.Mutex
+	errorTicker          *time.Ticker
+	gErrorDataMap        atomic.Value
+	logTickerInitialized atomic.Bool
+)
+
+// Handle asynchronously handles runtime errors raised while publishing data to the OTEL agent.
+func (handler OTelErrorHandler) Handle(err error) {
+	if err == nil {
+		return
+	}
+	logger.GetLogger().Log(logger.Warning, fmt.Sprintf("otel publishing error %v", err))
+	oTelErrorLoggingLock.Lock()
+	defer oTelErrorLoggingLock.Unlock()
+	errorDataMapVal := gErrorDataMap.Load()
+	errorDataMap := errorDataMapVal.(map[string]*OTelErrorData)
+	if errorDataMap == nil {
+		errorDataMap = make(map[string]*OTelErrorData)
+	}
+	errorDataMap[reflect.TypeOf(err).String()] = &OTelErrorData{err: err, occurredTime: time.Now().Unix()}
+	if !logTickerInitialized.Load() {
+		handler.logOTelErrorCalEvent(errorDataMap)
+		errorDataMap = make(map[string]*OTelErrorData) //Reinitialize the map after processing it.
+		gErrorDataMap.Store(errorDataMap)
+		logTickerInitialized.Store(true)
+	} else {
+		gErrorDataMap.Store(errorDataMap)
+	}
+}
+
+// processOTelErrorsMap periodically (every otel_error_reporting_interval_in_sec seconds, 60 by default)
+// logs a CAL event in case of any issues with OTEL data publishing
+func (handler OTelErrorHandler) processOTelErrorsMap() {
+	go func() {
+		for {
+			select {
+			case <-errorTicker.C:
+				oTelErrorLoggingLock.Lock()
+				errorDataMapVal := gErrorDataMap.Load()
+				errorDataMap := errorDataMapVal.(map[string]*OTelErrorData)
+				if errorDataMap != nil && len(errorDataMap) > 0 {
+					handler.logOTelErrorCalEvent(errorDataMap)
+					errorDataMap = make(map[string]*OTelErrorData) //Reinitialize the map after processing it.
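+					//Illustrative timeline, assuming the default 60s reporting interval
+					//(otel_error_reporting_interval_in_sec): the very first export failure is
+					//logged to CAL immediately by Handle; subsequent failures of the same error
+					//type only overwrite their map entry and surface as a single CAL event on
+					//the next tick, so CAL sees at most one event per error type per interval.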
+					gErrorDataMap.Store(errorDataMap)
+				}
+				oTelErrorLoggingLock.Unlock()
+			}
+		}
+	}()
+}
+
+// logOTelErrorCalEvent takes care of logging the accumulated OTEL errors as CAL events
+func (handler OTelErrorHandler) logOTelErrorCalEvent(errorDataMap map[string]*OTelErrorData) {
+	for _, errorData := range errorDataMap {
+		event := cal.NewCalEvent("OTEL", "CONNECTION", "2", fmt.Sprintf("%v", errorData.err))
+		event.AddDataInt("occurredTime", errorData.occurredTime)
+		event.AddDataInt("loggedTime", time.Now().Unix())
+		event.Completed()
+	}
+}
diff --git a/utility/logger/otel/logger.go b/utility/logger/otel/logger.go
new file mode 100644
index 00000000..4187b320
--- /dev/null
+++ b/utility/logger/otel/logger.go
@@ -0,0 +1,308 @@
+package otel
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"github.com/paypal/hera/utility/logger"
+	"github.com/paypal/hera/utility/logger/otel/config"
+	"go.opentelemetry.io/otel"
+	"go.opentelemetry.io/otel/attribute"
+	"go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc"
+	"go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp"
+	"go.opentelemetry.io/otel/exporters/otlp/otlptrace"
+	"go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc"
+	"go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp"
+	"go.opentelemetry.io/otel/sdk/metric"
+	"go.opentelemetry.io/otel/sdk/metric/metricdata"
+	"go.opentelemetry.io/otel/sdk/resource"
+	"go.opentelemetry.io/otel/sdk/trace"
+	"os"
+	"sync"
+	"time"
+)
+
+var oTelInitializeOnce sync.Once
+
+// Init takes care of initializing the OTEL SDK once during startup
+func Init(ctx context.Context) (shutdown func(ctx context.Context) error, err error) {
+	oTelInitializeOnce.Do(func() {
+		shutdown, err = initializeOTelSDK(ctx)
+	})
+	return shutdown, err
+}
+
+// initializeOTelSDK bootstraps the OTEL SDK pipeline initialization
+func initializeOTelSDK(ctx context.Context) (shutdown func(ctx context.Context) error, err error) {
+	var shutdownFuncs []func(context.Context) error
+	//shutdown calls the cleanup functions registered via shutdownFuncs;
+	//the errors from those calls are joined
+	shutdown = func(ctx context.Context) error {
+		var localErr error
+		for _, fn := range shutdownFuncs {
+			if fnErr := fn(ctx); fnErr != nil {
+				localErr = errors.Join(localErr, fnErr) // You can use other error accumulation strategies if needed
+			}
+		}
+		if localErr != nil {
+			logger.GetLogger().Log(logger.Warning, fmt.Sprintf("error while performing otel shutdown, err: %v", localErr))
+		}
+		shutdownFuncs = nil
+		return localErr
+	}
+
+	//handleErr calls shutdown for cleanup and makes sure all errors are returned
+	handleErr := func(inErr error) {
+		err = errors.Join(inErr, shutdown(ctx))
+	}
+
+	errorTicker = time.NewTicker(time.Duration(config.OTelConfigData.OTelErrorReportingInterval) * time.Second)
+
+	errorDataMap := make(map[string]*OTelErrorData) //Initialize the error map before registering the handler
+	gErrorDataMap.Store(errorDataMap)
+
+	traceProvider, err := newTraceProvider(ctx) //Initialize trace provider
+	if err != nil {
+		handleErr(err)
+		return nil, err
+	}
+	shutdownFuncs = append(shutdownFuncs, traceProvider.Shutdown)
+	otel.SetTracerProvider(traceProvider)
+
+	//Setup meter provider; check the error before registering it globally
+	meterProvider, err := newMeterProvider(ctx)
+	if err != nil {
+		handleErr(err)
+		return nil, err
+	}
+	otel.SetMeterProvider(meterProvider)
+	shutdownFuncs = append(shutdownFuncs, meterProvider.Shutdown)
+
+	oTelErrorHandler := OTelErrorHandler{}
+	otel.SetErrorHandler(oTelErrorHandler)  //Register the custom error handler
+	oTelErrorHandler.processOTelErrorsMap() //Spawn a goroutine to periodically process OTEL errors
+	shutdownFuncs = append(shutdownFuncs, func(ctx context.Context) error {
+		errorTicker.Stop()
+		return nil
+	})
+	return shutdown, err
+}
+
+func newTraceProvider(ctx context.Context) (*trace.TracerProvider, error) {
+
+	traceExporter, err := getTraceExporter(ctx)
+	if err != nil {
+		return nil, err
+	}
+
+	traceProvider := trace.NewTracerProvider(
+		trace.WithBatcher(traceExporter,
+			trace.WithBatchTimeout(5*time.Second),
+			trace.WithExportTimeout(2*time.Second),
+			trace.WithMaxExportBatchSize(10),
+			trace.WithMaxQueueSize(10),
+		),
+		trace.WithResource(getResourceInfo(config.OTelConfigData.PoolName)),
+	)
+	return traceProvider, nil
+}
+
+// newMeterProvider initializes the meter provider with the respective exporter, either HTTP or GRPC
+func newMeterProvider(ctx context.Context) (*metric.MeterProvider, error) {
+	metricExporter, err := getMetricExporter(ctx)
+
+	if err != nil {
+		logger.GetLogger().Log(logger.Alert, "failed to initialize metric exporter, error:", err)
+		return nil, err
+	}
+
+	meterProvider := metric.NewMeterProvider(
+		metric.WithResource(getResourceInfo(config.OTelConfigData.PoolName)),
+		metric.WithReader(metric.NewPeriodicReader(metricExporter,
+			metric.WithInterval(time.Duration(config.OTelConfigData.ResolutionTimeInSec)*time.Second))),
+	)
+	return meterProvider, nil
+}
+
+// getMetricExporter initializes the metric exporter based on the protocol selected by the user.
+func getMetricExporter(ctx context.Context) (metric.Exporter, error) {
+	if config.OTelConfigData.UseOtelGRPC {
+		return newGRPCExporter(ctx)
+	}
+	return newHTTPExporter(ctx)
+}
+
+// getTraceExporter initializes the span exporter based on the protocol (GRPC or HTTP) selected by the user.
+func getTraceExporter(ctx context.Context) (*otlptrace.Exporter, error) {
+	if config.OTelConfigData.UseOtelGRPC {
+		return newGRPCTraceExporter(ctx)
+	}
+	return newHTTPTraceExporter(ctx)
+}
+
+// newHTTPExporter initializes the "otlpmetrichttp" exporter, which exports metrics data using the
+// OpenTelemetry Protocol (OTLP) over HTTP.
+func newHTTPExporter(ctx context.Context) (metric.Exporter, error) {
+	headers := make(map[string]string)
+	headers[IngestTokenHeader] = config.GetOTelIngestToken()
+
+	//Currently all metrics use delta temporality. Delta temporality: use when you are interested in the rate of change
+	//over time or when you need to report only the differences (deltas) between measurements.
+	//This is useful for metrics like CPU usage, request rates, or other metrics where the rate of change is important.
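+	//As an illustration (not part of the original patch): a counter observed at
+	//cumulative values 10, 25, 40 over three consecutive intervals is exported
+	//as 10, 15, 15 under delta temporality (the per-interval increments),
+	//whereas cumulative temporality would export 10, 25, 40. The numbers are
+	//made up purely to show the difference.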
+ var temporalitySelector = func(instrument metric.InstrumentKind) metricdata.Temporality { return metricdata.DeltaTemporality } + + return otlpmetrichttp.New(ctx, + otlpmetrichttp.WithEndpoint(fmt.Sprintf("%s:%d", config.OTelConfigData.Host, config.OTelConfigData.HttpPort)), + otlpmetrichttp.WithTimeout(time.Duration(config.OTelConfigData.ExporterTimeout)*time.Second), + otlpmetrichttp.WithCompression(otlpmetrichttp.NoCompression), + otlpmetrichttp.WithTemporalitySelector(temporalitySelector), + otlpmetrichttp.WithHeaders(headers), + otlpmetrichttp.WithRetry(otlpmetrichttp.RetryConfig{ + // Enabled indicates whether to not retry sending batches in case + // of export failure. + Enabled: false, + // InitialInterval the time to wait after the first failure before + // retrying. + InitialInterval: 1 * time.Second, + // MaxInterval is the upper bound on backoff interval. Once this + // value is reached the delay between consecutive retries will + // always be `MaxInterval`. + MaxInterval: 10 * time.Second, + // MaxElapsedTime is the maximum amount of time (including retries) + // spent trying to send a request/batch. Once this value is + // reached, the data is discarded. + MaxElapsedTime: 20 * time.Second, + }), + otlpmetrichttp.WithURLPath(config.OTelConfigData.MetricsURLPath), + otlpmetrichttp.WithInsecure(), //Since agent is local + ) +} + +// newGRPCExporter Initializes The "otlpmetricgrpc" exporter in OpenTelemetry is used to export metrics data using the +// OpenTelemetry Protocol (OTLP) over GRPC. +func newGRPCExporter(ctx context.Context) (metric.Exporter, error) { + + headers := make(map[string]string) + headers[IngestTokenHeader] = config.GetOTelIngestToken() + + //Currently all metrics uses delta-temporality: Delta Temporality: Use when you are interested in the rate of change + //over time or when you need to report only the differences (deltas) between measurements. + //This is useful for metrics like CPU usage, request rates, or other metrics where the rate of change is important. + var temporalitySelector = func(instrument metric.InstrumentKind) metricdata.Temporality { return metricdata.DeltaTemporality } + + return otlpmetricgrpc.New(ctx, + otlpmetricgrpc.WithEndpoint(fmt.Sprintf("%s:%d", config.OTelConfigData.Host, config.OTelConfigData.GRPCPort)), + otlpmetricgrpc.WithTimeout(time.Duration(config.OTelConfigData.ExporterTimeout)*time.Second), + otlpmetricgrpc.WithHeaders(headers), + otlpmetricgrpc.WithReconnectionPeriod(time.Duration(5)*time.Second), + otlpmetricgrpc.WithTemporalitySelector(temporalitySelector), + otlpmetricgrpc.WithRetry(otlpmetricgrpc.RetryConfig{ + // Enabled indicates whether to not retry sending batches in case + // of export failure. + Enabled: false, + // InitialInterval the time to wait after the first failure before + // retrying. + InitialInterval: 1 * time.Second, + // MaxInterval is the upper bound on backoff interval. Once this + // value is reached the delay between consecutive retries will + // always be `MaxInterval`. + MaxInterval: 10 * time.Second, + // MaxElapsedTime is the maximum amount of time (including retries) + // spent trying to send a request/batch. Once this value is + // reached, the data is discarded. + MaxElapsedTime: 20 * time.Second, + }), + otlpmetricgrpc.WithInsecure(), //Since agent is local + ) +} + +// newHTTPTraceExporter Initilizes The "otlptracehttp" exporter in OpenTelemetry is used to export spans data using the +// OpenTelemetry Protocol (OTLP) over HTTP. 
+func newHTTPTraceExporter(ctx context.Context) (*otlptrace.Exporter, error) { + headers := make(map[string]string) + headers[IngestTokenHeader] = config.GetOTelIngestToken() + + return otlptracehttp.New(ctx, + otlptracehttp.WithEndpoint(fmt.Sprintf("%s:%d", config.OTelConfigData.Host, config.OTelConfigData.HttpPort)), + otlptracehttp.WithTimeout(time.Duration(config.OTelConfigData.ExporterTimeout)*time.Second), + otlptracehttp.WithHeaders(headers), + otlptracehttp.WithRetry(otlptracehttp.RetryConfig{ + // Enabled indicates whether to not retry sending batches in case + // of export failure. + Enabled: false, + // InitialInterval the time to wait after the first failure before + // retrying. + InitialInterval: 1 * time.Second, + // MaxInterval is the upper bound on backoff interval. Once this + // value is reached the delay between consecutive retries will + // always be `MaxInterval`. + MaxInterval: 10 * time.Second, + // MaxElapsedTime is the maximum amount of time (including retries) + // spent trying to send a request/batch. Once this value is + // reached, the data is discarded. + MaxElapsedTime: 20 * time.Second, + }), + otlptracehttp.WithURLPath(config.OTelConfigData.TraceURLPath), + otlptracehttp.WithInsecure(), //Since agent is local + ) +} + +// newGRPCTraceExporter Initilizes The "otlptracegrpc" exporter in OpenTelemetry is used to export spans data using the +// OpenTelemetry Protocol (OTLP) over GRPC. +func newGRPCTraceExporter(ctx context.Context) (*otlptrace.Exporter, error) { + + headers := make(map[string]string) + headers[IngestTokenHeader] = config.GetOTelIngestToken() + + return otlptracegrpc.New(ctx, + otlptracegrpc.WithEndpoint(fmt.Sprintf("%s:%d", config.OTelConfigData.Host, config.OTelConfigData.GRPCPort)), + otlptracegrpc.WithTimeout(time.Duration(config.OTelConfigData.ExporterTimeout)*time.Second), + otlptracegrpc.WithHeaders(headers), + otlptracegrpc.WithRetry(otlptracegrpc.RetryConfig{ + // Enabled indicates whether to not retry sending batches in case + // of export failure. + Enabled: false, + // InitialInterval the time to wait after the first failure before + // retrying. + InitialInterval: 1 * time.Second, + // MaxInterval is the upper bound on backoff interval. Once this + // value is reached the delay between consecutive retries will + // always be `MaxInterval`. + MaxInterval: 10 * time.Second, + // MaxElapsedTime is the maximum amount of time (including retries) + // spent trying to send a request/batch. Once this value is + // reached, the data is discarded. 
+ MaxElapsedTime: 20 * time.Second, + }), + otlptracegrpc.WithInsecure(), //Since agent is local + ) +} + +// getResourceInfo provide application context level attributes during initialization +func getResourceInfo(appName string) *resource.Resource { + hostname, _ := os.Hostname() + + // Create a slice to hold the attributes + attributes := []attribute.KeyValue{ + attribute.String("container_host", hostname), + attribute.String("application", appName), + attribute.String("source", "otel"), + } + + environment, isPresent := os.LookupEnv("ENVIRONMENT") + if !isPresent { + environment = "dev" + } + az, isPresent := os.LookupEnv("AVAILABILITY_ZONE") + if !isPresent { + az = "dev" + } + attributes = append(attributes, attribute.String("az", az)) + attributes = append(attributes, attribute.String("environment", environment)) + + resource := resource.NewWithAttributes(fmt.Sprintf("%s resource", config.OTelConfigData.ResourceType), + attributes..., + ) + return resource +} diff --git a/utility/logger/otel/state_logger.go b/utility/logger/otel/state_logger.go new file mode 100644 index 00000000..a02eb0d5 --- /dev/null +++ b/utility/logger/otel/state_logger.go @@ -0,0 +1,310 @@ +package otel + +import ( + "context" + "fmt" + "github.com/paypal/hera/utility/logger" + otelconfig "github.com/paypal/hera/utility/logger/otel/config" + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/metric" + "sync" + "time" +) + +const defaultAppName string = "occ" + +// This lock prevents a race between batch observer and instrument registration +var registerStateMetrics sync.Once +var metricsStateLogger *StateLogMetrics + +// Implement apply function in to configure meter provider +func (o MetricProviderOption) apply(c *stateLogMetricsConfig) { + if o.MeterProvider != nil { + c.MeterProvider = o.MeterProvider + } +} + +// Implement apply function in to configure pool name +func (appName AppNameOption) apply(c *stateLogMetricsConfig) { + if appName != "" { + c.appName = string(appName) + } +} + +// WithAppName Create StateLogMetrics with OCC Name +func WithAppName(appName string) StateLogOption { + return AppNameOption(appName) +} + +// WithMetricProvider Create StateLogMetrics with provided meter Provider +func WithMetricProvider(provider metric.MeterProvider) StateLogOption { + return MetricProviderOption{provider} +} + +// newConfig computes a config from the supplied Options. +func newConfig(opts ...StateLogOption) stateLogMetricsConfig { + statesConfig := stateLogMetricsConfig{ + MeterProvider: otel.GetMeterProvider(), + appName: defaultAppName, + } + + for _, opt := range opts { + opt.apply(&statesConfig) + } + return statesConfig +} + +// StartMetricsCollection initializes reporting of stateLogMetrics using the supplied config. +func StartMetricsCollection(totalWorkersCount int, opt ...StateLogOption) error { + stateLogMetricsConfig := newConfig(opt...) 
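+	//Worked example (not part of the original patch): with 3 workers and
+	//otel_resolution_time_in_sec=3, the buffered state-data channel created
+	//below is sized 3*3*2 = 18 entries, enough to absorb roughly two resolution
+	//intervals of per-worker snapshots without blocking the statelog reporter.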
+
+	//Verification of config data
+	if stateLogMetricsConfig.appName == "" {
+		stateLogMetricsConfig.appName = defaultAppName
+	}
+
+	if stateLogMetricsConfig.MeterProvider == nil {
+		stateLogMetricsConfig.MeterProvider = otel.GetMeterProvider()
+	}
+
+	var err error
+	//Registers instrumentation for metrics
+	registerStateMetrics.Do(func() {
+		//Initialize state-log metrics
+		metricsStateLogger = &StateLogMetrics{
+			meter: stateLogMetricsConfig.MeterProvider.Meter(StateLogMeterName,
+				metric.WithInstrumentationVersion(OtelInstrumentationVersion)),
+			metricsConfig:  stateLogMetricsConfig,
+			mStateDataChan: make(chan *WorkersStateData, totalWorkersCount*otelconfig.OTelConfigData.ResolutionTimeInSec*2), //Size of the buffered channel = totalWorkersCount * ResolutionTimeInSec * 2
+			doneCh:         make(chan struct{}),
+		}
+		err = metricsStateLogger.register()
+	})
+	return err
+}
+
+// StopMetricCollection sends a notification to stateLogMetrics.doneCh to stop metric collection
+func StopMetricCollection() {
+	select {
+	case metricsStateLogger.doneCh <- struct{}{}:
+		return
+	default:
+		logger.GetLogger().Log(logger.Info, "channel has already been closed.")
+		return
+	}
+}
+
+// AddDataPointToOTELStateDataChan sends data to the stateLogMetrics.mStateDataChan channel
+func AddDataPointToOTELStateDataChan(dataPoint *WorkersStateData) {
+	select {
+	case metricsStateLogger.mStateDataChan <- dataPoint:
+		return
+	case <-time.After(time.Millisecond * 100):
+		logger.GetLogger().Log(logger.Alert, "timeout occurred while adding record to stats data channel")
+	}
+}
+
+// Define instrumentation for each metric and register it with StateLogMetrics
+func (stateLogMetrics *StateLogMetrics) register() error {
+
+	//"init", "acpt", "wait", "busy", "schd", "fnsh", "quce", "asgn", "idle", "bklg", "strd", "cls"
+	var err error
+	if stateLogMetrics.initState, err = stateLogMetrics.meter.Int64ObservableGauge(
+		otelconfig.OTelConfigData.PopulateMetricNamePrefix(InitConnCountMetric),
+		metric.WithDescription("Number of workers in init state"),
+	); err != nil {
+		logger.GetLogger().Log(logger.Alert, "Failed to register gauge metric for init state", err)
+		return err
+	}
+
+	if stateLogMetrics.acptState, err = stateLogMetrics.meter.Int64ObservableGauge(
+		otelconfig.OTelConfigData.PopulateMetricNamePrefix(AccptConnCountMetric),
+		metric.WithDescription("Number of workers in accept state"),
+	); err != nil {
+		logger.GetLogger().Log(logger.Alert, "Failed to register gauge metric for accept state", err)
+		return err
+	}
+
+	if stateLogMetrics.waitState, err = stateLogMetrics.meter.Int64ObservableGauge(
+		otelconfig.OTelConfigData.PopulateMetricNamePrefix(WaitConnCountMetric),
+		metric.WithDescription("Number of workers in wait state"),
+	); err != nil {
+		logger.GetLogger().Log(logger.Alert, "Failed to register gauge metric for wait state", err)
+		return err
+	}
+
+	if stateLogMetrics.busyState, err = stateLogMetrics.meter.Int64ObservableGauge(
+		otelconfig.OTelConfigData.PopulateMetricNamePrefix(BusyConnCountMetric),
+		metric.WithDescription("Number of workers in busy state"),
+	); err != nil {
+		logger.GetLogger().Log(logger.Alert, "Failed to register gauge metric for busy state", err)
+		return err
+	}
+
+	if stateLogMetrics.schdState, err = stateLogMetrics.meter.Int64ObservableGauge(
+		otelconfig.OTelConfigData.PopulateMetricNamePrefix(ScheduledConnCountMetric),
+		metric.WithDescription("Number of workers in scheduled state"),
+	); err != nil {
+		logger.GetLogger().Log(logger.Alert,
"Failed to register guage metric for scheduled state", err) + return err + } + + if stateLogMetrics.fnshState, err = stateLogMetrics.meter.Int64ObservableGauge( + otelconfig.OTelConfigData.PopulateMetricNamePrefix(FinishedConnCountMetric), + metric.WithDescription("Number of workers in finished state"), + ); err != nil { + logger.GetLogger().Log(logger.Alert, "Failed to register guage metric for finished state", err) + return err + } + + if stateLogMetrics.quceState, err = stateLogMetrics.meter.Int64ObservableGauge( + otelconfig.OTelConfigData.PopulateMetricNamePrefix(QuiescedConnCountMetric), + metric.WithDescription("Number of workers in quiesced state"), + ); err != nil { + logger.GetLogger().Log(logger.Alert, "Failed to register guage metric for quiesced state", err) + return err + } + + if stateLogMetrics.asgnState, err = stateLogMetrics.meter.Int64ObservableGauge( + otelconfig.OTelConfigData.PopulateMetricNamePrefix(AssignedConnCountMetric), + metric.WithDescription("Number of workers in assigned state"), + ); err != nil { + logger.GetLogger().Log(logger.Alert, "Failed to register guage metric for assigned state", err) + return err + } + + if stateLogMetrics.idleState, err = stateLogMetrics.meter.Int64ObservableGauge( + otelconfig.OTelConfigData.PopulateMetricNamePrefix(IdleConnCountMetric), + metric.WithDescription("Number of workers in idle state"), + ); err != nil { + logger.GetLogger().Log(logger.Alert, "Failed to register guage metric for idle state", err) + return err + } + + if stateLogMetrics.bklgState, err = stateLogMetrics.meter.Int64ObservableGauge( + otelconfig.OTelConfigData.PopulateMetricNamePrefix(BacklogConnCountMetric), + metric.WithDescription("Number of workers in backlog state"), + ); err != nil { + logger.GetLogger().Log(logger.Alert, "Failed to register guage metric for backlog state", err) + return err + } + + if stateLogMetrics.strdState, err = stateLogMetrics.meter.Int64ObservableGauge( + otelconfig.OTelConfigData.PopulateMetricNamePrefix(StrdConnCountMetric), + metric.WithDescription("Number of connections in stranded state"), + ); err != nil { + logger.GetLogger().Log(logger.Alert, "Failed to register guage metric for stranded state", err) + return err + } + + stateLogMetrics.registration, err = stateLogMetrics.meter.RegisterCallback( + func(ctx context.Context, observer metric.Observer) error { + return stateLogMetrics.asyncStateLogMetricsPoll(observer) + }, + []metric.Observable{ + stateLogMetrics.initState, + stateLogMetrics.acptState, + stateLogMetrics.waitState, + stateLogMetrics.busyState, + stateLogMetrics.schdState, + stateLogMetrics.fnshState, + stateLogMetrics.quceState, + stateLogMetrics.asgnState, + stateLogMetrics.idleState, + stateLogMetrics.bklgState, + stateLogMetrics.strdState, + }...) 
+
+/*
+ * asyncStateLogMetricsPoll is invoked periodically by the OTEL SDK, based on its polling interval.
+ * It drains the state-log channel and, for each shardId + workerType + instanceId combination,
+ * sums the req/resp counters and records the maximum observed value per worker state.
+ */
+func (stateLogMetrics *StateLogMetrics) asyncStateLogMetricsPoll(observer metric.Observer) (err error) {
+	stateLogMetrics.stateLock.Lock()
+	defer stateLogMetrics.stateLock.Unlock()
+	stateLogsData := make(map[string]map[string]int64)
+	//Drain the channel, aggregating until it is empty or closed, then send metrics
+mainloop:
+	for {
+		select {
+		case workersState, more := <-stateLogMetrics.mStateDataChan:
+			if !more {
+				logger.GetLogger().Log(logger.Info, "Statelog metrics data channel 'mStateDataChan' has been closed.")
+				break mainloop
+			}
+			keyName := fmt.Sprintf("%d-%d-%d", workersState.ShardId, workersState.WorkerType, workersState.InstanceId)
+
+			if stateLogsData[keyName] == nil {
+				stateLogsData[keyName] = make(map[string]int64)
+			}
+			//Update metadata information
+			stateLogsData[keyName][ShardId] = int64(workersState.ShardId)
+			stateLogsData[keyName][WorkerType] = int64(workersState.WorkerType)
+			stateLogsData[keyName][InstanceId] = int64(workersState.InstanceId)
+			stateLogsData[keyName][Datapoints] += 1
+
+			for key, value := range workersState.StateData {
+				if key == "req" || key == "resp" {
+					stateLogsData[keyName][key] += value
+				} else {
+					maxKey := key + "Max"
+					stateLogsData[keyName][key] = value
+					//update the running max for this state
+					if stateLogsData[keyName][maxKey] < value {
+						stateLogsData[keyName][maxKey] = value
+					}
+				}
+			}
+		case <-stateLogMetrics.doneCh:
+			logger.GetLogger().Log(logger.Info, "received stop signal for processing statelog metrics, "+
+				"so unregistering the observation callback and closing the data channel")
+			close(stateLogMetrics.mStateDataChan)
+			stateLogMetrics.registration.Unregister()
+		default:
+			break mainloop
+		}
+	}
+	//Process metrics data
+	if len(stateLogsData) > 0 {
+		err = stateLogMetrics.sendMetricsDataToCollector(observer, stateLogsData)
+	}
+	return err
+}
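+
+// Worked example of the aggregation above (hypothetical datapoints): for key
+// "0-1-0", busy=5 followed by busy=2 yields busy=2 (latest value) with
+// busyMax=5, while req=3 followed by req=4 accumulates to req=7, because
+// "req"/"resp" are summed and every other state keeps a running max.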
+
+/*
+ * sendMetricsDataToCollector reports the aggregated data points to the metric observer.
+ */
+func (stateLogMetrics *StateLogMetrics) sendMetricsDataToCollector(observer metric.Observer, stateLogsData map[string]map[string]int64) (err error) {
+	for key, aggStatesData := range stateLogsData {
+		logger.GetLogger().Log(logger.Info, fmt.Sprintf("calculated max value and aggregation of updown counter for key: %s using datapoints size: %d", key, aggStatesData[Datapoints]))
+		commonLabels := []attribute.KeyValue{
+			attribute.Int(ShardId, int(aggStatesData[ShardId])),
+			attribute.Int(WorkerType, int(aggStatesData[WorkerType])),
+			attribute.Int(InstanceId, int(aggStatesData[InstanceId])),
+		}
+		//Observe states data
+		// 1. Worker States
+		observer.ObserveInt64(stateLogMetrics.initState, aggStatesData["init"], metric.WithAttributes(commonLabels...))
+		observer.ObserveInt64(stateLogMetrics.acptState, aggStatesData["acpt"], metric.WithAttributes(commonLabels...))
+		observer.ObserveInt64(stateLogMetrics.waitState, aggStatesData["wait"], metric.WithAttributes(commonLabels...))
+		observer.ObserveInt64(stateLogMetrics.busyState, aggStatesData["busy"], metric.WithAttributes(commonLabels...))
+		observer.ObserveInt64(stateLogMetrics.schdState, aggStatesData["schd"], metric.WithAttributes(commonLabels...))
+		observer.ObserveInt64(stateLogMetrics.fnshState, aggStatesData["fnsh"], metric.WithAttributes(commonLabels...))
+		observer.ObserveInt64(stateLogMetrics.quceState, aggStatesData["quce"], metric.WithAttributes(commonLabels...))
+
+		// 2. Connection States
+		observer.ObserveInt64(stateLogMetrics.asgnState, aggStatesData["asgn"], metric.WithAttributes(commonLabels...))
+		observer.ObserveInt64(stateLogMetrics.idleState, aggStatesData["idle"], metric.WithAttributes(commonLabels...))
+		observer.ObserveInt64(stateLogMetrics.bklgState, aggStatesData["bklg"], metric.WithAttributes(commonLabels...))
+		observer.ObserveInt64(stateLogMetrics.strdState, aggStatesData["strd"], metric.WithAttributes(commonLabels...))
+	}
+	return nil
+}
diff --git a/utility/logger/otel/test/mock_collector.go b/utility/logger/otel/test/mock_collector.go
new file mode 100644
index 00000000..ba64445c
--- /dev/null
+++ b/utility/logger/otel/test/mock_collector.go
@@ -0,0 +1,341 @@
+package otel
+
+import (
+	"bytes"
+	"compress/gzip"
+	"context"
+	"crypto/ecdsa"
+	"crypto/elliptic"
+	cryptorand "crypto/rand"
+	"crypto/tls"
+	"crypto/x509"
+	"crypto/x509/pkix"
+	"encoding/pem"
+	"fmt"
+	"io"
+	"io/ioutil"
+	"math/big"
+	mathrand "math/rand"
+	"net"
+	"net/http"
+	"sync"
+	"testing"
+	"time"
+
+	"google.golang.org/protobuf/proto"
+
+	collectormetricpb "go.opentelemetry.io/proto/otlp/collector/metrics/v1"
+	metricpb "go.opentelemetry.io/proto/otlp/metrics/v1"
+)
+
+const DefaultMetricsPath string = "/v1/metrics"
+
+type mockCollector struct {
+	endpoint string
+	server   *http.Server
+
+	spanLock       sync.Mutex
+	metricsStorage MetricsStorage
+
+	injectHTTPStatus  []int
+	injectContentType string
+	delay             <-chan struct{}
+
+	clientTLSConfig *tls.Config
+	expectedHeaders map[string]string
+}
+
+func (c *mockCollector) Stop() error {
+	return c.server.Shutdown(context.Background())
+}
+
+func (c *mockCollector) MustStop(t *testing.T) {
+	if err := c.server.Shutdown(context.Background()); err != nil {
+		t.Fatalf("failed to shut down mock collector: %v", err)
+	}
+}
+
+func (c *mockCollector) GetMetrics() []*metricpb.Metric {
+	c.spanLock.Lock()
+	defer c.spanLock.Unlock()
+	return c.metricsStorage.GetMetrics()
+}
+
+func (c *mockCollector) Endpoint() string {
+	return c.endpoint
+}
+
+func (c *mockCollector) ClientTLSConfig() *tls.Config {
+	return c.clientTLSConfig
+}
+
+func (c *mockCollector) serveMetrics(w http.ResponseWriter, r *http.Request) {
+	if c.delay != nil {
+		select {
+		case <-c.delay:
+		case <-r.Context().Done():
+			return
+		}
+	}
+
+	if !c.checkHeaders(r) {
+		w.WriteHeader(http.StatusBadRequest)
+		return
+	}
+	response := collectormetricpb.ExportMetricsServiceResponse{}
+	rawResponse, err := proto.Marshal(&response)
+	if err != nil {
+		w.WriteHeader(http.StatusInternalServerError)
+		return
+	}
+	if injectedStatus := c.getInjectHTTPStatus(); injectedStatus != 0 {
+		writeReply(w, rawResponse, injectedStatus, c.injectContentType)
+		return
+	}
+	rawRequest, err := readRequest(r)
+	if err != nil {
+		w.WriteHeader(http.StatusInternalServerError)
+		return
+	}
+	request, err := unmarshalMetricsRequest(rawRequest, r.Header.Get("content-type"))
+	if err != nil {
+		w.WriteHeader(http.StatusBadRequest)
+		return
+	}
+	writeReply(w, rawResponse, 0, c.injectContentType)
+	c.spanLock.Lock()
+	defer c.spanLock.Unlock()
+
+	fmt.Println("---------------serveMetrics--------------", request)
+	c.metricsStorage.AddMetrics(request)
+}
+
+func unmarshalMetricsRequest(rawRequest []byte, contentType string) (*collectormetricpb.ExportMetricsServiceRequest, error) {
+	request := &collectormetricpb.ExportMetricsServiceRequest{}
+	if contentType != "application/x-protobuf" {
+		return request, fmt.Errorf("invalid content-type: %s, only application/x-protobuf is supported", contentType)
+	}
+	err := proto.Unmarshal(rawRequest, request)
+	return request, err
+}
+
+func (c *mockCollector) checkHeaders(r *http.Request) bool {
+	for k, v := range c.expectedHeaders {
+		got := r.Header.Get(k)
+		if got != v {
+			return false
+		}
+	}
+	return true
+}
+
+func (c *mockCollector) getInjectHTTPStatus() int {
+	if len(c.injectHTTPStatus) == 0 {
+		return 0
+	}
+	status := c.injectHTTPStatus[0]
+	c.injectHTTPStatus = c.injectHTTPStatus[1:]
+	if len(c.injectHTTPStatus) == 0 {
+		c.injectHTTPStatus = nil
+	}
+	return status
+}
+
+func readRequest(r *http.Request) ([]byte, error) {
+	if r.Header.Get("Content-Encoding") == "gzip" {
+		return readGzipBody(r.Body)
+	}
+	return ioutil.ReadAll(r.Body)
+}
+
+func readGzipBody(body io.Reader) ([]byte, error) {
+	rawRequest := bytes.Buffer{}
+	gunzipper, err := gzip.NewReader(body)
+	if err != nil {
+		return nil, err
+	}
+	defer gunzipper.Close()
+	_, err = io.Copy(&rawRequest, gunzipper)
+	if err != nil {
+		return nil, err
+	}
+	return rawRequest.Bytes(), nil
+}
+
+func writeReply(w http.ResponseWriter, rawResponse []byte, injectHTTPStatus int, injectContentType string) {
+	status := http.StatusOK
+	if injectHTTPStatus != 0 {
+		status = injectHTTPStatus
+	}
+	contentType := "application/x-protobuf"
+	if injectContentType != "" {
+		contentType = injectContentType
+	}
+	w.Header().Set("Content-Type", contentType)
+	w.WriteHeader(status)
+	_, _ = w.Write(rawResponse)
+}
+
+type mockCollectorConfig struct {
+	MetricsURLPath    string
+	Port              int
+	InjectHTTPStatus  []int
+	InjectContentType string
+	Delay             <-chan struct{}
+	WithTLS           bool
+	ExpectedHeaders   map[string]string
+}
+
+func (c *mockCollectorConfig) fillInDefaults() {
+	if c.MetricsURLPath == "" {
+		c.MetricsURLPath = DefaultMetricsPath
+	}
+}
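+
+// Illustrative configuration (hypothetical header name and values): the mock
+// can assert request headers and inject failures to exercise exporter retry
+// paths.
+//
+//	mc := runMockCollector(t, mockCollectorConfig{
+//		Port:             4318,
+//		ExpectedHeaders:  map[string]string{"X-Sample-Token": "welcome123"},
+//		InjectHTTPStatus: []int{503, 503},
+//	})
+//	defer mc.MustStop(t)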
+
+func runMockCollector(t *testing.T, cfg mockCollectorConfig) *mockCollector {
+	cfg.fillInDefaults()
+	ln, err := net.Listen("tcp", fmt.Sprintf("localhost:%d", cfg.Port))
+	if err != nil {
+		t.Fatalf("mock collector failed to listen on localhost:%d: %v", cfg.Port, err)
+	}
+	_, portStr, err := net.SplitHostPort(ln.Addr().String())
+	if err != nil {
+		t.Fatalf("failed to split mock collector host:port: %v", err)
+	}
+	m := &mockCollector{
+		endpoint:          fmt.Sprintf("localhost:%s", portStr),
+		metricsStorage:    NewMetricsStorage(),
+		injectHTTPStatus:  cfg.InjectHTTPStatus,
+		injectContentType: cfg.InjectContentType,
+		delay:             cfg.Delay,
+		expectedHeaders:   cfg.ExpectedHeaders,
+	}
+	mux := http.NewServeMux()
+	mux.Handle(cfg.MetricsURLPath, http.HandlerFunc(m.serveMetrics))
+	server := &http.Server{
+		Handler: mux,
+	}
+	if cfg.WithTLS {
+		pem, err := generateWeakCertificate()
+		if err != nil {
+			t.Fatalf("failed to generate test certificate: %v", err)
+		}
+		tlsCertificate, err := tls.X509KeyPair(pem.Certificate, pem.PrivateKey)
+		if err != nil {
+			t.Fatalf("failed to build TLS key pair: %v", err)
+		}
+		server.TLSConfig = &tls.Config{
+			Certificates: []tls.Certificate{tlsCertificate},
+		}
+
+		m.clientTLSConfig = &tls.Config{
+			InsecureSkipVerify: true,
+		}
+	}
+	go func() {
+		if cfg.WithTLS {
+			_ = server.ServeTLS(ln, "", "")
+		} else {
+			_ = server.Serve(ln)
+		}
+	}()
+	m.server = server
+	return m
+}
+
+type mathRandReader struct{}
+
+func (mathRandReader) Read(p []byte) (n int, err error) {
+	return mathrand.Read(p)
+}
+
+var randReader mathRandReader
+
+func generateWeakCertificate() (*pemCertificate, error) {
+	priv, err := ecdsa.GenerateKey(elliptic.P256(), randReader)
+	if err != nil {
+		return nil, err
+	}
+	keyUsage := x509.KeyUsageDigitalSignature
+	notBefore := time.Now()
+	notAfter := notBefore.Add(time.Hour)
+	serialNumberLimit := new(big.Int).Lsh(big.NewInt(1), 128)
+	serialNumber, err := cryptorand.Int(randReader, serialNumberLimit)
+	if err != nil {
+		return nil, err
+	}
+	template := x509.Certificate{
+		SerialNumber: serialNumber,
+		Subject: pkix.Name{
+			Organization: []string{"otel_basic-go"},
+		},
+		NotBefore:             notBefore,
+		NotAfter:              notAfter,
+		KeyUsage:              keyUsage,
+		ExtKeyUsage:           []x509.ExtKeyUsage{x509.ExtKeyUsageServerAuth},
+		BasicConstraintsValid: true,
+		DNSNames:              []string{"localhost"},
+		IPAddresses:           []net.IP{net.IPv6loopback, net.IPv4(127, 0, 0, 1)},
+	}
+	derBytes, err := x509.CreateCertificate(randReader, &template, &template, &priv.PublicKey, priv)
+	if err != nil {
+		return nil, err
+	}
+	certificateBuffer := new(bytes.Buffer)
+	if err := pem.Encode(certificateBuffer, &pem.Block{Type: "CERTIFICATE", Bytes: derBytes}); err != nil {
+		return nil, err
+	}
+	privDERBytes, err := x509.MarshalPKCS8PrivateKey(priv)
+	if err != nil {
+		return nil, err
+	}
+	privBuffer := new(bytes.Buffer)
+	if err := pem.Encode(privBuffer, &pem.Block{Type: "PRIVATE KEY", Bytes: privDERBytes}); err != nil {
+		return nil, err
+	}
+	return &pemCertificate{
+		Certificate: certificateBuffer.Bytes(),
+		PrivateKey:  privBuffer.Bytes(),
+	}, nil
+}
+
+type pemCertificate struct {
+	Certificate []byte
+	PrivateKey  []byte
+}
+
+// Collector is an interface that mock collectors should implement,
+// so they can be used for the end-to-end testing.
+type Collector interface {
+	Stop() error
+	GetMetrics() []*metricpb.Metric
+}
+
+// MetricsStorage stores the metrics. Mock collectors could use it to
+// store metrics they have received.
+type MetricsStorage struct {
+	metrics []*metricpb.Metric
+}
+
+// NewMetricsStorage creates a new metrics storage.
+func NewMetricsStorage() MetricsStorage {
+	return MetricsStorage{}
+}
+
+// AddMetrics adds metrics to the metrics storage.
+func (s *MetricsStorage) AddMetrics(request *collectormetricpb.ExportMetricsServiceRequest) {
+	for _, rm := range request.GetResourceMetrics() {
+		// TODO (rghetia) handle multiple resource and library info.
+		fmt.Println("---------------AddMetrics------------------", rm)
+
+		if len(rm.ScopeMetrics) > 0 {
+			s.metrics = append(s.metrics, rm.ScopeMetrics[0].Metrics...)
+			fmt.Println("Metric added successfully")
+		} else {
+			fmt.Println("Failed to add metrics: no scope metrics in payload")
+		}
+	}
+}
+
+// GetMetrics returns the stored metrics.
+func (s *MetricsStorage) GetMetrics() []*metricpb.Metric {
+	// return a copy so callers cannot mutate the stored metrics
+	m := make([]*metricpb.Metric, 0, len(s.metrics))
+	return append(m, s.metrics...)
+}
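+
+// Typical end-to-end flow with this mock (a sketch; see the tests in
+// state_logger_test.go for real usage):
+//
+//	mc := runMockCollector(t, mockCollectorConfig{Port: 4318})
+//	defer mc.MustStop(t)
+//	// ... initialize the OTEL exporter against localhost:4318 and emit metrics ...
+//	if len(mc.GetMetrics()) == 0 {
+//		t.Fatal("no metrics received by mock collector")
+//	}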
diff --git a/utility/logger/otel/test/state_logger_test.go b/utility/logger/otel/test/state_logger_test.go
new file mode 100644
index 00000000..e19d1df2
--- /dev/null
+++ b/utility/logger/otel/test/state_logger_test.go
@@ -0,0 +1,332 @@
+package otel
+
+import (
+	"context"
+	"fmt"
+	otellogger "github.com/paypal/hera/utility/logger/otel"
+	otelconfig "github.com/paypal/hera/utility/logger/otel/config"
+	"go.opentelemetry.io/otel"
+	"go.opentelemetry.io/otel/attribute"
+	"go.opentelemetry.io/otel/sdk/metric"
+	"go.opentelemetry.io/otel/sdk/resource"
+	"math/rand"
+	"os"
+	"testing"
+	"time"
+
+	"github.com/paypal/hera/utility/logger"
+	"go.opentelemetry.io/otel/exporters/stdout/stdoutmetric"
+)
+
+// This initializes the console exporter for metrics
+func initializeConsoleExporter() (*metric.MeterProvider, error) {
+	otelconfig.OTelConfigData = &otelconfig.OTelConfig{
+		Host:                       "localhost",
+		HttpPort:                   4318,
+		Enabled:                    true,
+		UseOtelGRPC:                false,
+		ResolutionTimeInSec:        3,
+		OTelErrorReportingInterval: 10,
+		PoolName:                   "occ-testapp",
+		MetricNamePrefix:           "pp.occ",
+		MetricsURLPath:             DefaultMetricsPath,
+	}
+	hostname, _ := os.Hostname()
+
+	resource := resource.NewWithAttributes("OCC resource",
+		attribute.String("container_host", hostname),
+		attribute.String("az", "devTest"),
+		attribute.String("environment", "dev"),
+		attribute.String("application", "occ-testapp"),
+	)
+	metricExporter, err := stdoutmetric.New(stdoutmetric.WithPrettyPrint())
+	if err != nil {
+		logger.GetLogger().Log(logger.Alert, "failed to initialize metric stdout exporter:", err)
+		return nil, err
+	}
+
+	meterProvider := metric.NewMeterProvider(
+		metric.WithResource(resource),
+		metric.WithReader(metric.NewPeriodicReader(metricExporter,
+			// Default is 1m. Set to 3s for demonstrative purposes.
+			metric.WithInterval(3*time.Second))),
+	)
+	otel.SetMeterProvider(meterProvider)
+	return meterProvider, nil
+}
+
+func initializeCustomOTelExporter(t *testing.T) func(ctx context.Context) error {
+	otelconfig.OTelConfigData = &otelconfig.OTelConfig{
+		Host:                       "localhost",
+		HttpPort:                   4318,
+		Enabled:                    true,
+		UseOtelGRPC:                false,
+		ResolutionTimeInSec:        3,
+		OTelErrorReportingInterval: 10,
+		PoolName:                   "occ-testapp",
+		MetricNamePrefix:           "pp.occ",
+		MetricsURLPath:             DefaultMetricsPath,
+	}
+	otelconfig.SetOTelIngestToken("welcome123")
+	ctx := context.Background()
+	shutdownFn, err := otellogger.Init(ctx)
+
+	if err != nil {
+		t.Fatalf("failed to initialize OTEL sdk during test, error: %v", err)
+	}
+	return shutdownFn
+}
+
+func TestVerifyStateLogMetricsInitialization(t *testing.T) {
+	mc := runMockCollector(t, mockCollectorConfig{
+		Port: 4318,
+	})
+	defer mc.MustStop(t)
+
+	_, err := initializeConsoleExporter()
+	if err != nil {
+		t.Fail()
+	}
+
+	err = otellogger.StartMetricsCollection(5, otellogger.WithMetricProvider(otel.GetMeterProvider()), otellogger.WithAppName("occ-testapp"))
+
+	if err != nil {
+		logger.GetLogger().Log(logger.Alert, "Failed to initialize Metric Collection service")
+		t.Fail()
+	}
+	time.Sleep(15 * time.Second)
+	otellogger.StopMetricCollection()
+}
+
+func TestVerifyStateLogMetricsInitializationAndContextWithTimeout(t *testing.T) {
+	_, cancel := context.WithTimeout(context.Background(), 10*time.Second)
+	defer cancel()
+	mc := runMockCollector(t, mockCollectorConfig{
+		Port: 4318,
+	})
+	defer mc.MustStop(t)
+
+	_, err := initializeConsoleExporter()
+	if err != nil {
+		t.Fail()
+	}
+
+	err = otellogger.StartMetricsCollection(5, otellogger.WithMetricProvider(otel.GetMeterProvider()), otellogger.WithAppName("occ-testapp"))
+	defer otellogger.StopMetricCollection()
+
+	if err != nil {
+		logger.GetLogger().Log(logger.Alert, "Failed to initialize Metric Collection service")
+		t.Fail()
+	}
+}
+
+func TestSendingStateLogMetrics(t *testing.T) {
+	mc := runMockCollector(t, mockCollectorConfig{
+		Port:    4318,
+		WithTLS: false,
+	})
+	defer mc.MustStop(t)
+
+	shutDownFn := initializeCustomOTelExporter(t)
+	defer shutDownFn(context.Background())
+
+	time.Sleep(2 * time.Second)
+
+	err := otellogger.StartMetricsCollection(5, otellogger.WithMetricProvider(otel.GetMeterProvider()), otellogger.WithAppName("occ-testapp"))
+
+	if err != nil {
+		logger.GetLogger().Log(logger.Alert, "Failed to initialize Metric Collection service")
+		t.Fail()
+	}
+	//"init", "acpt", "wait", "busy", "schd", "fnsh", "quce", "asgn", "idle", "bklg", "strd", "cls"
+	var stateData = map[string]int64{
+		"init": 6,
+		"acpt": 10,
+		"wait": 5,
+		"busy": 2,
+		"idle": 5,
+		"bklg": 0,
+		"req":  5,
+		"resp": 5,
+	}
+	workersStateData := otellogger.WorkersStateData{
+		ShardId:    1,
+		WorkerType: 1,
+		InstanceId: 0,
+		StateData:  stateData,
+	}
+	otellogger.AddDataPointToOTELStateDataChan(&workersStateData)
+
+	defer otellogger.StopMetricCollection() //Clean channel
+
+	logger.GetLogger().Log(logger.Info, "Data Sent successfully for instrumentation")
+	time.Sleep(5 * time.Second)
+	metricsData := mc.GetMetrics()
+	if len(metricsData) < 11 {
+		t.Fatalf("got %d, wanted %d", len(metricsData), 11)
+	}
+}
+
+func TestSendingStateLogMetricsConsoleExporter(t *testing.T) {
+	cont, err := initializeConsoleExporter()
+	if err != nil {
+		t.Fail()
+	}
+
+	err2 := otellogger.StartMetricsCollection(100,
otellogger.WithMetricProvider(otel.GetMeterProvider()), otellogger.WithAppName("occ-testapp2")) + + if err2 != nil { + logger.GetLogger().Log(logger.Alert, "Failed to initialize Metric Collection service") + t.Fail() + } + + var stateData = map[string]int64{ + "init": 0, + "acpt": 15, + "wait": 10, + "busy": 4, + "idle": 7, + "bklg": 0, + } + + var stateData2 = map[string]int64{ + "init": 2, + "acpt": 15, + "wait": 10, + "busy": 4, + "idle": 8, + "bklg": 0, + } + workersStateData := otellogger.WorkersStateData{ + ShardId: 0, + WorkerType: 0, + InstanceId: 0, + StateData: stateData, + } + + workersStateData2 := otellogger.WorkersStateData{ + ShardId: 2, + WorkerType: 0, + InstanceId: 0, + StateData: stateData2, + } + + otellogger.AddDataPointToOTELStateDataChan(&workersStateData) + time.Sleep(150 * time.Millisecond) + otellogger.AddDataPointToOTELStateDataChan(&workersStateData2) + logger.GetLogger().Log(logger.Info, "Data Sent successfully for instrumentation") + time.Sleep(2 * time.Second) + + var stateData3 = map[string]int64{ + "init": 0, + "acpt": 1, + "wait": 10, + "busy": 4, + "idle": 17, + "bklg": 0, + } + + var stateData4 = map[string]int64{ + "init": 2, + "acpt": 0, + "wait": 10, + "busy": 4, + "idle": 8, + "bklg": 5, + } + workersStateData3 := otellogger.WorkersStateData{ + ShardId: 0, + WorkerType: 0, + InstanceId: 0, + StateData: stateData3, + } + + workersStateData4 := otellogger.WorkersStateData{ + ShardId: 2, + WorkerType: 0, + InstanceId: 0, + StateData: stateData4, + } + otellogger.AddDataPointToOTELStateDataChan(&workersStateData3) + time.Sleep(150 * time.Millisecond) + otellogger.AddDataPointToOTELStateDataChan(&workersStateData4) + otellogger.StopMetricCollection() + if err3 := cont.Shutdown(context.Background()); err3 != nil { + logger.GetLogger().Log(logger.Info, "failed to stop the metric controller:", err3) + } +} + +func TestOCCStatelogGenerator(t *testing.T) { + cont, err := initializeConsoleExporter() + if err != nil { + t.Fail() + } + defer cont.Shutdown(context.Background()) + + err2 := otellogger.StartMetricsCollection(1000, otellogger.WithMetricProvider(otel.GetMeterProvider()), otellogger.WithAppName("occ-testapp")) + defer otellogger.StopMetricCollection() + go dataGenerator() + + if err2 != nil { + logger.GetLogger().Log(logger.Alert, "Failed to initialize Metric Collection service") + t.Fatalf("TestOCCStatelogGenerator failed with error %v", err) + } + <-time.After(time.Second * time.Duration(10)) +} + +func dataGenerator() { + ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second) + defer cancel() + waitTime := time.Second * 1 + + metricNames := [11]string{"init", "acpt", "wait", "busy", "schd", "fnsh", "quce", "asgn", "idle", "bklg", "strd"} + workerStates := [2]string{"req", "resp"} + + timer := time.NewTimer(waitTime) + + defer timer.Stop() + +mainloop: + for { + select { + case <-timer.C: + // Initialize statedata object + workerStatesData := otellogger.WorkersStateData{ + ShardId: 0, + WorkerType: 1, + InstanceId: 0, + StateData: make(map[string]int64), + } + var numberofMetrics int = 11 + var totalSum int = 100 + var tempSum int = 0 + for index := 0; index < numberofMetrics; index++ { + exactpart := int(totalSum / numberofMetrics) + randVal := rand.Intn(exactpart) + randomValue := int(int(exactpart/2) + randVal) + value := If(tempSum+randomValue > totalSum, totalSum-tempSum, randomValue) + workerStatesData.StateData[metricNames[index]] = int64(value) + tempSum += value + } + //Random index + randIndex := 
rand.Intn(len(metricNames))
+			workerStatesData.StateData[metricNames[randIndex]] += int64(totalSum - tempSum)
+			workerStatesData.StateData[workerStates[0]] = int64(rand.Intn(100))
+			workerStatesData.StateData[workerStates[1]] = int64(rand.Intn(100))
+			otellogger.AddDataPointToOTELStateDataChan(&workerStatesData)
+			timer.Reset(waitTime)
+		case <-ctx.Done():
+			logger.GetLogger().Log(logger.Info, "Timed out, so context closed")
+			break mainloop
+		}
+	}
+}
+
+// If is a generic ternary helper (Go has no ternary operator)
+func If[T any](cond bool, vtrue, vfalse T) T {
+	if cond {
+		return vtrue
+	}
+	return vfalse
+}
From c3a15a2b4ed67bd9239bd08b415cf64c60ae678c Mon Sep 17 00:00:00 2001
From: Rajesh Samala
Date: Fri, 5 Jul 2024 19:25:18 +0530
Subject: [PATCH 02/19] Otel integration changes (#73)

* changes for integration of otel

* changes for adding otel collector functional test

* changes for incorporate suggestions adding functional tests

* optimize the code related to cal logging

* reading az values from envs

* changes for adding modules details

---------

Co-authored-by: Rajesh S
---
 go.sum | 59 +++++++++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 54 insertions(+), 5 deletions(-)

diff --git a/go.sum b/go.sum
index 8cabfc71..2c4e5240 100644
--- a/go.sum
+++ b/go.sum
@@ -1,19 +1,68 @@
+github.com/cenkalti/backoff/v4 v4.3.0 h1:MyRJ/UdXutAwSAT+s3wNd7MfTIcy71VQueUuFK343L8=
+github.com/cenkalti/backoff/v4 v4.3.0/go.mod h1:Y3VNntkOUPxTVeUxJ/G5vcM//AlwfmyYozVcomhLiZE=
+github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
 github.com/go-logfmt/logfmt v0.5.0 h1:TrB8swr/68K7m9CcGut2g3UOihhbcbiMAYiuTXdEih4=
 github.com/go-logfmt/logfmt v0.5.0/go.mod h1:wCYkCAKZfumFQihp8CzCvQ3paCTfi41vtzG1KdI/P7A=
+github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A=
+github.com/go-logr/logr v1.4.1 h1:pKouT5E8xu9zeFC39JXRDukb6JFQPXM5p5I91188VAQ=
+github.com/go-logr/logr v1.4.1/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY=
+github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag=
+github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE=
 github.com/go-sql-driver/mysql v1.7.1 h1:lUIinVbN1DY0xBg0eMOzmmtGoHwWBbvnWubQUrtU8EI=
 github.com/go-sql-driver/mysql v1.7.1/go.mod h1:OXbVy3sEdcQ2Doequ6Z5BW6fXNQTmx+9S1MCJN5yJMI=
 github.com/godror/godror v0.26.3 h1:V+z+Q/OBGgmmYzuAwyJzpcn4LsPF4Ev0xHAea68V00c=
 github.com/godror/godror v0.26.3/go.mod h1:1QCn6oXh3r+IlB3DLE8V6qkHXLSHd18a3Hw7szQ9/3Y=
 github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk=
-github.com/google/go-cmp v0.5.5 h1:Khx7svrCpmxxtHBq5j2mp/xVjsi8hQMfNLvJFAlrGgU=
 github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
+github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI=
+github.com/grpc-ecosystem/grpc-gateway/v2 v2.20.0 h1:bkypFPDjIYGfCYD5mRBvpqxfYX1YCS1PXdKYWi8FsN0=
+github.com/grpc-ecosystem/grpc-gateway/v2 v2.20.0/go.mod h1:P+Lt/0by1T8bfcF3z737NnSbmxQAppXMRziHUxPOC8k=
 github.com/lib/pq v1.10.3 h1:v9QZf2Sn6AmjXtQeFpdoq/eaNtYP6IN+7lcrygsIAtg=
 github.com/lib/pq v1.10.3/go.mod h1:AlVN5x4E4T544tWzH6hKfbfQvm3HdbOxrmggDNAPY9o=
-golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e h1:vcxGaoTs7kV8m5Np9uUNQin4BrLOthgV7252N8V+FwY=
+github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
+github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk=
+go.opentelemetry.io/otel v1.24.0 h1:0LAOdjNmQeSTzGBzduGe/rU4tZhMwL5rWgtp9Ku5Jfo=
+go.opentelemetry.io/otel v1.24.0/go.mod h1:W7b9Ozg4nkF5tWI5zsXkaKKDjdVjpD4oAt9Qi/MArHo= +go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.24.0 h1:f2jriWfOdldanBwS9jNBdeOKAQN7b4ugAMaNu1/1k9g= +go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.24.0/go.mod h1:B+bcQI1yTY+N0vqMpoZbEN7+XU4tNM0DmUiOwebFJWI= +go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp v1.24.0 h1:mM8nKi6/iFQ0iqst80wDHU2ge198Ye/TfN0WBS5U24Y= +go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp v1.24.0/go.mod h1:0PrIIzDteLSmNyxqcGYRL4mDIo8OTuBAOI/Bn1URxac= +go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.24.0 h1:t6wl9SPayj+c7lEIFgm4ooDBZVb01IhLB4InpomhRw8= +go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.24.0/go.mod h1:iSDOcsnSA5INXzZtwaBPrKp/lWu/V14Dd+llD0oI2EA= +go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.24.0 h1:Mw5xcxMwlqoJd97vwPxA8isEaIoxsta9/Q51+TTJLGE= +go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.24.0/go.mod h1:CQNu9bj7o7mC6U7+CA/schKEYakYXWr79ucDHTMGhCM= +go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.24.0 h1:Xw8U6u2f8DK2XAkGRFV7BBLENgnTGX9i4rQRxJf+/vs= +go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.24.0/go.mod h1:6KW1Fm6R/s6Z3PGXwSJN2K4eT6wQB3vXX6CVnYX9NmM= +go.opentelemetry.io/otel/exporters/stdout/stdoutmetric v1.24.0 h1:JYE2HM7pZbOt5Jhk8ndWZTUWYOVift2cHjXVMkPdmdc= +go.opentelemetry.io/otel/exporters/stdout/stdoutmetric v1.24.0/go.mod h1:yMb/8c6hVsnma0RpsBMNo0fEiQKeclawtgaIaOp2MLY= +go.opentelemetry.io/otel/metric v1.24.0 h1:6EhoGWWK28x1fbpA4tYTOWBkPefTDQnb8WSGXlc88kI= +go.opentelemetry.io/otel/metric v1.24.0/go.mod h1:VYhLe1rFfxuTXLgj4CBiyz+9WYBA8pNGJgDcSFRKBco= +go.opentelemetry.io/otel/sdk v1.24.0 h1:YMPPDNymmQN3ZgczicBY3B6sf9n62Dlj9pWD3ucgoDw= +go.opentelemetry.io/otel/sdk v1.24.0/go.mod h1:KVrIYw6tEubO9E96HQpcmpTKDVn9gdv35HoYiQWGDFg= +go.opentelemetry.io/otel/sdk/metric v1.24.0 h1:yyMQrPzF+k88/DbH7o4FMAs80puqd+9osbiBrJrz/w8= +go.opentelemetry.io/otel/sdk/metric v1.24.0/go.mod h1:I6Y5FjH6rvEnTTAYQz3Mmv2kl6Ek5IIrmwTLqMrrOE0= +go.opentelemetry.io/otel/trace v1.24.0 h1:CsKnnL4dUAr/0llH9FKuc698G04IrpWV0MQA/Y1YELI= +go.opentelemetry.io/otel/trace v1.24.0/go.mod h1:HPc3Xr/cOApsBI154IU0OI0HJexz+aw5uPdbs3UCjNU= +go.opentelemetry.io/proto/otlp v1.2.0 h1:pVeZGk7nXDC9O2hncA6nHldxEjm6LByfA2aN8IOkz94= +go.opentelemetry.io/proto/otlp v1.2.0/go.mod h1:gGpR8txAl5M03pDhMC79G6SdqNV26naRm/KDsgaHD8A= +go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= +golang.org/x/net v0.25.0 h1:d/OCCoBEUq33pjydKrGQhw7IlUPI2Oylr+8qLx49kac= +golang.org/x/net v0.25.0/go.mod h1:JkAGAh7GEvH74S6FOH42FLoXpXbE/aqXSrIQjXgsiwM= golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543 h1:E7g+9GITq07hpfrRu66IVDexMakfv52eLZ2CXBWiKr4= +golang.org/x/sync v0.6.0 h1:5BMeUDZ7vkXGfEr1x9B4bRcTH4lpkTkpdh0T/J+qjbQ= +golang.org/x/sys v0.20.0 h1:Od9JTbYCk261bKm4M/mw7AklTlFYIa0bIp9BgSm1S8Y= +golang.org/x/sys v0.20.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/text v0.15.0 h1:h1V/4gjBv8v9cjcR6+AR5+/cIYK5N/WAgiv4xlsEtAk= +golang.org/x/text v0.15.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +google.golang.org/genproto/googleapis/api v0.0.0-20240520151616-dc85e6b867a5 h1:P8OJ/WCl/Xo4E4zoe4/bifHpSmmKwARqyqE4nW6J2GQ= 
+google.golang.org/genproto/googleapis/api v0.0.0-20240520151616-dc85e6b867a5/go.mod h1:RGnPtTG7r4i8sPlNyDeikXF99hMM+hN6QMm4ooG9g2g= +google.golang.org/genproto/googleapis/rpc v0.0.0-20240515191416-fc5f0ca64291 h1:AgADTJarZTBqgjiUzRgfaBchgYB3/WFTC80GPwsMcRI= +google.golang.org/genproto/googleapis/rpc v0.0.0-20240515191416-fc5f0ca64291/go.mod h1:EfXuqaE1J41VCDicxHzUDm+8rk+7ZdXzHV0IhO/I6s0= +google.golang.org/grpc v1.64.0 h1:KH3VH9y/MgNQg1dE7b3XfVK0GsPSIzJwdF617gUSbvY= +google.golang.org/grpc v1.64.0/go.mod h1:oxjF8E3FBnjp+/gVFYdWacaLDx9na1aqy9oovLpxQYg= google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw= google.golang.org/protobuf v1.27.1/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc= -google.golang.org/protobuf v1.33.0 h1:uNO2rsAINq/JlFpSdYEKIZ0uKD/R9cpdv0T+yoGwGmI= -google.golang.org/protobuf v1.33.0/go.mod h1:c6P6GXX6sHbq/GpV6MGZEdwhWPcYBgnhAHhKbcUYpos= +google.golang.org/protobuf v1.34.1 h1:9ddQBjfCyZPOHPUiPxpYESBLc+T8P3E+Vo4IbKZgFWg= +google.golang.org/protobuf v1.34.1/go.mod h1:c6P6GXX6sHbq/GpV6MGZEdwhWPcYBgnhAHhKbcUYpos= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= From ec7d852cca912352dd1a43187af87aa53d9c0a8d Mon Sep 17 00:00:00 2001 From: Rajesh Samala Date: Mon, 8 Jul 2024 13:52:27 +0530 Subject: [PATCH 03/19] go module changes for otel (#74) Co-authored-by: Rajesh S --- go.mod | 8 ++++---- go.sum | 18 +++++++++--------- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/go.mod b/go.mod index fbac4a03..8183674e 100644 --- a/go.mod +++ b/go.mod @@ -23,13 +23,13 @@ require ( require ( github.com/cenkalti/backoff/v4 v4.3.0 // indirect github.com/go-logfmt/logfmt v0.5.0 // indirect - github.com/go-logr/logr v1.4.1 // indirect + github.com/go-logr/logr v1.4.2 // indirect github.com/go-logr/stdr v1.2.2 // indirect github.com/grpc-ecosystem/grpc-gateway/v2 v2.20.0 // indirect go.opentelemetry.io/otel/trace v1.24.0 // indirect - golang.org/x/net v0.25.0 // indirect - golang.org/x/sys v0.20.0 // indirect - golang.org/x/text v0.15.0 // indirect + golang.org/x/net v0.26.0 // indirect + golang.org/x/sys v0.21.0 // indirect + golang.org/x/text v0.16.0 // indirect google.golang.org/genproto/googleapis/api v0.0.0-20240520151616-dc85e6b867a5 // indirect google.golang.org/genproto/googleapis/rpc v0.0.0-20240515191416-fc5f0ca64291 // indirect google.golang.org/grpc v1.64.0 // indirect diff --git a/go.sum b/go.sum index 2c4e5240..f59b3698 100644 --- a/go.sum +++ b/go.sum @@ -4,8 +4,8 @@ github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c github.com/go-logfmt/logfmt v0.5.0 h1:TrB8swr/68K7m9CcGut2g3UOihhbcbiMAYiuTXdEih4= github.com/go-logfmt/logfmt v0.5.0/go.mod h1:wCYkCAKZfumFQihp8CzCvQ3paCTfi41vtzG1KdI/P7A= github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= -github.com/go-logr/logr v1.4.1 h1:pKouT5E8xu9zeFC39JXRDukb6JFQPXM5p5I91188VAQ= -github.com/go-logr/logr v1.4.1/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= +github.com/go-logr/logr v1.4.2 h1:6pFjapn8bFcIbiKo3XT4j/BhANplGihG6tvd+8rYgrY= +github.com/go-logr/logr v1.4.2/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE= github.com/go-sql-driver/mysql v1.7.1 h1:lUIinVbN1DY0xBg0eMOzmmtGoHwWBbvnWubQUrtU8EI= @@ -46,14 +46,14 @@ go.opentelemetry.io/otel/trace v1.24.0/go.mod h1:HPc3Xr/cOApsBI154IU0OI0HJexz+aw 
go.opentelemetry.io/proto/otlp v1.2.0 h1:pVeZGk7nXDC9O2hncA6nHldxEjm6LByfA2aN8IOkz94= go.opentelemetry.io/proto/otlp v1.2.0/go.mod h1:gGpR8txAl5M03pDhMC79G6SdqNV26naRm/KDsgaHD8A= go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= -golang.org/x/net v0.25.0 h1:d/OCCoBEUq33pjydKrGQhw7IlUPI2Oylr+8qLx49kac= -golang.org/x/net v0.25.0/go.mod h1:JkAGAh7GEvH74S6FOH42FLoXpXbE/aqXSrIQjXgsiwM= +golang.org/x/net v0.26.0 h1:soB7SVo0PWrY4vPW/+ay0jKDNScG2X9wFeYlXIvJsOQ= +golang.org/x/net v0.26.0/go.mod h1:5YKkiSynbBIh3p6iOc/vibscux0x38BZDkn8sCUPxHE= golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.6.0 h1:5BMeUDZ7vkXGfEr1x9B4bRcTH4lpkTkpdh0T/J+qjbQ= -golang.org/x/sys v0.20.0 h1:Od9JTbYCk261bKm4M/mw7AklTlFYIa0bIp9BgSm1S8Y= -golang.org/x/sys v0.20.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= -golang.org/x/text v0.15.0 h1:h1V/4gjBv8v9cjcR6+AR5+/cIYK5N/WAgiv4xlsEtAk= -golang.org/x/text v0.15.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= +golang.org/x/sync v0.7.0 h1:YsImfSBoP9QPYL0xyKJPq0gcaJdG3rInoqxTWbfQu9M= +golang.org/x/sys v0.21.0 h1:rF+pYz3DAGSQAxAu1CbC7catZg4ebC4UIeIhKxBZvws= +golang.org/x/sys v0.21.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/text v0.16.0 h1:a94ExnEXNtEwYLGJSIUxnWoxoRz/ZcCsV63ROupILh4= +golang.org/x/text v0.16.0/go.mod h1:GhwF1Be+LQoKShO3cGOHzqOgRrGaYc9AvblQOmPVHnI= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= google.golang.org/genproto/googleapis/api v0.0.0-20240520151616-dc85e6b867a5 h1:P8OJ/WCl/Xo4E4zoe4/bifHpSmmKwARqyqE4nW6J2GQ= google.golang.org/genproto/googleapis/api v0.0.0-20240520151616-dc85e6b867a5/go.mod h1:RGnPtTG7r4i8sPlNyDeikXF99hMM+hN6QMm4ooG9g2g= From 688ca394a45c69c544a711dbe443ccbd43d2e516 Mon Sep 17 00:00:00 2001 From: Rajesh Samala Date: Thu, 11 Jul 2024 23:25:33 +0530 Subject: [PATCH 04/19] making changes updating port for metric and traces (#84) Co-authored-by: Rajesh S --- lib/config.go | 7 ++--- utility/logger/otel/config/otelconfig.go | 14 +++++----- utility/logger/otel/logger.go | 27 +++++++++---------- utility/logger/otel/test/state_logger_test.go | 12 ++++++--- 4 files changed, 32 insertions(+), 28 deletions(-) diff --git a/lib/config.go b/lib/config.go index 6303c004..cdd2da57 100644 --- a/lib/config.go +++ b/lib/config.go @@ -478,9 +478,10 @@ func initializeOTELConfigs(cdb config.Config, poolName string) { otelconfig.OTelConfigData.SkipCalStateLog = cdb.GetOrDefaultBool("skip_cal_statelog", false) otelconfig.OTelConfigData.MetricNamePrefix = cdb.GetOrDefaultString("otel_metric_prefix", "pp.occ") otelconfig.OTelConfigData.Host = cdb.GetOrDefaultString("otel_agent_host", "localhost") - otelconfig.OTelConfigData.HttpPort = cdb.GetOrDefaultInt("otel_agent_http_port", 4318) - otelconfig.OTelConfigData.GRPCPort = cdb.GetOrDefaultInt("otel_agent_grpc_port", 4317) - otelconfig.OTelConfigData.UseOtelGRPC = cdb.GetOrDefaultBool("otel_agent_use_grpc", false) + otelconfig.OTelConfigData.MetricsPort = cdb.GetOrDefaultInt("otel_agent_metrics_port", 4318) + otelconfig.OTelConfigData.TracePort = cdb.GetOrDefaultInt("otel_agent_trace_port", 4318) + otelconfig.OTelConfigData.OtelMetricGRPC = cdb.GetOrDefaultBool("otel_agent_use_grpc_metric", false) + otelconfig.OTelConfigData.OtelTraceGRPC = cdb.GetOrDefaultBool("otel_agent_use_grpc_trace", false) otelconfig.OTelConfigData.MetricsURLPath = cdb.GetOrDefaultString("otel_agent_metrics_uri", "") 
 	otelconfig.OTelConfigData.TraceURLPath = cdb.GetOrDefaultString("otel_agent_trace_uri", "")
 	otelconfig.OTelConfigData.PoolName = poolName
diff --git a/utility/logger/otel/config/otelconfig.go b/utility/logger/otel/config/otelconfig.go
index 5f2b0a0c..20b937ab 100644
--- a/utility/logger/otel/config/otelconfig.go
+++ b/utility/logger/otel/config/otelconfig.go
@@ -14,8 +14,8 @@ var OTelIngestTokenData atomic.Value
 type OTelConfig struct {
 	MetricNamePrefix string
 	Host             string
-	HttpPort         int
-	GRPCPort         int
+	MetricsPort      int
+	TracePort        int
 	MetricsURLPath   string
 	TraceURLPath     string
 	PoolName         string
@@ -26,7 +26,8 @@ type OTelConfig struct {
 	ExporterTimeout int
 	UseTls          bool
 	TLSCertPath     string
-	UseOtelGRPC     bool
+	OtelMetricGRPC  bool
+	OtelTraceGRPC   bool
 	OTelErrorReportingInterval int
 	EnableRetry                bool
 }
@@ -42,13 +43,14 @@ func (config *OTelConfig) validate() error {
 
 func (config *OTelConfig) Dump() {
 	logger.GetLogger().Log(logger.Info, fmt.Sprintf("Host : %s", config.Host))
-	logger.GetLogger().Log(logger.Info, fmt.Sprintf("Http Port: %d", config.HttpPort))
-	logger.GetLogger().Log(logger.Info, fmt.Sprintf("GRPC Port: %d", config.GRPCPort))
+	logger.GetLogger().Log(logger.Info, fmt.Sprintf("UseOtelMetricGRPC: %t", config.OtelMetricGRPC))
+	logger.GetLogger().Log(logger.Info, fmt.Sprintf("Metrics Port: %d", config.MetricsPort))
+	logger.GetLogger().Log(logger.Info, fmt.Sprintf("UseOtelTraceGRPC: %t", config.OtelTraceGRPC))
+	logger.GetLogger().Log(logger.Info, fmt.Sprintf("Trace Port: %d", config.TracePort))
 	logger.GetLogger().Log(logger.Info, fmt.Sprintf("Poolname: %s", config.PoolName))
 	logger.GetLogger().Log(logger.Info, fmt.Sprintf("ResolutionTimeInSec: %d", config.ResolutionTimeInSec))
 	logger.GetLogger().Log(logger.Info, fmt.Sprintf("UseTls: %t", config.UseTls))
 	logger.GetLogger().Log(logger.Info, fmt.Sprintf("UrlPath: %s", config.MetricsURLPath))
-	logger.GetLogger().Log(logger.Info, fmt.Sprintf("UseOtelGRPC: %t", config.UseOtelGRPC))
 }
 
 func (config *OTelConfig) PopulateMetricNamePrefix(metricName string) string {
diff --git a/utility/logger/otel/logger.go b/utility/logger/otel/logger.go
index 4187b320..879693f1 100644
--- a/utility/logger/otel/logger.go
+++ b/utility/logger/otel/logger.go
@@ -127,7 +127,7 @@ func newMeterProvider(ctx context.Context) (*metric.MeterProvider, error) {
 
 // getMetricExporter Initialize metric exporter based protocol selected by user.
 func getMetricExporter(ctx context.Context) (metric.Exporter, error) {
-	if config.OTelConfigData.UseOtelGRPC {
+	if config.OTelConfigData.OtelMetricGRPC {
 		return newGRPCExporter(ctx)
 	}
 	return newHTTPExporter(ctx)
@@ -135,7 +135,7 @@ func getMetricExporter(ctx context.Context) (metric.Exporter, error) {
 
 // getTraceExporter Initialize span exporter based protocol(GRPC or HTTP) selected by user.
func getTraceExporter(ctx context.Context) (*otlptrace.Exporter, error) { - if config.OTelConfigData.UseOtelGRPC { + if config.OTelConfigData.OtelTraceGRPC { return newGRPCTraceExporter(ctx) } return newHTTPTraceExporter(ctx) @@ -153,7 +153,7 @@ func newHTTPExporter(ctx context.Context) (metric.Exporter, error) { var temporalitySelector = func(instrument metric.InstrumentKind) metricdata.Temporality { return metricdata.DeltaTemporality } return otlpmetrichttp.New(ctx, - otlpmetrichttp.WithEndpoint(fmt.Sprintf("%s:%d", config.OTelConfigData.Host, config.OTelConfigData.HttpPort)), + otlpmetrichttp.WithEndpoint(fmt.Sprintf("%s:%d", config.OTelConfigData.Host, config.OTelConfigData.MetricsPort)), otlpmetrichttp.WithTimeout(time.Duration(config.OTelConfigData.ExporterTimeout)*time.Second), otlpmetrichttp.WithCompression(otlpmetrichttp.NoCompression), otlpmetrichttp.WithTemporalitySelector(temporalitySelector), @@ -192,7 +192,7 @@ func newGRPCExporter(ctx context.Context) (metric.Exporter, error) { var temporalitySelector = func(instrument metric.InstrumentKind) metricdata.Temporality { return metricdata.DeltaTemporality } return otlpmetricgrpc.New(ctx, - otlpmetricgrpc.WithEndpoint(fmt.Sprintf("%s:%d", config.OTelConfigData.Host, config.OTelConfigData.GRPCPort)), + otlpmetricgrpc.WithEndpoint(fmt.Sprintf("%s:%d", config.OTelConfigData.Host, config.OTelConfigData.MetricsPort)), otlpmetricgrpc.WithTimeout(time.Duration(config.OTelConfigData.ExporterTimeout)*time.Second), otlpmetricgrpc.WithHeaders(headers), otlpmetricgrpc.WithReconnectionPeriod(time.Duration(5)*time.Second), @@ -224,7 +224,7 @@ func newHTTPTraceExporter(ctx context.Context) (*otlptrace.Exporter, error) { headers[IngestTokenHeader] = config.GetOTelIngestToken() return otlptracehttp.New(ctx, - otlptracehttp.WithEndpoint(fmt.Sprintf("%s:%d", config.OTelConfigData.Host, config.OTelConfigData.HttpPort)), + otlptracehttp.WithEndpoint(fmt.Sprintf("%s:%d", config.OTelConfigData.Host, config.OTelConfigData.TracePort)), otlptracehttp.WithTimeout(time.Duration(config.OTelConfigData.ExporterTimeout)*time.Second), otlptracehttp.WithHeaders(headers), otlptracehttp.WithRetry(otlptracehttp.RetryConfig{ @@ -256,7 +256,7 @@ func newGRPCTraceExporter(ctx context.Context) (*otlptrace.Exporter, error) { headers[IngestTokenHeader] = config.GetOTelIngestToken() return otlptracegrpc.New(ctx, - otlptracegrpc.WithEndpoint(fmt.Sprintf("%s:%d", config.OTelConfigData.Host, config.OTelConfigData.GRPCPort)), + otlptracegrpc.WithEndpoint(fmt.Sprintf("%s:%d", config.OTelConfigData.Host, config.OTelConfigData.TracePort)), otlptracegrpc.WithTimeout(time.Duration(config.OTelConfigData.ExporterTimeout)*time.Second), otlptracegrpc.WithHeaders(headers), otlptracegrpc.WithRetry(otlptracegrpc.RetryConfig{ @@ -290,17 +290,14 @@ func getResourceInfo(appName string) *resource.Resource { attribute.String("source", "otel"), } - environment, isPresent := os.LookupEnv("ENVIRONMENT") - if !isPresent { - environment = "dev" + environment, isEnvPresent := os.LookupEnv("ENVIRONMENT") + az, isAzPresent := os.LookupEnv("AVAILABILITY_ZONE") + if isEnvPresent { + attributes = append(attributes, attribute.String("environment", environment)) } - az, isPresent := os.LookupEnv("AVAILABILITY_ZONE") - if !isPresent { - az = "dev" + if isAzPresent { + attributes = append(attributes, attribute.String("az", az)) } - attributes = append(attributes, attribute.String("az", az)) - attributes = append(attributes, attribute.String("environment", environment)) - resource := 
resource.NewWithAttributes(fmt.Sprintf("%s resource", config.OTelConfigData.ResourceType), attributes..., ) diff --git a/utility/logger/otel/test/state_logger_test.go b/utility/logger/otel/test/state_logger_test.go index e19d1df2..366b7927 100644 --- a/utility/logger/otel/test/state_logger_test.go +++ b/utility/logger/otel/test/state_logger_test.go @@ -22,9 +22,11 @@ import ( func initializeConsoleExporter() (*metric.MeterProvider, error) { otelconfig.OTelConfigData = &otelconfig.OTelConfig{ Host: "localhost", - HttpPort: 4318, + MetricsPort: 4318, + TracePort: 4318, Enabled: true, - UseOtelGRPC: false, + OtelMetricGRPC: false, + OtelTraceGRPC: false, ResolutionTimeInSec: 3, OTelErrorReportingInterval: 10, PoolName: "occ-testapp", @@ -58,9 +60,11 @@ func initializeConsoleExporter() (*metric.MeterProvider, error) { func initializeCustomOTelExporter(t *testing.T) func(ctx context.Context) error { otelconfig.OTelConfigData = &otelconfig.OTelConfig{ Host: "localhost", - HttpPort: 4318, + MetricsPort: 4318, + TracePort: 4318, Enabled: true, - UseOtelGRPC: false, + OtelMetricGRPC: false, + OtelTraceGRPC: false, ResolutionTimeInSec: 3, OTelErrorReportingInterval: 10, PoolName: "occ-testapp", From 13fc52210b9e71461edcdf1728a5824cedb43cad Mon Sep 17 00:00:00 2001 From: Rajesh Samala Date: Fri, 12 Jul 2024 15:28:25 +0530 Subject: [PATCH 05/19] Otel integration changes latest 2 (#85) * Occ config logging (#394) * occ configurations logging * cal event success * adding cal data * added TODOs * Remove log_occ_confogs.go * Remove testing files * source of configs - files * whitelist format change * code clean up * code review changes-1 * CR fixes * CR fixes * Delete tests/unittest/config_logging/main_test.go * clean up * Merge branch 'occ-config-logging' of /Users/simmidisetty/Documents/GitHub/OpenSourceHera/src/github.com/paypal/hera with conflicts. 
* test for config logging

* removing test changes

* tests for all cases

* test

* making minor changes for logging feature specific data

* changes for incorporate review comments

---------

Co-authored-by: simmidisetty
Co-authored-by: Rajesh S

* making changes updating port for metric and traces

---------

Co-authored-by: satyakamala03 <128077872+satyakamala03@users.noreply.github.com>
Co-authored-by: simmidisetty
Co-authored-by: Rajesh S
---
 lib/config.go                              | 171 ++++++++++++++++++++-
 lib/main.go                                |  13 ++
 tests/unittest/config_logging/main_test.go | 141 +++++++++++++++++
 3 files changed, 322 insertions(+), 3 deletions(-)
 create mode 100644 tests/unittest/config_logging/main_test.go

diff --git a/lib/config.go b/lib/config.go
index cdd2da57..91603acc 100644
--- a/lib/config.go
+++ b/lib/config.go
@@ -20,14 +20,14 @@ package lib
 import (
 	"errors"
 	"fmt"
+	"github.com/paypal/hera/cal"
+	"github.com/paypal/hera/config"
+	"github.com/paypal/hera/utility/logger"
 	otelconfig "github.com/paypal/hera/utility/logger/otel/config"
 	"os"
 	"path/filepath"
 	"strings"
 	"sync/atomic"
-
-	"github.com/paypal/hera/config"
-	"github.com/paypal/hera/utility/logger"
 )
 
 // The Config contains all the static configuration
@@ -74,6 +74,9 @@ type Config struct {
 	// config_reload_time_ms(30 * 1000)
 	//
 	ConfigReloadTimeMs int
+	// config_logging_reload_time_hours(24)
+	//
+	ConfigLoggingReloadTimeHours int
 	// custom_auth_timeout(1000)
 	CustomAuthTimeoutMs int
 	// time_skew_threshold_warn(2)
@@ -269,6 +272,7 @@ func InitConfig(poolName string) error {
 	}
 
 	gAppConfig.ConfigReloadTimeMs = cdb.GetOrDefaultInt("config_reload_time_ms", 30*1000)
+	gAppConfig.ConfigLoggingReloadTimeHours = cdb.GetOrDefaultInt("config_logging_reload_time_hours", 24)
 	gAppConfig.CustomAuthTimeoutMs = cdb.GetOrDefaultInt("custom_auth_timeout", 1000)
 	gAppConfig.TimeSkewThresholdWarnSec = cdb.GetOrDefaultInt("time_skew_threshold_warn", 2)
 	gAppConfig.TimeSkewThresholdErrorSec = cdb.GetOrDefaultInt("time_skew_threshold_error", 15)
@@ -495,6 +499,167 @@ func initializeOTELConfigs(cdb config.Config, poolName string) {
 	otelconfig.SetOTelIngestToken(cdb.GetOrDefaultString("otel_ingest_token", ""))
 }
 
+func LogOccConfigs() {
+	whiteListConfigs := map[string]map[string]interface{}{
+		"BACKLOG": {
+			"backlog_pct":             gAppConfig.BacklogPct,
+			"request_backlog_timeout": gAppConfig.BacklogTimeoutMsec,
+			"short_backlog_timeout":   gAppConfig.ShortBacklogTimeoutMsec,
+		},
+		"BOUNCER": {
+			"bouncer_enabled":          gAppConfig.BouncerEnabled,
+			"bouncer_startup_delay":    gAppConfig.BouncerStartupDelay,
+			"bouncer_poll_interval_ms": gAppConfig.BouncerPollInterval,
+		},
+		"OTEL": {
+			"enable_otel":                          otelconfig.OTelConfigData.Enabled,
+			"skip_cal_statelog":                    otelconfig.OTelConfigData.SkipCalStateLog,
+			"otel_agent_host":                      otelconfig.OTelConfigData.Host,
+			"otel_agent_metrics_port":              otelconfig.OTelConfigData.MetricsPort,
+			"otel_agent_trace_port":                otelconfig.OTelConfigData.TracePort,
+			"otel_agent_metrics_uri":               otelconfig.OTelConfigData.MetricsURLPath,
+			"otel_agent_trace_uri":                 otelconfig.OTelConfigData.TraceURLPath,
+			"otel_resolution_time_in_sec":          otelconfig.OTelConfigData.ResolutionTimeInSec,
+			"otel_error_reporting_interval_in_sec": otelconfig.OTelConfigData.OTelErrorReportingInterval,
+		},
+		"PROFILE": {
+			"enable_profile":      gAppConfig.EnableProfile,
+			"profile_http_port":   gAppConfig.ProfileHTTPPort,
+			"profile_telnet_port": gAppConfig.ProfileTelnetPort,
+		},
+		"SHARDING": {
+			"enable_sharding": gAppConfig.EnableSharding,
+			"use_shardmap":    gAppConfig.UseShardMap,
+			"num_shards":      gAppConfig.NumOfShards,
+			"shard_key_name": 
gAppConfig.ShardKeyName, + "max_scuttle": gAppConfig.MaxScuttleBuckets, + "scuttle_col_name": gAppConfig.ScuttleColName, + "shard_key_value_type_is_string": gAppConfig.ShardKeyValueTypeIsString, + "enable_whitelist_test": gAppConfig.EnableWhitelistTest, + "whitelist_children": gAppConfig.NumWhitelistChildren, + "sharding_postfix": gAppConfig.ShardingPostfix, + "sharding_cfg_reload_interval": gAppConfig.ShardingCfgReloadInterval, + "hostname_prefix": gAppConfig.HostnamePrefix, + "sharding_cross_keys_err": gAppConfig.ShardingCrossKeysErr, + //"enable_sql_rewrite", // not found anywhere? + "sharding_algo": gAppConfig.ShardingAlgoHash, + "cfg_from_tns_override_num_shards": gAppConfig.CfgFromTnsOverrideNumShards, + }, + "TAF": { + "enable_taf": gAppConfig.EnableTAF, + "cfg_from_tns_override_taf": gAppConfig.CfgFromTnsOverrideTaf, + "testing_enable_dml_taf": gAppConfig.TestingEnableDMLTaf, + "taf_timeout_ms": gAppConfig.TAFTimeoutMs, + "taf_bin_duration": gAppConfig.TAFBinDuration, + "taf_allow_slow_every_x": gAppConfig.TAFAllowSlowEveryX, + "taf_normally_slow_count": gAppConfig.TAFNormallySlowCount, + }, + "BIND-EVICTION": { + "child.executable": gAppConfig.ChildExecutable, + //"enable_bind_hash_logging" FOUND FOR SOME OCCs ONLY IN occ.def + "bind_eviction_threshold_pct": gAppConfig.BindEvictionThresholdPct, + "bind_eviction_decr_per_sec": gAppConfig.BindEvictionDecrPerSec, + "bind_eviction_target_conn_pct": gAppConfig.BindEvictionTargetConnPct, + "bind_eviction_max_throttle": gAppConfig.BindEvictionMaxThrottle, + "bind_eviction_names": gAppConfig.BindEvictionNames, + "skip_eviction_host_prefix": gAppConfig.SkipEvictRegex, + "eviction_host_prefix": gAppConfig.EvictRegex, + "query_bind_blocker_min_sql_prefix": gAppConfig.QueryBindBlockerMinSqlPrefix, + "enable_connlimit_check": gAppConfig.EnableConnLimitCheck, + }, + "MANUAL-RATE-LIMITER": { + "enable_query_bind_blocker": gAppConfig.EnableQueryBindBlocker, + }, + "SATURATION-RECOVERY": { + "saturation_recover_threshold": GetSatRecoverThresholdMs(), + "saturation_recover_throttle_rate": GetSatRecoverThrottleRate(), + }, + "SOFT-EVICTION": { + "soft_eviction_effective_time": gAppConfig.SoftEvictionEffectiveTimeMs, + "soft_eviction_probability": gAppConfig.SoftEvictionProbability, + }, + "WORKER-CONFIGURATIONS": { + "lifespan_check_interval": gAppConfig.lifeSpanCheckInterval, + "lifo_scheduler_enabled": gAppConfig.LifoScheduler, + //"num_workers_per_proxy", // only present in occ.def for some occs + //"max_clients_per_worker", // only present in occ.def for some occs + "max_stranded_time_interval": gAppConfig.StrandedWorkerTimeoutMs, + "high_load_max_stranded_time_interval": gAppConfig.HighLoadStrandedWorkerTimeoutMs, + "high_load_skip_initiate_recover_pct": gAppConfig.HighLoadSkipInitiateRecoverPct, + "enable_danglingworker_recovery": gAppConfig.EnableDanglingWorkerRecovery, + "max_db_connects_per_sec": gAppConfig.MaxDbConnectsPerSec, + "max_lifespan_per_child": GetMaxLifespanPerChild(), + "max_requests_per_child": GetMaxRequestsPerChild(), + "max_desire_healthy_worker_pct": gAppConfig.MaxDesiredHealthyWorkerPct, + }, + "R-W-SPLIT": { + "readonly_children_pct": gAppConfig.ReadonlyPct, + "cfg_from_tns_override_rw_split": gAppConfig.CfgFromTnsOverrideRWSplit, + }, + "RAC": { + "management_table_prefix": gAppConfig.ManagementTablePrefix, + "rac_sql_interval": gAppConfig.RacMaintReloadInterval, + "rac_restart_window": gAppConfig.RacRestartWindow, + }, + "NO-CATEGORY": { + "database_type": gAppConfig.DatabaseType, // Oracle = 0; MySQL=1; POSTGRES=2 + 
"cfg_from_tns": gAppConfig.CfgFromTns, + "log_level": gOpsConfig.logLevel, + "high_load_pct": gAppConfig.HighLoadPct, + "init_limit_pct": gAppConfig.InitLimitPct, + "num_standby_dbs": gAppConfig.NumStdbyDbs, + }, + } + + for feature, configs := range whiteListConfigs { + switch feature { + case "BACKLOG": + if gAppConfig.BacklogPct == 0 { + continue + } + case "BOUNCER": + if !gAppConfig.BouncerEnabled { + continue + } + case "OTEL": + if !otelconfig.OTelConfigData.Enabled { + continue + } + case "PROFILE": + if !gAppConfig.EnableProfile { + continue + } + case "SHARDING": + if !gAppConfig.EnableSharding { + continue + } + case "TAF": + if !gAppConfig.EnableTAF { + continue + } + case "R-W-SPLIT": + if gAppConfig.ReadonlyPct == 0 { + continue + } + case "SOFT-EVICTION", "BIND-EVICTION": + if GetSatRecoverThrottleRate() <= 0 { + continue + } + case "MANUAL-RATE-LIMITER": + if !gAppConfig.EnableQueryBindBlocker { + continue + } + } + + evt := cal.NewCalEvent("OCC_CONFIG", fmt.Sprintf(feature), cal.TransOK, "") + for cfg, val := range configs { + s := fmt.Sprintf("%v", val) + evt.AddDataStr(cfg, s) + } + evt.Completed() + } +} + // CheckOpsConfigChange checks if the ops config file needs to be reloaded and reloads it if necessary. // it is called every several seconds from a dedicated go-routine. func CheckOpsConfigChange() { diff --git a/lib/main.go b/lib/main.go index 7040103b..8a2bcda3 100644 --- a/lib/main.go +++ b/lib/main.go @@ -136,6 +136,19 @@ func Run() { } }() + //This logs the configured parameter with the feature name in the CAL log periodically based on ConfigLoggingReloadTimeHours. + LogOccConfigs() + configLoggingTicker := time.NewTicker(time.Duration(GetConfig().ConfigLoggingReloadTimeHours) * time.Hour) + defer configLoggingTicker.Stop() + go func() { + for { + select { + case <-configLoggingTicker.C: + LogOccConfigs() + } + } + }() + CheckEnableProfiling() GoStats() diff --git a/tests/unittest/config_logging/main_test.go b/tests/unittest/config_logging/main_test.go new file mode 100644 index 00000000..f9125b47 --- /dev/null +++ b/tests/unittest/config_logging/main_test.go @@ -0,0 +1,141 @@ +package main + +import ( + "context" + "database/sql" + + "fmt" + "os" + "strings" + "testing" + "time" + + //"github.com/paypal/hera/client/gosqldriver" + _ "github.com/paypal/hera/client/gosqldriver/tcp" + "github.com/paypal/hera/tests/unittest/testutil" + "github.com/paypal/hera/utility/logger" +) + +var mx testutil.Mux +var tableName string + +func cfg() (map[string]string, map[string]string, testutil.WorkerType) { + + appcfg := make(map[string]string) + // best to chose an "unique" port in case golang runs tests in paralel + appcfg["bind_port"] = "31003" + appcfg["log_level"] = "5" + appcfg["log_file"] = "hera.log" + appcfg["enable_sharding"] = "true" + appcfg["num_shards"] = "3" + appcfg["bouncer_enabled"] = "true" + appcfg["sharding_algo"] = "mod" + appcfg["shard_key_name"] = "id" + appcfg["config_logging_reload_time_hours"] = "0.0002" + pfx := os.Getenv("MGMT_TABLE_PREFIX") + if pfx != "" { + appcfg["management_table_prefix"] = pfx + } + appcfg["sharding_cfg_reload_interval"] = "3600" + appcfg["rac_sql_interval"] = "0" + //appcfg["readonly_children_pct"] = "40" + + appcfg["soft_eviction_effective_time"] = "10000" + appcfg["bind_eviction_threshold_pct"] = "60" + + opscfg := make(map[string]string) + opscfg["opscfg.default.server.max_connections"] = "3" + opscfg["opscfg.default.server.log_level"] = "5" + opscfg["opscfg.default.server.saturation_recover_throttle_rate"] = "30" + + 
diff --git a/tests/unittest/config_logging/main_test.go b/tests/unittest/config_logging/main_test.go new file mode 100644 index 00000000..f9125b47 --- /dev/null +++ b/tests/unittest/config_logging/main_test.go @@ -0,0 +1,141 @@ +package main + +import ( + "context" + "database/sql" + + "fmt" + "os" + "strings" + "testing" + "time" + + //"github.com/paypal/hera/client/gosqldriver" + _ "github.com/paypal/hera/client/gosqldriver/tcp" + "github.com/paypal/hera/tests/unittest/testutil" + "github.com/paypal/hera/utility/logger" +) + +var mx testutil.Mux +var tableName string + +func cfg() (map[string]string, map[string]string, testutil.WorkerType) { + + appcfg := make(map[string]string) + // best to choose a unique port in case golang runs tests in parallel + appcfg["bind_port"] = "31003" + appcfg["log_level"] = "5" + appcfg["log_file"] = "hera.log" + appcfg["enable_sharding"] = "true" + appcfg["num_shards"] = "3" + appcfg["bouncer_enabled"] = "true" + appcfg["sharding_algo"] = "mod" + appcfg["shard_key_name"] = "id" + appcfg["config_logging_reload_time_hours"] = "0.0002" + pfx := os.Getenv("MGMT_TABLE_PREFIX") + if pfx != "" { + appcfg["management_table_prefix"] = pfx + } + appcfg["sharding_cfg_reload_interval"] = "3600" + appcfg["rac_sql_interval"] = "0" + //appcfg["readonly_children_pct"] = "40" + + appcfg["soft_eviction_effective_time"] = "10000" + appcfg["bind_eviction_threshold_pct"] = "60" + + opscfg := make(map[string]string) + opscfg["opscfg.default.server.max_connections"] = "3" + opscfg["opscfg.default.server.log_level"] = "5" + opscfg["opscfg.default.server.saturation_recover_throttle_rate"] = "30" + + return appcfg, opscfg, testutil.MySQLWorker } func setupShardMap() { twoTask := os.Getenv("TWO_TASK") if !strings.HasPrefix(twoTask, "tcp") { // not mysql return } shard := 0 db, err := sql.Open("heraloop", fmt.Sprintf("%d:0:0", shard)) if err != nil { testutil.Fatal("Error starting Mux:", err) return } db.SetMaxIdleConns(0) defer db.Close() ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) defer cancel() conn, err := db.Conn(ctx) if err != nil { testutil.Fatalf("Error getting connection %s\n", err.Error()) } defer conn.Close() testutil.RunDML("create table hera_shard_map ( scuttle_id smallint not null, shard_id tinyint not null, status char(1) , read_status char(1), write_status char(1), remarks varchar(500))") for i := 0; i < 1024; i++ { shard := 0 if i <= 8 { shard = i % 3 } testutil.RunDML(fmt.Sprintf("insert into hera_shard_map ( scuttle_id, shard_id, status, read_status, write_status ) values ( %d, %d, 'Y', 'Y', 'Y' )", i, shard)) } } func before() error { tableName = os.Getenv("TABLE_NAME") if tableName == "" { tableName = "jdbc_hera_test" } if strings.HasPrefix(os.Getenv("TWO_TASK"), "tcp") { // mysql testutil.RunDML("create table jdbc_hera_test ( ID BIGINT, INT_VAL BIGINT, STR_VAL VARCHAR(500))") } return nil } func TestMain(m *testing.M) { os.Exit(testutil.UtilMain(m, cfg, before)) } func TestConfigLogging(t *testing.T) { logger.GetLogger().Log(logger.Debug, "TestConfigLogging setup") setupShardMap() logger.GetLogger().Log(logger.Debug, "TestConfigLogging begin +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n") hostname, _ := os.Hostname() db, err := sql.Open("hera", hostname+":31003") if err != nil { t.Fatal("Error starting Mux:", err) return } db.SetMaxIdleConns(0) defer db.Close() time.Sleep(10 * time.Second) if testutil.RegexCountFile("OCC_CONFIG\tSHARDING", "cal.log") < 1 { t.Fatalf("SHARDING configuration details are missing.") } if testutil.RegexCountFile("OCC_CONFIG\tBACKLOG", "cal.log") < 1 { t.Fatalf("BACKLOG config details are missing.") } if testutil.RegexCountFile("OCC_CONFIG\tTAF", "cal.log") > 0 { t.Fatalf("TAF is not enabled so we should not see TAF config logging.") } if testutil.RegexCountFile("OCC_CONFIG\tR-W-SPLIT", "cal.log") > 0 { t.Fatalf("R-W-SPLIT is not configured, it should not log R-W-SPLIT config details.") } if testutil.RegexCountFile("OCC_CONFIG\tSOFT-EVICTION", "cal.log") < 1 { t.Fatalf("Saturation recovery enabled, so it should log SOFT-EVICTION configurations") } if testutil.RegexCountFile("OCC_CONFIG\tBIND-EVICTION", "cal.log") < 1 { t.Fatalf("Saturation recovery enabled, so it should log BIND-EVICTION configurations") } logger.GetLogger().Log(logger.Debug, "TestConfigLogging done -------------------------------------------------------------") }
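The assertions in this test hinge on testutil.RegexCountFile, whose implementation is outside this patch; a plausible sketch of such a helper, assumed rather than copied from testutil, could look like:

package main

import (
	"fmt"
	"os"
	"regexp"
)

// regexCountFile is a hypothetical stand-in for testutil.RegexCountFile:
// it returns the number of pattern matches found in the named file.
func regexCountFile(pattern, path string) int {
	data, err := os.ReadFile(path)
	if err != nil {
		return 0
	}
	return len(regexp.MustCompile(pattern).FindAll(data, -1))
}

func main() {
	_ = os.WriteFile("cal.log", []byte("OCC_CONFIG\tSHARDING\nOCC_CONFIG\tBACKLOG\n"), 0644)
	fmt.Println(regexCountFile("OCC_CONFIG\tSHARDING", "cal.log")) // prints 1
}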
++++++++++++------ utility/logger/otel/state_logger.go | 9 +- 10 files changed, 355 insertions(+), 114 deletions(-) create mode 100644 tests/unittest/otel_remote_endpoint_tls/main_test.go diff --git a/lib/config.go b/lib/config.go index 91603acc..98e49c1f 100644 --- a/lib/config.go +++ b/lib/config.go @@ -513,6 +513,7 @@ func LogOccConfigs() { }, "OTEL": { "enable_otel": otelconfig.OTelConfigData.Enabled, + "otel_use_tls": otelconfig.OTelConfigData.UseTls, "skip_cal_statelog": otelconfig.OTelConfigData.SkipCalStateLog, "otel_agent_host": otelconfig.OTelConfigData.Host, "otel_agent_metrics_port": otelconfig.OTelConfigData.MetricsPort, diff --git a/lib/statelog.go b/lib/statelog.go index 1c9ee2d1..717f2679 100644 --- a/lib/statelog.go +++ b/lib/statelog.go @@ -766,6 +766,7 @@ func (sl *StateLog) genReport() { } // Initialize statedata object workerStatesData := otel_logger.WorkersStateData{ + StateTitle: sl.mTypeTitles[s][HeraWorkerType(t)][n], ShardId: int(s), WorkerType: int(t), InstanceId: int(n), diff --git a/tests/unittest/otel_basic/main_test.go b/tests/unittest/otel_basic/main_test.go index 60a3b6b6..2a865bd1 100644 --- a/tests/unittest/otel_basic/main_test.go +++ b/tests/unittest/otel_basic/main_test.go @@ -28,7 +28,7 @@ func cfg() (map[string]string, map[string]string, testutil.WorkerType) { appcfg["rac_sql_interval"] = "0" appcfg["child.executable"] = "mysqlworker" appcfg["enable_otel"] = "true" - appcfg["otel_resolution_time_in_sec"] = "3" + appcfg["otel_resolution_time_in_sec"] = "10" opscfg := make(map[string]string) opscfg["opscfg.default.server.max_connections"] = "3" opscfg["opscfg.default.server.log_level"] = "5" @@ -101,6 +101,7 @@ func TestOTELMetricsBasic(t *testing.T) { cancel() conn.Close() + time.Sleep(15 * time.Second) //Read OTEL log file for metrics validation logFilePath := filepath.Join(testutil.GetOTELLogDirPath(), "otel_collector.log") count := testutil.RegexCountFile("{\"key\":\"application\",\"value\":{\"stringValue\":\"hera-test\"}", logFilePath) @@ -124,5 +125,6 @@ func TestOTELMetricsBasic(t *testing.T) { if envCount < 1 { t.Fatalf("az configured as test-dev and its value should present in otel metric dimension") } + logger.GetLogger().Log(logger.Debug, "TestOTELMetricsBasic done -------------------------------------------------------------") } diff --git a/tests/unittest/otel_incorrect_endpoint/main_test.go b/tests/unittest/otel_incorrect_endpoint/main_test.go index bd96879f..cadccfa1 100644 --- a/tests/unittest/otel_incorrect_endpoint/main_test.go +++ b/tests/unittest/otel_incorrect_endpoint/main_test.go @@ -27,7 +27,7 @@ func cfg() (map[string]string, map[string]string, testutil.WorkerType) { appcfg["rac_sql_interval"] = "0" appcfg["child.executable"] = "mysqlworker" appcfg["enable_otel"] = "true" - appcfg["otel_resolution_time_in_sec"] = "3" + appcfg["otel_resolution_time_in_sec"] = "10" appcfg["otel_agent_metrics_uri"] = "v2/metrics" opscfg := make(map[string]string) opscfg["opscfg.default.server.max_connections"] = "3" @@ -99,12 +99,14 @@ func TestOTELMetricsIncorrectEndPoint(t *testing.T) { rows.Close() stmt.Close() + time.Sleep(10 * time.Second) publishingErrors := testutil.RegexCountFile("otel publishing error", "hera.log") if publishingErrors < 2 { t.Fatalf("otel publishing error should present in log because of in-correct OTEL port number") } - calPublishingErrors := testutil.RegexCountFile("otel publishing error", "cal.log") + time.Sleep(5 * time.Second) + calPublishingErrors := testutil.RegexCountFile("failed to send metrics", "cal.log") if 
calPublishingErrors < 1 { t.Fatalf("otel publishing error should be present in CAL log because of incorrect OTEL port number") } diff --git a/tests/unittest/otel_remote_endpoint_tls/main_test.go b/tests/unittest/otel_remote_endpoint_tls/main_test.go new file mode 100644 index 00000000..5a8fc189 --- /dev/null +++ b/tests/unittest/otel_remote_endpoint_tls/main_test.go @@ -0,0 +1,128 @@ +package main + +import ( + "context" + "database/sql" + "fmt" + "os" + "strings" + "testing" + "time" + + "github.com/paypal/hera/tests/unittest/testutil" + "github.com/paypal/hera/utility/logger" +) + +var mx testutil.Mux +var tableName string + +func cfg() (map[string]string, map[string]string, testutil.WorkerType) { + + appcfg := make(map[string]string) + // best to choose a unique port in case golang runs tests in parallel + appcfg["bind_port"] = "31002" + appcfg["log_level"] = "5" + appcfg["log_file"] = "hera.log" + appcfg["sharding_cfg_reload_interval"] = "5" + appcfg["enable_sharding"] = "true" + appcfg["num_shards"] = "3" + appcfg["max_scuttle"] = "9" + appcfg["rac_sql_interval"] = "0" + appcfg["child.executable"] = "mysqlworker" + appcfg["enable_otel"] = "true" + appcfg["otel_use_tls"] = "true" + appcfg["otel_agent_host"] = "otelmetrics-pp-observability.us-central1.gcp.dev.paypalinc.com" + appcfg["otel_agent_metrics_port"] = "30706" + appcfg["otel_agent_trace_port"] = "30706" + appcfg["otel_resolution_time_in_sec"] = "10" + appcfg["otel_agent_metrics_uri"] = "v1/metrics" + opscfg := make(map[string]string) + opscfg["opscfg.default.server.max_connections"] = "5" + opscfg["opscfg.default.server.log_level"] = "5" + os.Setenv("AVAILABILITY_ZONE", "test-dev") + os.Setenv("ENVIRONMENT", "dev") + return appcfg, opscfg, testutil.MySQLWorker +} + +func before() error { + tableName = os.Getenv("TABLE_NAME") + if tableName == "" { + tableName = "jdbc_hera_test" + } + if strings.HasPrefix(os.Getenv("TWO_TASK"), "tcp") { + // mysql + testutil.RunDML("create table jdbc_hera_test ( ID BIGINT, INT_VAL BIGINT, STR_VAL VARCHAR(500))") + } + return nil +} + +func TestMain(m *testing.M) { + os.Exit(testutil.UtilMain(m, cfg, before)) +} + +func TestOTELMetricsRemoteEndPointWithTLS(t *testing.T) { + logger.GetLogger().Log(logger.Debug, "TestOTELMetricsRemoteEndPointWithTLS begin +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n") + + shard := 0 + db, err := sql.Open("heraloop", fmt.Sprintf("%d:0:0", shard)) + if err != nil { + t.Fatal("Error starting Mux:", err) + return + } + db.SetMaxIdleConns(0) + defer db.Close() + + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + // cleanup and insert one row in the table + conn, err := db.Conn(ctx) + if err != nil { + t.Fatalf("Error getting connection %s\n", err.Error()) + } + tx, _ := conn.BeginTx(ctx, nil) + sqlTxt := "/*cmd*/delete from " + tableName + stmt, _ := tx.PrepareContext(ctx, sqlTxt) + _, err = stmt.Exec() + if err != nil { + t.Fatalf("Error preparing test (delete table) %s with %s ==== sql\n", err.Error(), sqlTxt) + } + + stmt, _ = tx.PrepareContext(ctx, "/*cmd*/insert into "+tableName+" (id, int_val, str_val) VALUES(?, ?, ?)") + _, err = stmt.Exec(1, time.Now().Unix(), "val 1") + if err != nil { + t.Fatalf("Error preparing test (create row in table) %s\n", err.Error()) + } + err = tx.Commit() + if err != nil { + t.Fatalf("Error commit %s\n", err.Error()) + } + + stmt, _ = conn.PrepareContext(ctx, "/*cmd*/Select id, int_val from "+tableName+" where id=?") + rows, _ := stmt.Query(1) + if !rows.Next() { + t.Fatalf("Expected 1
row") + } + + time.Sleep(10 * time.Second) + rows.Close() + stmt.Close() + + time.Sleep(10 * time.Second) + publishingErrors := testutil.RegexCountFile("otel publishing error", "hera.log") + if publishingErrors > 1 { + t.Fatalf("should not fail while publishing metrics remote host") + } + + time.Sleep(5 * time.Second) + calPublishingErrors := testutil.RegexCountFile("failed to send metrics", "cal.log") + if calPublishingErrors > 1 { + t.Fatalf("should not fail while publishing metrics remote host") + } + + cancel() + conn.Close() + + for counter := 0; counter < 1000; counter++ { + time.Sleep(10 * time.Second) + } + logger.GetLogger().Log(logger.Debug, "TestOTELMetricsRemoteEndPointWithTLS done -------------------------------------------------------------") +} diff --git a/tests/unittest/otel_with_skip_cal/main_test.go b/tests/unittest/otel_with_skip_cal/main_test.go index 79b271c6..95b92be7 100644 --- a/tests/unittest/otel_with_skip_cal/main_test.go +++ b/tests/unittest/otel_with_skip_cal/main_test.go @@ -27,7 +27,7 @@ func cfg() (map[string]string, map[string]string, testutil.WorkerType) { appcfg["rac_sql_interval"] = "0" appcfg["child.executable"] = "mysqlworker" appcfg["enable_otel"] = "true" - appcfg["otel_resolution_time_in_sec"] = "3" + appcfg["otel_resolution_time_in_sec"] = "10" appcfg["skip_cal_statelog"] = "true" opscfg := make(map[string]string) opscfg["opscfg.default.server.max_connections"] = "3" @@ -95,7 +95,7 @@ func TestOTELMetricsSkipCALEndPoint(t *testing.T) { t.Fatalf("Expected 1 row") } - time.Sleep(10 * time.Second) + time.Sleep(20 * time.Second) rows.Close() stmt.Close() diff --git a/tests/unittest/testutil/setup.go b/tests/unittest/testutil/setup.go index bae0fc94..b14e0288 100644 --- a/tests/unittest/testutil/setup.go +++ b/tests/unittest/testutil/setup.go @@ -481,7 +481,7 @@ func (m *mux) StartOTelAgent() error { } func (m *mux) StopOTelAgent() error { - logger.GetLogger().Log(logger.Info, "starting OTEL agent locally at: ", time.Now()) + logger.GetLogger().Log(logger.Info, "stoping OTEL agent locally at: ", time.Now()) shutdownAgent := exec.Command("docker-compose", "-f", OTEL_AGENT_DOCKER_CONFIG_PATH, "down") err := shutdownAgent.Run() diff --git a/utility/logger/otel/defs.go b/utility/logger/otel/defs.go index 5a7b49eb..d344cea6 100644 --- a/utility/logger/otel/defs.go +++ b/utility/logger/otel/defs.go @@ -25,14 +25,16 @@ const ( ) const ( - Target = string("target") - Endpoint = string("target_ip_port") - TLS_version = string("tls_version") - Application = string("Application") - ShardId = string("ShardId") - WorkerType = string("WorkerType") - InstanceId = string("InstanceId") - Datapoints = string("datapoints") + Target = string("target") + Endpoint = string("target_ip_port") + TLS_version = string("tls_version") + Application = string("Application") + ShardId = string("ShardId") + WorkerType = string("WorkerType") + InstanceId = string("InstanceId") + Datapoints = string("datapoints") + otelSource = string("otel") + OccWorkerParamName = string("occ_worker") ) const OtelInstrumentationVersion string = "v1.0" @@ -57,6 +59,7 @@ type Tags struct { } type WorkersStateData struct { + StateTitle string ShardId int WorkerType int InstanceId int diff --git a/utility/logger/otel/logger.go b/utility/logger/otel/logger.go index 879693f1..9e4f7a82 100644 --- a/utility/logger/otel/logger.go +++ b/utility/logger/otel/logger.go @@ -152,31 +152,58 @@ func newHTTPExporter(ctx context.Context) (metric.Exporter, error) { //This is useful for metrics like CPU usage, request 
rates, or other metrics where the rate of change is important. var temporalitySelector = func(instrument metric.InstrumentKind) metricdata.Temporality { return metricdata.DeltaTemporality } - return otlpmetrichttp.New(ctx, - otlpmetrichttp.WithEndpoint(fmt.Sprintf("%s:%d", config.OTelConfigData.Host, config.OTelConfigData.MetricsPort)), - otlpmetrichttp.WithTimeout(time.Duration(config.OTelConfigData.ExporterTimeout)*time.Second), - otlpmetrichttp.WithCompression(otlpmetrichttp.NoCompression), - otlpmetrichttp.WithTemporalitySelector(temporalitySelector), - otlpmetrichttp.WithHeaders(headers), - otlpmetrichttp.WithRetry(otlpmetrichttp.RetryConfig{ - // Enabled indicates whether to not retry sending batches in case - // of export failure. - Enabled: false, - // InitialInterval the time to wait after the first failure before - // retrying. - InitialInterval: 1 * time.Second, - // MaxInterval is the upper bound on backoff interval. Once this - // value is reached the delay between consecutive retries will - // always be `MaxInterval`. - MaxInterval: 10 * time.Second, - // MaxElapsedTime is the maximum amount of time (including retries) - // spent trying to send a request/batch. Once this value is - // reached, the data is discarded. - MaxElapsedTime: 20 * time.Second, - }), - otlpmetrichttp.WithURLPath(config.OTelConfigData.MetricsURLPath), - otlpmetrichttp.WithInsecure(), //Since agent is local - ) + if config.OTelConfigData.UseTls { + return otlpmetrichttp.New(ctx, + otlpmetrichttp.WithEndpoint(fmt.Sprintf("%s:%d", config.OTelConfigData.Host, config.OTelConfigData.MetricsPort)), + otlpmetrichttp.WithTimeout(time.Duration(config.OTelConfigData.ExporterTimeout)*time.Second), + otlpmetrichttp.WithCompression(otlpmetrichttp.NoCompression), + otlpmetrichttp.WithTemporalitySelector(temporalitySelector), + otlpmetrichttp.WithHeaders(headers), + otlpmetrichttp.WithRetry(otlpmetrichttp.RetryConfig{ + // Enabled indicates whether to not retry sending batches in case + // of export failure. + Enabled: false, + // InitialInterval the time to wait after the first failure before + // retrying. + InitialInterval: 1 * time.Second, + // MaxInterval is the upper bound on backoff interval. Once this + // value is reached the delay between consecutive retries will + // always be `MaxInterval`. + MaxInterval: 10 * time.Second, + // MaxElapsedTime is the maximum amount of time (including retries) + // spent trying to send a request/batch. Once this value is + // reached, the data is discarded. + MaxElapsedTime: 20 * time.Second, + }), + otlpmetrichttp.WithURLPath(config.OTelConfigData.MetricsURLPath), + ) + } else { + return otlpmetrichttp.New(ctx, + otlpmetrichttp.WithEndpoint(fmt.Sprintf("%s:%d", config.OTelConfigData.Host, config.OTelConfigData.MetricsPort)), + otlpmetrichttp.WithTimeout(time.Duration(config.OTelConfigData.ExporterTimeout)*time.Second), + otlpmetrichttp.WithCompression(otlpmetrichttp.NoCompression), + otlpmetrichttp.WithTemporalitySelector(temporalitySelector), + otlpmetrichttp.WithHeaders(headers), + otlpmetrichttp.WithRetry(otlpmetrichttp.RetryConfig{ + // Enabled indicates whether to not retry sending batches in case + // of export failure. + Enabled: false, + // InitialInterval the time to wait after the first failure before + // retrying. + InitialInterval: 1 * time.Second, + // MaxInterval is the upper bound on backoff interval. Once this + // value is reached the delay between consecutive retries will + // always be `MaxInterval`. 
+ MaxInterval: 10 * time.Second, + // MaxElapsedTime is the maximum amount of time (including retries) + // spent trying to send a request/batch. Once this value is + // reached, the data is discarded. + MaxElapsedTime: 20 * time.Second, + }), + otlpmetrichttp.WithURLPath(config.OTelConfigData.MetricsURLPath), + otlpmetrichttp.WithInsecure(), //Since agent is local + ) + } } // newGRPCExporter Initializes The "otlpmetricgrpc" exporter in OpenTelemetry is used to export metrics data using the @@ -190,31 +217,57 @@ func newGRPCExporter(ctx context.Context) (metric.Exporter, error) { //over time or when you need to report only the differences (deltas) between measurements. //This is useful for metrics like CPU usage, request rates, or other metrics where the rate of change is important. var temporalitySelector = func(instrument metric.InstrumentKind) metricdata.Temporality { return metricdata.DeltaTemporality } - - return otlpmetricgrpc.New(ctx, - otlpmetricgrpc.WithEndpoint(fmt.Sprintf("%s:%d", config.OTelConfigData.Host, config.OTelConfigData.MetricsPort)), - otlpmetricgrpc.WithTimeout(time.Duration(config.OTelConfigData.ExporterTimeout)*time.Second), - otlpmetricgrpc.WithHeaders(headers), - otlpmetricgrpc.WithReconnectionPeriod(time.Duration(5)*time.Second), - otlpmetricgrpc.WithTemporalitySelector(temporalitySelector), - otlpmetricgrpc.WithRetry(otlpmetricgrpc.RetryConfig{ - // Enabled indicates whether to not retry sending batches in case - // of export failure. - Enabled: false, - // InitialInterval the time to wait after the first failure before - // retrying. - InitialInterval: 1 * time.Second, - // MaxInterval is the upper bound on backoff interval. Once this - // value is reached the delay between consecutive retries will - // always be `MaxInterval`. - MaxInterval: 10 * time.Second, - // MaxElapsedTime is the maximum amount of time (including retries) - // spent trying to send a request/batch. Once this value is - // reached, the data is discarded. - MaxElapsedTime: 20 * time.Second, - }), - otlpmetricgrpc.WithInsecure(), //Since agent is local - ) + if config.OTelConfigData.UseTls { + return otlpmetricgrpc.New(ctx, + otlpmetricgrpc.WithEndpoint(fmt.Sprintf("%s:%d", config.OTelConfigData.Host, config.OTelConfigData.MetricsPort)), + otlpmetricgrpc.WithTimeout(time.Duration(config.OTelConfigData.ExporterTimeout)*time.Second), + otlpmetricgrpc.WithHeaders(headers), + otlpmetricgrpc.WithReconnectionPeriod(time.Duration(5)*time.Second), + otlpmetricgrpc.WithTemporalitySelector(temporalitySelector), + otlpmetricgrpc.WithRetry(otlpmetricgrpc.RetryConfig{ + // Enabled indicates whether to not retry sending batches in case + // of export failure. + Enabled: false, + // InitialInterval the time to wait after the first failure before + // retrying. + InitialInterval: 1 * time.Second, + // MaxInterval is the upper bound on backoff interval. Once this + // value is reached the delay between consecutive retries will + // always be `MaxInterval`. + MaxInterval: 10 * time.Second, + // MaxElapsedTime is the maximum amount of time (including retries) + // spent trying to send a request/batch. Once this value is + // reached, the data is discarded. 
+ MaxElapsedTime: 20 * time.Second, + }), + ) + + } else { + return otlpmetricgrpc.New(ctx, + otlpmetricgrpc.WithEndpoint(fmt.Sprintf("%s:%d", config.OTelConfigData.Host, config.OTelConfigData.MetricsPort)), + otlpmetricgrpc.WithTimeout(time.Duration(config.OTelConfigData.ExporterTimeout)*time.Second), + otlpmetricgrpc.WithHeaders(headers), + otlpmetricgrpc.WithReconnectionPeriod(time.Duration(5)*time.Second), + otlpmetricgrpc.WithTemporalitySelector(temporalitySelector), + otlpmetricgrpc.WithRetry(otlpmetricgrpc.RetryConfig{ + // Enabled indicates whether to not retry sending batches in case + // of export failure. + Enabled: false, + // InitialInterval the time to wait after the first failure before + // retrying. + InitialInterval: 1 * time.Second, + // MaxInterval is the upper bound on backoff interval. Once this + // value is reached the delay between consecutive retries will + // always be `MaxInterval`. + MaxInterval: 10 * time.Second, + // MaxElapsedTime is the maximum amount of time (including retries) + // spent trying to send a request/batch. Once this value is + // reached, the data is discarded. + MaxElapsedTime: 20 * time.Second, + }), + otlpmetricgrpc.WithInsecure(), //Since agent is local + ) + } } // newHTTPTraceExporter Initializes The "otlptracehttp" exporter in OpenTelemetry is used to export span data using the @@ -222,30 +275,54 @@ func newGRPCExporter(ctx context.Context) (metric.Exporter, error) { func newHTTPTraceExporter(ctx context.Context) (*otlptrace.Exporter, error) { headers := make(map[string]string) headers[IngestTokenHeader] = config.GetOTelIngestToken() - - return otlptracehttp.New(ctx, - otlptracehttp.WithEndpoint(fmt.Sprintf("%s:%d", config.OTelConfigData.Host, config.OTelConfigData.TracePort)), - otlptracehttp.WithTimeout(time.Duration(config.OTelConfigData.ExporterTimeout)*time.Second), - otlptracehttp.WithHeaders(headers), - otlptracehttp.WithRetry(otlptracehttp.RetryConfig{ - // Enabled indicates whether to not retry sending batches in case - // of export failure. - Enabled: false, - // InitialInterval the time to wait after the first failure before - // retrying. - InitialInterval: 1 * time.Second, - // MaxInterval is the upper bound on backoff interval. Once this - // value is reached the delay between consecutive retries will - // always be `MaxInterval`. - MaxInterval: 10 * time.Second, - // MaxElapsedTime is the maximum amount of time (including retries) - // spent trying to send a request/batch. Once this value is - // reached, the data is discarded. - MaxElapsedTime: 20 * time.Second, - }), - otlptracehttp.WithURLPath(config.OTelConfigData.TraceURLPath), - otlptracehttp.WithInsecure(), //Since agent is local - ) + if config.OTelConfigData.UseTls { + return otlptracehttp.New(ctx, + otlptracehttp.WithEndpoint(fmt.Sprintf("%s:%d", config.OTelConfigData.Host, config.OTelConfigData.TracePort)), + otlptracehttp.WithTimeout(time.Duration(config.OTelConfigData.ExporterTimeout)*time.Second), + otlptracehttp.WithHeaders(headers), + otlptracehttp.WithRetry(otlptracehttp.RetryConfig{ + // Enabled indicates whether to not retry sending batches in case + // of export failure. + Enabled: false, + // InitialInterval the time to wait after the first failure before + // retrying. + InitialInterval: 1 * time.Second, + // MaxInterval is the upper bound on backoff interval. Once this + // value is reached the delay between consecutive retries will + // always be `MaxInterval`. + MaxInterval: 10 * time.Second, + // MaxElapsedTime is the maximum amount of time (including retries) + // spent trying to send a request/batch. Once this value is + // reached, the data is discarded. + MaxElapsedTime: 20 * time.Second, + }), + otlptracehttp.WithURLPath(config.OTelConfigData.TraceURLPath), + ) + } else { + return otlptracehttp.New(ctx, + otlptracehttp.WithEndpoint(fmt.Sprintf("%s:%d", config.OTelConfigData.Host, config.OTelConfigData.TracePort)), + otlptracehttp.WithTimeout(time.Duration(config.OTelConfigData.ExporterTimeout)*time.Second), + otlptracehttp.WithHeaders(headers), + otlptracehttp.WithRetry(otlptracehttp.RetryConfig{ + // Enabled indicates whether to not retry sending batches in case + // of export failure. + Enabled: false, + // InitialInterval the time to wait after the first failure before + // retrying. + InitialInterval: 1 * time.Second, + // MaxInterval is the upper bound on backoff interval. Once this + // value is reached the delay between consecutive retries will + // always be `MaxInterval`. + MaxInterval: 10 * time.Second, + // MaxElapsedTime is the maximum amount of time (including retries) + // spent trying to send a request/batch. Once this value is + // reached, the data is discarded. + MaxElapsedTime: 20 * time.Second, + }), + otlptracehttp.WithURLPath(config.OTelConfigData.TraceURLPath), + otlptracehttp.WithInsecure(), //Since agent is local + ) + } } // newGRPCTraceExporter Initializes The "otlptracegrpc" exporter in OpenTelemetry is used to export span data using the @@ -255,28 +332,52 @@ func newGRPCTraceExporter(ctx context.Context) (*otlptrace.Exporter, error) { headers := make(map[string]string) headers[IngestTokenHeader] = config.GetOTelIngestToken() - return otlptracegrpc.New(ctx, - otlptracegrpc.WithEndpoint(fmt.Sprintf("%s:%d", config.OTelConfigData.Host, config.OTelConfigData.TracePort)), - otlptracegrpc.WithTimeout(time.Duration(config.OTelConfigData.ExporterTimeout)*time.Second), - otlptracegrpc.WithHeaders(headers), - otlptracegrpc.WithRetry(otlptracegrpc.RetryConfig{ - // Enabled indicates whether to not retry sending batches in case - // of export failure. - Enabled: false, - // InitialInterval the time to wait after the first failure before - // retrying. - InitialInterval: 1 * time.Second, - // MaxInterval is the upper bound on backoff interval. Once this - // value is reached the delay between consecutive retries will - // always be `MaxInterval`. - MaxInterval: 10 * time.Second, - // MaxElapsedTime is the maximum amount of time (including retries) - // spent trying to send a request/batch. Once this value is - // reached, the data is discarded. - MaxElapsedTime: 20 * time.Second, - }), - otlptracegrpc.WithInsecure(), //Since agent is local - ) + if config.OTelConfigData.UseTls { + return otlptracegrpc.New(ctx, + otlptracegrpc.WithEndpoint(fmt.Sprintf("%s:%d", config.OTelConfigData.Host, config.OTelConfigData.TracePort)), + otlptracegrpc.WithTimeout(time.Duration(config.OTelConfigData.ExporterTimeout)*time.Second), + otlptracegrpc.WithHeaders(headers), + otlptracegrpc.WithRetry(otlptracegrpc.RetryConfig{ + // Enabled indicates whether to not retry sending batches in case + // of export failure. + Enabled: false, + // InitialInterval the time to wait after the first failure before + // retrying. + InitialInterval: 1 * time.Second, + // MaxInterval is the upper bound on backoff interval. Once this + // value is reached the delay between consecutive retries will + // always be `MaxInterval`.
+ MaxInterval: 10 * time.Second, + // MaxElapsedTime is the maximum amount of time (including retries) + // spent trying to send a request/batch. Once this value is + // reached, the data is discarded. + MaxElapsedTime: 20 * time.Second, + }), + ) + } else { + return otlptracegrpc.New(ctx, + otlptracegrpc.WithEndpoint(fmt.Sprintf("%s:%d", config.OTelConfigData.Host, config.OTelConfigData.TracePort)), + otlptracegrpc.WithTimeout(time.Duration(config.OTelConfigData.ExporterTimeout)*time.Second), + otlptracegrpc.WithHeaders(headers), + otlptracegrpc.WithRetry(otlptracegrpc.RetryConfig{ + // Enabled indicates whether to not retry sending batches in case + // of export failure. + Enabled: false, + // InitialInterval the time to wait after the first failure before + // retrying. + InitialInterval: 1 * time.Second, + // MaxInterval is the upper bound on backoff interval. Once this + // value is reached the delay between consecutive retries will + // always be `MaxInterval`. + MaxInterval: 10 * time.Second, + // MaxElapsedTime is the maximum amount of time (including retries) + // spent trying to send a request/batch. Once this value is + // reached, the data is discarded. + MaxElapsedTime: 20 * time.Second, + }), + otlptracegrpc.WithInsecure(), //Since agent is local + ) + } } // getResourceInfo provides application context level attributes during initialization @@ -287,7 +388,7 @@ func getResourceInfo(appName string) *resource.Resource { attributes := []attribute.KeyValue{ attribute.String("container_host", hostname), attribute.String("application", appName), - attribute.String("source", "otel"), + attribute.String("source", otelSource), } environment, isEnvPresent := os.LookupEnv("ENVIRONMENT") diff --git a/utility/logger/otel/state_logger.go b/utility/logger/otel/state_logger.go index a02eb0d5..2bf654de 100644 --- a/utility/logger/otel/state_logger.go +++ b/utility/logger/otel/state_logger.go @@ -230,6 +230,7 @@ func (stateLogMetrics *StateLogMetrics) asyncStateLogMetricsPoll(observer metric stateLogMetrics.stateLock.Lock() defer stateLogMetrics.stateLock.Unlock() stateLogsData := make(map[string]map[string]int64) + var stateLogTitle string //Infinite loop read through the channel and send metrics mainloop: for { @@ -245,6 +246,7 @@ mainloop: stateLogsData[keyName] = make(map[string]int64) } //Update metadata information + stateLogTitle = workersState.StateTitle stateLogsData[keyName][ShardId] = int64(workersState.ShardId) stateLogsData[keyName][WorkerType] = int64(workersState.WorkerType) stateLogsData[keyName][InstanceId] = int64(workersState.InstanceId) @@ -273,7 +275,7 @@ mainloop: } //Process metrics data if len(stateLogsData) > 0 { - err = stateLogMetrics.sendMetricsDataToCollector(observer, stateLogsData) + err = stateLogMetrics.sendMetricsDataToCollector(observer, &stateLogTitle, stateLogsData) } return err } /* * Send metrics data-points to collector */ -func (stateLogMetrics *StateLogMetrics) sendMetricsDataToCollector(observer metric.Observer, stateLogsData map[string]map[string]int64) (err error) { +func (stateLogMetrics *StateLogMetrics) sendMetricsDataToCollector(observer metric.Observer, stateLogTitle *string, stateLogsData map[string]map[string]int64) (err error) { for key, aggStatesData := range stateLogsData { - logger.GetLogger().Log(logger.Info, fmt.Sprintf("calculated max value and aggregation of updown counter for key: %s using datapoints size: %d", key, aggStatesData[Datapoints])) + logger.GetLogger().Log(logger.Info, fmt.Sprintf("publishing metric with calculated max value and aggregation of gauge for shardid-workertype-instanceId: %s using datapoints size: %d", key, aggStatesData[Datapoints])) commonLabels := []attribute.KeyValue{ attribute.Int(ShardId, int(aggStatesData[ShardId])), attribute.Int(WorkerType, int(aggStatesData[WorkerType])), attribute.Int(InstanceId, int(aggStatesData[InstanceId])), + attribute.String(OccWorkerParamName, *stateLogTitle), } //Observe states data // 1. Worker States
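The occ_worker value introduced by this commit travels as a datapoint attribute on each observable gauge. A self-contained sketch of that observation pattern against the OpenTelemetry Go SDK follows; the meter name, gauge name, attribute values and the manual reader are illustrative, not the wiring Hera uses:

package main

import (
	"context"
	"fmt"

	"go.opentelemetry.io/otel/attribute"
	"go.opentelemetry.io/otel/metric"
	sdkmetric "go.opentelemetry.io/otel/sdk/metric"
	"go.opentelemetry.io/otel/sdk/metric/metricdata"
)

func main() {
	ctx := context.Background()
	reader := sdkmetric.NewManualReader()
	provider := sdkmetric.NewMeterProvider(sdkmetric.WithReader(reader))
	meter := provider.Meter("example")

	// Gauge and attribute names are illustrative; the patch derives
	// occ_worker from the statelog title at observation time.
	gauge, err := meter.Int64ObservableGauge("pp.occ.busy_connection.cnt")
	if err != nil {
		panic(err)
	}
	_, err = meter.RegisterCallback(func(_ context.Context, o metric.Observer) error {
		o.ObserveInt64(gauge, 3, metric.WithAttributes(
			attribute.String("occ_worker", "hera-test"),
			attribute.Int("ShardId", 0),
		))
		return nil
	}, gauge)
	if err != nil {
		panic(err)
	}

	var rm metricdata.ResourceMetrics
	_ = reader.Collect(ctx, &rm)      // triggers the callback once
	fmt.Println(len(rm.ScopeMetrics)) // 1
	_ = provider.Shutdown(ctx)
}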
fmt.Sprintf("publishing metric with calculated max value and aggregation of gauge for shardid-workertype-instanceId: %s using datapoints size: %d", key, aggStatesData[Datapoints])) commonLabels := []attribute.KeyValue{ attribute.Int(ShardId, int(aggStatesData[ShardId])), attribute.Int(WorkerType, int(aggStatesData[WorkerType])), attribute.Int(InstanceId, int(aggStatesData[InstanceId])), + attribute.String(OccWorkerParamName, *stateLogTitle), } //Observe states data // 1. Worker States From a00254fa0693d772480dc0fcfe1c9be21425720b Mon Sep 17 00:00:00 2001 From: Rajesh Samala Date: Tue, 23 Jul 2024 17:16:10 +0530 Subject: [PATCH 07/19] changes for renaming metrics (#90) Co-authored-by: Rajesh S --- .../otel_remote_endpoint_tls/main_test.go | 3 + utility/logger/otel/defs.go | 45 ++++-- utility/logger/otel/state_logger.go | 131 ++++++++++++++++-- utility/logger/otel/test/state_logger_test.go | 4 +- 4 files changed, 156 insertions(+), 27 deletions(-) diff --git a/tests/unittest/otel_remote_endpoint_tls/main_test.go b/tests/unittest/otel_remote_endpoint_tls/main_test.go index 5a8fc189..c018f01c 100644 --- a/tests/unittest/otel_remote_endpoint_tls/main_test.go +++ b/tests/unittest/otel_remote_endpoint_tls/main_test.go @@ -120,9 +120,12 @@ func TestOTELMetricsRemoteEndPointWithTLS(t *testing.T) { cancel() conn.Close() +<<<<<<< HEAD for counter := 0; counter < 1000; counter++ { time.Sleep(10 * time.Second) } +======= +>>>>>>> otel_logging_changes logger.GetLogger().Log(logger.Debug, "TestOTELMetricsRemoteEndPointWithTLS done -------------------------------------------------------------") } diff --git a/utility/logger/otel/defs.go b/utility/logger/otel/defs.go index d344cea6..9d98bc11 100644 --- a/utility/logger/otel/defs.go +++ b/utility/logger/otel/defs.go @@ -9,19 +9,30 @@ import ( // Following Metric Names will get instrumented as part of StateLogMetrics const ( // Worker States - InitConnCountMetric = "init_connection.count" - AccptConnCountMetric = "accept_connection.count" - WaitConnCountMetric = "wait_connection.count" - BusyConnCountMetric = "busy_connection.count" - ScheduledConnCountMetric = "scheduled_connection.count" - FinishedConnCountMetric = "finished_connection.count" - QuiescedConnCountMetric = "quiesced_connection.count" + InitConnGuageMetric = "init_connection.cnt" + AccptConnGuageMetric = "accept_connection.cnt" + WaitConnGuageMetric = "wait_connection.cnt" + BusyConnGuageMetric = "busy_connection.cnt" + ScheduledConnGuageMetric = "scheduled_connection.cnt" + FinishedConnGuageMetric = "finished_connection.cnt" + QuiescedConnGuageMetric = "quiesced_connection.cnt" // Connection States - AssignedConnCountMetric = "assigned_connection.count" - IdleConnCountMetric = "idle_connection.count" - BacklogConnCountMetric = "backlog_connection.count" - StrdConnCountMetric = "stranded_connection.count" + AssignedConnGuageMetric = "assigned_connection.cnt" + IdleConnGuageMetric = "idle_connection.cnt" + BacklogConnGuageMetric = "backlog_connection.cnt" + StrdConnGuageMetric = "stranded_connection.cnt" + + InitMaxGuageMetric = "init_connection.cnt.max" + AcceptMinGuageMetric = "accept_connection.cnt.min" + WaitMaxGuageMetric = "wait_connection.cnt.max" + BusyMaxGuageMetric = "busy_connection.cnt.max" + SchdMaxGuageMetric = "scheduled_connection.cnt.max" + QuiescedMaxGuageMetric = "quiesced_connection.cnt.max" + + IdleMaxGuageMetric = "idle_connection.cnt.max" + BacklogMaxGuageMetric = "backlog_connection.cnt.max" + StrdMaxGuageMetric = "stranded_connection.cnt.max" ) const ( @@ -111,6 +122,18 
@@ type StateLogMetrics struct { idleState metric.Int64ObservableGauge bklgState metric.Int64ObservableGauge strdState metric.Int64ObservableGauge + + initStateMax metric.Int64ObservableGauge + waitStateMax metric.Int64ObservableGauge + busyStateMax metric.Int64ObservableGauge + schdStateMax metric.Int64ObservableGauge + quceStateMax metric.Int64ObservableGauge + + idleStateMax metric.Int64ObservableGauge + bklgStateMax metric.Int64ObservableGauge + strdStateMax metric.Int64ObservableGauge + + acptStateMin metric.Int64ObservableGauge } // Object represents the workers states data for worker belongs to specific shardId and workperType with flat-map diff --git a/utility/logger/otel/state_logger.go b/utility/logger/otel/state_logger.go index 2bf654de..6787df77 100644 --- a/utility/logger/otel/state_logger.go +++ b/utility/logger/otel/state_logger.go @@ -111,7 +111,7 @@ func (stateLogMetrics *StateLogMetrics) register() error { //"init", "acpt", "wait", "busy", "schd", "fnsh", "quce", "asgn", "idle", "bklg", "strd", "cls" var err error if stateLogMetrics.initState, err = stateLogMetrics.meter.Int64ObservableGauge( - otelconfig.OTelConfigData.PopulateMetricNamePrefix(InitConnCountMetric), + otelconfig.OTelConfigData.PopulateMetricNamePrefix(InitConnGuageMetric), metric.WithDescription("Number of workers in init state"), ); err != nil { logger.GetLogger().Log(logger.Alert, "Failed to register guage metric for init state", err) @@ -119,7 +119,7 @@ func (stateLogMetrics *StateLogMetrics) register() error { } if stateLogMetrics.acptState, err = stateLogMetrics.meter.Int64ObservableGauge( - otelconfig.OTelConfigData.PopulateMetricNamePrefix(AccptConnCountMetric), + otelconfig.OTelConfigData.PopulateMetricNamePrefix(AccptConnGuageMetric), metric.WithDescription("Number of workers in accept state"), ); err != nil { logger.GetLogger().Log(logger.Alert, "Failed to register guage metric for accept state", err) @@ -127,7 +127,7 @@ func (stateLogMetrics *StateLogMetrics) register() error { } if stateLogMetrics.waitState, err = stateLogMetrics.meter.Int64ObservableGauge( - otelconfig.OTelConfigData.PopulateMetricNamePrefix(WaitConnCountMetric), + otelconfig.OTelConfigData.PopulateMetricNamePrefix(WaitConnGuageMetric), metric.WithDescription("Number of workers in wait state"), ); err != nil { logger.GetLogger().Log(logger.Alert, "Failed to register guage metric for wait state", err) @@ -135,7 +135,7 @@ func (stateLogMetrics *StateLogMetrics) register() error { } if stateLogMetrics.busyState, err = stateLogMetrics.meter.Int64ObservableGauge( - otelconfig.OTelConfigData.PopulateMetricNamePrefix(BusyConnCountMetric), + otelconfig.OTelConfigData.PopulateMetricNamePrefix(BusyConnGuageMetric), metric.WithDescription("Number of workers in busy state"), ); err != nil { logger.GetLogger().Log(logger.Alert, "Failed to register guage metric for busy state", err) @@ -143,7 +143,7 @@ func (stateLogMetrics *StateLogMetrics) register() error { } if stateLogMetrics.schdState, err = stateLogMetrics.meter.Int64ObservableGauge( - otelconfig.OTelConfigData.PopulateMetricNamePrefix(ScheduledConnCountMetric), + otelconfig.OTelConfigData.PopulateMetricNamePrefix(ScheduledConnGuageMetric), metric.WithDescription("Number of workers in scheduled state"), ); err != nil { logger.GetLogger().Log(logger.Alert, "Failed to register guage metric for scheduled state", err) @@ -151,7 +151,7 @@ func (stateLogMetrics *StateLogMetrics) register() error { } if stateLogMetrics.fnshState, err = stateLogMetrics.meter.Int64ObservableGauge( - 
otelconfig.OTelConfigData.PopulateMetricNamePrefix(FinishedConnCountMetric), + otelconfig.OTelConfigData.PopulateMetricNamePrefix(FinishedConnGuageMetric), metric.WithDescription("Number of workers in finished state"), ); err != nil { logger.GetLogger().Log(logger.Alert, "Failed to register guage metric for finished state", err) @@ -159,7 +159,7 @@ func (stateLogMetrics *StateLogMetrics) register() error { } if stateLogMetrics.quceState, err = stateLogMetrics.meter.Int64ObservableGauge( - otelconfig.OTelConfigData.PopulateMetricNamePrefix(QuiescedConnCountMetric), + otelconfig.OTelConfigData.PopulateMetricNamePrefix(QuiescedConnGuageMetric), metric.WithDescription("Number of workers in quiesced state"), ); err != nil { logger.GetLogger().Log(logger.Alert, "Failed to register guage metric for quiesced state", err) @@ -167,7 +167,7 @@ func (stateLogMetrics *StateLogMetrics) register() error { } if stateLogMetrics.asgnState, err = stateLogMetrics.meter.Int64ObservableGauge( - otelconfig.OTelConfigData.PopulateMetricNamePrefix(AssignedConnCountMetric), + otelconfig.OTelConfigData.PopulateMetricNamePrefix(AssignedConnGuageMetric), metric.WithDescription("Number of workers in assigned state"), ); err != nil { logger.GetLogger().Log(logger.Alert, "Failed to register guage metric for assigned state", err) @@ -175,7 +175,7 @@ func (stateLogMetrics *StateLogMetrics) register() error { } if stateLogMetrics.idleState, err = stateLogMetrics.meter.Int64ObservableGauge( - otelconfig.OTelConfigData.PopulateMetricNamePrefix(IdleConnCountMetric), + otelconfig.OTelConfigData.PopulateMetricNamePrefix(IdleConnGuageMetric), metric.WithDescription("Number of workers in idle state"), ); err != nil { logger.GetLogger().Log(logger.Alert, "Failed to register guage metric for idle state", err) @@ -183,7 +183,7 @@ func (stateLogMetrics *StateLogMetrics) register() error { } if stateLogMetrics.bklgState, err = stateLogMetrics.meter.Int64ObservableGauge( - otelconfig.OTelConfigData.PopulateMetricNamePrefix(BacklogConnCountMetric), + otelconfig.OTelConfigData.PopulateMetricNamePrefix(BacklogConnGuageMetric), metric.WithDescription("Number of workers in backlog state"), ); err != nil { logger.GetLogger().Log(logger.Alert, "Failed to register guage metric for backlog state", err) @@ -191,12 +191,85 @@ func (stateLogMetrics *StateLogMetrics) register() error { } if stateLogMetrics.strdState, err = stateLogMetrics.meter.Int64ObservableGauge( - otelconfig.OTelConfigData.PopulateMetricNamePrefix(StrdConnCountMetric), + otelconfig.OTelConfigData.PopulateMetricNamePrefix(StrdConnGuageMetric), metric.WithDescription("Number of connections in stranded state"), ); err != nil { logger.GetLogger().Log(logger.Alert, "Failed to register guage metric for stranded state", err) return err } + //Initialize max metrics + if stateLogMetrics.initStateMax, err = stateLogMetrics.meter.Int64ObservableGauge( + otelconfig.OTelConfigData.PopulateMetricNamePrefix(InitMaxGuageMetric), + metric.WithDescription("Maximum Number of workers in init state within resolution time"), + ); err != nil { + logger.GetLogger().Log(logger.Alert, "Failed to register guage metric for init max state", err) + return err + } + + if stateLogMetrics.waitStateMax, err = stateLogMetrics.meter.Int64ObservableGauge( + otelconfig.OTelConfigData.PopulateMetricNamePrefix(WaitMaxGuageMetric), + metric.WithDescription("Maximum Number of workers in wait state within resolution time"), + ); err != nil { + logger.GetLogger().Log(logger.Alert, "Failed to register guage metric for max 
wait state", err) + return err + } + + if stateLogMetrics.busyStateMax, err = stateLogMetrics.meter.Int64ObservableGauge( + otelconfig.OTelConfigData.PopulateMetricNamePrefix(BusyMaxGuageMetric), + metric.WithDescription("Maximum Number of workers in busy state within resolution time"), + ); err != nil { + logger.GetLogger().Log(logger.Alert, "Failed to register guage metric for max busy state", err) + return err + } + + if stateLogMetrics.schdStateMax, err = stateLogMetrics.meter.Int64ObservableGauge( + otelconfig.OTelConfigData.PopulateMetricNamePrefix(SchdMaxGuageMetric), + metric.WithDescription("Maximum Number of workers in scheduled state within resolution time"), + ); err != nil { + logger.GetLogger().Log(logger.Alert, "Failed to register guage metric for scheduled state", err) + return err + } + + if stateLogMetrics.quceStateMax, err = stateLogMetrics.meter.Int64ObservableGauge( + otelconfig.OTelConfigData.PopulateMetricNamePrefix(QuiescedMaxGuageMetric), + metric.WithDescription("Maximum Number of workers in quiesced state within resolution time"), + ); err != nil { + logger.GetLogger().Log(logger.Alert, "Failed to register guage metric for quiesced state", err) + return err + } + + if stateLogMetrics.idleStateMax, err = stateLogMetrics.meter.Int64ObservableGauge( + otelconfig.OTelConfigData.PopulateMetricNamePrefix(IdleMaxGuageMetric), + metric.WithDescription("Maximum Number of client connections in idle state within resolution time"), + ); err != nil { + logger.GetLogger().Log(logger.Alert, "Failed to register guage metric for max idle state", err) + return err + } + + if stateLogMetrics.bklgStateMax, err = stateLogMetrics.meter.Int64ObservableGauge( + otelconfig.OTelConfigData.PopulateMetricNamePrefix(BacklogMaxGuageMetric), + metric.WithDescription("Maximum Number of client connections in backlog state within resolution time"), + ); err != nil { + logger.GetLogger().Log(logger.Alert, "Failed to register guage metric for max backlog state", err) + return err + } + + if stateLogMetrics.strdStateMax, err = stateLogMetrics.meter.Int64ObservableGauge( + otelconfig.OTelConfigData.PopulateMetricNamePrefix(StrdMaxGuageMetric), + metric.WithDescription("Maximum Number of client connections in idle state within resolution time"), + ); err != nil { + logger.GetLogger().Log(logger.Alert, "Failed to register guage metric for max stranded state", err) + return err + } + + //Initialize min for accpet + if stateLogMetrics.acptStateMin, err = stateLogMetrics.meter.Int64ObservableGauge( + otelconfig.OTelConfigData.PopulateMetricNamePrefix(AcceptMinGuageMetric), + metric.WithDescription("Minimum Number of workers in accept state within resolution time"), + ); err != nil { + logger.GetLogger().Log(logger.Alert, "Failed to register guage metric for min accept state", err) + return err + } stateLogMetrics.registration, err = stateLogMetrics.meter.RegisterCallback( func(ctx context.Context, observer metric.Observer) error { @@ -214,6 +287,17 @@ func (stateLogMetrics *StateLogMetrics) register() error { stateLogMetrics.idleState, stateLogMetrics.bklgState, stateLogMetrics.strdState, + + stateLogMetrics.initStateMax, //Max + stateLogMetrics.waitStateMax, + stateLogMetrics.busyStateMax, + stateLogMetrics.schdStateMax, + stateLogMetrics.quceStateMax, + stateLogMetrics.idleStateMax, + stateLogMetrics.bklgStateMax, + stateLogMetrics.strdStateMax, + + stateLogMetrics.acptStateMin, //Min }...) 
if err != nil { @@ -257,11 +341,16 @@ mainloop: stateLogsData[keyName][key] += value } else { maxKey := key + "Max" + minKey := key + "Min" stateLogsData[keyName][key] = value //check and update max value if stateLogsData[keyName][maxKey] < value { stateLogsData[keyName][maxKey] = value } + //Min value + if stateLogsData[keyName][minKey] > value { + stateLogsData[keyName][minKey] = value + } } } case <-stateLogMetrics.doneCh: @@ -293,8 +382,7 @@ func (stateLogMetrics *StateLogMetrics) sendMetricsDataToCollector(observer metr attribute.String(OccWorkerParamName, *stateLogTitle), } //Observe states data - // 1. Worker States - + //1. Worker States observer.ObserveInt64(stateLogMetrics.initState, aggStatesData["init"], metric.WithAttributes(commonLabels...)) observer.ObserveInt64(stateLogMetrics.acptState, aggStatesData["acpt"], metric.WithAttributes(commonLabels...)) observer.ObserveInt64(stateLogMetrics.waitState, aggStatesData["wait"], metric.WithAttributes(commonLabels...)) @@ -303,11 +391,26 @@ func (stateLogMetrics *StateLogMetrics) sendMetricsDataToCollector(observer metr observer.ObserveInt64(stateLogMetrics.fnshState, aggStatesData["fnsh"], metric.WithAttributes(commonLabels...)) observer.ObserveInt64(stateLogMetrics.quceState, aggStatesData["quce"], metric.WithAttributes(commonLabels...)) - // 2. Connection States + //2. Connection States observer.ObserveInt64(stateLogMetrics.asgnState, aggStatesData["asgn"], metric.WithAttributes(commonLabels...)) observer.ObserveInt64(stateLogMetrics.idleState, aggStatesData["idle"], metric.WithAttributes(commonLabels...)) observer.ObserveInt64(stateLogMetrics.bklgState, aggStatesData["bklg"], metric.WithAttributes(commonLabels...)) observer.ObserveInt64(stateLogMetrics.strdState, aggStatesData["strd"], metric.WithAttributes(commonLabels...)) + + //3. Worker States Max values + observer.ObserveInt64(stateLogMetrics.initStateMax, aggStatesData["initMax"], metric.WithAttributes(commonLabels...)) + observer.ObserveInt64(stateLogMetrics.waitStateMax, aggStatesData["waitMax"], metric.WithAttributes(commonLabels...)) + observer.ObserveInt64(stateLogMetrics.busyStateMax, aggStatesData["busyMax"], metric.WithAttributes(commonLabels...)) + observer.ObserveInt64(stateLogMetrics.schdStateMax, aggStatesData["schdMax"], metric.WithAttributes(commonLabels...)) + observer.ObserveInt64(stateLogMetrics.quceStateMax, aggStatesData["quceMax"], metric.WithAttributes(commonLabels...)) + + //4. Connection States Max values + observer.ObserveInt64(stateLogMetrics.idleStateMax, aggStatesData["idleMax"], metric.WithAttributes(commonLabels...)) + observer.ObserveInt64(stateLogMetrics.bklgStateMax, aggStatesData["bklgMax"], metric.WithAttributes(commonLabels...)) + observer.ObserveInt64(stateLogMetrics.strdStateMax, aggStatesData["strdMax"], metric.WithAttributes(commonLabels...)) + + //5. Min accept state + observer.ObserveInt64(stateLogMetrics.acptStateMin, aggStatesData["acptMin"], metric.WithAttributes(commonLabels...)) } return nil } diff --git a/utility/logger/otel/test/state_logger_test.go b/utility/logger/otel/test/state_logger_test.go index 366b7927..f994738c 100644 --- a/utility/logger/otel/test/state_logger_test.go +++ b/utility/logger/otel/test/state_logger_test.go @@ -167,8 +167,8 @@ func TestSendingStateLogMetrics(t *testing.T) { logger.GetLogger().Log(logger.Info, "Data Sent successfully for instrumentation") time.Sleep(5 * time.Second) metricsData := mc.GetMetrics() - if len(metricsData) < 11 { - t.Fatalf("got %d, wanted %d", len(metricsData), 24) + if len(metricsData) < 20 { + t.Fatalf("got %d, wanted at least %d", len(metricsData), 20) } }
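Stripped of the OTel plumbing, the max/min bookkeeping added by this commit is a fold over sampled state counts per dimension key. A runnable sketch for the gauge-style keys, including the first-sample initialization that patch 09 in this series later adds (state keys illustrative):

package main

import "fmt"

// aggregate folds sampled state counts into latest, max and min per key,
// mirroring the key/keyMax/keyMin layout used by the state-log metrics.
func aggregate(samples []map[string]int64) map[string]int64 {
	agg := make(map[string]int64)
	seen := make(map[string]bool)
	for _, sample := range samples {
		for key, value := range sample {
			maxKey, minKey := key+"Max", key+"Min"
			agg[key] = value // latest sample wins for the plain gauge
			if !seen[key] || agg[maxKey] < value {
				agg[maxKey] = value
			}
			if !seen[key] || agg[minKey] > value {
				agg[minKey] = value
			}
			seen[key] = true
		}
	}
	return agg
}

func main() {
	samples := []map[string]int64{
		{"busy": 2, "acpt": 5},
		{"busy": 7, "acpt": 1},
		{"busy": 4, "acpt": 3},
	}
	// busy=4 busyMax=7 busyMin=2, acpt=3 acptMax=5 acptMin=1
	fmt.Println(aggregate(samples))
}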
From 629c290ed39d09339e87f5df9e2b1481fb1008df Mon Sep 17 00:00:00 2001 From: Rajesh Samala Date: Tue, 23 Jul 2024 17:20:55 +0530 Subject: [PATCH 08/19] Occ config logging (#394) (#91) * occ configurations logging * cal event success * adding cal data * added TODOs * Remove log_occ_confogs.go * Remove testing files * source of configs - files * whitelist format change * code clean up * code review changes-1 * CR fixes * CR fixes * Delete tests/unittest/config_logging/main_test.go * clean up * Merge branch 'occ-config-logging' of /Users/simmidisetty/Documents/GitHub/OpenSourceHera/src/github.com/paypal/hera with conflicts. * test for config logging * removing test changes * tests for all cases * test * making minor changes for logging feature specific data * changes for incorporate review comments --------- Co-authored-by: satyakamala03 <128077872+satyakamala03@users.noreply.github.com> Co-authored-by: simmidisetty Co-authored-by: Rajesh S --- lib/config.go | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/config.go b/lib/config.go index 98e49c1f..49fc837d 100644 --- a/lib/config.go +++ b/lib/config.go @@ -24,6 +24,7 @@ import ( "github.com/paypal/hera/config" "github.com/paypal/hera/utility/logger" otelconfig "github.com/paypal/hera/utility/logger/otel/config" + "os" "path/filepath" "strings" From db14446809643fe1c649ad96f458845ae5908cad Mon Sep 17 00:00:00 2001 From: Rajesh Samala Date: Thu, 25 Jul 2024 17:03:24 +0530 Subject: [PATCH 09/19] Otel changes for adding host dimension (#93) * Occ config logging (#394) * occ configurations logging * cal event success * adding cal data * added TODOs * Remove log_occ_confogs.go * Remove testing files * source of configs - files * whitelist format change * code clean up * code review changes-1 * CR fixes * CR fixes * Delete tests/unittest/config_logging/main_test.go * clean up * Merge branch 'occ-config-logging' of /Users/simmidisetty/Documents/GitHub/OpenSourceHera/src/github.com/paypal/hera with conflicts.
* test for config logging * removing test changes * tests for all cases * test * making minor changes for logging feature specific data * changes for incorporate review comments --------- Co-authored-by: simmidisetty Co-authored-by: Rajesh S * adding host dimension in data-point level --------- Co-authored-by: satyakamala03 <128077872+satyakamala03@users.noreply.github.com> Co-authored-by: simmidisetty Co-authored-by: Rajesh S --- tests/unittest/otel_basic/main_test.go | 6 ++--- .../otel_remote_endpoint_tls/main_test.go | 12 ++------- utility/logger/otel/defs.go | 25 +++++++++++-------- utility/logger/otel/logger.go | 6 ++--- utility/logger/otel/state_logger.go | 15 +++++++++ 5 files changed, 38 insertions(+), 26 deletions(-) diff --git a/tests/unittest/otel_basic/main_test.go b/tests/unittest/otel_basic/main_test.go index 2a865bd1..7bfa327b 100644 --- a/tests/unittest/otel_basic/main_test.go +++ b/tests/unittest/otel_basic/main_test.go @@ -28,7 +28,7 @@ func cfg() (map[string]string, map[string]string, testutil.WorkerType) { appcfg["rac_sql_interval"] = "0" appcfg["child.executable"] = "mysqlworker" appcfg["enable_otel"] = "true" - appcfg["otel_resolution_time_in_sec"] = "10" + appcfg["otel_resolution_time_in_sec"] = "1" opscfg := make(map[string]string) opscfg["opscfg.default.server.max_connections"] = "3" opscfg["opscfg.default.server.log_level"] = "5" @@ -108,9 +108,9 @@ func TestOTELMetricsBasic(t *testing.T) { if count < 1 { t.Fatalf("OTEL event should contain application as hera-test") } - initCount := testutil.RegexCountFile("\"name\":\"pp.occ.init_connection.count\"", logFilePath) + initCount := testutil.RegexCountFile("\"name\":\"pp.occ.init_connection.cnt\"", logFilePath) if initCount < 1 { - t.Fatalf("OTEL event should contain metric name pp.occ.init_connection.count") + t.Fatalf("OTEL event should contain metric name pp.occ.init_connection.cnt") } tagsCount := testutil.RegexCountFile("{\"key\":\"InstanceId\",\"value\":{\"intValue\":\"0\"}},{\"key\":\"ShardId\",\"value\":{\"intValue\":\"0\"}},{\"key\":\"WorkerType\",\"value\":{\"intValue\":\"0\"}", logFilePath) diff --git a/tests/unittest/otel_remote_endpoint_tls/main_test.go b/tests/unittest/otel_remote_endpoint_tls/main_test.go index c018f01c..4d4f96f6 100644 --- a/tests/unittest/otel_remote_endpoint_tls/main_test.go +++ b/tests/unittest/otel_remote_endpoint_tls/main_test.go @@ -102,17 +102,16 @@ func TestOTELMetricsRemoteEndPointWithTLS(t *testing.T) { t.Fatalf("Expected 1 row") } - time.Sleep(10 * time.Second) + time.Sleep(15 * time.Second) rows.Close() stmt.Close() - time.Sleep(10 * time.Second) + time.Sleep(15 * time.Second) publishingErrors := testutil.RegexCountFile("otel publishing error", "hera.log") if publishingErrors > 1 { t.Fatalf("should not fail while publishing metrics to the remote host") } - time.Sleep(5 * time.Second) calPublishingErrors := testutil.RegexCountFile("failed to send metrics", "cal.log") if calPublishingErrors > 1 { t.Fatalf("should not fail while publishing metrics to the remote host") @@ -120,12 +119,5 @@ func TestOTELMetricsRemoteEndPointWithTLS(t *testing.T) { cancel() conn.Close() -<<<<<<< HEAD - - for counter := 0; counter < 1000; counter++ { - time.Sleep(10 * time.Second) - } -======= ->>>>>>> otel_logging_changes logger.GetLogger().Log(logger.Debug, "TestOTELMetricsRemoteEndPointWithTLS done -------------------------------------------------------------") } diff --git a/utility/logger/otel/defs.go b/utility/logger/otel/defs.go index 9d98bc11..ca8a7152 100644 --- a/utility/logger/otel/defs.go +++
b/utility/logger/otel/defs.go @@ -36,16 +36,19 @@ const ( - Target = string("target") - Endpoint = string("target_ip_port") - TLS_version = string("tls_version") - Application = string("Application") - ShardId = string("ShardId") - WorkerType = string("WorkerType") - InstanceId = string("InstanceId") - Datapoints = string("datapoints") - otelSource = string("otel") - OccWorkerParamName = string("occ_worker") + Target = string("target") + Endpoint = string("target_ip_port") + TLS_version = string("tls_version") + ApplicationDimName = string("application") + ShardId = string("ShardId") + WorkerType = string("WorkerType") + InstanceId = string("InstanceId") + Datapoints = string("datapoints") + OtelSourceName = string("source") + otelSource = string("otel") + OccWorkerParamName = string("occ_worker") + HostDimensionName = string("host") + ContainerHostDimName = string("container_host") ) const OtelInstrumentationVersion string = "v1.0" @@ -99,6 +102,8 @@ type StateLogMetrics struct { //Statelog metrics configuration data metricsConfig stateLogMetricsConfig + hostname string + meter metric.Meter //Channel to receive statelog data diff --git a/utility/logger/otel/logger.go b/utility/logger/otel/logger.go index 9e4f7a82..4e88b3d4 100644 --- a/utility/logger/otel/logger.go +++ b/utility/logger/otel/logger.go @@ -386,9 +386,9 @@ func getResourceInfo(appName string) *resource.Resource { // Create a slice to hold the attributes attributes := []attribute.KeyValue{ - attribute.String("container_host", hostname), - attribute.String("application", appName), - attribute.String("source", otelSource), + attribute.String(ContainerHostDimName, hostname), + attribute.String(ApplicationDimName, appName), + attribute.String(OtelSourceName, otelSource), } environment, isEnvPresent := os.LookupEnv("ENVIRONMENT") diff --git a/utility/logger/otel/state_logger.go b/utility/logger/otel/state_logger.go index 6787df77..6295f2e1 100644 --- a/utility/logger/otel/state_logger.go +++ b/utility/logger/otel/state_logger.go @@ -8,6 +8,7 @@ import ( "go.opentelemetry.io/otel" "go.opentelemetry.io/otel/attribute" "go.opentelemetry.io/otel/metric" + "os" "sync" "time" ) @@ -71,11 +72,16 @@ func StartMetricsCollection(totalWorkersCount int, opt ...StateLogOption) error var err error //Registers instrumentation for metrics registerStateMetrics.Do(func() { + hostName, hostErr := os.Hostname() + if hostErr != nil { + logger.GetLogger().Log(logger.Alert, "Failed to fetch hostname for current container", hostErr) + } //Initialize state-log metrics metricsStateLogger = &StateLogMetrics{ meter: stateLogMetricsConfig.MeterProvider.Meter(StateLogMeterName, metric.WithInstrumentationVersion(OtelInstrumentationVersion)), metricsConfig: stateLogMetricsConfig, + hostname: hostName, mStateDataChan: make(chan *WorkersStateData, totalWorkersCount*otelconfig.OTelConfigData.ResolutionTimeInSec*2), //currently OTEL polling interval hardcoded as 10. Size of buffered channel = totalWorkersCount * pollingInterval * 2, doneCh: make(chan struct{}), } @@ -344,10 +350,18 @@ mainloop: stateLogsData[keyName][key] = value //check and update max value + _, keyPresent := stateLogsData[keyName][maxKey] + if !keyPresent { + stateLogsData[keyName][maxKey] = value + } if stateLogsData[keyName][maxKey] < value { stateLogsData[keyName][maxKey] = value } //Min value + _, keyPresent = stateLogsData[keyName][minKey] + if !keyPresent { + stateLogsData[keyName][minKey] = value + } if stateLogsData[keyName][minKey] > value { stateLogsData[keyName][minKey] = value } @@ -380,6 +394,7 @@ func (stateLogMetrics *StateLogMetrics) sendMetricsDataToCollector(observer metr attribute.Int(WorkerType, int(aggStatesData[WorkerType])), attribute.Int(InstanceId, int(aggStatesData[InstanceId])), attribute.String(OccWorkerParamName, *stateLogTitle), + attribute.String(HostDimensionName, stateLogMetrics.hostname), } //Observe states data //1. Worker States
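The statelog channel above is buffered for roughly two polling intervals of worker samples so that the state-log writer does not block on a slow exporter. Reduced to a runnable sketch, with a trimmed stand-in for WorkersStateData and illustrative sizes:

package main

import "fmt"

// workersStateData is a trimmed stand-in for the patch's WorkersStateData.
type workersStateData struct {
	ShardId int
	States  map[string]int64
}

func main() {
	totalWorkers, resolutionSec := 4, 2
	// Size for about two resolution windows of samples, as in the patch.
	ch := make(chan *workersStateData, totalWorkers*resolutionSec*2)

	for i := 0; i < totalWorkers; i++ {
		select {
		case ch <- &workersStateData{ShardId: i, States: map[string]int64{"busy": 1}}:
		default:
			// Non-blocking send keeps the state logger from stalling.
			fmt.Println("channel full, dropping sample")
		}
	}
	close(ch)
	for sample := range ch {
		fmt.Println("shard", sample.ShardId, sample.States)
	}
}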
channel = totalWorkersCount * ResolutionTimeInSec * 2, doneCh: make(chan struct{}), } err = metricsStateLogger.register() @@ -344,10 +350,18 @@ mainloop: minKey := key + "Min" stateLogsData[keyName][key] = value //check max update max value + _, keyPresent := stateLogsData[keyName][maxKey] + if !keyPresent { + stateLogsData[keyName][maxKey] = value + } if stateLogsData[keyName][maxKey] < value { stateLogsData[keyName][maxKey] = value } //Min value + _, keyPresent = stateLogsData[keyName][minKey] + if !keyPresent { + stateLogsData[keyName][minKey] = value + } if stateLogsData[keyName][minKey] > value { stateLogsData[keyName][minKey] = value } @@ -380,6 +394,7 @@ func (stateLogMetrics *StateLogMetrics) sendMetricsDataToCollector(observer metr attribute.Int(WorkerType, int(aggStatesData[WorkerType])), attribute.Int(InstanceId, int(aggStatesData[InstanceId])), attribute.String(OccWorkerParamName, *stateLogTitle), + attribute.String(HostDimensionName, stateLogMetrics.hostname), } //Observe states data //1. Worker States From 16789daf21f06b9f25031a7234fd3e004d8170b6 Mon Sep 17 00:00:00 2001 From: Rajesh Samala Date: Wed, 31 Jul 2024 14:44:31 +0530 Subject: [PATCH 10/19] Otel dist metrics change (#96) * Occ config logging (#394) * occ configurations logging * cal event success * adding cal data * added TODOs * Remove log_occ_confogs.go * Remove testing files * source of configs - files * whitelist format change * code clean up * code review changes-1 * CR fixes * CR fixes * Delete tests/unittest/config_logging/main_test.go * clean up * Merge branch 'occ-config-logging' of /Users/simmidisetty/Documents/GitHub/OpenSourceHera/src/github.com/paypal/hera with conflicts. * test for config logging * removing test changes * tests for all cases * test * making minor changes for logging feature specific data * changes for incorporate review comments --------- Co-authored-by: simmidisetty Co-authored-by: Rajesh S * changes for otel integration in hera --------- Co-authored-by: satyakamala03 <128077872+satyakamala03@users.noreply.github.com> Co-authored-by: simmidisetty Co-authored-by: Rajesh S --- lib/statelog.go | 3 +- tests/unittest/otel_basic/main_test.go | 8 +- tests/unittest/testutil/main.go | 6 +- utility/logger/otel/defs.go | 84 ++---- utility/logger/otel/logger.go | 118 +++++++- utility/logger/otel/state_logger.go | 251 ++++-------------- utility/logger/otel/test/state_logger_test.go | 17 +- 7 files changed, 218 insertions(+), 269 deletions(-) diff --git a/lib/statelog.go b/lib/statelog.go index 717f2679..89d54c5b 100644 --- a/lib/statelog.go +++ b/lib/statelog.go @@ -19,6 +19,7 @@ package lib import ( "bytes" + "context" "errors" "fmt" otel_logger "github.com/paypal/hera/utility/logger/otel" @@ -571,7 +572,7 @@ func (sl *StateLog) init() error { if otelconfig.OTelConfigData.Enabled { // Initialize statelog_metrics to send metrics information currently we are ignoring registration object returned from this call - stateStartErr := otel_logger.StartMetricsCollection(totalWorkersCount, + stateStartErr := otel_logger.StartMetricsCollection(context.Background(), totalWorkersCount, otel_logger.WithMetricProvider(otel.GetMeterProvider()), otel_logger.WithAppName(otelconfig.OTelConfigData.PoolName)) diff --git a/tests/unittest/otel_basic/main_test.go b/tests/unittest/otel_basic/main_test.go index 7bfa327b..f8877584 100644 --- a/tests/unittest/otel_basic/main_test.go +++ b/tests/unittest/otel_basic/main_test.go @@ -28,9 +28,11 @@ func cfg() (map[string]string, map[string]string, testutil.WorkerType) { appcfg["rac_sql_interval"] = 
"0" appcfg["child.executable"] = "mysqlworker" appcfg["enable_otel"] = "true" - appcfg["otel_resolution_time_in_sec"] = "1" + appcfg["otel_resolution_time_in_sec"] = "10" opscfg := make(map[string]string) opscfg["opscfg.default.server.max_connections"] = "3" + appcfg["cfg_from_tns"] = "false" + appcfg["num_standby_dbs"] = "0" opscfg["opscfg.default.server.log_level"] = "5" os.Setenv("AVAILABILITY_ZONE", "test-dev") os.Setenv("ENVIRONMENT", "dev") @@ -108,9 +110,9 @@ func TestOTELMetricsBasic(t *testing.T) { if count < 1 { t.Fatalf("OTEL event should contain application as hera-test") } - initCount := testutil.RegexCountFile("\"name\":\"pp.occ.init_connection.cnt\"", logFilePath) + initCount := testutil.RegexCountFile("\"name\":\"pp.occ.init_connection\"", logFilePath) if initCount < 1 { - t.Fatalf("OTEL event should contain metric name pp.occ.init_connection.cnt") + t.Fatalf("OTEL event should contain metric name pp.occ.init_connection") } tagsCount := testutil.RegexCountFile("{\"key\":\"InstanceId\",\"value\":{\"intValue\":\"0\"}},{\"key\":\"ShardId\",\"value\":{\"intValue\":\"0\"}},{\"key\":\"WorkerType\",\"value\":{\"intValue\":\"0\"}", logFilePath) diff --git a/tests/unittest/testutil/main.go b/tests/unittest/testutil/main.go index 3e08310e..e3150abe 100644 --- a/tests/unittest/testutil/main.go +++ b/tests/unittest/testutil/main.go @@ -23,9 +23,9 @@ func setup(cfg cfgFunc) error { if appcfg["enable_otel"] == "true" { err = mx.StartOTelAgent() } - if err != nil { - return err - } + //if err != nil { + // return err + //} err = mx.StartServer() return err } diff --git a/utility/logger/otel/defs.go b/utility/logger/otel/defs.go index ca8a7152..4819e39d 100644 --- a/utility/logger/otel/defs.go +++ b/utility/logger/otel/defs.go @@ -9,30 +9,19 @@ import ( // Following Metric Names will get instrumented as part of StateLogMetrics const ( // Worker States - InitConnGuageMetric = "init_connection.cnt" - AccptConnGuageMetric = "accept_connection.cnt" - WaitConnGuageMetric = "wait_connection.cnt" - BusyConnGuageMetric = "busy_connection.cnt" - ScheduledConnGuageMetric = "scheduled_connection.cnt" - FinishedConnGuageMetric = "finished_connection.cnt" - QuiescedConnGuageMetric = "quiesced_connection.cnt" + InitConnMetric = "init_connection" + AccptConnMetric = "accept_connection" + WaitConnMetric = "wait_connection" + BusyConnMetric = "busy_connection" + ScheduledConnMetric = "scheduled_connection" + FinishedConnMetric = "finished_connection" + QuiescedConnMetric = "quiesced_connection" // Connection States - AssignedConnGuageMetric = "assigned_connection.cnt" - IdleConnGuageMetric = "idle_connection.cnt" - BacklogConnGuageMetric = "backlog_connection.cnt" - StrdConnGuageMetric = "stranded_connection.cnt" - - InitMaxGuageMetric = "init_connection.cnt.max" - AcceptMinGuageMetric = "accept_connection.cnt.min" - WaitMaxGuageMetric = "wait_connection.cnt.max" - BusyMaxGuageMetric = "busy_connection.cnt.max" - SchdMaxGuageMetric = "scheduled_connection.cnt.max" - QuiescedMaxGuageMetric = "quiesced_connection.cnt.max" - - IdleMaxGuageMetric = "idle_connection.cnt.max" - BacklogMaxGuageMetric = "backlog_connection.cnt.max" - StrdMaxGuageMetric = "stranded_connection.cnt.max" + AssignedConnMetric = "assigned_connection" + IdleConnMetric = "idle_connection" + BacklogConnMetric = "backlog_connection" + StrdConnMetric = "stranded_connection" ) const ( @@ -51,6 +40,9 @@ const ( ContainerHostDimName = string("container_host") ) +var StatelogBucket = []float64{0, 5, 10, 15, 20, 25, 30, 40, 50, 60, 80, 100, 120, 
160, 200} +var ConnectionStateBucket = []float64{0, 25, 50, 75, 100, 150, 200, 300, 400, 500, 600, 700, 800, 1200, 2400, 4800, 9600, 19200, 39400, 65536} + const OtelInstrumentationVersion string = "v1.0" // DEFAULT_OTEL_COLLECTOR_PROTOCOL default OTEL configurations point to QA collector @@ -84,18 +76,6 @@ type ( ServerType int ) -// StateData Represents stats by a worker -type StateData struct { - Name string - Value float64 - Dimensions metric.MeasurementOption -} - -type DataPoint struct { - attr metric.MeasurementOption - data int64 -} - // StateLogMetrics state_log_metrics reports workers states type StateLogMetrics struct { @@ -114,31 +94,17 @@ type StateLogMetrics struct { stateLock sync.Mutex - registration metric.Registration - - initState metric.Int64ObservableGauge - acptState metric.Int64ObservableGauge - waitState metric.Int64ObservableGauge - busyState metric.Int64ObservableGauge - schdState metric.Int64ObservableGauge - fnshState metric.Int64ObservableGauge - quceState metric.Int64ObservableGauge - asgnState metric.Int64ObservableGauge - idleState metric.Int64ObservableGauge - bklgState metric.Int64ObservableGauge - strdState metric.Int64ObservableGauge - - initStateMax metric.Int64ObservableGauge - waitStateMax metric.Int64ObservableGauge - busyStateMax metric.Int64ObservableGauge - schdStateMax metric.Int64ObservableGauge - quceStateMax metric.Int64ObservableGauge - - idleStateMax metric.Int64ObservableGauge - bklgStateMax metric.Int64ObservableGauge - strdStateMax metric.Int64ObservableGauge - - acptStateMin metric.Int64ObservableGauge + initState metric.Int64Histogram + acptState metric.Int64Histogram + waitState metric.Int64Histogram + busyState metric.Int64Histogram + schdState metric.Int64Histogram + fnshState metric.Int64Histogram + quceState metric.Int64Histogram + asgnState metric.Int64Histogram + idleState metric.Int64Histogram + bklgState metric.Int64Histogram + strdState metric.Int64Histogram } // Object represents the workers states data for worker belongs to specific shardId and workperType with flat-map diff --git a/utility/logger/otel/logger.go b/utility/logger/otel/logger.go index 4e88b3d4..2785659c 100644 --- a/utility/logger/otel/logger.go +++ b/utility/logger/otel/logger.go @@ -13,6 +13,7 @@ import ( "go.opentelemetry.io/otel/exporters/otlp/otlptrace" "go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc" "go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp" + "go.opentelemetry.io/otel/sdk/instrumentation" "go.opentelemetry.io/otel/sdk/metric" "go.opentelemetry.io/otel/sdk/metric/metricdata" "go.opentelemetry.io/otel/sdk/resource" @@ -116,15 +117,130 @@ func newMeterProvider(ctx context.Context) (*metric.MeterProvider, error) { logger.GetLogger().Log(logger.Alert, "failed to initialize metric exporter, error %v", err) return nil, err } - + metricViews := getStateLogMetricsViews() meterProvider := metric.NewMeterProvider( metric.WithResource(getResourceInfo(config.OTelConfigData.PoolName)), metric.WithReader(metric.NewPeriodicReader(metricExporter, metric.WithInterval(time.Duration(config.OTelConfigData.ResolutionTimeInSec)*time.Second))), + metric.WithView(metricViews...), ) return meterProvider, nil } +func getStateLogMetricsViews() []metric.View { + initView := metric.NewView( + metric.Instrument{ + Name: config.OTelConfigData.PopulateMetricNamePrefix(InitConnMetric), + Scope: instrumentation.Scope{Name: StateLogMeterName}, + }, + metric.Stream{ + Aggregation: metric.AggregationBase2ExponentialHistogram{MaxSize: 32, MaxScale: 
20}, + }, + ) + + acptStateView := metric.NewView( + metric.Instrument{ + Name: config.OTelConfigData.PopulateMetricNamePrefix(AccptConnMetric), + Scope: instrumentation.Scope{Name: StateLogMeterName}, + }, + metric.Stream{ + Aggregation: metric.AggregationBase2ExponentialHistogram{MaxSize: 32, MaxScale: 20}, + }, + ) + + waitStateView := metric.NewView( + metric.Instrument{ + Name: config.OTelConfigData.PopulateMetricNamePrefix(WaitConnMetric), + Scope: instrumentation.Scope{Name: StateLogMeterName}, + }, + metric.Stream{ + Aggregation: metric.AggregationBase2ExponentialHistogram{MaxSize: 32, MaxScale: 20}, + }, + ) + + busyStateView := metric.NewView( + metric.Instrument{ + Name: config.OTelConfigData.PopulateMetricNamePrefix(BusyConnMetric), + Scope: instrumentation.Scope{Name: StateLogMeterName}, + }, + metric.Stream{ + Aggregation: metric.AggregationBase2ExponentialHistogram{MaxSize: 32, MaxScale: 20}, + }, + ) + + schdStateView := metric.NewView( + metric.Instrument{ + Name: config.OTelConfigData.PopulateMetricNamePrefix(ScheduledConnMetric), + Scope: instrumentation.Scope{Name: StateLogMeterName}, + }, + metric.Stream{ + Aggregation: metric.AggregationBase2ExponentialHistogram{MaxSize: 32, MaxScale: 20}, + }, + ) + + fnshStateView := metric.NewView( + metric.Instrument{ + Name: config.OTelConfigData.PopulateMetricNamePrefix(FinishedConnMetric), + Scope: instrumentation.Scope{Name: StateLogMeterName}, + }, + metric.Stream{ + Aggregation: metric.AggregationBase2ExponentialHistogram{MaxSize: 32, MaxScale: 20}, + }, + ) + + quceStateView := metric.NewView( + metric.Instrument{ + Name: config.OTelConfigData.PopulateMetricNamePrefix(QuiescedConnMetric), + Scope: instrumentation.Scope{Name: StateLogMeterName}, + }, + metric.Stream{ + Aggregation: metric.AggregationBase2ExponentialHistogram{MaxSize: 32, MaxScale: 20}, + }, + ) + + asgnStateView := metric.NewView( + metric.Instrument{ + Name: config.OTelConfigData.PopulateMetricNamePrefix(AssignedConnMetric), + Scope: instrumentation.Scope{Name: StateLogMeterName}, + }, + metric.Stream{ + Aggregation: metric.AggregationBase2ExponentialHistogram{MaxSize: 32, MaxScale: 20}, + }, + ) + + idleStateView := metric.NewView( + metric.Instrument{ + Name: config.OTelConfigData.PopulateMetricNamePrefix(IdleConnMetric), + Scope: instrumentation.Scope{Name: StateLogMeterName}, + }, + metric.Stream{ + Aggregation: metric.AggregationBase2ExponentialHistogram{MaxSize: 32, MaxScale: 20}, + }, + ) + + bklgStateView := metric.NewView( + metric.Instrument{ + Name: config.OTelConfigData.PopulateMetricNamePrefix(BacklogConnMetric), + Scope: instrumentation.Scope{Name: StateLogMeterName}, + }, + metric.Stream{ + Aggregation: metric.AggregationBase2ExponentialHistogram{MaxSize: 32, MaxScale: 20}, + }, + ) + + strdStateView := metric.NewView( + metric.Instrument{ + Name: config.OTelConfigData.PopulateMetricNamePrefix(StrdConnMetric), + Scope: instrumentation.Scope{Name: StateLogMeterName}, + }, + metric.Stream{ + Aggregation: metric.AggregationBase2ExponentialHistogram{MaxSize: 32, MaxScale: 20}, + }, + ) + return []metric.View{initView, acptStateView, waitStateView, busyStateView, schdStateView, + fnshStateView, quceStateView, asgnStateView, idleStateView, bklgStateView, strdStateView} +} + // getMetricExporter Initialize metric exporter based protocol selected by user. 
func getMetricExporter(ctx context.Context) (metric.Exporter, error) { if config.OTelConfigData.OtelMetricGRPC { diff --git a/utility/logger/otel/state_logger.go b/utility/logger/otel/state_logger.go index 6295f2e1..f2d4922a 100644 --- a/utility/logger/otel/state_logger.go +++ b/utility/logger/otel/state_logger.go @@ -57,7 +57,7 @@ func newConfig(opts ...StateLogOption) stateLogMetricsConfig { } // StartMetricsCollection initializes reporting of stateLogMetrics using the supplied config. -func StartMetricsCollection(totalWorkersCount int, opt ...StateLogOption) error { +func StartMetricsCollection(ctx context.Context, totalWorkersCount int, opt ...StateLogOption) error { stateLogMetricsConfig := newConfig(opt...) //Verification of config data @@ -86,6 +86,11 @@ func StartMetricsCollection(totalWorkersCount int, opt ...StateLogOption) error doneCh: make(chan struct{}), } err = metricsStateLogger.register() + if err != nil { + logger.GetLogger().Log(logger.Alert, "Failed to register state metrics collector", err) + } else { + go metricsStateLogger.startStateLogMetricsPoll(ctx) + } }) return err } @@ -108,6 +113,13 @@ func AddDataPointToOTELStateDataChan(dataPoint *WorkersStateData) { return case <-time.After(time.Millisecond * 100): logger.GetLogger().Log(logger.Alert, "timeout occurred while adding record to stats data channel") + default: + select { + case metricsStateLogger.mStateDataChan <- dataPoint: + return + default: + logger.GetLogger().Log(logger.Alert, "metricsStateLogger.mStateData channel closed or full while sending data") + } } } @@ -116,195 +128,93 @@ func (stateLogMetrics *StateLogMetrics) register() error { //"init", "acpt", "wait", "busy", "schd", "fnsh", "quce", "asgn", "idle", "bklg", "strd", "cls" var err error - if stateLogMetrics.initState, err = stateLogMetrics.meter.Int64ObservableGauge( - otelconfig.OTelConfigData.PopulateMetricNamePrefix(InitConnGuageMetric), + if stateLogMetrics.initState, err = stateLogMetrics.meter.Int64Histogram( + otelconfig.OTelConfigData.PopulateMetricNamePrefix(InitConnMetric), metric.WithDescription("Number of workers in init state"), ); err != nil { logger.GetLogger().Log(logger.Alert, "Failed to register guage metric for init state", err) return err } - if stateLogMetrics.acptState, err = stateLogMetrics.meter.Int64ObservableGauge( - otelconfig.OTelConfigData.PopulateMetricNamePrefix(AccptConnGuageMetric), + if stateLogMetrics.acptState, err = stateLogMetrics.meter.Int64Histogram( + otelconfig.OTelConfigData.PopulateMetricNamePrefix(AccptConnMetric), metric.WithDescription("Number of workers in accept state"), ); err != nil { logger.GetLogger().Log(logger.Alert, "Failed to register guage metric for accept state", err) return err } - if stateLogMetrics.waitState, err = stateLogMetrics.meter.Int64ObservableGauge( - otelconfig.OTelConfigData.PopulateMetricNamePrefix(WaitConnGuageMetric), + if stateLogMetrics.waitState, err = stateLogMetrics.meter.Int64Histogram( + otelconfig.OTelConfigData.PopulateMetricNamePrefix(WaitConnMetric), metric.WithDescription("Number of workers in wait state"), ); err != nil { logger.GetLogger().Log(logger.Alert, "Failed to register guage metric for wait state", err) return err } - if stateLogMetrics.busyState, err = stateLogMetrics.meter.Int64ObservableGauge( - otelconfig.OTelConfigData.PopulateMetricNamePrefix(BusyConnGuageMetric), + if stateLogMetrics.busyState, err = stateLogMetrics.meter.Int64Histogram( + otelconfig.OTelConfigData.PopulateMetricNamePrefix(BusyConnMetric), metric.WithDescription("Number of 
workers in busy state"), ); err != nil { logger.GetLogger().Log(logger.Alert, "Failed to register guage metric for busy state", err) return err } - if stateLogMetrics.schdState, err = stateLogMetrics.meter.Int64ObservableGauge( - otelconfig.OTelConfigData.PopulateMetricNamePrefix(ScheduledConnGuageMetric), + if stateLogMetrics.schdState, err = stateLogMetrics.meter.Int64Histogram( + otelconfig.OTelConfigData.PopulateMetricNamePrefix(ScheduledConnMetric), metric.WithDescription("Number of workers in scheduled state"), ); err != nil { logger.GetLogger().Log(logger.Alert, "Failed to register guage metric for scheduled state", err) return err } - if stateLogMetrics.fnshState, err = stateLogMetrics.meter.Int64ObservableGauge( - otelconfig.OTelConfigData.PopulateMetricNamePrefix(FinishedConnGuageMetric), + if stateLogMetrics.fnshState, err = stateLogMetrics.meter.Int64Histogram( + otelconfig.OTelConfigData.PopulateMetricNamePrefix(FinishedConnMetric), metric.WithDescription("Number of workers in finished state"), ); err != nil { logger.GetLogger().Log(logger.Alert, "Failed to register guage metric for finished state", err) return err } - if stateLogMetrics.quceState, err = stateLogMetrics.meter.Int64ObservableGauge( - otelconfig.OTelConfigData.PopulateMetricNamePrefix(QuiescedConnGuageMetric), + if stateLogMetrics.quceState, err = stateLogMetrics.meter.Int64Histogram( + otelconfig.OTelConfigData.PopulateMetricNamePrefix(QuiescedConnMetric), metric.WithDescription("Number of workers in quiesced state"), ); err != nil { logger.GetLogger().Log(logger.Alert, "Failed to register guage metric for quiesced state", err) return err } - if stateLogMetrics.asgnState, err = stateLogMetrics.meter.Int64ObservableGauge( - otelconfig.OTelConfigData.PopulateMetricNamePrefix(AssignedConnGuageMetric), + if stateLogMetrics.asgnState, err = stateLogMetrics.meter.Int64Histogram( + otelconfig.OTelConfigData.PopulateMetricNamePrefix(AssignedConnMetric), metric.WithDescription("Number of workers in assigned state"), ); err != nil { logger.GetLogger().Log(logger.Alert, "Failed to register guage metric for assigned state", err) return err } - if stateLogMetrics.idleState, err = stateLogMetrics.meter.Int64ObservableGauge( - otelconfig.OTelConfigData.PopulateMetricNamePrefix(IdleConnGuageMetric), + if stateLogMetrics.idleState, err = stateLogMetrics.meter.Int64Histogram( + otelconfig.OTelConfigData.PopulateMetricNamePrefix(IdleConnMetric), metric.WithDescription("Number of workers in idle state"), ); err != nil { logger.GetLogger().Log(logger.Alert, "Failed to register guage metric for idle state", err) return err } - if stateLogMetrics.bklgState, err = stateLogMetrics.meter.Int64ObservableGauge( - otelconfig.OTelConfigData.PopulateMetricNamePrefix(BacklogConnGuageMetric), + if stateLogMetrics.bklgState, err = stateLogMetrics.meter.Int64Histogram( + otelconfig.OTelConfigData.PopulateMetricNamePrefix(BacklogConnMetric), metric.WithDescription("Number of workers in backlog state"), ); err != nil { logger.GetLogger().Log(logger.Alert, "Failed to register guage metric for backlog state", err) return err } - if stateLogMetrics.strdState, err = stateLogMetrics.meter.Int64ObservableGauge( - otelconfig.OTelConfigData.PopulateMetricNamePrefix(StrdConnGuageMetric), + if stateLogMetrics.strdState, err = stateLogMetrics.meter.Int64Histogram( + otelconfig.OTelConfigData.PopulateMetricNamePrefix(StrdConnMetric), metric.WithDescription("Number of connections in stranded state"), ); err != nil { logger.GetLogger().Log(logger.Alert, "Failed 
to register guage metric for stranded state", err) return err } - //Initialize max metrics - if stateLogMetrics.initStateMax, err = stateLogMetrics.meter.Int64ObservableGauge( - otelconfig.OTelConfigData.PopulateMetricNamePrefix(InitMaxGuageMetric), - metric.WithDescription("Maximum Number of workers in init state within resolution time"), - ); err != nil { - logger.GetLogger().Log(logger.Alert, "Failed to register guage metric for init max state", err) - return err - } - - if stateLogMetrics.waitStateMax, err = stateLogMetrics.meter.Int64ObservableGauge( - otelconfig.OTelConfigData.PopulateMetricNamePrefix(WaitMaxGuageMetric), - metric.WithDescription("Maximum Number of workers in wait state within resolution time"), - ); err != nil { - logger.GetLogger().Log(logger.Alert, "Failed to register guage metric for max wait state", err) - return err - } - - if stateLogMetrics.busyStateMax, err = stateLogMetrics.meter.Int64ObservableGauge( - otelconfig.OTelConfigData.PopulateMetricNamePrefix(BusyMaxGuageMetric), - metric.WithDescription("Maximum Number of workers in busy state within resolution time"), - ); err != nil { - logger.GetLogger().Log(logger.Alert, "Failed to register guage metric for max busy state", err) - return err - } - - if stateLogMetrics.schdStateMax, err = stateLogMetrics.meter.Int64ObservableGauge( - otelconfig.OTelConfigData.PopulateMetricNamePrefix(SchdMaxGuageMetric), - metric.WithDescription("Maximum Number of workers in scheduled state within resolution time"), - ); err != nil { - logger.GetLogger().Log(logger.Alert, "Failed to register guage metric for scheduled state", err) - return err - } - - if stateLogMetrics.quceStateMax, err = stateLogMetrics.meter.Int64ObservableGauge( - otelconfig.OTelConfigData.PopulateMetricNamePrefix(QuiescedMaxGuageMetric), - metric.WithDescription("Maximum Number of workers in quiesced state within resolution time"), - ); err != nil { - logger.GetLogger().Log(logger.Alert, "Failed to register guage metric for quiesced state", err) - return err - } - - if stateLogMetrics.idleStateMax, err = stateLogMetrics.meter.Int64ObservableGauge( - otelconfig.OTelConfigData.PopulateMetricNamePrefix(IdleMaxGuageMetric), - metric.WithDescription("Maximum Number of client connections in idle state within resolution time"), - ); err != nil { - logger.GetLogger().Log(logger.Alert, "Failed to register guage metric for max idle state", err) - return err - } - - if stateLogMetrics.bklgStateMax, err = stateLogMetrics.meter.Int64ObservableGauge( - otelconfig.OTelConfigData.PopulateMetricNamePrefix(BacklogMaxGuageMetric), - metric.WithDescription("Maximum Number of client connections in backlog state within resolution time"), - ); err != nil { - logger.GetLogger().Log(logger.Alert, "Failed to register guage metric for max backlog state", err) - return err - } - - if stateLogMetrics.strdStateMax, err = stateLogMetrics.meter.Int64ObservableGauge( - otelconfig.OTelConfigData.PopulateMetricNamePrefix(StrdMaxGuageMetric), - metric.WithDescription("Maximum Number of client connections in idle state within resolution time"), - ); err != nil { - logger.GetLogger().Log(logger.Alert, "Failed to register guage metric for max stranded state", err) - return err - } - - //Initialize min for accpet - if stateLogMetrics.acptStateMin, err = stateLogMetrics.meter.Int64ObservableGauge( - otelconfig.OTelConfigData.PopulateMetricNamePrefix(AcceptMinGuageMetric), - metric.WithDescription("Minimum Number of workers in accept state within resolution time"), - ); err != nil { - 
logger.GetLogger().Log(logger.Alert, "Failed to register guage metric for min accept state", err) - return err - } - - stateLogMetrics.registration, err = stateLogMetrics.meter.RegisterCallback( - func(ctx context.Context, observer metric.Observer) error { - return stateLogMetrics.asyncStateLogMetricsPoll(observer) - }, - []metric.Observable{ - stateLogMetrics.initState, - stateLogMetrics.acptState, - stateLogMetrics.waitState, - stateLogMetrics.busyState, - stateLogMetrics.schdState, - stateLogMetrics.fnshState, - stateLogMetrics.quceState, - stateLogMetrics.asgnState, - stateLogMetrics.idleState, - stateLogMetrics.bklgState, - stateLogMetrics.strdState, - - stateLogMetrics.initStateMax, //Max - stateLogMetrics.waitStateMax, - stateLogMetrics.busyStateMax, - stateLogMetrics.schdStateMax, - stateLogMetrics.quceStateMax, - stateLogMetrics.idleStateMax, - stateLogMetrics.bklgStateMax, - stateLogMetrics.strdStateMax, - - stateLogMetrics.acptStateMin, //Min - }...) if err != nil { return err @@ -313,19 +223,16 @@ func (stateLogMetrics *StateLogMetrics) register() error { } /* - * AasyncStatelogMetricsPoll poll operation involved periodically by OTEL collector based-on its polling interval + * startStateLogMetricsPoll runs as a goroutine started from StartMetricsCollection * it polls metrics from the channel and aggregates them by the combination of shardId + workerType + InstanceId */ -func (stateLogMetrics *StateLogMetrics) asyncStateLogMetricsPoll(observer metric.Observer) (err error) { - stateLogMetrics.stateLock.Lock() - defer stateLogMetrics.stateLock.Unlock() - stateLogsData := make(map[string]map[string]int64) - var stateLogTitle string - //Infinite loop read through the channel and send metrics +func (stateLogMetrics *StateLogMetrics) startStateLogMetricsPoll(ctx context.Context) { mainloop: for { select { case workersState, more := <-stateLogMetrics.mStateDataChan: + stateLogsData := make(map[string]map[string]int64) + var stateLogTitle string if !more { logger.GetLogger().Log(logger.Info, "Statelog metrics data channel 'mStateDataChan' has been closed.") break mainloop @@ -341,52 +248,28 @@ mainloop: stateLogsData[keyName][WorkerType] = int64(workersState.WorkerType) stateLogsData[keyName][InstanceId] = int64(workersState.InstanceId) stateLogsData[keyName][Datapoints] += 1 - for key, value := range workersState.StateData { - if key == "req" || key == "resp" { - stateLogsData[keyName][key] += value - } else { - maxKey := key + "Max" - minKey := key + "Min" - stateLogsData[keyName][key] = value - //check max update max value - _, keyPresent := stateLogsData[keyName][maxKey] - if !keyPresent { - stateLogsData[keyName][maxKey] = value - } - if stateLogsData[keyName][maxKey] < value { - stateLogsData[keyName][maxKey] = value - } - //Min value - _, keyPresent = stateLogsData[keyName][minKey] - if !keyPresent { - stateLogsData[keyName][minKey] = value - } - if stateLogsData[keyName][minKey] > value { - stateLogsData[keyName][minKey] = value - } - } + stateLogsData[keyName][key] = value + } + if len(stateLogsData) > 0 { + stateLogMetrics.sendMetricsDataToCollector(ctx, &stateLogTitle, stateLogsData) } case <-stateLogMetrics.doneCh: logger.GetLogger().Log(logger.Info, "received stopped signal for processing statelog metric. 
"+ - "so unregistering callback for sending data and closing data channel") + "so stop sending data and closing data channel") close(stateLogMetrics.mStateDataChan) - stateLogMetrics.registration.Unregister() - default: break mainloop + case <-time.After(1000 * time.Millisecond): + logger.GetLogger().Log(logger.Info, "timeout on waiting for statelog metrics data") + continue mainloop } } - //Process metrics data - if len(stateLogsData) > 0 { - err = stateLogMetrics.sendMetricsDataToCollector(observer, &stateLogTitle, stateLogsData) - } - return err } /* * Send metrics datat data-points to collector */ -func (stateLogMetrics *StateLogMetrics) sendMetricsDataToCollector(observer metric.Observer, stateLogTitle *string, stateLogsData map[string]map[string]int64) (err error) { +func (stateLogMetrics *StateLogMetrics) sendMetricsDataToCollector(ctx context.Context, stateLogTitle *string, stateLogsData map[string]map[string]int64) { for key, aggStatesData := range stateLogsData { logger.GetLogger().Log(logger.Info, fmt.Sprintf("publishing metric with calculated max value and aggregation of gauge for shardid-workertype-instanceId: %s using datapoints size: %d", key, aggStatesData[Datapoints])) commonLabels := []attribute.KeyValue{ @@ -398,34 +281,18 @@ func (stateLogMetrics *StateLogMetrics) sendMetricsDataToCollector(observer metr } //Observe states data //1. Worker States - observer.ObserveInt64(stateLogMetrics.initState, aggStatesData["init"], metric.WithAttributes(commonLabels...)) - observer.ObserveInt64(stateLogMetrics.acptState, aggStatesData["acpt"], metric.WithAttributes(commonLabels...)) - observer.ObserveInt64(stateLogMetrics.waitState, aggStatesData["wait"], metric.WithAttributes(commonLabels...)) - observer.ObserveInt64(stateLogMetrics.busyState, aggStatesData["busy"], metric.WithAttributes(commonLabels...)) - observer.ObserveInt64(stateLogMetrics.schdState, aggStatesData["schd"], metric.WithAttributes(commonLabels...)) - observer.ObserveInt64(stateLogMetrics.fnshState, aggStatesData["fnsh"], metric.WithAttributes(commonLabels...)) - observer.ObserveInt64(stateLogMetrics.quceState, aggStatesData["quce"], metric.WithAttributes(commonLabels...)) + stateLogMetrics.initState.Record(ctx, aggStatesData["init"], metric.WithAttributes(commonLabels...)) + stateLogMetrics.acptState.Record(ctx, aggStatesData["acpt"], metric.WithAttributes(commonLabels...)) + stateLogMetrics.waitState.Record(ctx, aggStatesData["wait"], metric.WithAttributes(commonLabels...)) + stateLogMetrics.busyState.Record(ctx, aggStatesData["busy"], metric.WithAttributes(commonLabels...)) + stateLogMetrics.schdState.Record(ctx, aggStatesData["schd"], metric.WithAttributes(commonLabels...)) + stateLogMetrics.fnshState.Record(ctx, aggStatesData["fnsh"], metric.WithAttributes(commonLabels...)) + stateLogMetrics.quceState.Record(ctx, aggStatesData["quce"], metric.WithAttributes(commonLabels...)) //2. Connection States - observer.ObserveInt64(stateLogMetrics.asgnState, aggStatesData["asgn"], metric.WithAttributes(commonLabels...)) - observer.ObserveInt64(stateLogMetrics.idleState, aggStatesData["idle"], metric.WithAttributes(commonLabels...)) - observer.ObserveInt64(stateLogMetrics.bklgState, aggStatesData["bklg"], metric.WithAttributes(commonLabels...)) - observer.ObserveInt64(stateLogMetrics.strdState, aggStatesData["strd"], metric.WithAttributes(commonLabels...)) - - //3. 
Worker States Max values - observer.ObserveInt64(stateLogMetrics.initStateMax, aggStatesData["initMax"], metric.WithAttributes(commonLabels...)) - observer.ObserveInt64(stateLogMetrics.waitStateMax, aggStatesData["waitMax"], metric.WithAttributes(commonLabels...)) - observer.ObserveInt64(stateLogMetrics.busyStateMax, aggStatesData["busyMax"], metric.WithAttributes(commonLabels...)) - observer.ObserveInt64(stateLogMetrics.schdStateMax, aggStatesData["schdMax"], metric.WithAttributes(commonLabels...)) - observer.ObserveInt64(stateLogMetrics.quceStateMax, aggStatesData["quceMax"], metric.WithAttributes(commonLabels...)) - - //4. Connection States Max values - observer.ObserveInt64(stateLogMetrics.idleStateMax, aggStatesData["idleMax"], metric.WithAttributes(commonLabels...)) - observer.ObserveInt64(stateLogMetrics.bklgStateMax, aggStatesData["bklgMax"], metric.WithAttributes(commonLabels...)) - observer.ObserveInt64(stateLogMetrics.strdStateMax, aggStatesData["strdMax"], metric.WithAttributes(commonLabels...)) - - //5. Min accept state - observer.ObserveInt64(stateLogMetrics.acptStateMin, aggStatesData["acptMin"], metric.WithAttributes(commonLabels...)) + stateLogMetrics.asgnState.Record(ctx, aggStatesData["asgn"], metric.WithAttributes(commonLabels...)) + stateLogMetrics.idleState.Record(ctx, aggStatesData["idle"], metric.WithAttributes(commonLabels...)) + stateLogMetrics.bklgState.Record(ctx, aggStatesData["bklg"], metric.WithAttributes(commonLabels...)) + stateLogMetrics.strdState.Record(ctx, aggStatesData["strd"], metric.WithAttributes(commonLabels...)) } - return nil } diff --git a/utility/logger/otel/test/state_logger_test.go b/utility/logger/otel/test/state_logger_test.go index f994738c..87c455e8 100644 --- a/utility/logger/otel/test/state_logger_test.go +++ b/utility/logger/otel/test/state_logger_test.go @@ -93,7 +93,7 @@ func TestVerifyStateLogMetricsInitilization(t *testing.T) { t.Fail() } - err = otellogger.StartMetricsCollection(5, otellogger.WithMetricProvider(otel.GetMeterProvider()), otellogger.WithAppName("occ-testapp")) + err = otellogger.StartMetricsCollection(context.Background(), 5, otellogger.WithMetricProvider(otel.GetMeterProvider()), otellogger.WithAppName("occ-testapp")) if err != nil { logger.GetLogger().Log(logger.Alert, "Failed to initialize Metric Collection service") @@ -116,7 +116,7 @@ func TestVerifyStateLogMetricsInitilizationAndContextWithTimeout(t *testing.T) { t.Fail() } - err = otellogger.StartMetricsCollection(5, otellogger.WithMetricProvider(otel.GetMeterProvider()), otellogger.WithAppName("occ-testapp")) + err = otellogger.StartMetricsCollection(context.Background(), 5, otellogger.WithMetricProvider(otel.GetMeterProvider()), otellogger.WithAppName("occ-testapp")) defer otellogger.StopMetricCollection() if err != nil { @@ -137,7 +137,7 @@ func TestSendingStateLogMetrics(t *testing.T) { time.Sleep(2 * time.Second) - err := otellogger.StartMetricsCollection(5, otellogger.WithMetricProvider(otel.GetMeterProvider()), otellogger.WithAppName("occ-testapp")) + err := otellogger.StartMetricsCollection(context.Background(), 5, otellogger.WithMetricProvider(otel.GetMeterProvider()), otellogger.WithAppName("occ-testapp")) if err != nil { logger.GetLogger().Log(logger.Alert, "Failed to initialize Metric Collection service") @@ -167,8 +167,8 @@ func TestSendingStateLogMetrics(t *testing.T) { logger.GetLogger().Log(logger.Info, "Data Sent successfully for instrumentation") time.Sleep(5 * time.Second) metricsData := mc.GetMetrics() - if len(metricsData) < 20 { - 
t.Fatalf("got %d, wanted %d", len(metricsData), 20) + if len(metricsData) < 11 { + t.Fatalf("got %d, wanted %d", len(metricsData), 11) } } @@ -178,7 +178,7 @@ func TestSendingStateLogMetricsConsoleExporter(t *testing.T) { t.Fail() } - err2 := otellogger.StartMetricsCollection(100, otellogger.WithMetricProvider(otel.GetMeterProvider()), otellogger.WithAppName("occ-testapp2")) + err2 := otellogger.StartMetricsCollection(context.Background(), 100, otellogger.WithMetricProvider(otel.GetMeterProvider()), otellogger.WithAppName("occ-testapp2")) if err2 != nil { logger.GetLogger().Log(logger.Alert, "Failed to initialize Metric Collection service") @@ -268,7 +268,7 @@ func TestOCCStatelogGenerator(t *testing.T) { } defer cont.Shutdown(context.Background()) - err2 := otellogger.StartMetricsCollection(1000, otellogger.WithMetricProvider(otel.GetMeterProvider()), otellogger.WithAppName("occ-testapp")) + err2 := otellogger.StartMetricsCollection(context.Background(), 1000, otellogger.WithMetricProvider(otel.GetMeterProvider()), otellogger.WithAppName("occ-testapp")) defer otellogger.StopMetricCollection() go dataGenerator() @@ -285,7 +285,6 @@ func dataGenerator() { waitTime := time.Second * 1 metricNames := [11]string{"init", "acpt", "wait", "busy", "schd", "fnsh", "quce", "asgn", "idle", "bklg", "strd"} - workerStates := [2]string{"req", "resp"} timer := time.NewTimer(waitTime) @@ -316,8 +315,6 @@ mainloop: //Random index randIndex := rand.Intn(len(metricNames)) workerStatesData.StateData[metricNames[randIndex]] += int64(totalSum - tempSum) - workerStatesData.StateData[workerStates[0]] = int64(rand.Intn(100)) - workerStatesData.StateData[workerStates[1]] = int64(rand.Intn(100)) otellogger.AddDataPointToOTELStateDataChan(&workerStatesData) timer.Reset(waitTime) case <-ctx.Done(): From be68655425efa5f8946b6ad55790675ffe74a69c Mon Sep 17 00:00:00 2001 From: Rajesh Samala Date: Fri, 2 Aug 2024 13:07:32 +0530 Subject: [PATCH 11/19] Otel incorporate review comments and uts (#98) * Occ config logging (#394) * occ configurations logging * cal event success * adding cal data * added TODOs * Remove log_occ_confogs.go * Remove testing files * source of configs - files * whitelist format change * code clean up * code review changes-1 * CR fixes * CR fixes * Delete tests/unittest/config_logging/main_test.go * clean up * Merge branch 'occ-config-logging' of /Users/simmidisetty/Documents/GitHub/OpenSourceHera/src/github.com/paypal/hera with conflicts. 
* test for config logging * removing test changes * tests for all cases * test * making minor changes for logging feature specific data * changes for incorporate review comments --------- Co-authored-by: simmidisetty Co-authored-by: Rajesh S * changes for incorporate review comments --------- Co-authored-by: satyakamala03 <128077872+satyakamala03@users.noreply.github.com> Co-authored-by: simmidisetty Co-authored-by: Rajesh S --- go.mod | 3 - go.sum | 7 - utility/logger/otel/defs.go | 7 + utility/logger/otel/logger.go | 150 ------------------ utility/logger/otel/state_logger.go | 5 +- utility/logger/otel/test/state_logger_test.go | 8 +- 6 files changed, 12 insertions(+), 168 deletions(-) diff --git a/go.mod b/go.mod index 8183674e..dd63842c 100644 --- a/go.mod +++ b/go.mod @@ -9,9 +9,6 @@ require ( go.opentelemetry.io/otel v1.24.0 go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.24.0 go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp v1.24.0 - go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.24.0 - go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.24.0 - go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.24.0 go.opentelemetry.io/otel/exporters/stdout/stdoutmetric v1.24.0 go.opentelemetry.io/otel/metric v1.24.0 go.opentelemetry.io/otel/sdk v1.24.0 diff --git a/go.sum b/go.sum index f59b3698..0a629aea 100644 --- a/go.sum +++ b/go.sum @@ -27,12 +27,6 @@ go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.24.0 h1:f2j go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.24.0/go.mod h1:B+bcQI1yTY+N0vqMpoZbEN7+XU4tNM0DmUiOwebFJWI= go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp v1.24.0 h1:mM8nKi6/iFQ0iqst80wDHU2ge198Ye/TfN0WBS5U24Y= go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp v1.24.0/go.mod h1:0PrIIzDteLSmNyxqcGYRL4mDIo8OTuBAOI/Bn1URxac= -go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.24.0 h1:t6wl9SPayj+c7lEIFgm4ooDBZVb01IhLB4InpomhRw8= -go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.24.0/go.mod h1:iSDOcsnSA5INXzZtwaBPrKp/lWu/V14Dd+llD0oI2EA= -go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.24.0 h1:Mw5xcxMwlqoJd97vwPxA8isEaIoxsta9/Q51+TTJLGE= -go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.24.0/go.mod h1:CQNu9bj7o7mC6U7+CA/schKEYakYXWr79ucDHTMGhCM= -go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.24.0 h1:Xw8U6u2f8DK2XAkGRFV7BBLENgnTGX9i4rQRxJf+/vs= -go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.24.0/go.mod h1:6KW1Fm6R/s6Z3PGXwSJN2K4eT6wQB3vXX6CVnYX9NmM= go.opentelemetry.io/otel/exporters/stdout/stdoutmetric v1.24.0 h1:JYE2HM7pZbOt5Jhk8ndWZTUWYOVift2cHjXVMkPdmdc= go.opentelemetry.io/otel/exporters/stdout/stdoutmetric v1.24.0/go.mod h1:yMb/8c6hVsnma0RpsBMNo0fEiQKeclawtgaIaOp2MLY= go.opentelemetry.io/otel/metric v1.24.0 h1:6EhoGWWK28x1fbpA4tYTOWBkPefTDQnb8WSGXlc88kI= @@ -45,7 +39,6 @@ go.opentelemetry.io/otel/trace v1.24.0 h1:CsKnnL4dUAr/0llH9FKuc698G04IrpWV0MQA/Y go.opentelemetry.io/otel/trace v1.24.0/go.mod h1:HPc3Xr/cOApsBI154IU0OI0HJexz+aw5uPdbs3UCjNU= go.opentelemetry.io/proto/otlp v1.2.0 h1:pVeZGk7nXDC9O2hncA6nHldxEjm6LByfA2aN8IOkz94= go.opentelemetry.io/proto/otlp v1.2.0/go.mod h1:gGpR8txAl5M03pDhMC79G6SdqNV26naRm/KDsgaHD8A= -go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= golang.org/x/net v0.26.0 h1:soB7SVo0PWrY4vPW/+ay0jKDNScG2X9wFeYlXIvJsOQ= golang.org/x/net v0.26.0/go.mod h1:5YKkiSynbBIh3p6iOc/vibscux0x38BZDkn8sCUPxHE= 
golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= diff --git a/utility/logger/otel/defs.go b/utility/logger/otel/defs.go index 4819e39d..088a52aa 100644 --- a/utility/logger/otel/defs.go +++ b/utility/logger/otel/defs.go @@ -43,6 +43,13 @@ const ( var StatelogBucket = []float64{0, 5, 10, 15, 20, 25, 30, 40, 50, 60, 80, 100, 120, 160, 200} var ConnectionStateBucket = []float64{0, 25, 50, 75, 100, 150, 200, 300, 400, 500, 600, 700, 800, 1200, 2400, 4800, 9600, 19200, 39400, 65536} +// WorkerTypeMap maps lib.HeraWorkerType values to their dimension names; it must be kept in sync with any changes to the worker types. +var WorkerTypeMap = map[int]string{ + 0: "rw", + 1: "ro", + 2: "standby_ro", +} + const OtelInstrumentationVersion string = "v1.0" // DEFAULT_OTEL_COLLECTOR_PROTOCOL default OTEL configurations point to QA collector diff --git a/utility/logger/otel/logger.go b/utility/logger/otel/logger.go index 2785659c..c90dd853 100644 --- a/utility/logger/otel/logger.go +++ b/utility/logger/otel/logger.go @@ -10,14 +10,10 @@ import ( "go.opentelemetry.io/otel/attribute" "go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc" "go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp" - "go.opentelemetry.io/otel/exporters/otlp/otlptrace" - "go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc" - "go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp" "go.opentelemetry.io/otel/sdk/instrumentation" "go.opentelemetry.io/otel/sdk/metric" "go.opentelemetry.io/otel/sdk/metric/metricdata" "go.opentelemetry.io/otel/sdk/resource" - "go.opentelemetry.io/otel/sdk/trace" "os" "sync" "time" @@ -62,14 +58,6 @@ func initializeOTelSDK(ctx context.Context) (shutdown func(ctx context.Context) errorDataMap := make(map[string]*OTelErrorData) //Initialize the map after process it. gErrorDataMap.Store(errorDataMap) - traceProvider, err := newTraceProvider(ctx) //Initialize trace provider - if err != nil { - handleErr(err) - return nil, err - } - shutdownFuncs = append(shutdownFuncs, traceProvider.Shutdown) - otel.SetTracerProvider(traceProvider) - //Setup meter provider meterProvider, err := newMeterProvider(ctx) otel.SetMeterProvider(meterProvider) @@ -89,26 +77,6 @@ func initializeOTelSDK(ctx context.Context) (shutdown func(ctx context.Context) return shutdown, err } -func newTraceProvider(ctx context.Context) (*trace.TracerProvider, error) { - - traceExporter, err := getTraceExporter(ctx) - if err != nil { - return nil, err - } - - traceProvider := trace.NewTracerProvider( - trace.WithBatcher(traceExporter, - trace.WithBatchTimeout(5*time.Second), - trace.WithExportTimeout(2*time.Second), - trace.WithMaxExportBatchSize(10), - trace.WithMaxQueueSize(10), - ), - // Default is 5s. Set to 1s for demonstrative purposes. - trace.WithResource(getResourceInfo(config.OTelConfigData.PoolName)), - ) - return traceProvider, nil -} - // Initialize newMeterProvider respective exporter either HTTP or GRPC exporter func newMeterProvider(ctx context.Context) (*metric.MeterProvider, error) { metricExporter, err := getMetricExporter(ctx) @@ -249,14 +217,6 @@ return newHTTPExporter(ctx) } -// getTraceExporter Initialize span exporter based protocol(GRPC or HTTP) selected by user. 
-func getTraceExporter(ctx context.Context) (*otlptrace.Exporter, error) { - if config.OTelConfigData.OtelTraceGRPC { - return newGRPCTraceExporter(ctx) - } - return newHTTPTraceExporter(ctx) -} - // newHTTPExporter Initilizes The "otlpmetrichttp" exporter in OpenTelemetry is used to export metrics data using the // OpenTelemetry Protocol (OTLP) over HTTP. func newHTTPExporter(ctx context.Context) (metric.Exporter, error) { @@ -386,116 +346,6 @@ func newGRPCExporter(ctx context.Context) (metric.Exporter, error) { } } -// newHTTPTraceExporter Initilizes The "otlptracehttp" exporter in OpenTelemetry is used to export spans data using the -// OpenTelemetry Protocol (OTLP) over HTTP. -func newHTTPTraceExporter(ctx context.Context) (*otlptrace.Exporter, error) { - headers := make(map[string]string) - headers[IngestTokenHeader] = config.GetOTelIngestToken() - if config.OTelConfigData.UseTls { - return otlptracehttp.New(ctx, - otlptracehttp.WithEndpoint(fmt.Sprintf("%s:%d", config.OTelConfigData.Host, config.OTelConfigData.TracePort)), - otlptracehttp.WithTimeout(time.Duration(config.OTelConfigData.ExporterTimeout)*time.Second), - otlptracehttp.WithHeaders(headers), - otlptracehttp.WithRetry(otlptracehttp.RetryConfig{ - // Enabled indicates whether to not retry sending batches in case - // of export failure. - Enabled: false, - // InitialInterval the time to wait after the first failure before - // retrying. - InitialInterval: 1 * time.Second, - // MaxInterval is the upper bound on backoff interval. Once this - // value is reached the delay between consecutive retries will - // always be `MaxInterval`. - MaxInterval: 10 * time.Second, - // MaxElapsedTime is the maximum amount of time (including retries) - // spent trying to send a request/batch. Once this value is - // reached, the data is discarded. - MaxElapsedTime: 20 * time.Second, - }), - otlptracehttp.WithURLPath(config.OTelConfigData.TraceURLPath), - ) - } else { - return otlptracehttp.New(ctx, - otlptracehttp.WithEndpoint(fmt.Sprintf("%s:%d", config.OTelConfigData.Host, config.OTelConfigData.TracePort)), - otlptracehttp.WithTimeout(time.Duration(config.OTelConfigData.ExporterTimeout)*time.Second), - otlptracehttp.WithHeaders(headers), - otlptracehttp.WithRetry(otlptracehttp.RetryConfig{ - // Enabled indicates whether to not retry sending batches in case - // of export failure. - Enabled: false, - // InitialInterval the time to wait after the first failure before - // retrying. - InitialInterval: 1 * time.Second, - // MaxInterval is the upper bound on backoff interval. Once this - // value is reached the delay between consecutive retries will - // always be `MaxInterval`. - MaxInterval: 10 * time.Second, - // MaxElapsedTime is the maximum amount of time (including retries) - // spent trying to send a request/batch. Once this value is - // reached, the data is discarded. - MaxElapsedTime: 20 * time.Second, - }), - otlptracehttp.WithURLPath(config.OTelConfigData.TraceURLPath), - otlptracehttp.WithInsecure(), //Since agent is local - ) - } -} - -// newGRPCTraceExporter Initilizes The "otlptracegrpc" exporter in OpenTelemetry is used to export spans data using the -// OpenTelemetry Protocol (OTLP) over GRPC. 
-func newGRPCTraceExporter(ctx context.Context) (*otlptrace.Exporter, error) { - - headers := make(map[string]string) - headers[IngestTokenHeader] = config.GetOTelIngestToken() - - if config.OTelConfigData.UseTls { - return otlptracegrpc.New(ctx, - otlptracegrpc.WithEndpoint(fmt.Sprintf("%s:%d", config.OTelConfigData.Host, config.OTelConfigData.TracePort)), - otlptracegrpc.WithTimeout(time.Duration(config.OTelConfigData.ExporterTimeout)*time.Second), - otlptracegrpc.WithHeaders(headers), - otlptracegrpc.WithRetry(otlptracegrpc.RetryConfig{ - // Enabled indicates whether to not retry sending batches in case - // of export failure. - Enabled: false, - // InitialInterval the time to wait after the first failure before - // retrying. - InitialInterval: 1 * time.Second, - // MaxInterval is the upper bound on backoff interval. Once this - // value is reached the delay between consecutive retries will - // always be `MaxInterval`. - MaxInterval: 10 * time.Second, - // MaxElapsedTime is the maximum amount of time (including retries) - // spent trying to send a request/batch. Once this value is - // reached, the data is discarded. - MaxElapsedTime: 20 * time.Second, - }), - ) - } else { - return otlptracegrpc.New(ctx, - otlptracegrpc.WithEndpoint(fmt.Sprintf("%s:%d", config.OTelConfigData.Host, config.OTelConfigData.TracePort)), - otlptracegrpc.WithTimeout(time.Duration(config.OTelConfigData.ExporterTimeout)*time.Second), - otlptracegrpc.WithHeaders(headers), - otlptracegrpc.WithRetry(otlptracegrpc.RetryConfig{ - // Enabled indicates whether to not retry sending batches in case - // of export failure. - Enabled: false, - // InitialInterval the time to wait after the first failure before - // retrying. - InitialInterval: 1 * time.Second, - // MaxInterval is the upper bound on backoff interval. Once this - // value is reached the delay between consecutive retries will - // always be `MaxInterval`. - MaxInterval: 10 * time.Second, - // MaxElapsedTime is the maximum amount of time (including retries) - // spent trying to send a request/batch. Once this value is - // reached, the data is discarded. 
- MaxElapsedTime: 20 * time.Second, - }), - otlptracegrpc.WithInsecure(), //Since agent is local - ) - } -} - // getResourceInfo provide application context level attributes during initialization func getResourceInfo(appName string) *resource.Resource { hostname, _ := os.Hostname() diff --git a/utility/logger/otel/state_logger.go b/utility/logger/otel/state_logger.go index f2d4922a..955caca6 100644 --- a/utility/logger/otel/state_logger.go +++ b/utility/logger/otel/state_logger.go @@ -259,9 +259,6 @@ mainloop: "so stop sending data and closing data channel") close(stateLogMetrics.mStateDataChan) break mainloop - case <-time.After(1000 * time.Millisecond): - logger.GetLogger().Log(logger.Info, "timeout on waiting for statelog metrics data") - continue mainloop } } } @@ -274,7 +271,7 @@ func (stateLogMetrics *StateLogMetrics) sendMetricsDataToCollector(ctx context.C logger.GetLogger().Log(logger.Info, fmt.Sprintf("publishing metric with calculated max value and aggregation of gauge for shardid-workertype-instanceId: %s using datapoints size: %d", key, aggStatesData[Datapoints])) commonLabels := []attribute.KeyValue{ attribute.Int(ShardId, int(aggStatesData[ShardId])), - attribute.Int(WorkerType, int(aggStatesData[WorkerType])), + attribute.String(WorkerType, WorkerTypeMap[int(aggStatesData[WorkerType])]), attribute.Int(InstanceId, int(aggStatesData[InstanceId])), attribute.String(OccWorkerParamName, *stateLogTitle), attribute.String(HostDimensionName, stateLogMetrics.hostname), diff --git a/utility/logger/otel/test/state_logger_test.go b/utility/logger/otel/test/state_logger_test.go index 87c455e8..a303599a 100644 --- a/utility/logger/otel/test/state_logger_test.go +++ b/utility/logger/otel/test/state_logger_test.go @@ -27,7 +27,7 @@ func initializeConsoleExporter() (*metric.MeterProvider, error) { Enabled: true, OtelMetricGRPC: false, OtelTraceGRPC: false, - ResolutionTimeInSec: 3, + ResolutionTimeInSec: 6, OTelErrorReportingInterval: 10, PoolName: "occ-testapp", MetricNamePrefix: "pp.occ", @@ -261,7 +261,7 @@ func TestSendingStateLogMetricsConsoleExporter(t *testing.T) { } } -func TestOCCStatelogGenerator(t *testing.T) { +func TestOCCStateLogGeneratorWithRandomValues(t *testing.T) { cont, err := initializeConsoleExporter() if err != nil { t.Fail() @@ -276,11 +276,11 @@ func TestOCCStatelogGenerator(t *testing.T) { logger.GetLogger().Log(logger.Alert, "Failed to initialize Metric Collection service") t.Fatalf("TestOCCStatelogGenerator failed with error %v", err) } - <-time.After(time.Second * time.Duration(10)) + <-time.After(time.Second * time.Duration(30)) } func dataGenerator() { - ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second) + ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) defer cancel() waitTime := time.Second * 1 From 057a0a325c49134789ff3f8ddad1e279c21446b8 Mon Sep 17 00:00:00 2001 From: Rajesh Samala Date: Fri, 30 Aug 2024 11:06:08 +0530 Subject: [PATCH 12/19] Otel dimension fixes (#105) * Occ config logging (#394) * occ configurations logging * cal event success * adding cal data * added TODOs * Remove log_occ_confogs.go * Remove testing files * source of configs - files * whitelist format change * code clean up * code review changes-1 * CR fixes * CR fixes * Delete tests/unittest/config_logging/main_test.go * clean up * Merge branch 'occ-config-logging' of /Users/simmidisetty/Documents/GitHub/OpenSourceHera/src/github.com/paypal/hera with conflicts. 
* test for config logging * removing test changes * tests for all cases * test * making minor changes for logging feature specific data * changes for incorporate review comments --------- Co-authored-by: simmidisetty Co-authored-by: Rajesh S * adding new tests and dimension value change --------- Co-authored-by: satyakamala03 <128077872+satyakamala03@users.noreply.github.com> Co-authored-by: simmidisetty Co-authored-by: Rajesh S --- .github/workflows/go.yml | 15 +- lib/statelog.go | 8 +- lib/workerpool_test.go | 4 +- tests/unittest/bindLess/main_test.go | 2 +- tests/unittest/bindThrottle/main_test.go | 2 +- tests/unittest/otel_basic/main_test.go | 2 +- .../otel_incorrect_endpoint/main_test.go | 2 +- .../otel_remote_endpoint_tls/main_test.go | 123 ----------- tests/unittest/otel_sharding/main_test.go | 198 ++++++++++++++++++ 9 files changed, 223 insertions(+), 133 deletions(-) delete mode 100644 tests/unittest/otel_remote_endpoint_tls/main_test.go create mode 100644 tests/unittest/otel_sharding/main_test.go diff --git a/.github/workflows/go.yml b/.github/workflows/go.yml index 75165e64..bd4810d0 100644 --- a/.github/workflows/go.yml +++ b/.github/workflows/go.yml @@ -7,7 +7,6 @@ on: branches: [ main ] jobs: - build: runs-on: ubuntu-latest steps: @@ -16,7 +15,7 @@ jobs: - name: Set up Go uses: actions/setup-go@v4 with: - go-version: 1.18.2 + go-version: 1.20.14 - name: Install GO Modules run: | go mod tidy @@ -37,17 +36,25 @@ jobs: - 3306:3306 steps: - uses: actions/checkout@v3 - - name: Set up Go uses: actions/setup-go@v4 with: - go-version: 1.18.2 + go-version: 1.20.14 - name: Install GO Modules run: | go mod tidy go mod download github.com/godror/godror + - name: Give Full Permissions to Work Directory + run: sudo chmod -R 777 /home/runner/work/ - name: Build worker run: go install github.com/paypal/hera/worker/mysqlworker + - name: Install Docker Compose + run: | + sudo apt-get update + sudo apt-get install -y docker-compose + + - name: Verify Docker Compose Installation + run: docker-compose --version - name: System Test run: | export GOPATH="/home/runner/go" diff --git a/lib/statelog.go b/lib/statelog.go index 89d54c5b..7a12d7a8 100644 --- a/lib/statelog.go +++ b/lib/statelog.go @@ -29,6 +29,7 @@ import ( "os" "path/filepath" "strconv" + "strings" "sync" "time" @@ -103,6 +104,9 @@ type StateLog struct { // title printed in workertype column (leftmost). 
// mTypeTitles [](map[HeraWorkerType]([]string)) + + //OTEL statelog occ-worker dimension titles + workerDimensionTitle map[string]string // // header row (state) // @@ -479,6 +483,7 @@ func (sl *StateLog) init() error { sl.mWorkerStates = make([]map[HeraWorkerType][][]*WorkerStateInfo, sl.maxShardSize) sl.mConnStates = make([]map[HeraWorkerType][]*ConnStateInfo, sl.maxShardSize) sl.mTypeTitles = make([]map[HeraWorkerType][]string, sl.maxShardSize) + sl.workerDimensionTitle = make(map[string]string) sl.mLastReqCnt = make([]map[HeraWorkerType][]int64, sl.maxShardSize) sl.mLastRspCnt = make([]map[HeraWorkerType][]int64, sl.maxShardSize) // @@ -558,6 +563,7 @@ func (sl *StateLog) init() error { if shardEnabled { sl.mTypeTitles[s][t][i] += suffix } + sl.workerDimensionTitle[sl.mTypeTitles[s][t][i]] = strings.Replace(sl.mTypeTitles[s][t][i], GetConfig().StateLogPrefix, otelconfig.OTelConfigData.PoolName, 1) } } } @@ -767,7 +773,7 @@ func (sl *StateLog) genReport() { } // Initialize statedata object workerStatesData := otel_logger.WorkersStateData{ - StateTitle: sl.mTypeTitles[s][HeraWorkerType(t)][n], + StateTitle: sl.workerDimensionTitle[sl.mTypeTitles[s][HeraWorkerType(t)][n]], ShardId: int(s), WorkerType: int(t), InstanceId: int(n), diff --git a/lib/workerpool_test.go b/lib/workerpool_test.go index 06116e06..4e3db96d 100644 --- a/lib/workerpool_test.go +++ b/lib/workerpool_test.go @@ -19,6 +19,7 @@ package lib import ( "encoding/hex" + otelconfig "github.com/paypal/hera/utility/logger/otel/config" "os" "sync" "testing" @@ -29,13 +30,14 @@ func TestPoolDempotency(t *testing.T) { var useheratxt = false var err error if useheratxt { - err = InitConfig() + err = InitConfig("occ-test") if err != nil { t.Errorf("config initialization failure %s", err.Error()) return } } else { gAppConfig = &Config{BacklogTimeoutMsec: 1, LifoScheduler: true, numWorkersCh: make(chan int, 1)} + otelconfig.OTelConfigData = &otelconfig.OTelConfig{} gOpsConfig = &OpsConfig{numWorkers: 3} gAppConfig.numWorkersCh <- int(gOpsConfig.numWorkers) } diff --git a/tests/unittest/bindLess/main_test.go b/tests/unittest/bindLess/main_test.go index b59b1f07..72c92ec4 100644 --- a/tests/unittest/bindLess/main_test.go +++ b/tests/unittest/bindLess/main_test.go @@ -211,7 +211,7 @@ func TestBindLess(t *testing.T) { logger.GetLogger().Log(logger.Debug, "TestBindLess +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n") testutil.BackupAndClear("cal", "BindLess start") testutil.BackupAndClear("hera", "BindLess start") - err := partialBadLoad(0.10) + err := partialBadLoad(0.07) if err != nil && err != NormCliErr() { t.Fatalf("main step function returned err %s", err.Error()) } diff --git a/tests/unittest/bindThrottle/main_test.go b/tests/unittest/bindThrottle/main_test.go index 7cdede41..7e3c87da 100644 --- a/tests/unittest/bindThrottle/main_test.go +++ b/tests/unittest/bindThrottle/main_test.go @@ -205,7 +205,7 @@ func mkClients(num int, stop *int, bindV int, grpName string, outErr *string, db func TestBindThrottle(t *testing.T) { // we would like to clear hera.log, but even if we try, lots of messages still go there logger.GetLogger().Log(logger.Debug, "BindThrottle +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n") - err := partialBadLoad(0.10) + err := partialBadLoad(0.07) if err != nil && err != NormCliErr() { t.Fatalf("main step function returned err %s", err.Error()) } diff --git a/tests/unittest/otel_basic/main_test.go b/tests/unittest/otel_basic/main_test.go index f8877584..f1c04a37 100644 --- 
a/tests/unittest/otel_basic/main_test.go +++ b/tests/unittest/otel_basic/main_test.go @@ -114,7 +114,7 @@ func TestOTELMetricsBasic(t *testing.T) { if initCount < 1 { t.Fatalf("OTEL event should contain metric name pp.occ.init_connection") } - tagsCount := testutil.RegexCountFile("{\"key\":\"InstanceId\",\"value\":{\"intValue\":\"0\"}},{\"key\":\"ShardId\",\"value\":{\"intValue\":\"0\"}},{\"key\":\"WorkerType\",\"value\":{\"intValue\":\"0\"}", + tagsCount := testutil.RegexCountFile("{\"key\":\"InstanceId\",\"value\":{\"intValue\":\"0\"}},{\"key\":\"ShardId\",\"value\":{\"intValue\":\"0\"}}", logFilePath) if tagsCount < 1 { t.Fatalf("mandatory tags InstanceId, ShardId, WorkerType should present") diff --git a/tests/unittest/otel_incorrect_endpoint/main_test.go b/tests/unittest/otel_incorrect_endpoint/main_test.go index cadccfa1..40eebf05 100644 --- a/tests/unittest/otel_incorrect_endpoint/main_test.go +++ b/tests/unittest/otel_incorrect_endpoint/main_test.go @@ -105,7 +105,7 @@ func TestOTELMetricsIncorrectEndPoint(t *testing.T) { t.Fatalf("otel publishing error should present in log because of in-correct OTEL port number") } - time.Sleep(5 * time.Second) + time.Sleep(10 * time.Second) calPublishingErrors := testutil.RegexCountFile("failed to send metrics", "cal.log") if calPublishingErrors < 1 { t.Fatalf("otel publishing error should present in CAL log because of in-correct OTEL port number") diff --git a/tests/unittest/otel_remote_endpoint_tls/main_test.go b/tests/unittest/otel_remote_endpoint_tls/main_test.go deleted file mode 100644 index 4d4f96f6..00000000 --- a/tests/unittest/otel_remote_endpoint_tls/main_test.go +++ /dev/null @@ -1,123 +0,0 @@ -package main - -import ( - "context" - "database/sql" - "fmt" - "os" - "strings" - "testing" - "time" - - "github.com/paypal/hera/tests/unittest/testutil" - "github.com/paypal/hera/utility/logger" -) - -var mx testutil.Mux -var tableName string - -func cfg() (map[string]string, map[string]string, testutil.WorkerType) { - - appcfg := make(map[string]string) - // best to chose an "unique" port in case golang runs tests in paralel - appcfg["bind_port"] = "31002" - appcfg["log_level"] = "5" - appcfg["log_file"] = "hera.log" - appcfg["sharding_cfg_reload_interval"] = "5" - appcfg["enable_sharding"] = "true" - appcfg["num_shards"] = "3" - appcfg["max_scuttle"] = "9" - appcfg["rac_sql_interval"] = "0" - appcfg["child.executable"] = "mysqlworker" - appcfg["enable_otel"] = "true" - appcfg["otel_use_tls"] = "true" - appcfg["otel_agent_host"] = "otelmetrics-pp-observability.us-central1.gcp.dev.paypalinc.com" - appcfg["otel_agent_metrics_port"] = "30706" - appcfg["otel_agent_trace_port"] = "30706" - appcfg["otel_resolution_time_in_sec"] = "10" - appcfg["otel_agent_metrics_uri"] = "v1/metrics" - opscfg := make(map[string]string) - opscfg["opscfg.default.server.max_connections"] = "5" - opscfg["opscfg.default.server.log_level"] = "5" - os.Setenv("AVAILABILITY_ZONE", "test-dev") - os.Setenv("ENVIRONMENT", "dev") - return appcfg, opscfg, testutil.MySQLWorker -} - -func before() error { - tableName = os.Getenv("TABLE_NAME") - if tableName == "" { - tableName = "jdbc_hera_test" - } - if strings.HasPrefix(os.Getenv("TWO_TASK"), "tcp") { - // mysql - testutil.RunDML("create table jdbc_hera_test ( ID BIGINT, INT_VAL BIGINT, STR_VAL VARCHAR(500))") - } - return nil -} - -func TestMain(m *testing.M) { - os.Exit(testutil.UtilMain(m, cfg, before)) -} - -func TestOTELMetricsRemoteEndPointWithTLS(t *testing.T) { - logger.GetLogger().Log(logger.Debug, 
"TestOTELMetricsRemoteEndPointWithTLS begin +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n") - - shard := 0 - db, err := sql.Open("heraloop", fmt.Sprintf("%d:0:0", shard)) - if err != nil { - t.Fatal("Error starting Mux:", err) - return - } - db.SetMaxIdleConns(0) - defer db.Close() - - ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) - // cleanup and insert one row in the table - conn, err := db.Conn(ctx) - if err != nil { - t.Fatalf("Error getting connection %s\n", err.Error()) - } - tx, _ := conn.BeginTx(ctx, nil) - sqlTxt := "/*cmd*/delete from " + tableName - stmt, _ := tx.PrepareContext(ctx, sqlTxt) - _, err = stmt.Exec() - if err != nil { - t.Fatalf("Error preparing test (delete table) %s with %s ==== sql\n", err.Error(), sqlTxt) - } - - stmt, _ = tx.PrepareContext(ctx, "/*cmd*/insert into "+tableName+" (id, int_val, str_val) VALUES(?, ?, ?)") - _, err = stmt.Exec(1, time.Now().Unix(), "val 1") - if err != nil { - t.Fatalf("Error preparing test (create row in table) %s\n", err.Error()) - } - err = tx.Commit() - if err != nil { - t.Fatalf("Error commit %s\n", err.Error()) - } - - stmt, _ = conn.PrepareContext(ctx, "/*cmd*/Select id, int_val from "+tableName+" where id=?") - rows, _ := stmt.Query(1) - if !rows.Next() { - t.Fatalf("Expected 1 row") - } - - time.Sleep(15 * time.Second) - rows.Close() - stmt.Close() - - time.Sleep(15 * time.Second) - publishingErrors := testutil.RegexCountFile("otel publishing error", "hera.log") - if publishingErrors > 1 { - t.Fatalf("should not fail while publishing metrics remote host") - } - - calPublishingErrors := testutil.RegexCountFile("failed to send metrics", "cal.log") - if calPublishingErrors > 1 { - t.Fatalf("should not fail while publishing metrics remote host") - } - - cancel() - conn.Close() - logger.GetLogger().Log(logger.Debug, "TestOTELMetricsRemoteEndPointWithTLS done -------------------------------------------------------------") -} diff --git a/tests/unittest/otel_sharding/main_test.go b/tests/unittest/otel_sharding/main_test.go new file mode 100644 index 00000000..8e7b1c95 --- /dev/null +++ b/tests/unittest/otel_sharding/main_test.go @@ -0,0 +1,198 @@ +package main + +import ( + "context" + "database/sql" + "path/filepath" + + "fmt" + "os" + "strings" + "testing" + "time" + + _ "github.com/paypal/hera/client/gosqldriver/tcp" + "github.com/paypal/hera/tests/unittest/testutil" + "github.com/paypal/hera/utility/logger" +) + +var mx testutil.Mux +var tableName string + +func cfg() (map[string]string, map[string]string, testutil.WorkerType) { + + appcfg := make(map[string]string) + // best to chose an "unique" port in case golang runs tests in paralel + appcfg["bind_port"] = "31003" + appcfg["log_level"] = "5" + appcfg["log_file"] = "hera.log" + appcfg["enable_sharding"] = "true" + appcfg["num_shards"] = "3" + appcfg["max_scuttle"] = "9" + appcfg["shard_key_name"] = "id" + pfx := os.Getenv("MGMT_TABLE_PREFIX") + if pfx != "" { + appcfg["management_table_prefix"] = pfx + } + appcfg["sharding_cfg_reload_interval"] = "3600" + appcfg["rac_sql_interval"] = "0" + appcfg["enable_otel"] = "true" + appcfg["otel_resolution_time_in_sec"] = "3" + appcfg["cfg_from_tns"] = "false" + appcfg["num_standby_dbs"] = "0" + + opscfg := make(map[string]string) + opscfg["opscfg.default.server.max_connections"] = "3" + opscfg["opscfg.default.server.log_level"] = "5" + opscfg["opscfg.default.server.max_connections"] = "3" + opscfg["opscfg.default.server.log_level"] = "5" + os.Setenv("AVAILABILITY_ZONE", "test-dev") + 
os.Setenv("ENVIRONMENT", "dev") + + return appcfg, opscfg, testutil.MySQLWorker +} + +func setupShardMap() { + twoTask := os.Getenv("TWO_TASK") + if !strings.HasPrefix(twoTask, "tcp") { + // not mysql + return + } + shard := 0 + db, err := sql.Open("heraloop", fmt.Sprintf("%d:0:0", shard)) + if err != nil { + testutil.Fatal("Error starting Mux:", err) + return + } + db.SetMaxIdleConns(0) + defer db.Close() + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + conn, err := db.Conn(ctx) + if err != nil { + testutil.Fatalf("Error getting connection %s\n", err.Error()) + } + defer conn.Close() + + testutil.RunDML("create table hera_shard_map ( scuttle_id smallint not null, shard_id tinyint not null, status char(1) , read_status char(1), write_status char(1), remarks varchar(500))") + + for i := 0; i < 1024; i++ { + shard := 0 + if i <= 8 { + shard = i % 3 + } + testutil.RunDML(fmt.Sprintf("insert into hera_shard_map ( scuttle_id, shard_id, status, read_status, write_status ) values ( %d, %d, 'Y', 'Y', 'Y' )", i, shard)) + } +} + +func before() error { + tableName = os.Getenv("TABLE_NAME") + if tableName == "" { + tableName = "jdbc_hera_test" + } + if strings.HasPrefix(os.Getenv("TWO_TASK"), "tcp") { + // mysql + testutil.RunDML("create table jdbc_hera_test ( ID BIGINT, INT_VAL BIGINT, STR_VAL VARCHAR(500))") + } + return nil +} + +func TestMain(m *testing.M) { + os.Exit(testutil.UtilMain(m, cfg, before)) +} + +func cleanup(ctx context.Context, conn *sql.Conn) error { + tx, _ := conn.BeginTx(ctx, nil) + stmt, _ := tx.PrepareContext(ctx, "/*Cleanup*/delete from "+tableName+" where id != :id") + _, err := stmt.Exec(sql.Named("id", -123)) + if err != nil { + return err + } + err = tx.Commit() + return nil +} + +func TestShardingWithOTELBasic(t *testing.T) { + logger.GetLogger().Log(logger.Debug, "TestShardingWithOTELBasic setup") + setupShardMap() + logger.GetLogger().Log(logger.Debug, "TestShardingWithOTELBasic begin +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n") + + hostname, _ := os.Hostname() + //hostname := "localhost" + db, err := sql.Open("hera", hostname+":31003") + if err != nil { + t.Fatal("Error starting Mux:", err) + return + } + db.SetMaxIdleConns(0) + defer db.Close() + + ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) + conn, err := db.Conn(ctx) + if err != nil { + t.Fatalf("Error getting connection %s\n", err.Error()) + } + cleanup(ctx, conn) + // insert one row in the table + tx, _ := conn.BeginTx(ctx, nil) + stmt, _ := tx.PrepareContext(ctx, "/*TestShardingBasic*/insert into "+tableName+" (id, int_val, str_val) VALUES(:id, :int_val, :str_val)") + _, err = stmt.Exec(sql.Named("id", 1), sql.Named("int_val", time.Now().Unix()), sql.Named("str_val", "val 1")) + if err != nil { + t.Fatalf("Error preparing test (create row in table) %s\n", err.Error()) + } + err = tx.Commit() + if err != nil { + t.Fatalf("Error commit %s\n", err.Error()) + } + + stmt, _ = conn.PrepareContext(ctx, "/*TestShardingBasic*/Select id, int_val, str_val from "+tableName+" where id=:id") + rows, _ := stmt.Query(sql.Named("id", 1)) + if !rows.Next() { + t.Fatalf("Expected 1 row") + } + var id, int_val uint64 + var str_val sql.NullString + err = rows.Scan(&id, &int_val, &str_val) + if err != nil { + t.Fatalf("Expected values %s", err.Error()) + } + if str_val.String != "val 1" { + t.Fatalf("Expected val 1 , got: %s", str_val.String) + } + + rows.Close() + stmt.Close() + + cancel() + conn.Close() + + out, err := 
testutil.BashCmd("grep 'Preparing: /\\*TestShardingBasic\\*/' hera.log | grep 'WORKER shd2' | wc -l") + if (err != nil) || (len(out) == 0) { + err = nil + t.Fatalf("Request did not run on shard 2. err = %v, len(out) = %d", err, len(out)) + } + time.Sleep(15 * time.Second) + //Read OTEL log file for metrics validation + logFilePath := filepath.Join(testutil.GetOTELLogDirPath(), "otel_collector.log") + initCount := testutil.RegexCountFile("\"name\":\"pp.occ.init_connection\"", logFilePath) + if initCount < 1 { + t.Fatalf("OTEL event should contain metric name pp.occ.init_connection") + } + shard0Count := testutil.RegexCountFile("{\"key\":\"ShardId\",\"value\":{\"intValue\":\"0\"}", logFilePath) + if shard0Count < 1 { + t.Fatalf("shard0 should dimesion showuld present for metrics") + } + shard1Count := testutil.RegexCountFile("{\"key\":\"ShardId\",\"value\":{\"intValue\":\"1\"}", logFilePath) + if shard1Count < 1 { + t.Fatalf("shard1 should dimesion showuld present for metrics") + } + shard2Count := testutil.RegexCountFile("{\"key\":\"ShardId\",\"value\":{\"intValue\":\"2\"}", logFilePath) + if shard2Count < 1 { + t.Fatalf("shard2 should dimesion showuld present for metrics") + } + shard2Worker := testutil.RegexCountFile("{\"key\":\"occ_worker\",\"value\":{\"stringValue\":\"hera-test.sh2\"}", logFilePath) + if shard2Worker < 1 { + t.Fatalf("occ_worker should container shard2") + } + logger.GetLogger().Log(logger.Debug, "TestShardingWithOTELBasic done -------------------------------------------------------------") +} From ea47767a9b696ee0ddb6fafb03c2b31c23f7dbe7 Mon Sep 17 00:00:00 2001 From: Rajesh Samala Date: Wed, 18 Sep 2024 11:08:50 +0530 Subject: [PATCH 13/19] adding new metrics free and total (#113) Co-authored-by: Rajesh S --- .github/workflows/go.yml | 3 + lib/main.go | 6 +- lib/statelog.go | 37 +++- tests/unittest/otel_basic/main_test.go | 6 +- tests/unittest/shutdown_cleanup/main_test.go | 110 ++++++++++ utility/logger/otel/defs.go | 45 +++- utility/logger/otel/logger.go | 14 +- utility/logger/otel/state_logger.go | 192 ++++++++++++++---- utility/logger/otel/test/state_logger_test.go | 185 +++++++++++------ 9 files changed, 477 insertions(+), 121 deletions(-) create mode 100644 tests/unittest/shutdown_cleanup/main_test.go diff --git a/.github/workflows/go.yml b/.github/workflows/go.yml index bd4810d0..405ca4b3 100644 --- a/.github/workflows/go.yml +++ b/.github/workflows/go.yml @@ -48,6 +48,9 @@ jobs: run: sudo chmod -R 777 /home/runner/work/ - name: Build worker run: go install github.com/paypal/hera/worker/mysqlworker + - name: Check docker version + run: | + docker version - name: Install Docker Compose run: | sudo apt-get update diff --git a/lib/main.go b/lib/main.go index 8a2bcda3..fac28fe7 100644 --- a/lib/main.go +++ b/lib/main.go @@ -39,6 +39,9 @@ func Run() { signal.Ignore(syscall.SIGPIPE) mux_process_id := syscall.Getpid() + // Defer release resource in case of any abnormal exit of for application + defer handlePanicAndReleaseResource(mux_process_id) + namePtr := flag.String("name", "", "module name in v$session table") flag.Parse() @@ -212,9 +215,6 @@ func Run() { cal.ReleaseCxtResource() }() - // Defer release resource in case of any abnormal exit of for application - defer handlePanicAndReleaseResource(mux_process_id) - <-GetWorkerBrokerInstance().Stopped() } diff --git a/lib/statelog.go b/lib/statelog.go index 7a12d7a8..9bd366a3 100644 --- a/lib/statelog.go +++ b/lib/statelog.go @@ -138,6 +138,9 @@ type StateLog struct { // start time since epoch in ns // 
mServerStartTime int64 + + //worker pool configurations + workerPoolCfg []map[HeraWorkerType]*WorkerPoolCfg } // StateEventType is an event published by proxy when state changes. @@ -475,7 +478,7 @@ func (sl *StateLog) init() error { if GetWorkerBrokerInstance() == nil { return errors.New("broker not initialized") } - workerpoolcfg := GetWorkerBrokerInstance().GetWorkerPoolCfgs() + sl.workerPoolCfg = GetWorkerBrokerInstance().GetWorkerPoolCfgs() // // allocate array for each shard @@ -503,8 +506,8 @@ func (sl *StateLog) init() error { // for each workertype, initialize two dimension array // for t := 0; t < int(wtypeTotalCount); t++ { - instCnt := workerpoolcfg[s][HeraWorkerType(t)].instCnt - workerCnt := workerpoolcfg[s][HeraWorkerType(t)].maxWorkerCnt + instCnt := sl.workerPoolCfg[s][HeraWorkerType(t)].instCnt + workerCnt := sl.workerPoolCfg[s][HeraWorkerType(t)].maxWorkerCnt totalWorkersCount += workerCnt sl.mWorkerStates[s][HeraWorkerType(t)] = make([][]*WorkerStateInfo, instCnt) sl.mConnStates[s][HeraWorkerType(t)] = make([]*ConnStateInfo, instCnt) @@ -553,7 +556,7 @@ func (sl *StateLog) init() error { for s := 0; s < sl.maxShardSize; s++ { for t := wtypeRW; t < wtypeTotalCount; t++ { var suffix = ".sh" + strconv.Itoa(s) - instCnt := workerpoolcfg[s][HeraWorkerType(t)].instCnt + instCnt := sl.workerPoolCfg[s][HeraWorkerType(t)].instCnt for i := 0; i < instCnt; i++ { sl.mTypeTitles[s][t][i] = typeTitlePrefix[t] @@ -771,13 +774,17 @@ func (sl *StateLog) genReport() { if workerCnt == 0 { continue } + + workerStateInfoData := otel_logger.WorkerStateInfo{ + StateTitle: sl.workerDimensionTitle[sl.mTypeTitles[s][HeraWorkerType(t)][n]], + ShardId: s, + WorkerType: t, + InstanceId: n, + } // Initialize statedata object workerStatesData := otel_logger.WorkersStateData{ - StateTitle: sl.workerDimensionTitle[sl.mTypeTitles[s][HeraWorkerType(t)][n]], - ShardId: int(s), - WorkerType: int(t), - InstanceId: int(n), - StateData: make(map[string]int64), + WorkerStateInfo: &workerStateInfoData, + StateData: make(map[string]int64), } // @@ -828,9 +835,17 @@ func (sl *StateLog) genReport() { workerStatesData.StateData[StateNames[i]] = int64(stateCnt[i]) } //Adding req and response metrics to OTEL - workerStatesData.StateData["req"] = int64(reqCnt - sl.mLastReqCnt[s][HeraWorkerType(t)][n]) - workerStatesData.StateData["resp"] = int64(respCnt - sl.mLastRspCnt[s][HeraWorkerType(t)][n]) + workerStatesData.StateData["req"] = reqCnt - sl.mLastReqCnt[s][HeraWorkerType(t)][n] + workerStatesData.StateData["resp"] = respCnt - sl.mLastRspCnt[s][HeraWorkerType(t)][n] + + //Total workers + workerStatesData.StateData["totalConnections"] = int64(sl.workerPoolCfg[s][HeraWorkerType(t)].maxWorkerCnt) + totalConnectionData := otel_logger.GaugeMetricData{ + WorkerStateInfo: &workerStateInfoData, + StateData: workerStatesData.StateData["totalConnections"], + } otel_logger.AddDataPointToOTELStateDataChan(&workerStatesData) + otel_logger.AddDataPointToTotalConnectionsDataChannel(&totalConnectionData) } else { for i := 0; i < (MaxWorkerState + MaxConnState - 1); i++ { buf.WriteString(fmt.Sprintf("%6d", stateCnt[i])) diff --git a/tests/unittest/otel_basic/main_test.go b/tests/unittest/otel_basic/main_test.go index f1c04a37..231631bd 100644 --- a/tests/unittest/otel_basic/main_test.go +++ b/tests/unittest/otel_basic/main_test.go @@ -104,7 +104,7 @@ func TestOTELMetricsBasic(t *testing.T) { cancel() conn.Close() time.Sleep(15 * time.Second) - //Read OTEL log file for metrics validation + 
logFilePath := filepath.Join(testutil.GetOTELLogDirPath(), "otel_collector.log") count := testutil.RegexCountFile("{\"key\":\"application\",\"value\":{\"stringValue\":\"hera-test\"}", logFilePath) if count < 1 { @@ -128,5 +128,9 @@ func TestOTELMetricsBasic(t *testing.T) { t.Fatalf("az configured as test-dev and its value should present in otel metric dimension") } + totalConnMetricCount := testutil.RegexCountFile("\"name\":\"pp.occ.total_connections\"(.*)\"asInt\":\"3\"", logFilePath) + if totalConnMetricCount < 1 { + t.Fatalf("total connections gauge metric should be present in otel metrics") + } logger.GetLogger().Log(logger.Debug, "TestOTELMetricsBasic done -------------------------------------------------------------") } diff --git a/tests/unittest/shutdown_cleanup/main_test.go b/tests/unittest/shutdown_cleanup/main_test.go new file mode 100644 index 00000000..0ddddba2 --- /dev/null +++ b/tests/unittest/shutdown_cleanup/main_test.go @@ -0,0 +1,110 @@ +package main + +import ( + "context" + "database/sql" + "fmt" + "os" + "strings" + "syscall" + "testing" + "time" + + "github.com/paypal/hera/tests/unittest/testutil" + "github.com/paypal/hera/utility/logger" ) + +var mx testutil.Mux +var tableName string + +func cfg() (map[string]string, map[string]string, testutil.WorkerType) { + + appcfg := make(map[string]string) + // best to choose a unique port in case golang runs tests in parallel + appcfg["bind_port"] = "31002" + appcfg["log_level"] = "5" + appcfg["log_file"] = "hera.log" + appcfg["sharding_cfg_reload_interval"] = "0" + appcfg["rac_sql_interval"] = "0" + appcfg["child.executable"] = "mysqlworker" + appcfg["enable_otel"] = "true" + appcfg["otel_resolution_time_in_sec"] = "10" + os.Setenv("AVAILABILITY_ZONE", "test-dev") + os.Setenv("ENVIRONMENT", "dev") + opscfg := make(map[string]string) + opscfg["opscfg.default.server.max_connections"] = "3" + opscfg["opscfg.default.server.log_level"] = "5" + + return appcfg, opscfg, testutil.MySQLWorker +} + +func before() error { + tableName = os.Getenv("TABLE_NAME") + if tableName == "" { + tableName = "jdbc_hera_test" + } + if strings.HasPrefix(os.Getenv("TWO_TASK"), "tcp") { + // mysql + testutil.RunDML("create table jdbc_hera_test ( ID BIGINT, INT_VAL BIGINT, STR_VAL VARCHAR(500))") + } + return nil +} + +func TestMain(m *testing.M) { + os.Exit(testutil.UtilMain(m, cfg, before)) +} + +func TestCoordinatorWithShutdownCleanup(t *testing.T) { + logger.GetLogger().Log(logger.Debug, "TestCoordinatorWithShutdownCleanup begin +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n") + + go func() { + shard := 0 + db, err := sql.Open("heraloop", fmt.Sprintf("%d:0:0", shard)) + if err != nil { + t.Fatal("Error starting Mux:", err) + return + } + db.SetMaxIdleConns(0) + defer db.Close() + + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + // cleanup and insert one row in the table + conn, err := db.Conn(ctx) + if err != nil { + t.Fatalf("Error getting connection %s\n", err.Error()) + } + tx, _ := conn.BeginTx(ctx, nil) + sqlTxt := "/*cmd*/delete from " + tableName + stmt, _ := tx.PrepareContext(ctx, sqlTxt) + _, err = stmt.Exec() + if err != nil { + t.Fatalf("Error preparing test (delete table) %s with %s ==== sql\n", err.Error(), sqlTxt) + } + stmt, _ = tx.PrepareContext(ctx, "/*cmd*/insert into "+tableName+" (id, int_val, str_val) VALUES(?, ?, ?)") + _, err = stmt.Exec(1, time.Now().Unix(), "val 1") + time.Sleep(500 * time.Millisecond) + if err != nil { + t.Fatalf("Error preparing test (create row in table) %s\n", err.Error()) + }
+ err = tx.Commit() + + stmt.Close() + + cancel() + conn.Close() + }() + + time.Sleep(200 * time.Millisecond) + // send SIGTERM signal to mux process + proc, _ := os.FindProcess(os.Getpid()) + proc.Signal(syscall.SIGTERM) + if testutil.RegexCountFile("Got SIGTERM", "hera.log") != 1 { + t.Fatalf("workerbroker should see the SIGTERM signal and start cleanup") + } + + //workerclient pid= 768 to be terminated, sending SIGTERM first for gracefull termination + if testutil.RegexCountFile("workerclient pid=(\\s*\\d+) to be terminated, sending SIGTERM first for gracefull termination", "hera.log") < 1 { + t.Fatalf("workerbroker should send graceful termination to workers") + } + logger.GetLogger().Log(logger.Debug, "TestCoordinatorWithShutdownCleanup done -------------------------------------------------------------") +} diff --git a/utility/logger/otel/defs.go b/utility/logger/otel/defs.go index 088a52aa..a3f44700 100644 --- a/utility/logger/otel/defs.go +++ b/utility/logger/otel/defs.go @@ -17,6 +17,12 @@ const ( FinishedConnMetric = "finished_connection" QuiescedConnMetric = "quiesced_connection" + //free percentage + freePercentage = "free_percentage" + + //total connections + totalConnections = "total_connections" + // Connection States AssignedConnMetric = "assigned_connection" IdleConnMetric = "idle_connection" @@ -71,27 +77,50 @@ type Tags struct { TagValue string } -type WorkersStateData struct { +type WorkerStateInfo struct { StateTitle string ShardId int WorkerType int InstanceId int - StateData map[string]int64 +} + +type WorkersStateData struct { + *WorkerStateInfo + StateData map[string]int64 +} + +type GaugeMetricData struct { + *WorkerStateInfo + StateData int64 } type ( ServerType int ) -// StateLogMetrics state_log_metrics reports workers states -type StateLogMetrics struct { +type TotalConnectionsGaugeData struct { - //Statelog metrics configuration data - metricsConfig stateLogMetricsConfig + //Metric type for total connection data + totalConnections metric.Int64ObservableGauge hostname string - meter metric.Meter + stateLogMeter metric.Meter + + //Data channel + totalConnDataChannel chan *GaugeMetricData + + registration metric.Registration + + //Channel to close sending data + stopPublish chan struct{} +} + +// StateLogMetrics state_log_metrics reports workers states +type StateLogMetrics struct { + hostname string + + stateLogMeter metric.Meter //Channel to receive statelog data mStateDataChan chan *WorkersStateData @@ -112,6 +141,8 @@ type StateLogMetrics struct { idleState metric.Int64Histogram bklgState metric.Int64Histogram strdState metric.Int64Histogram + + freePercentage metric.Float64Histogram } // Object represents the workers states data for worker belongs to specific shardId and workperType with flat-map diff --git a/utility/logger/otel/logger.go b/utility/logger/otel/logger.go index c90dd853..a492164f 100644 --- a/utility/logger/otel/logger.go +++ b/utility/logger/otel/logger.go @@ -58,7 +58,7 @@ func initializeOTelSDK(ctx context.Context) (shutdown func(ctx context.Context) errorDataMap := make(map[string]*OTelErrorData) //Initialize the map after process it. 
gErrorDataMap.Store(errorDataMap) - //Setup meter provider + //Setup stateLogMeter provider meterProvider, err := newMeterProvider(ctx) otel.SetMeterProvider(meterProvider) if err != nil { @@ -205,8 +205,18 @@ func getStateLogMetricsViews() []metric.View { Aggregation: metric.AggregationBase2ExponentialHistogram{MaxSize: 32, MaxScale: 20}, }, ) + + freePercentageView := metric.NewView( + metric.Instrument{ + Name: config.OTelConfigData.PopulateMetricNamePrefix(freePercentage), + Scope: instrumentation.Scope{Name: StateLogMeterName}, + }, + metric.Stream{ + Aggregation: metric.AggregationBase2ExponentialHistogram{MaxSize: 10, MaxScale: 10}, + }, + ) return []metric.View{initView, acptStateView, waitStateView, busyStateView, schdStateView, - fnshStateView, quceStateView, asgnStateView, idleStateView, bklgStateView, strdStateView} + fnshStateView, quceStateView, asgnStateView, idleStateView, bklgStateView, strdStateView, freePercentageView} } // getMetricExporter Initialize metric exporter based protocol selected by user. diff --git a/utility/logger/otel/state_logger.go b/utility/logger/otel/state_logger.go index 955caca6..a1e974a5 100644 --- a/utility/logger/otel/state_logger.go +++ b/utility/logger/otel/state_logger.go @@ -18,8 +18,9 @@ const defaultAppName string = "occ" // This lock prevents a race between batch observer and instrument registration var registerStateMetrics sync.Once var metricsStateLogger *StateLogMetrics +var totalConnectionStateDataLogger *TotalConnectionsGaugeData -// Implement apply function in to configure meter provider +// Implement apply function in to configure stateLogMeter provider func (o MetricProviderOption) apply(c *stateLogMetricsConfig) { if o.MeterProvider != nil { c.MeterProvider = o.MeterProvider @@ -38,7 +39,7 @@ func WithAppName(appName string) StateLogOption { return AppNameOption(appName) } -// WithMetricProvider Create StateLogMetrics with provided meter Provider +// WithMetricProvider Create StateLogMetrics with provided stateLogMeter Provider func WithMetricProvider(provider metric.MeterProvider) StateLogOption { return MetricProviderOption{provider} } @@ -76,59 +77,109 @@ func StartMetricsCollection(ctx context.Context, totalWorkersCount int, opt ...S if hostErr != nil { logger.GetLogger().Log(logger.Alert, "Failed to fetch hostname for current container", err) } + stateLogMeter := stateLogMetricsConfig.MeterProvider.Meter(StateLogMeterName, + metric.WithInstrumentationVersion(OtelInstrumentationVersion)) //Initialize state-log metrics metricsStateLogger = &StateLogMetrics{ - meter: stateLogMetricsConfig.MeterProvider.Meter(StateLogMeterName, - metric.WithInstrumentationVersion(OtelInstrumentationVersion)), - metricsConfig: stateLogMetricsConfig, + stateLogMeter: stateLogMeter, hostname: hostName, mStateDataChan: make(chan *WorkersStateData, totalWorkersCount*otelconfig.OTelConfigData.ResolutionTimeInSec*2), //currently OTEL polling interval hardcoded as 10. Size of bufferred channel = totalWorkersCount * pollingInterval * 2, doneCh: make(chan struct{}), } - err = metricsStateLogger.register() + + totalConnectionStateDataLogger = &TotalConnectionsGaugeData{ + stateLogMeter: stateLogMeter, + hostname: hostName, + totalConnDataChannel: make(chan *GaugeMetricData, totalWorkersCount*otelconfig.OTelConfigData.ResolutionTimeInSec*2), //currently OTEL polling interval hardcoded as 10. 
Size of bufferred channel = totalWorkersCount * pollingInterval * 2, + stopPublish: make(chan struct{}), + } + err = registerMetrics(metricsStateLogger, totalConnectionStateDataLogger) if err != nil { logger.GetLogger().Log(logger.Alert, "Failed to register state metrics collector", err) } else { - go metricsStateLogger.startStateLogMetricsPoll(ctx) + err = totalConnectionStateDataLogger.registerCallbackForTotalConnectionsData() + if err != nil { + logger.GetLogger().Log(logger.Alert, "Failed to register callback for totalConnectionStateDataLogger gauge metric", err) + } + if err == nil { + go metricsStateLogger.startStateLogMetricsPoll(ctx) //Goroutine to poll HERA states data + } } }) return err } -// StopMetricCollection Send notification to stateLogMetrics.doneCh to stop metric collection +// StopMetricCollection Send notification to stateLogMetrics.stopPublish to stop metric collection func StopMetricCollection() { - select { - case metricsStateLogger.doneCh <- struct{}{}: - return - default: - logger.GetLogger().Log(logger.Info, "channel has already been closed.") - return - } + var wg sync.WaitGroup + wg.Add(2) + //Goroutine 1 + go func() { + defer wg.Done() + select { + case metricsStateLogger.doneCh <- struct{}{}: + logger.GetLogger().Log(logger.Info, "stopping metric collection for state-log data on channel metricsStateLogger.") + return + default: + logger.GetLogger().Log(logger.Info, "metricsStateLogger done channel has already been closed.") + return + } + }() + + go func() { + defer wg.Done() + select { + case totalConnectionStateDataLogger.stopPublish <- struct{}{}: + logger.GetLogger().Log(logger.Info, "stopping metric collection for state-log data on channel totalConnectionStateDataLogger.") + return + default: + logger.GetLogger().Log(logger.Info, "totalConnectionStateDataLogger.stopPublish channel has already been closed.") + return + } + }() + wg.Wait() } // AddDataPointToOTELStateDataChan Send data to stateLogMetrics.mStateDataChan channel func AddDataPointToOTELStateDataChan(dataPoint *WorkersStateData) { + defer func() { + if r := recover(); r != nil { + logger.GetLogger().Log(logger.Info, "Panic while adding data-points to StateDataChannel, Recovered from panic: ", r) + } + }() select { case metricsStateLogger.mStateDataChan <- dataPoint: return case <-time.After(time.Millisecond * 100): logger.GetLogger().Log(logger.Alert, "timeout occurred while adding record to stats data channel") default: - select { - case metricsStateLogger.mStateDataChan <- dataPoint: - return - default: - logger.GetLogger().Log(logger.Alert, "metricsStateLogger.mStateData channel closed or full while sending data") + logger.GetLogger().Log(logger.Alert, "metricsStateLogger.mStateData channel closed or full while sending data") } } + +// AddDataPointToTotalConnectionsDataChannel Send data to totalConnectionStateDataLogger.totalConnDataChannel channel +func AddDataPointToTotalConnectionsDataChannel(totalConnectionData *GaugeMetricData) { + defer func() { + if r := recover(); r != nil { + logger.GetLogger().Log(logger.Info, "Panic while adding data-points to totalConnDataChannel, Recovered from panic: ", r) } + }() + select { + case totalConnectionStateDataLogger.totalConnDataChannel <- totalConnectionData: + return + case <-time.After(time.Millisecond * 50): + logger.GetLogger().Log(logger.Alert, "timeout occurred while adding gauge data record to totalConnDataChannel channel") + default: + logger.GetLogger().Log(logger.Alert, "totalConnectionStateDataLogger.totalConnDataChannel channel 
closed or full while sending data") } } // Define Instrumentation for each metrics and register with StateLogMetrics -func (stateLogMetrics *StateLogMetrics) register() error { +func registerMetrics(stateLogMetrics *StateLogMetrics, totalConnectionsMetrics *TotalConnectionsGaugeData) error { //"init", "acpt", "wait", "busy", "schd", "fnsh", "quce", "asgn", "idle", "bklg", "strd", "cls" var err error - if stateLogMetrics.initState, err = stateLogMetrics.meter.Int64Histogram( + if stateLogMetrics.initState, err = stateLogMetrics.stateLogMeter.Int64Histogram( otelconfig.OTelConfigData.PopulateMetricNamePrefix(InitConnMetric), metric.WithDescription("Number of workers in init state"), ); err != nil { @@ -136,7 +187,7 @@ func (stateLogMetrics *StateLogMetrics) register() error { return err } - if stateLogMetrics.acptState, err = stateLogMetrics.meter.Int64Histogram( + if stateLogMetrics.acptState, err = stateLogMetrics.stateLogMeter.Int64Histogram( otelconfig.OTelConfigData.PopulateMetricNamePrefix(AccptConnMetric), metric.WithDescription("Number of workers in accept state"), ); err != nil { @@ -144,7 +195,7 @@ func (stateLogMetrics *StateLogMetrics) register() error { return err } - if stateLogMetrics.waitState, err = stateLogMetrics.meter.Int64Histogram( + if stateLogMetrics.waitState, err = stateLogMetrics.stateLogMeter.Int64Histogram( otelconfig.OTelConfigData.PopulateMetricNamePrefix(WaitConnMetric), metric.WithDescription("Number of workers in wait state"), ); err != nil { @@ -152,7 +203,7 @@ func (stateLogMetrics *StateLogMetrics) register() error { return err } - if stateLogMetrics.busyState, err = stateLogMetrics.meter.Int64Histogram( + if stateLogMetrics.busyState, err = stateLogMetrics.stateLogMeter.Int64Histogram( otelconfig.OTelConfigData.PopulateMetricNamePrefix(BusyConnMetric), metric.WithDescription("Number of workers in busy state"), ); err != nil { @@ -160,7 +211,7 @@ func (stateLogMetrics *StateLogMetrics) register() error { return err } - if stateLogMetrics.schdState, err = stateLogMetrics.meter.Int64Histogram( + if stateLogMetrics.schdState, err = stateLogMetrics.stateLogMeter.Int64Histogram( otelconfig.OTelConfigData.PopulateMetricNamePrefix(ScheduledConnMetric), metric.WithDescription("Number of workers in scheduled state"), ); err != nil { @@ -168,7 +219,7 @@ func (stateLogMetrics *StateLogMetrics) register() error { return err } - if stateLogMetrics.fnshState, err = stateLogMetrics.meter.Int64Histogram( + if stateLogMetrics.fnshState, err = stateLogMetrics.stateLogMeter.Int64Histogram( otelconfig.OTelConfigData.PopulateMetricNamePrefix(FinishedConnMetric), metric.WithDescription("Number of workers in finished state"), ); err != nil { @@ -176,7 +227,7 @@ func (stateLogMetrics *StateLogMetrics) register() error { return err } - if stateLogMetrics.quceState, err = stateLogMetrics.meter.Int64Histogram( + if stateLogMetrics.quceState, err = stateLogMetrics.stateLogMeter.Int64Histogram( otelconfig.OTelConfigData.PopulateMetricNamePrefix(QuiescedConnMetric), metric.WithDescription("Number of workers in quiesced state"), ); err != nil { @@ -184,7 +235,7 @@ func (stateLogMetrics *StateLogMetrics) register() error { return err } - if stateLogMetrics.asgnState, err = stateLogMetrics.meter.Int64Histogram( + if stateLogMetrics.asgnState, err = stateLogMetrics.stateLogMeter.Int64Histogram( otelconfig.OTelConfigData.PopulateMetricNamePrefix(AssignedConnMetric), metric.WithDescription("Number of workers in assigned state"), ); err != nil { @@ -192,7 +243,7 @@ func (stateLogMetrics 
*StateLogMetrics) register() error { return err } - if stateLogMetrics.idleState, err = stateLogMetrics.meter.Int64Histogram( + if stateLogMetrics.idleState, err = stateLogMetrics.stateLogMeter.Int64Histogram( otelconfig.OTelConfigData.PopulateMetricNamePrefix(IdleConnMetric), metric.WithDescription("Number of workers in idle state"), ); err != nil { @@ -200,7 +251,7 @@ func (stateLogMetrics *StateLogMetrics) register() error { return err } - if stateLogMetrics.bklgState, err = stateLogMetrics.meter.Int64Histogram( + if stateLogMetrics.bklgState, err = stateLogMetrics.stateLogMeter.Int64Histogram( otelconfig.OTelConfigData.PopulateMetricNamePrefix(BacklogConnMetric), metric.WithDescription("Number of workers in backlog state"), ); err != nil { @@ -208,7 +259,7 @@ func (stateLogMetrics *StateLogMetrics) register() error { return err } - if stateLogMetrics.strdState, err = stateLogMetrics.meter.Int64Histogram( + if stateLogMetrics.strdState, err = stateLogMetrics.stateLogMeter.Int64Histogram( otelconfig.OTelConfigData.PopulateMetricNamePrefix(StrdConnMetric), metric.WithDescription("Number of connections in stranded state"), ); err != nil { @@ -216,6 +267,22 @@ func (stateLogMetrics *StateLogMetrics) register() error { return err } + if stateLogMetrics.freePercentage, err = stateLogMetrics.stateLogMeter.Float64Histogram( + otelconfig.OTelConfigData.PopulateMetricNamePrefix(freePercentage), + metric.WithDescription("Free connections percentage"), + ); err != nil { + logger.GetLogger().Log(logger.Alert, "Failed to register gauge metric for free connections percentage state", err) + return err + } + + //Register Gauge metric + if totalConnectionsMetrics.totalConnections, err = totalConnectionsMetrics.stateLogMeter.Int64ObservableGauge( + otelconfig.OTelConfigData.PopulateMetricNamePrefix(totalConnections), + metric.WithDescription("Total Connections"), + ); err != nil { + logger.GetLogger().Log(logger.Alert, "Failed to register total connection gauge metric", err) + return err + } if err != nil { return err } @@ -255,7 +322,7 @@ mainloop: stateLogMetrics.sendMetricsDataToCollector(ctx, &stateLogTitle, stateLogsData) } case <-stateLogMetrics.doneCh: - logger.GetLogger().Log(logger.Info, "received stopped signal for processing statelog metric. "+ + logger.GetLogger().Log(logger.Alert, "received stopped signal for processing statelog metric. 
"+ "so stop sending data and closing data channel") close(stateLogMetrics.mStateDataChan) break mainloop @@ -268,7 +335,7 @@ mainloop: */ func (stateLogMetrics *StateLogMetrics) sendMetricsDataToCollector(ctx context.Context, stateLogTitle *string, stateLogsData map[string]map[string]int64) { for key, aggStatesData := range stateLogsData { - logger.GetLogger().Log(logger.Info, fmt.Sprintf("publishing metric with calculated max value and aggregation of gauge for shardid-workertype-instanceId: %s using datapoints size: %d", key, aggStatesData[Datapoints])) + logger.GetLogger().Log(logger.Info, fmt.Sprintf("publishing state logs histogram data for shardid-workertype-instanceId: %s using datapoints size: %d", key, aggStatesData[Datapoints])) commonLabels := []attribute.KeyValue{ attribute.Int(ShardId, int(aggStatesData[ShardId])), attribute.String(WorkerType, WorkerTypeMap[int(aggStatesData[WorkerType])]), @@ -286,6 +353,10 @@ func (stateLogMetrics *StateLogMetrics) sendMetricsDataToCollector(ctx context.C stateLogMetrics.fnshState.Record(ctx, aggStatesData["fnsh"], metric.WithAttributes(commonLabels...)) stateLogMetrics.quceState.Record(ctx, aggStatesData["quce"], metric.WithAttributes(commonLabels...)) + //2. Free Percentage + freePercentageVal := (float64(aggStatesData["acpt"]+aggStatesData["fnsh"]) / float64(aggStatesData["totalConnections"])) * 100 + stateLogMetrics.freePercentage.Record(ctx, freePercentageVal, metric.WithAttributes(commonLabels...)) + //2. Connection States stateLogMetrics.asgnState.Record(ctx, aggStatesData["asgn"], metric.WithAttributes(commonLabels...)) stateLogMetrics.idleState.Record(ctx, aggStatesData["idle"], metric.WithAttributes(commonLabels...)) @@ -293,3 +364,56 @@ func (stateLogMetrics *StateLogMetrics) sendMetricsDataToCollector(ctx context.C stateLogMetrics.strdState.Record(ctx, aggStatesData["strd"], metric.WithAttributes(commonLabels...)) } } + +// This registerCallbackForTotalConnectionsData register callback function to pull totalConnection data for each worker type +func (totalConnectionGauge *TotalConnectionsGaugeData) registerCallbackForTotalConnectionsData() error { + var regError error + totalConnectionGauge.registration, regError = totalConnectionGauge.stateLogMeter.RegisterCallback( + func(ctx context.Context, observer metric.Observer) error { + finalDataMap := make(map[string]*GaugeMetricData) + totalConLoop: + for { + select { + case totalConnData, dataPresent := <-totalConnectionGauge.totalConnDataChannel: + if !dataPresent { + logger.GetLogger().Log(logger.Info, "totalConnection gauge data channel 'totalConnDataChannel' has been closed.") + } else { + keyName := fmt.Sprintf("%d-%d-%d", totalConnData.ShardId, totalConnData.WorkerType, totalConnData.InstanceId) + finalDataMap[keyName] = totalConnData + } + break totalConLoop + case <-totalConnectionGauge.stopPublish: + logger.GetLogger().Log(logger.Alert, "received stopped signal for processing statelog total workers metric. "+ + "so stop sending data to totalConnectionGauge.totalConnDataChannel and closing data channel") + close(totalConnectionGauge.totalConnDataChannel) + if totalConnectionGauge.registration != nil { + logger.GetLogger().Log(logger.Info, "received stopped signal for processing statelog total worker metric. 
"+ + "so unregister callback function") + go totalConnectionGauge.registration.Unregister() + } + break totalConLoop + case <-ctx.Done(): + logger.GetLogger().Log(logger.Alert, "parent context has been canceled") + break totalConLoop + } + } + if len(finalDataMap) > 0 { + for key, dataPoint := range finalDataMap { + logger.GetLogger().Log(logger.Debug, fmt.Sprintf("publishing total connection gauge for key: %s, worker type: %s with datapoints value: %d", key, dataPoint.StateTitle, dataPoint.StateData)) + commonLabels := []attribute.KeyValue{ + attribute.Int(ShardId, dataPoint.ShardId), + attribute.Int(WorkerType, dataPoint.WorkerType), + attribute.Int(InstanceId, dataPoint.InstanceId), + attribute.String(OccWorkerParamName, dataPoint.StateTitle), + } + observer.ObserveInt64(totalConnectionGauge.totalConnections, dataPoint.StateData, metric.WithAttributes(commonLabels...)) + } + } + return nil + }, totalConnectionGauge.totalConnections) + if regError != nil { + logger.GetLogger().Log(logger.Alert, fmt.Sprintf("Failed to register total connection gauge for total worker metric. error %v", regError)) + return regError + } + return nil +} diff --git a/utility/logger/otel/test/state_logger_test.go b/utility/logger/otel/test/state_logger_test.go index a303599a..401f90b5 100644 --- a/utility/logger/otel/test/state_logger_test.go +++ b/utility/logger/otel/test/state_logger_test.go @@ -27,7 +27,7 @@ func initializeConsoleExporter() (*metric.MeterProvider, error) { Enabled: true, OtelMetricGRPC: false, OtelTraceGRPC: false, - ResolutionTimeInSec: 6, + ResolutionTimeInSec: 2, OTelErrorReportingInterval: 10, PoolName: "occ-testapp", MetricNamePrefix: "pp.occ", @@ -65,8 +65,8 @@ func initializeCustomOTelExporter(t *testing.T) func(ctx context.Context) error Enabled: true, OtelMetricGRPC: false, OtelTraceGRPC: false, - ResolutionTimeInSec: 3, - OTelErrorReportingInterval: 10, + ResolutionTimeInSec: 2, + OTelErrorReportingInterval: 2, PoolName: "occ-testapp", MetricNamePrefix: "pp.occ", MetricsURLPath: DefaultMetricsPath, @@ -99,8 +99,8 @@ func TestVerifyStateLogMetricsInitilization(t *testing.T) { logger.GetLogger().Log(logger.Alert, "Failed to initialize Metric Collection service") t.Fail() } + defer otellogger.StopMetricCollection() time.Sleep(15 * time.Second) - otellogger.StopMetricCollection() } func TestVerifyStateLogMetricsInitilizationAndContextWithTimeout(t *testing.T) { @@ -118,7 +118,7 @@ func TestVerifyStateLogMetricsInitilizationAndContextWithTimeout(t *testing.T) { err = otellogger.StartMetricsCollection(context.Background(), 5, otellogger.WithMetricProvider(otel.GetMeterProvider()), otellogger.WithAppName("occ-testapp")) defer otellogger.StopMetricCollection() - + time.Sleep(2 * time.Second) if err != nil { logger.GetLogger().Log(logger.Alert, "Failed to initialize Metric Collection service") t.Fail() @@ -143,32 +143,44 @@ func TestSendingStateLogMetrics(t *testing.T) { logger.GetLogger().Log(logger.Alert, "Failed to initialize Metric Collection service") t.Fail() } + defer otellogger.StopMetricCollection() + //"init", "acpt", "wait", "busy", "schd", "fnsh", "quce", "asgn", "idle", "bklg", "strd", "cls"} var stateData = map[string]int64{ - "init": 6, - "acpt": 10, - "wait": 5, - "busy": 2, - "idle": 5, - "bklg": 0, - "req": 5, - "resp": 5, + "init": 6, + "acpt": 10, + "wait": 5, + "busy": 2, + "idle": 5, + "bklg": 0, + "req": 5, + "resp": 5, + "fnsh": 10, + "totalConnections": 48, } - workersStateData := otellogger.WorkersStateData{ + workerStateInfo := otellogger.WorkerStateInfo{ + 
StateTitle: "rw", ShardId: 1, WorkerType: 1, InstanceId: 0, - StateData: stateData, + } + workersStateData := otellogger.WorkersStateData{ + WorkerStateInfo: &workerStateInfo, + StateData: stateData, + } + totalConData := otellogger.GaugeMetricData{ + WorkerStateInfo: &workerStateInfo, + StateData: 38, } otellogger.AddDataPointToOTELStateDataChan(&workersStateData) - - defer otellogger.StopMetricCollection() //Clean channel + otellogger.AddDataPointToTotalConnectionsDataChannel(&totalConData) logger.GetLogger().Log(logger.Info, "Data Sent successfully for instrumentation") - time.Sleep(5 * time.Second) - metricsData := mc.GetMetrics() - if len(metricsData) < 11 { - t.Fatalf("got %d, wanted %d", len(metricsData), 11) + time.Sleep(10 * time.Second) + metricsData := mc.metricsStorage.GetMetrics() + logger.GetLogger().Log(logger.Info, "total metrics count is: ", len(metricsData)) + if len(metricsData) < 13 { + t.Fatalf("got %d, wanted %d", len(metricsData), 13) } } @@ -184,78 +196,115 @@ func TestSendingStateLogMetricsConsoleExporter(t *testing.T) { logger.GetLogger().Log(logger.Alert, "Failed to initialize Metric Collection service") t.Fail() } - + defer otellogger.StopMetricCollection() var stateData = map[string]int64{ - "init": 0, - "acpt": 15, - "wait": 10, - "busy": 4, - "idle": 7, - "bklg": 0, + "init": 0, + "acpt": 15, + "wait": 10, + "busy": 4, + "idle": 7, + "bklg": 0, + "fnsh": 10, + "totalConnections": 46, } var stateData2 = map[string]int64{ - "init": 2, - "acpt": 15, - "wait": 10, - "busy": 4, - "idle": 8, - "bklg": 0, + "init": 3, + "acpt": 15, + "wait": 10, + "busy": 4, + "idle": 8, + "bklg": 0, + "fnsh": 10, + "totalConnections": 50, } - workersStateData := otellogger.WorkersStateData{ + workerStateInfo1 := otellogger.WorkerStateInfo{ + StateTitle: "rw", ShardId: 0, WorkerType: 0, InstanceId: 0, - StateData: stateData, } - workersStateData2 := otellogger.WorkersStateData{ + workerStateInfo2 := otellogger.WorkerStateInfo{ + StateTitle: "rw", ShardId: 2, WorkerType: 0, InstanceId: 0, - StateData: stateData2, + } + workersStateData := otellogger.WorkersStateData{ + WorkerStateInfo: &workerStateInfo1, + StateData: stateData, + } + + workersStateData2 := otellogger.WorkersStateData{ + WorkerStateInfo: &workerStateInfo2, + StateData: stateData2, + } + + totalWorkersStateData := otellogger.GaugeMetricData{ + WorkerStateInfo: &workerStateInfo1, + StateData: 36, + } + + totalWorkersStateData2 := otellogger.GaugeMetricData{ + WorkerStateInfo: &workerStateInfo2, + StateData: 40, } otellogger.AddDataPointToOTELStateDataChan(&workersStateData) + otellogger.AddDataPointToTotalConnectionsDataChannel(&totalWorkersStateData) time.Sleep(150 * time.Millisecond) otellogger.AddDataPointToOTELStateDataChan(&workersStateData2) + otellogger.AddDataPointToTotalConnectionsDataChannel(&totalWorkersStateData2) logger.GetLogger().Log(logger.Info, "Data Sent successfully for instrumentation") time.Sleep(2 * time.Second) var stateData3 = map[string]int64{ - "init": 0, - "acpt": 1, - "wait": 10, - "busy": 4, - "idle": 17, - "bklg": 0, + "init": 0, + "acpt": 1, + "wait": 10, + "busy": 4, + "idle": 17, + "bklg": 0, + "fnsh": 10, + "totalConnections": 42, } var stateData4 = map[string]int64{ - "init": 2, - "acpt": 0, - "wait": 10, - "busy": 4, - "idle": 8, - "bklg": 5, + "init": 2, + "acpt": 0, + "wait": 10, + "busy": 4, + "idle": 8, + "bklg": 5, + "fnsh": 8, + "totalConnections": 37, } + workersStateData3 := otellogger.WorkersStateData{ - ShardId: 0, - WorkerType: 0, - InstanceId: 0, - StateData: stateData3, + 
WorkerStateInfo: &workerStateInfo1, + StateData: stateData3, } workersStateData4 := otellogger.WorkersStateData{ - ShardId: 2, - WorkerType: 0, - InstanceId: 0, - StateData: stateData4, + WorkerStateInfo: &workerStateInfo2, + StateData: stateData4, + } + totalWorkersStateData3 := otellogger.GaugeMetricData{ + WorkerStateInfo: &workerStateInfo1, + StateData: 38, + } + + totalWorkersStateData4 := otellogger.GaugeMetricData{ + WorkerStateInfo: &workerStateInfo2, + StateData: 29, } otellogger.AddDataPointToOTELStateDataChan(&workersStateData3) + otellogger.AddDataPointToTotalConnectionsDataChannel(&totalWorkersStateData3) time.Sleep(150 * time.Millisecond) otellogger.AddDataPointToOTELStateDataChan(&workersStateData4) - otellogger.StopMetricCollection() + otellogger.AddDataPointToTotalConnectionsDataChannel(&totalWorkersStateData4) + time.Sleep(2 * time.Second) if err3 := cont.Shutdown(context.Background()); err3 != nil { logger.GetLogger().Log(logger.Info, "failed to stop the metric controller:", err3) } @@ -276,7 +325,7 @@ func TestOCCStateLogGeneratorWithRandomValues(t *testing.T) { logger.GetLogger().Log(logger.Alert, "Failed to initialize Metric Collection service") t.Fatalf("TestOCCStatelogGenerator failed with error %v", err) } - <-time.After(time.Second * time.Duration(30)) + <-time.After(time.Second * time.Duration(60)) } func dataGenerator() { @@ -295,11 +344,15 @@ mainloop: select { case <-timer.C: // Initialize statedata object - workerStatesData := otellogger.WorkersStateData{ + workerStateInfo1 := otellogger.WorkerStateInfo{ + StateTitle: "rw", ShardId: 0, - WorkerType: 1, + WorkerType: 0, InstanceId: 0, - StateData: make(map[string]int64), + } + workerStatesData := otellogger.WorkersStateData{ + WorkerStateInfo: &workerStateInfo1, + StateData: make(map[string]int64), } var numberofMetrics int = 11 var totalSum int = 100 @@ -312,10 +365,16 @@ mainloop: workerStatesData.StateData[metricNames[index]] = int64(value) tempSum += value } + workerStatesData.StateData["totalConnections"] = 100 + totalWorkersStateData := otellogger.GaugeMetricData{ + WorkerStateInfo: &workerStateInfo1, + StateData: 100, + } //Random index randIndex := rand.Intn(len(metricNames)) workerStatesData.StateData[metricNames[randIndex]] += int64(totalSum - tempSum) otellogger.AddDataPointToOTELStateDataChan(&workerStatesData) + otellogger.AddDataPointToTotalConnectionsDataChannel(&totalWorkersStateData) timer.Reset(waitTime) case <-ctx.Done(): logger.GetLogger().Log(logger.Info, "Timedout, so context closed") From 01b311c828b0028f808108e0c3b7029ed260222e Mon Sep 17 00:00:00 2001 From: Rajesh Samala Date: Wed, 18 Sep 2024 22:33:30 +0530 Subject: [PATCH 14/19] changing timeout for adding data to channel (#114) Co-authored-by: Rajesh S --- utility/logger/otel/state_logger.go | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/utility/logger/otel/state_logger.go b/utility/logger/otel/state_logger.go index a1e974a5..4ae5bb0c 100644 --- a/utility/logger/otel/state_logger.go +++ b/utility/logger/otel/state_logger.go @@ -83,14 +83,14 @@ func StartMetricsCollection(ctx context.Context, totalWorkersCount int, opt ...S metricsStateLogger = &StateLogMetrics{ stateLogMeter: stateLogMeter, hostname: hostName, - mStateDataChan: make(chan *WorkersStateData, totalWorkersCount*otelconfig.OTelConfigData.ResolutionTimeInSec*2), //currently OTEL polling interval hardcoded as 10. 
From 688a1be96a3fd2363ddc1fa55c57995bf2e94199 Mon Sep 17 00:00:00 2001 From: Rajesh Samala Date: Thu, 19 Sep 2024 00:21:34 +0530 Subject: [PATCH 15/19] updating logger level timeout logs (#115) Co-authored-by: Rajesh S --- tests/unittest/otel_basic/main_test.go | 2 +- utility/logger/otel/state_logger.go | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/tests/unittest/otel_basic/main_test.go b/tests/unittest/otel_basic/main_test.go index 231631bd..c7032b07 100644 --- a/tests/unittest/otel_basic/main_test.go +++ b/tests/unittest/otel_basic/main_test.go @@ -103,7 +103,7 @@ func TestOTELMetricsBasic(t *testing.T) { cancel() conn.Close() - time.Sleep(15 * time.Second) + time.Sleep(25 * time.Second) logFilePath := filepath.Join(testutil.GetOTELLogDirPath(), "otel_collector.log") count := testutil.RegexCountFile("{\"key\":\"application\",\"value\":{\"stringValue\":\"hera-test\"}", logFilePath) diff --git a/utility/logger/otel/state_logger.go b/utility/logger/otel/state_logger.go index 4ae5bb0c..51828191 100644 --- a/utility/logger/otel/state_logger.go +++ b/utility/logger/otel/state_logger.go @@ -151,7 +151,7 @@ func AddDataPointToOTELStateDataChan(dataPoint *WorkersStateData) { case metricsStateLogger.mStateDataChan <- dataPoint: return case <-time.After(time.Second * 1): - logger.GetLogger().Log(logger.Alert, "timeout occurred while 
adding record to stats data channel") + logger.GetLogger().Log(logger.Info, "timeout occurred while adding record to stats data channel") return } } @@ -167,9 +167,8 @@ func AddDataPointToTotalConnectionsDataChannel(totalConnectionData *GaugeMetricD case totalConnectionStateDataLogger.totalConnDataChannel <- totalConnectionData: return case <-time.After(time.Second * 1): - logger.GetLogger().Log(logger.Alert, "timeout occurred while adding gauge data record to totalConnDataChannel channel") + logger.GetLogger().Log(logger.Info, "timeout occurred while adding gauge data record to totalConnDataChannel channel") return - } } From 83956b8099d3823b774a6862c7c653f5b311e8b7 Mon Sep 17 00:00:00 2001 From: Rajesh Samala Date: Thu, 19 Sep 2024 12:09:34 +0530 Subject: [PATCH 16/19] incorporate review comments for adding CAL for channel timeout (#116) Co-authored-by: Rajesh S --- lib/statelog.go | 4 ++-- utility/logger/otel/state_logger.go | 11 +++++++++-- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/lib/statelog.go b/lib/statelog.go index 9bd366a3..5e644ee9 100644 --- a/lib/statelog.go +++ b/lib/statelog.go @@ -844,8 +844,8 @@ func (sl *StateLog) genReport() { WorkerStateInfo: &workerStateInfoData, StateData: workerStatesData.StateData["totalConnections"], } - otel_logger.AddDataPointToOTELStateDataChan(&workerStatesData) - otel_logger.AddDataPointToTotalConnectionsDataChannel(&totalConnectionData) + go otel_logger.AddDataPointToOTELStateDataChan(&workerStatesData) + go otel_logger.AddDataPointToTotalConnectionsDataChannel(&totalConnectionData) } else { for i := 0; i < (MaxWorkerState + MaxConnState - 1); i++ { buf.WriteString(fmt.Sprintf("%6d", stateCnt[i])) diff --git a/utility/logger/otel/state_logger.go b/utility/logger/otel/state_logger.go index 51828191..cf768c04 100644 --- a/utility/logger/otel/state_logger.go +++ b/utility/logger/otel/state_logger.go @@ -3,6 +3,7 @@ package otel import ( "context" "fmt" + "github.com/paypal/hera/cal" "github.com/paypal/hera/utility/logger" otelconfig "github.com/paypal/hera/utility/logger/otel/config" "go.opentelemetry.io/otel" @@ -83,14 +84,14 @@ func StartMetricsCollection(ctx context.Context, totalWorkersCount int, opt ...S metricsStateLogger = &StateLogMetrics{ stateLogMeter: stateLogMeter, hostname: hostName, - mStateDataChan: make(chan *WorkersStateData, totalWorkersCount*otelconfig.OTelConfigData.ResolutionTimeInSec*4), //currently OTEL polling interval hardcoded as 10. Size of bufferred channel = totalWorkersCount * pollingInterval * 2, + mStateDataChan: make(chan *WorkersStateData, totalWorkersCount*otelconfig.OTelConfigData.ResolutionTimeInSec*5), //currently OTEL polling interval hardcoded as 10. Size of bufferred channel = totalWorkersCount * pollingInterval * 2, doneCh: make(chan struct{}), } totalConnectionStateDataLogger = &TotalConnectionsGaugeData{ stateLogMeter: stateLogMeter, hostname: hostName, - totalConnDataChannel: make(chan *GaugeMetricData, totalWorkersCount*otelconfig.OTelConfigData.ResolutionTimeInSec*4), //currently OTEL polling interval hardcoded as 10. Size of bufferred channel = totalWorkersCount * pollingInterval * 2, + totalConnDataChannel: make(chan *GaugeMetricData, totalWorkersCount*otelconfig.OTelConfigData.ResolutionTimeInSec*20), //currently OTEL polling interval hardcoded as 10. Size of bufferred channel = totalWorkersCount * pollingInterval * 2, stopPublish: make(chan struct{}), } err = registerMetrics(metricsStateLogger, totalConnectionStateDataLogger) @@ -152,6 +153,9 @@ func AddDataPointToOTELStateDataChan(dataPoint *WorkersStateData) { return case <-time.After(time.Second * 1): logger.GetLogger().Log(logger.Info, "timeout occurred while adding record to stats data channel") + event := cal.NewCalEvent("OTEL", "DATA_TIMEOUT", "1", "timeout occurred while adding record to mStateDataChan channel") + event.AddDataInt("loggedTime", time.Now().Unix()) + event.Completed() return } } @@ -168,6 +172,9 @@ func AddDataPointToTotalConnectionsDataChannel(totalConnectionData *GaugeMetricD return case <-time.After(time.Second * 1): logger.GetLogger().Log(logger.Info, "timeout occurred while adding gauge data record to totalConnDataChannel channel") + event := cal.NewCalEvent("OTEL", "DATA_TIMEOUT", "1", "timeout occurred while adding gauge data record to totalConnDataChannel channel") + event.AddDataInt("loggedTime", time.Now().Unix()) + event.Completed() return } }
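For context on the next patch: the gauge callback has to drain everything the statelog goroutine buffered since the last collection cycle and then return without blocking, keeping only the most recent value per shard/worker/instance key. A minimal sketch of that drain loop, with the GaugeMetricData plumbing simplified to a hypothetical point type:

package main

import "fmt"

type point struct {
	key   string // e.g. "shardId-workerType-instanceId"
	value int64
}

// drainLatest empties whatever is currently buffered and keeps the most
// recent value per key; the default case returns as soon as the channel
// is empty instead of blocking inside the metrics callback.
func drainLatest(ch <-chan point) map[string]int64 {
	latest := make(map[string]int64)
loop:
	for {
		select {
		case p, ok := <-ch:
			if !ok {
				break loop // channel closed by the shutdown path
			}
			latest[p.key] = p.value
		default:
			break loop // nothing left to read this cycle
		}
	}
	return latest
}

func main() {
	ch := make(chan point, 4)
	ch <- point{"0-0-0", 36}
	ch <- point{"0-0-0", 38} // newer value for the same key wins
	ch <- point{"2-0-0", 29}
	fmt.Println(drainLatest(ch)) // map[0-0-0:38 2-0-0:29]
}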
From 83956b8099d3823b774a6862c7c653f5b311e8b7 Mon Sep 17 00:00:00 2001
From: Rajesh Samala
Date: Thu, 19 Sep 2024 12:09:34 +0530
Subject: [PATCH 16/19] incorporate review comments for adding CAL for channel
 timeout (#116)

Co-authored-by: Rajesh S
---
 lib/statelog.go                     |  4 ++--
 utility/logger/otel/state_logger.go | 11 +++++++++--
 2 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/lib/statelog.go b/lib/statelog.go
index 9bd366a3..5e644ee9 100644
--- a/lib/statelog.go
+++ b/lib/statelog.go
@@ -844,8 +844,8 @@ func (sl *StateLog) genReport() {
 				WorkerStateInfo: &workerStateInfoData,
 				StateData:       workerStatesData.StateData["totalConnections"],
 			}
-			otel_logger.AddDataPointToOTELStateDataChan(&workerStatesData)
-			otel_logger.AddDataPointToTotalConnectionsDataChannel(&totalConectionData)
+			go otel_logger.AddDataPointToOTELStateDataChan(&workerStatesData)
+			go otel_logger.AddDataPointToTotalConnectionsDataChannel(&totalConectionData)
 		} else {
 			for i := 0; i < (MaxWorkerState + MaxConnState - 1); i++ {
 				buf.WriteString(fmt.Sprintf("%6d", stateCnt[i]))
diff --git a/utility/logger/otel/state_logger.go b/utility/logger/otel/state_logger.go
index 51828191..cf768c04 100644
--- a/utility/logger/otel/state_logger.go
+++ b/utility/logger/otel/state_logger.go
@@ -3,6 +3,7 @@ package otel
 import (
 	"context"
 	"fmt"
+	"github.com/paypal/hera/cal"
 	"github.com/paypal/hera/utility/logger"
 	otelconfig "github.com/paypal/hera/utility/logger/otel/config"
 	"go.opentelemetry.io/otel"
@@ -83,14 +84,14 @@ func StartMetricsCollection(ctx context.Context, totalWorkersCount int, opt ...S
 	metricsStateLogger = &StateLogMetrics{
 		stateLogMeter: stateLogMeter,
 		hostname:      hostName,
-		mStateDataChan: make(chan *WorkersStateData, totalWorkersCount*otelconfig.OTelConfigData.ResolutionTimeInSec*4), //currently OTEL polling interval hardcoded as 10. Size of bufferred channel = totalWorkersCount * pollingInterval * 2,
+		mStateDataChan: make(chan *WorkersStateData, totalWorkersCount*otelconfig.OTelConfigData.ResolutionTimeInSec*5), //currently OTEL polling interval hardcoded as 10. Size of bufferred channel = totalWorkersCount * pollingInterval * 2,
 		doneCh:        make(chan struct{}),
 	}
 
 	totalConnectionStateDataLogger = &TotalConnectionsGaugeData{
 		stateLogMeter: stateLogMeter,
 		hostname:      hostName,
-		totalConnDataChannel: make(chan *GaugeMetricData, totalWorkersCount*otelconfig.OTelConfigData.ResolutionTimeInSec*4), //currently OTEL polling interval hardcoded as 10. Size of bufferred channel = totalWorkersCount * pollingInterval * 2,
+		totalConnDataChannel: make(chan *GaugeMetricData, totalWorkersCount*otelconfig.OTelConfigData.ResolutionTimeInSec*20), //currently OTEL polling interval hardcoded as 10. Size of bufferred channel = totalWorkersCount * pollingInterval * 2,
 		stopPublish:          make(chan struct{}),
 	}
 	err = registerMetrics(metricsStateLogger, totalConnectionStateDataLogger)
@@ -152,6 +153,9 @@ func AddDataPointToOTELStateDataChan(dataPoint *WorkersStateData) {
 		return
 	case <-time.After(time.Second * 1):
 		logger.GetLogger().Log(logger.Info, "timeout occurred while adding record to stats data channel")
+		event := cal.NewCalEvent("OTEL", "DATA_TIMEOUT", "1", "timeout occurred while adding record to mStateDataChan channel")
+		event.AddDataInt("loggedTime", time.Now().Unix())
+		event.Completed()
 		return
 	}
 }
@@ -168,6 +172,9 @@ func AddDataPointToTotalConnectionsDataChannel(totalConnectionData *GaugeMetricD
 		return
 	case <-time.After(time.Second * 1):
 		logger.GetLogger().Log(logger.Info, "timeout occurred while adding guage data record to totalConnDataChannel channel")
+		event := cal.NewCalEvent("OTEL", "DATA_TIMEOUT", "1", "timeout occurred while adding guage data record to totalConnDataChannel channel")
+		event.AddDataInt("loggedTime", time.Now().Unix())
+		event.Completed()
 		return
 	}
 }
 
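With patch 16, every dropped datapoint also produces a CAL event, so sustained channel backpressure becomes visible to monitoring rather than only to local logs, and the two producer calls in lib/statelog.go move onto goroutines so a timeout burns a goroutine for a second rather than stalling genReport itself. The emission pattern, pulled into a hypothetical helper (the cal calls mirror the DATA_TIMEOUT events above; the helper itself is not part of the patch):

// reportDataTimeout raises a CAL event for a datapoint that could not be
// enqueued before the one-second send timeout fired. Status "1" marks the
// event as a failure, matching the events added in this patch.
func reportDataTimeout(channelName string) {
	event := cal.NewCalEvent("OTEL", "DATA_TIMEOUT", "1", "timeout occurred while adding record to "+channelName)
	event.AddDataInt("loggedTime", time.Now().Unix())
	event.Completed()
}
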
From fa56c28217fa437250f51c52c265d06ba5dcd79d Mon Sep 17 00:00:00 2001
From: Rajesh Samala
Date: Thu, 19 Sep 2024 22:16:52 +0530
Subject: [PATCH 17/19] changes for fixing fetching data from channel (#117)

Co-authored-by: Rajesh S
---
 utility/logger/otel/state_logger.go | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/utility/logger/otel/state_logger.go b/utility/logger/otel/state_logger.go
index cf768c04..f9ab0465 100644
--- a/utility/logger/otel/state_logger.go
+++ b/utility/logger/otel/state_logger.go
@@ -84,14 +84,14 @@ func StartMetricsCollection(ctx context.Context, totalWorkersCount int, opt ...S
 	metricsStateLogger = &StateLogMetrics{
 		stateLogMeter: stateLogMeter,
 		hostname:      hostName,
-		mStateDataChan: make(chan *WorkersStateData, totalWorkersCount*otelconfig.OTelConfigData.ResolutionTimeInSec*5), //currently OTEL polling interval hardcoded as 10. Size of bufferred channel = totalWorkersCount * pollingInterval * 2,
+		mStateDataChan: make(chan *WorkersStateData, totalWorkersCount*otelconfig.OTelConfigData.ResolutionTimeInSec*2), //currently OTEL polling interval hardcoded as 10. Size of bufferred channel = totalWorkersCount * pollingInterval * 2,
 		doneCh:        make(chan struct{}),
 	}
 
 	totalConnectionStateDataLogger = &TotalConnectionsGaugeData{
 		stateLogMeter: stateLogMeter,
 		hostname:      hostName,
-		totalConnDataChannel: make(chan *GaugeMetricData, totalWorkersCount*otelconfig.OTelConfigData.ResolutionTimeInSec*20), //currently OTEL polling interval hardcoded as 10. Size of bufferred channel = totalWorkersCount * pollingInterval * 2,
+		totalConnDataChannel: make(chan *GaugeMetricData, totalWorkersCount*otelconfig.OTelConfigData.ResolutionTimeInSec*2), //currently OTEL polling interval hardcoded as 10. Size of bufferred channel = totalWorkersCount * pollingInterval * 2,
 		stopPublish:          make(chan struct{}),
 	}
 	err = registerMetrics(metricsStateLogger, totalConnectionStateDataLogger)
@@ -331,6 +331,9 @@ mainloop:
 				"so stop sending data and closing data channel")
 			close(stateLogMetrics.mStateDataChan)
 			break mainloop
+		case <-ctx.Done():
+			logger.GetLogger().Log(logger.Info, "parent context has been closed for metric poll, so exiting loop")
+			break mainloop
 		}
 	}
 }
@@ -382,11 +385,11 @@ func (totalConnectionGauge *TotalConnectionsGaugeData) registerCallbackForTotalC
 		case totalConnData, dataPresent := <-totalConnectionGauge.totalConnDataChannel:
 			if !dataPresent {
 				logger.GetLogger().Log(logger.Info, "totalConnection gauge data channel 'totalConnDataChannel' has been closed.")
+				break totalConLoop
 			} else {
 				keyName := fmt.Sprintf("%d-%d-%d", totalConnData.ShardId, totalConnData.WorkerType, totalConnData.InstanceId)
 				finalDataMap[keyName] = totalConnData
 			}
-			break totalConLoop
 		case <-totalConnectionGauge.stopPublish:
 			logger.GetLogger().Log(logger.Alert, "received stopped signal for processing statelog total workers metric. "+
 				"so stop sending data to totalConnectionGauge.totalConnDataChannel and closing data channel")
@@ -398,13 +401,16 @@ func (totalConnectionGauge *TotalConnectionsGaugeData) registerCallbackForTotalC
 			}
 			break totalConLoop
 		case <-ctx.Done():
-			logger.GetLogger().Log(logger.Alert, "parent context has been canceled")
+			logger.GetLogger().Log(logger.Info, "context closed so exiting from totalConnDataChannel data loop")
+			break totalConLoop
+		default:
+			logger.GetLogger().Log(logger.Info, "totalConnDataChannel channel is empty")
 			break totalConLoop
 		}
 	}
 	if len(finalDataMap) > 0 {
 		for key, dataPoint := range finalDataMap {
-			logger.GetLogger().Log(logger.Debug, fmt.Sprintf("publishing total connection gauge for key: %s, worker type: %s with datapoints value: %d", key, dataPoint.StateTitle, dataPoint.StateData))
+			logger.GetLogger().Log(logger.Info, fmt.Sprintf("publishing total connection gauge for key: %s, worker type: %s with datapoints value: %d", key, dataPoint.StateTitle, dataPoint.StateData))
 			commonLabels := []attribute.KeyValue{
 				attribute.Int(ShardId, dataPoint.ShardId),
 				attribute.Int(WorkerType, dataPoint.WorkerType),
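The callback fix above has two parts: the misplaced break totalConLoop no longer exits the loop after a single datapoint, and the new default: case ends the drain as soon as the buffer is empty, so the observable-gauge callback never blocks. Reduced to its essentials (a sketch assuming the package's GaugeMetricData type and an fmt import; not the literal patch code):

// drainLatest consumes everything currently buffered on ch, keeping only the
// newest datapoint per shard/worker/instance key. It returns when the buffer
// is empty (default) or the channel is closed, and never blocks, which
// matters because it runs inside a metrics observation callback.
func drainLatest(ch <-chan *GaugeMetricData) map[string]*GaugeMetricData {
	latest := make(map[string]*GaugeMetricData)
loop:
	for {
		select {
		case d, ok := <-ch:
			if !ok {
				break loop // closed: publish what we collected so far
			}
			latest[fmt.Sprintf("%d-%d-%d", d.ShardId, d.WorkerType, d.InstanceId)] = d
		default:
			break loop // empty: stop without blocking the callback
		}
	}
	return latest
}
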
From d1a90592334623bd147ce415147949c62052fd0f Mon Sep 17 00:00:00 2001
From: Rajesh Samala
Date: Thu, 26 Sep 2024 16:41:57 +0530
Subject: [PATCH 18/19] changes adding cal event during registration fail and
 add retry mechanism for callback registration (#118)

* changes adding cal event during registration fail and failing statelog initialization

* update cal messages

* update cal messages

---------

Co-authored-by: Rajesh S
---
 lib/statelog.go                      |  6 +++++-
 utility/logger/otel/error_handler.go |  2 +-
 utility/logger/otel/state_logger.go  | 25 ++++++++++++++++++++-----
 3 files changed, 26 insertions(+), 7 deletions(-)

diff --git a/lib/statelog.go b/lib/statelog.go
index 5e644ee9..3d00a4ff 100644
--- a/lib/statelog.go
+++ b/lib/statelog.go
@@ -586,7 +586,11 @@ func (sl *StateLog) init() error {
 			otel_logger.WithAppName(otelconfig.OTelConfigData.PoolName))
 		if stateStartErr != nil {
-			logger.GetLogger().Log(logger.Alert, "failed to start metric collection agent for statelogs", stateStartErr)
+			logger.GetLogger().Log(logger.Alert, "failed to start metric collection agent in statelogs", stateStartErr)
+			event := cal.NewCalEvent("OTEL", "STATELOG_INIT", "1", fmt.Sprintf("msg=%v", stateStartErr.Error()))
+			event.AddDataInt("loggedTime", time.Now().Unix())
+			event.Completed()
+			return stateStartErr //In case OTEL integration enabled if OTEL initialization failed then we need to fail startup
 		}
 	}
 
 	//
diff --git a/utility/logger/otel/error_handler.go b/utility/logger/otel/error_handler.go
index 440e9341..39fb082d 100644
--- a/utility/logger/otel/error_handler.go
+++ b/utility/logger/otel/error_handler.go
@@ -71,7 +71,7 @@ func (handler OTelErrorHandler) processOTelErrorsMap() {
 // logOTelErrorCalEvent It takes of logging OTEL
 func (handler OTelErrorHandler) logOTelErrorCalEvent(errorDataMap map[string]*OTelErrorData) {
 	for _, errorData := range errorDataMap {
-		event := cal.NewCalEvent("OTEL", "CONNECTION", "2", fmt.Sprintf("%v", errorData.err))
+		event := cal.NewCalEvent("OTEL", "METRIC_PUBLISH", "1", fmt.Sprintf("msg=%v", errorData.err))
 		event.AddDataInt("occurredTime", errorData.occurredTime)
 		event.AddDataInt("loggedTime", time.Now().Unix())
 		event.Completed()
diff --git a/utility/logger/otel/state_logger.go b/utility/logger/otel/state_logger.go
index f9ab0465..2a4aa430 100644
--- a/utility/logger/otel/state_logger.go
+++ b/utility/logger/otel/state_logger.go
@@ -15,6 +15,7 @@ import (
 )
 
 const defaultAppName string = "occ"
+const oTelCallbackRegistrationRetryCount int = 3 //Max retry configuration for registering callback function
 
 // This lock prevents a race between batch observer and instrument registration
 var registerStateMetrics sync.Once
@@ -98,15 +99,29 @@ func StartMetricsCollection(ctx context.Context, totalWorkersCount int, opt ...S
 		if err != nil {
 			logger.GetLogger().Log(logger.Alert, "Failed to register state metrics collector", err)
 		} else {
-			err = totalConnectionStateDataLogger.registerCallbackForTotalConnectionsData()
-			if err != nil {
-				logger.GetLogger().Log(logger.Alert, "Failed to register callback for totalConnectionStateDataLogger gauge metric", err)
+			for retryCount := 0; retryCount < oTelCallbackRegistrationRetryCount; retryCount++ {
+				err = totalConnectionStateDataLogger.registerCallbackForTotalConnectionsData()
+				if err != nil {
+					logger.GetLogger().Log(logger.Alert, "Failed to register callback for totalConnectionStateDataLogger gauge metric: ", err, "number of retries: ", retryCount)
+				} else {
+					logger.GetLogger().Log(logger.Info, "registered callback for totalConnectionStateDataLogger gauge metrics")
+					break
+				}
 			}
 			if err == nil {
 				go metricsStateLogger.startStateLogMetricsPoll(ctx) //Goroutine to poll HERA states data
 			}
 		}
 	})
 
+	if err != nil {
+		calEvent := cal.NewCalEvent("OTEL", "METRIC_REGISTRATION", "1", fmt.Sprintf("msg=failed register statelog metrics, error: %v", err))
+		calEvent.AddDataInt("loggedTime", time.Now().Unix())
+		calEvent.Completed()
+	} else {
+		calEvent := cal.NewCalEvent("OTEL", "METRIC_REGISTRATION", "0", "msg=state-log metrics registration success")
+		calEvent.AddDataInt("loggedTime", time.Now().Unix())
+		calEvent.Completed()
+	}
 	return err
 }
@@ -153,9 +168,9 @@ func AddDataPointToOTELStateDataChan(dataPoint *WorkersStateData) {
 		return
 	case <-time.After(time.Second * 1):
 		logger.GetLogger().Log(logger.Info, "timeout occurred while adding record to stats data channel")
-		event := cal.NewCalEvent("OTEL", "DATA_TIMEOUT", "1", "timeout occurred while adding record to mStateDataChan channel")
+		event := cal.NewCalEvent("OTEL", "DATA_TIMEOUT", "1", "msg=timeout occurred while adding record to mStateDataChan channel")
 		event.AddDataInt("loggedTime", time.Now().Unix())
 		event.Completed()
 		return
 	}
 }
@@ -172,9 +187,9 @@ func AddDataPointToTotalConnectionsDataChannel(totalConnectionData *GaugeMetricD
 		return
 	case <-time.After(time.Second * 1):
 		logger.GetLogger().Log(logger.Info, "timeout occurred while adding guage data record to totalConnDataChannel channel")
-		event := cal.NewCalEvent("OTEL", "DATA_TIMEOUT", "1", "timeout occurred while adding guage data record to totalConnDataChannel channel")
+		event := cal.NewCalEvent("OTEL", "DATA_TIMEOUT", "1", "msg=timeout occurred while adding guage data record to totalConnDataChannel channel")
 		event.AddDataInt("loggedTime", time.Now().Unix())
 		event.Completed()
 		return
 	}
 }
 
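Patch 18 turns a single registration attempt into a bounded retry, reports the final outcome to CAL either way, and fails statelog startup when OTEL is enabled but cannot initialize. The retry shape, generalized (an illustrative helper; the patch inlines the loop directly):

// retryN calls fn up to attempts times and stops at the first success,
// returning the last error if every attempt fails.
func retryN(attempts int, fn func() error) error {
	var err error
	for i := 0; i < attempts; i++ {
		if err = fn(); err == nil {
			return nil
		}
	}
	return err
}

Used as, for example: err := retryN(3, totalConnectionStateDataLogger.registerCallbackForTotalConnectionsData). A per-attempt backoff would be a natural extension but is not part of this patch.
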
From a32e9eabbd590a84a19fdac923e3f05958c10231 Mon Sep 17 00:00:00 2001
From: Rajesh S
Date: Thu, 26 Sep 2024 17:11:28 +0530
Subject: [PATCH 19/19] changes to skip otel test in opensource as spawning
 otel agent is failing in opensource

---
 tests/unittest/testall.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/unittest/testall.sh b/tests/unittest/testall.sh
index 92fce600..c56a3f2a 100755
--- a/tests/unittest/testall.sh
+++ b/tests/unittest/testall.sh
@@ -1,5 +1,5 @@
 overall=0
-for d in `ls -F tests/unittest | grep /$ | sed -e "s,/,," | egrep -v '(mysql_recycle|log_checker_initdb|testutil|rac_maint|mysql_direct|failover)'`
+for d in `ls -F tests/unittest | grep /$ | sed -e "s,/,," | egrep -v '(mysql_recycle|log_checker_initdb|testutil|rac_maint|mysql_direct|failover|otel_basic|otel_incorrect_endpoint|otel_sharding|otel_with_skip_cal)'`
 do
 	echo ==== $d
 	pushd tests/unittest/$d