From e4100b0d84459476fa92efacd34c5b7b1c4b8318 Mon Sep 17 00:00:00 2001 From: Rajesh S Date: Fri, 27 Sep 2024 00:05:03 +0530 Subject: [PATCH] changes for otel integration --- .github/workflows/go.yml | 4 +- go.mod | 23 +- go.sum | 52 +- lib/config.go | 152 +++++- lib/main.go | 30 +- lib/racmaint.go | 14 +- lib/statelog.go | 117 +++-- lib/workerpool.go | 50 +- lib/workerpool_test.go | 4 +- tests/unittest/bindLess/main_test.go | 2 +- tests/unittest/bindThrottle/main_test.go | 2 +- tests/unittest/otel_basic/main_test.go | 136 ++++++ .../otel_incorrect_endpoint/main_test.go | 119 +++++ tests/unittest/otel_sharding/main_test.go | 198 ++++++++ .../unittest/otel_with_skip_cal/main_test.go | 114 +++++ tests/unittest/rac_maint_async/main_test.go | 122 +++++ tests/unittest/shutdown_cleanup/main_test.go | 110 +++++ tests/unittest/testall.sh | 2 +- tests/unittest/testutil/defs.go | 44 ++ tests/unittest/testutil/main.go | 7 + tests/unittest/testutil/setup.go | 102 +++- utility/logger/otel/config/otelconfig.go | 66 +++ utility/logger/otel/defs.go | 171 +++++++ utility/logger/otel/error_handler.go | 79 ++++ utility/logger/otel/logger.go | 382 +++++++++++++++ utility/logger/otel/state_logger.go | 445 ++++++++++++++++++ utility/logger/otel/test/mock_collector.go | 341 ++++++++++++++ utility/logger/otel/test/state_logger_test.go | 392 +++++++++++++++ 28 files changed, 3169 insertions(+), 111 deletions(-) create mode 100644 tests/unittest/otel_basic/main_test.go create mode 100644 tests/unittest/otel_incorrect_endpoint/main_test.go create mode 100644 tests/unittest/otel_sharding/main_test.go create mode 100644 tests/unittest/otel_with_skip_cal/main_test.go create mode 100644 tests/unittest/rac_maint_async/main_test.go create mode 100644 tests/unittest/shutdown_cleanup/main_test.go create mode 100644 tests/unittest/testutil/defs.go create mode 100644 utility/logger/otel/config/otelconfig.go create mode 100644 utility/logger/otel/defs.go create mode 100644 utility/logger/otel/error_handler.go create mode 100644 utility/logger/otel/logger.go create mode 100644 utility/logger/otel/state_logger.go create mode 100644 utility/logger/otel/test/mock_collector.go create mode 100644 utility/logger/otel/test/state_logger_test.go diff --git a/.github/workflows/go.yml b/.github/workflows/go.yml index 75165e64..10002217 100644 --- a/.github/workflows/go.yml +++ b/.github/workflows/go.yml @@ -16,7 +16,7 @@ jobs: - name: Set up Go uses: actions/setup-go@v4 with: - go-version: 1.18.2 + go-version: 1.20.14 - name: Install GO Modules run: | go mod tidy @@ -41,7 +41,7 @@ jobs: - name: Set up Go uses: actions/setup-go@v4 with: - go-version: 1.18.2 + go-version: 1.20.14 - name: Install GO Modules run: | go mod tidy diff --git a/go.mod b/go.mod index 372ef230..dd63842c 100644 --- a/go.mod +++ b/go.mod @@ -1,14 +1,33 @@ module github.com/paypal/hera -go 1.18 +go 1.20 require ( github.com/go-sql-driver/mysql v1.7.1 github.com/godror/godror v0.26.3 github.com/lib/pq v1.10.3 + go.opentelemetry.io/otel v1.24.0 + go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.24.0 + go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp v1.24.0 + go.opentelemetry.io/otel/exporters/stdout/stdoutmetric v1.24.0 + go.opentelemetry.io/otel/metric v1.24.0 + go.opentelemetry.io/otel/sdk v1.24.0 + go.opentelemetry.io/otel/sdk/metric v1.24.0 + go.opentelemetry.io/proto/otlp v1.2.0 + google.golang.org/protobuf v1.34.1 ) require ( + github.com/cenkalti/backoff/v4 v4.3.0 // indirect github.com/go-logfmt/logfmt v0.5.0 // indirect - 
google.golang.org/protobuf v1.33.0 // indirect + github.com/go-logr/logr v1.4.2 // indirect + github.com/go-logr/stdr v1.2.2 // indirect + github.com/grpc-ecosystem/grpc-gateway/v2 v2.20.0 // indirect + go.opentelemetry.io/otel/trace v1.24.0 // indirect + golang.org/x/net v0.26.0 // indirect + golang.org/x/sys v0.21.0 // indirect + golang.org/x/text v0.16.0 // indirect + google.golang.org/genproto/googleapis/api v0.0.0-20240520151616-dc85e6b867a5 // indirect + google.golang.org/genproto/googleapis/rpc v0.0.0-20240515191416-fc5f0ca64291 // indirect + google.golang.org/grpc v1.64.0 // indirect ) diff --git a/go.sum b/go.sum index 8cabfc71..0a629aea 100644 --- a/go.sum +++ b/go.sum @@ -1,19 +1,61 @@ +github.com/cenkalti/backoff/v4 v4.3.0 h1:MyRJ/UdXutAwSAT+s3wNd7MfTIcy71VQueUuFK343L8= +github.com/cenkalti/backoff/v4 v4.3.0/go.mod h1:Y3VNntkOUPxTVeUxJ/G5vcM//AlwfmyYozVcomhLiZE= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/go-logfmt/logfmt v0.5.0 h1:TrB8swr/68K7m9CcGut2g3UOihhbcbiMAYiuTXdEih4= github.com/go-logfmt/logfmt v0.5.0/go.mod h1:wCYkCAKZfumFQihp8CzCvQ3paCTfi41vtzG1KdI/P7A= +github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= +github.com/go-logr/logr v1.4.2 h1:6pFjapn8bFcIbiKo3XT4j/BhANplGihG6tvd+8rYgrY= +github.com/go-logr/logr v1.4.2/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= +github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= +github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE= github.com/go-sql-driver/mysql v1.7.1 h1:lUIinVbN1DY0xBg0eMOzmmtGoHwWBbvnWubQUrtU8EI= github.com/go-sql-driver/mysql v1.7.1/go.mod h1:OXbVy3sEdcQ2Doequ6Z5BW6fXNQTmx+9S1MCJN5yJMI= github.com/godror/godror v0.26.3 h1:V+z+Q/OBGgmmYzuAwyJzpcn4LsPF4Ev0xHAea68V00c= github.com/godror/godror v0.26.3/go.mod h1:1QCn6oXh3r+IlB3DLE8V6qkHXLSHd18a3Hw7szQ9/3Y= github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk= -github.com/google/go-cmp v0.5.5 h1:Khx7svrCpmxxtHBq5j2mp/xVjsi8hQMfNLvJFAlrGgU= github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= +github.com/grpc-ecosystem/grpc-gateway/v2 v2.20.0 h1:bkypFPDjIYGfCYD5mRBvpqxfYX1YCS1PXdKYWi8FsN0= +github.com/grpc-ecosystem/grpc-gateway/v2 v2.20.0/go.mod h1:P+Lt/0by1T8bfcF3z737NnSbmxQAppXMRziHUxPOC8k= github.com/lib/pq v1.10.3 h1:v9QZf2Sn6AmjXtQeFpdoq/eaNtYP6IN+7lcrygsIAtg= github.com/lib/pq v1.10.3/go.mod h1:AlVN5x4E4T544tWzH6hKfbfQvm3HdbOxrmggDNAPY9o= -golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e h1:vcxGaoTs7kV8m5Np9uUNQin4BrLOthgV7252N8V+FwY= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk= +go.opentelemetry.io/otel v1.24.0 h1:0LAOdjNmQeSTzGBzduGe/rU4tZhMwL5rWgtp9Ku5Jfo= +go.opentelemetry.io/otel v1.24.0/go.mod h1:W7b9Ozg4nkF5tWI5zsXkaKKDjdVjpD4oAt9Qi/MArHo= +go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.24.0 h1:f2jriWfOdldanBwS9jNBdeOKAQN7b4ugAMaNu1/1k9g= +go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.24.0/go.mod h1:B+bcQI1yTY+N0vqMpoZbEN7+XU4tNM0DmUiOwebFJWI= +go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp v1.24.0 h1:mM8nKi6/iFQ0iqst80wDHU2ge198Ye/TfN0WBS5U24Y= +go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp v1.24.0/go.mod 
h1:0PrIIzDteLSmNyxqcGYRL4mDIo8OTuBAOI/Bn1URxac= +go.opentelemetry.io/otel/exporters/stdout/stdoutmetric v1.24.0 h1:JYE2HM7pZbOt5Jhk8ndWZTUWYOVift2cHjXVMkPdmdc= +go.opentelemetry.io/otel/exporters/stdout/stdoutmetric v1.24.0/go.mod h1:yMb/8c6hVsnma0RpsBMNo0fEiQKeclawtgaIaOp2MLY= +go.opentelemetry.io/otel/metric v1.24.0 h1:6EhoGWWK28x1fbpA4tYTOWBkPefTDQnb8WSGXlc88kI= +go.opentelemetry.io/otel/metric v1.24.0/go.mod h1:VYhLe1rFfxuTXLgj4CBiyz+9WYBA8pNGJgDcSFRKBco= +go.opentelemetry.io/otel/sdk v1.24.0 h1:YMPPDNymmQN3ZgczicBY3B6sf9n62Dlj9pWD3ucgoDw= +go.opentelemetry.io/otel/sdk v1.24.0/go.mod h1:KVrIYw6tEubO9E96HQpcmpTKDVn9gdv35HoYiQWGDFg= +go.opentelemetry.io/otel/sdk/metric v1.24.0 h1:yyMQrPzF+k88/DbH7o4FMAs80puqd+9osbiBrJrz/w8= +go.opentelemetry.io/otel/sdk/metric v1.24.0/go.mod h1:I6Y5FjH6rvEnTTAYQz3Mmv2kl6Ek5IIrmwTLqMrrOE0= +go.opentelemetry.io/otel/trace v1.24.0 h1:CsKnnL4dUAr/0llH9FKuc698G04IrpWV0MQA/Y1YELI= +go.opentelemetry.io/otel/trace v1.24.0/go.mod h1:HPc3Xr/cOApsBI154IU0OI0HJexz+aw5uPdbs3UCjNU= +go.opentelemetry.io/proto/otlp v1.2.0 h1:pVeZGk7nXDC9O2hncA6nHldxEjm6LByfA2aN8IOkz94= +go.opentelemetry.io/proto/otlp v1.2.0/go.mod h1:gGpR8txAl5M03pDhMC79G6SdqNV26naRm/KDsgaHD8A= +golang.org/x/net v0.26.0 h1:soB7SVo0PWrY4vPW/+ay0jKDNScG2X9wFeYlXIvJsOQ= +golang.org/x/net v0.26.0/go.mod h1:5YKkiSynbBIh3p6iOc/vibscux0x38BZDkn8sCUPxHE= golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543 h1:E7g+9GITq07hpfrRu66IVDexMakfv52eLZ2CXBWiKr4= +golang.org/x/sync v0.7.0 h1:YsImfSBoP9QPYL0xyKJPq0gcaJdG3rInoqxTWbfQu9M= +golang.org/x/sys v0.21.0 h1:rF+pYz3DAGSQAxAu1CbC7catZg4ebC4UIeIhKxBZvws= +golang.org/x/sys v0.21.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/text v0.16.0 h1:a94ExnEXNtEwYLGJSIUxnWoxoRz/ZcCsV63ROupILh4= +golang.org/x/text v0.16.0/go.mod h1:GhwF1Be+LQoKShO3cGOHzqOgRrGaYc9AvblQOmPVHnI= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +google.golang.org/genproto/googleapis/api v0.0.0-20240520151616-dc85e6b867a5 h1:P8OJ/WCl/Xo4E4zoe4/bifHpSmmKwARqyqE4nW6J2GQ= +google.golang.org/genproto/googleapis/api v0.0.0-20240520151616-dc85e6b867a5/go.mod h1:RGnPtTG7r4i8sPlNyDeikXF99hMM+hN6QMm4ooG9g2g= +google.golang.org/genproto/googleapis/rpc v0.0.0-20240515191416-fc5f0ca64291 h1:AgADTJarZTBqgjiUzRgfaBchgYB3/WFTC80GPwsMcRI= +google.golang.org/genproto/googleapis/rpc v0.0.0-20240515191416-fc5f0ca64291/go.mod h1:EfXuqaE1J41VCDicxHzUDm+8rk+7ZdXzHV0IhO/I6s0= +google.golang.org/grpc v1.64.0 h1:KH3VH9y/MgNQg1dE7b3XfVK0GsPSIzJwdF617gUSbvY= +google.golang.org/grpc v1.64.0/go.mod h1:oxjF8E3FBnjp+/gVFYdWacaLDx9na1aqy9oovLpxQYg= google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw= google.golang.org/protobuf v1.27.1/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc= -google.golang.org/protobuf v1.33.0 h1:uNO2rsAINq/JlFpSdYEKIZ0uKD/R9cpdv0T+yoGwGmI= -google.golang.org/protobuf v1.33.0/go.mod h1:c6P6GXX6sHbq/GpV6MGZEdwhWPcYBgnhAHhKbcUYpos= +google.golang.org/protobuf v1.34.1 h1:9ddQBjfCyZPOHPUiPxpYESBLc+T8P3E+Vo4IbKZgFWg= +google.golang.org/protobuf v1.34.1/go.mod h1:c6P6GXX6sHbq/GpV6MGZEdwhWPcYBgnhAHhKbcUYpos= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= diff --git a/lib/config.go b/lib/config.go index 3e8afe59..20ae3511 100644 --- a/lib/config.go +++ b/lib/config.go @@ -23,13 +23,20 @@ import ( "github.com/paypal/hera/cal" 
"github.com/paypal/hera/config" "github.com/paypal/hera/utility/logger" + otelconfig "github.com/paypal/hera/utility/logger/otel/config" + "os" "path/filepath" "strings" "sync/atomic" ) -//The Config contains all the static configuration +const ( + mux_config_cal_name = "OCC_CONFIG" + oracle_worker_config_cal_name = "OCC_ORACLE_WORKER_CONFIG" +) + +// The Config contains all the static configuration type Config struct { CertChainFile string KeyFile string // leave blank for no SSL @@ -179,6 +186,14 @@ type Config struct { // Max desired percentage of healthy workers for the worker pool MaxDesiredHealthyWorkerPct int + + // Oracle Worker Configs + EnableCache bool + EnableHeartBeat bool + EnableQueryReplaceNL bool + EnableBindHashLogging bool + EnableSessionVariables bool + UseNonBlocking bool } // The OpsConfig contains the configuration that can be modified during run time @@ -222,7 +237,7 @@ func parseMapStrStr(encoded string) map[string]string { } // InitConfig initializes the configuration, both the static configuration (from hera.txt) and the dynamic configuration -func InitConfig() error { +func InitConfig(poolName string) error { currentDir, abserr := filepath.Abs(filepath.Dir(os.Args[0])) if abserr != nil { @@ -230,7 +245,6 @@ func InitConfig() error { } else { currentDir = currentDir + "/" } - filename := currentDir + "hera.txt" cdb, err := config.NewTxtConfig(filename) @@ -308,6 +322,7 @@ func InitConfig() error { } gAppConfig.EnableSharding = cdb.GetOrDefaultBool("enable_sharding", false) + gAppConfig.UseShardMap = cdb.GetOrDefaultBool("use_shardmap", true) gAppConfig.NumOfShards = cdb.GetOrDefaultInt("num_shards", 1) if gAppConfig.EnableSharding == false || gAppConfig.UseShardMap == false { @@ -366,6 +381,14 @@ func InitConfig() error { // TODO: gAppConfig.NumStdbyDbs = 1 + // Fetch Oracle worker configurations.. The defaults must be same between oracle worker and here for accurate logging. 
+ gAppConfig.EnableCache = cdb.GetOrDefaultBool("enable_cache", false) + gAppConfig.EnableHeartBeat = cdb.GetOrDefaultBool("enable_heart_beat", false) + gAppConfig.EnableQueryReplaceNL = cdb.GetOrDefaultBool("enable_query_replace_nl", true) + gAppConfig.EnableBindHashLogging = cdb.GetOrDefaultBool("enable_bind_hash_logging", false) + gAppConfig.EnableSessionVariables = cdb.GetOrDefaultBool("enable_session_variables", false) + gAppConfig.UseNonBlocking = cdb.GetOrDefaultBool("use_non_blocking", false) + var numWorkers int numWorkers = 6 //err = config.InitOpsConfigWithName("../opscfg/hera.txt") @@ -465,9 +488,39 @@ func InitConfig() error { gAppConfig.MaxDesiredHealthyWorkerPct = 90 } + //Initialize OTEL configs + initializeOTELConfigs(cdb, poolName) + if logger.GetLogger().V(logger.Info) { + otelconfig.OTelConfigData.Dump() + } return nil } +// This function takes care of initialize OTEL configuration +func initializeOTELConfigs(cdb config.Config, poolName string) { + otelconfig.OTelConfigData = &otelconfig.OTelConfig{} + //TODO initialize the values + otelconfig.OTelConfigData.Enabled = cdb.GetOrDefaultBool("enable_otel", false) + otelconfig.OTelConfigData.SkipCalStateLog = cdb.GetOrDefaultBool("skip_cal_statelog", false) + otelconfig.OTelConfigData.MetricNamePrefix = cdb.GetOrDefaultString("otel_metric_prefix", "pp.occ") + otelconfig.OTelConfigData.Host = cdb.GetOrDefaultString("otel_agent_host", "localhost") + otelconfig.OTelConfigData.MetricsPort = cdb.GetOrDefaultInt("otel_agent_metrics_port", 4318) + otelconfig.OTelConfigData.TracePort = cdb.GetOrDefaultInt("otel_agent_trace_port", 4318) + otelconfig.OTelConfigData.OtelMetricGRPC = cdb.GetOrDefaultBool("otel_agent_use_grpc_metric", false) + otelconfig.OTelConfigData.OtelTraceGRPC = cdb.GetOrDefaultBool("otel_agent_use_grpc_trace", false) + otelconfig.OTelConfigData.MetricsURLPath = cdb.GetOrDefaultString("otel_agent_metrics_uri", "") + otelconfig.OTelConfigData.TraceURLPath = cdb.GetOrDefaultString("otel_agent_trace_uri", "") + otelconfig.OTelConfigData.PoolName = poolName + otelconfig.OTelConfigData.UseTls = cdb.GetOrDefaultBool("otel_use_tls", false) + otelconfig.OTelConfigData.TLSCertPath = cdb.GetOrDefaultString("otel_tls_cert_path", "") + otelconfig.OTelConfigData.ResolutionTimeInSec = cdb.GetOrDefaultInt("otel_resolution_time_in_sec", 1) + otelconfig.OTelConfigData.ExporterTimeout = cdb.GetOrDefaultInt("otel_exporter_time_in_sec", 30) + otelconfig.OTelConfigData.EnableRetry = cdb.GetOrDefaultBool("otel_enable_exporter_retry", false) + otelconfig.OTelConfigData.ResourceType = gAppConfig.StateLogPrefix + otelconfig.OTelConfigData.OTelErrorReportingInterval = cdb.GetOrDefaultInt("otel_error_reporting_interval_in_sec", 60) + otelconfig.SetOTelIngestToken(cdb.GetOrDefaultString("otel_ingest_token", "")) +} + func LogOccConfigs() { whiteListConfigs := map[string]map[string]interface{}{ "BACKLOG": { @@ -480,6 +533,18 @@ func LogOccConfigs() { "bouncer_startup_delay": gAppConfig.BouncerStartupDelay, "bouncer_poll_interval_ms": gAppConfig.BouncerPollInterval, }, + "OTEL": { + "enable_otel": otelconfig.OTelConfigData.Enabled, + "otel_use_tls": otelconfig.OTelConfigData.UseTls, + "skip_cal_statelog": otelconfig.OTelConfigData.SkipCalStateLog, + "otel_agent_host": otelconfig.OTelConfigData.Host, + "otel_agent_metrics_port": otelconfig.OTelConfigData.MetricsPort, + "otel_agent_trace_port": otelconfig.OTelConfigData.TracePort, + "otel_agent_metrics_uri": otelconfig.OTelConfigData.MetricsURLPath, + "otel_agent_trace_uri": 
otelconfig.OTelConfigData.TraceURLPath, + "otel_resolution_time_in_sec": otelconfig.OTelConfigData.ResolutionTimeInSec, + "otel_error_reporting_interval_in_sec": otelconfig.OTelConfigData.OTelErrorReportingInterval, + }, "PROFILE": { "enable_profile": gAppConfig.EnableProfile, "profile_http_port": gAppConfig.ProfileHTTPPort, @@ -500,17 +565,15 @@ func LogOccConfigs() { "hostname_prefix": gAppConfig.HostnamePrefix, "sharding_cross_keys_err": gAppConfig.ShardingCrossKeysErr, //"enable_sql_rewrite", // not found anywhere? - "sharding_algo": gAppConfig.ShardingAlgoHash, - "cfg_from_tns_override_num_shards": gAppConfig.CfgFromTnsOverrideNumShards, + "sharding_algo": gAppConfig.ShardingAlgoHash, }, "TAF": { - "enable_taf": gAppConfig.EnableTAF, - "cfg_from_tns_override_taf": gAppConfig.CfgFromTnsOverrideTaf, - "testing_enable_dml_taf": gAppConfig.TestingEnableDMLTaf, - "taf_timeout_ms": gAppConfig.TAFTimeoutMs, - "taf_bin_duration": gAppConfig.TAFBinDuration, - "taf_allow_slow_every_x": gAppConfig.TAFAllowSlowEveryX, - "taf_normally_slow_count": gAppConfig.TAFNormallySlowCount, + "enable_taf": gAppConfig.EnableTAF, + "testing_enable_dml_taf": gAppConfig.TestingEnableDMLTaf, + "taf_timeout_ms": gAppConfig.TAFTimeoutMs, + "taf_bin_duration": gAppConfig.TAFBinDuration, + "taf_allow_slow_every_x": gAppConfig.TAFAllowSlowEveryX, + "taf_normally_slow_count": gAppConfig.TAFNormallySlowCount, }, "BIND-EVICTION": { "child.executable": gAppConfig.ChildExecutable, @@ -551,24 +614,42 @@ func LogOccConfigs() { "max_desire_healthy_worker_pct": gAppConfig.MaxDesiredHealthyWorkerPct, }, "R-W-SPLIT": { - "readonly_children_pct": gAppConfig.ReadonlyPct, - "cfg_from_tns_override_rw_split": gAppConfig.CfgFromTnsOverrideRWSplit, + "readonly_children_pct": gAppConfig.ReadonlyPct, }, "RAC": { "management_table_prefix": gAppConfig.ManagementTablePrefix, "rac_sql_interval": gAppConfig.RacMaintReloadInterval, "rac_restart_window": gAppConfig.RacRestartWindow, }, - "NO-CATEGORY": { + "GENERAL-CONFIGURATIONS": { "database_type": gAppConfig.DatabaseType, // Oracle = 0; MySQL=1; POSTGRES=2 - "cfg_from_tns": gAppConfig.CfgFromTns, "log_level": gOpsConfig.logLevel, "high_load_pct": gAppConfig.HighLoadPct, "init_limit_pct": gAppConfig.InitLimitPct, "num_standby_dbs": gAppConfig.NumStdbyDbs, }, + "ENABLE_CFG_FROM_TNS": { + "cfg_from_tns": gAppConfig.CfgFromTns, + "cfg_from_tns_override_num_shards": gAppConfig.CfgFromTnsOverrideNumShards, + "cfg_from_tns_override_taf": gAppConfig.CfgFromTnsOverrideTaf, + "cfg_from_tns_override_rw_split": gAppConfig.CfgFromTnsOverrideRWSplit, + }, + "STATEMENT-CACHE": { + "enable_cache": gAppConfig.EnableCache, + "enable_heart_beat": gAppConfig.EnableHeartBeat, + "enable_query_replace_nl": gAppConfig.EnableQueryReplaceNL, + }, + "SESSION-VARIABLES": { + "enable_session_variables": gAppConfig.EnableSessionVariables, + }, + "BIND-HASH-LOGGING": { + "enable_bind_hash_logging": gAppConfig.EnableBindHashLogging, + }, + "KEEP-ALIVE": { + "use_non_blocking": gAppConfig.UseNonBlocking, + }, } - + calName := mux_config_cal_name for feature, configs := range whiteListConfigs { switch feature { case "BACKLOG": @@ -579,6 +660,10 @@ func LogOccConfigs() { if !gAppConfig.BouncerEnabled { continue } + case "OTEL": + if !otelconfig.OTelConfigData.Enabled { + continue + } case "PROFILE": if !gAppConfig.EnableProfile { continue @@ -595,24 +680,51 @@ func LogOccConfigs() { if gAppConfig.ReadonlyPct == 0 { continue } - case "SOFT-EVICTION", "BIND-EVICTION": + case "SATURATION-RECOVERY", "BIND-EVICTION": if 
GetSatRecoverThrottleRate() <= 0 { continue } + case "SOFT-EVICTION": + if GetSatRecoverThrottleRate() <= 0 && gAppConfig.SoftEvictionProbability <= 0 { + continue + } case "MANUAL-RATE-LIMITER": if !gAppConfig.EnableQueryBindBlocker { continue } + case "ENABLE_CFG_FROM_TNS": + if !gAppConfig.CfgFromTns { + continue + } + case "STATEMENT-CACHE": + if !gAppConfig.EnableCache { + continue + } + calName = oracle_worker_config_cal_name + case "SESSION-VARIABLES": + if !gAppConfig.EnableSessionVariables { + continue + } + calName = oracle_worker_config_cal_name + case "BIND-HASH-LOGGING": + if !gAppConfig.EnableBindHashLogging { + continue + } + calName = oracle_worker_config_cal_name + case "KEEP-ALIVE": + if !gAppConfig.UseNonBlocking { + continue + } + calName = oracle_worker_config_cal_name } - evt := cal.NewCalEvent("OCC_CONFIG", fmt.Sprintf(feature), cal.TransOK, "") + evt := cal.NewCalEvent(calName, fmt.Sprintf(feature), cal.TransOK, "") for cfg, val := range configs { s := fmt.Sprintf("%v", val) evt.AddDataStr(cfg, s) } evt.Completed() } - } // CheckOpsConfigChange checks if the ops config file needs to be reloaded and reloads it if necessary. diff --git a/lib/main.go b/lib/main.go index 30b70f94..fac28fe7 100644 --- a/lib/main.go +++ b/lib/main.go @@ -18,8 +18,11 @@ package lib import ( + "context" "flag" "fmt" + otellogger "github.com/paypal/hera/utility/logger/otel" + otelconfig "github.com/paypal/hera/utility/logger/otel/config" "math/rand" "os" "os/signal" @@ -36,6 +39,9 @@ func Run() { signal.Ignore(syscall.SIGPIPE) mux_process_id := syscall.Getpid() + // Defer release resource in case of any abnormal exit of for application + defer handlePanicAndReleaseResource(mux_process_id) + namePtr := flag.String("name", "", "module name in v$session table") flag.Parse() @@ -50,7 +56,7 @@ func Run() { rand.Seed(time.Now().Unix()) - err := InitConfig() + err := InitConfig(*namePtr) if err != nil { if logger.GetLogger().V(logger.Alert) { logger.GetLogger().Log(logger.Alert, "failed to initialize configuration:", err.Error()) @@ -106,7 +112,24 @@ func Run() { caltxn.SetCorrelationID("runtxn") caltxn.Completed() - GetStateLog().SetStartTime(time.Now()) + //Initialize OTEL + if otelconfig.OTelConfigData.Enabled { + shutdownFunc, err := otellogger.Init(context.Background()) + if err != nil { + logger.GetLogger().Log(logger.Alert, fmt.Sprintf("failed to initialize OTEL, err: %v", err)) + evt := cal.NewCalEvent("OTEL_INIT", *namePtr, "2", fmt.Sprintf("erro: %v", err)) + evt.Completed() + if otelconfig.OTelConfigData.SkipCalStateLog { + logger.GetLogger().Log(logger.Alert, fmt.Sprintf("OTEL initialization failed. Only the OTEL state-log has been enabled. 
It is not safe to start the server")) + FullShutdown() + } + } + GetStateLog().SetStartTime(time.Now()) + defer otellogger.StopMetricCollection() //Stop sending metrics data + defer shutdownFunc(context.Background()) //During exit from mux, this will takecare of OTEL providers clean-up + } else { + GetStateLog().SetStartTime(time.Now()) + } go func() { sleep := time.Duration(GetConfig().ConfigReloadTimeMs) @@ -192,9 +215,6 @@ func Run() { cal.ReleaseCxtResource() }() - // Defer release resource in case of any abnormal exit of for application - defer handlePanicAndReleaseResource(mux_process_id) - <-GetWorkerBrokerInstance().Stopped() } diff --git a/lib/racmaint.go b/lib/racmaint.go index 27c3bbea..0221db44 100644 --- a/lib/racmaint.go +++ b/lib/racmaint.go @@ -61,12 +61,14 @@ func InitRacMaint(cmdLineModuleName string) { interval := GetConfig().RacMaintReloadInterval if interval > 0 { for i := 0; i < GetConfig().NumOfShards; i++ { - go racMaintMain(i, interval, cmdLineModuleName) + shardIndex := i //Address the behavior called variable capture. + go racMaintMain(shardIndex, interval, cmdLineModuleName) } } } // racMaintMain wakes up every n seconds (configured in "rac_sql_interval") and reads the table +// // [ManagementTablePrefix]_maint table to see if maintenance is requested func racMaintMain(shard int, interval int, cmdLineModuleName string) { if logger.GetLogger().V(logger.Debug) { @@ -109,8 +111,8 @@ func racMaintMain(shard int, interval int, cmdLineModuleName string) { } /* - racMaint is the main function for RAC maintenance processing, being called regularly. - When maintenance is planned, it calls workerpool.RacMaint to start the actuall processing +racMaint is the main function for RAC maintenance processing, being called regularly. +When maintenance is planned, it calls workerpool.RacMaint to start the actuall processing */ func racMaint(ctx context.Context, shard int, db *sql.DB, racSQL string, cmdLineModuleName string, prev map[racCfgKey]racCfg) { // @@ -220,12 +222,12 @@ func racMaint(ctx context.Context, shard int, db *sql.DB, racSQL string, cmdLine workerpool, err = GetWorkerBrokerInstance().GetWorkerPool(wtypeRW, 0, shard) } if err == nil { - go workerpool.RacMaint(racReq) + workerpool.RacMaint(racReq) } if GetConfig().ReadonlyPct > 0 { - workerpool, err := GetWorkerBrokerInstance().GetWorkerPool(wtypeRO, 0, shard) + workerpool, err = GetWorkerBrokerInstance().GetWorkerPool(wtypeRO, 0, shard) if err == nil { - go workerpool.RacMaint(racReq) + workerpool.RacMaint(racReq) } } prev[cfgKey] = row diff --git a/lib/statelog.go b/lib/statelog.go index 67a1e9ae..3d00a4ff 100644 --- a/lib/statelog.go +++ b/lib/statelog.go @@ -19,12 +19,17 @@ package lib import ( "bytes" + "context" "errors" "fmt" + otel_logger "github.com/paypal/hera/utility/logger/otel" + otelconfig "github.com/paypal/hera/utility/logger/otel/config" + "go.opentelemetry.io/otel" "log" "os" "path/filepath" "strconv" + "strings" "sync" "time" @@ -71,11 +76,9 @@ type ConnStateInfo struct { perStateCnt []int } -// // StateLog is exposed as a singleton. all stateful resources are protected behind a // message channel that sychronizes incoming messages. user should not call any of // the internal functions that are not threadsafe. -// type StateLog struct { // // array of maps for different workertypes with each value holding a two dimension @@ -101,6 +104,9 @@ type StateLog struct { // title printed in workertype column (leftmost). 
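The InitRacMaint change in the racmaint.go hunk above swaps the raw loop variable for a per-iteration copy (shardIndex := i) before launching each goroutine. Under Go 1.20 — the version this patch moves the module to — a for-loop variable is a single shared variable, which is a hazard for closures that reference it; arguments passed directly in a go statement are evaluated at spawn time, so the copy here is largely defensive, but the idiom is worth a standalone sketch:

```go
package main

import (
	"fmt"
	"sync"
)

func main() {
	var wg sync.WaitGroup

	// Closure capture: before Go 1.22 all three goroutines share one i and may
	// all print 3, because they read i only after the loop has finished.
	for i := 0; i < 3; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			fmt.Println("captured:", i)
		}()
	}
	wg.Wait()

	// The idiom used in InitRacMaint: copy the loop variable per iteration so
	// each goroutine observes its own value.
	for i := 0; i < 3; i++ {
		shardIndex := i
		wg.Add(1)
		go func() {
			defer wg.Done()
			fmt.Println("copied:", shardIndex)
		}()
	}
	wg.Wait()
}
```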
// mTypeTitles [](map[HeraWorkerType]([]string)) + + //OTEL statelog occ-worker dimension titles + workerDimensionTitle map[string]string // // header row (state) // @@ -132,6 +138,9 @@ type StateLog struct { // start time since epoch in ns // mServerStartTime int64 + + //worker pool configurations + workerPoolCfg []map[HeraWorkerType]*WorkerPoolCfg } // StateEventType is an event published by proxy when state changes. @@ -469,7 +478,7 @@ func (sl *StateLog) init() error { if GetWorkerBrokerInstance() == nil { return errors.New("broker not initialized") } - workerpoolcfg := GetWorkerBrokerInstance().GetWorkerPoolCfgs() + sl.workerPoolCfg = GetWorkerBrokerInstance().GetWorkerPoolCfgs() // // allocate array for each shard @@ -477,11 +486,16 @@ func (sl *StateLog) init() error { sl.mWorkerStates = make([]map[HeraWorkerType][][]*WorkerStateInfo, sl.maxShardSize) sl.mConnStates = make([]map[HeraWorkerType][]*ConnStateInfo, sl.maxShardSize) sl.mTypeTitles = make([]map[HeraWorkerType][]string, sl.maxShardSize) + sl.workerDimensionTitle = make(map[string]string) sl.mLastReqCnt = make([]map[HeraWorkerType][]int64, sl.maxShardSize) sl.mLastRspCnt = make([]map[HeraWorkerType][]int64, sl.maxShardSize) // // for each shard, initialize map // + var totalWorkersCount int //Use this value to initialize bufferred channel for statelog metrics + // + // for each shard, initialize map + // for s := 0; s < sl.maxShardSize; s++ { sl.mWorkerStates[s] = make(map[HeraWorkerType][][]*WorkerStateInfo, wtypeTotalCount) sl.mConnStates[s] = make(map[HeraWorkerType][]*ConnStateInfo, wtypeTotalCount) @@ -492,9 +506,9 @@ func (sl *StateLog) init() error { // for each workertype, initialize two dimension array // for t := 0; t < int(wtypeTotalCount); t++ { - instCnt := workerpoolcfg[s][HeraWorkerType(t)].instCnt - workerCnt := workerpoolcfg[s][HeraWorkerType(t)].maxWorkerCnt - + instCnt := sl.workerPoolCfg[s][HeraWorkerType(t)].instCnt + workerCnt := sl.workerPoolCfg[s][HeraWorkerType(t)].maxWorkerCnt + totalWorkersCount += workerCnt sl.mWorkerStates[s][HeraWorkerType(t)] = make([][]*WorkerStateInfo, instCnt) sl.mConnStates[s][HeraWorkerType(t)] = make([]*ConnStateInfo, instCnt) sl.mTypeTitles[s][HeraWorkerType(t)] = make([]string, instCnt) @@ -522,11 +536,10 @@ func (sl *StateLog) init() error { } } } - // // prepare horizontal (state) and vertical (workertype) titles. 
// - var shardEnabled = (GetConfig().EnableSharding && (GetConfig().NumOfShards >= 1)) + var shardEnabled = GetConfig().EnableSharding && (GetConfig().NumOfShards >= 1) var buf bytes.Buffer buf.WriteString("-----------") for i := 0; i < (MaxWorkerState + MaxConnState - 1); i++ { @@ -543,7 +556,7 @@ func (sl *StateLog) init() error { for s := 0; s < sl.maxShardSize; s++ { for t := wtypeRW; t < wtypeTotalCount; t++ { var suffix = ".sh" + strconv.Itoa(s) - instCnt := workerpoolcfg[s][HeraWorkerType(t)].instCnt + instCnt := sl.workerPoolCfg[s][HeraWorkerType(t)].instCnt for i := 0; i < instCnt; i++ { sl.mTypeTitles[s][t][i] = typeTitlePrefix[t] @@ -553,6 +566,7 @@ func (sl *StateLog) init() error { if shardEnabled { sl.mTypeTitles[s][t][i] += suffix } + sl.workerDimensionTitle[sl.mTypeTitles[s][t][i]] = strings.Replace(sl.mTypeTitles[s][t][i], GetConfig().StateLogPrefix, otelconfig.OTelConfigData.PoolName, 1) } } } @@ -565,6 +579,20 @@ func (sl *StateLog) init() error { sl.mEventChann = make(chan StateEvent, 3000) + if otelconfig.OTelConfigData.Enabled { + // Initialize statelog_metrics to send metrics information currently we are ignoring registration object returned from this call + stateStartErr := otel_logger.StartMetricsCollection(context.Background(), totalWorkersCount, + otel_logger.WithMetricProvider(otel.GetMeterProvider()), + otel_logger.WithAppName(otelconfig.OTelConfigData.PoolName)) + + if stateStartErr != nil { + logger.GetLogger().Log(logger.Alert, "failed to start metric collection agent in statelogs", stateStartErr) + event := cal.NewCalEvent("OTEL", "STATELOG_INIT", "1", fmt.Sprintf("msg=%v", stateStartErr.Error())) + event.AddDataInt("loggedTime", time.Now().Unix()) + event.Completed() + return stateStartErr //In case OTEL integration enabled if OTEL initialization failed then we need to fail startup + } + } // // start periodical reporting // @@ -750,6 +778,19 @@ func (sl *StateLog) genReport() { if workerCnt == 0 { continue } + + workerStateInfoData := otel_logger.WorkerStateInfo{ + StateTitle: sl.workerDimensionTitle[sl.mTypeTitles[s][HeraWorkerType(t)][n]], + ShardId: s, + WorkerType: t, + InstanceId: n, + } + // Initialize statedata object + workerStatesData := otel_logger.WorkersStateData{ + WorkerStateInfo: &workerStateInfoData, + StateData: make(map[string]int64), + } + // // count all request/response for all workers under the instance // @@ -790,35 +831,47 @@ func (sl *StateLog) genReport() { stateCnt[MaxWorkerState+c] = 0 } } - // - // write collection into calheartbeat(cased out) and log (oneline). 
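The StartMetricsCollection call above hands the state-log whatever provider is registered globally through otel.GetMeterProvider(); the provider construction itself lives in utility/logger/otel/logger.go, which this excerpt does not include. As a rough sketch — assuming the default localhost:4318 OTLP/HTTP endpoint and using only dependencies this patch adds to go.mod — a periodic-export pipeline of the kind the config implies (otel_resolution_time_in_sec maps naturally to the reader interval) could be wired as below; the counter name is hypothetical:

```go
package main

import (
	"context"
	"log"
	"time"

	"go.opentelemetry.io/otel"
	"go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp"
	sdkmetric "go.opentelemetry.io/otel/sdk/metric"
)

func main() {
	ctx := context.Background()

	// Exporter pointed at the agent; 4318 is the default OTLP/HTTP port used above.
	exp, err := otlpmetrichttp.New(ctx,
		otlpmetrichttp.WithEndpoint("localhost:4318"),
		otlpmetrichttp.WithInsecure(),
	)
	if err != nil {
		log.Fatal(err)
	}

	// Periodic reader: the export interval plays the role of otel_resolution_time_in_sec.
	mp := sdkmetric.NewMeterProvider(
		sdkmetric.WithReader(
			sdkmetric.NewPeriodicReader(exp, sdkmetric.WithInterval(10*time.Second)),
		),
	)
	otel.SetMeterProvider(mp)
	defer func() { _ = mp.Shutdown(ctx) }()

	// Anything obtained via otel.GetMeterProvider() — as StartMetricsCollection does —
	// now exports through this pipeline.
	meter := otel.Meter("statelog-sketch")
	counter, err := meter.Int64Counter("pp.occ.example_counter")
	if err != nil {
		log.Fatal(err)
	}
	counter.Add(ctx, 1)
}
```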
- // - hb := cal.NewCalHeartBeat("STATE", sl.mTypeTitles[s][HeraWorkerType(t)][n], cal.TransOK, "") - for i := 0; i < (MaxWorkerState + MaxConnState - 1); i++ { - buf.WriteString(fmt.Sprintf("%6d", stateCnt[i])) - hb.AddDataInt(StateNames[i], int64(stateCnt[i])) - } - hb.AddDataInt("req", int64(reqCnt-sl.mLastReqCnt[s][HeraWorkerType(t)][n])) - hb.AddDataInt("resp", int64(respCnt-sl.mLastRspCnt[s][HeraWorkerType(t)][n])) - /* - buf.WriteString(fmt.Sprintf("%6d", totalConnections)) - if sl.HasActiveWorker() { - buf.WriteString(fmt.Sprintf("%6d", 1)) - } else { - buf.WriteString(fmt.Sprintf("%6d", 0)) + + //Send statelog data to OTEL statsdata channel + if otelconfig.OTelConfigData.Enabled { + for i := 0; i < (MaxWorkerState + MaxConnState - 1); i++ { + buf.WriteString(fmt.Sprintf("%6d", stateCnt[i])) + workerStatesData.StateData[StateNames[i]] = int64(stateCnt[i]) } - if sl.ProxyHasCapacity(GetConfig().BacklogLimit, GetConfig().ReadonlyBacklogLimit) { - buf.WriteString(fmt.Sprintf("%6d", 1)) - } else { - buf.WriteString(fmt.Sprintf("%6d", 0)) + //Adding req and response metrics to OTEL + workerStatesData.StateData["req"] = reqCnt - sl.mLastReqCnt[s][HeraWorkerType(t)][n] + workerStatesData.StateData["resp"] = respCnt - sl.mLastRspCnt[s][HeraWorkerType(t)][n] + + //Total workers + workerStatesData.StateData["totalConnections"] = int64(sl.workerPoolCfg[s][HeraWorkerType(t)].maxWorkerCnt) + totalConectionData := otel_logger.GaugeMetricData{ + WorkerStateInfo: &workerStateInfoData, + StateData: workerStatesData.StateData["totalConnections"], } - */ - hb.Completed() + go otel_logger.AddDataPointToOTELStateDataChan(&workerStatesData) + go otel_logger.AddDataPointToTotalConnectionsDataChannel(&totalConectionData) + } else { + for i := 0; i < (MaxWorkerState + MaxConnState - 1); i++ { + buf.WriteString(fmt.Sprintf("%6d", stateCnt[i])) + } + } + + if !otelconfig.OTelConfigData.Enabled || (otelconfig.OTelConfigData.Enabled && !otelconfig.OTelConfigData.SkipCalStateLog) { + // write collection into calheartbeat(cased out) and log (oneline). + //If enable_otel_metrics_only not enabled then it sends CAL heart beat event or else send data to file and OTEL agent + hb := cal.NewCalHeartBeat("STATE", sl.mTypeTitles[s][HeraWorkerType(t)][n], cal.TransOK, "") + for i := 0; i < (MaxWorkerState + MaxConnState - 1); i++ { + hb.AddDataInt(StateNames[i], int64(stateCnt[i])) + } + hb.AddDataInt("req", int64(reqCnt-sl.mLastReqCnt[s][HeraWorkerType(t)][n])) + hb.AddDataInt("resp", int64(respCnt-sl.mLastRspCnt[s][HeraWorkerType(t)][n])) + hb.Completed() + } sl.fileLogger.Println(getTime() + buf.String()) sl.mLastReqCnt[s][HeraWorkerType(t)][n] = reqCnt sl.mLastRspCnt[s][HeraWorkerType(t)][n] = respCnt - } // instance + } // instance// instance } // wtype } // sharding } diff --git a/lib/workerpool.go b/lib/workerpool.go index d5450ca2..50aab16f 100644 --- a/lib/workerpool.go +++ b/lib/workerpool.go @@ -119,7 +119,7 @@ func (pool *WorkerPool) spawnWorker(wid int) error { worker.setState(wsSchd) millis := rand.Intn(GetConfig().RandomStartMs) if logger.GetLogger().V(logger.Alert) { - logger.GetLogger().Log(logger.Alert, wid, "randomized start ms",millis) + logger.GetLogger().Log(logger.Alert, wid, "randomized start ms", millis) } time.Sleep(time.Millisecond * time.Duration(millis)) @@ -131,7 +131,7 @@ func (pool *WorkerPool) spawnWorker(wid int) error { } millis := rand.Intn(3000) if logger.GetLogger().V(logger.Alert) { - logger.GetLogger().Log(logger.Alert, initCnt, "is too many in init state. 
waiting to start",wid) + logger.GetLogger().Log(logger.Alert, initCnt, "is too many in init state. waiting to start", wid) } time.Sleep(time.Millisecond * time.Duration(millis)) } @@ -233,8 +233,10 @@ func (pool *WorkerPool) WorkerReady(worker *WorkerClient) (err error) { // GetWorker gets the active worker if available. backlog with timeout if not. // // @param sqlhash to check for soft eviction against a blacklist of slow queries. -// if getworker needs to exam the incoming sql, there does not seem to be another elegant -// way to do this except to pass in the sqlhash as a parameter. +// +// if getworker needs to exam the incoming sql, there does not seem to be another elegant +// way to do this except to pass in the sqlhash as a parameter. +// // @param timeoutMs[0] timeout in milliseconds. default to adaptive queue timeout. func (pool *WorkerPool) GetWorker(sqlhash int32, timeoutMs ...int) (worker *WorkerClient, t string, err error) { if logger.GetLogger().V(logger.Debug) { @@ -559,10 +561,10 @@ func (pool *WorkerPool) ReturnWorker(worker *WorkerClient, ticket string) (err e } if skipRecycle { if logger.GetLogger().V(logger.Alert) { - logger.GetLogger().Log(logger.Alert, "Non Healthy Worker found in pool, module_name=",pool.moduleName,"shard_id=",pool.ShardID, "HEALTHY worker Count=",pool.GetHealthyWorkersCount(),"TotalWorkers:=", pool.desiredSize) + logger.GetLogger().Log(logger.Alert, "Non Healthy Worker found in pool, module_name=", pool.moduleName, "shard_id=", pool.ShardID, "HEALTHY worker Count=", pool.GetHealthyWorkersCount(), "TotalWorkers:=", pool.desiredSize) } calMsg := fmt.Sprintf("Recycle(worker_pid)=%d, module_name=%s,shard_id=%d", worker.pid, worker.moduleName, worker.shardID) - evt := cal.NewCalEvent("SKIP_RECYCLE_WORKER","ReturnWorker", cal.TransOK, calMsg) + evt := cal.NewCalEvent("SKIP_RECYCLE_WORKER", "ReturnWorker", cal.TransOK, calMsg) evt.Completed() } @@ -697,8 +699,6 @@ func (pool *WorkerPool) RacMaint(racReq racAct) { } now := time.Now().Unix() window := GetConfig().RacRestartWindow - dbUname := "" - cnt := 0 pool.poolCond.L.Lock() for i := 0; i < pool.currentSize; i++ { if (pool.workers[i] != nil) && (racReq.instID == 0 || pool.workers[i].racID == racReq.instID) && (pool.workers[i].startTime < int64(racReq.tm)) { @@ -716,23 +716,23 @@ func (pool *WorkerPool) RacMaint(racReq racAct) { } if logger.GetLogger().V(logger.Verbose) { - logger.GetLogger().Log(logger.Verbose, "Rac maint activating, worker", i, pool.workers[i].pid, "exittime=", pool.workers[i].exitTime, now, window, pool.currentSize) - } - cnt++ - if len(dbUname) == 0 { - dbUname = pool.workers[i].dbUname + logger.GetLogger().Log(logger.Verbose, "Rac maint activating, worker", i, pool.workers[i].pid, "exittime=", pool.workers[i].exitTime, now, window, pool.currentSize, "rac.req timestamp=", racReq.tm) } + //Trigger individual event for worker + evt := cal.NewCalEvent("RAC_ID", fmt.Sprintf("%d", racReq.instID), cal.TransOK, "") + evt.AddDataStr("poolModName", pool.moduleName) + evt.AddDataInt("workerId", int64(i)) + evt.AddDataInt("pid", int64(pool.workers[i].pid)) + evt.AddDataInt("shardId", int64(pool.ShardID)) + evt.AddDataInt("tm", int64(racReq.tm)) + evt.AddDataInt("exitTime", pool.workers[i].exitTime) + evt.AddDataStr("exitInSec", fmt.Sprintf("%dsec", pool.workers[i].exitTime-now)) + evt.Completed() + evt = cal.NewCalEvent("DB_UNAME", pool.workers[i].dbUname, cal.TransOK, "") + evt.Completed() } } pool.poolCond.L.Unlock() - // TODO: C++ worker logs one event for each worker, in the worker, so - // 
we keep the same. Think about changing it - for i := 0; i < cnt; i++ { - evt := cal.NewCalEvent("RAC_ID", fmt.Sprintf("%d", racReq.instID), cal.TransOK, "") - evt.Completed() - evt = cal.NewCalEvent("DB_UNAME", dbUname, cal.TransOK, "") - evt.Completed() - } } // checkWorkerLifespan is called periodically to check if any worker lifetime has expired and terminates it @@ -768,12 +768,12 @@ func (pool *WorkerPool) checkWorkerLifespan() { pool.poolCond.L.Lock() for i := 0; i < pool.currentSize; i++ { if (pool.workers[i] != nil) && (pool.workers[i].exitTime != 0) && (pool.workers[i].exitTime <= now) { - if pool.GetHealthyWorkersCount() < (int32(pool.desiredSize*GetConfig().MaxDesiredHealthyWorkerPct/100)) { // Should it be a config value + if pool.GetHealthyWorkersCount() < (int32(pool.desiredSize * GetConfig().MaxDesiredHealthyWorkerPct / 100)) { // Should it be a config value if logger.GetLogger().V(logger.Alert) { - logger.GetLogger().Log(logger.Alert, "Non Healthy Worker found in pool, module_name=",pool.moduleName,"shard_id=",pool.ShardID, "HEALTHY worker Count=",pool.GetHealthyWorkersCount(),"TotalWorkers:", pool.desiredSize) + logger.GetLogger().Log(logger.Alert, "Non Healthy Worker found in pool, module_name=", pool.moduleName, "shard_id=", pool.ShardID, "HEALTHY worker Count=", pool.GetHealthyWorkersCount(), "TotalWorkers:", pool.desiredSize) } calMsg := fmt.Sprintf("module_name=%s,shard_id=%d", pool.moduleName, pool.ShardID) - evt := cal.NewCalEvent("SKIP_RECYCLE_WORKER","checkWorkerLifespan", cal.TransOK, calMsg) + evt := cal.NewCalEvent("SKIP_RECYCLE_WORKER", "checkWorkerLifespan", cal.TransOK, calMsg) evt.Completed() break } @@ -814,7 +814,7 @@ func (pool *WorkerPool) checkWorkerLifespan() { pool.poolCond.L.Unlock() for _, w := range workers { if logger.GetLogger().V(logger.Info) { - logger.GetLogger().Log(logger.Info, "checkworkerlifespan - Lifespan exceeded, terminate worker: pid =", w.pid, ", pool_type =", w.Type, ", inst =", w.instID ,"HEALTHY worker Count=",pool.GetHealthyWorkersCount(),"TotalWorkers:", pool.desiredSize) + logger.GetLogger().Log(logger.Info, "checkworkerlifespan - Lifespan exceeded, terminate worker: pid =", w.pid, ", pool_type =", w.Type, ", inst =", w.instID, "HEALTHY worker Count=", pool.GetHealthyWorkersCount(), "TotalWorkers:", pool.desiredSize) } w.Terminate() } diff --git a/lib/workerpool_test.go b/lib/workerpool_test.go index 06116e06..4e3db96d 100644 --- a/lib/workerpool_test.go +++ b/lib/workerpool_test.go @@ -19,6 +19,7 @@ package lib import ( "encoding/hex" + otelconfig "github.com/paypal/hera/utility/logger/otel/config" "os" "sync" "testing" @@ -29,13 +30,14 @@ func TestPoolDempotency(t *testing.T) { var useheratxt = false var err error if useheratxt { - err = InitConfig() + err = InitConfig("occ-test") if err != nil { t.Errorf("config initialization failure %s", err.Error()) return } } else { gAppConfig = &Config{BacklogTimeoutMsec: 1, LifoScheduler: true, numWorkersCh: make(chan int, 1)} + otelconfig.OTelConfigData = &otelconfig.OTelConfig{} gOpsConfig = &OpsConfig{numWorkers: 3} gAppConfig.numWorkersCh <- int(gOpsConfig.numWorkers) } diff --git a/tests/unittest/bindLess/main_test.go b/tests/unittest/bindLess/main_test.go index b59b1f07..72c92ec4 100644 --- a/tests/unittest/bindLess/main_test.go +++ b/tests/unittest/bindLess/main_test.go @@ -211,7 +211,7 @@ func TestBindLess(t *testing.T) { logger.GetLogger().Log(logger.Debug, "TestBindLess +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n") testutil.BackupAndClear("cal", "BindLess 
start") testutil.BackupAndClear("hera", "BindLess start") - err := partialBadLoad(0.10) + err := partialBadLoad(0.07) if err != nil && err != NormCliErr() { t.Fatalf("main step function returned err %s", err.Error()) } diff --git a/tests/unittest/bindThrottle/main_test.go b/tests/unittest/bindThrottle/main_test.go index 7cdede41..7e3c87da 100644 --- a/tests/unittest/bindThrottle/main_test.go +++ b/tests/unittest/bindThrottle/main_test.go @@ -205,7 +205,7 @@ func mkClients(num int, stop *int, bindV int, grpName string, outErr *string, db func TestBindThrottle(t *testing.T) { // we would like to clear hera.log, but even if we try, lots of messages still go there logger.GetLogger().Log(logger.Debug, "BindThrottle +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n") - err := partialBadLoad(0.10) + err := partialBadLoad(0.07) if err != nil && err != NormCliErr() { t.Fatalf("main step function returned err %s", err.Error()) } diff --git a/tests/unittest/otel_basic/main_test.go b/tests/unittest/otel_basic/main_test.go new file mode 100644 index 00000000..c7032b07 --- /dev/null +++ b/tests/unittest/otel_basic/main_test.go @@ -0,0 +1,136 @@ +package main + +import ( + "context" + "database/sql" + "fmt" + "os" + "path/filepath" + "strings" + "testing" + "time" + + "github.com/paypal/hera/tests/unittest/testutil" + "github.com/paypal/hera/utility/logger" +) + +var mx testutil.Mux +var tableName string + +func cfg() (map[string]string, map[string]string, testutil.WorkerType) { + + appcfg := make(map[string]string) + // best to chose an "unique" port in case golang runs tests in paralel + appcfg["bind_port"] = "31002" + appcfg["log_level"] = "5" + appcfg["log_file"] = "hera.log" + appcfg["sharding_cfg_reload_interval"] = "0" + appcfg["rac_sql_interval"] = "0" + appcfg["child.executable"] = "mysqlworker" + appcfg["enable_otel"] = "true" + appcfg["otel_resolution_time_in_sec"] = "10" + opscfg := make(map[string]string) + opscfg["opscfg.default.server.max_connections"] = "3" + appcfg["cfg_from_tns"] = "false" + appcfg["num_standby_dbs"] = "0" + opscfg["opscfg.default.server.log_level"] = "5" + os.Setenv("AVAILABILITY_ZONE", "test-dev") + os.Setenv("ENVIRONMENT", "dev") + return appcfg, opscfg, testutil.MySQLWorker +} + +func before() error { + tableName = os.Getenv("TABLE_NAME") + if tableName == "" { + tableName = "jdbc_hera_test" + } + if strings.HasPrefix(os.Getenv("TWO_TASK"), "tcp") { + // mysql + testutil.RunDML("create table jdbc_hera_test ( ID BIGINT, INT_VAL BIGINT, STR_VAL VARCHAR(500))") + } + return nil +} + +func TestMain(m *testing.M) { + os.Exit(testutil.UtilMain(m, cfg, before)) +} + +func TestOTELMetricsBasic(t *testing.T) { + logger.GetLogger().Log(logger.Debug, "TestOTELMetricsBasic begin +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n") + + shard := 0 + db, err := sql.Open("heraloop", fmt.Sprintf("%d:0:0", shard)) + if err != nil { + t.Fatal("Error starting Mux:", err) + return + } + db.SetMaxIdleConns(0) + defer db.Close() + + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + // cleanup and insert one row in the table + conn, err := db.Conn(ctx) + if err != nil { + t.Fatalf("Error getting connection %s\n", err.Error()) + } + tx, _ := conn.BeginTx(ctx, nil) + sqlTxt := "/*cmd*/delete from " + tableName + stmt, _ := tx.PrepareContext(ctx, sqlTxt) + _, err = stmt.Exec() + if err != nil { + t.Fatalf("Error preparing test (delete table) %s with %s ==== sql\n", err.Error(), sqlTxt) + } + + stmt, _ = tx.PrepareContext(ctx, "/*cmd*/insert 
into "+tableName+" (id, int_val, str_val) VALUES(?, ?, ?)") + _, err = stmt.Exec(1, time.Now().Unix(), "val 1") + if err != nil { + t.Fatalf("Error preparing test (create row in table) %s\n", err.Error()) + } + err = tx.Commit() + if err != nil { + t.Fatalf("Error commit %s\n", err.Error()) + } + + stmt, _ = conn.PrepareContext(ctx, "/*cmd*/Select id, int_val from "+tableName+" where id=?") + rows, _ := stmt.Query(1) + if !rows.Next() { + t.Fatalf("Expected 1 row") + } + + time.Sleep(10 * time.Second) + rows.Close() + stmt.Close() + + cancel() + conn.Close() + time.Sleep(25 * time.Second) + + logFilePath := filepath.Join(testutil.GetOTELLogDirPath(), "otel_collector.log") + count := testutil.RegexCountFile("{\"key\":\"application\",\"value\":{\"stringValue\":\"hera-test\"}", logFilePath) + if count < 1 { + t.Fatalf("OTEL event should contain application as hera-test") + } + initCount := testutil.RegexCountFile("\"name\":\"pp.occ.init_connection\"", logFilePath) + if initCount < 1 { + t.Fatalf("OTEL event should contain metric name pp.occ.init_connection") + } + tagsCount := testutil.RegexCountFile("{\"key\":\"InstanceId\",\"value\":{\"intValue\":\"0\"}},{\"key\":\"ShardId\",\"value\":{\"intValue\":\"0\"}}", + logFilePath) + if tagsCount < 1 { + t.Fatalf("mandatory tags InstanceId, ShardId, WorkerType should present") + } + azCount := testutil.RegexCountFile("{\"key\":\"az\",\"value\":{\"stringValue\":\"test-dev\"}", logFilePath) + if azCount < 1 { + t.Fatalf("az configured as test-dev and its value should present in otel metric dimension") + } + envCount := testutil.RegexCountFile("{\"key\":\"environment\",\"value\":{\"stringValue\":\"dev\"}", logFilePath) + if envCount < 1 { + t.Fatalf("az configured as test-dev and its value should present in otel metric dimension") + } + + totalConnMetricCount := testutil.RegexCountFile("\"name\":\"pp.occ.total_connections\"(.*)\"asInt\":\"3\"", logFilePath) + if totalConnMetricCount < 1 { + t.Fatalf("total connections gauge metric should present in otel metric") + } + logger.GetLogger().Log(logger.Debug, "TestOTELMetricsBasic done -------------------------------------------------------------") +} diff --git a/tests/unittest/otel_incorrect_endpoint/main_test.go b/tests/unittest/otel_incorrect_endpoint/main_test.go new file mode 100644 index 00000000..40eebf05 --- /dev/null +++ b/tests/unittest/otel_incorrect_endpoint/main_test.go @@ -0,0 +1,119 @@ +package main + +import ( + "context" + "database/sql" + "fmt" + "os" + "strings" + "testing" + "time" + + "github.com/paypal/hera/tests/unittest/testutil" + "github.com/paypal/hera/utility/logger" +) + +var mx testutil.Mux +var tableName string + +func cfg() (map[string]string, map[string]string, testutil.WorkerType) { + + appcfg := make(map[string]string) + // best to chose an "unique" port in case golang runs tests in paralel + appcfg["bind_port"] = "31002" + appcfg["log_level"] = "5" + appcfg["log_file"] = "hera.log" + appcfg["sharding_cfg_reload_interval"] = "0" + appcfg["rac_sql_interval"] = "0" + appcfg["child.executable"] = "mysqlworker" + appcfg["enable_otel"] = "true" + appcfg["otel_resolution_time_in_sec"] = "10" + appcfg["otel_agent_metrics_uri"] = "v2/metrics" + opscfg := make(map[string]string) + opscfg["opscfg.default.server.max_connections"] = "3" + opscfg["opscfg.default.server.log_level"] = "5" + os.Setenv("AVAILABILITY_ZONE", "test-dev") + os.Setenv("ENVIRONMENT", "dev") + return appcfg, opscfg, testutil.MySQLWorker +} + +func before() error { + tableName = os.Getenv("TABLE_NAME") + if 
tableName == "" { + tableName = "jdbc_hera_test" + } + if strings.HasPrefix(os.Getenv("TWO_TASK"), "tcp") { + // mysql + testutil.RunDML("create table jdbc_hera_test ( ID BIGINT, INT_VAL BIGINT, STR_VAL VARCHAR(500))") + } + return nil +} + +func TestMain(m *testing.M) { + os.Exit(testutil.UtilMain(m, cfg, before)) +} + +func TestOTELMetricsIncorrectEndPoint(t *testing.T) { + logger.GetLogger().Log(logger.Debug, "TestOTELMetricsIncorrectEndPoint begin +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n") + + shard := 0 + db, err := sql.Open("heraloop", fmt.Sprintf("%d:0:0", shard)) + if err != nil { + t.Fatal("Error starting Mux:", err) + return + } + db.SetMaxIdleConns(0) + defer db.Close() + + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + // cleanup and insert one row in the table + conn, err := db.Conn(ctx) + if err != nil { + t.Fatalf("Error getting connection %s\n", err.Error()) + } + tx, _ := conn.BeginTx(ctx, nil) + sqlTxt := "/*cmd*/delete from " + tableName + stmt, _ := tx.PrepareContext(ctx, sqlTxt) + _, err = stmt.Exec() + if err != nil { + t.Fatalf("Error preparing test (delete table) %s with %s ==== sql\n", err.Error(), sqlTxt) + } + + stmt, _ = tx.PrepareContext(ctx, "/*cmd*/insert into "+tableName+" (id, int_val, str_val) VALUES(?, ?, ?)") + _, err = stmt.Exec(1, time.Now().Unix(), "val 1") + if err != nil { + t.Fatalf("Error preparing test (create row in table) %s\n", err.Error()) + } + err = tx.Commit() + if err != nil { + t.Fatalf("Error commit %s\n", err.Error()) + } + + stmt, _ = conn.PrepareContext(ctx, "/*cmd*/Select id, int_val from "+tableName+" where id=?") + rows, _ := stmt.Query(1) + if !rows.Next() { + t.Fatalf("Expected 1 row") + } + + time.Sleep(10 * time.Second) + rows.Close() + stmt.Close() + + time.Sleep(10 * time.Second) + publishingErrors := testutil.RegexCountFile("otel publishing error", "hera.log") + if publishingErrors < 2 { + t.Fatalf("otel publishing error should present in log because of in-correct OTEL port number") + } + + time.Sleep(10 * time.Second) + calPublishingErrors := testutil.RegexCountFile("failed to send metrics", "cal.log") + if calPublishingErrors < 1 { + t.Fatalf("otel publishing error should present in CAL log because of in-correct OTEL port number") + } + if calPublishingErrors > 1 { + t.Fatalf("otel runtime errors logging interval is 60 secs, we should not see more than one error in CAL") + } + cancel() + conn.Close() + logger.GetLogger().Log(logger.Debug, "TestOTELMetricsIncorrectEndPoint done -------------------------------------------------------------") +} diff --git a/tests/unittest/otel_sharding/main_test.go b/tests/unittest/otel_sharding/main_test.go new file mode 100644 index 00000000..8e7b1c95 --- /dev/null +++ b/tests/unittest/otel_sharding/main_test.go @@ -0,0 +1,198 @@ +package main + +import ( + "context" + "database/sql" + "path/filepath" + + "fmt" + "os" + "strings" + "testing" + "time" + + _ "github.com/paypal/hera/client/gosqldriver/tcp" + "github.com/paypal/hera/tests/unittest/testutil" + "github.com/paypal/hera/utility/logger" +) + +var mx testutil.Mux +var tableName string + +func cfg() (map[string]string, map[string]string, testutil.WorkerType) { + + appcfg := make(map[string]string) + // best to chose an "unique" port in case golang runs tests in paralel + appcfg["bind_port"] = "31003" + appcfg["log_level"] = "5" + appcfg["log_file"] = "hera.log" + appcfg["enable_sharding"] = "true" + appcfg["num_shards"] = "3" + appcfg["max_scuttle"] = "9" + appcfg["shard_key_name"] 
= "id" + pfx := os.Getenv("MGMT_TABLE_PREFIX") + if pfx != "" { + appcfg["management_table_prefix"] = pfx + } + appcfg["sharding_cfg_reload_interval"] = "3600" + appcfg["rac_sql_interval"] = "0" + appcfg["enable_otel"] = "true" + appcfg["otel_resolution_time_in_sec"] = "3" + appcfg["cfg_from_tns"] = "false" + appcfg["num_standby_dbs"] = "0" + + opscfg := make(map[string]string) + opscfg["opscfg.default.server.max_connections"] = "3" + opscfg["opscfg.default.server.log_level"] = "5" + opscfg["opscfg.default.server.max_connections"] = "3" + opscfg["opscfg.default.server.log_level"] = "5" + os.Setenv("AVAILABILITY_ZONE", "test-dev") + os.Setenv("ENVIRONMENT", "dev") + + return appcfg, opscfg, testutil.MySQLWorker +} + +func setupShardMap() { + twoTask := os.Getenv("TWO_TASK") + if !strings.HasPrefix(twoTask, "tcp") { + // not mysql + return + } + shard := 0 + db, err := sql.Open("heraloop", fmt.Sprintf("%d:0:0", shard)) + if err != nil { + testutil.Fatal("Error starting Mux:", err) + return + } + db.SetMaxIdleConns(0) + defer db.Close() + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + conn, err := db.Conn(ctx) + if err != nil { + testutil.Fatalf("Error getting connection %s\n", err.Error()) + } + defer conn.Close() + + testutil.RunDML("create table hera_shard_map ( scuttle_id smallint not null, shard_id tinyint not null, status char(1) , read_status char(1), write_status char(1), remarks varchar(500))") + + for i := 0; i < 1024; i++ { + shard := 0 + if i <= 8 { + shard = i % 3 + } + testutil.RunDML(fmt.Sprintf("insert into hera_shard_map ( scuttle_id, shard_id, status, read_status, write_status ) values ( %d, %d, 'Y', 'Y', 'Y' )", i, shard)) + } +} + +func before() error { + tableName = os.Getenv("TABLE_NAME") + if tableName == "" { + tableName = "jdbc_hera_test" + } + if strings.HasPrefix(os.Getenv("TWO_TASK"), "tcp") { + // mysql + testutil.RunDML("create table jdbc_hera_test ( ID BIGINT, INT_VAL BIGINT, STR_VAL VARCHAR(500))") + } + return nil +} + +func TestMain(m *testing.M) { + os.Exit(testutil.UtilMain(m, cfg, before)) +} + +func cleanup(ctx context.Context, conn *sql.Conn) error { + tx, _ := conn.BeginTx(ctx, nil) + stmt, _ := tx.PrepareContext(ctx, "/*Cleanup*/delete from "+tableName+" where id != :id") + _, err := stmt.Exec(sql.Named("id", -123)) + if err != nil { + return err + } + err = tx.Commit() + return nil +} + +func TestShardingWithOTELBasic(t *testing.T) { + logger.GetLogger().Log(logger.Debug, "TestShardingWithOTELBasic setup") + setupShardMap() + logger.GetLogger().Log(logger.Debug, "TestShardingWithOTELBasic begin +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n") + + hostname, _ := os.Hostname() + //hostname := "localhost" + db, err := sql.Open("hera", hostname+":31003") + if err != nil { + t.Fatal("Error starting Mux:", err) + return + } + db.SetMaxIdleConns(0) + defer db.Close() + + ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) + conn, err := db.Conn(ctx) + if err != nil { + t.Fatalf("Error getting connection %s\n", err.Error()) + } + cleanup(ctx, conn) + // insert one row in the table + tx, _ := conn.BeginTx(ctx, nil) + stmt, _ := tx.PrepareContext(ctx, "/*TestShardingBasic*/insert into "+tableName+" (id, int_val, str_val) VALUES(:id, :int_val, :str_val)") + _, err = stmt.Exec(sql.Named("id", 1), sql.Named("int_val", time.Now().Unix()), sql.Named("str_val", "val 1")) + if err != nil { + t.Fatalf("Error preparing test (create row in table) %s\n", err.Error()) + } + err = tx.Commit() 
+ if err != nil { + t.Fatalf("Error commit %s\n", err.Error()) + } + + stmt, _ = conn.PrepareContext(ctx, "/*TestShardingBasic*/Select id, int_val, str_val from "+tableName+" where id=:id") + rows, _ := stmt.Query(sql.Named("id", 1)) + if !rows.Next() { + t.Fatalf("Expected 1 row") + } + var id, int_val uint64 + var str_val sql.NullString + err = rows.Scan(&id, &int_val, &str_val) + if err != nil { + t.Fatalf("Expected values %s", err.Error()) + } + if str_val.String != "val 1" { + t.Fatalf("Expected val 1 , got: %s", str_val.String) + } + + rows.Close() + stmt.Close() + + cancel() + conn.Close() + + out, err := testutil.BashCmd("grep 'Preparing: /\\*TestShardingBasic\\*/' hera.log | grep 'WORKER shd2' | wc -l") + if (err != nil) || (len(out) == 0) { + err = nil + t.Fatalf("Request did not run on shard 2. err = %v, len(out) = %d", err, len(out)) + } + time.Sleep(15 * time.Second) + //Read OTEL log file for metrics validation + logFilePath := filepath.Join(testutil.GetOTELLogDirPath(), "otel_collector.log") + initCount := testutil.RegexCountFile("\"name\":\"pp.occ.init_connection\"", logFilePath) + if initCount < 1 { + t.Fatalf("OTEL event should contain metric name pp.occ.init_connection") + } + shard0Count := testutil.RegexCountFile("{\"key\":\"ShardId\",\"value\":{\"intValue\":\"0\"}", logFilePath) + if shard0Count < 1 { + t.Fatalf("shard0 should dimesion showuld present for metrics") + } + shard1Count := testutil.RegexCountFile("{\"key\":\"ShardId\",\"value\":{\"intValue\":\"1\"}", logFilePath) + if shard1Count < 1 { + t.Fatalf("shard1 should dimesion showuld present for metrics") + } + shard2Count := testutil.RegexCountFile("{\"key\":\"ShardId\",\"value\":{\"intValue\":\"2\"}", logFilePath) + if shard2Count < 1 { + t.Fatalf("shard2 should dimesion showuld present for metrics") + } + shard2Worker := testutil.RegexCountFile("{\"key\":\"occ_worker\",\"value\":{\"stringValue\":\"hera-test.sh2\"}", logFilePath) + if shard2Worker < 1 { + t.Fatalf("occ_worker should container shard2") + } + logger.GetLogger().Log(logger.Debug, "TestShardingWithOTELBasic done -------------------------------------------------------------") +} diff --git a/tests/unittest/otel_with_skip_cal/main_test.go b/tests/unittest/otel_with_skip_cal/main_test.go new file mode 100644 index 00000000..95b92be7 --- /dev/null +++ b/tests/unittest/otel_with_skip_cal/main_test.go @@ -0,0 +1,114 @@ +package main + +import ( + "context" + "database/sql" + "fmt" + "os" + "strings" + "testing" + "time" + + "github.com/paypal/hera/tests/unittest/testutil" + "github.com/paypal/hera/utility/logger" +) + +var mx testutil.Mux +var tableName string + +func cfg() (map[string]string, map[string]string, testutil.WorkerType) { + + appcfg := make(map[string]string) + // best to chose an "unique" port in case golang runs tests in paralel + appcfg["bind_port"] = "31002" + appcfg["log_level"] = "5" + appcfg["log_file"] = "hera.log" + appcfg["sharding_cfg_reload_interval"] = "0" + appcfg["rac_sql_interval"] = "0" + appcfg["child.executable"] = "mysqlworker" + appcfg["enable_otel"] = "true" + appcfg["otel_resolution_time_in_sec"] = "10" + appcfg["skip_cal_statelog"] = "true" + opscfg := make(map[string]string) + opscfg["opscfg.default.server.max_connections"] = "3" + opscfg["opscfg.default.server.log_level"] = "5" + os.Setenv("AVAILABILITY_ZONE", "test-dev") + os.Setenv("ENVIRONMENT", "dev") + return appcfg, opscfg, testutil.MySQLWorker +} + +func before() error { + tableName = os.Getenv("TABLE_NAME") + if tableName == "" { + tableName = 
"jdbc_hera_test" + } + if strings.HasPrefix(os.Getenv("TWO_TASK"), "tcp") { + // mysql + testutil.RunDML("create table jdbc_hera_test ( ID BIGINT, INT_VAL BIGINT, STR_VAL VARCHAR(500))") + } + return nil +} + +func TestMain(m *testing.M) { + os.Exit(testutil.UtilMain(m, cfg, before)) +} + +func TestOTELMetricsSkipCALEndPoint(t *testing.T) { + logger.GetLogger().Log(logger.Debug, "TestOTELMetricsSkipCALEndPoint begin +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n") + + shard := 0 + db, err := sql.Open("heraloop", fmt.Sprintf("%d:0:0", shard)) + if err != nil { + t.Fatal("Error starting Mux:", err) + return + } + db.SetMaxIdleConns(0) + defer db.Close() + + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + // cleanup and insert one row in the table + conn, err := db.Conn(ctx) + if err != nil { + t.Fatalf("Error getting connection %s\n", err.Error()) + } + tx, _ := conn.BeginTx(ctx, nil) + sqlTxt := "/*cmd*/delete from " + tableName + stmt, _ := tx.PrepareContext(ctx, sqlTxt) + _, err = stmt.Exec() + if err != nil { + t.Fatalf("Error preparing test (delete table) %s with %s ==== sql\n", err.Error(), sqlTxt) + } + + stmt, _ = tx.PrepareContext(ctx, "/*cmd*/insert into "+tableName+" (id, int_val, str_val) VALUES(?, ?, ?)") + _, err = stmt.Exec(1, time.Now().Unix(), "val 1") + if err != nil { + t.Fatalf("Error preparing test (create row in table) %s\n", err.Error()) + } + err = tx.Commit() + if err != nil { + t.Fatalf("Error commit %s\n", err.Error()) + } + + stmt, _ = conn.PrepareContext(ctx, "/*cmd*/Select id, int_val from "+tableName+" where id=?") + rows, _ := stmt.Query(1) + if !rows.Next() { + t.Fatalf("Expected 1 row") + } + + time.Sleep(20 * time.Second) + rows.Close() + stmt.Close() + + publishingErrors := testutil.RegexCountFile("otel publishing error", "hera.log") + if publishingErrors > 0 { + t.Fatalf("should not see otel publishing errors with correct end-points") + } + + stateLogs := testutil.RegexCountFile("STATE\thera\t0\tinit", "cal.log") + if stateLogs > 0 { + t.Fatalf("skip_cal_statelog enabled, so we should not see state logs in CAL logs.") + } + cancel() + conn.Close() + logger.GetLogger().Log(logger.Debug, "TestOTELMetricsSkipCALEndPoint done -------------------------------------------------------------") +} diff --git a/tests/unittest/rac_maint_async/main_test.go b/tests/unittest/rac_maint_async/main_test.go new file mode 100644 index 00000000..e88e7d48 --- /dev/null +++ b/tests/unittest/rac_maint_async/main_test.go @@ -0,0 +1,122 @@ +package main + +import ( + "context" + "database/sql" + "fmt" + "math/rand" + "os" + "testing" + "time" + + "github.com/paypal/hera/tests/unittest/testutil" + "github.com/paypal/hera/utility/logger" +) + +var mx testutil.Mux +var tableName string + +func cfg() (map[string]string, map[string]string, testutil.WorkerType) { + + appcfg := make(map[string]string) + // best to chose an "unique" port in case golang runs tests in paralel + appcfg["bind_port"] = "31002" + appcfg["log_level"] = "5" + appcfg["log_file"] = "hera.log" + appcfg["sharding_cfg_reload_interval"] = "0" + appcfg["rac_sql_interval"] = "1" + + opscfg := make(map[string]string) + opscfg["opscfg.default.server.max_connections"] = "10" + opscfg["opscfg.default.server.log_level"] = "5" + + //return appcfg, opscfg, testutil.OracleWorker + return appcfg, opscfg, testutil.MySQLWorker +} + +func before() error { + os.Setenv("PARALLEL", "1") + pfx := os.Getenv("MGMT_TABLE_PREFIX") + if pfx == "" { + pfx = "hera" + } + tableName = pfx + "_maint" + 
return nil +} + +func TestMain(m *testing.M) { + os.Exit(testutil.UtilMain(m, cfg, before)) +} + +func TestRacMaintWithRandomStatusChangeInAsync(t *testing.T) { + logger.GetLogger().Log(logger.Debug, "TestRacMaintWithRandomStatusChangeInAsync begin +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n") + shard := 0 + db, err := sql.Open("heraloop", fmt.Sprintf("%d:0:0", shard)) + if err != nil { + t.Fatal("Error starting Mux:", err) + return + } + statusArray := []string{"U", "R", "F"} + time.Sleep(5 * time.Second) + + go func() { + ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second) + defer cancel() + for { + status1 := rand.Intn(len(statusArray)) + status2 := rand.Intn(len(statusArray)) + var err error + var conn *sql.Conn + // cleanup and insert one row in the table + conn, err = db.Conn(ctx) + if err != nil { + t.Fatalf("Error getting connection %s\n", err.Error()) + } + tx, _ := conn.BeginTx(ctx, nil) + stmt, _ := tx.PrepareContext(ctx, "/*cmd*/delete from "+tableName) + _, err = stmt.Exec() + if err != nil { + t.Fatalf("Error preparing test (delete table) %s\n", err.Error()) + } + stmt, _ = tx.PrepareContext(ctx, "/*cmd*/insert into "+tableName+" (inst_id, status, status_time, module, machine) values (?,?,?,?,?)") + hostname, _ := os.Hostname() + // TODO: figure out how to set inst_id; 0 is used as a placeholder + _, err = stmt.Exec(0 /*max instid*/, statusArray[status1], time.Now().Unix()+2, "hera-test", hostname) + _, err = stmt.Exec(0, statusArray[status2], time.Now().Unix()+2, "hera-test_taf", hostname) + if err != nil { + t.Fatalf("Error preparing test (create row in table) %s\n", err.Error()) + } + err = tx.Commit() + if err != nil { + t.Fatalf("Error commit %s\n", err.Error()) + } + conn.Close() + time.Sleep(1000 * time.Millisecond) + } + }() + if err != nil { + t.Fatal("Error starting Mux:", err) + return + } + db.SetMaxIdleConns(0) + defer db.Close() + + time.Sleep(45000 * time.Millisecond) + + if 0 == testutil.RegexCountFile("Rac maint activating, worker", "hera.log") { + t.Fatalf("requires rac maint activation for main module status") + } + + if 0 == testutil.RegexCountFile("module:HERA-TEST_TAF", "cal.log") { + t.Fatalf("Status 'U' should log the RACMAINT_INFO_CHANGE event") + } + if 0 != testutil.RegexCountFile("invalid_status", "cal.log") { + t.Fatalf("rac maint status 'U' should not be skipped with an invalid_status event") + } + + if testutil.RegexCountFile("RAC_ID", "cal.log") < 20 { + t.Fatalf("rac maint should trigger for all workers once.") + } + + logger.GetLogger().Log(logger.Debug, "TestRacMaintWithRandomStatusChangeInAsync done -------------------------------------------------------------") +} diff --git a/tests/unittest/shutdown_cleanup/main_test.go b/tests/unittest/shutdown_cleanup/main_test.go new file mode 100644 index 00000000..0ddddba2 --- /dev/null +++ b/tests/unittest/shutdown_cleanup/main_test.go @@ -0,0 +1,110 @@ +package main + +import ( + "context" + "database/sql" + "fmt" + "os" + "strings" + "syscall" + "testing" + "time" + + "github.com/paypal/hera/tests/unittest/testutil" + "github.com/paypal/hera/utility/logger" +) + +var mx testutil.Mux +var tableName string + +func cfg() (map[string]string, map[string]string, testutil.WorkerType) { + + appcfg := make(map[string]string) + // best to choose a unique port in case golang runs tests in parallel + appcfg["bind_port"] = "31002" + appcfg["log_level"] = "5" + appcfg["log_file"] = "hera.log" + appcfg["sharding_cfg_reload_interval"] = "0" + appcfg["rac_sql_interval"] = "0" + appcfg["child.executable"] = "mysqlworker" + 
appcfg["enable_otel"] = "true" + appcfg["otel_resolution_time_in_sec"] = "10" + os.Setenv("AVAILABILITY_ZONE", "test-dev") + os.Setenv("ENVIRONMENT", "dev") + opscfg := make(map[string]string) + opscfg["opscfg.default.server.max_connections"] = "3" + opscfg["opscfg.default.server.log_level"] = "5" + + return appcfg, opscfg, testutil.MySQLWorker +} + +func before() error { + tableName = os.Getenv("TABLE_NAME") + if tableName == "" { + tableName = "jdbc_hera_test" + } + if strings.HasPrefix(os.Getenv("TWO_TASK"), "tcp") { + // mysql + testutil.RunDML("create table jdbc_hera_test ( ID BIGINT, INT_VAL BIGINT, STR_VAL VARCHAR(500))") + } + return nil +} + +func TestMain(m *testing.M) { + os.Exit(testutil.UtilMain(m, cfg, before)) +} + +func TestCoordinatorWithShutdownCleanup(t *testing.T) { + logger.GetLogger().Log(logger.Debug, "TestCoordinatorWithShutdownCleanup begin +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n") + + go func() { + shard := 0 + db, err := sql.Open("heraloop", fmt.Sprintf("%d:0:0", shard)) + if err != nil { + t.Fatal("Error starting Mux:", err) + return + } + db.SetMaxIdleConns(0) + defer db.Close() + + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + // cleanup and insert one row in the table + conn, err := db.Conn(ctx) + if err != nil { + t.Fatalf("Error getting connection %s\n", err.Error()) + } + tx, _ := conn.BeginTx(ctx, nil) + sqlTxt := "/*cmd*/delete from " + tableName + stmt, _ := tx.PrepareContext(ctx, sqlTxt) + _, err = stmt.Exec() + if err != nil { + t.Fatalf("Error preparing test (delete table) %s with %s ==== sql\n", err.Error(), sqlTxt) + } + stmt, _ = tx.PrepareContext(ctx, "/*cmd*/insert into "+tableName+" (id, int_val, str_val) VALUES(?, ?, ?)") + _, err = stmt.Exec(1, time.Now().Unix(), "val 1") + time.Sleep(500 * time.Millisecond) + if err != nil { + t.Fatalf("Error preparing test (create row in table) %s\n", err.Error()) + } + err = tx.Commit() + + stmt.Close() + + cancel() + conn.Close() + }() + + time.Sleep(200 * time.Millisecond) + // send SIGTERM signal to mux process + proc, _ := os.FindProcess(os.Getpid()) + proc.Signal(syscall.SIGTERM) + if testutil.RegexCountFile("Got SIGTERM", "hera.log") != 1 { + t.Fatalf("workerbroker should see SIGTERM signal start perform cleanup") + } + + //workerclient pid= 768 to be terminated, sending SIGTERM first for gracefull termination + if testutil.RegexCountFile("workerclient pid=(\\s*\\d+) to be terminated, sending SIGTERM first for gracefull termination", "hera.log") < 1 { + t.Fatalf("workerbroker should send graceful termination to workers") + } + logger.GetLogger().Log(logger.Debug, "TestCoordinatorWithShutdownCleanup done -------------------------------------------------------------") +} diff --git a/tests/unittest/testall.sh b/tests/unittest/testall.sh index 92fce600..c56a3f2a 100755 --- a/tests/unittest/testall.sh +++ b/tests/unittest/testall.sh @@ -1,5 +1,5 @@ overall=0 -for d in `ls -F tests/unittest | grep /$ | sed -e "s,/,," | egrep -v '(mysql_recycle|log_checker_initdb|testutil|rac_maint|mysql_direct|failover)'` +for d in `ls -F tests/unittest | grep /$ | sed -e "s,/,," | egrep -v '(mysql_recycle|log_checker_initdb|testutil|rac_maint|mysql_direct|failover|otel_basic|otel_incorrect_endpoint|otel_sharding|otel_with_skip_cal)'` do echo ==== $d pushd tests/unittest/$d diff --git a/tests/unittest/testutil/defs.go b/tests/unittest/testutil/defs.go new file mode 100644 index 00000000..3d17f629 --- /dev/null +++ b/tests/unittest/testutil/defs.go @@ -0,0 +1,44 @@ 
+package testutil + +const otelConfigYamlData = `receivers: + otlp: + protocols: + grpc: + endpoint: "0.0.0.0:4317" + http: + endpoint: "0.0.0.0:4318" + +exporters: + logging: + loglevel: debug + file: + path: /var/log/otel/otel_collector.log + +service: + pipelines: + traces: + receivers: [otlp] + processors: [] + exporters: [logging, file] + + metrics: + receivers: [otlp] + processors: [] + exporters: [logging, file] +` + +const otelCollectorDockerDef = `version: '3.8' + +services: + otel_basic-collector: + container_name: otel_basic-collector + image: otel/opentelemetry-collector-contrib:latest + ports: + - "4317:4317" #gRPC port + - "4318:4318" #HTTP port + volumes: + - ./otel_config.yaml:/etc/otel/config.yaml + - ./otel_logs:/var/log/otel + + command: ["--config", "/etc/otel/config.yaml"] +` diff --git a/tests/unittest/testutil/main.go b/tests/unittest/testutil/main.go index 9096e1ce..e3150abe 100644 --- a/tests/unittest/testutil/main.go +++ b/tests/unittest/testutil/main.go @@ -20,12 +20,19 @@ func setup(cfg cfgFunc) error { if err != nil { return err } + if appcfg["enable_otel"] == "true" { + err = mx.StartOTelAgent() + } + //if err != nil { + // return err + //} err = mx.StartServer() return err } func teardown() { mx.StopServer() + mx.StopOTelAgent() } func copyFile(src, dest string) error { diff --git a/tests/unittest/testutil/setup.go b/tests/unittest/testutil/setup.go index 795c78d0..b14e0288 100644 --- a/tests/unittest/testutil/setup.go +++ b/tests/unittest/testutil/setup.go @@ -24,12 +24,20 @@ import ( type Mux interface { StartServer() error StopServer() + StartOTelAgent() error + StopOTelAgent() error } /** commons used by mux tests */ +const OTEL_AGENT_DOCKER_CONFIG_PATH = "docker_compose_otel_collector.yaml" +const OTEL_AGENT_CONFIG_FILE_PATH = "otel_config.yaml" +const OTEL_LOG_DIR = "otel_logs" + +var otelLogsDir string + type WorkerType int const ( @@ -43,7 +51,7 @@ type DBType int const ( Oracle DBType = iota MySQL - PostgreSQL + PostgreSQL ) type mux struct { @@ -94,7 +102,7 @@ func (m *mux) setupWorkdir() { func (m *mux) setupConfig() error { // opscfg - for k,v := range m.opscfg { + for k, v := range m.opscfg { m.appcfg[k] = v } if m.wType == MySQLWorker { @@ -149,7 +157,7 @@ func doBuildAndSymlink(binname string) { var err error _, err = os.Stat(binname) if err != nil { - binpath := os.Getenv("GOPATH")+"/bin/"+binname + binpath := os.Getenv("GOPATH") + "/bin/" + binname _, err = os.Stat(binpath) if err != nil { srcname := binname @@ -199,11 +207,11 @@ func MakeDB(dockerName string, dbName string, dbType DBType) (ip string) { os.Setenv("password", "1-testDb") waitLoop := 1 for { - err := DBDirect("select 1", "127.0.0.1", dbName/*"heratestdb"*/, MySQL) + err := DBDirect("select 1", "127.0.0.1", dbName /*"heratestdb"*/, MySQL) if err != nil { time.Sleep(1 * time.Second) logger.GetLogger().Log(logger.Debug, "waiting for mysql server to come up "+ipBuf.String()+" "+dockerName) - fmt.Printf("waiting for db to come up %d %s\n",waitLoop, err.Error()) + fmt.Printf("waiting for db to come up %d %s\n", waitLoop, err.Error()) waitLoop++ continue } else { @@ -211,7 +219,6 @@ func MakeDB(dockerName string, dbName string, dbType DBType) (ip string) { } } - q := "CREATE USER 'appuser'@'%' IDENTIFIED BY '1-testDb'" err := DBDirect(q, ipBuf.String(), dbName, MySQL) if err != nil { @@ -266,7 +273,7 @@ func MakeDB(dockerName string, dbName string, dbType DBType) (ip string) { os.Setenv("postgresql_ip", ipBuf.String()) return ipBuf.String() - } + } return "" } @@ -384,10 +391,10 @@ 
func (m *mux) StartServer() error { ip := MakeDB("postgres22", "heratestdb", PostgreSQL) os.Setenv("TWO_TASK", ip+"/heratestdb?connect_timeout=60&sslmode=disable") twoTask := os.Getenv("TWO_TASK") - os.Setenv ("TWO_TASK_0", twoTask) - os.Setenv ("TWO_TASK_1", twoTask) + os.Setenv("TWO_TASK_0", twoTask) + os.Setenv("TWO_TASK_1", twoTask) twoTask1 := os.Getenv("TWO_TASK") - fmt.Println ("TWO_TASK_1: ", twoTask1) + fmt.Println("TWO_TASK_1: ", twoTask1) } } @@ -452,3 +459,78 @@ func (m *mux) StopServer() { os.Chdir(m.origDir) logger.GetLogger().Log(logger.Info, "Exit StopServer time=", time.Now().Unix()) } + +func (m *mux) StartOTelAgent() error { + logger.GetLogger().Log(logger.Info, "starting OTEL agent locally at: ", time.Now()) + err := generateConfigData() + if err != nil { + logger.GetLogger().Log(logger.Alert, "error while generating configuration data, error: ", err) + return err + } + shutdownAgent := exec.Command("docker-compose", "-f", OTEL_AGENT_DOCKER_CONFIG_PATH, "down") + err = shutdownAgent.Run() + if err != nil { + logger.GetLogger().Log(logger.Alert, "error while stopping OTEL agent, error: ", err) + } + startCommand := exec.Command("docker-compose", "-f", OTEL_AGENT_DOCKER_CONFIG_PATH, "up", "-d") + err = startCommand.Run() + if err != nil { + logger.GetLogger().Log(logger.Alert, "failed to start OTEL agent, error: ", err) + } + return err +} + +func (m *mux) StopOTelAgent() error { + logger.GetLogger().Log(logger.Info, "stopping OTEL agent locally at: ", time.Now()) + shutdownAgent := exec.Command("docker-compose", "-f", OTEL_AGENT_DOCKER_CONFIG_PATH, "down") + err := shutdownAgent.Run() + + if err != nil { + logger.GetLogger().Log(logger.Alert, "error while stopping OTEL agent, error: ", err) + } + return err +} + +func generateConfigData() error { + workingDir, _ := os.Getwd() + otelLogsDir = filepath.Join(workingDir, OTEL_LOG_DIR) + _, err := os.Stat(otelLogsDir) + if !os.IsNotExist(err) { + os.RemoveAll(otelLogsDir) + } + err = os.MkdirAll(otelLogsDir, 0777) + if err != nil { + return err + } + configFilePath := filepath.Join(workingDir, OTEL_AGENT_CONFIG_FILE_PATH) + _, err = os.Stat(configFilePath) + if os.IsNotExist(err) { + configFile, err := os.OpenFile(configFilePath, os.O_CREATE|os.O_RDWR, 0644) + if err != nil { + return err + } + _, err = configFile.WriteString(otelConfigYamlData) + if err != nil { + return err + } + configFile.Close() + } + dockerDefinitionFile := filepath.Join(workingDir, OTEL_AGENT_DOCKER_CONFIG_PATH) + _, err = os.Stat(dockerDefinitionFile) + if os.IsNotExist(err) { + dockerFile, err := os.OpenFile(dockerDefinitionFile, os.O_CREATE|os.O_RDWR, 0777) + if err != nil { + return err + } + _, err = dockerFile.WriteString(otelCollectorDockerDef) + if err != nil { + return err + } + dockerFile.Close() + } + return nil +} + +func GetOTELLogDirPath() string { + return otelLogsDir +} diff --git a/utility/logger/otel/config/otelconfig.go b/utility/logger/otel/config/otelconfig.go new file mode 100644 index 00000000..20b937ab --- /dev/null +++ b/utility/logger/otel/config/otelconfig.go @@ -0,0 +1,66 @@ +package config + +import ( + "errors" + "fmt" + "github.com/paypal/hera/utility/logger" + "sync/atomic" +) + +var OTelConfigData *OTelConfig +var OTelIngestTokenData atomic.Value + +// OTelConfig represents the configuration used to export data to the OTEL collector +type OTelConfig struct { + MetricNamePrefix string + Host string + MetricsPort int + TracePort int + MetricsURLPath string + TraceURLPath string + PoolName string + ResourceType string + Enabled 
bool + SkipCalStateLog bool + ResolutionTimeInSec int + ExporterTimeout int + UseTls bool + TLSCertPath string + OtelMetricGRPC bool + OtelTraceGRPC bool + OTelErrorReportingInterval int + EnableRetry bool +} + +// validate checks whether the pool name is configured +func (config *OTelConfig) validate() error { + if len(config.PoolName) <= 0 { + logger.GetLogger().Log(logger.Alert, "OTEL configurations validation failed, PoolName not configured") + return errors.New("OTEL configurations validation failed, PoolName not configured") + } + return nil +} + +func (config *OTelConfig) Dump() { + logger.GetLogger().Log(logger.Info, fmt.Sprintf("Host : %s", config.Host)) + logger.GetLogger().Log(logger.Info, fmt.Sprintf("UseOtelMetricGRPC: %t", config.OtelMetricGRPC)) + logger.GetLogger().Log(logger.Info, fmt.Sprintf("Metrics Port: %d", config.MetricsPort)) + logger.GetLogger().Log(logger.Info, fmt.Sprintf("UseOtelTraceGRPC: %t", config.OtelTraceGRPC)) + logger.GetLogger().Log(logger.Info, fmt.Sprintf("Trace Port: %d", config.TracePort)) + logger.GetLogger().Log(logger.Info, fmt.Sprintf("Poolname: %s", config.PoolName)) + logger.GetLogger().Log(logger.Info, fmt.Sprintf("ResolutionTimeInSec: %d", config.ResolutionTimeInSec)) + logger.GetLogger().Log(logger.Info, fmt.Sprintf("UseTls: %t", config.UseTls)) + logger.GetLogger().Log(logger.Info, fmt.Sprintf("UrlPath: %s", config.MetricsURLPath)) +} + +func (config *OTelConfig) PopulateMetricNamePrefix(metricName string) string { + return fmt.Sprintf("%s.%s", config.MetricNamePrefix, metricName) +} + +func SetOTelIngestToken(value string) { + OTelIngestTokenData.Store(value) +} + +func GetOTelIngestToken() string { + return OTelIngestTokenData.Load().(string) +} diff --git a/utility/logger/otel/defs.go b/utility/logger/otel/defs.go new file mode 100644 index 00000000..a3f44700 --- /dev/null +++ b/utility/logger/otel/defs.go @@ -0,0 +1,171 @@ +package otel + +import ( + "go.opentelemetry.io/otel/metric" + "sync" +) + +// "init", "acpt", "wait", "busy", "schd", "fnsh", "quce", "asgn", "idle", "bklg", "strd", "cls" +// Following Metric Names will get instrumented as part of StateLogMetrics +const ( + // Worker States + InitConnMetric = "init_connection" + AccptConnMetric = "accept_connection" + WaitConnMetric = "wait_connection" + BusyConnMetric = "busy_connection" + ScheduledConnMetric = "scheduled_connection" + FinishedConnMetric = "finished_connection" + QuiescedConnMetric = "quiesced_connection" + + //free percentage + freePercentage = "free_percentage" + + //total connections + totalConnections = "total_connections" + + // Connection States + AssignedConnMetric = "assigned_connection" + IdleConnMetric = "idle_connection" + BacklogConnMetric = "backlog_connection" + StrdConnMetric = "stranded_connection" +) + +const ( + Target = string("target") + Endpoint = string("target_ip_port") + TLS_version = string("tls_version") + ApplicationDimName = string("application") + ShardId = string("ShardId") + WorkerType = string("WorkerType") + InstanceId = string("InstanceId") + Datapoints = string("datapoints") + OtelSourceName = string("source") + otelSource = string("otel") + OccWorkerParamName = string("occ_worker") + HostDimensionName = string("host") + ContainerHostDimName = string("container_host") +) + +var StatelogBucket = []float64{0, 5, 10, 15, 20, 25, 30, 40, 50, 60, 80, 100, 120, 160, 200} +var ConnectionStateBucket = []float64{0, 25, 50, 75, 100, 150, 200, 300, 400, 500, 600, 700, 800, 1200, 2400, 4800, 9600, 19200, 39400, 
65536} + +// WorkerTypeMap This map represents worker type configured in lib.HeraWorkerType variable. If any changes in worker type this definition need to get updated. +var WorkerTypeMap = map[int]string{ + 0: "rw", + 1: "ro", + 2: "standby_ro", +} + +const OtelInstrumentationVersion string = "v1.0" + +// DEFAULT_OTEL_COLLECTOR_PROTOCOL default OTEL configurations point to QA collector +const DEFAULT_OTEL_COLLECTOR_PROTOCOL string = "grpc" +const DEFAULT_OTEL_COLLECTOR__IP string = "0.0.0.0" +const DEFAULT_GRPC_OTEL_COLLECTOR_PORT string = "4317" +const DEFAULT_HTTP_OTEL_COLLECTOR_PORT string = "4318" +const COLLECTOR_POLLING_INTERVAL_SECONDS int32 = 5 + +const StateLogMeterName = "occ-statelog-data" + +// LoggingOTELPublishingInterval This controls how frequently log OTEL publishing error +const LoggingOTELPublishingInterval = 15 + +//****************************** variables *************************** + +type Tags struct { + TagName string + TagValue string +} + +type WorkerStateInfo struct { + StateTitle string + ShardId int + WorkerType int + InstanceId int +} + +type WorkersStateData struct { + *WorkerStateInfo + StateData map[string]int64 +} + +type GaugeMetricData struct { + *WorkerStateInfo + StateData int64 +} + +type ( + ServerType int +) + +type TotalConnectionsGaugeData struct { + + //Metric type for total connection data + totalConnections metric.Int64ObservableGauge + + hostname string + + stateLogMeter metric.Meter + + //Data channel + totalConnDataChannel chan *GaugeMetricData + + registration metric.Registration + + //Channel to close sending data + stopPublish chan struct{} +} + +// StateLogMetrics state_log_metrics reports workers states +type StateLogMetrics struct { + hostname string + + stateLogMeter metric.Meter + + //Channel to receive statelog data + mStateDataChan chan *WorkersStateData + + //Channel to close sending data + doneCh chan struct{} + + stateLock sync.Mutex + + initState metric.Int64Histogram + acptState metric.Int64Histogram + waitState metric.Int64Histogram + busyState metric.Int64Histogram + schdState metric.Int64Histogram + fnshState metric.Int64Histogram + quceState metric.Int64Histogram + asgnState metric.Int64Histogram + idleState metric.Int64Histogram + bklgState metric.Int64Histogram + strdState metric.Int64Histogram + + freePercentage metric.Float64Histogram +} + +// Object represents the workers states data for worker belongs to specific shardId and workperType with flat-map +// between statename vs count. +type stateLogMetricsConfig struct { + // MeterProvider sets the metric.MeterProvider. If nil, the global + // Provider will be used. 
+ MeterProvider metric.MeterProvider + appName string +} + +// MetricProviderOption defines the configuration option carrying a metric.MeterProvider +type MetricProviderOption struct { + metric.MeterProvider +} + +// StateLogOption defines a configuration parameter for the statelog metrics agent +type StateLogOption interface { + apply(*stateLogMetricsConfig) +} + +// AppNameOption defines the option carrying the OCC application name +type AppNameOption string + +// Headers +const IngestTokenHeader = "X-Sf-Token" diff --git a/utility/logger/otel/error_handler.go b/utility/logger/otel/error_handler.go new file mode 100644 index 00000000..39fb082d --- /dev/null +++ b/utility/logger/otel/error_handler.go @@ -0,0 +1,79 @@ +package otel + +import ( + "fmt" + "github.com/paypal/hera/cal" + "github.com/paypal/hera/utility/logger" + "reflect" + "sync" + "sync/atomic" + "time" +) + +type OTelErrorHandler struct{} + +type OTelErrorData struct { + err error + occurredTime int64 +} + +var ( + oTelErrorLoggingLock sync.Mutex + errorTicker *time.Ticker + gErrorDataMap atomic.Value + logTickerInitialized atomic.Bool +) + +// Handle asynchronously records runtime errors raised while publishing data to the OTEL agent. +func (handler OTelErrorHandler) Handle(err error) { + if err == nil { + return + } + logger.GetLogger().Log(logger.Warning, fmt.Sprintf("otel publishing error %v", err)) + oTelErrorLoggingLock.Lock() + defer oTelErrorLoggingLock.Unlock() + errorDataMapVal := gErrorDataMap.Load() + errorDataMap, _ := errorDataMapVal.(map[string]*OTelErrorData) + if errorDataMap == nil { + errorDataMap = make(map[string]*OTelErrorData) + } + errorDataMap[reflect.TypeOf(err).String()] = &OTelErrorData{err: err, occurredTime: time.Now().Unix()} + if !logTickerInitialized.Load() { + handler.logOTelErrorCalEvent(errorDataMap) + errorDataMap = make(map[string]*OTelErrorData) //Reinitialize the map after processing it. + gErrorDataMap.Store(errorDataMap) + logTickerInitialized.Store(true) + } else { + gErrorDataMap.Store(errorDataMap) + } +} + +// processOTelErrorsMap periodically (every OTelErrorReportingInterval seconds) logs a CAL event when OTEL data publishing has failed +func (handler OTelErrorHandler) processOTelErrorsMap() { + go func() { + for { + select { + case <-errorTicker.C: + oTelErrorLoggingLock.Lock() + errorDataMapVal := gErrorDataMap.Load() + errorDataMap, _ := errorDataMapVal.(map[string]*OTelErrorData) + if errorDataMap != nil && len(errorDataMap) > 0 { + handler.logOTelErrorCalEvent(errorDataMap) + errorDataMap = make(map[string]*OTelErrorData) //Reinitialize the map after processing it. 
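+ // Storing the fresh map resets the reporting window; errors already logged to CAL are not reported again on the next tick.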
+ gErrorDataMap.Store(errorDataMap) + } + oTelErrorLoggingLock.Unlock() + } + } + }() +} + +// logOTelErrorCalEvent logs one CAL event for each accumulated OTEL publishing error +func (handler OTelErrorHandler) logOTelErrorCalEvent(errorDataMap map[string]*OTelErrorData) { + for _, errorData := range errorDataMap { + event := cal.NewCalEvent("OTEL", "METRIC_PUBLISH", "1", fmt.Sprintf("msg=%v", errorData.err)) + event.AddDataInt("occurredTime", errorData.occurredTime) + event.AddDataInt("loggedTime", time.Now().Unix()) + event.Completed() + } +} diff --git a/utility/logger/otel/logger.go b/utility/logger/otel/logger.go new file mode 100644 index 00000000..a492164f --- /dev/null +++ b/utility/logger/otel/logger.go @@ -0,0 +1,382 @@ +package otel + +import ( + "context" + "errors" + "fmt" + "github.com/paypal/hera/utility/logger" + "github.com/paypal/hera/utility/logger/otel/config" + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc" + "go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp" + "go.opentelemetry.io/otel/sdk/instrumentation" + "go.opentelemetry.io/otel/sdk/metric" + "go.opentelemetry.io/otel/sdk/metric/metricdata" + "go.opentelemetry.io/otel/sdk/resource" + "os" + "sync" + "time" +) + +var oTelInitializeOnce sync.Once + +// Init initializes the OTEL SDK exactly once during startup +func Init(ctx context.Context) (shutdown func(ctx context.Context) error, err error) { + oTelInitializeOnce.Do(func() { + shutdown, err = initializeOTelSDK(ctx) + }) + return shutdown, err +} + +// initializeOTelSDK bootstraps the OTEL SDK metrics pipeline +func initializeOTelSDK(ctx context.Context) (shutdown func(ctx context.Context) error, err error) { + var shutdownFuncs []func(context.Context) error + //shutdown calls each cleanup function registered in shutdownFuncs; + //the errors from those calls are joined + shutdown = func(ctx context.Context) error { + var localErr error + for _, fn := range shutdownFuncs { + if fnErr := fn(ctx); fnErr != nil { + localErr = errors.Join(localErr, fnErr) // You can use other error accumulation strategies if needed + } + } + if localErr != nil { + logger.GetLogger().Log(logger.Warning, fmt.Sprintf("error while performing otel shutdown, err: %v", localErr)) + } + shutdownFuncs = nil + return localErr + } + + //handleErr calls shutdown for cleanup and joins all returned errors + handleErr := func(inErr error) { + err = errors.Join(inErr, shutdown(ctx)) + } + + errorTicker = time.NewTicker(time.Duration(config.OTelConfigData.OTelErrorReportingInterval) * time.Second) + + errorDataMap := make(map[string]*OTelErrorData) //Initialize the shared error map 
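+ // Store the empty map in the atomic value before the error handler is registered, so Handle() never loads a nil map.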
+ gErrorDataMap.Store(errorDataMap) + + //Setup stateLogMeter provider + meterProvider, err := newMeterProvider(ctx) + otel.SetMeterProvider(meterProvider) + if err != nil { + handleErr(err) + return nil, err + } + shutdownFuncs = append(shutdownFuncs, meterProvider.Shutdown) + + oTelErrorHandler := OTelErrorHandler{} + otel.SetErrorHandler(oTelErrorHandler) //Register custom error handler + oTelErrorHandler.processOTelErrorsMap() //Spawn Go routine peridically process OTEL errors + shutdownFuncs = append(shutdownFuncs, func(ctx context.Context) error { + errorTicker.Stop() + return nil + }) + return shutdown, err +} + +// Initialize newMeterProvider respective exporter either HTTP or GRPC exporter +func newMeterProvider(ctx context.Context) (*metric.MeterProvider, error) { + metricExporter, err := getMetricExporter(ctx) + + if err != nil { + logger.GetLogger().Log(logger.Alert, "failed to initialize metric exporter, error %v", err) + return nil, err + } + metricViews := getStateLogMetricsViews() + meterProvider := metric.NewMeterProvider( + metric.WithResource(getResourceInfo(config.OTelConfigData.PoolName)), + metric.WithReader(metric.NewPeriodicReader(metricExporter, + metric.WithInterval(time.Duration(config.OTelConfigData.ResolutionTimeInSec)*time.Second))), + metric.WithView(metricViews...), + ) + return meterProvider, nil +} + +func getStateLogMetricsViews() []metric.View { + initView := metric.NewView( + metric.Instrument{ + Name: config.OTelConfigData.PopulateMetricNamePrefix(InitConnMetric), + Scope: instrumentation.Scope{Name: StateLogMeterName}, + }, + metric.Stream{ + Aggregation: metric.AggregationBase2ExponentialHistogram{MaxSize: 32, MaxScale: 20}, + }, + ) + + acptStateView := metric.NewView( + metric.Instrument{ + Name: config.OTelConfigData.PopulateMetricNamePrefix(AccptConnMetric), + Scope: instrumentation.Scope{Name: StateLogMeterName}, + }, + metric.Stream{ + Aggregation: metric.AggregationBase2ExponentialHistogram{MaxSize: 32, MaxScale: 20}, + }, + ) + + waitStateView := metric.NewView( + metric.Instrument{ + Name: config.OTelConfigData.PopulateMetricNamePrefix(WaitConnMetric), + Scope: instrumentation.Scope{Name: StateLogMeterName}, + }, + metric.Stream{ + Aggregation: metric.AggregationBase2ExponentialHistogram{MaxSize: 32, MaxScale: 20}, + }, + ) + + busyStateView := metric.NewView( + metric.Instrument{ + Name: config.OTelConfigData.PopulateMetricNamePrefix(BusyConnMetric), + Scope: instrumentation.Scope{Name: StateLogMeterName}, + }, + metric.Stream{ + Aggregation: metric.AggregationBase2ExponentialHistogram{MaxSize: 32, MaxScale: 20}, + }, + ) + + schdStateView := metric.NewView( + metric.Instrument{ + Name: config.OTelConfigData.PopulateMetricNamePrefix(ScheduledConnMetric), + Scope: instrumentation.Scope{Name: StateLogMeterName}, + }, + metric.Stream{ + Aggregation: metric.AggregationBase2ExponentialHistogram{MaxSize: 32, MaxScale: 20}, + }, + ) + + fnshStateView := metric.NewView( + metric.Instrument{ + Name: config.OTelConfigData.PopulateMetricNamePrefix(FinishedConnMetric), + Scope: instrumentation.Scope{Name: StateLogMeterName}, + }, + metric.Stream{ + Aggregation: metric.AggregationBase2ExponentialHistogram{MaxSize: 32, MaxScale: 20}, + }, + ) + + quceStateView := metric.NewView( + metric.Instrument{ + Name: config.OTelConfigData.PopulateMetricNamePrefix(QuiescedConnMetric), + Scope: instrumentation.Scope{Name: StateLogMeterName}, + }, + metric.Stream{ + Aggregation: metric.AggregationBase2ExponentialHistogram{MaxSize: 32, MaxScale: 20}, + }, + ) + + 
asgnStateView := metric.NewView( + metric.Instrument{ + Name: config.OTelConfigData.PopulateMetricNamePrefix(AssignedConnMetric), + Scope: instrumentation.Scope{Name: StateLogMeterName}, + }, + metric.Stream{ + Aggregation: metric.AggregationBase2ExponentialHistogram{MaxSize: 32, MaxScale: 20}, + }, + ) + + idleStateView := metric.NewView( + metric.Instrument{ + Name: config.OTelConfigData.PopulateMetricNamePrefix(IdleConnMetric), + Scope: instrumentation.Scope{Name: StateLogMeterName}, + }, + metric.Stream{ + Aggregation: metric.AggregationBase2ExponentialHistogram{MaxSize: 32, MaxScale: 20}, + }, + ) + + bklgStateView := metric.NewView( + metric.Instrument{ + Name: config.OTelConfigData.PopulateMetricNamePrefix(BacklogConnMetric), + Scope: instrumentation.Scope{Name: StateLogMeterName}, + }, + metric.Stream{ + Aggregation: metric.AggregationBase2ExponentialHistogram{MaxSize: 32, MaxScale: 20}, + }, + ) + + strdStateView := metric.NewView( + metric.Instrument{ + Name: config.OTelConfigData.PopulateMetricNamePrefix(StrdConnMetric), + Scope: instrumentation.Scope{Name: StateLogMeterName}, + }, + metric.Stream{ + Aggregation: metric.AggregationBase2ExponentialHistogram{MaxSize: 32, MaxScale: 20}, + }, + ) + + freePercentageView := metric.NewView( + metric.Instrument{ + Name: config.OTelConfigData.PopulateMetricNamePrefix(freePercentage), + Scope: instrumentation.Scope{Name: StateLogMeterName}, + }, + metric.Stream{ + Aggregation: metric.AggregationBase2ExponentialHistogram{MaxSize: 10, MaxScale: 10}, + }, + ) + return []metric.View{initView, acptStateView, waitStateView, busyStateView, schdStateView, + fnshStateView, quceStateView, asgnStateView, idleStateView, bklgStateView, strdStateView, freePercentageView} +} + +// getMetricExporter Initialize metric exporter based protocol selected by user. +func getMetricExporter(ctx context.Context) (metric.Exporter, error) { + if config.OTelConfigData.OtelMetricGRPC { + return newGRPCExporter(ctx) + } + return newHTTPExporter(ctx) +} + +// newHTTPExporter Initilizes The "otlpmetrichttp" exporter in OpenTelemetry is used to export metrics data using the +// OpenTelemetry Protocol (OTLP) over HTTP. +func newHTTPExporter(ctx context.Context) (metric.Exporter, error) { + headers := make(map[string]string) + headers[IngestTokenHeader] = config.GetOTelIngestToken() + + //Currently all metrics uses delta-temporality: Delta Temporality: Use when you are interested in the rate of change + //over time or when you need to report only the differences (deltas) between measurements. + //This is useful for metrics like CPU usage, request rates, or other metrics where the rate of change is important. + var temporalitySelector = func(instrument metric.InstrumentKind) metricdata.Temporality { return metricdata.DeltaTemporality } + + if config.OTelConfigData.UseTls { + return otlpmetrichttp.New(ctx, + otlpmetrichttp.WithEndpoint(fmt.Sprintf("%s:%d", config.OTelConfigData.Host, config.OTelConfigData.MetricsPort)), + otlpmetrichttp.WithTimeout(time.Duration(config.OTelConfigData.ExporterTimeout)*time.Second), + otlpmetrichttp.WithCompression(otlpmetrichttp.NoCompression), + otlpmetrichttp.WithTemporalitySelector(temporalitySelector), + otlpmetrichttp.WithHeaders(headers), + otlpmetrichttp.WithRetry(otlpmetrichttp.RetryConfig{ + // Enabled indicates whether to not retry sending batches in case + // of export failure. + Enabled: false, + // InitialInterval the time to wait after the first failure before + // retrying. 
+ InitialInterval: 1 * time.Second, + // MaxInterval is the upper bound on backoff interval. Once this + // value is reached the delay between consecutive retries will + // always be `MaxInterval`. + MaxInterval: 10 * time.Second, + // MaxElapsedTime is the maximum amount of time (including retries) + // spent trying to send a request/batch. Once this value is + // reached, the data is discarded. + MaxElapsedTime: 20 * time.Second, + }), + otlpmetrichttp.WithURLPath(config.OTelConfigData.MetricsURLPath), + ) + } else { + return otlpmetrichttp.New(ctx, + otlpmetrichttp.WithEndpoint(fmt.Sprintf("%s:%d", config.OTelConfigData.Host, config.OTelConfigData.MetricsPort)), + otlpmetrichttp.WithTimeout(time.Duration(config.OTelConfigData.ExporterTimeout)*time.Second), + otlpmetrichttp.WithCompression(otlpmetrichttp.NoCompression), + otlpmetrichttp.WithTemporalitySelector(temporalitySelector), + otlpmetrichttp.WithHeaders(headers), + otlpmetrichttp.WithRetry(otlpmetrichttp.RetryConfig{ + // Enabled indicates whether to not retry sending batches in case + // of export failure. + Enabled: false, + // InitialInterval the time to wait after the first failure before + // retrying. + InitialInterval: 1 * time.Second, + // MaxInterval is the upper bound on backoff interval. Once this + // value is reached the delay between consecutive retries will + // always be `MaxInterval`. + MaxInterval: 10 * time.Second, + // MaxElapsedTime is the maximum amount of time (including retries) + // spent trying to send a request/batch. Once this value is + // reached, the data is discarded. + MaxElapsedTime: 20 * time.Second, + }), + otlpmetrichttp.WithURLPath(config.OTelConfigData.MetricsURLPath), + otlpmetrichttp.WithInsecure(), //Since agent is local + ) + } +} + +// newGRPCExporter Initializes The "otlpmetricgrpc" exporter in OpenTelemetry is used to export metrics data using the +// OpenTelemetry Protocol (OTLP) over GRPC. +func newGRPCExporter(ctx context.Context) (metric.Exporter, error) { + + headers := make(map[string]string) + headers[IngestTokenHeader] = config.GetOTelIngestToken() + + //Currently all metrics uses delta-temporality: Delta Temporality: Use when you are interested in the rate of change + //over time or when you need to report only the differences (deltas) between measurements. + //This is useful for metrics like CPU usage, request rates, or other metrics where the rate of change is important. + var temporalitySelector = func(instrument metric.InstrumentKind) metricdata.Temporality { return metricdata.DeltaTemporality } + if config.OTelConfigData.UseTls { + return otlpmetricgrpc.New(ctx, + otlpmetricgrpc.WithEndpoint(fmt.Sprintf("%s:%d", config.OTelConfigData.Host, config.OTelConfigData.MetricsPort)), + otlpmetricgrpc.WithTimeout(time.Duration(config.OTelConfigData.ExporterTimeout)*time.Second), + otlpmetricgrpc.WithHeaders(headers), + otlpmetricgrpc.WithReconnectionPeriod(time.Duration(5)*time.Second), + otlpmetricgrpc.WithTemporalitySelector(temporalitySelector), + otlpmetricgrpc.WithRetry(otlpmetricgrpc.RetryConfig{ + // Enabled indicates whether to not retry sending batches in case + // of export failure. + Enabled: false, + // InitialInterval the time to wait after the first failure before + // retrying. + InitialInterval: 1 * time.Second, + // MaxInterval is the upper bound on backoff interval. Once this + // value is reached the delay between consecutive retries will + // always be `MaxInterval`. 
+ MaxInterval: 10 * time.Second, + // MaxElapsedTime is the maximum amount of time (including retries) + // spent trying to send a request/batch. Once this value is + // reached, the data is discarded. + MaxElapsedTime: 20 * time.Second, + }), + ) + + } else { + return otlpmetricgrpc.New(ctx, + otlpmetricgrpc.WithEndpoint(fmt.Sprintf("%s:%d", config.OTelConfigData.Host, config.OTelConfigData.MetricsPort)), + otlpmetricgrpc.WithTimeout(time.Duration(config.OTelConfigData.ExporterTimeout)*time.Second), + otlpmetricgrpc.WithHeaders(headers), + otlpmetricgrpc.WithReconnectionPeriod(time.Duration(5)*time.Second), + otlpmetricgrpc.WithTemporalitySelector(temporalitySelector), + otlpmetricgrpc.WithRetry(otlpmetricgrpc.RetryConfig{ + // Enabled indicates whether to not retry sending batches in case + // of export failure. + Enabled: false, + // InitialInterval the time to wait after the first failure before + // retrying. + InitialInterval: 1 * time.Second, + // MaxInterval is the upper bound on backoff interval. Once this + // value is reached the delay between consecutive retries will + // always be `MaxInterval`. + MaxInterval: 10 * time.Second, + // MaxElapsedTime is the maximum amount of time (including retries) + // spent trying to send a request/batch. Once this value is + // reached, the data is discarded. + MaxElapsedTime: 20 * time.Second, + }), + otlpmetricgrpc.WithInsecure(), //Since agent is local + ) + } +} + +// getResourceInfo provide application context level attributes during initialization +func getResourceInfo(appName string) *resource.Resource { + hostname, _ := os.Hostname() + + // Create a slice to hold the attributes + attributes := []attribute.KeyValue{ + attribute.String(ContainerHostDimName, hostname), + attribute.String(ApplicationDimName, appName), + attribute.String(OtelSourceName, otelSource), + } + + environment, isEnvPresent := os.LookupEnv("ENVIRONMENT") + az, isAzPresent := os.LookupEnv("AVAILABILITY_ZONE") + if isEnvPresent { + attributes = append(attributes, attribute.String("environment", environment)) + } + if isAzPresent { + attributes = append(attributes, attribute.String("az", az)) + } + resource := resource.NewWithAttributes(fmt.Sprintf("%s resource", config.OTelConfigData.ResourceType), + attributes..., + ) + return resource +} diff --git a/utility/logger/otel/state_logger.go b/utility/logger/otel/state_logger.go new file mode 100644 index 00000000..2a4aa430 --- /dev/null +++ b/utility/logger/otel/state_logger.go @@ -0,0 +1,445 @@ +package otel + +import ( + "context" + "fmt" + "github.com/paypal/hera/cal" + "github.com/paypal/hera/utility/logger" + otelconfig "github.com/paypal/hera/utility/logger/otel/config" + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/metric" + "os" + "sync" + "time" +) + +const defaultAppName string = "occ" +const oTelCallbackRegistrationRetryCount int = 3 //Max retry configuration for registering callback function + +// This lock prevents a race between batch observer and instrument registration +var registerStateMetrics sync.Once +var metricsStateLogger *StateLogMetrics +var totalConnectionStateDataLogger *TotalConnectionsGaugeData + +// Implement apply function in to configure stateLogMeter provider +func (o MetricProviderOption) apply(c *stateLogMetricsConfig) { + if o.MeterProvider != nil { + c.MeterProvider = o.MeterProvider + } +} + +// Implement apply function in to configure pool name +func (appName AppNameOption) apply(c *stateLogMetricsConfig) { + if appName != "" { + 
c.appName = string(appName) + } +} + +// WithAppName Create StateLogMetrics with OCC Name +func WithAppName(appName string) StateLogOption { + return AppNameOption(appName) +} + +// WithMetricProvider Create StateLogMetrics with provided stateLogMeter Provider +func WithMetricProvider(provider metric.MeterProvider) StateLogOption { + return MetricProviderOption{provider} +} + +// newConfig computes a config from the supplied Options. +func newConfig(opts ...StateLogOption) stateLogMetricsConfig { + statesConfig := stateLogMetricsConfig{ + MeterProvider: otel.GetMeterProvider(), + appName: defaultAppName, + } + + for _, opt := range opts { + opt.apply(&statesConfig) + } + return statesConfig +} + +// StartMetricsCollection initializes reporting of stateLogMetrics using the supplied config. +func StartMetricsCollection(ctx context.Context, totalWorkersCount int, opt ...StateLogOption) error { + stateLogMetricsConfig := newConfig(opt...) + + //Verification of config data + if stateLogMetricsConfig.appName == "" { + stateLogMetricsConfig.appName = defaultAppName + } + + if stateLogMetricsConfig.MeterProvider == nil { + stateLogMetricsConfig.MeterProvider = otel.GetMeterProvider() + } + + var err error + //Registers instrumentation for metrics + registerStateMetrics.Do(func() { + hostName, hostErr := os.Hostname() + if hostErr != nil { + logger.GetLogger().Log(logger.Alert, "Failed to fetch hostname for current container", err) + } + stateLogMeter := stateLogMetricsConfig.MeterProvider.Meter(StateLogMeterName, + metric.WithInstrumentationVersion(OtelInstrumentationVersion)) + //Initialize state-log metrics + metricsStateLogger = &StateLogMetrics{ + stateLogMeter: stateLogMeter, + hostname: hostName, + mStateDataChan: make(chan *WorkersStateData, totalWorkersCount*otelconfig.OTelConfigData.ResolutionTimeInSec*2), //currently OTEL polling interval hardcoded as 10. Size of bufferred channel = totalWorkersCount * pollingInterval * 2, + doneCh: make(chan struct{}), + } + + totalConnectionStateDataLogger = &TotalConnectionsGaugeData{ + stateLogMeter: stateLogMeter, + hostname: hostName, + totalConnDataChannel: make(chan *GaugeMetricData, totalWorkersCount*otelconfig.OTelConfigData.ResolutionTimeInSec*2), //currently OTEL polling interval hardcoded as 10. 
Size of bufferred channel = totalWorkersCount * pollingInterval * 2, + stopPublish: make(chan struct{}), + } + err = registerMetrics(metricsStateLogger, totalConnectionStateDataLogger) + if err != nil { + logger.GetLogger().Log(logger.Alert, "Failed to register state metrics collector", err) + } else { + for retryCount := 0; retryCount < oTelCallbackRegistrationRetryCount; retryCount++ { + err = totalConnectionStateDataLogger.registerCallbackForTotalConnectionsData() + if err != nil { + logger.GetLogger().Log(logger.Alert, "Failed to register callback for totalConnectionStateDataLogger gauge metric: ", err, "number of retries: ", retryCount) + } else { + logger.GetLogger().Log(logger.Info, "registered callback for totalConnectionStateDataLogger gauge metrics") + break + } + } + if err == nil { + go metricsStateLogger.startStateLogMetricsPoll(ctx) //Goroutine to poll HERA states data + } + } + }) + if err != nil { + calEvent := cal.NewCalEvent("OTEL", "METRIC_REGISTRATION", "1", fmt.Sprintf("msg=failed register statelog metrics, error: %v", err)) + calEvent.AddDataInt("loggedTime", time.Now().Unix()) + calEvent.Completed() + } else { + calEvent := cal.NewCalEvent("OTEL", "METRIC_REGISTRATION", "0", "msg=state-log metrics registration success") + calEvent.AddDataInt("loggedTime", time.Now().Unix()) + calEvent.Completed() + } + return err +} + +// StopMetricCollection Send notification to stateLogMetrics.stopPublish to stop metric collection +func StopMetricCollection() { + var wg sync.WaitGroup + wg.Add(2) + //Goroutine 1 + go func() { + defer wg.Done() + select { + case metricsStateLogger.doneCh <- struct{}{}: + logger.GetLogger().Log(logger.Info, "this stop metric collection for state-log data on channel metricsStateLogger.") + return + default: + logger.GetLogger().Log(logger.Info, "metricsStateLogger done channel has already been closed.") + return + } + }() + + go func() { + defer wg.Done() + select { + case totalConnectionStateDataLogger.stopPublish <- struct{}{}: + logger.GetLogger().Log(logger.Info, "this stop metric collection for state-log data on channel totalConnectionStateDataLogger.") + return + default: + logger.GetLogger().Log(logger.Info, "totalConnectionStateDataLogger.stopPublish channel has already been closed.") + return + } + }() + wg.Wait() +} + +// AddDataPointToOTELStateDataChan Send data to stateLogMetrics.mStateDataChan channel +func AddDataPointToOTELStateDataChan(dataPoint *WorkersStateData) { + defer func() { + if r := recover(); r != nil { + logger.GetLogger().Log(logger.Info, "Panic while adding data-points to StateDataChannel, Recovered from panic: ", r) + } + }() + select { + case metricsStateLogger.mStateDataChan <- dataPoint: + return + case <-time.After(time.Second * 1): + logger.GetLogger().Log(logger.Info, "timeout occurred while adding record to stats data channel") + event := cal.NewCalEvent("OTEL", "DATA_TIMEOUT", "1", "msg=timeout occurred while adding record to mStateDataChan channel") + event.AddDataInt("loggedTime", time.Now().Unix()) + event.Completed() + return + } +} + +// AddDataPointToTotalConnectionsDataChannel Send data to totalConnectionStateDataLogger.totalConnDataChannel channel +func AddDataPointToTotalConnectionsDataChannel(totalConnectionData *GaugeMetricData) { + defer func() { + if r := recover(); r != nil { + logger.GetLogger().Log(logger.Info, "Panic while adding data-points to totalConnDataChannel, Recovered from panic: ", r) + } + }() + select { + case totalConnectionStateDataLogger.totalConnDataChannel <- totalConnectionData: 
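+ // Data point handed off; the registered gauge callback drains this channel on the next collector poll.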
+ return + case <-time.After(time.Second * 1): + logger.GetLogger().Log(logger.Info, "timeout occurred while adding guage data record to totalConnDataChannel channel") + event := cal.NewCalEvent("OTEL", "DATA_TIMEOUT", "1", "msg=timeout occurred while adding guage data record to totalConnDataChannel channel") + event.AddDataInt("loggedTime", time.Now().Unix()) + event.Completed() + return + } +} + +// Define Instrumentation for each metrics and register with StateLogMetrics +func registerMetrics(stateLogMetrics *StateLogMetrics, totalConnectionsMetrics *TotalConnectionsGaugeData) error { + + //"init", "acpt", "wait", "busy", "schd", "fnsh", "quce", "asgn", "idle", "bklg", "strd", "cls" + var err error + if stateLogMetrics.initState, err = stateLogMetrics.stateLogMeter.Int64Histogram( + otelconfig.OTelConfigData.PopulateMetricNamePrefix(InitConnMetric), + metric.WithDescription("Number of workers in init state"), + ); err != nil { + logger.GetLogger().Log(logger.Alert, "Failed to register guage metric for init state", err) + return err + } + + if stateLogMetrics.acptState, err = stateLogMetrics.stateLogMeter.Int64Histogram( + otelconfig.OTelConfigData.PopulateMetricNamePrefix(AccptConnMetric), + metric.WithDescription("Number of workers in accept state"), + ); err != nil { + logger.GetLogger().Log(logger.Alert, "Failed to register guage metric for accept state", err) + return err + } + + if stateLogMetrics.waitState, err = stateLogMetrics.stateLogMeter.Int64Histogram( + otelconfig.OTelConfigData.PopulateMetricNamePrefix(WaitConnMetric), + metric.WithDescription("Number of workers in wait state"), + ); err != nil { + logger.GetLogger().Log(logger.Alert, "Failed to register guage metric for wait state", err) + return err + } + + if stateLogMetrics.busyState, err = stateLogMetrics.stateLogMeter.Int64Histogram( + otelconfig.OTelConfigData.PopulateMetricNamePrefix(BusyConnMetric), + metric.WithDescription("Number of workers in busy state"), + ); err != nil { + logger.GetLogger().Log(logger.Alert, "Failed to register guage metric for busy state", err) + return err + } + + if stateLogMetrics.schdState, err = stateLogMetrics.stateLogMeter.Int64Histogram( + otelconfig.OTelConfigData.PopulateMetricNamePrefix(ScheduledConnMetric), + metric.WithDescription("Number of workers in scheduled state"), + ); err != nil { + logger.GetLogger().Log(logger.Alert, "Failed to register guage metric for scheduled state", err) + return err + } + + if stateLogMetrics.fnshState, err = stateLogMetrics.stateLogMeter.Int64Histogram( + otelconfig.OTelConfigData.PopulateMetricNamePrefix(FinishedConnMetric), + metric.WithDescription("Number of workers in finished state"), + ); err != nil { + logger.GetLogger().Log(logger.Alert, "Failed to register guage metric for finished state", err) + return err + } + + if stateLogMetrics.quceState, err = stateLogMetrics.stateLogMeter.Int64Histogram( + otelconfig.OTelConfigData.PopulateMetricNamePrefix(QuiescedConnMetric), + metric.WithDescription("Number of workers in quiesced state"), + ); err != nil { + logger.GetLogger().Log(logger.Alert, "Failed to register guage metric for quiesced state", err) + return err + } + + if stateLogMetrics.asgnState, err = stateLogMetrics.stateLogMeter.Int64Histogram( + otelconfig.OTelConfigData.PopulateMetricNamePrefix(AssignedConnMetric), + metric.WithDescription("Number of workers in assigned state"), + ); err != nil { + logger.GetLogger().Log(logger.Alert, "Failed to register guage metric for assigned state", err) + return err + } + + if 
stateLogMetrics.idleState, err = stateLogMetrics.stateLogMeter.Int64Histogram( + otelconfig.OTelConfigData.PopulateMetricNamePrefix(IdleConnMetric), + metric.WithDescription("Number of workers in idle state"), + ); err != nil { + logger.GetLogger().Log(logger.Alert, "Failed to register guage metric for idle state", err) + return err + } + + if stateLogMetrics.bklgState, err = stateLogMetrics.stateLogMeter.Int64Histogram( + otelconfig.OTelConfigData.PopulateMetricNamePrefix(BacklogConnMetric), + metric.WithDescription("Number of workers in backlog state"), + ); err != nil { + logger.GetLogger().Log(logger.Alert, "Failed to register guage metric for backlog state", err) + return err + } + + if stateLogMetrics.strdState, err = stateLogMetrics.stateLogMeter.Int64Histogram( + otelconfig.OTelConfigData.PopulateMetricNamePrefix(StrdConnMetric), + metric.WithDescription("Number of connections in stranded state"), + ); err != nil { + logger.GetLogger().Log(logger.Alert, "Failed to register guage metric for stranded state", err) + return err + } + + if stateLogMetrics.freePercentage, err = stateLogMetrics.stateLogMeter.Float64Histogram( + otelconfig.OTelConfigData.PopulateMetricNamePrefix(freePercentage), + metric.WithDescription("Free connections percentage"), + ); err != nil { + logger.GetLogger().Log(logger.Alert, "Failed to register guage metric for free connections percentage state", err) + return err + } + + //Register Gauge metric + if totalConnectionsMetrics.totalConnections, err = totalConnectionsMetrics.stateLogMeter.Int64ObservableGauge( + otelconfig.OTelConfigData.PopulateMetricNamePrefix(totalConnections), + metric.WithDescription("Total Connection"), + ); err != nil { + logger.GetLogger().Log(logger.Alert, "Failed to register total connection guage metric", err) + return err + } + if err != nil { + return err + } + return nil +} + +/* + * asyncStateLogMetricsPoll poll operation involved periodically by OTEL collector based-on its polling interval + * it poll metrics from channel do aggregation or compute max based combination of shardId + workerType + InstanceId + */ +func (stateLogMetrics *StateLogMetrics) startStateLogMetricsPoll(ctx context.Context) { +mainloop: + for { + select { + case workersState, more := <-stateLogMetrics.mStateDataChan: + stateLogsData := make(map[string]map[string]int64) + var stateLogTitle string + if !more { + logger.GetLogger().Log(logger.Info, "Statelog metrics data channel 'mStateDataChan' has been closed.") + break mainloop + } + keyName := fmt.Sprintf("%d-%d-%d", workersState.ShardId, workersState.WorkerType, workersState.InstanceId) + + if stateLogsData[keyName] == nil { + stateLogsData[keyName] = make(map[string]int64) + } + //Update metadata information + stateLogTitle = workersState.StateTitle + stateLogsData[keyName][ShardId] = int64(workersState.ShardId) + stateLogsData[keyName][WorkerType] = int64(workersState.WorkerType) + stateLogsData[keyName][InstanceId] = int64(workersState.InstanceId) + stateLogsData[keyName][Datapoints] += 1 + for key, value := range workersState.StateData { + stateLogsData[keyName][key] = value + } + if len(stateLogsData) > 0 { + stateLogMetrics.sendMetricsDataToCollector(ctx, &stateLogTitle, stateLogsData) + } + case <-stateLogMetrics.doneCh: + logger.GetLogger().Log(logger.Alert, "received stopped signal for processing statelog metric. 
"+ + "so stop sending data and closing data channel") + close(stateLogMetrics.mStateDataChan) + break mainloop + case <-ctx.Done(): + logger.GetLogger().Log(logger.Info, "parent context has been closed for metric poll, so exiting loop") + break mainloop + } + } +} + +/* + * Send metrics datat data-points to collector + */ +func (stateLogMetrics *StateLogMetrics) sendMetricsDataToCollector(ctx context.Context, stateLogTitle *string, stateLogsData map[string]map[string]int64) { + for key, aggStatesData := range stateLogsData { + logger.GetLogger().Log(logger.Info, fmt.Sprintf("publishing state logs histogram data for shardid-workertype-instanceId: %s using datapoints size: %d", key, aggStatesData[Datapoints])) + commonLabels := []attribute.KeyValue{ + attribute.Int(ShardId, int(aggStatesData[ShardId])), + attribute.String(WorkerType, WorkerTypeMap[int(aggStatesData[WorkerType])]), + attribute.Int(InstanceId, int(aggStatesData[InstanceId])), + attribute.String(OccWorkerParamName, *stateLogTitle), + attribute.String(HostDimensionName, stateLogMetrics.hostname), + } + //Observe states data + //1. Worker States + stateLogMetrics.initState.Record(ctx, aggStatesData["init"], metric.WithAttributes(commonLabels...)) + stateLogMetrics.acptState.Record(ctx, aggStatesData["acpt"], metric.WithAttributes(commonLabels...)) + stateLogMetrics.waitState.Record(ctx, aggStatesData["wait"], metric.WithAttributes(commonLabels...)) + stateLogMetrics.busyState.Record(ctx, aggStatesData["busy"], metric.WithAttributes(commonLabels...)) + stateLogMetrics.schdState.Record(ctx, aggStatesData["schd"], metric.WithAttributes(commonLabels...)) + stateLogMetrics.fnshState.Record(ctx, aggStatesData["fnsh"], metric.WithAttributes(commonLabels...)) + stateLogMetrics.quceState.Record(ctx, aggStatesData["quce"], metric.WithAttributes(commonLabels...)) + + //2. Free Percentage + freePercentageVal := (float64(aggStatesData["acpt"]+aggStatesData["fnsh"]) / float64(aggStatesData["totalConnections"])) * 100 + stateLogMetrics.freePercentage.Record(ctx, freePercentageVal, metric.WithAttributes(commonLabels...)) + + //2. 
+		// 2. Free percentage
+		freePercentageVal := (float64(aggStatesData["acpt"]+aggStatesData["fnsh"]) / float64(aggStatesData["totalConnections"])) * 100
+		stateLogMetrics.freePercentage.Record(ctx, freePercentageVal, metric.WithAttributes(commonLabels...))
+
+		// 3. Connection states
+		stateLogMetrics.asgnState.Record(ctx, aggStatesData["asgn"], metric.WithAttributes(commonLabels...))
+		stateLogMetrics.idleState.Record(ctx, aggStatesData["idle"], metric.WithAttributes(commonLabels...))
+		stateLogMetrics.bklgState.Record(ctx, aggStatesData["bklg"], metric.WithAttributes(commonLabels...))
+		stateLogMetrics.strdState.Record(ctx, aggStatesData["strd"], metric.WithAttributes(commonLabels...))
+	}
+}
+
+// registerCallbackForTotalConnectionsData registers a callback function that pulls total-connections data for each worker type.
+func (totalConnectionGauge *TotalConnectionsGaugeData) registerCallbackForTotalConnectionsData() error {
+	var regError error
+	totalConnectionGauge.registration, regError = totalConnectionGauge.stateLogMeter.RegisterCallback(
+		func(ctx context.Context, observer metric.Observer) error {
+			finalDataMap := make(map[string]*GaugeMetricData)
+			// Drain whatever is currently buffered in the channel; the map keeps only the
+			// latest value per shard/worker-type/instance key for this collection cycle.
+		totalConLoop:
+			for {
+				select {
+				case totalConnData, dataPresent := <-totalConnectionGauge.totalConnDataChannel:
+					if !dataPresent {
+						logger.GetLogger().Log(logger.Info, "totalConnection gauge data channel 'totalConnDataChannel' has been closed.")
+						break totalConLoop
+					} else {
+						keyName := fmt.Sprintf("%d-%d-%d", totalConnData.ShardId, totalConnData.WorkerType, totalConnData.InstanceId)
+						finalDataMap[keyName] = totalConnData
+					}
+				case <-totalConnectionGauge.stopPublish:
+					logger.GetLogger().Log(logger.Alert, "received stop signal for processing the statelog total connections metric, "+
+						"so stop sending data to totalConnectionGauge.totalConnDataChannel and close the data channel")
+					close(totalConnectionGauge.totalConnDataChannel)
+					if totalConnectionGauge.registration != nil {
+						logger.GetLogger().Log(logger.Info, "received stop signal for processing the statelog total connections metric, "+
+							"so unregistering the callback function")
+						go totalConnectionGauge.registration.Unregister()
+					}
+					break totalConLoop
+				case <-ctx.Done():
+					logger.GetLogger().Log(logger.Info, "context has been closed, so exiting the totalConnDataChannel loop")
+					break totalConLoop
+				default:
+					logger.GetLogger().Log(logger.Info, "totalConnDataChannel is empty")
+					break totalConLoop
+				}
+			}
+			if len(finalDataMap) > 0 {
+				for key, dataPoint := range finalDataMap {
+					logger.GetLogger().Log(logger.Info, fmt.Sprintf("publishing total connection gauge for key: %s, worker type: %s with datapoints value: %d", key, dataPoint.StateTitle, dataPoint.StateData))
+					commonLabels := []attribute.KeyValue{
+						attribute.Int(ShardId, dataPoint.ShardId),
+						attribute.Int(WorkerType, dataPoint.WorkerType),
+						attribute.Int(InstanceId, dataPoint.InstanceId),
+						attribute.String(OccWorkerParamName, dataPoint.StateTitle),
+					}
+					observer.ObserveInt64(totalConnectionGauge.totalConnections, dataPoint.StateData, metric.WithAttributes(commonLabels...))
+				}
+			}
+			return nil
+		}, totalConnectionGauge.totalConnections)
+	if regError != nil {
+		logger.GetLogger().Log(logger.Alert, fmt.Sprintf("Failed to register total connection gauge for total worker metric. 
error %v", regError)) + return regError + } + return nil +} diff --git a/utility/logger/otel/test/mock_collector.go b/utility/logger/otel/test/mock_collector.go new file mode 100644 index 00000000..ba64445c --- /dev/null +++ b/utility/logger/otel/test/mock_collector.go @@ -0,0 +1,341 @@ +package otel + +import ( + "bytes" + "compress/gzip" + "context" + "crypto/ecdsa" + "crypto/elliptic" + cryptorand "crypto/rand" + "crypto/tls" + "crypto/x509" + "crypto/x509/pkix" + "encoding/pem" + "fmt" + "io" + "io/ioutil" + "math/big" + mathrand "math/rand" + "net" + "net/http" + "sync" + "testing" + "time" + + "google.golang.org/protobuf/proto" + + collectormetricpb "go.opentelemetry.io/proto/otlp/collector/metrics/v1" + metricpb "go.opentelemetry.io/proto/otlp/metrics/v1" +) + +const DefaultMetricsPath string = "/v1/metrics" + +type mockCollector struct { + endpoint string + server *http.Server + + spanLock sync.Mutex + metricsStorage MetricsStorage + + injectHTTPStatus []int + injectContentType string + delay <-chan struct{} + + clientTLSConfig *tls.Config + expectedHeaders map[string]string +} + +func (c *mockCollector) Stop() error { + return c.server.Shutdown(context.Background()) +} + +func (c *mockCollector) MustStop(t *testing.T) { + c.server.Shutdown(context.Background()) +} + +func (c *mockCollector) GetMetrics() []*metricpb.Metric { + c.spanLock.Lock() + defer c.spanLock.Unlock() + return c.metricsStorage.GetMetrics() +} + +func (c *mockCollector) Endpoint() string { + return c.endpoint +} + +func (c *mockCollector) ClientTLSConfig() *tls.Config { + return c.clientTLSConfig +} + +func (c *mockCollector) serveMetrics(w http.ResponseWriter, r *http.Request) { + if c.delay != nil { + select { + case <-c.delay: + case <-r.Context().Done(): + return + } + } + + if !c.checkHeaders(r) { + w.WriteHeader(http.StatusBadRequest) + return + } + response := collectormetricpb.ExportMetricsServiceResponse{} + rawResponse, err := proto.Marshal(&response) + if err != nil { + w.WriteHeader(http.StatusInternalServerError) + return + } + if injectedStatus := c.getInjectHTTPStatus(); injectedStatus != 0 { + writeReply(w, rawResponse, injectedStatus, c.injectContentType) + return + } + rawRequest, err := readRequest(r) + if err != nil { + w.WriteHeader(http.StatusInternalServerError) + return + } + // fmt.Println("---------------raw req--------------", rawRequest) + request, err := unmarshalMetricsRequest(rawRequest, r.Header.Get("content-type")) + if err != nil { + w.WriteHeader(http.StatusBadRequest) + return + } + writeReply(w, rawResponse, 0, c.injectContentType) + c.spanLock.Lock() + defer c.spanLock.Unlock() + + fmt.Println("---------------serveMetrics--------------", request) + c.metricsStorage.AddMetrics(request) +} + +func unmarshalMetricsRequest(rawRequest []byte, contentType string) (*collectormetricpb.ExportMetricsServiceRequest, error) { + request := &collectormetricpb.ExportMetricsServiceRequest{} + if contentType != "application/x-protobuf" { + return request, fmt.Errorf("invalid content-type: %s, only application/x-protobuf is supported", contentType) + } + err := proto.Unmarshal(rawRequest, request) + return request, err +} + +func (c *mockCollector) checkHeaders(r *http.Request) bool { + for k, v := range c.expectedHeaders { + got := r.Header.Get(k) + if got != v { + return false + } + } + return true +} + +func (c *mockCollector) getInjectHTTPStatus() int { + if len(c.injectHTTPStatus) == 0 { + return 0 + } + status := c.injectHTTPStatus[0] + c.injectHTTPStatus = c.injectHTTPStatus[1:] + if 
len(c.injectHTTPStatus) == 0 { + c.injectHTTPStatus = nil + } + return status +} + +func readRequest(r *http.Request) ([]byte, error) { + if r.Header.Get("Content-Encoding") == "gzip" { + return readGzipBody(r.Body) + } + return ioutil.ReadAll(r.Body) +} + +func readGzipBody(body io.Reader) ([]byte, error) { + rawRequest := bytes.Buffer{} + gunzipper, err := gzip.NewReader(body) + if err != nil { + return nil, err + } + defer gunzipper.Close() + _, err = io.Copy(&rawRequest, gunzipper) + if err != nil { + return nil, err + } + return rawRequest.Bytes(), nil +} + +func writeReply(w http.ResponseWriter, rawResponse []byte, injectHTTPStatus int, injectContentType string) { + status := http.StatusOK + if injectHTTPStatus != 0 { + status = injectHTTPStatus + } + contentType := "application/x-protobuf" + if injectContentType != "" { + contentType = injectContentType + } + w.Header().Set("Content-Type", contentType) + w.WriteHeader(status) + _, _ = w.Write(rawResponse) +} + +type mockCollectorConfig struct { + MetricsURLPath string + Port int + InjectHTTPStatus []int + InjectContentType string + Delay <-chan struct{} + WithTLS bool + ExpectedHeaders map[string]string +} + +func (c *mockCollectorConfig) fillInDefaults() { + if c.MetricsURLPath == "" { + c.MetricsURLPath = DefaultMetricsPath + } +} + +func runMockCollector(t *testing.T, cfg mockCollectorConfig) *mockCollector { + cfg.fillInDefaults() + ln, _ := net.Listen("tcp", fmt.Sprintf("localhost:%d", cfg.Port)) + // require.NoError(t, err) + _, portStr, _ := net.SplitHostPort(ln.Addr().String()) + // require.NoError(t, err) + m := &mockCollector{ + endpoint: fmt.Sprintf("localhost:%s", portStr), + metricsStorage: NewMetricsStorage(), + injectHTTPStatus: cfg.InjectHTTPStatus, + injectContentType: cfg.InjectContentType, + delay: cfg.Delay, + expectedHeaders: cfg.ExpectedHeaders, + } + mux := http.NewServeMux() + mux.Handle(cfg.MetricsURLPath, http.HandlerFunc(m.serveMetrics)) + server := &http.Server{ + Handler: mux, + } + if cfg.WithTLS { + pem, _ := generateWeakCertificate() + // require.NoError(t, err) + tlsCertificate, _ := tls.X509KeyPair(pem.Certificate, pem.PrivateKey) + // require.NoError(t, err) + server.TLSConfig = &tls.Config{ + Certificates: []tls.Certificate{tlsCertificate}, + } + + m.clientTLSConfig = &tls.Config{ + InsecureSkipVerify: true, + } + } + go func() { + if cfg.WithTLS { + _ = server.ServeTLS(ln, "", "") + } else { + _ = server.Serve(ln) + } + }() + m.server = server + return m +} + +type mathRandReader struct{} + +func (mathRandReader) Read(p []byte) (n int, err error) { + return mathrand.Read(p) +} + +var randReader mathRandReader + +func generateWeakCertificate() (*pemCertificate, error) { + priv, err := ecdsa.GenerateKey(elliptic.P256(), randReader) + if err != nil { + return nil, err + } + keyUsage := x509.KeyUsageDigitalSignature + notBefore := time.Now() + notAfter := notBefore.Add(time.Hour) + serialNumberLimit := new(big.Int).Lsh(big.NewInt(1), 128) + serialNumber, err := cryptorand.Int(randReader, serialNumberLimit) + if err != nil { + return nil, err + } + template := x509.Certificate{ + SerialNumber: serialNumber, + Subject: pkix.Name{ + Organization: []string{"otel_basic-go"}, + }, + NotBefore: notBefore, + NotAfter: notAfter, + KeyUsage: keyUsage, + ExtKeyUsage: []x509.ExtKeyUsage{x509.ExtKeyUsageServerAuth}, + BasicConstraintsValid: true, + DNSNames: []string{"localhost"}, + IPAddresses: []net.IP{net.IPv6loopback, net.IPv4(127, 0, 0, 1)}, + } + derBytes, err := x509.CreateCertificate(randReader, 
&template, &template, &priv.PublicKey, priv)
+	if err != nil {
+		return nil, err
+	}
+	certificateBuffer := new(bytes.Buffer)
+	if err := pem.Encode(certificateBuffer, &pem.Block{Type: "CERTIFICATE", Bytes: derBytes}); err != nil {
+		return nil, err
+	}
+	privDERBytes, err := x509.MarshalPKCS8PrivateKey(priv)
+	if err != nil {
+		return nil, err
+	}
+	privBuffer := new(bytes.Buffer)
+	if err := pem.Encode(privBuffer, &pem.Block{Type: "PRIVATE KEY", Bytes: privDERBytes}); err != nil {
+		return nil, err
+	}
+	return &pemCertificate{
+		Certificate: certificateBuffer.Bytes(),
+		PrivateKey:  privBuffer.Bytes(),
+	}, nil
+}
+
+type pemCertificate struct {
+	Certificate []byte
+	PrivateKey  []byte
+}
+
+// Collector is an interface that mock collectors should implement,
+// so they can be used for end-to-end testing.
+type Collector interface {
+	Stop() error
+	GetMetrics() []*metricpb.Metric
+}
+
+// MetricsStorage stores the metrics. Mock collectors can use it to
+// store the metrics they have received.
+type MetricsStorage struct {
+	metrics []*metricpb.Metric
+}
+
+// NewMetricsStorage creates a new metrics storage.
+func NewMetricsStorage() MetricsStorage {
+	return MetricsStorage{}
+}
+
+// AddMetrics adds metrics to the metrics storage.
+func (s *MetricsStorage) AddMetrics(request *collectormetricpb.ExportMetricsServiceRequest) {
+	for _, rm := range request.GetResourceMetrics() {
+		// TODO (rghetia) handle multiple resource and library info.
+		if len(rm.ScopeMetrics) > 0 {
+			s.metrics = append(s.metrics, rm.ScopeMetrics[0].Metrics...)
+			fmt.Println("Metric added successfully")
+		} else {
+			fmt.Println("Metrics addition failed: no scope metrics present")
+		}
+	}
+}
+
+// GetMetrics returns the stored metrics.
+func (s *MetricsStorage) GetMetrics() []*metricpb.Metric {
+	// Return a copy so callers cannot mutate the internal slice.
+	m := make([]*metricpb.Metric, 0, len(s.metrics))
+	return append(m, s.metrics...)
+}
diff --git a/utility/logger/otel/test/state_logger_test.go b/utility/logger/otel/test/state_logger_test.go
new file mode 100644
index 00000000..401f90b5
--- /dev/null
+++ b/utility/logger/otel/test/state_logger_test.go
@@ -0,0 +1,392 @@
+package otel
+
+import (
+	"context"
+	"fmt"
+	otellogger "github.com/paypal/hera/utility/logger/otel"
+	otelconfig "github.com/paypal/hera/utility/logger/otel/config"
+	"go.opentelemetry.io/otel"
+	"go.opentelemetry.io/otel/attribute"
+	"go.opentelemetry.io/otel/sdk/metric"
+	"go.opentelemetry.io/otel/sdk/resource"
+	"math/rand"
+	"os"
+	"testing"
+	"time"
+
+	"github.com/paypal/hera/utility/logger"
+	"go.opentelemetry.io/otel/exporters/stdout/stdoutmetric"
+)
+
+// initializeConsoleExporter initializes a console (stdout) exporter for metrics.
+func initializeConsoleExporter() (*metric.MeterProvider, error) {
+	otelconfig.OTelConfigData = &otelconfig.OTelConfig{
+		Host:                       "localhost",
+		MetricsPort:                4318,
+		TracePort:                  4318,
+		Enabled:                    true,
+		OtelMetricGRPC:             false,
+		OtelTraceGRPC:              false,
+		ResolutionTimeInSec:        2,
+		OTelErrorReportingInterval: 10,
+		PoolName:                   "occ-testapp",
+		MetricNamePrefix:           "pp.occ",
+		MetricsURLPath:             DefaultMetricsPath,
+	}
+	hostname, _ := os.Hostname()
+
+	testResource := resource.NewWithAttributes("OCC resource",
+		attribute.String("container_host", hostname),
+		attribute.String("az", "devTest"),
+		attribute.String("environment", "dev"),
+		attribute.String("application", "occ-testapp"),
+	)
+	metricExporter, err := stdoutmetric.New(stdoutmetric.WithPrettyPrint())
+	if err != nil {
+		logger.GetLogger().Log(logger.Alert, "failed to initialize metric stdout exporter:", err)
+		return nil, err
+	}
+
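+	// Hook the stdout exporter into a periodic reader so the recorded state-log
+	// metrics are printed to the console at every collection interval.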
+	meterProvider := metric.NewMeterProvider(
+		metric.WithResource(testResource),
+		metric.WithReader(metric.NewPeriodicReader(metricExporter,
+			// Default is 1m. Set to 3s for demonstrative purposes.
+			metric.WithInterval(3*time.Second))),
+	)
+	otel.SetMeterProvider(meterProvider)
+	return meterProvider, nil
+}
+
+func initializeCustomOTelExporter(t *testing.T) func(ctx context.Context) error {
+	otelconfig.OTelConfigData = &otelconfig.OTelConfig{
+		Host:                       "localhost",
+		MetricsPort:                4318,
+		TracePort:                  4318,
+		Enabled:                    true,
+		OtelMetricGRPC:             false,
+		OtelTraceGRPC:              false,
+		ResolutionTimeInSec:        2,
+		OTelErrorReportingInterval: 2,
+		PoolName:                   "occ-testapp",
+		MetricNamePrefix:           "pp.occ",
+		MetricsURLPath:             DefaultMetricsPath,
+	}
+	otelconfig.SetOTelIngestToken("welcome123")
+	ctx := context.Background()
+	shutdownFn, err := otellogger.Init(ctx)
+	if err != nil {
+		t.Fatalf("failed to initialize OTEL sdk during test, error: %v", err)
+	}
+	return shutdownFn
+}
+
+func TestVerifyStateLogMetricsInitialization(t *testing.T) {
+	mc := runMockCollector(t, mockCollectorConfig{
+		Port: 4318,
+	})
+	defer mc.MustStop(t)
+
+	_, err := initializeConsoleExporter()
+	if err != nil {
+		t.Fail()
+	}
+
+	err = otellogger.StartMetricsCollection(context.Background(), 5, otellogger.WithMetricProvider(otel.GetMeterProvider()), otellogger.WithAppName("occ-testapp"))
+
+	if err != nil {
+		logger.GetLogger().Log(logger.Alert, "Failed to initialize Metric Collection service")
+		t.Fail()
+	}
+	defer otellogger.StopMetricCollection()
+	time.Sleep(15 * time.Second)
+}
+
+func TestVerifyStateLogMetricsInitializationAndContextWithTimeout(t *testing.T) {
+	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
+	defer cancel()
+	mc := runMockCollector(t, mockCollectorConfig{
+		Port: 4318,
+	})
+	defer mc.MustStop(t)
+
+	_, err := initializeConsoleExporter()
+	if err != nil {
+		t.Fail()
+	}
+
+	err = otellogger.StartMetricsCollection(ctx, 5, otellogger.WithMetricProvider(otel.GetMeterProvider()), otellogger.WithAppName("occ-testapp"))
+	defer otellogger.StopMetricCollection()
+	time.Sleep(2 * time.Second)
+	if err != nil {
+		logger.GetLogger().Log(logger.Alert, "Failed to initialize Metric Collection service")
+		t.Fail()
+	}
+}
+
+func TestSendingStateLogMetrics(t *testing.T) {
+	mc := runMockCollector(t, mockCollectorConfig{
+		Port:    4318,
+		WithTLS: false,
+	})
+	defer mc.MustStop(t)
+
+	shutDownFn := initializeCustomOTelExporter(t)
+	defer shutDownFn(context.Background())
+
+	time.Sleep(2 * time.Second)
+
+	err := otellogger.StartMetricsCollection(context.Background(), 5, otellogger.WithMetricProvider(otel.GetMeterProvider()), otellogger.WithAppName("occ-testapp"))
+
+	if err != nil {
+		logger.GetLogger().Log(logger.Alert, "Failed to initialize Metric Collection service")
+		t.Fail()
+	}
+	defer otellogger.StopMetricCollection()
+
+	// Possible states: "init", "acpt", "wait", "busy", "schd", "fnsh", "quce", "asgn", "idle", "bklg", "strd", "cls"
+	var stateData = map[string]int64{
+		"init":             6,
+		"acpt":             10,
+		"wait":             5,
+		"busy":             2,
+		"idle":             5,
+		"bklg":             0,
+		"req":              5,
+		"resp":             5,
+		"fnsh":             10,
+		"totalConnections": 48,
+	}
+	workerStateInfo := otellogger.WorkerStateInfo{
+		StateTitle: "rw",
+		ShardId:    1,
+		WorkerType: 1,
+		InstanceId: 0,
+	}
+	workersStateData := otellogger.WorkersStateData{
+		WorkerStateInfo: &workerStateInfo,
+		StateData:       stateData,
+	}
+	totalConData := otellogger.GaugeMetricData{
+		WorkerStateInfo: &workerStateInfo,
+		StateData:       38,
+	}
+	otellogger.AddDataPointToOTELStateDataChan(&workersStateData)
+	otellogger.AddDataPointToTotalConnectionsDataChannel(&totalConData)
+
+	
logger.GetLogger().Log(logger.Info, "Data Sent successfully for instrumentation") + time.Sleep(10 * time.Second) + metricsData := mc.metricsStorage.GetMetrics() + logger.GetLogger().Log(logger.Info, "total metrics count is: ", len(metricsData)) + if len(metricsData) < 13 { + t.Fatalf("got %d, wanted %d", len(metricsData), 13) + } +} + +func TestSendingStateLogMetricsConsoleExporter(t *testing.T) { + cont, err := initializeConsoleExporter() + if err != nil { + t.Fail() + } + + err2 := otellogger.StartMetricsCollection(context.Background(), 100, otellogger.WithMetricProvider(otel.GetMeterProvider()), otellogger.WithAppName("occ-testapp2")) + + if err2 != nil { + logger.GetLogger().Log(logger.Alert, "Failed to initialize Metric Collection service") + t.Fail() + } + defer otellogger.StopMetricCollection() + var stateData = map[string]int64{ + "init": 0, + "acpt": 15, + "wait": 10, + "busy": 4, + "idle": 7, + "bklg": 0, + "fnsh": 10, + "totalConnections": 46, + } + + var stateData2 = map[string]int64{ + "init": 3, + "acpt": 15, + "wait": 10, + "busy": 4, + "idle": 8, + "bklg": 0, + "fnsh": 10, + "totalConnections": 50, + } + workerStateInfo1 := otellogger.WorkerStateInfo{ + StateTitle: "rw", + ShardId: 0, + WorkerType: 0, + InstanceId: 0, + } + + workerStateInfo2 := otellogger.WorkerStateInfo{ + StateTitle: "rw", + ShardId: 2, + WorkerType: 0, + InstanceId: 0, + } + workersStateData := otellogger.WorkersStateData{ + WorkerStateInfo: &workerStateInfo1, + StateData: stateData, + } + + workersStateData2 := otellogger.WorkersStateData{ + WorkerStateInfo: &workerStateInfo2, + StateData: stateData2, + } + + totalWorkersStateData := otellogger.GaugeMetricData{ + WorkerStateInfo: &workerStateInfo1, + StateData: 36, + } + + totalWorkersStateData2 := otellogger.GaugeMetricData{ + WorkerStateInfo: &workerStateInfo2, + StateData: 40, + } + + otellogger.AddDataPointToOTELStateDataChan(&workersStateData) + otellogger.AddDataPointToTotalConnectionsDataChannel(&totalWorkersStateData) + time.Sleep(150 * time.Millisecond) + otellogger.AddDataPointToOTELStateDataChan(&workersStateData2) + otellogger.AddDataPointToTotalConnectionsDataChannel(&totalWorkersStateData2) + logger.GetLogger().Log(logger.Info, "Data Sent successfully for instrumentation") + time.Sleep(2 * time.Second) + + var stateData3 = map[string]int64{ + "init": 0, + "acpt": 1, + "wait": 10, + "busy": 4, + "idle": 17, + "bklg": 0, + "fnsh": 10, + "totalConnections": 42, + } + + var stateData4 = map[string]int64{ + "init": 2, + "acpt": 0, + "wait": 10, + "busy": 4, + "idle": 8, + "bklg": 5, + "fnsh": 8, + "totalConnections": 37, + } + + workersStateData3 := otellogger.WorkersStateData{ + WorkerStateInfo: &workerStateInfo1, + StateData: stateData3, + } + + workersStateData4 := otellogger.WorkersStateData{ + WorkerStateInfo: &workerStateInfo2, + StateData: stateData4, + } + totalWorkersStateData3 := otellogger.GaugeMetricData{ + WorkerStateInfo: &workerStateInfo1, + StateData: 38, + } + + totalWorkersStateData4 := otellogger.GaugeMetricData{ + WorkerStateInfo: &workerStateInfo2, + StateData: 29, + } + otellogger.AddDataPointToOTELStateDataChan(&workersStateData3) + otellogger.AddDataPointToTotalConnectionsDataChannel(&totalWorkersStateData3) + time.Sleep(150 * time.Millisecond) + otellogger.AddDataPointToOTELStateDataChan(&workersStateData4) + otellogger.AddDataPointToTotalConnectionsDataChannel(&totalWorkersStateData4) + time.Sleep(2 * time.Second) + if err3 := cont.Shutdown(context.Background()); err3 != nil { + logger.GetLogger().Log(logger.Info, 
"failed to stop the metric controller:", err3) + } +} + +func TestOCCStateLogGeneratorWithRandomValues(t *testing.T) { + cont, err := initializeConsoleExporter() + if err != nil { + t.Fail() + } + defer cont.Shutdown(context.Background()) + + err2 := otellogger.StartMetricsCollection(context.Background(), 1000, otellogger.WithMetricProvider(otel.GetMeterProvider()), otellogger.WithAppName("occ-testapp")) + defer otellogger.StopMetricCollection() + go dataGenerator() + + if err2 != nil { + logger.GetLogger().Log(logger.Alert, "Failed to initialize Metric Collection service") + t.Fatalf("TestOCCStatelogGenerator failed with error %v", err) + } + <-time.After(time.Second * time.Duration(60)) +} + +func dataGenerator() { + ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) + defer cancel() + waitTime := time.Second * 1 + + metricNames := [11]string{"init", "acpt", "wait", "busy", "schd", "fnsh", "quce", "asgn", "idle", "bklg", "strd"} + + timer := time.NewTimer(waitTime) + + defer timer.Stop() + +mainloop: + for { + select { + case <-timer.C: + // Initialize statedata object + workerStateInfo1 := otellogger.WorkerStateInfo{ + StateTitle: "rw", + ShardId: 0, + WorkerType: 0, + InstanceId: 0, + } + workerStatesData := otellogger.WorkersStateData{ + WorkerStateInfo: &workerStateInfo1, + StateData: make(map[string]int64), + } + var numberofMetrics int = 11 + var totalSum int = 100 + var tempSum int = 0 + for index := 0; index < numberofMetrics; index++ { + exactpart := int(totalSum / numberofMetrics) + randVal := rand.Intn(exactpart) + randomValue := int(int(exactpart/2) + randVal) + value := If(tempSum+randomValue > totalSum, totalSum-tempSum, randomValue) + workerStatesData.StateData[metricNames[index]] = int64(value) + tempSum += value + } + workerStatesData.StateData["totalConnections"] = 100 + totalWorkersStateData := otellogger.GaugeMetricData{ + WorkerStateInfo: &workerStateInfo1, + StateData: 100, + } + //Random index + randIndex := rand.Intn(len(metricNames)) + workerStatesData.StateData[metricNames[randIndex]] += int64(totalSum - tempSum) + otellogger.AddDataPointToOTELStateDataChan(&workerStatesData) + otellogger.AddDataPointToTotalConnectionsDataChannel(&totalWorkersStateData) + timer.Reset(waitTime) + case <-ctx.Done(): + logger.GetLogger().Log(logger.Info, "Timedout, so context closed") + break mainloop + } + } +} + +// Go terenary inplementation +func If[T any](cond bool, vtrue, vfalse T) T { + if cond { + return vtrue + } + return vfalse +}