diff --git a/lib/config.go b/lib/config.go index 49fc837d..20ae3511 100644 --- a/lib/config.go +++ b/lib/config.go @@ -31,6 +31,11 @@ import ( "sync/atomic" ) +const ( + mux_config_cal_name = "OCC_CONFIG" + oracle_worker_config_cal_name = "OCC_ORACLE_WORKER_CONFIG" +) + // The Config contains all the static configuration type Config struct { CertChainFile string @@ -181,6 +186,14 @@ type Config struct { // Max desired percentage of healthy workers for the worker pool MaxDesiredHealthyWorkerPct int + + // Oracle Worker Configs + EnableCache bool + EnableHeartBeat bool + EnableQueryReplaceNL bool + EnableBindHashLogging bool + EnableSessionVariables bool + UseNonBlocking bool } // The OpsConfig contains the configuration that can be modified during run time @@ -232,7 +245,6 @@ func InitConfig(poolName string) error { } else { currentDir = currentDir + "/" } - filename := currentDir + "hera.txt" cdb, err := config.NewTxtConfig(filename) @@ -310,6 +322,7 @@ func InitConfig(poolName string) error { } gAppConfig.EnableSharding = cdb.GetOrDefaultBool("enable_sharding", false) + gAppConfig.UseShardMap = cdb.GetOrDefaultBool("use_shardmap", true) gAppConfig.NumOfShards = cdb.GetOrDefaultInt("num_shards", 1) if gAppConfig.EnableSharding == false || gAppConfig.UseShardMap == false { @@ -368,6 +381,14 @@ func InitConfig(poolName string) error { // TODO: gAppConfig.NumStdbyDbs = 1 + // Fetch Oracle worker configurations.. The defaults must be same between oracle worker and here for accurate logging. + gAppConfig.EnableCache = cdb.GetOrDefaultBool("enable_cache", false) + gAppConfig.EnableHeartBeat = cdb.GetOrDefaultBool("enable_heart_beat", false) + gAppConfig.EnableQueryReplaceNL = cdb.GetOrDefaultBool("enable_query_replace_nl", true) + gAppConfig.EnableBindHashLogging = cdb.GetOrDefaultBool("enable_bind_hash_logging", false) + gAppConfig.EnableSessionVariables = cdb.GetOrDefaultBool("enable_session_variables", false) + gAppConfig.UseNonBlocking = cdb.GetOrDefaultBool("use_non_blocking", false) + var numWorkers int numWorkers = 6 //err = config.InitOpsConfigWithName("../opscfg/hera.txt") @@ -544,17 +565,15 @@ func LogOccConfigs() { "hostname_prefix": gAppConfig.HostnamePrefix, "sharding_cross_keys_err": gAppConfig.ShardingCrossKeysErr, //"enable_sql_rewrite", // not found anywhere? - "sharding_algo": gAppConfig.ShardingAlgoHash, - "cfg_from_tns_override_num_shards": gAppConfig.CfgFromTnsOverrideNumShards, + "sharding_algo": gAppConfig.ShardingAlgoHash, }, "TAF": { - "enable_taf": gAppConfig.EnableTAF, - "cfg_from_tns_override_taf": gAppConfig.CfgFromTnsOverrideTaf, - "testing_enable_dml_taf": gAppConfig.TestingEnableDMLTaf, - "taf_timeout_ms": gAppConfig.TAFTimeoutMs, - "taf_bin_duration": gAppConfig.TAFBinDuration, - "taf_allow_slow_every_x": gAppConfig.TAFAllowSlowEveryX, - "taf_normally_slow_count": gAppConfig.TAFNormallySlowCount, + "enable_taf": gAppConfig.EnableTAF, + "testing_enable_dml_taf": gAppConfig.TestingEnableDMLTaf, + "taf_timeout_ms": gAppConfig.TAFTimeoutMs, + "taf_bin_duration": gAppConfig.TAFBinDuration, + "taf_allow_slow_every_x": gAppConfig.TAFAllowSlowEveryX, + "taf_normally_slow_count": gAppConfig.TAFNormallySlowCount, }, "BIND-EVICTION": { "child.executable": gAppConfig.ChildExecutable, @@ -595,24 +614,42 @@ func LogOccConfigs() { "max_desire_healthy_worker_pct": gAppConfig.MaxDesiredHealthyWorkerPct, }, "R-W-SPLIT": { - "readonly_children_pct": gAppConfig.ReadonlyPct, - "cfg_from_tns_override_rw_split": gAppConfig.CfgFromTnsOverrideRWSplit, + "readonly_children_pct": gAppConfig.ReadonlyPct, }, "RAC": { "management_table_prefix": gAppConfig.ManagementTablePrefix, "rac_sql_interval": gAppConfig.RacMaintReloadInterval, "rac_restart_window": gAppConfig.RacRestartWindow, }, - "NO-CATEGORY": { + "GENERAL-CONFIGURATIONS": { "database_type": gAppConfig.DatabaseType, // Oracle = 0; MySQL=1; POSTGRES=2 - "cfg_from_tns": gAppConfig.CfgFromTns, "log_level": gOpsConfig.logLevel, "high_load_pct": gAppConfig.HighLoadPct, "init_limit_pct": gAppConfig.InitLimitPct, "num_standby_dbs": gAppConfig.NumStdbyDbs, }, + "ENABLE_CFG_FROM_TNS": { + "cfg_from_tns": gAppConfig.CfgFromTns, + "cfg_from_tns_override_num_shards": gAppConfig.CfgFromTnsOverrideNumShards, + "cfg_from_tns_override_taf": gAppConfig.CfgFromTnsOverrideTaf, + "cfg_from_tns_override_rw_split": gAppConfig.CfgFromTnsOverrideRWSplit, + }, + "STATEMENT-CACHE": { + "enable_cache": gAppConfig.EnableCache, + "enable_heart_beat": gAppConfig.EnableHeartBeat, + "enable_query_replace_nl": gAppConfig.EnableQueryReplaceNL, + }, + "SESSION-VARIABLES": { + "enable_session_variables": gAppConfig.EnableSessionVariables, + }, + "BIND-HASH-LOGGING": { + "enable_bind_hash_logging": gAppConfig.EnableBindHashLogging, + }, + "KEEP-ALIVE": { + "use_non_blocking": gAppConfig.UseNonBlocking, + }, } - + calName := mux_config_cal_name for feature, configs := range whiteListConfigs { switch feature { case "BACKLOG": @@ -643,17 +680,45 @@ func LogOccConfigs() { if gAppConfig.ReadonlyPct == 0 { continue } - case "SOFT-EVICTION", "BIND-EVICTION": + case "SATURATION-RECOVERY", "BIND-EVICTION": if GetSatRecoverThrottleRate() <= 0 { continue } + case "SOFT-EVICTION": + if GetSatRecoverThrottleRate() <= 0 && gAppConfig.SoftEvictionProbability <= 0 { + continue + } case "MANUAL-RATE-LIMITER": if !gAppConfig.EnableQueryBindBlocker { continue } + case "ENABLE_CFG_FROM_TNS": + if !gAppConfig.CfgFromTns { + continue + } + case "STATEMENT-CACHE": + if !gAppConfig.EnableCache { + continue + } + calName = oracle_worker_config_cal_name + case "SESSION-VARIABLES": + if !gAppConfig.EnableSessionVariables { + continue + } + calName = oracle_worker_config_cal_name + case "BIND-HASH-LOGGING": + if !gAppConfig.EnableBindHashLogging { + continue + } + calName = oracle_worker_config_cal_name + case "KEEP-ALIVE": + if !gAppConfig.UseNonBlocking { + continue + } + calName = oracle_worker_config_cal_name } - evt := cal.NewCalEvent("OCC_CONFIG", fmt.Sprintf(feature), cal.TransOK, "") + evt := cal.NewCalEvent(calName, fmt.Sprintf(feature), cal.TransOK, "") for cfg, val := range configs { s := fmt.Sprintf("%v", val) evt.AddDataStr(cfg, s) diff --git a/lib/coordinatortaf.go b/lib/coordinatortaf.go index e2ae8243..e7d4fa07 100644 --- a/lib/coordinatortaf.go +++ b/lib/coordinatortaf.go @@ -68,7 +68,7 @@ func (p *tafResponsePreproc) Write(bf []byte) (int, error) { } ora, sz := atoi(ns.Payload) switch ora { - case 3113, 3114, 3135, 12514, 3128, 3127, 3123, 3111, 3106, 1012, 28, 31, 51, 25400, 25401, 25402, 25403, 25404, 25405, 25407, 25408, 25409, 25425, 24343, 1041, 600, 700, 7445: + case 3113, 3114, 3135, 12514, 3128, 3127, 3123, 3111, 3106, 1012, 28, 31, 51, 25400, 25401, 25402, 25403, 25404, 25405, 25407, 25408, 25409, 25425, 24343, 1041, 600, 700, 7445, 4025: //for testing 962=: case 942: p.ok = false p.ora = string(ns.Payload[:sz]) diff --git a/lib/racmaint.go b/lib/racmaint.go index 27c3bbea..0221db44 100644 --- a/lib/racmaint.go +++ b/lib/racmaint.go @@ -61,12 +61,14 @@ func InitRacMaint(cmdLineModuleName string) { interval := GetConfig().RacMaintReloadInterval if interval > 0 { for i := 0; i < GetConfig().NumOfShards; i++ { - go racMaintMain(i, interval, cmdLineModuleName) + shardIndex := i //Address the behavior called variable capture. + go racMaintMain(shardIndex, interval, cmdLineModuleName) } } } // racMaintMain wakes up every n seconds (configured in "rac_sql_interval") and reads the table +// // [ManagementTablePrefix]_maint table to see if maintenance is requested func racMaintMain(shard int, interval int, cmdLineModuleName string) { if logger.GetLogger().V(logger.Debug) { @@ -109,8 +111,8 @@ func racMaintMain(shard int, interval int, cmdLineModuleName string) { } /* - racMaint is the main function for RAC maintenance processing, being called regularly. - When maintenance is planned, it calls workerpool.RacMaint to start the actuall processing +racMaint is the main function for RAC maintenance processing, being called regularly. +When maintenance is planned, it calls workerpool.RacMaint to start the actuall processing */ func racMaint(ctx context.Context, shard int, db *sql.DB, racSQL string, cmdLineModuleName string, prev map[racCfgKey]racCfg) { // @@ -220,12 +222,12 @@ func racMaint(ctx context.Context, shard int, db *sql.DB, racSQL string, cmdLine workerpool, err = GetWorkerBrokerInstance().GetWorkerPool(wtypeRW, 0, shard) } if err == nil { - go workerpool.RacMaint(racReq) + workerpool.RacMaint(racReq) } if GetConfig().ReadonlyPct > 0 { - workerpool, err := GetWorkerBrokerInstance().GetWorkerPool(wtypeRO, 0, shard) + workerpool, err = GetWorkerBrokerInstance().GetWorkerPool(wtypeRO, 0, shard) if err == nil { - go workerpool.RacMaint(racReq) + workerpool.RacMaint(racReq) } } prev[cfgKey] = row diff --git a/lib/workerpool.go b/lib/workerpool.go index d5450ca2..50aab16f 100644 --- a/lib/workerpool.go +++ b/lib/workerpool.go @@ -119,7 +119,7 @@ func (pool *WorkerPool) spawnWorker(wid int) error { worker.setState(wsSchd) millis := rand.Intn(GetConfig().RandomStartMs) if logger.GetLogger().V(logger.Alert) { - logger.GetLogger().Log(logger.Alert, wid, "randomized start ms",millis) + logger.GetLogger().Log(logger.Alert, wid, "randomized start ms", millis) } time.Sleep(time.Millisecond * time.Duration(millis)) @@ -131,7 +131,7 @@ func (pool *WorkerPool) spawnWorker(wid int) error { } millis := rand.Intn(3000) if logger.GetLogger().V(logger.Alert) { - logger.GetLogger().Log(logger.Alert, initCnt, "is too many in init state. waiting to start",wid) + logger.GetLogger().Log(logger.Alert, initCnt, "is too many in init state. waiting to start", wid) } time.Sleep(time.Millisecond * time.Duration(millis)) } @@ -233,8 +233,10 @@ func (pool *WorkerPool) WorkerReady(worker *WorkerClient) (err error) { // GetWorker gets the active worker if available. backlog with timeout if not. // // @param sqlhash to check for soft eviction against a blacklist of slow queries. -// if getworker needs to exam the incoming sql, there does not seem to be another elegant -// way to do this except to pass in the sqlhash as a parameter. +// +// if getworker needs to exam the incoming sql, there does not seem to be another elegant +// way to do this except to pass in the sqlhash as a parameter. +// // @param timeoutMs[0] timeout in milliseconds. default to adaptive queue timeout. func (pool *WorkerPool) GetWorker(sqlhash int32, timeoutMs ...int) (worker *WorkerClient, t string, err error) { if logger.GetLogger().V(logger.Debug) { @@ -559,10 +561,10 @@ func (pool *WorkerPool) ReturnWorker(worker *WorkerClient, ticket string) (err e } if skipRecycle { if logger.GetLogger().V(logger.Alert) { - logger.GetLogger().Log(logger.Alert, "Non Healthy Worker found in pool, module_name=",pool.moduleName,"shard_id=",pool.ShardID, "HEALTHY worker Count=",pool.GetHealthyWorkersCount(),"TotalWorkers:=", pool.desiredSize) + logger.GetLogger().Log(logger.Alert, "Non Healthy Worker found in pool, module_name=", pool.moduleName, "shard_id=", pool.ShardID, "HEALTHY worker Count=", pool.GetHealthyWorkersCount(), "TotalWorkers:=", pool.desiredSize) } calMsg := fmt.Sprintf("Recycle(worker_pid)=%d, module_name=%s,shard_id=%d", worker.pid, worker.moduleName, worker.shardID) - evt := cal.NewCalEvent("SKIP_RECYCLE_WORKER","ReturnWorker", cal.TransOK, calMsg) + evt := cal.NewCalEvent("SKIP_RECYCLE_WORKER", "ReturnWorker", cal.TransOK, calMsg) evt.Completed() } @@ -697,8 +699,6 @@ func (pool *WorkerPool) RacMaint(racReq racAct) { } now := time.Now().Unix() window := GetConfig().RacRestartWindow - dbUname := "" - cnt := 0 pool.poolCond.L.Lock() for i := 0; i < pool.currentSize; i++ { if (pool.workers[i] != nil) && (racReq.instID == 0 || pool.workers[i].racID == racReq.instID) && (pool.workers[i].startTime < int64(racReq.tm)) { @@ -716,23 +716,23 @@ func (pool *WorkerPool) RacMaint(racReq racAct) { } if logger.GetLogger().V(logger.Verbose) { - logger.GetLogger().Log(logger.Verbose, "Rac maint activating, worker", i, pool.workers[i].pid, "exittime=", pool.workers[i].exitTime, now, window, pool.currentSize) - } - cnt++ - if len(dbUname) == 0 { - dbUname = pool.workers[i].dbUname + logger.GetLogger().Log(logger.Verbose, "Rac maint activating, worker", i, pool.workers[i].pid, "exittime=", pool.workers[i].exitTime, now, window, pool.currentSize, "rac.req timestamp=", racReq.tm) } + //Trigger individual event for worker + evt := cal.NewCalEvent("RAC_ID", fmt.Sprintf("%d", racReq.instID), cal.TransOK, "") + evt.AddDataStr("poolModName", pool.moduleName) + evt.AddDataInt("workerId", int64(i)) + evt.AddDataInt("pid", int64(pool.workers[i].pid)) + evt.AddDataInt("shardId", int64(pool.ShardID)) + evt.AddDataInt("tm", int64(racReq.tm)) + evt.AddDataInt("exitTime", pool.workers[i].exitTime) + evt.AddDataStr("exitInSec", fmt.Sprintf("%dsec", pool.workers[i].exitTime-now)) + evt.Completed() + evt = cal.NewCalEvent("DB_UNAME", pool.workers[i].dbUname, cal.TransOK, "") + evt.Completed() } } pool.poolCond.L.Unlock() - // TODO: C++ worker logs one event for each worker, in the worker, so - // we keep the same. Think about changing it - for i := 0; i < cnt; i++ { - evt := cal.NewCalEvent("RAC_ID", fmt.Sprintf("%d", racReq.instID), cal.TransOK, "") - evt.Completed() - evt = cal.NewCalEvent("DB_UNAME", dbUname, cal.TransOK, "") - evt.Completed() - } } // checkWorkerLifespan is called periodically to check if any worker lifetime has expired and terminates it @@ -768,12 +768,12 @@ func (pool *WorkerPool) checkWorkerLifespan() { pool.poolCond.L.Lock() for i := 0; i < pool.currentSize; i++ { if (pool.workers[i] != nil) && (pool.workers[i].exitTime != 0) && (pool.workers[i].exitTime <= now) { - if pool.GetHealthyWorkersCount() < (int32(pool.desiredSize*GetConfig().MaxDesiredHealthyWorkerPct/100)) { // Should it be a config value + if pool.GetHealthyWorkersCount() < (int32(pool.desiredSize * GetConfig().MaxDesiredHealthyWorkerPct / 100)) { // Should it be a config value if logger.GetLogger().V(logger.Alert) { - logger.GetLogger().Log(logger.Alert, "Non Healthy Worker found in pool, module_name=",pool.moduleName,"shard_id=",pool.ShardID, "HEALTHY worker Count=",pool.GetHealthyWorkersCount(),"TotalWorkers:", pool.desiredSize) + logger.GetLogger().Log(logger.Alert, "Non Healthy Worker found in pool, module_name=", pool.moduleName, "shard_id=", pool.ShardID, "HEALTHY worker Count=", pool.GetHealthyWorkersCount(), "TotalWorkers:", pool.desiredSize) } calMsg := fmt.Sprintf("module_name=%s,shard_id=%d", pool.moduleName, pool.ShardID) - evt := cal.NewCalEvent("SKIP_RECYCLE_WORKER","checkWorkerLifespan", cal.TransOK, calMsg) + evt := cal.NewCalEvent("SKIP_RECYCLE_WORKER", "checkWorkerLifespan", cal.TransOK, calMsg) evt.Completed() break } @@ -814,7 +814,7 @@ func (pool *WorkerPool) checkWorkerLifespan() { pool.poolCond.L.Unlock() for _, w := range workers { if logger.GetLogger().V(logger.Info) { - logger.GetLogger().Log(logger.Info, "checkworkerlifespan - Lifespan exceeded, terminate worker: pid =", w.pid, ", pool_type =", w.Type, ", inst =", w.instID ,"HEALTHY worker Count=",pool.GetHealthyWorkersCount(),"TotalWorkers:", pool.desiredSize) + logger.GetLogger().Log(logger.Info, "checkworkerlifespan - Lifespan exceeded, terminate worker: pid =", w.pid, ", pool_type =", w.Type, ", inst =", w.instID, "HEALTHY worker Count=", pool.GetHealthyWorkersCount(), "TotalWorkers:", pool.desiredSize) } w.Terminate() } diff --git a/tests/unittest/rac_maint_async/main_test.go b/tests/unittest/rac_maint_async/main_test.go new file mode 100644 index 00000000..e88e7d48 --- /dev/null +++ b/tests/unittest/rac_maint_async/main_test.go @@ -0,0 +1,122 @@ +package main + +import ( + "context" + "database/sql" + "fmt" + "math/rand" + "os" + "testing" + "time" + + "github.com/paypal/hera/tests/unittest/testutil" + "github.com/paypal/hera/utility/logger" +) + +var mx testutil.Mux +var tableName string + +func cfg() (map[string]string, map[string]string, testutil.WorkerType) { + + appcfg := make(map[string]string) + // best to chose an "unique" port in case golang runs tests in paralel + appcfg["bind_port"] = "31002" + appcfg["log_level"] = "5" + appcfg["log_file"] = "hera.log" + appcfg["sharding_cfg_reload_interval"] = "0" + appcfg["rac_sql_interval"] = "1" + + opscfg := make(map[string]string) + opscfg["opscfg.default.server.max_connections"] = "10" + opscfg["opscfg.default.server.log_level"] = "5" + + //return appcfg, opscfg, testutil.OracleWorker + return appcfg, opscfg, testutil.MySQLWorker +} + +func before() error { + os.Setenv("PARALLEL", "1") + pfx := os.Getenv("MGMT_TABLE_PREFIX") + if pfx == "" { + pfx = "hera" + } + tableName = pfx + "_maint" + return nil +} + +func TestMain(m *testing.M) { + os.Exit(testutil.UtilMain(m, cfg, before)) +} + +func TestRacMaintWithRandomStatusChangeInAsync(t *testing.T) { + logger.GetLogger().Log(logger.Debug, "TestRacMaintWithRandomStatusChangeInAsync begin +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n") + shard := 0 + db, err := sql.Open("heraloop", fmt.Sprintf("%d:0:0", shard)) + if err != nil { + t.Fatal("Error starting Mux:", err) + return + } + statusArray := []string{"U", "R", "F"} + time.Sleep(5 * time.Second) + + go func() { + ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second) + defer cancel() + for { + status1 := rand.Intn(len(statusArray)) + status2 := rand.Intn(len(statusArray)) + var err error + var conn *sql.Conn + // cleanup and insert one row in the table + conn, err = db.Conn(ctx) + if err != nil { + t.Fatalf("Error getting connection %s\n", err.Error()) + } + tx, _ := conn.BeginTx(ctx, nil) + stmt, _ := tx.PrepareContext(ctx, "/*cmd*/delete from "+tableName) + _, err = stmt.Exec() + if err != nil { + t.Fatalf("Error preparing test (delete table) %s\n", err.Error()) + } + stmt, _ = tx.PrepareContext(ctx, "/*cmd*/insert into "+tableName+" (inst_id, status, status_time, module, machine) values (?,?,?,?,?)") + hostname, _ := os.Hostname() + // how to do inst_id + _, err = stmt.Exec(0 /*max instid*/, statusArray[status1], time.Now().Unix()+2, "hera-test", hostname) + _, err = stmt.Exec(0, statusArray[status2], time.Now().Unix()+2, "hera-test_taf", hostname) + if err != nil { + t.Fatalf("Error preparing test (create row in table) %s\n", err.Error()) + } + err = tx.Commit() + if err != nil { + t.Fatalf("Error commit %s\n", err.Error()) + } + conn.Close() + time.Sleep(1000 * time.Millisecond) + } + }() + if err != nil { + t.Fatal("Error starting Mux:", err) + return + } + db.SetMaxIdleConns(0) + defer db.Close() + + time.Sleep(45000 * time.Millisecond) + + if 0 == testutil.RegexCountFile("Rac maint activating, worker", "hera.log") { + t.Fatalf("requires rac maint activation for main module status") + } + + if 0 == testutil.RegexCountFile("module:HERA-TEST_TAF", "cal.log") { + t.Fatalf("Status 'U' should log the RACMAINT_INFO_CHANGE event") + } + if 0 != testutil.RegexCountFile("invalid_status", "cal.log") { + t.Fatalf("ram maint status 'U' should not skip with invalid-status event") + } + + if testutil.RegexCountFile("RAC_ID", "cal.log") < 20 { + t.Fatalf("ram maint should trigger for all workers once.") + } + + logger.GetLogger().Log(logger.Debug, "TestRacMaintWithRandomStatusChangeInAsync done -------------------------------------------------------------") +}