Skip to content

Commit

Permalink
[Enhancement] split Metric "query_error" into there part (#52646)
Browse files Browse the repository at this point in the history
Signed-off-by: before-Sunrise <[email protected]>
(cherry picked from commit 932b28a)
  • Loading branch information
before-Sunrise authored and mergify[bot] committed Nov 11, 2024
1 parent 22d46ad commit 7f9827b
Show file tree
Hide file tree
Showing 10 changed files with 87 additions and 14 deletions.
10 changes: 10 additions & 0 deletions fe/fe-core/src/main/java/com/starrocks/common/ErrorReport.java
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,16 @@ public static void reportUserException(ErrorCode errorCode, Object... objs)
throw new UserException(reportCommon(null, errorCode, objs));
}

public static void reportTimeoutException(ErrorCode errorCode, Object... objs)
throws TimeoutException {
throw new TimeoutException(reportCommon(null, errorCode, objs));
}

public static void reportNoAliveBackendException(ErrorCode errorCode, Object... objs)
throws NoAliveBackendException {
throw new NoAliveBackendException(reportCommon(null, errorCode, objs));
}

public interface DdlExecutor {
void apply() throws UserException;
}
Expand Down
5 changes: 5 additions & 0 deletions fe/fe-core/src/main/java/com/starrocks/common/Status.java
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,11 @@ public void setRpcStatus(String msg) {
this.errorMsg = msg;
}

public void setTimeOutStatus(String msg) {
this.errorCode = TStatusCode.TIMEOUT;
this.errorMsg = msg;
}

public void rewriteErrorMsg() {
if (ok()) {
return;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,9 @@
import com.google.common.base.Strings;

/**
* Thrown for internal server errors.
* Thrown for non-internal server errors.
* such as analyze error/parser error
* which implies this is a user error.so we don't need to pay attention to it
*/
public class UserException extends Exception {
private final InternalErrorCode errorCode;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,10 @@ public class MetricCalculator extends TimerTask {
private long lastQueryCounter = -1;
private long lastRequestCounter = -1;
private long lastQueryErrCounter = -1;
private long lastQueryInternalErrCounter = -1;
private long lastQueryAnalysisErrCounter = -1;
private long lastQueryTimeOutCounter = -1;

private long lastQueryEventTime = -1;

@Override
Expand All @@ -68,6 +72,9 @@ private void update() {
lastQueryCounter = MetricRepo.COUNTER_QUERY_ALL.getValue();
lastRequestCounter = MetricRepo.COUNTER_REQUEST_ALL.getValue();
lastQueryErrCounter = MetricRepo.COUNTER_QUERY_ERR.getValue();
lastQueryInternalErrCounter = MetricRepo.COUNTER_QUERY_INTERNAL_ERR.getValue();
lastQueryAnalysisErrCounter = MetricRepo.COUNTER_QUERY_ANALYSIS_ERR.getValue();
lastQueryTimeOutCounter = MetricRepo.COUNTER_QUERY_TIMEOUT.getValue();
lastQueryEventTime = System.currentTimeMillis() * 1000000;
return;
}
Expand All @@ -92,6 +99,24 @@ private void update() {
MetricRepo.GAUGE_QUERY_ERR_RATE.setValue(errRate < 0 ? 0.0 : errRate);
lastQueryErrCounter = currentErrCounter;

// internal err rate
long currentInternalErrCounter = MetricRepo.COUNTER_QUERY_INTERNAL_ERR.getValue();
double internalErrRate = (double) (currentInternalErrCounter - lastQueryInternalErrCounter) / interval;
MetricRepo.GAUGE_QUERY_INTERNAL_ERR_RATE.setValue(errRate < 0 ? 0.0 : internalErrRate);
lastQueryInternalErrCounter = currentErrCounter;

// analysis error rate
long currentAnalysisErrCounter = MetricRepo.COUNTER_QUERY_ANALYSIS_ERR.getValue();
double analysisErrRate = (double) (currentAnalysisErrCounter - lastQueryAnalysisErrCounter) / interval;
MetricRepo.GAUGE_QUERY_ANALYSIS_ERR_RATE.setValue(errRate < 0 ? 0.0 : analysisErrRate);
lastQueryAnalysisErrCounter = currentErrCounter;

// query timeout rate
long currentTimeoutErrCounter = MetricRepo.COUNTER_QUERY_TIMEOUT.getValue();
double timeoutErrRate = (double) (currentTimeoutErrCounter - lastQueryTimeOutCounter) / interval;
MetricRepo.GAUGE_QUERY_TIMEOUT_RATE.setValue(errRate < 0 ? 0.0 : timeoutErrRate);
lastQueryTimeOutCounter = currentErrCounter;

lastTs = currentTs;

// max tablet compaction score of all backends
Expand Down
20 changes: 20 additions & 0 deletions fe/fe-core/src/main/java/com/starrocks/metric/MetricRepo.java
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,10 @@ public final class MetricRepo {
public static GaugeMetricImpl<Double> GAUGE_QUERY_PER_SECOND;
public static GaugeMetricImpl<Double> GAUGE_REQUEST_PER_SECOND;
public static GaugeMetricImpl<Double> GAUGE_QUERY_ERR_RATE;
public static GaugeMetricImpl<Double> GAUGE_QUERY_INTERNAL_ERR_RATE;
public static GaugeMetricImpl<Double> GAUGE_QUERY_ANALYSIS_ERR_RATE;
public static GaugeMetricImpl<Double> GAUGE_QUERY_TIMEOUT_RATE;

// these query latency is different from HISTO_QUERY_LATENCY, for these only summarize the latest queries, but HISTO_QUERY_LATENCY summarizes all queries.
public static GaugeMetricImpl<Double> GAUGE_QUERY_LATENCY_MEAN;
public static GaugeMetricImpl<Double> GAUGE_QUERY_LATENCY_MEDIAN;
Expand Down Expand Up @@ -332,6 +336,21 @@ public Long getValue() {
GAUGE_QUERY_ERR_RATE.setValue(0.0);
STARROCKS_METRIC_REGISTER.addMetric(GAUGE_QUERY_ERR_RATE);

GAUGE_QUERY_INTERNAL_ERR_RATE =
new GaugeMetricImpl<>("query_internal_err_rate", MetricUnit.NOUNIT, "query internal error rate");
GAUGE_QUERY_INTERNAL_ERR_RATE.setValue(0.0);
STARROCKS_METRIC_REGISTER.addMetric(GAUGE_QUERY_INTERNAL_ERR_RATE);

GAUGE_QUERY_ANALYSIS_ERR_RATE =
new GaugeMetricImpl<>("query_analysis_err_rate", MetricUnit.NOUNIT, "query analysis error rate");
GAUGE_QUERY_ANALYSIS_ERR_RATE.setValue(0.0);
STARROCKS_METRIC_REGISTER.addMetric(GAUGE_QUERY_ANALYSIS_ERR_RATE);

GAUGE_QUERY_TIMEOUT_RATE =
new GaugeMetricImpl<>("query_timeout_rate", MetricUnit.NOUNIT, "query timeout rate");
GAUGE_QUERY_TIMEOUT_RATE.setValue(0.0);
STARROCKS_METRIC_REGISTER.addMetric(GAUGE_QUERY_TIMEOUT_RATE);

GAUGE_MAX_TABLET_COMPACTION_SCORE = new GaugeMetricImpl<>("max_tablet_compaction_score",
MetricUnit.NOUNIT, "max tablet compaction score of all backends");
GAUGE_MAX_TABLET_COMPACTION_SCORE.setValue(0L);
Expand Down Expand Up @@ -454,6 +473,7 @@ public Long getValue() {
COUNTER_QUERY_ANALYSIS_ERR = new LongCounterMetric("query_analysis_err", MetricUnit.REQUESTS,
"total analysis error query");
STARROCKS_METRIC_REGISTER.addMetric(COUNTER_QUERY_ANALYSIS_ERR);

COUNTER_QUERY_INTERNAL_ERR = new LongCounterMetric("query_internal_err", MetricUnit.REQUESTS,
"total internal error query");
STARROCKS_METRIC_REGISTER.addMetric(COUNTER_QUERY_INTERNAL_ERR);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -217,6 +217,8 @@ public void auditAfterExec(String origStmt, StatementBase parsedStmt, PQueryStat
//represent analysis err
if (ctx.getState().getErrType() == QueryState.ErrType.ANALYSIS_ERR) {
MetricRepo.COUNTER_QUERY_ANALYSIS_ERR.increase(1L);
} else if (ctx.getState().getErrType() == QueryState.ErrType.EXEC_TIME_OUT) {
MetricRepo.COUNTER_QUERY_TIMEOUT.increase(1L);
} else {
MetricRepo.COUNTER_QUERY_INTERNAL_ERR.increase(1L);
}
Expand Down Expand Up @@ -299,6 +301,8 @@ protected void handleQuery() {
.setCatalog(ctx.getCurrentCatalog())
.setWarehouse(ctx.getCurrentWarehouseName());
Tracers.register(ctx);
// set isQuery before `forwardToLeader` to make it right for audit log.
ctx.getState().setIsQuery(true);

// execute this query.
StatementBase parsedStmt = null;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,8 @@
import com.starrocks.catalog.ResourceGroup;
import com.starrocks.common.AnalysisException;
import com.starrocks.common.Config;
import com.starrocks.common.ErrorCode;
import com.starrocks.common.ErrorReport;
import com.starrocks.common.FeConstants;
import com.starrocks.common.InternalErrorCode;
import com.starrocks.common.Status;
Expand Down Expand Up @@ -842,6 +844,8 @@ public RowBatch getNext() throws Exception {
if (copyStatus.isCancelled() &&
copyStatus.getErrorMsg().equals(FeConstants.BACKEND_NODE_NOT_FOUND_ERROR)) {
ec = InternalErrorCode.CANCEL_NODE_NOT_ALIVE_ERR;
} else if (copyStatus.isTimeout()) {
ErrorReport.reportTimeoutException(ErrorCode.ERR_QUERY_TIMEOUT, errMsg);
}
throw new UserException(ec, errMsg);
}
Expand Down
3 changes: 3 additions & 0 deletions fe/fe-core/src/main/java/com/starrocks/qe/QueryState.java
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,9 @@ public enum ErrType {

IO_ERR,

// execution Timeout
EXEC_TIME_OUT,

UNKNOWN
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,6 @@

import com.starrocks.common.Status;
import com.starrocks.common.util.DebugUtil;
import com.starrocks.metric.MetricRepo;
import com.starrocks.proto.PFetchDataResult;
import com.starrocks.proto.PUniqueId;
import com.starrocks.rpc.BackendServiceClient;
Expand Down Expand Up @@ -148,11 +147,8 @@ public RowBatch getNext(Status status) throws TException {
}
} catch (TimeoutException e) {
LOG.warn("fetch result timeout, finstId={}", DebugUtil.printId(finstId), e);
status.setInternalErrorStatus(String.format("Query exceeded time limit of %d seconds",
status.setTimeOutStatus(String.format("Query exceeded time limit of %d seconds",
ConnectContext.get().getSessionVariable().getQueryTimeoutS()));
if (MetricRepo.hasInit) {
MetricRepo.COUNTER_QUERY_TIMEOUT.increase(1L);
}
}

if (isCancel) {
Expand Down
20 changes: 12 additions & 8 deletions fe/fe-core/src/main/java/com/starrocks/qe/StmtExecutor.java
Original file line number Diff line number Diff line change
Expand Up @@ -68,9 +68,11 @@
import com.starrocks.common.ErrorReport;
import com.starrocks.common.FeConstants;
import com.starrocks.common.MetaNotFoundException;
import com.starrocks.common.NoAliveBackendException;
import com.starrocks.common.Pair;
import com.starrocks.common.QueryDumpLog;
import com.starrocks.common.Status;
import com.starrocks.common.TimeoutException;
import com.starrocks.common.UserException;
import com.starrocks.common.Version;
import com.starrocks.common.profile.Timer;
Expand Down Expand Up @@ -477,10 +479,6 @@ public void execute() throws Exception {
}

try {
boolean isQuery = parsedStmt instanceof QueryStatement;
// set isQuery before `forwardToLeader` to make it right for audit log.
context.getState().setIsQuery(isQuery);

if (parsedStmt.isExistQueryScopeHint()) {
processQueryScopeHint();
}
Expand Down Expand Up @@ -576,7 +574,7 @@ public void execute() throws Exception {

// For follower: verify sql in BlackList before forward to leader
// For leader: if this is a proxy sql, no need to verify sql in BlackList because every fe has its own blacklist
if ((isQuery || parsedStmt instanceof InsertStmt)
if ((parsedStmt instanceof QueryStatement || parsedStmt instanceof InsertStmt)
&& Config.enable_sql_blacklist && !parsedStmt.isExplain() && !isProxy) {
OriginStatement origStmt = parsedStmt.getOrigStmt();
if (origStmt != null) {
Expand Down Expand Up @@ -750,7 +748,13 @@ public void execute() throws Exception {
if (parsedStmt instanceof KillStmt) {
// ignore kill stmt execute err(not monitor it)
context.getState().setErrType(QueryState.ErrType.IGNORE_ERR);
} else if (e instanceof TimeoutException) {
context.getState().setErrType(QueryState.ErrType.EXEC_TIME_OUT);
} else if (e instanceof NoAliveBackendException) {
context.getState().setErrType(QueryState.ErrType.INTERNAL_ERR);
} else {
// TODO: some UserException doesn't belong to analysis error
// we should set such error type to internal error
context.getState().setErrType(QueryState.ErrType.ANALYSIS_ERR);
}
} catch (Throwable e) {
Expand Down Expand Up @@ -2177,17 +2181,17 @@ public void handleDMLStmt(ExecPlan execPlan, DmlStmt stmt) throws Exception {
}

coord.cancel(ErrorCode.ERR_QUERY_EXCEPTION.formatErrorMsg());
ErrorReport.reportDdlException(ErrorCode.ERR_QUERY_EXCEPTION);
ErrorReport.reportNoAliveBackendException(ErrorCode.ERR_QUERY_EXCEPTION);
} else {
coord.cancel(ErrorCode.ERR_QUERY_TIMEOUT.formatErrorMsg());
if (coord.isThriftServerHighLoad()) {
ErrorReport.reportDdlException(ErrorCode.ERR_QUERY_TIMEOUT,
ErrorReport.reportTimeoutException(ErrorCode.ERR_QUERY_TIMEOUT,
"Please check the thrift-server-pool metrics, " +
"if the pool size reaches thrift_server_max_worker_threads(default is 4096), " +
"you can set the config to a higher value in fe.conf, " +
"or set parallel_fragment_exec_instance_num to a lower value in session variable");
} else {
ErrorReport.reportDdlException(ErrorCode.ERR_QUERY_TIMEOUT,
ErrorReport.reportTimeoutException(ErrorCode.ERR_QUERY_TIMEOUT,
"Increase the query_timeout session variable and retry");
}
}
Expand Down

0 comments on commit 7f9827b

Please sign in to comment.