Skip to content

Commit

Permalink
AgentJanitor Rework (#1217)
Browse files Browse the repository at this point in the history
## Problems
1. AgentJanitor cleans up based on the agents_and_hosts table, but it's not complete. There are hosts that' never had an agent. So it should look at the hosts table as well. 
2. The order of processing was wrong. Stale hosts should be a subset of unreachable hosts. The current implementation processes unreachable hosts first then stale hosts. This can result in stale hosts being processed twice. 

## Improvements
Now the `AgentJanitor` does 3 things:
1. `processStaleHosts();`
2. `determineStaleHostCandidates();`
3. `cleanUpAgentlessHosts();`

The first 2 remain the same, except the order has been swapped. This resolves problem 2. The third task solves problem 1.

###  cleanUpAgentlessHosts
This method queries the DB via the new API `HostDAO:getStaleAgentlessHostIds` to get a list of host IDs without associated agents. The list is filtered by `last_update` so only stale hosts will be process. 

```SQL
SELECT DISTINCT
    hosts.host_id
FROM
    hosts
    LEFT JOIN hosts_and_agents ON hosts.host_id = hosts_and_agents.host_id
WHERE
    hosts.last_update < ?
    AND hosts_and_agents.host_id IS NULL
ORDER BY
    hosts.last_update DESC
LIMIT
    ?
```

## Tests and validations
Deployed to dev1 and manually validated the status by reviewing corresponding logs. 

CDP-6636
  • Loading branch information
tylerwowen authored Jul 27, 2023
1 parent 440f8c8 commit b2bd6b8
Show file tree
Hide file tree
Showing 8 changed files with 252 additions and 142 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,9 @@
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*
* http://www.apache.org/licenses/LICENSE-2.0
*
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
Expand All @@ -15,11 +15,10 @@
*/
package com.pinterest.deployservice.dao;

import com.pinterest.deployservice.bean.HostAgentBean;

import java.util.Collection;
import java.sql.SQLException;
import java.util.List;
import java.util.Set;

import com.pinterest.deployservice.bean.HostAgentBean;

/**
* A collection of methods to help hosts and groups mapping
Expand All @@ -35,9 +34,11 @@ public interface HostAgentDAO {

HostAgentBean getHostById(String hostId) throws Exception;

List<HostAgentBean> getStaleHosts(long after) throws Exception;
List<HostAgentBean> getStaleHosts(long lastUpdateBefore) throws SQLException;

List<HostAgentBean> getStaleHosts(long lastUpdateAfter, long lastUpdateBefore) throws SQLException;

List<HostAgentBean> getStaleEnvHosts(long after) throws Exception;
List<HostAgentBean> getStaleEnvHosts(long lastUpdateBefore) throws Exception;

List<HostAgentBean> getHostsByAgent(String agentVersion, long pageIndex, int pageSize) throws Exception;
}
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,9 @@
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*
* http://www.apache.org/licenses/LICENSE-2.0
*
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
Expand All @@ -17,6 +17,7 @@

import com.pinterest.deployservice.bean.HostBean;

import java.sql.SQLException;
import java.util.Collection;
import java.util.List;
import java.util.Set;
Expand Down Expand Up @@ -55,6 +56,8 @@ public interface HostDAO {

List<HostBean> getTerminatingHosts() throws Exception;

List<String> getStaleAgentlessHostIds(long lastUpdateBefore, int limit) throws SQLException;

Collection<HostBean> getHostsByEnvId(String envId) throws Exception;

HostBean getByEnvIdAndHostId(String envId, String hostId) throws Exception;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,9 @@
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*
* http://www.apache.org/licenses/LICENSE-2.0
*
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
Expand All @@ -15,28 +15,27 @@
*/
package com.pinterest.deployservice.db;

import com.pinterest.deployservice.bean.HostAgentBean;
import com.pinterest.deployservice.bean.HostState;
import com.pinterest.deployservice.bean.SetClause;
import com.pinterest.deployservice.dao.HostAgentDAO;
import java.sql.SQLException;
import java.util.List;

import org.apache.commons.dbcp.BasicDataSource;
import org.apache.commons.dbutils.QueryRunner;
import org.apache.commons.dbutils.ResultSetHandler;
import org.apache.commons.dbutils.handlers.BeanHandler;
import org.apache.commons.dbutils.handlers.BeanListHandler;

import java.util.Collection;
import java.util.List;
import java.util.Set;
import com.pinterest.deployservice.bean.HostAgentBean;
import com.pinterest.deployservice.bean.SetClause;
import com.pinterest.deployservice.dao.HostAgentDAO;

public class DBHostAgentDAOImpl implements HostAgentDAO {
private static final String INSERT_HOST_TEMPLATE = "INSERT INTO hosts_and_agents SET %s ON DUPLICATE KEY UPDATE %s";
private static final String UPDATE_HOST_BY_ID = "UPDATE hosts_and_agents SET %s WHERE host_id=?";
private static final String DELETE_HOST_BY_ID = "DELETE FROM hosts_and_agents WHERE host_id=?";
private static final String GET_HOST_BY_NAME = "SELECT * FROM hosts_and_agents WHERE host_name=?";
private static final String GET_HOST_BY_HOSTID = "SELECT * FROM hosts_and_agents WHERE host_id=?";
private static final String GET_STALE_HOST = "SELECT DISTINCT hosts_and_agents.* FROM hosts_and_agents WHERE hosts_and_agents.last_update<?";
private static final String GET_HOSTS_BY_LAST_UPDATE = "SELECT DISTINCT * FROM hosts_and_agents WHERE last_update<?";
private static final String GET_HOSTS_BY_LAST_UPDATES = "SELECT DISTINCT * FROM hosts_and_agents WHERE last_update>? AND last_update<?";
private static final String GET_STALE_ENV_HOST = "SELECT DISTINCT hosts_and_agents.* FROM hosts_and_agents INNER JOIN hosts_and_envs ON hosts_and_agents.host_name=hosts_and_envs.host_name WHERE hosts_and_agents.last_update<?";
private static final String GET_HOSTS_BY_AGENT = "SELECT * FROM hosts_statuses WHERE agent_version=? ORDER BY host_id LIMIT ?,?";

Expand Down Expand Up @@ -78,9 +77,15 @@ public HostAgentBean getHostById(String hostId) throws Exception {
}

@Override
public List<HostAgentBean> getStaleHosts(long after) throws Exception {
public List<HostAgentBean> getStaleHosts(long lastUpdateBefore) throws SQLException {
ResultSetHandler<List<HostAgentBean>> h = new BeanListHandler<>(HostAgentBean.class);
return new QueryRunner(dataSource).query(GET_HOSTS_BY_LAST_UPDATE, h, lastUpdateBefore);
}

@Override
public List<HostAgentBean> getStaleHosts(long lastUpdateAfter, long lastUpdateBefore) throws SQLException {
ResultSetHandler<List<HostAgentBean>> h = new BeanListHandler<>(HostAgentBean.class);
return new QueryRunner(dataSource).query(GET_STALE_HOST, h, after);
return new QueryRunner(dataSource).query(GET_HOSTS_BY_LAST_UPDATES, h, lastUpdateAfter, lastUpdateBefore);
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
import org.apache.commons.dbutils.handlers.BeanHandler;
import org.apache.commons.dbutils.handlers.BeanListHandler;

import java.sql.SQLException;
import java.util.Collection;
import java.util.List;
import java.util.Set;
Expand All @@ -51,8 +52,7 @@ public class DBHostDAOImpl implements HostDAO {
private static final String GET_HOST_BY_HOSTID = "SELECT * FROM hosts WHERE host_id=?";
private static final String GET_HOSTS_BY_STATES = "SELECT * FROM hosts WHERE state in (?, ?, ?) GROUP BY host_id ORDER BY last_update";
private static final String GET_GROUP_NAMES_BY_HOST = "SELECT group_name FROM hosts WHERE host_name=?";
private static final String GET_STALE_ENV_HOST = "SELECT DISTINCT hosts.* FROM hosts INNER JOIN hosts_and_envs ON hosts.host_name=hosts_and_envs.host_name WHERE hosts.last_update<?";
private static final String GET_STALE_HOST = "SELECT DISTINCT hosts.* FROM hosts WHERE hosts.last_update<?";
private static final String GET_STALE_AGENTLESS_HOST_IDS = "SELECT DISTINCT hosts.host_id FROM hosts LEFT JOIN hosts_and_agents ON hosts.host_id = hosts_and_agents.host_id WHERE hosts.last_update < ? AND hosts_and_agents.host_id IS NULL ORDER BY hosts.last_update DESC LIMIT ?";
private static final String GET_HOST_NAMES_BY_GROUP = "SELECT host_name FROM hosts WHERE group_name=?";
private static final String GET_HOST_IDS_BY_GROUP = "SELECT DISTINCT host_id FROM hosts WHERE group_name=?";
private static final String GET_HOSTS_BY_ENVID = "SELECT h.* FROM hosts h INNER JOIN groups_and_envs ge ON ge.group_name = h.group_name WHERE ge.env_id=? UNION DISTINCT SELECT hs.* FROM hosts hs INNER JOIN hosts_and_envs he ON he.host_name = hs.host_name WHERE he.env_id=?";
Expand Down Expand Up @@ -194,6 +194,12 @@ public List<HostBean> getTerminatingHosts() throws Exception {
HostState.TERMINATING.toString(), HostState.PENDING_TERMINATE_NO_REPLACE.toString());
}

@Override
public List<String> getStaleAgentlessHostIds(long lastUpdateBefore, int limit) throws SQLException {
return new QueryRunner(dataSource).query(GET_STALE_AGENTLESS_HOST_IDS,
SingleResultSetHandlerFactory.<String>newListObjectHandler(), lastUpdateBefore, limit);
}

@Override
public List<HostBean> getAllActiveHostsByGroup(String groupName) throws Exception {
ResultSetHandler<List<HostBean>> h = new BeanListHandler<>(HostBean.class);
Expand Down
Original file line number Diff line number Diff line change
@@ -1,17 +1,14 @@
package com.pinterest.deployservice.handler;


import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.pinterest.deployservice.ServiceContext;
import com.pinterest.deployservice.bean.HostBean;
import com.pinterest.deployservice.common.CommonUtils;
import com.pinterest.deployservice.dao.AgentDAO;
import com.pinterest.deployservice.dao.HostDAO;
import com.pinterest.deployservice.dao.HostAgentDAO;
import com.pinterest.deployservice.dao.HostDAO;
import com.pinterest.deployservice.dao.HostTagDAO;
import com.pinterest.deployservice.handler.HostHandler;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class HostHandler {
private static final Logger LOG = LoggerFactory.getLogger(HostHandler.class);
Expand All @@ -27,15 +24,36 @@ public HostHandler(ServiceContext serviceContext) {
hostTagDAO = serviceContext.getHostTagDAO();
}

public void removeHost(String hostId) throws Exception {
public void removeHost(String hostId) {
boolean hasException = false;
try {
hostDAO.deleteAllById(hostId);
agentDAO.deleteAllById(hostId);
} catch (Exception e) {
hasException = true;
LOG.error("Failed to remove host record from agent - " + hostId, e);
}
try {
hostTagDAO.deleteByHostId(hostId);
} catch (Exception e) {
hasException = true;
LOG.error("Failed to remove host record from hostTag - " + hostId, e);
}
try {
hostAgentDAO.delete(hostId);
LOG.info("Removed all records for the host {}", hostId);
} catch (Exception e) {
LOG.error("Failed to remove all records for the host {}, exception: {}", hostId, e);
hasException = true;
LOG.error("Failed to remove host record from hostAgent - " + hostId, e);
}
try {
hostDAO.deleteAllById(hostId);
} catch (Exception e) {
hasException = true;
LOG.error("Failed to remove host record from host - " + hostId, e);
}

if (!hasException) {
LOG.info("Removed all records for host {}", hostId);
}
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -89,9 +89,9 @@
public class ConfigHelper {
private static final Logger LOG = LoggerFactory.getLogger(ConfigHelper.class);
private static final int DEFAULT_PERIOD = 30;
private static final int DEFAULT_MAX_STALE_HOST_THRESHOLD = 600; // 10 mins
private static final int DEFAULT_MIN_STALE_HOST_THRESHOLD = 150;
private static final int DEFAULT_LAUNCH_LATENCY_THRESHOLD = 600;
private static final int DEFAULT_MAX_STALE_HOST_THRESHOLD_SECONDS = 600; // 10 min
private static final int DEFAULT_MIN_STALE_HOST_THRESHOLD_SECONDS = 150; // 2.5 min
private static final int DEFAULT_LAUNCH_LATENCY_THRESHOLD_SECONDS = 600;
private static final String DEFAULT_DEPLOY_JANITOR_SCHEDULE = "0 30 3 * * ?";
private static final String DEFAULT_BUILD_JANITOR_SCHEDULE = "0 40 3 * * ?";
private static final int DEFAULT_MAX_DAYS_TO_KEEP = 180;
Expand Down Expand Up @@ -255,18 +255,23 @@ public static void scheduleWorkers(TeletraanServiceConfiguration configuration,

if (workerName.equalsIgnoreCase(SimpleAgentJanitor.class.getSimpleName())) {
ScheduledExecutorService scheduler = Executors.newScheduledThreadPool(1);
int minStaleHostThreshold = MapUtils.getIntValue(properties, "minStaleHostThreshold", DEFAULT_MIN_STALE_HOST_THRESHOLD);
int maxStaleHostThreshold = MapUtils.getIntValue(properties, "maxStaleHostThreshold", DEFAULT_MAX_STALE_HOST_THRESHOLD);
int minStaleHostThreshold = MapUtils.getIntValue(properties, "minStaleHostThreshold",
DEFAULT_MIN_STALE_HOST_THRESHOLD_SECONDS);
int maxStaleHostThreshold = MapUtils.getIntValue(properties,
"maxStaleHostThreshold", DEFAULT_MAX_STALE_HOST_THRESHOLD_SECONDS);
Runnable worker = new SimpleAgentJanitor(serviceContext, minStaleHostThreshold, maxStaleHostThreshold);
scheduler.scheduleAtFixedRate(worker, initDelay, period, TimeUnit.SECONDS);
LOG.info("Scheduled SimpleAgentJanitor.");
}

if (workerName.equalsIgnoreCase(AgentJanitor.class.getSimpleName())) {
ScheduledExecutorService scheduler = Executors.newScheduledThreadPool(1);
int minStaleHostThreshold = MapUtils.getIntValue(properties, "minStaleHostThreshold", DEFAULT_MIN_STALE_HOST_THRESHOLD);
int maxStaleHostThreshold = MapUtils.getIntValue(properties, "maxStaleHostThreshold", DEFAULT_MAX_STALE_HOST_THRESHOLD);
int maxLaunchLatencyThreshold = MapUtils.getIntValue(properties, "maxLaunchLaencyThreshold", DEFAULT_LAUNCH_LATENCY_THRESHOLD);
int minStaleHostThreshold = MapUtils.getIntValue(properties, "minStaleHostThreshold",
DEFAULT_MIN_STALE_HOST_THRESHOLD_SECONDS);
int maxStaleHostThreshold = MapUtils.getIntValue(properties, "maxStaleHostThreshold",
DEFAULT_MAX_STALE_HOST_THRESHOLD_SECONDS);
int maxLaunchLatencyThreshold = MapUtils.getIntValue(properties, "maxLaunchLatencyThreshold",
DEFAULT_LAUNCH_LATENCY_THRESHOLD_SECONDS);
Runnable worker = new AgentJanitor(serviceContext, minStaleHostThreshold, maxStaleHostThreshold, maxLaunchLatencyThreshold);
scheduler.scheduleAtFixedRate(worker, initDelay, period, TimeUnit.SECONDS);
LOG.info("Scheduled AgentJanitor.");
Expand Down
Loading

0 comments on commit b2bd6b8

Please sign in to comment.