-
Notifications
You must be signed in to change notification settings - Fork 44
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Automated GitHub Actions Test for gRPC Training #148
Changes from 19 commits
f0192d4
755fc07
d37c35b
4c6d85e
2f087e1
464a674
71dd9e8
65fd527
dc1fb85
9c7f1b7
a818cbe
3c14086
33e668a
6cea0a9
23c3252
3e93738
66f3c44
c765013
9e3ea96
b0677f9
3d28db0
ddbdbb1
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,56 @@ | ||
# CI smoke test: installs the project dependencies and runs a short
# gRPC-based training job using the test system configuration.
name: Test Training Code with gRPC

on:
  workflow_dispatch:
  push:
    branches:
      # - main
      # "**" matches every branch, including names that contain "/";
      # a bare "*" would skip branches such as "feature/foo".
      - "**"
  pull_request:
    branches:
      - main

env:
  # NOTE: step debug logging is only switched on when ACTIONS_STEP_DEBUG is
  # defined as a repository secret or variable; setting it here alone does
  # not enable it.
  ACTIONS_STEP_DEBUG: true

jobs:
  train-check:
    runs-on: ubuntu-latest

    steps:
      # Step 1: Checkout the code
      - name: Checkout repository
        uses: actions/checkout@v4

      # Step 2: Set up Python
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          # Quoted so YAML keeps the string "3.10" instead of the float 3.1.
          python-version: "3.10"

      # Step 3: Install dependencies
      - name: Install dependencies
        run: |
          # apt-get (not apt) has a stable interface for use in scripts.
          sudo apt-get update
          sudo apt-get install -y libopenmpi-dev openmpi-bin
          sudo apt-get install -y libgl1 libglib2.0-0

          pip install -r requirements.txt

      # Step 4: Run gRPC server and client
      - name: Run test
        run: |
          cd src
          # chmod +x ./configs/algo_config_test.py

          # NOTE(review): both commands run in the foreground, one after the
          # other. If main_grpc.py waits for the super-node started by
          # main.py, it must be backgrounded with "&" - confirm.
          echo "starting main grpc"
          python main_grpc.py -n 4 -host localhost
          echo "starting main"
          python main.py -super true -s "./configs/sys_config_test.py"
          echo "done"

      # further checks:
      # only 5 rounds
      # gRPC only? or also MPI?
      # num of samples
      # num users and nodes
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,10 @@ | ||
{ | ||
"python.analysis.typeCheckingMode": "strict" | ||
} | ||
"python.analysis.typeCheckingMode": "strict", | ||
"sshfs.configs": [ | ||
{ | ||
"name": "matlaber", | ||
"host": "matlaber7.media.mit.edu", | ||
"username": "kle" | ||
} | ||
] | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
from utils.types import ConfigType | ||
|
||
# fedstatic: ConfigType = { | ||
# # Collaboration setup | ||
# "algo": "fedstatic", | ||
# "topology": {"name": "watts_strogatz", "k": 3, "p": 0.2}, # type: ignore | ||
# "rounds": 1, | ||
|
||
# # Model parameters | ||
# "model": "resnet10", | ||
# "model_lr": 3e-4, | ||
# "batch_size": 256, | ||
# } | ||
|
||
# FedAvg configuration: a single collaboration round with a ResNet-10 model.
traditional_fl: ConfigType = {
    # Collaboration setup
    "algo": "fedavg",
    "rounds": 1,  # one round only

    # Model parameters
    "model": "resnet10",
    "model_lr": 3e-4,
    "batch_size": 256,
}
|
||
# default_config_list: List[ConfigType] = [fedstatic, fedstatic, fedstatic, fedstatic] |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -158,16 +158,20 @@ def get_digit_five_support(num_users: int, domains: List[str] = DIGIT_FIVE): | |
CIAR10_DPATH = "./datasets/imgs/cifar10/" | ||
|
||
NUM_COLLABORATORS = 1 | ||
DUMP_DIR = "/mas/camera/Experiments/SONAR/abhi/" | ||
# DUMP_DIR = "../../../../../../../home/" | ||
DUMP_DIR = "./" | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Let's make the default dump directory as |
||
|
||
num_users = 3 | ||
mpi_system_config: ConfigType = { | ||
"exp_id": "", | ||
"comm": {"type": "MPI"}, | ||
"num_users": num_users, | ||
"num_collaborators": NUM_COLLABORATORS, | ||
"dset": CIFAR10_DSET, | ||
"dump_dir": DUMP_DIR, | ||
"dpath": CIAR10_DPATH, | ||
"seed": 32, | ||
# "seed": 32, | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Make sure not to include any unnecessary comments. |
||
"seed": 2, | ||
# node_0 is a server currently | ||
# The device_ids dictionary depicts the GPUs on which the nodes reside. | ||
# For a single-GPU environment, the config will look as follows (as it follows a 0-based indexing): | ||
|
@@ -177,11 +181,9 @@ def get_digit_five_support(num_users: int, domains: List[str] = DIGIT_FIVE): | |
# "algo": get_algo_configs(num_users=3, algo_configs=algo_configs_list), | ||
"algos": get_algo_configs( | ||
num_users=3, | ||
algo_configs=malicious_algo_config_list, | ||
assignment_method="distribution", | ||
distribution={0: 1, 1: 1, 2: 1}, | ||
algo_configs=default_config_list | ||
), # type: ignore | ||
"samples_per_user": 1000, # TODO: To model scenarios where different users have different number of samples | ||
"samples_per_user": 5555, # TODO: To model scenarios where different users have different number of samples | ||
# we need to make this a dictionary with user_id as key and number of samples as value | ||
"train_label_distribution": "iid", # Either "iid", "non_iid" "support" | ||
"test_label_distribution": "iid", # Either "iid", "non_iid" "support" | ||
|
@@ -316,7 +318,7 @@ def get_digit_five_support(num_users: int, domains: List[str] = DIGIT_FIVE): | |
"exp_keys": [], | ||
} | ||
|
||
num_users = 9 | ||
num_users = 4 | ||
|
||
dropout_dict = { | ||
"distribution_dict": { # leave dict empty to disable dropout | ||
|
@@ -347,7 +349,8 @@ def get_digit_five_support(num_users: int, domains: List[str] = DIGIT_FIVE): | |
"device_ids": get_device_ids(num_users, gpu_ids), | ||
# "algos": get_algo_configs(num_users=num_users, algo_configs=default_config_list), # type: ignore | ||
"algos": get_algo_configs(num_users=num_users, algo_configs=[fedstatic]), # type: ignore | ||
"samples_per_user": 50000 // num_users, # distributed equally | ||
# "samples_per_user": 50000 // num_users, # distributed equally | ||
"samples_per_user": 100, | ||
"train_label_distribution": "non_iid", | ||
"test_label_distribution": "iid", | ||
"alpha_data": 1.0, | ||
|
@@ -356,6 +359,6 @@ def get_digit_five_support(num_users: int, domains: List[str] = DIGIT_FIVE): | |
"test_samples_per_user": 200, | ||
} | ||
|
||
|
||
current_config = grpc_system_config | ||
# current_config = mpi_system_config | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,126 @@ | ||
from typing import Dict, List, Literal, Optional | ||
import random | ||
from utils.types import ConfigType | ||
|
||
from .algo_config_test import ( | ||
traditional_fl | ||
) | ||
|
||
def get_device_ids(num_users: int, gpus_available: List[int | Literal["cpu"]]) -> Dict[str, List[int | Literal["cpu"]]]: | ||
""" | ||
Get the GPU device IDs for the users. | ||
""" | ||
# TODO: Make it multi-host | ||
device_ids: Dict[str, List[int | Literal["cpu"]]] = {} | ||
for i in range(num_users + 1): # +1 for the super-node | ||
index = i % len(gpus_available) | ||
gpu_id = gpus_available[index] | ||
device_ids[f"node_{i}"] = [gpu_id] | ||
return device_ids | ||
|
||
|
||
def get_algo_configs(
    num_users: int,
    algo_configs: List[ConfigType],
    assignment_method: Literal[
        "sequential", "random", "mapping", "distribution"
    ] = "sequential",
    seed: Optional[int] = 1,
    mapping: Optional[List[int]] = None,
    distribution: Optional[Dict[int, int]] = None,
) -> Dict[str, ConfigType]:
    """
    Assign an algorithm configuration to each node, allowing for repetition.

    ``node_0`` (the super-node) always receives ``algo_configs[0]``. The user
    nodes ``node_1`` .. ``node_<num_users>`` are filled according to
    ``assignment_method``:

    sequential: node ``i`` gets ``algo_configs[i % len(algo_configs)]``
    random: each node gets ``random.choice(algo_configs)``
    mapping: node ``i`` gets ``algo_configs[mapping[i - 1]]``
    distribution: ``distribution`` maps an algo index to the number of nodes
        that should use it; nodes are picked in a seeded, shuffled order

    Args:
        num_users: Number of user nodes (the super-node is not counted).
        algo_configs: Candidate configurations; must not be empty.
        assignment_method: One of the four strategies described above.
        seed: Seed for the shuffle. Only used by ``"distribution"``; the
            ``"random"`` method draws from the module-level RNG as it is.
        mapping: Required for ``"mapping"``; one algo index per user node.
        distribution: Required for ``"distribution"``; the node counts must
            sum to ``num_users``.

    Returns:
        Mapping from ``"node_<i>"`` to the configuration assigned to it.

    Raises:
        ValueError: If ``algo_configs`` is empty, the method is unknown, a
            required ``mapping``/``distribution`` is missing, or its size does
            not match ``num_users``.
    """
    if not algo_configs:
        raise ValueError("algo_configs must contain at least one configuration")

    algo_config_map: Dict[str, ConfigType] = {}
    algo_config_map["node_0"] = algo_configs[0]  # Super-node

    if assignment_method == "sequential":
        for i in range(1, num_users + 1):
            algo_config_map[f"node_{i}"] = algo_configs[i % len(algo_configs)]
    elif assignment_method == "random":
        for i in range(1, num_users + 1):
            algo_config_map[f"node_{i}"] = random.choice(algo_configs)
    elif assignment_method == "mapping":
        if not mapping:
            raise ValueError("Mapping must be provided for assignment method 'mapping'")
        # Explicit check instead of `assert`, which is stripped under `python -O`.
        if len(mapping) != num_users:
            raise ValueError(
                f"Mapping has {len(mapping)} entries but num_users is {num_users}"
            )
        for i in range(1, num_users + 1):
            algo_config_map[f"node_{i}"] = algo_configs[mapping[i - 1]]
    elif assignment_method == "distribution":
        if not distribution:
            raise ValueError(
                "Distribution must be provided for assignment method 'distribution'"
            )
        total_users = sum(distribution.values())
        # Explicit check instead of `assert`, which is stripped under `python -O`.
        if total_users != num_users:
            raise ValueError(
                f"Distribution covers {total_users} nodes but num_users is {num_users}"
            )

        # List of node indices to assign
        node_indices = list(range(1, total_users + 1))
        # Seed for reproducibility.
        # NOTE: this reseeds the module-level RNG, so later `random` calls in
        # the same process are affected too; kept for backward compatibility.
        random.seed(seed)
        # Shuffle the node indices based on the seed
        random.shuffle(node_indices)

        # Walk through the shuffled indices, giving each algo its share of nodes
        current_index = 0
        for algo_index, num_nodes in distribution.items():
            for _ in range(num_nodes):
                node_id = node_indices[current_index]
                algo_config_map[f"node_{node_id}"] = algo_configs[algo_index]
                current_index += 1
    else:
        raise ValueError(f"Invalid assignment method: {assignment_method}")
    # print("algo config mapping is: ", algo_config_map)
    return algo_config_map
|
||
# Dataset name and dataset path used by the configuration below.
CIFAR10_DSET = "cifar10"
# NOTE(review): the name looks like a typo of CIFAR10_DPATH; left unchanged
# because other modules may reference it - confirm before renaming.
CIAR10_DPATH = "./datasets/imgs/cifar10/"

# DUMP_DIR = "../../../../../../../home/"
DUMP_DIR = "./"

NUM_COLLABORATORS = 1
num_users = 4

# Dropout settings applied to every user node (the rate is 0.0 here).
dropout_dict = {
    "distribution_dict": {  # leave dict empty to disable dropout
        "method": "uniform",  # "uniform", "normal"
        "parameters": {}  # "mean": 0.5, "std": 0.1 in case of normal distribution
    },
    "dropout_rate": 0.0,  # cutoff for dropout: [0,1]
    "dropout_correlation": 0.0,  # correlation between dropouts of successive rounds: [0,1]
}

# node_0 gets an empty dropout config; node_1 .. node_<num_users> all
# reference the SAME dropout_dict object (no copy is made).
# NOTE(review): a mutation through one node's entry would be visible to all
# of them - confirm nothing downstream modifies these dicts in place.
dropout_dicts = {"node_0": {}}
for i in range(1, num_users + 1):
    dropout_dicts[f"node_{i}"] = dropout_dict

# Device ids cycled through by get_device_ids.
# NOTE(review): presumably GPU indices of a specific multi-GPU host; confirm
# they are valid wherever this test config runs (get_device_ids also accepts "cpu").
gpu_ids = [2, 3, 5, 6]

# System configuration for a gRPC run with num_users user nodes plus node_0.
grpc_system_config: ConfigType = {
    "exp_id": "static",
    "num_users": num_users,
    "num_collaborators": NUM_COLLABORATORS,
    "comm": {"type": "GRPC", "synchronous": True, "peer_ids": ["localhost:50048"]},  # The super-node
    "dset": CIFAR10_DSET,
    "dump_dir": DUMP_DIR,
    "dpath": CIAR10_DPATH,
    "seed": 2,
    "device_ids": get_device_ids(num_users, gpu_ids),
    # "algos": get_algo_configs(num_users=num_users, algo_configs=default_config_list), # type: ignore
    # A single config with the default "sequential" method gives every node traditional_fl.
    "algos": get_algo_configs(num_users=num_users, algo_configs=[traditional_fl]),  # type: ignore
    # "samples_per_user": 50000 // num_users, # distributed equally
    "samples_per_user": 100,
    "train_label_distribution": "non_iid",
    "test_label_distribution": "iid",
    "alpha_data": 1.0,
    "exp_keys": [],
    "dropout_dicts": dropout_dicts,
    "test_samples_per_user": 200,
}

# The configuration this module exposes as the active one.
current_config = grpc_system_config
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -70,6 +70,5 @@ | |
# Start the scheduler | ||
scheduler.install_config() | ||
scheduler.initialize() | ||
|
||
# Run the job | ||
scheduler.run_job() |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
can you remove this?