Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add seed in train_test_split #3211

Merged
merged 10 commits into from
Apr 4, 2024
2 changes: 1 addition & 1 deletion datasets/doc/source/how-to-use-with-pytorch.rst
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ expected by a model with a convolutional layer.

If you want to divide the dataset, you can use (at any point before passing the dataset to the DataLoader)::

partition_train_test = partition.train_test_split(test_size=0.2)
partition_train_test = partition.train_test_split(test_size=0.2, seed=42)
partition_train = partition_train_test["train"]
partition_test = partition_train_test["test"]

Expand Down
2 changes: 1 addition & 1 deletion datasets/e2e/pytorch/pytorch_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ def _create_trainloader(self, batch_size: int) -> DataLoader:
partition_id = 0
fds = FederatedDataset(dataset=self.dataset_name, partitioners={"train": 100})
partition = fds.load_partition(partition_id, "train")
partition_train_test = partition.train_test_split(test_size=0.2)
partition_train_test = partition.train_test_split(test_size=0.2, seed=42)
partition_train_test = partition_train_test.map(
lambda img: {"img": self.transforms(img)}, input_columns="img"
)
Expand Down
2 changes: 1 addition & 1 deletion datasets/e2e/scikit-learn/sklearn_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ def _get_partition_data(self):
fds = FederatedDataset(dataset=self.dataset_name, partitioners={"train": 10})
partition = fds.load_partition(partition_id, "train")
partition.set_format("numpy")
partition_train_test = partition.train_test_split(test_size=0.2)
partition_train_test = partition.train_test_split(test_size=0.2, seed=42)
X_train, y_train = partition_train_test["train"]["image"], partition_train_test[
"train"]["label"]
X_test, y_test = partition_train_test["test"]["image"], partition_train_test[
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,7 @@
" for partition_id in range(NUM_CLIENTS):\n",
" partition = fds.load_partition(partition_id, \"train\")\n",
" partition = partition.with_transform(apply_transforms)\n",
" partition = partition.train_test_split(train_size=0.8)\n",
" partition = partition.train_test_split(train_size=0.8, seed=42)\n",
" trainloaders.append(DataLoader(partition[\"train\"], batch_size=BATCH_SIZE))\n",
" valloaders.append(DataLoader(partition[\"test\"], batch_size=BATCH_SIZE))\n",
" testset = fds.load_split(\"test\").with_transform(apply_transforms)\n",
Expand Down
2 changes: 1 addition & 1 deletion examples/advanced-pytorch/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ def fit(self, parameters, config):
batch_size: int = config["batch_size"]
epochs: int = config["local_epochs"]

train_valid = self.trainset.train_test_split(self.validation_split)
train_valid = self.trainset.train_test_split(self.validation_split, seed=42)
trainset = train_valid["train"]
valset = train_valid["test"]

Expand Down
2 changes: 1 addition & 1 deletion examples/advanced-pytorch/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ def load_partition(partition_id, toy: bool = False):
fds = FederatedDataset(dataset="cifar10", partitioners={"train": 10})
partition = fds.load_partition(partition_id)
# Divide data on each node: 80% train, 20% test
partition_train_test = partition.train_test_split(test_size=0.2)
partition_train_test = partition.train_test_split(test_size=0.2, seed=42)
partition_train_test = partition_train_test.with_transform(apply_transforms)
return partition_train_test["train"], partition_train_test["test"]

Expand Down
2 changes: 1 addition & 1 deletion examples/advanced-tensorflow/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,7 @@ def load_partition(idx: int):
partition.set_format("numpy")

# Divide data on each node: 80% train, 20% test
partition = partition.train_test_split(test_size=0.2)
partition = partition.train_test_split(test_size=0.2, seed=42)
x_train, y_train = partition["train"]["img"] / 255.0, partition["train"]["label"]
x_test, y_test = partition["test"]["img"] / 255.0, partition["test"]["label"]
return x_train, y_train, x_test, y_test
Expand Down
1 change: 0 additions & 1 deletion examples/app-pytorch/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@

# Define FlowerClient and client_fn
class FlowerClient(NumPyClient):

def fit(self, parameters, config):
set_weights(net, parameters)
results = train(net, trainloader, testloader, epochs=1, device=DEVICE)
Expand Down
1 change: 0 additions & 1 deletion examples/custom-mods/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,6 @@ def wandb_mod(msg: Message, context: Context, app: ClientAppCallable) -> Message

# if the `ClientApp` just processed a "fit" message, let's log some metrics to W&B
if reply.metadata.message_type == MessageType.TRAIN and reply.has_content():

metrics = reply.content.configs_records

results_to_log = dict(metrics.get("fitres.metrics", ConfigsRecord()))
Expand Down
2 changes: 1 addition & 1 deletion examples/embedded-devices/client_pytorch.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ def apply_transforms(batch):
for partition_id in range(NUM_CLIENTS):
partition = fds.load_partition(partition_id, "train")
# Divide data on each node: 90% train, 10% test
partition = partition.train_test_split(test_size=0.1)
partition = partition.train_test_split(test_size=0.1, seed=42)
partition = partition.with_transform(apply_transforms)
trainsets.append(partition["train"])
validsets.append(partition["test"])
Expand Down
2 changes: 1 addition & 1 deletion examples/embedded-devices/client_tf.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ def prepare_dataset(use_mnist: bool):
partition = fds.load_partition(partition_id, "train")
partition.set_format("numpy")
# Divide data on each node: 90% train, 10% test
partition = partition.train_test_split(test_size=0.1)
partition = partition.train_test_split(test_size=0.1, seed=42)
x_train, y_train = (
partition["train"][img_key] / 255.0,
partition["train"]["label"],
Expand Down
2 changes: 1 addition & 1 deletion examples/fl-dp-sa/fl_dp_sa/task.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ def load_data(partition_id):
fds = FederatedDataset(dataset="mnist", partitioners={"train": 100})
partition = fds.load_partition(partition_id)
# Divide data on each node: 80% train, 20% test
partition_train_test = partition.train_test_split(test_size=0.2)
partition_train_test = partition.train_test_split(test_size=0.2, seed=42)
pytorch_transforms = Compose([ToTensor(), Normalize((0.5,), (0.5,))])

def apply_transforms(batch):
Expand Down
2 changes: 1 addition & 1 deletion examples/flower-via-docker-compose/helpers/load_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ def load_data(data_sampling_percentage=0.5, client_id=1, total_clients=2):
partition.set_format("numpy")

# Divide data on each client: 80% train, 20% test
partition = partition.train_test_split(test_size=0.2)
partition = partition.train_test_split(test_size=0.2, seed=42)
x_train, y_train = partition["train"]["img"] / 255.0, partition["train"]["label"]
x_test, y_test = partition["test"]["img"] / 255.0, partition["test"]["label"]

Expand Down
2 changes: 1 addition & 1 deletion examples/pytorch-from-centralized-to-federated/cifar.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ def load_data(partition_id: int):
fds = FederatedDataset(dataset="cifar10", partitioners={"train": 10})
partition = fds.load_partition(partition_id)
# Divide data on each node: 80% train, 20% test
partition_train_test = partition.train_test_split(test_size=0.2)
partition_train_test = partition.train_test_split(test_size=0.2, seed=42)
pytorch_transforms = Compose(
[ToTensor(), Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]
)
Expand Down
2 changes: 1 addition & 1 deletion examples/quickstart-huggingface/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ def load_data(partition_id):
fds = FederatedDataset(dataset="imdb", partitioners={"train": 1_000})
partition = fds.load_partition(partition_id)
# Divide data: 80% train, 20% test
partition_train_test = partition.train_test_split(test_size=0.2)
partition_train_test = partition.train_test_split(test_size=0.2, seed=42)

tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT)

Expand Down
3 changes: 3 additions & 0 deletions examples/quickstart-mlcube/dev/mnist.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ def create_directory(path: str) -> None:

def download(task_args: List[str]) -> None:
"""Task: download.
Input parameters:
--data_dir
"""
Expand Down Expand Up @@ -81,6 +82,7 @@ def download(task_args: List[str]) -> None:

def train(task_args: List[str]) -> None:
"""Task: train.
Input parameters:
--data_dir, --log_dir, --model_dir, --parameters_file
"""
Expand Down Expand Up @@ -175,6 +177,7 @@ def train(task_args: List[str]) -> None:

def evaluate(task_args: List[str]) -> None:
"""Task: train.
Input parameters:
--data_dir, --log_dir, --model_dir, --parameters_file
"""
Expand Down
2 changes: 1 addition & 1 deletion examples/quickstart-mlx/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ def evaluate(self, parameters, config):

fds = FederatedDataset(dataset="mnist", partitioners={"train": 3})
partition = fds.load_partition(partition_id=args.partition_id)
partition_splits = partition.train_test_split(test_size=0.2)
partition_splits = partition.train_test_split(test_size=0.2, seed=42)

partition_splits["train"].set_format("numpy")
partition_splits["test"].set_format("numpy")
Expand Down
6 changes: 4 additions & 2 deletions examples/quickstart-pytorch-lightning/mnist.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,9 +82,11 @@ def load_data(partition):

partition = partition.with_transform(apply_transforms)
# 20 % for federated evaluation
partition_full = partition.train_test_split(test_size=0.2)
partition_full = partition.train_test_split(test_size=0.2, seed=42)
# 60 % for the federated train and 20 % for the federated validation (both in fit)
partition_train_valid = partition_full["train"].train_test_split(train_size=0.75)
partition_train_valid = partition_full["train"].train_test_split(
train_size=0.75, seed=42
)
trainloader = DataLoader(
partition_train_valid["train"],
batch_size=32,
Expand Down
2 changes: 1 addition & 1 deletion examples/quickstart-pytorch/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ def load_data(partition_id):
fds = FederatedDataset(dataset="cifar10", partitioners={"train": 3})
partition = fds.load_partition(partition_id)
# Divide data on each node: 80% train, 20% test
partition_train_test = partition.train_test_split(test_size=0.2)
partition_train_test = partition.train_test_split(test_size=0.2, seed=42)
pytorch_transforms = Compose(
[ToTensor(), Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]
)
Expand Down
2 changes: 1 addition & 1 deletion examples/quickstart-tensorflow/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@
partition.set_format("numpy")

# Divide data on each node: 80% train, 20% test
partition = partition.train_test_split(test_size=0.2)
partition = partition.train_test_split(test_size=0.2, seed=42)
x_train, y_train = partition["train"]["img"] / 255.0, partition["train"]["label"]
x_test, y_test = partition["test"]["img"] / 255.0, partition["test"]["label"]

Expand Down
2 changes: 1 addition & 1 deletion examples/simulation-pytorch/sim.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -497,7 +497,7 @@
" client_dataset = dataset.load_partition(int(cid), \"train\")\n",
"\n",
" # Now let's split it into train (90%) and validation (10%)\n",
" client_dataset_splits = client_dataset.train_test_split(test_size=0.1)\n",
" client_dataset_splits = client_dataset.train_test_split(test_size=0.1, seed=42)\n",
"\n",
" trainset = client_dataset_splits[\"train\"]\n",
" valset = client_dataset_splits[\"test\"]\n",
Expand Down
2 changes: 1 addition & 1 deletion examples/simulation-pytorch/sim.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ def client_fn(cid: str) -> fl.client.Client:
client_dataset = dataset.load_partition(int(cid), "train")

# Now let's split it into train (90%) and validation (10%)
client_dataset_splits = client_dataset.train_test_split(test_size=0.1)
client_dataset_splits = client_dataset.train_test_split(test_size=0.1, seed=42)

trainset = client_dataset_splits["train"]
valset = client_dataset_splits["test"]
Expand Down
2 changes: 1 addition & 1 deletion examples/simulation-tensorflow/sim.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -179,7 +179,7 @@
" client_dataset = dataset.load_partition(int(cid), \"train\")\n",
"\n",
" # Now let's split it into train (90%) and validation (10%)\n",
" client_dataset_splits = client_dataset.train_test_split(test_size=0.1)\n",
" client_dataset_splits = client_dataset.train_test_split(test_size=0.1, seed=42)\n",
"\n",
" trainset = client_dataset_splits[\"train\"].to_tf_dataset(\n",
" columns=\"image\", label_cols=\"label\", batch_size=32\n",
Expand Down
2 changes: 1 addition & 1 deletion examples/simulation-tensorflow/sim.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ def client_fn(cid: str) -> fl.client.Client:
client_dataset = dataset.load_partition(int(cid), "train")

# Now let's split it into train (90%) and validation (10%)
client_dataset_splits = client_dataset.train_test_split(test_size=0.1)
client_dataset_splits = client_dataset.train_test_split(test_size=0.1, seed=42)

trainset = client_dataset_splits["train"].to_tf_dataset(
columns="image", label_cols="label", batch_size=32
Expand Down
2 changes: 0 additions & 2 deletions examples/vit-finetune/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,7 @@


class FedViTClient(NumPyClient):

def __init__(self, trainset):

self.trainset = trainset
self.model = get_model()

Expand Down
1 change: 0 additions & 1 deletion examples/vit-finetune/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@


def main():

args = parser.parse_args()

# To control the degree of parallelism
Expand Down
16 changes: 8 additions & 8 deletions examples/whisper-federated-finetuning/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,10 +107,10 @@ def prepare_silences_dataset(train_dataset, ratio_silence: float = 0.1) -> Datas
"""Generate silences for the train set.
One of the classes in the SpeechCommands dataset is `silence`. However, the dataset
does not include clips of silence. It does however include 5 long files with different
background sounds. The task of this function is to extract several (defined by `ratio_silence`)
one-second long clips from those background audio files. Later, those audio clips will be
included into the training set.
does not include clips of silence. It does however include 5 long files with
different background sounds. The task of this function is to extract several
(defined by `ratio_silence`) one-second long clips from those background audio
files. Later, those audio clips will be included into the training set.
"""
# retrieve original silence audio clips
silences = [d for d in train_dataset if d["label"] == 35]
Expand Down Expand Up @@ -138,9 +138,9 @@ def prepare_silences_dataset(train_dataset, ratio_silence: float = 0.1) -> Datas
def construct_client_mapping(full_trainset, num_clients: int = 100):
"""Create a mapping to partition the dataset into `num_client` buckets.
These buckets contain the same number of `speaker_id` but likely different
number of training examples since each `speaker_id` in SpeechCommands does
provide different amounts of data to the dataset.
These buckets contain the same number of `speaker_id` but likely different number of
training examples since each `speaker_id` in SpeechCommands does provide different
amounts of data to the dataset.
"""
client_ids = list(set(full_trainset["speaker_id"]))
client_ids.remove(
Expand Down Expand Up @@ -191,7 +191,7 @@ def set_params(model: torch.nn.ModuleList, params: List[fl.common.NDArrays]):


def get_model(device, num_classes, compile: bool = True):
"""Create model: Whisper-tiny Encoder + classification head"""
"""Create model: Whisper-tiny Encoder + classification head."""
encoder = WhisperForConditionalGeneration.from_pretrained(
"openai/whisper-tiny"
).get_encoder()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ def load_data(partition_id, num_partitions):
fds = FederatedDataset(dataset="cifar10", partitioners={"train": num_partitions})
partition = fds.load_partition(partition_id)
# Divide data on each node: 80% train, 20% test
partition_train_test = partition.train_test_split(test_size=0.2)
partition_train_test = partition.train_test_split(test_size=0.2, seed=42)
pytorch_transforms = Compose(
[ToTensor(), Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]
)
Expand Down