From 9c13f3d425977c54cfb27e852182c0c6864529d1 Mon Sep 17 00:00:00 2001 From: Vinay Hiremath Date: Mon, 24 Jun 2024 16:16:27 -0400 Subject: [PATCH] fix val dataset size code comment --- data/openwebtext/prepare.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/openwebtext/prepare.py b/data/openwebtext/prepare.py index 2a9b9752b..092dc2f3a 100644 --- a/data/openwebtext/prepare.py +++ b/data/openwebtext/prepare.py @@ -73,7 +73,7 @@ def process(example): idx += len(arr_batch) arr.flush() - # train.bin is ~17GB, val.bin ~8.5MB + # train.bin is ~17GB, val.bin ~85MB # train has ~9B tokens (9,035,582,198) # val has ~4M tokens (4,434,897)