From c5913aad58e0a7bc0b267422e11d859d9b1c3432 Mon Sep 17 00:00:00 2001 From: Akshita Bhagia Date: Tue, 11 Jun 2024 17:22:38 -0700 Subject: [PATCH 01/40] 7b normal baseline scripts --- scripts/beaker/llamaish7-normal-launch.sh | 33 ++++++++++++++ scripts/beaker/llamaish7-normal.sh | 54 +++++++++++++++++++++++ 2 files changed, 87 insertions(+) create mode 100644 scripts/beaker/llamaish7-normal-launch.sh create mode 100644 scripts/beaker/llamaish7-normal.sh diff --git a/scripts/beaker/llamaish7-normal-launch.sh b/scripts/beaker/llamaish7-normal-launch.sh new file mode 100644 index 000000000..de9bd286b --- /dev/null +++ b/scripts/beaker/llamaish7-normal-launch.sh @@ -0,0 +1,33 @@ +#!/usr/bin/env bash + +set -ex + +NUM_NODES=64 + +gantry run \ + --workspace ai2/OLMo-training \ + --task-name llamaish7-normal-qk-norm-reorder-zloss \ + --description "OLMo medium - 7B - Llamaish Normal" \ + --priority urgent \ + --preemptible \ + --beaker-image shanea/olmo-torch2.3-gantry \ + --cluster ai2/jupiter-cirrascale-2 \ + --gpus 8 \ + --replicas "${NUM_NODES}" \ + --leader-selection \ + --host-networking \ + --budget ai2/oe-training \ + --no-nfs \ + --propagate-failure \ + --synchronized-start-timeout 15m \ + --env LOG_FILTER_TYPE=local_rank0_only \ + --env OMP_NUM_THREADS=8 \ + --env OLMO_TASK=model \ + --env-secret WANDB_API_KEY=AKSHITAB_WANDB_API_KEY \ + --env-secret AWS_ACCESS_KEY_ID=AKSHITAB_AWS_ACCESS_KEY_ID \ + --env-secret AWS_SECRET_ACCESS_KEY=AKSHITAB_AWS_SECRET_ACCESS_KEY \ + --shared-memory 10GiB \ + --venv base \ + --yes \ + --timeout=-1 \ + -- /bin/bash -c "scripts/beaker/llamaish7-normal.sh \$BEAKER_LEADER_REPLICA_HOSTNAME ${NUM_NODES} \$BEAKER_REPLICA_RANK" diff --git a/scripts/beaker/llamaish7-normal.sh b/scripts/beaker/llamaish7-normal.sh new file mode 100644 index 000000000..8b6b3bc3b --- /dev/null +++ b/scripts/beaker/llamaish7-normal.sh @@ -0,0 +1,54 @@ +#!/usr/bin/env bash +set -exuo pipefail +IFS=$'\n\t' + +BEAKER_LEADER_REPLICA_HOSTNAME=$1 +shift + 
+NUM_NODES=$1 +shift + +BEAKER_REPLICA_RANK=$1 +shift + +# Warm HF cache +mkdir -p /root/.cache +pushd /root/.cache +curl "https://storage.googleapis.com/dirkgr-public/huggingface_cache_v3.tar.gz" | tar --keep-newer-files -xzf - +popd +export HF_DATASETS_OFFLINE=1 + +export EXPERIMENT=llamaish7-normal + +torchrun \ + --nnodes ${NUM_NODES}:${NUM_NODES} \ + --nproc-per-node 8 \ + --rdzv_id=12347 \ + --rdzv_backend=static \ + --rdzv_endpoint=$BEAKER_LEADER_REPLICA_HOSTNAME:29400 \ + --node_rank=$BEAKER_REPLICA_RANK \ + --rdzv_conf="read_timeout=420" \ + scripts/train.py \ + configs/llamaish7-s3.yaml \ + --run_name=$EXPERIMENT \ + --wandb.name=$EXPERIMENT \ + --wandb.group=$EXPERIMENT \ + --model.flash_attention=true \ + --fsdp.wrapping_strategy=by_block_and_size \ + --fsdp.sharding_strategy=SHARD_GRAD_OP \ + --save_folder=runs/ \ + --activation_checkpointing=fine_grained \ + --fused_loss=true \ + --device_train_microbatch_size=2 \ + --global_train_batch_size=1024 \ + --save_interval=250 \ + --eval_interval=250 \ + --optimizer.metrics_log_interval=1 \ + --save_overwrite \ + --model.init_fn=normal \ + --model.init_std=0.02 \ + --model.clip_qkv=null \ + --save_num_checkpoints_to_keep=3 \ + --scheduler.units=steps \ + --scheduler.t_warmup=2000 + # '--load_path=${path.last_checkpoint:s3://ai2-llm/checkpoints/OLMo-medium/llamaish7-normal/}' From e2cd59b59be2380a4e12b22859f9fcd0dbf97efc Mon Sep 17 00:00:00 2001 From: Akshita Bhagia Date: Tue, 11 Jun 2024 17:30:12 -0700 Subject: [PATCH 02/40] add new evals --- configs/llamaish7-s3.yaml | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/configs/llamaish7-s3.yaml b/configs/llamaish7-s3.yaml index 5d52eeef6..ef7aef937 100644 --- a/configs/llamaish7-s3.yaml +++ b/configs/llamaish7-s3.yaml @@ -212,6 +212,18 @@ evaluators: - label: mmlu_other_mc_5shot_test type: downstream + - label: basic_arithmetic + type: downstream + + - label: trivia_qa_wiki_ppl + type: downstream + + - label: natural_qs_open_ppl + type: 
downstream + + - label: arc_easy_ppl + type: downstream + data: pad_direction: right num_workers: 32 From 3d02325cc957ba9215192039a61024d44901d00b Mon Sep 17 00:00:00 2001 From: Akshita Bhagia Date: Tue, 11 Jun 2024 17:30:33 -0700 Subject: [PATCH 03/40] add 1b config --- configs/llamaish1-s3.yaml | 1297 +++++++++++++++++++++++++++++++++++++ 1 file changed, 1297 insertions(+) create mode 100644 configs/llamaish1-s3.yaml diff --git a/configs/llamaish1-s3.yaml b/configs/llamaish1-s3.yaml new file mode 100644 index 000000000..d43668587 --- /dev/null +++ b/configs/llamaish1-s3.yaml @@ -0,0 +1,1297 @@ +run_name: llamaish1-001 +seed: 6198 +dry_run: false + +wandb: + name: ${run_name} + project: olmo-small + group: llamaish1 + +model: + d_model: 2048 + n_heads: 16 + n_layers: 16 + mlp_ratio: 8 + # mlp_hidden_size: 22016 + weight_tying: false + alibi: false + rope: true + flash_attention: true + attention_dropout: 0.0 + attention_layer_norm: false + clip_qkv: 8.0 + include_bias: false + block_type: sequential + layer_norm_type: default + layer_norm_with_affine: false + bias_for_layer_norm: false + attention_layer_norm_with_affine: false + activation_type: swiglu + residual_dropout: 0.0 + embedding_dropout: 0.0 + max_sequence_length: 4096 + vocab_size: 50280 + embedding_size: 50304 + eos_token_id: 0 + pad_token_id: 1 + init_device: meta + init_fn: full_megatron + init_std: 0.006 + init_cutoff_factor: 3 + +compile: null + +optimizer: + name: adamw + learning_rate: 4.0e-4 + weight_decay: 0.1 + decay_norm_and_bias: true + decay_embeddings: false + betas: + - 0.9 + - 0.95 + metrics_log_interval: 10 + +scheduler: + name: cosine_with_warmup + units: tokens + t_warmup: 10485760000 + t_max: 3e12 + alpha_f: 0.1 + grad_clip_warmup_steps: 2097152000 + grad_clip_warmup_factor: 5 + warmup_min_lr: 0 + +tokenizer: + identifier: tokenizers/allenai_gpt-neox-olmo-dolma-v1_5.json + truncate_direction: right + +save_folder: runs/${run_name} +remote_save_folder: 
s3://ai2-llm/checkpoints/OLMo-small/${run_name} +save_overwrite: false + +save_interval: 1000 +save_interval_ephemeral: null +save_num_checkpoints_to_keep: -1 +sharded_checkpointer: olmo_core + +save_interval_unsharded: null +save_num_unsharded_checkpoints_to_keep: -1 + +load_path: null + +max_duration: 2ep +global_train_batch_size: 512 +device_train_microbatch_size: 4 + +precision: amp_bf16 + +fsdp: + wrapping_strategy: by_block_and_size + precision: mixed + +max_grad_norm: 1.0 +max_grad_norm_ratio: null + +speed_monitor: + window_size: 1 + +eval_interval: 1000 +eval_subset_num_batches: -1 +device_eval_batch_size: ${device_train_microbatch_size} +evaluators: + - label: all-small-ppl-validation + data: + num_workers: 0 + drop_last: true + datasets: + c4_en-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/c4_en/val/part-0-00000.npy + dolma_books-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_books/val/part-0-00000.npy + dolma_common-crawl-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_common-crawl/val/part-0-00000.npy + dolma_pes2o-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_pes2o/val/part-0-00000.npy + dolma_reddit-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_reddit/val/part-0-00000.npy + dolma_stack-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_stack/val/part-0-00000.npy + dolma_wiki-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_wiki/val/part-0-00000.npy + ice-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/ice/val/part-0-00000.npy + m2d2_s2orc-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/m2d2_s2orc/val/part-0-00000.npy + pile-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/pile/val/part-0-00000.npy + wikitext_103-validation: + - 
s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/wikitext_103/val/part-0-00000.npy + + ########################## + # Downstream evaluations # + ########################## + - label: piqa + type: downstream + + - label: hellaswag + type: downstream + + - label: winogrande + type: downstream + + - label: openbook_qa + type: downstream + + - label: boolq + type: downstream + + - label: sciq + type: downstream + + - label: arc_easy + type: downstream + + - label: arc_challenge + type: downstream + + - label: copa + type: downstream + + #- label: rte + # type: downstream + + #- label: commitment_bank + # type: downstream + + #- label: sst2 + # type: downstream + + - label: commonsense_qa + type: downstream + + - label: social_iqa + type: downstream + + # Doesn't work from cache. + # - label: basic_arithmetic + # type: downstream + + - label: mmlu_stem_var + type: downstream + + - label: mmlu_humanities_var + type: downstream + + - label: mmlu_social_sciences_var + type: downstream + + - label: mmlu_other_var + type: downstream + + - label: mmlu_stem_mc_5shot + type: downstream + + - label: mmlu_humanities_mc_5shot + type: downstream + + - label: mmlu_social_sciences_mc_5shot + type: downstream + + - label: mmlu_other_mc_5shot + type: downstream + + - label: mmlu_stem_mc_5shot_test + type: downstream + + - label: mmlu_humanities_mc_5shot_test + type: downstream + + - label: mmlu_social_sciences_mc_5shot_test + type: downstream + + - label: mmlu_other_mc_5shot_test + type: downstream + + - label: basic_arithmetic + type: downstream + + - label: trivia_qa_wiki_ppl + type: downstream + + - label: natural_qs_open_ppl + type: downstream + + - label: arc_easy_ppl + type: downstream + +data: + pad_direction: right + num_workers: 32 + drop_last: true + pin_memory: true + prefetch_factor: 8 + persistent_workers: true + timeout: 0 + instance_filter: + repetition_max_period: 13 + repetition_min_period: 1 + repetition_max_count: 32 + paths: + ######### NON WEB DATA ######### + 
# ~> GUTENBERG BOOKS (5.256 GT) + - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-2-00000.npy + # ~> PES2O STEM PAPERS (57.21 GT) + - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy + - 
s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy + # ~> WIKIPEDIA & WIKIBOOKS (3.689 GT), repeated twice to up-sample + - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy + # MEGAWIKA v1 (4.6 GT) + - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy + - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy + - 
s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-01-00001.npy + - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy + - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-02-00001.npy + - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy + - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-03-00001.npy + - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy + - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-04-00001.npy + - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy + - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-05-00001.npy + - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy + - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-06-00001.npy + - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy + - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-07-00001.npy + - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy + - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-08-00001.npy + - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy + - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-09-00001.npy + - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy + - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-10-00001.npy + - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy + - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-11-00001.npy + - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy + - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-12-00001.npy + - 
s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy + - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-13-00001.npy + - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy + - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-14-00001.npy + - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy + - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy + - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-16-00001.npy + - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy + - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-17-00001.npy + - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy + - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-18-00001.npy + - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy + - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-19-00001.npy + - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy + - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-20-00001.npy + - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy + - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-21-00001.npy + - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy + - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-22-00001.npy + - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy + - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-23-00001.npy + - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-23-00002.npy + - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy + - 
s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-24-00001.npy + - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy + - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-25-00001.npy + - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy + - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-26-00001.npy + - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy + - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-27-00001.npy + - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy + - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy + - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy + - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-30-00001.npy + - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy + - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-31-00001.npy + # ~> REDPAJAMA STACK-EXCHANGE (19.63 GT) + - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy + - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy + - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy + - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy + - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy + - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy + - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy + - 
s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy + - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy + - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy + - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy + - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy + - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy + - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy + - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy + - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy + - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy + - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy + - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy + - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy + - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy + - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy + - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy + - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy + - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy + - 
s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy + # ~> REDPAJAMA ARXIV (27.97 GT) + - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy + - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy + - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy + - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy + - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy + - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy + - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy + - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy + - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy + - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy + - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy + - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy + - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy + - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy + - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy + - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy + - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy + - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy + - 
s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy + - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy + - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy + - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy + - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy + - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy + - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy + - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy + - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy + - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy + - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy + - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy + - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy + - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy + # ~> PROOFPILE2 ALGEBRAIC STACK (12.623 GT) + - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy + - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy + - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy + - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy + - 
s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy + - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy + - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy + - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy + - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy + - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy + - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy + - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy + - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy + - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy + - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy + - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy + # ~> PROOFPILE2 OPENWEBMATH (12.734 GT) + - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy + - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy + - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy + - 
s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy + - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy + - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy + - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy + - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy + - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy + - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy + - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy + - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy + - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy + # ~> TULU FLAN V1 (16.5 GT v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline) + - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-32-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-33-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-34-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-35-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-36-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-37-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-38-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-39-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-40-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-41-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-42-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-43-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-44-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-45-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-46-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-47-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-48-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-49-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-50-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-51-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-52-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-53-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-54-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-55-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-56-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-57-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-58-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-59-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-60-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-61-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-62-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-63-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-64-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-65-00000.npy + # ~> CC NEWS (14.3 GT) + - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy + - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-0-00001.npy + - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-0-00002.npy + - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-0-00003.npy + - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy + - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-1-00001.npy + - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-1-00002.npy + - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-1-00003.npy + - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-2-00000.npy + - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-2-00001.npy + - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-2-00002.npy + - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-2-00003.npy + - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-3-00000.npy + - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-3-00001.npy + - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-3-00002.npy + - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-3-00003.npy + - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-4-00000.npy + - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-4-00001.npy + - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-4-00002.npy + - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-4-00003.npy + - 
s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-5-00000.npy + - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-5-00001.npy + - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-6-00000.npy + - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-6-00001.npy + - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-6-00002.npy + - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-6-00003.npy + - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-7-00000.npy + - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-7-00001.npy + - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-7-00002.npy + - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-7-00003.npy + - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-8-00000.npy + - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-8-00001.npy + - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-8-00002.npy + #################################### + ######### CODE ######### + # ~> STARCODER (263.775 GT) + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-00-00001.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-03-00001.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy + - 
s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-04-00001.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-05-00001.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-06-00001.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-07-00001.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-08-00001.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-09-00001.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-10-00001.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-11-00001.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-12-00001.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy + - 
s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-13-00001.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-14-00001.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-15-00001.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-16-00001.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-17-00001.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-18-00001.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-19-00001.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-20-00001.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-21-00001.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy + - 
s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-22-00001.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-23-00001.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-24-00001.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-25-00001.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-26-00001.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-27-00001.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-30-00001.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-31-00001.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-32-00000.npy + - 
s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-32-00001.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-33-00000.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-33-00001.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-34-00000.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-34-00001.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-35-00000.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-35-00001.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-36-00000.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-36-00001.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-37-00000.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-37-00001.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-38-00000.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-38-00001.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-39-00000.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-39-00001.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-40-00000.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-40-00001.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-41-00000.npy + - 
s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-41-00001.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-42-00000.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-42-00001.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-43-00000.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-43-00001.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-44-00000.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-44-00001.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-45-00000.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-46-00000.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-46-00001.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-47-00000.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-47-00001.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-48-00000.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-48-00001.npy + #################################### + ######### WEB HIGH QUALITY ######### + # ~> C4 (138.4 GT) + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy + - 
s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-009-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-010-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-011-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-012-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-013-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-014-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-015-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-016-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-017-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-018-00000.npy + - 
s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-019-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-020-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-021-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-022-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-023-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-024-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-025-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-026-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-027-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-028-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-029-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-030-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-031-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-032-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-033-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-034-00000.npy + - 
s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-035-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-036-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-037-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-038-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-039-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-040-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-041-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-042-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-043-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-044-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-045-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-046-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-047-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-048-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-049-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-050-00000.npy + - 
s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-051-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-052-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-053-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-054-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-055-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-056-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-057-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-058-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-059-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-060-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-061-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-062-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-063-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-064-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-065-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-066-00000.npy + - 
s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-067-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-068-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-069-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-070-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-071-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-072-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-073-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-074-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-075-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-076-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-077-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-078-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-079-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-080-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-081-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-082-00000.npy + - 
s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-083-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-084-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-085-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-086-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-087-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-088-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-089-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-090-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-091-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-092-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-093-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-094-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-095-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-096-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-097-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-098-00000.npy + - 
s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-099-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-100-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-101-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-102-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-103-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-104-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-105-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-106-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-107-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-108-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-109-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-110-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-111-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-112-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-113-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-114-00000.npy + - 
s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-115-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-116-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-117-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-118-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-119-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-120-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-121-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-122-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-123-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-124-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-125-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-126-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-127-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-128-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-129-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-130-00000.npy + - 
s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-131-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-132-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-133-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-134-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-135-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-136-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-137-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-138-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-139-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-140-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-141-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-142-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-143-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-144-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-145-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-146-00000.npy + - 
s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-147-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-148-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-149-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-150-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-151-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-152-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-153-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-154-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-155-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-156-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-157-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-158-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-159-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-160-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-161-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-162-00000.npy + - 
s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-163-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-164-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-165-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-166-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-167-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-168-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-169-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-170-00000.npy + # ~> REDDIT (79.9 GT) + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy + - 
s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy + - 
s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-32-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-33-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-34-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-35-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-36-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-37-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-38-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-39-00000.npy + - 
s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-40-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-41-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-42-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-43-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-44-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-45-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-46-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-47-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-48-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-49-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-50-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-51-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-52-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-53-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-54-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-55-00000.npy + - 
s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-56-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-57-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-58-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-59-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-60-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-61-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-62-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-63-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-64-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-65-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-66-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-67-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-68-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-69-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-70-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-71-00000.npy + - 
s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-72-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-73-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-74-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-75-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-76-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-77-00000.npy + # ~> FALCON (547.341 GT) + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy + - 
s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-009-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-010-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-011-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-012-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-013-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-014-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-015-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-016-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-017-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-018-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-019-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-020-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-021-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-022-00000.npy + - 
s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-023-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-024-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-025-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-026-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-027-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-028-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-029-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-030-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-031-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-032-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-033-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-034-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-035-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-036-00000.npy + - 
s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-037-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-038-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-039-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-040-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-041-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-042-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-043-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-044-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-045-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-046-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-047-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-048-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-049-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-050-00000.npy + - 
s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-051-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-052-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-053-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-054-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-055-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-056-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-057-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-058-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-059-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-060-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-061-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-062-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-063-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-064-00000.npy + - 
s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-065-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-066-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-067-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-068-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-069-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-070-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-071-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-072-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-073-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-074-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-075-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-076-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-077-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-078-00000.npy + - 
s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-079-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-080-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-081-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-082-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-083-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-084-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-085-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-086-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-087-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-088-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-089-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-090-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-091-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-092-00000.npy + - 
s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-093-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-094-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-095-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-096-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-097-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-098-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-099-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-100-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-101-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-102-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-103-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-104-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-105-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-106-00000.npy + - 
s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-107-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-108-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-109-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-110-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-111-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-112-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-113-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-114-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-115-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-116-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-117-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-118-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-119-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-120-00000.npy + - 
s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-121-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-122-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-123-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-124-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-125-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-126-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-127-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-128-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-129-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-130-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-131-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-132-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-133-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-134-00000.npy + - 
s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-135-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-136-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-137-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-138-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-139-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-140-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-141-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-142-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-143-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-144-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-145-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-146-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-147-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-148-00000.npy + - 
s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-149-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-150-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-151-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-152-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-153-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-154-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-155-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-156-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-157-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-158-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-159-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-160-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-161-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-162-00000.npy + - 
s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-163-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-164-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-165-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-166-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-167-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-168-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-169-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-170-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-171-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-172-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-173-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-174-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-175-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-176-00000.npy + - 
s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-177-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-178-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-179-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-180-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-181-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-182-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-183-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-184-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-185-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-186-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-187-00000.npy + #################################### + ######### WEB REST ######### + # ~> DOLMA CC HEAD 50% (178.4 GT) + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy + - 
s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-006-00001.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-009-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-010-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-011-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-012-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-013-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-014-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-015-00000.npy + - 
s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-016-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-017-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-018-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-019-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-020-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-021-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-022-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-023-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-024-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-025-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-026-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-027-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-028-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-029-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-030-00000.npy + - 
s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-031-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-032-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-033-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-034-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-035-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-036-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-037-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-038-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-039-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-040-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-041-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-042-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-043-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-044-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-045-00000.npy + - 
s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-046-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-047-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-048-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-049-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-050-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-051-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-052-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-053-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-054-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-055-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-056-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-057-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-058-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-059-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-060-00000.npy + - 
s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-061-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-062-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-063-00000.npy + # ~> DOLMA CC MIDDLE 33% (242.05 GT) + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-009-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-010-00000.npy + - 
s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-011-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-012-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-013-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-014-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-015-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-016-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-017-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-018-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-019-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-020-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-021-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-022-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-023-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-024-00000.npy + - 
s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-025-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-026-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-027-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-028-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-029-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-030-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-031-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-032-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-033-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-034-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-035-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-036-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-037-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-038-00000.npy + - 
s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-039-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-040-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-041-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-042-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-043-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-044-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-045-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-046-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-047-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-048-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-049-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-050-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-051-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-052-00000.npy + - 
s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-053-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-054-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-055-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-056-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-057-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-058-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-059-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-060-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-061-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-062-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-063-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-064-00000.npy + # ~> DOLMA CC TAIL 33% (191.4 GT) + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy + - 
s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-009-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-010-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-011-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-012-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-013-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-014-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-015-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-016-00000.npy + - 
s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-017-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-018-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-019-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-020-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-021-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-022-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-023-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-024-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-025-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-026-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-027-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-028-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-029-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-030-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-031-00000.npy + - 
s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-032-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-033-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-034-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-035-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-036-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-037-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-038-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-039-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-040-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-041-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-042-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-043-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-044-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-045-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-046-00000.npy + - 
s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-047-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-048-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-049-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-050-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-051-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-052-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-053-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-054-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-055-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-056-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-057-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-058-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-059-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-060-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-061-00000.npy + - 
s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-062-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-063-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-064-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-065-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-066-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-067-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-068-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-069-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-070-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-071-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-072-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-073-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-074-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-075-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-076-00000.npy + - 
s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-077-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-078-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-079-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-080-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-081-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-082-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-083-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-084-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-085-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-086-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-087-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-088-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-089-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-090-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-091-00000.npy + - 
s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-092-00000.npy From 995247f4af8c93280d14afc789939ab9d04e5b68 Mon Sep 17 00:00:00 2001 From: Akshita Bhagia Date: Tue, 11 Jun 2024 17:39:25 -0700 Subject: [PATCH 04/40] 1b scripts --- scripts/beaker/llamaish1-normal-launch.sh | 33 +++++++++++++ scripts/beaker/llamaish1-normal.sh | 57 +++++++++++++++++++++++ scripts/beaker/llamaish7-normal.sh | 1 + 3 files changed, 91 insertions(+) create mode 100644 scripts/beaker/llamaish1-normal-launch.sh create mode 100644 scripts/beaker/llamaish1-normal.sh diff --git a/scripts/beaker/llamaish1-normal-launch.sh b/scripts/beaker/llamaish1-normal-launch.sh new file mode 100644 index 000000000..387d9576c --- /dev/null +++ b/scripts/beaker/llamaish1-normal-launch.sh @@ -0,0 +1,33 @@ +#!/usr/bin/env bash + +set -ex + +NUM_NODES=8 + +gantry run \ + --workspace ai2/OLMo-training \ + --task-name llamaish1-normal \ + --description "OLMo small - 1B - Llamaish Normal New" \ + --priority high \ + --preemptible \ + --beaker-image shanea/olmo-torch2.3-gantry \ + --cluster ai2/jupiter-cirrascale-2 \ + --gpus 8 \ + --replicas "${NUM_NODES}" \ + --leader-selection \ + --host-networking \ + --budget ai2/oe-training \ + --no-nfs \ + --propagate-failure \ + --synchronized-start-timeout 10m \ + --env LOG_FILTER_TYPE=local_rank0_only \ + --env OMP_NUM_THREADS=8 \ + --env OLMO_TASK=model \ + --env-secret WANDB_API_KEY=AKSHITAB_WANDB_API_KEY \ + --env-secret AWS_ACCESS_KEY_ID=AKSHITAB_AWS_ACCESS_KEY_ID \ + --env-secret AWS_SECRET_ACCESS_KEY=AKSHITAB_AWS_SECRET_ACCESS_KEY \ + --shared-memory 10GiB \ + --venv base \ + --yes \ + --timeout=-1 \ + -- /bin/bash -c "scripts/beaker/llamaish1-normal.sh \$BEAKER_LEADER_REPLICA_HOSTNAME ${NUM_NODES} \$BEAKER_REPLICA_RANK" diff --git a/scripts/beaker/llamaish1-normal.sh b/scripts/beaker/llamaish1-normal.sh new file mode 100644 index 000000000..f20038d22 --- /dev/null +++ 
b/scripts/beaker/llamaish1-normal.sh @@ -0,0 +1,57 @@ +#!/usr/bin/env bash +set -exuo pipefail +IFS=$'\n\t' + +BEAKER_LEADER_REPLICA_HOSTNAME=$1 +shift + +NUM_NODES=$1 +shift + +BEAKER_REPLICA_RANK=$1 +shift + +# Warm HF cache +mkdir -p /root/.cache +pushd /root/.cache +curl "https://storage.googleapis.com/dirkgr-public/huggingface_cache_v3.tar.gz" | tar --keep-newer-files -xzf - +popd +export HF_DATASETS_OFFLINE=1 + +export EXPERIMENT=llamaish1-normal-new + +torchrun \ + --nnodes ${NUM_NODES}:${NUM_NODES} \ + --nproc-per-node 8 \ + --rdzv_id=12347 \ + --rdzv_backend=static \ + --rdzv_endpoint=$BEAKER_LEADER_REPLICA_HOSTNAME:29400 \ + --node_rank=$BEAKER_REPLICA_RANK \ + --rdzv_conf="read_timeout=420" \ + scripts/train.py \ + configs/llamaish1-s3.yaml \ + --run_name=$EXPERIMENT \ + --wandb.name=$EXPERIMENT \ + --wandb.group=$EXPERIMENT \ + --model.flash_attention=true \ + --fsdp.wrapping_strategy=by_block_and_size \ + --fsdp.sharding_strategy=NO_SHARD \ + --gen1_gc_interval=null \ + --save_folder=runs/ \ + --activation_checkpointing=fine_grained \ + --fused_loss=true \ + --device_train_microbatch_size=4 \ + --global_train_batch_size=512 \ + --save_interval=250 \ + --eval_interval=250 \ + --optimizer.metrics_log_interval=1 \ + --save_overwrite \ + --model.init_fn=normal \ + --model.init_std=0.02 \ + --model.init_cutoff_factor=3 \ + --model.clip_qkv=null \ + --save_num_checkpoints_to_keep=3 \ + --scheduler.grad_clip_warmup_steps=null \ + --scheduler.t_warmup=2000 \ + --scheduler.units=steps + #'--load_path=${path.last_checkpoint:s3://ai2-llm/checkpoints/OLMo-small/llamaish1-normal-new/}' diff --git a/scripts/beaker/llamaish7-normal.sh b/scripts/beaker/llamaish7-normal.sh index 8b6b3bc3b..30699e84f 100644 --- a/scripts/beaker/llamaish7-normal.sh +++ b/scripts/beaker/llamaish7-normal.sh @@ -47,6 +47,7 @@ torchrun \ --save_overwrite \ --model.init_fn=normal \ --model.init_std=0.02 \ + --model.init_cutoff_factor=3 \ --model.clip_qkv=null \ 
--save_num_checkpoints_to_keep=3 \ --scheduler.units=steps \ From b71dff92be4d13796c24ec6615b81576d3692962 Mon Sep 17 00:00:00 2001 From: Akshita Bhagia Date: Tue, 11 Jun 2024 17:49:41 -0700 Subject: [PATCH 05/40] turn off fused_loss --- scripts/beaker/llamaish1-normal.sh | 2 +- scripts/beaker/llamaish7-normal.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/beaker/llamaish1-normal.sh b/scripts/beaker/llamaish1-normal.sh index f20038d22..3d6ed47b7 100644 --- a/scripts/beaker/llamaish1-normal.sh +++ b/scripts/beaker/llamaish1-normal.sh @@ -39,7 +39,7 @@ torchrun \ --gen1_gc_interval=null \ --save_folder=runs/ \ --activation_checkpointing=fine_grained \ - --fused_loss=true \ + --fused_loss=false \ --device_train_microbatch_size=4 \ --global_train_batch_size=512 \ --save_interval=250 \ diff --git a/scripts/beaker/llamaish7-normal.sh b/scripts/beaker/llamaish7-normal.sh index 30699e84f..eef86c18e 100644 --- a/scripts/beaker/llamaish7-normal.sh +++ b/scripts/beaker/llamaish7-normal.sh @@ -38,7 +38,7 @@ torchrun \ --fsdp.sharding_strategy=SHARD_GRAD_OP \ --save_folder=runs/ \ --activation_checkpointing=fine_grained \ - --fused_loss=true \ + --fused_loss=false \ --device_train_microbatch_size=2 \ --global_train_batch_size=1024 \ --save_interval=250 \ From 0de723455f87563bebe874f205642a613e28b7f4 Mon Sep 17 00:00:00 2001 From: Akshita Bhagia Date: Tue, 11 Jun 2024 17:54:58 -0700 Subject: [PATCH 06/40] fix name --- scripts/beaker/llamaish7-normal-launch.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/beaker/llamaish7-normal-launch.sh b/scripts/beaker/llamaish7-normal-launch.sh index de9bd286b..c3c30e0ce 100644 --- a/scripts/beaker/llamaish7-normal-launch.sh +++ b/scripts/beaker/llamaish7-normal-launch.sh @@ -6,7 +6,7 @@ NUM_NODES=64 gantry run \ --workspace ai2/OLMo-training \ - --task-name llamaish7-normal-qk-norm-reorder-zloss \ + --task-name llamaish7-normal \ --description "OLMo medium - 7B - Llamaish Normal" \ 
--priority urgent \ --preemptible \ From 75ae73fb35c328b5d02c773eaa3922c2d9d47c3a Mon Sep 17 00:00:00 2001 From: Akshita Bhagia Date: Tue, 11 Jun 2024 17:58:57 -0700 Subject: [PATCH 07/40] make executable --- scripts/beaker/llamaish1-normal-launch.sh | 0 scripts/beaker/llamaish1-normal.sh | 0 scripts/beaker/llamaish7-normal-launch.sh | 0 scripts/beaker/llamaish7-normal.sh | 0 4 files changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 scripts/beaker/llamaish1-normal-launch.sh mode change 100644 => 100755 scripts/beaker/llamaish1-normal.sh mode change 100644 => 100755 scripts/beaker/llamaish7-normal-launch.sh mode change 100644 => 100755 scripts/beaker/llamaish7-normal.sh diff --git a/scripts/beaker/llamaish1-normal-launch.sh b/scripts/beaker/llamaish1-normal-launch.sh old mode 100644 new mode 100755 diff --git a/scripts/beaker/llamaish1-normal.sh b/scripts/beaker/llamaish1-normal.sh old mode 100644 new mode 100755 diff --git a/scripts/beaker/llamaish7-normal-launch.sh b/scripts/beaker/llamaish7-normal-launch.sh old mode 100644 new mode 100755 diff --git a/scripts/beaker/llamaish7-normal.sh b/scripts/beaker/llamaish7-normal.sh old mode 100644 new mode 100755 From ed51f61079fb67e15749b4b3ca57bcaf9a32492a Mon Sep 17 00:00:00 2001 From: Akshita Bhagia Date: Tue, 11 Jun 2024 19:34:33 -0700 Subject: [PATCH 08/40] temporarily don't run new evals --- configs/llamaish1-s3.yaml | 16 ++++++++-------- configs/llamaish7-s3.yaml | 16 ++++++++-------- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/configs/llamaish1-s3.yaml b/configs/llamaish1-s3.yaml index d43668587..0e2947deb 100644 --- a/configs/llamaish1-s3.yaml +++ b/configs/llamaish1-s3.yaml @@ -213,17 +213,17 @@ evaluators: - label: mmlu_other_mc_5shot_test type: downstream - - label: basic_arithmetic - type: downstream + # - label: basic_arithmetic + # type: downstream - - label: trivia_qa_wiki_ppl - type: downstream + # - label: trivia_qa_wiki_ppl + # type: downstream - - label: 
natural_qs_open_ppl - type: downstream + # - label: natural_qs_open_ppl + # type: downstream - - label: arc_easy_ppl - type: downstream + # - label: arc_easy_ppl + # type: downstream data: pad_direction: right diff --git a/configs/llamaish7-s3.yaml b/configs/llamaish7-s3.yaml index ef7aef937..e928af92f 100644 --- a/configs/llamaish7-s3.yaml +++ b/configs/llamaish7-s3.yaml @@ -212,17 +212,17 @@ evaluators: - label: mmlu_other_mc_5shot_test type: downstream - - label: basic_arithmetic - type: downstream + # - label: basic_arithmetic + # type: downstream - - label: trivia_qa_wiki_ppl - type: downstream + # - label: trivia_qa_wiki_ppl + # type: downstream - - label: natural_qs_open_ppl - type: downstream + # - label: natural_qs_open_ppl + # type: downstream - - label: arc_easy_ppl - type: downstream + # - label: arc_easy_ppl + # type: downstream data: pad_direction: right From 3293cbbd666e0f5169124cbce88700dba9be2a77 Mon Sep 17 00:00:00 2001 From: Akshita Bhagia Date: Tue, 11 Jun 2024 19:49:43 -0700 Subject: [PATCH 09/40] switch to pete's torch2.3 image --- scripts/beaker/llamaish1-normal-launch.sh | 2 +- scripts/beaker/llamaish7-normal-launch.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/beaker/llamaish1-normal-launch.sh b/scripts/beaker/llamaish1-normal-launch.sh index 387d9576c..56f633583 100755 --- a/scripts/beaker/llamaish1-normal-launch.sh +++ b/scripts/beaker/llamaish1-normal-launch.sh @@ -10,7 +10,7 @@ gantry run \ --description "OLMo small - 1B - Llamaish Normal New" \ --priority high \ --preemptible \ - --beaker-image shanea/olmo-torch2.3-gantry \ + --beaker-image petew/olmo-torch23-gantry \ --cluster ai2/jupiter-cirrascale-2 \ --gpus 8 \ --replicas "${NUM_NODES}" \ diff --git a/scripts/beaker/llamaish7-normal-launch.sh b/scripts/beaker/llamaish7-normal-launch.sh index c3c30e0ce..4938615ac 100755 --- a/scripts/beaker/llamaish7-normal-launch.sh +++ b/scripts/beaker/llamaish7-normal-launch.sh @@ -10,7 +10,7 @@ gantry run \ 
--description "OLMo medium - 7B - Llamaish Normal" \ --priority urgent \ --preemptible \ - --beaker-image shanea/olmo-torch2.3-gantry \ + --beaker-image petew/olmo-torch23-gantry \ --cluster ai2/jupiter-cirrascale-2 \ --gpus 8 \ --replicas "${NUM_NODES}" \ From d77add5b91a3c29c0166a8d83e7536a836f8defe Mon Sep 17 00:00:00 2001 From: Akshita Bhagia Date: Tue, 11 Jun 2024 21:05:46 -0700 Subject: [PATCH 10/40] no clipping warmup --- scripts/beaker/llamaish1-normal.sh | 5 +++-- scripts/beaker/llamaish7-normal.sh | 2 ++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/scripts/beaker/llamaish1-normal.sh b/scripts/beaker/llamaish1-normal.sh index 3d6ed47b7..e74b8a868 100755 --- a/scripts/beaker/llamaish1-normal.sh +++ b/scripts/beaker/llamaish1-normal.sh @@ -51,7 +51,8 @@ torchrun \ --model.init_cutoff_factor=3 \ --model.clip_qkv=null \ --save_num_checkpoints_to_keep=3 \ + --scheduler.warmup_min_lr=0 \ --scheduler.grad_clip_warmup_steps=null \ - --scheduler.t_warmup=2000 \ - --scheduler.units=steps + --scheduler.units=steps \ + --scheduler.t_warmup=2000 #'--load_path=${path.last_checkpoint:s3://ai2-llm/checkpoints/OLMo-small/llamaish1-normal-new/}' diff --git a/scripts/beaker/llamaish7-normal.sh b/scripts/beaker/llamaish7-normal.sh index eef86c18e..dd956702f 100755 --- a/scripts/beaker/llamaish7-normal.sh +++ b/scripts/beaker/llamaish7-normal.sh @@ -50,6 +50,8 @@ torchrun \ --model.init_cutoff_factor=3 \ --model.clip_qkv=null \ --save_num_checkpoints_to_keep=3 \ + --scheduler.warmup_min_lr=0 \ + --scheduler.grad_clip_warmup_steps=null \ --scheduler.units=steps \ --scheduler.t_warmup=2000 # '--load_path=${path.last_checkpoint:s3://ai2-llm/checkpoints/OLMo-medium/llamaish7-normal/}' From eff21eeca6e1eed3558b35508915c9b646063ef9 Mon Sep 17 00:00:00 2001 From: Akshita Bhagia Date: Tue, 11 Jun 2024 21:39:35 -0700 Subject: [PATCH 11/40] wait longer --- scripts/beaker/llamaish1-normal-launch.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/scripts/beaker/llamaish1-normal-launch.sh b/scripts/beaker/llamaish1-normal-launch.sh index 56f633583..b913bc40e 100755 --- a/scripts/beaker/llamaish1-normal-launch.sh +++ b/scripts/beaker/llamaish1-normal-launch.sh @@ -19,7 +19,7 @@ gantry run \ --budget ai2/oe-training \ --no-nfs \ --propagate-failure \ - --synchronized-start-timeout 10m \ + --synchronized-start-timeout 20m \ --env LOG_FILTER_TYPE=local_rank0_only \ --env OMP_NUM_THREADS=8 \ --env OLMO_TASK=model \ From c1075ce39889df094f7a9dc39036da7497253607 Mon Sep 17 00:00:00 2001 From: Akshita Bhagia Date: Tue, 11 Jun 2024 21:44:25 -0700 Subject: [PATCH 12/40] priority --- scripts/beaker/llamaish1-normal-launch.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/beaker/llamaish1-normal-launch.sh b/scripts/beaker/llamaish1-normal-launch.sh index b913bc40e..86c33dd53 100755 --- a/scripts/beaker/llamaish1-normal-launch.sh +++ b/scripts/beaker/llamaish1-normal-launch.sh @@ -8,7 +8,7 @@ gantry run \ --workspace ai2/OLMo-training \ --task-name llamaish1-normal \ --description "OLMo small - 1B - Llamaish Normal New" \ - --priority high \ + --priority urgent \ --preemptible \ --beaker-image petew/olmo-torch23-gantry \ --cluster ai2/jupiter-cirrascale-2 \ From 90282629fe3dc161c61faeda24387cfc10e50099 Mon Sep 17 00:00:00 2001 From: Dustin Date: Tue, 11 Jun 2024 23:28:40 -0700 Subject: [PATCH 13/40] config for llamaish1 base run with amber data --- configs/llamaish1-s3.yaml | 1442 +++++++++------------------------ configs/llm-360-amber-s3.yaml | 516 ++++++++++++ 2 files changed, 887 insertions(+), 1071 deletions(-) create mode 100644 configs/llm-360-amber-s3.yaml diff --git a/configs/llamaish1-s3.yaml b/configs/llamaish1-s3.yaml index 0e2947deb..06f522330 100644 --- a/configs/llamaish1-s3.yaml +++ b/configs/llamaish1-s3.yaml @@ -1,11 +1,11 @@ -run_name: llamaish1-001 +run_name: llamaish1-amber-data-001 seed: 6198 dry_run: false wandb: name: ${run_name} project: olmo-small - group: llamaish1 + 
group: llamaish1-amber-data model: d_model: 2048 @@ -29,14 +29,14 @@ model: activation_type: swiglu residual_dropout: 0.0 embedding_dropout: 0.0 - max_sequence_length: 4096 - vocab_size: 50280 - embedding_size: 50304 - eos_token_id: 0 - pad_token_id: 1 + max_sequence_length: 2048 + vocab_size: 32000 + embedding_size: 32000 + eos_token_id: 2 + pad_token_id: 2 init_device: meta - init_fn: full_megatron - init_std: 0.006 + init_fn: normal + init_std: 0.02 init_cutoff_factor: 3 compile: null @@ -63,7 +63,7 @@ scheduler: warmup_min_lr: 0 tokenizer: - identifier: tokenizers/allenai_gpt-neox-olmo-dolma-v1_5.json + identifier: tokenizers/nousresearch-llama-2-7b-hf.json truncate_direction: right save_folder: runs/${run_name} @@ -233,1065 +233,365 @@ data: prefetch_factor: 8 persistent_workers: true timeout: 0 - instance_filter: - repetition_max_period: 13 - repetition_min_period: 1 - repetition_max_count: 32 paths: - ######### NON WEB DATA ######### - # ~> GUTENBERG BOOKS (5.256 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-2-00000.npy - # ~> PES2O STEM PAPERS (57.21 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - 
s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - 
s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - # ~> WIKIPEDIA & WIKIBOOKS (3.689 GT), repeated twice to up-sample - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - # MEGAWIKA v1 (4.6 GT) - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-01-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-02-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-03-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-04-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-05-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-06-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - 
s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-07-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-08-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-09-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-10-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-11-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-12-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-13-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-14-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-16-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-17-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-18-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - 
s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-19-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-20-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-21-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-22-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-23-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-23-00002.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-24-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-25-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-26-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-27-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-30-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy - - 
s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-31-00001.npy - # ~> REDPAJAMA STACK-EXCHANGE (19.63 GT) - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - 
s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - # ~> REDPAJAMA ARXIV (27.97 GT) - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - 
s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy - - 
s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy - # ~> PROOFPILE2 ALGEBRAIC STACK (12.623 GT) - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - 
s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - # ~> PROOFPILE2 OPENWEBMATH (12.734 GT) - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - 
s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - # ~> TULU FLAN V1 (16.5 G v2-decontaminated-60M-shots_all-upweight_1-dialog_true-sep_newline) - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - 
s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - 
s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-32-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-33-00000.npy - - 
s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-34-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-35-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-36-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-37-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-38-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-39-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-40-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-41-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-42-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-43-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-44-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-45-00000.npy - - 
s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-46-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-47-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-48-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-49-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-50-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-51-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-52-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-53-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-54-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-55-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-56-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-57-00000.npy - - 
s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-58-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-59-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-60-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-61-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-62-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-63-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-64-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-65-00000.npy - # ~> CC NEWS (14.3 GT) - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-0-00001.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-0-00002.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-0-00003.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-1-00001.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-1-00002.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-1-00003.npy - - 
s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-2-00000.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-2-00001.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-2-00002.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-2-00003.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-3-00000.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-3-00001.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-3-00002.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-3-00003.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-4-00000.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-4-00001.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-4-00002.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-4-00003.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-5-00000.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-5-00001.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-6-00000.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-6-00001.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-6-00002.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-6-00003.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-7-00000.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-7-00001.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-7-00002.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-7-00003.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-8-00000.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-8-00001.npy - - 
s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-8-00002.npy - #################################### - ######### CODE ######### - # ~> STARCODER (263.775 GT) - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-00-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-03-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-04-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-05-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-06-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-07-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-08-00001.npy - - 
s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-09-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-10-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-11-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-12-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-13-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-14-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-15-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-16-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-17-00001.npy - - 
s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-18-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-19-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-20-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-21-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-22-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-23-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-24-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-25-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-26-00001.npy - - 
s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-27-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-30-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-31-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-32-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-32-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-33-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-33-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-34-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-34-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-35-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-35-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-36-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-36-00001.npy - - 
s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-37-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-37-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-38-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-38-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-39-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-39-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-40-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-40-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-41-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-41-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-42-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-42-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-43-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-43-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-44-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-44-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-45-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-46-00000.npy - - 
s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-46-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-47-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-47-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-48-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-48-00001.npy - #################################### - ######### WEB HIGH QUALITY ######### - # ~> C4 (138.4 GT) - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-009-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-010-00000.npy - - 
s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-011-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-012-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-013-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-014-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-015-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-016-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-017-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-018-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-019-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-020-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-021-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-022-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-023-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-024-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-025-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-026-00000.npy - - 
s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-027-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-028-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-029-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-030-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-031-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-032-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-033-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-034-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-035-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-036-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-037-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-038-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-039-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-040-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-041-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-042-00000.npy - - 
s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-043-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-044-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-045-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-046-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-047-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-048-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-049-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-050-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-051-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-052-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-053-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-054-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-055-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-056-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-057-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-058-00000.npy - - 
s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-059-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-060-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-061-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-062-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-063-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-064-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-065-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-066-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-067-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-068-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-069-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-070-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-071-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-072-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-073-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-074-00000.npy - - 
s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-075-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-076-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-077-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-078-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-079-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-080-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-081-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-082-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-083-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-084-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-085-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-086-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-087-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-088-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-089-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-090-00000.npy - - 
s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-091-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-092-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-093-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-094-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-095-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-096-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-097-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-098-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-099-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-100-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-101-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-102-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-103-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-104-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-105-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-106-00000.npy - - 
s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-107-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-108-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-109-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-110-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-111-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-112-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-113-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-114-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-115-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-116-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-117-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-118-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-119-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-120-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-121-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-122-00000.npy - - 
s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-123-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-124-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-125-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-126-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-127-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-128-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-129-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-130-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-131-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-132-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-133-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-134-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-135-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-136-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-137-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-138-00000.npy - - 
s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-139-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-140-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-141-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-142-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-143-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-144-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-145-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-146-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-147-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-148-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-149-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-150-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-151-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-152-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-153-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-154-00000.npy - - 
s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-155-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-156-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-157-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-158-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-159-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-160-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-161-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-162-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-163-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-164-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-165-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-166-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-167-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-168-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-169-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-170-00000.npy - # ~> REDDIT (79.9 GT) - - 
s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - 
s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy - - 
s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-32-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-33-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-34-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-35-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-36-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-37-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-38-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-39-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-40-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-41-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-42-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-43-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-44-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-45-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-46-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-47-00000.npy - - 
s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-48-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-49-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-50-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-51-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-52-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-53-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-54-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-55-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-56-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-57-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-58-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-59-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-60-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-61-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-62-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-63-00000.npy - - 
s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-64-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-65-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-66-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-67-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-68-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-69-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-70-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-71-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-72-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-73-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-74-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-75-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-76-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-77-00000.npy - # ~> FALCON (547.341 GT) - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - 
s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-009-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-010-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-011-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-012-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-013-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-014-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-015-00000.npy - - 
s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-016-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-017-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-018-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-019-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-020-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-021-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-022-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-023-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-024-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-025-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-026-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-027-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-028-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-029-00000.npy - - 
s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-030-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-031-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-032-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-033-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-034-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-035-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-036-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-037-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-038-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-039-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-040-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-041-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-042-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-043-00000.npy - - 
s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-044-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-045-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-046-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-047-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-048-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-049-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-050-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-051-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-052-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-053-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-054-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-055-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-056-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-057-00000.npy - - 
s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-058-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-059-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-060-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-061-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-062-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-063-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-064-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-065-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-066-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-067-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-068-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-069-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-070-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-071-00000.npy - - 
s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-072-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-073-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-074-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-075-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-076-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-077-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-078-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-079-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-080-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-081-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-082-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-083-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-084-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-085-00000.npy - - 
s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-086-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-087-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-088-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-089-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-090-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-091-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-092-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-093-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-094-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-095-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-096-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-097-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-098-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-099-00000.npy - - 
s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-100-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-101-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-102-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-103-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-104-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-105-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-106-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-107-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-108-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-109-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-110-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-111-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-112-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-113-00000.npy - - 
s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-114-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-115-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-116-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-117-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-118-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-119-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-120-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-121-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-122-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-123-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-124-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-125-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-126-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-127-00000.npy - - 
s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-128-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-129-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-130-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-131-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-132-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-133-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-134-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-135-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-136-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-137-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-138-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-139-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-140-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-141-00000.npy - - 
s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-142-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-143-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-144-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-145-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-146-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-147-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-148-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-149-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-150-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-151-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-152-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-153-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-154-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-155-00000.npy - - 
s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-156-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-157-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-158-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-159-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-160-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-161-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-162-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-163-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-164-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-165-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-166-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-167-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-168-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-169-00000.npy - - 
s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-170-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-171-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-172-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-173-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-174-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-175-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-176-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-177-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-178-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-179-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-180-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-181-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-182-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-183-00000.npy - - 
s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-184-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-185-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-186-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-187-00000.npy - #################################### - ######### WEB REST ######### - # ~> DOLMA CC HEAD 50% (178.4 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-006-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy - - 
s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-009-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-010-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-011-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-012-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-013-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-014-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-015-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-016-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-017-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-018-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-019-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-020-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-021-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-022-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-023-00000.npy - - 
s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-024-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-025-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-026-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-027-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-028-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-029-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-030-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-031-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-032-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-033-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-034-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-035-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-036-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-037-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-038-00000.npy - - 
s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-039-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-040-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-041-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-042-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-043-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-044-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-045-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-046-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-047-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-048-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-049-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-050-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-051-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-052-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-053-00000.npy - - 
s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-054-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-055-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-056-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-057-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-058-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-059-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-060-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-061-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-062-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-063-00000.npy - # ~> DOLMA CC MIDDLE 33% (242.05 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy - - 
s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-009-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-010-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-011-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-012-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-013-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-014-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-015-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-016-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-017-00000.npy - - 
s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-018-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-019-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-020-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-021-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-022-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-023-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-024-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-025-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-026-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-027-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-028-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-029-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-030-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-031-00000.npy - - 
s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-032-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-033-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-034-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-035-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-036-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-037-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-038-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-039-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-040-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-041-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-042-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-043-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-044-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-045-00000.npy - - 
s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-046-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-047-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-048-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-049-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-050-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-051-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-052-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-053-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-054-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-055-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-056-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-057-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-058-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-059-00000.npy - - 
s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-060-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-061-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-062-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-063-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-064-00000.npy - # ~> DOLMA CC TAIL 33% (191.4 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy - - 
s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-009-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-010-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-011-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-012-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-013-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-014-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-015-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-016-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-017-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-018-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-019-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-020-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-021-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-022-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-023-00000.npy - - 
s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-024-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-025-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-026-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-027-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-028-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-029-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-030-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-031-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-032-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-033-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-034-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-035-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-036-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-037-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-038-00000.npy - - 
s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-039-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-040-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-041-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-042-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-043-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-044-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-045-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-046-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-047-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-048-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-049-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-050-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-051-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-052-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-053-00000.npy - - 
s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-054-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-055-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-056-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-057-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-058-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-059-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-060-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-061-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-062-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-063-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-064-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-065-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-066-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-067-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-068-00000.npy - - 
s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-069-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-070-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-071-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-072-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-073-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-074-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-075-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-076-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-077-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-078-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-079-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-080-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-081-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-082-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-083-00000.npy - - 
s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-084-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-085-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-086-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-087-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-088-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-089-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-090-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-091-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-092-00000.npy + ######### Amber ######### + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_000.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_001.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_002.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_003.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_004.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_005.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_006.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_007.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_008.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_009.npy + - 
s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_010.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_011.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_012.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_013.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_014.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_015.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_016.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_017.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_018.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_019.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_020.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_021.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_022.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_023.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_024.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_025.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_026.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_027.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_028.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_029.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_030.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_031.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_032.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_033.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_034.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_035.npy + - 
s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_036.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_037.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_038.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_039.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_040.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_041.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_042.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_043.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_044.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_045.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_046.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_047.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_048.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_049.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_050.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_051.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_052.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_053.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_054.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_055.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_056.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_057.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_058.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_059.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_060.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_061.npy + - 
s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_062.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_063.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_064.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_065.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_066.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_067.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_068.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_069.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_070.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_071.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_072.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_073.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_074.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_075.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_076.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_077.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_078.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_079.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_080.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_081.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_082.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_083.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_084.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_085.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_086.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_087.npy + - 
s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_088.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_089.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_090.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_091.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_092.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_093.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_094.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_095.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_096.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_097.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_098.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_099.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_100.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_101.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_102.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_103.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_104.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_105.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_106.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_107.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_108.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_109.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_110.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_111.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_112.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_113.npy + - 
s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_114.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_115.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_116.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_117.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_118.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_119.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_120.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_121.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_122.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_123.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_124.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_125.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_126.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_127.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_128.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_129.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_130.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_131.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_132.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_133.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_134.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_135.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_136.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_137.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_138.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_139.npy + - 
s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_140.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_141.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_142.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_143.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_144.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_145.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_146.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_147.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_148.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_149.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_150.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_151.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_152.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_153.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_154.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_155.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_156.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_157.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_158.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_159.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_160.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_161.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_162.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_163.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_164.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_165.npy + - 
s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_166.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_167.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_168.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_169.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_170.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_171.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_172.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_173.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_174.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_175.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_176.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_177.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_178.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_179.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_180.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_181.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_182.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_183.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_184.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_185.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_186.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_187.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_188.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_189.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_190.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_191.npy + - 
s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_192.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_193.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_194.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_195.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_196.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_197.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_198.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_199.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_200.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_201.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_202.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_203.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_204.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_205.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_206.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_207.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_208.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_209.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_210.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_211.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_212.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_213.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_214.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_215.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_216.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_217.npy + - 
s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_218.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_219.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_220.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_221.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_222.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_223.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_224.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_225.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_226.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_227.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_228.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_229.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_230.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_231.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_232.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_233.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_234.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_235.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_236.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_237.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_238.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_239.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_240.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_241.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_242.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_243.npy + - 
s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_244.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_245.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_246.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_247.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_248.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_249.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_250.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_251.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_252.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_253.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_254.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_255.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_256.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_257.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_258.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_259.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_260.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_261.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_262.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_263.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_264.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_265.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_266.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_267.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_268.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_269.npy + - 
s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_270.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_271.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_272.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_273.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_274.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_275.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_276.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_277.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_278.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_279.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_280.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_281.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_282.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_283.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_284.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_285.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_286.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_287.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_288.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_289.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_290.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_291.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_292.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_293.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_294.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_295.npy + - 
s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_296.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_297.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_298.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_299.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_300.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_301.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_302.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_303.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_304.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_305.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_306.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_307.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_308.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_309.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_310.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_311.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_312.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_313.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_314.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_315.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_316.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_317.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_318.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_319.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_320.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_321.npy + - 
s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_322.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_323.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_324.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_325.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_326.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_327.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_328.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_329.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_330.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_331.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_332.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_333.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_334.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_335.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_336.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_337.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_338.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_339.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_340.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_341.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_342.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_343.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_344.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_345.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_346.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_347.npy + - 
s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_348.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_349.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_350.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_351.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_352.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_353.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_354.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_355.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_356.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_357.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_358.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_359.npy \ No newline at end of file diff --git a/configs/llm-360-amber-s3.yaml b/configs/llm-360-amber-s3.yaml new file mode 100644 index 000000000..5bcb3a634 --- /dev/null +++ b/configs/llm-360-amber-s3.yaml @@ -0,0 +1,516 @@ +run_name: llm-306-amber-data-repro-db-normal-init-2 +seed: 6198 +dry_run: false + +wandb: + name: ${run_name} + project: olmo-medium + group: llm-306-amber-data-repro-match-dirk-baseline-normal-init + +model: + d_model: 4096 + n_heads: 32 + n_kv_heads: null + clip_qkv: 8.0 + n_layers: 32 + mlp_ratio: 4 + mlp_hidden_size: 22016 + activation_type: swiglu + block_type: sequential + block_group_size: 1 + alibi: false + alibi_bias_max: 8.0 + rope: true + rope_full_precision: true + flash_attention: true + attention_dropout: 0.0 + multi_query_attention: null + attention_layer_norm: false + residual_dropout: 0.0 + embedding_dropout: 0.0 + layer_norm_type: default + layer_norm_with_affine: false + attention_layer_norm_with_affine: false + max_sequence_length: 2048 + include_bias: false + bias_for_layer_norm: false + scale_logits: false + vocab_size: 32000 + 
embedding_size: 32000 + weight_tying: false + eos_token_id: 2 + pad_token_id: 2 + init_device: meta + init_fn: normal + init_std: 0.02 + init_cutoff_factor: null + +compile: null + +optimizer: + name: adamw + learning_rate: 3.0e-4 + weight_decay: 0.1 + decay_norm_and_bias: true + decay_embeddings: false + betas: + - 0.9 + - 0.95 + metrics_log_interval: 1 + +scheduler: + name: cosine_with_warmup + units: tokens + warmup_min_lr: 0 + t_warmup: 20971520000 + t_max: 3e12 + alpha_f: 0.1 + grad_clip_warmup_steps: null + grad_clip_warmup_factor: 5 + +tokenizer: + identifier: tokenizers/nousresearch-llama-2-7b-hf.json + truncate_direction: right + +save_folder: runs/${run_name} +remote_save_folder: s3://ai2-llm/checkpoints/OLMo-medium/${run_name} +save_overwrite: false + +save_interval: 1000 +save_interval_ephemeral: null +save_num_checkpoints_to_keep: -1 +sharded_checkpointer: olmo_core + +save_interval_unsharded: null +save_num_unsharded_checkpoints_to_keep: -1 + +load_path: null + +max_duration: 2ep +global_train_batch_size: 1024 +device_train_microbatch_size: 2 + +precision: amp_bf16 + +fsdp: + use_orig_params: true + sharding_strategy: ShardingStrategy.SHARD_GRAD_OP + wrapping_strategy: by_block_and_size + precision: mixed + +max_grad_norm: 1.0 +max_grad_norm_ratio: null + +speed_monitor: + window_size: 1 + +eval_interval: 1000 +eval_subset_num_batches: -1 +device_eval_batch_size: ${device_train_microbatch_size} +evaluators: + # - label: all-small-ppl-validation + # data: + # num_workers: 0 + # drop_last: true + # datasets: + # c4_en-validation: + # - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/c4_en/val/part-0-00000.npy + # dolma_books-validation: + # - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_books/val/part-0-00000.npy + # dolma_common-crawl-validation: + # - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_common-crawl/val/part-0-00000.npy + # dolma_pes2o-validation: + # - 
s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_pes2o/val/part-0-00000.npy + # dolma_reddit-validation: + # - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_reddit/val/part-0-00000.npy + # dolma_stack-validation: + # - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_stack/val/part-0-00000.npy + # dolma_wiki-validation: + # - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_wiki/val/part-0-00000.npy + # ice-validation: + # - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/ice/val/part-0-00000.npy + # m2d2_s2orc-validation: + # - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/m2d2_s2orc/val/part-0-00000.npy + # pile-validation: + # - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/pile/val/part-0-00000.npy + # wikitext_103-validation: + # - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/wikitext_103/val/part-0-00000.npy + + ########################## + # Downstream evaluations # + ########################## + + - label: hellaswag + type: downstream + + +data: + pad_direction: right + num_workers: 32 + drop_last: true + pin_memory: true + prefetch_factor: 8 + persistent_workers: true + timeout: 0 + paths: + ######### Amber ######### + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_000.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_001.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_002.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_003.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_004.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_005.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_006.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_007.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_008.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_009.npy + - 
s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_010.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_011.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_012.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_013.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_014.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_015.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_016.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_017.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_018.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_019.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_020.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_021.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_022.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_023.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_024.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_025.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_026.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_027.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_028.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_029.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_030.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_031.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_032.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_033.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_034.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_035.npy + - 
s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_036.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_037.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_038.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_039.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_040.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_041.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_042.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_043.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_044.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_045.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_046.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_047.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_048.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_049.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_050.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_051.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_052.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_053.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_054.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_055.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_056.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_057.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_058.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_059.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_060.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_061.npy + - 
s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_062.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_063.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_064.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_065.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_066.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_067.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_068.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_069.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_070.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_071.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_072.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_073.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_074.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_075.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_076.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_077.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_078.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_079.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_080.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_081.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_082.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_083.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_084.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_085.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_086.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_087.npy + - 
s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_088.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_089.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_090.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_091.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_092.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_093.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_094.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_095.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_096.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_097.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_098.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_099.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_100.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_101.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_102.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_103.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_104.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_105.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_106.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_107.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_108.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_109.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_110.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_111.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_112.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_113.npy + - 
s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_114.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_115.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_116.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_117.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_118.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_119.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_120.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_121.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_122.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_123.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_124.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_125.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_126.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_127.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_128.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_129.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_130.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_131.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_132.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_133.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_134.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_135.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_136.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_137.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_138.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_139.npy + - 
s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_140.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_141.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_142.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_143.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_144.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_145.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_146.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_147.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_148.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_149.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_150.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_151.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_152.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_153.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_154.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_155.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_156.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_157.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_158.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_159.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_160.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_161.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_162.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_163.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_164.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_165.npy + - 
s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_166.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_167.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_168.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_169.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_170.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_171.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_172.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_173.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_174.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_175.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_176.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_177.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_178.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_179.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_180.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_181.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_182.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_183.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_184.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_185.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_186.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_187.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_188.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_189.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_190.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_191.npy + - 
s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_192.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_193.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_194.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_195.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_196.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_197.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_198.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_199.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_200.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_201.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_202.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_203.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_204.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_205.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_206.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_207.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_208.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_209.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_210.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_211.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_212.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_213.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_214.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_215.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_216.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_217.npy + - 
s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_218.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_219.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_220.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_221.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_222.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_223.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_224.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_225.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_226.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_227.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_228.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_229.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_230.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_231.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_232.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_233.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_234.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_235.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_236.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_237.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_238.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_239.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_240.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_241.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_242.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_243.npy + - 
s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_244.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_245.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_246.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_247.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_248.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_249.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_250.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_251.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_252.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_253.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_254.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_255.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_256.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_257.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_258.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_259.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_260.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_261.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_262.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_263.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_264.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_265.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_266.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_267.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_268.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_269.npy + - 
s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_270.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_271.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_272.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_273.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_274.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_275.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_276.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_277.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_278.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_279.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_280.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_281.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_282.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_283.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_284.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_285.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_286.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_287.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_288.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_289.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_290.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_291.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_292.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_293.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_294.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_295.npy + - 
s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_296.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_297.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_298.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_299.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_300.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_301.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_302.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_303.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_304.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_305.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_306.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_307.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_308.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_309.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_310.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_311.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_312.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_313.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_314.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_315.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_316.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_317.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_318.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_319.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_320.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_321.npy + - 
s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_322.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_323.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_324.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_325.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_326.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_327.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_328.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_329.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_330.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_331.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_332.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_333.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_334.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_335.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_336.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_337.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_338.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_339.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_340.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_341.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_342.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_343.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_344.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_345.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_346.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_347.npy + - 
s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_348.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_349.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_350.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_351.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_352.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_353.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_354.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_355.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_356.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_357.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_358.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_359.npy From 5d9dce59fdc6ae7b5ce1497be259d528afc4772a Mon Sep 17 00:00:00 2001 From: Dustin Date: Tue, 11 Jun 2024 23:29:03 -0700 Subject: [PATCH 14/40] launch scripts for llamaish1 with amber data --- scripts/beaker/llamaish1-normal-launch.sh | 10 +++++----- scripts/beaker/llamaish1-normal.sh | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/scripts/beaker/llamaish1-normal-launch.sh b/scripts/beaker/llamaish1-normal-launch.sh index 86c33dd53..55dbccaf1 100755 --- a/scripts/beaker/llamaish1-normal-launch.sh +++ b/scripts/beaker/llamaish1-normal-launch.sh @@ -6,8 +6,8 @@ NUM_NODES=8 gantry run \ --workspace ai2/OLMo-training \ - --task-name llamaish1-normal \ - --description "OLMo small - 1B - Llamaish Normal New" \ + --task-name llamaish1-normal-amber-data \ + --description "OLMo small - 1B - Llamaish Normal with Amber data" \ --priority urgent \ --preemptible \ --beaker-image petew/olmo-torch23-gantry \ @@ -23,9 +23,9 @@ gantry run \ --env LOG_FILTER_TYPE=local_rank0_only \ --env OMP_NUM_THREADS=8 \ --env OLMO_TASK=model \ - --env-secret 
WANDB_API_KEY=AKSHITAB_WANDB_API_KEY \ - --env-secret AWS_ACCESS_KEY_ID=AKSHITAB_AWS_ACCESS_KEY_ID \ - --env-secret AWS_SECRET_ACCESS_KEY=AKSHITAB_AWS_SECRET_ACCESS_KEY \ + --env-secret WANDB_API_KEY=DUSTINS_WANDB_API_KEY \ + --env-secret AWS_ACCESS_KEY_ID=DUSTINS_AWS_ACCESS_KEY_ID \ + --env-secret AWS_SECRET_ACCESS_KEY=DUSTINS_AWS_SECRET_ACCESS_KEY \ --shared-memory 10GiB \ --venv base \ --yes \ diff --git a/scripts/beaker/llamaish1-normal.sh b/scripts/beaker/llamaish1-normal.sh index e74b8a868..479c7405a 100755 --- a/scripts/beaker/llamaish1-normal.sh +++ b/scripts/beaker/llamaish1-normal.sh @@ -18,7 +18,7 @@ curl "https://storage.googleapis.com/dirkgr-public/huggingface_cache_v3.tar.gz" popd export HF_DATASETS_OFFLINE=1 -export EXPERIMENT=llamaish1-normal-new +export EXPERIMENT=llamaish1-normal-amber-data torchrun \ --nnodes ${NUM_NODES}:${NUM_NODES} \ From a45fe68b062cabbb6eb7f57e45fd63eafc64fc56 Mon Sep 17 00:00:00 2001 From: Dustin Date: Tue, 11 Jun 2024 23:59:06 -0700 Subject: [PATCH 15/40] fixed tokenizer def --- configs/llamaish1-s3.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/configs/llamaish1-s3.yaml b/configs/llamaish1-s3.yaml index 06f522330..65557130a 100644 --- a/configs/llamaish1-s3.yaml +++ b/configs/llamaish1-s3.yaml @@ -63,7 +63,8 @@ scheduler: warmup_min_lr: 0 tokenizer: - identifier: tokenizers/nousresearch-llama-2-7b-hf.json + + identifier: NousResearch/Llama-2-7b-hf truncate_direction: right save_folder: runs/${run_name} From f8f530a1bcf9d4b5eca3c736f75d9d837ce9fe6f Mon Sep 17 00:00:00 2001 From: Dustin Date: Wed, 12 Jun 2024 09:43:38 -0700 Subject: [PATCH 16/40] turn off perplexity eval --- configs/llamaish1-s3.yaml | 54 +++++++++++++++++++-------------------- 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/configs/llamaish1-s3.yaml b/configs/llamaish1-s3.yaml index 65557130a..61574d142 100644 --- a/configs/llamaish1-s3.yaml +++ b/configs/llamaish1-s3.yaml @@ -101,33 +101,33 @@ eval_interval: 1000 
eval_subset_num_batches: -1 device_eval_batch_size: ${device_train_microbatch_size} evaluators: - - label: all-small-ppl-validation - data: - num_workers: 0 - drop_last: true - datasets: - c4_en-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/c4_en/val/part-0-00000.npy - dolma_books-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_books/val/part-0-00000.npy - dolma_common-crawl-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_common-crawl/val/part-0-00000.npy - dolma_pes2o-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_pes2o/val/part-0-00000.npy - dolma_reddit-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_reddit/val/part-0-00000.npy - dolma_stack-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_stack/val/part-0-00000.npy - dolma_wiki-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_wiki/val/part-0-00000.npy - ice-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/ice/val/part-0-00000.npy - m2d2_s2orc-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/m2d2_s2orc/val/part-0-00000.npy - pile-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/pile/val/part-0-00000.npy - wikitext_103-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/wikitext_103/val/part-0-00000.npy + # - label: all-small-ppl-validation + # data: + # num_workers: 0 + # drop_last: true + # datasets: + # c4_en-validation: + # - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/c4_en/val/part-0-00000.npy + # dolma_books-validation: + # - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_books/val/part-0-00000.npy + # dolma_common-crawl-validation: + # - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_common-crawl/val/part-0-00000.npy + # dolma_pes2o-validation: + # - 
s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_pes2o/val/part-0-00000.npy + # dolma_reddit-validation: + # - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_reddit/val/part-0-00000.npy + # dolma_stack-validation: + # - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_stack/val/part-0-00000.npy + # dolma_wiki-validation: + # - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_wiki/val/part-0-00000.npy + # ice-validation: + # - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/ice/val/part-0-00000.npy + # m2d2_s2orc-validation: + # - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/m2d2_s2orc/val/part-0-00000.npy + # pile-validation: + # - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/pile/val/part-0-00000.npy + # wikitext_103-validation: + # - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/wikitext_103/val/part-0-00000.npy ########################## # Downstream evaluations # From 0538bce3518478870a3a9f88b11fd032dcca7fee Mon Sep 17 00:00:00 2001 From: Dustin Date: Wed, 12 Jun 2024 09:44:40 -0700 Subject: [PATCH 17/40] load last checkpoint --- scripts/beaker/llamaish1-normal.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/beaker/llamaish1-normal.sh b/scripts/beaker/llamaish1-normal.sh index 479c7405a..17ea89f8e 100755 --- a/scripts/beaker/llamaish1-normal.sh +++ b/scripts/beaker/llamaish1-normal.sh @@ -55,4 +55,4 @@ torchrun \ --scheduler.grad_clip_warmup_steps=null \ --scheduler.units=steps \ --scheduler.t_warmup=2000 - #'--load_path=${path.last_checkpoint:s3://ai2-llm/checkpoints/OLMo-small/llamaish1-normal-new/}' + '--load_path=${path.last_checkpoint:s3://ai2-llm/checkpoints/OLMo-small/llm-306-amber-data-repro-db-normal-init-2}' From 15c56065b6b61b41dad393eba15a70757b81181c Mon Sep 17 00:00:00 2001 From: Dustin Date: Wed, 12 Jun 2024 11:04:50 -0700 Subject: [PATCH 18/40] changeing sharding strategy to shard_grad_op --- scripts/beaker/llamaish1-normal.sh | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/beaker/llamaish1-normal.sh b/scripts/beaker/llamaish1-normal.sh index 17ea89f8e..aef1137a0 100755 --- a/scripts/beaker/llamaish1-normal.sh +++ b/scripts/beaker/llamaish1-normal.sh @@ -35,7 +35,7 @@ torchrun \ --wandb.group=$EXPERIMENT \ --model.flash_attention=true \ --fsdp.wrapping_strategy=by_block_and_size \ - --fsdp.sharding_strategy=NO_SHARD \ + --fsdp.sharding_strategy=SHARD_GRAD_OP \ --gen1_gc_interval=null \ --save_folder=runs/ \ --activation_checkpointing=fine_grained \ From 0d2259e64a20b4bb81c347cc2a1b5c640091a9c8 Mon Sep 17 00:00:00 2001 From: Dustin Date: Wed, 12 Jun 2024 11:06:30 -0700 Subject: [PATCH 19/40] change run names --- configs/llamaish1-s3.yaml | 4 ++-- scripts/beaker/llamaish1-normal-launch.sh | 2 +- scripts/beaker/llamaish1-normal.sh | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/configs/llamaish1-s3.yaml b/configs/llamaish1-s3.yaml index 61574d142..d1a968949 100644 --- a/configs/llamaish1-s3.yaml +++ b/configs/llamaish1-s3.yaml @@ -1,11 +1,11 @@ -run_name: llamaish1-amber-data-001 +run_name: llamaish1-amber-data-sgo-001 seed: 6198 dry_run: false wandb: name: ${run_name} project: olmo-small - group: llamaish1-amber-data + group: llamaish1-amber-data-sgo model: d_model: 2048 diff --git a/scripts/beaker/llamaish1-normal-launch.sh b/scripts/beaker/llamaish1-normal-launch.sh index 55dbccaf1..332cd1672 100755 --- a/scripts/beaker/llamaish1-normal-launch.sh +++ b/scripts/beaker/llamaish1-normal-launch.sh @@ -6,7 +6,7 @@ NUM_NODES=8 gantry run \ --workspace ai2/OLMo-training \ - --task-name llamaish1-normal-amber-data \ + --task-name llamaish1-normal-amber-data-sgo \ --description "OLMo small - 1B - Llamaish Normal with Amber data" \ --priority urgent \ --preemptible \ diff --git a/scripts/beaker/llamaish1-normal.sh b/scripts/beaker/llamaish1-normal.sh index aef1137a0..0fab93b95 100755 --- a/scripts/beaker/llamaish1-normal.sh +++ b/scripts/beaker/llamaish1-normal.sh 
@@ -18,7 +18,7 @@ curl "https://storage.googleapis.com/dirkgr-public/huggingface_cache_v3.tar.gz" popd export HF_DATASETS_OFFLINE=1 -export EXPERIMENT=llamaish1-normal-amber-data +export EXPERIMENT=llamaish1-normal-amber-data-sgo torchrun \ --nnodes ${NUM_NODES}:${NUM_NODES} \ From d74e26f2d4407e7b9e2737cc1aa017fd9ada0c57 Mon Sep 17 00:00:00 2001 From: Dustin Date: Wed, 12 Jun 2024 12:55:56 -0700 Subject: [PATCH 20/40] switch to huggyface tokenizer --- configs/llamaish1-s3.yaml | 4 ++-- scripts/beaker/llamaish1-normal-launch.sh | 2 +- scripts/beaker/llamaish1-normal.sh | 4 ---- 3 files changed, 3 insertions(+), 7 deletions(-) diff --git a/configs/llamaish1-s3.yaml b/configs/llamaish1-s3.yaml index d1a968949..ceeca397e 100644 --- a/configs/llamaish1-s3.yaml +++ b/configs/llamaish1-s3.yaml @@ -1,4 +1,4 @@ -run_name: llamaish1-amber-data-sgo-001 +run_name: llamaish1-amber-data-sgo-002 seed: 6198 dry_run: false @@ -64,7 +64,7 @@ scheduler: tokenizer: - identifier: NousResearch/Llama-2-7b-hf + identifier: huggyllama/llama-7b truncate_direction: right save_folder: runs/${run_name} diff --git a/scripts/beaker/llamaish1-normal-launch.sh b/scripts/beaker/llamaish1-normal-launch.sh index 332cd1672..33fb66476 100755 --- a/scripts/beaker/llamaish1-normal-launch.sh +++ b/scripts/beaker/llamaish1-normal-launch.sh @@ -7,7 +7,7 @@ NUM_NODES=8 gantry run \ --workspace ai2/OLMo-training \ --task-name llamaish1-normal-amber-data-sgo \ - --description "OLMo small - 1B - Llamaish Normal with Amber data" \ + --description "OLMo small - 1B - Llamaish Normal with Amber data, huggyface tokenizer" \ --priority urgent \ --preemptible \ --beaker-image petew/olmo-torch23-gantry \ diff --git a/scripts/beaker/llamaish1-normal.sh b/scripts/beaker/llamaish1-normal.sh index 0fab93b95..78ab758c5 100755 --- a/scripts/beaker/llamaish1-normal.sh +++ b/scripts/beaker/llamaish1-normal.sh @@ -18,7 +18,6 @@ curl "https://storage.googleapis.com/dirkgr-public/huggingface_cache_v3.tar.gz" popd export 
HF_DATASETS_OFFLINE=1 -export EXPERIMENT=llamaish1-normal-amber-data-sgo torchrun \ --nnodes ${NUM_NODES}:${NUM_NODES} \ @@ -30,9 +29,6 @@ torchrun \ --rdzv_conf="read_timeout=420" \ scripts/train.py \ configs/llamaish1-s3.yaml \ - --run_name=$EXPERIMENT \ - --wandb.name=$EXPERIMENT \ - --wandb.group=$EXPERIMENT \ --model.flash_attention=true \ --fsdp.wrapping_strategy=by_block_and_size \ --fsdp.sharding_strategy=SHARD_GRAD_OP \ From d909a986cde1b1f66f788b5e1789cd40159cf3b9 Mon Sep 17 00:00:00 2001 From: Dustin Date: Wed, 12 Jun 2024 14:30:55 -0700 Subject: [PATCH 21/40] initial config changes for amberish 1B --- configs/amberish1-s3.yaml | 598 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 598 insertions(+) create mode 100644 configs/amberish1-s3.yaml diff --git a/configs/amberish1-s3.yaml b/configs/amberish1-s3.yaml new file mode 100644 index 000000000..0650b289b --- /dev/null +++ b/configs/amberish1-s3.yaml @@ -0,0 +1,598 @@ +run_name: amberish1-base +seed: 6198 +dry_run: false + +wandb: + name: ${run_name} + project: olmo-small + group: amberish1-base + +model: + d_model: 2048 + n_heads: 16 + n_layers: 16 + mlp_ratio: 8 + # mlp_hidden_size: 22016 + weight_tying: false + alibi: false + rope: true + flash_attention: true + attention_dropout: 0.0 + attention_layer_norm: false + clip_qkv: null + include_bias: false + block_type: sequential + layer_norm_type: rms + layer_norm_with_affine: false + bias_for_layer_norm: false + attention_layer_norm_with_affine: false + activation_type: swiglu + residual_dropout: 0.0 + embedding_dropout: 0.0 + max_sequence_length: 2048 + vocab_size: 32000 + embedding_size: 32000 + eos_token_id: 2 + pad_token_id: 2 + init_device: meta + init_fn: normal + init_std: 0.02 + init_cutoff_factor: 3 + +compile: null + +optimizer: + name: adamw + learning_rate: 4.0e-4 + weight_decay: 0.1 + decay_norm_and_bias: true + decay_embeddings: false + betas: + - 0.9 + - 0.95 + metrics_log_interval: 10 + +scheduler: + name: cosine_with_warmup + 
units: tokens + t_warmup: 10485760000 + t_max: 3e12 + alpha_f: 0.1 + grad_clip_warmup_steps: 2097152000 + grad_clip_warmup_factor: 5 + warmup_min_lr: 0 + +tokenizer: + + identifier: huggyllama/llama-7b + truncate_direction: right + +save_folder: runs/${run_name} +remote_save_folder: s3://ai2-llm/checkpoints/OLMo-small/${run_name} +save_overwrite: false + +save_interval: 1000 +save_interval_ephemeral: null +save_num_checkpoints_to_keep: -1 +sharded_checkpointer: olmo_core + +save_interval_unsharded: null +save_num_unsharded_checkpoints_to_keep: -1 + +load_path: null + +max_duration: 2ep +global_train_batch_size: 512 +device_train_microbatch_size: 4 + +precision: amp_bf16 + +fsdp: + wrapping_strategy: by_block_and_size + precision: mixed + +max_grad_norm: 1.0 +max_grad_norm_ratio: null + +speed_monitor: + window_size: 1 + +eval_interval: 1000 +eval_subset_num_batches: -1 +device_eval_batch_size: ${device_train_microbatch_size} +evaluators: + # - label: all-small-ppl-validation + # data: + # num_workers: 0 + # drop_last: true + # datasets: + # c4_en-validation: + # - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/c4_en/val/part-0-00000.npy + # dolma_books-validation: + # - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_books/val/part-0-00000.npy + # dolma_common-crawl-validation: + # - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_common-crawl/val/part-0-00000.npy + # dolma_pes2o-validation: + # - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_pes2o/val/part-0-00000.npy + # dolma_reddit-validation: + # - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_reddit/val/part-0-00000.npy + # dolma_stack-validation: + # - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_stack/val/part-0-00000.npy + # dolma_wiki-validation: + # - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_wiki/val/part-0-00000.npy + # ice-validation: + # - 
s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/ice/val/part-0-00000.npy + # m2d2_s2orc-validation: + # - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/m2d2_s2orc/val/part-0-00000.npy + # pile-validation: + # - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/pile/val/part-0-00000.npy + # wikitext_103-validation: + # - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/wikitext_103/val/part-0-00000.npy + + ########################## + # Downstream evaluations # + ########################## + - label: piqa + type: downstream + + - label: hellaswag + type: downstream + + - label: winogrande + type: downstream + + - label: openbook_qa + type: downstream + + - label: boolq + type: downstream + + - label: sciq + type: downstream + + - label: arc_easy + type: downstream + + - label: arc_challenge + type: downstream + + - label: copa + type: downstream + + #- label: rte + # type: downstream + + #- label: commitment_bank + # type: downstream + + #- label: sst2 + # type: downstream + + - label: commonsense_qa + type: downstream + + - label: social_iqa + type: downstream + + # Doesn't work from cache. 
+ # - label: basic_arithmetic + # type: downstream + + - label: mmlu_stem_var + type: downstream + + - label: mmlu_humanities_var + type: downstream + + - label: mmlu_social_sciences_var + type: downstream + + - label: mmlu_other_var + type: downstream + + - label: mmlu_stem_mc_5shot + type: downstream + + - label: mmlu_humanities_mc_5shot + type: downstream + + - label: mmlu_social_sciences_mc_5shot + type: downstream + + - label: mmlu_other_mc_5shot + type: downstream + + - label: mmlu_stem_mc_5shot_test + type: downstream + + - label: mmlu_humanities_mc_5shot_test + type: downstream + + - label: mmlu_social_sciences_mc_5shot_test + type: downstream + + - label: mmlu_other_mc_5shot_test + type: downstream + + # - label: basic_arithmetic + # type: downstream + + # - label: trivia_qa_wiki_ppl + # type: downstream + + # - label: natural_qs_open_ppl + # type: downstream + + # - label: arc_easy_ppl + # type: downstream + +data: + pad_direction: right + num_workers: 32 + drop_last: true + pin_memory: true + prefetch_factor: 8 + persistent_workers: true + timeout: 0 + paths: + ######### Amber ######### + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_000.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_001.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_002.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_003.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_004.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_005.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_006.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_007.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_008.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_009.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_010.npy + - 
s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_011.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_012.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_013.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_014.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_015.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_016.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_017.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_018.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_019.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_020.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_021.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_022.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_023.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_024.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_025.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_026.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_027.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_028.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_029.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_030.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_031.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_032.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_033.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_034.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_035.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_036.npy + - 
s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_037.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_038.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_039.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_040.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_041.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_042.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_043.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_044.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_045.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_046.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_047.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_048.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_049.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_050.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_051.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_052.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_053.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_054.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_055.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_056.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_057.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_058.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_059.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_060.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_061.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_062.npy + - 
s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_063.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_064.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_065.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_066.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_067.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_068.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_069.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_070.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_071.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_072.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_073.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_074.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_075.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_076.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_077.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_078.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_079.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_080.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_081.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_082.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_083.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_084.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_085.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_086.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_087.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_088.npy + - 
s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_089.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_090.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_091.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_092.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_093.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_094.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_095.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_096.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_097.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_098.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_099.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_100.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_101.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_102.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_103.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_104.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_105.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_106.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_107.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_108.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_109.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_110.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_111.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_112.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_113.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_114.npy + - 
s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_115.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_116.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_117.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_118.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_119.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_120.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_121.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_122.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_123.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_124.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_125.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_126.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_127.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_128.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_129.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_130.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_131.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_132.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_133.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_134.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_135.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_136.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_137.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_138.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_139.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_140.npy + - 
s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_141.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_142.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_143.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_144.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_145.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_146.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_147.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_148.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_149.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_150.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_151.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_152.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_153.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_154.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_155.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_156.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_157.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_158.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_159.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_160.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_161.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_162.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_163.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_164.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_165.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_166.npy + - 
s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_167.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_168.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_169.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_170.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_171.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_172.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_173.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_174.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_175.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_176.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_177.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_178.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_179.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_180.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_181.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_182.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_183.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_184.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_185.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_186.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_187.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_188.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_189.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_190.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_191.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_192.npy + - 
s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_193.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_194.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_195.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_196.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_197.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_198.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_199.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_200.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_201.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_202.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_203.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_204.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_205.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_206.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_207.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_208.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_209.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_210.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_211.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_212.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_213.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_214.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_215.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_216.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_217.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_218.npy + - 
s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_219.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_220.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_221.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_222.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_223.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_224.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_225.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_226.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_227.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_228.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_229.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_230.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_231.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_232.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_233.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_234.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_235.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_236.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_237.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_238.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_239.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_240.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_241.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_242.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_243.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_244.npy + - 
s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_245.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_246.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_247.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_248.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_249.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_250.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_251.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_252.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_253.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_254.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_255.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_256.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_257.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_258.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_259.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_260.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_261.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_262.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_263.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_264.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_265.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_266.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_267.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_268.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_269.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_270.npy + - 
s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_271.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_272.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_273.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_274.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_275.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_276.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_277.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_278.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_279.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_280.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_281.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_282.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_283.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_284.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_285.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_286.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_287.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_288.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_289.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_290.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_291.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_292.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_293.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_294.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_295.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_296.npy + - 
s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_297.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_298.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_299.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_300.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_301.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_302.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_303.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_304.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_305.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_306.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_307.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_308.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_309.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_310.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_311.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_312.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_313.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_314.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_315.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_316.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_317.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_318.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_319.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_320.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_321.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_322.npy + - 
s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_323.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_324.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_325.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_326.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_327.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_328.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_329.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_330.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_331.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_332.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_333.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_334.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_335.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_336.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_337.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_338.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_339.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_340.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_341.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_342.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_343.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_344.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_345.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_346.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_347.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_348.npy + - 
s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_349.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_350.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_351.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_352.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_353.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_354.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_355.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_356.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_357.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_358.npy + - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_359.npy \ No newline at end of file From 9d1ad1f11cfb3490218c41c3db2d7af76e690dec Mon Sep 17 00:00:00 2001 From: Dustin Date: Wed, 12 Jun 2024 14:31:22 -0700 Subject: [PATCH 22/40] initial launch script changes --- scripts/beaker/amberish1-normal-launch.sh | 33 ++++++++++++++ scripts/beaker/amberish1-normal.sh | 54 +++++++++++++++++++++++ 2 files changed, 87 insertions(+) create mode 100755 scripts/beaker/amberish1-normal-launch.sh create mode 100755 scripts/beaker/amberish1-normal.sh diff --git a/scripts/beaker/amberish1-normal-launch.sh b/scripts/beaker/amberish1-normal-launch.sh new file mode 100755 index 000000000..74f193768 --- /dev/null +++ b/scripts/beaker/amberish1-normal-launch.sh @@ -0,0 +1,33 @@ +#!/usr/bin/env bash + +set -ex + +NUM_NODES=8 + +gantry run \ + --workspace ai2/OLMo-training \ + --task-name amberish1-base \ + --description "OLMo small - 1B - Amberish with Amber data" \ + --priority urgent \ + --preemptible \ + --beaker-image petew/olmo-torch23-gantry \ + --cluster ai2/jupiter-cirrascale-2 \ + --gpus 8 \ + --replicas "${NUM_NODES}" \ + --leader-selection \ + --host-networking \ + --budget ai2/oe-training \ + --no-nfs \ + 
--propagate-failure \ + --synchronized-start-timeout 20m \ + --env LOG_FILTER_TYPE=local_rank0_only \ + --env OMP_NUM_THREADS=8 \ + --env OLMO_TASK=model \ + --env-secret WANDB_API_KEY=DUSTINS_WANDB_API_KEY \ + --env-secret AWS_ACCESS_KEY_ID=DUSTINS_AWS_ACCESS_KEY_ID \ + --env-secret AWS_SECRET_ACCESS_KEY=DUSTINS_AWS_SECRET_ACCESS_KEY \ + --shared-memory 10GiB \ + --venv base \ + --yes \ + --timeout=-1 \ + -- /bin/bash -c "scripts/beaker/llamaish1-normal.sh \$BEAKER_LEADER_REPLICA_HOSTNAME ${NUM_NODES} \$BEAKER_REPLICA_RANK" diff --git a/scripts/beaker/amberish1-normal.sh b/scripts/beaker/amberish1-normal.sh new file mode 100755 index 000000000..78ab758c5 --- /dev/null +++ b/scripts/beaker/amberish1-normal.sh @@ -0,0 +1,54 @@ +#!/usr/bin/env bash +set -exuo pipefail +IFS=$'\n\t' + +BEAKER_LEADER_REPLICA_HOSTNAME=$1 +shift + +NUM_NODES=$1 +shift + +BEAKER_REPLICA_RANK=$1 +shift + +# Warm HF cache +mkdir -p /root/.cache +pushd /root/.cache +curl "https://storage.googleapis.com/dirkgr-public/huggingface_cache_v3.tar.gz" | tar --keep-newer-files -xzf - +popd +export HF_DATASETS_OFFLINE=1 + + +torchrun \ + --nnodes ${NUM_NODES}:${NUM_NODES} \ + --nproc-per-node 8 \ + --rdzv_id=12347 \ + --rdzv_backend=static \ + --rdzv_endpoint=$BEAKER_LEADER_REPLICA_HOSTNAME:29400 \ + --node_rank=$BEAKER_REPLICA_RANK \ + --rdzv_conf="read_timeout=420" \ + scripts/train.py \ + configs/llamaish1-s3.yaml \ + --model.flash_attention=true \ + --fsdp.wrapping_strategy=by_block_and_size \ + --fsdp.sharding_strategy=SHARD_GRAD_OP \ + --gen1_gc_interval=null \ + --save_folder=runs/ \ + --activation_checkpointing=fine_grained \ + --fused_loss=false \ + --device_train_microbatch_size=4 \ + --global_train_batch_size=512 \ + --save_interval=250 \ + --eval_interval=250 \ + --optimizer.metrics_log_interval=1 \ + --save_overwrite \ + --model.init_fn=normal \ + --model.init_std=0.02 \ + --model.init_cutoff_factor=3 \ + --model.clip_qkv=null \ + --save_num_checkpoints_to_keep=3 \ + 
--scheduler.warmup_min_lr=0 \ + --scheduler.grad_clip_warmup_steps=null \ + --scheduler.units=steps \ + --scheduler.t_warmup=2000 + '--load_path=${path.last_checkpoint:s3://ai2-llm/checkpoints/OLMo-small/llm-306-amber-data-repro-db-normal-init-2}' From 109e5b55c381cd584d9b74b098051413d81e69f4 Mon Sep 17 00:00:00 2001 From: Dustin Date: Wed, 12 Jun 2024 14:43:19 -0700 Subject: [PATCH 23/40] change rms_layernorm eps to match amber --- olmo/model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/olmo/model.py b/olmo/model.py index 65c430e78..aaaa10b77 100644 --- a/olmo/model.py +++ b/olmo/model.py @@ -230,7 +230,7 @@ def __init__( config: ModelConfig, size: Optional[int] = None, elementwise_affine: Optional[bool] = None, - eps: float = 1e-5, + eps: float = 1e-6, ): super().__init__(config, size=size, elementwise_affine=elementwise_affine, eps=eps) From 397af9552ebb64326b1e2f70879e76ad33ac8085 Mon Sep 17 00:00:00 2001 From: Dustin Date: Wed, 12 Jun 2024 15:12:23 -0700 Subject: [PATCH 24/40] additional config changes --- configs/amberish1-s3.yaml | 12 +++++++----- scripts/beaker/amberish1-normal-launch.sh | 2 +- scripts/beaker/amberish1-normal.sh | 16 ++-------------- 3 files changed, 10 insertions(+), 20 deletions(-) diff --git a/configs/amberish1-s3.yaml b/configs/amberish1-s3.yaml index 0650b289b..66c3d9b6f 100644 --- a/configs/amberish1-s3.yaml +++ b/configs/amberish1-s3.yaml @@ -47,6 +47,7 @@ optimizer: weight_decay: 0.1 decay_norm_and_bias: true decay_embeddings: false + epsilon: 1.0E-08 betas: - 0.9 - 0.95 @@ -55,15 +56,15 @@ optimizer: scheduler: name: cosine_with_warmup units: tokens - t_warmup: 10485760000 - t_max: 3e12 + t_warmup: 4587520000 + t_max: 1.25e12 alpha_f: 0.1 - grad_clip_warmup_steps: 2097152000 + warmup_min_lr: 0 + grad_clip_warmup_steps: null grad_clip_warmup_factor: 5 warmup_min_lr: 0 tokenizer: - identifier: huggyllama/llama-7b truncate_direction: right @@ -82,13 +83,14 @@ save_num_unsharded_checkpoints_to_keep: -1 
load_path: null max_duration: 2ep -global_train_batch_size: 512 +global_train_batch_size: 1120 device_train_microbatch_size: 4 precision: amp_bf16 fsdp: wrapping_strategy: by_block_and_size + sharding_strategy: SHARD_GRAD_OP precision: mixed max_grad_norm: 1.0 diff --git a/scripts/beaker/amberish1-normal-launch.sh b/scripts/beaker/amberish1-normal-launch.sh index 74f193768..9f2e0f209 100755 --- a/scripts/beaker/amberish1-normal-launch.sh +++ b/scripts/beaker/amberish1-normal-launch.sh @@ -30,4 +30,4 @@ gantry run \ --venv base \ --yes \ --timeout=-1 \ - -- /bin/bash -c "scripts/beaker/llamaish1-normal.sh \$BEAKER_LEADER_REPLICA_HOSTNAME ${NUM_NODES} \$BEAKER_REPLICA_RANK" + -- /bin/bash -c "scripts/beaker/amberish1-normal.sh \$BEAKER_LEADER_REPLICA_HOSTNAME ${NUM_NODES} \$BEAKER_REPLICA_RANK" diff --git a/scripts/beaker/amberish1-normal.sh b/scripts/beaker/amberish1-normal.sh index 78ab758c5..79e92ebce 100755 --- a/scripts/beaker/amberish1-normal.sh +++ b/scripts/beaker/amberish1-normal.sh @@ -28,27 +28,15 @@ torchrun \ --node_rank=$BEAKER_REPLICA_RANK \ --rdzv_conf="read_timeout=420" \ scripts/train.py \ - configs/llamaish1-s3.yaml \ + configs/amberish1-s3.yaml \ --model.flash_attention=true \ - --fsdp.wrapping_strategy=by_block_and_size \ - --fsdp.sharding_strategy=SHARD_GRAD_OP \ --gen1_gc_interval=null \ --save_folder=runs/ \ --activation_checkpointing=fine_grained \ --fused_loss=false \ - --device_train_microbatch_size=4 \ - --global_train_batch_size=512 \ --save_interval=250 \ --eval_interval=250 \ --optimizer.metrics_log_interval=1 \ --save_overwrite \ - --model.init_fn=normal \ - --model.init_std=0.02 \ - --model.init_cutoff_factor=3 \ - --model.clip_qkv=null \ --save_num_checkpoints_to_keep=3 \ - --scheduler.warmup_min_lr=0 \ - --scheduler.grad_clip_warmup_steps=null \ - --scheduler.units=steps \ - --scheduler.t_warmup=2000 - '--load_path=${path.last_checkpoint:s3://ai2-llm/checkpoints/OLMo-small/llm-306-amber-data-repro-db-normal-init-2}' + # 
'--load_path=${path.last_checkpoint:s3://ai2-llm/checkpoints/OLMo-small/llm-306-amber-data-repro-db-normal-init-2}' From 93c88a88fd16f056d915dabf668dddc904780086 Mon Sep 17 00:00:00 2001 From: Dustin Date: Wed, 12 Jun 2024 15:17:15 -0700 Subject: [PATCH 25/40] move last couple of configs from launch script to config file --- configs/amberish1-s3.yaml | 4 ++++ scripts/beaker/amberish1-normal.sh | 3 --- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/configs/amberish1-s3.yaml b/configs/amberish1-s3.yaml index 66c3d9b6f..dc64ce6bd 100644 --- a/configs/amberish1-s3.yaml +++ b/configs/amberish1-s3.yaml @@ -41,6 +41,10 @@ model: compile: null +fused_loss: false + +activation_checkpointing: fine_grained + optimizer: name: adamw learning_rate: 4.0e-4 diff --git a/scripts/beaker/amberish1-normal.sh b/scripts/beaker/amberish1-normal.sh index 79e92ebce..a4805049d 100755 --- a/scripts/beaker/amberish1-normal.sh +++ b/scripts/beaker/amberish1-normal.sh @@ -29,11 +29,8 @@ torchrun \ --rdzv_conf="read_timeout=420" \ scripts/train.py \ configs/amberish1-s3.yaml \ - --model.flash_attention=true \ --gen1_gc_interval=null \ --save_folder=runs/ \ - --activation_checkpointing=fine_grained \ - --fused_loss=false \ --save_interval=250 \ --eval_interval=250 \ --optimizer.metrics_log_interval=1 \ From d9c929df0d6837b571a147fe216bacb4f6c096a0 Mon Sep 17 00:00:00 2001 From: Dustin Date: Wed, 12 Jun 2024 16:13:27 -0700 Subject: [PATCH 26/40] adding rms_layer_norm eps to config --- olmo/config.py | 2 ++ olmo/model.py | 9 +++------ pyproject.toml | 2 +- 3 files changed, 6 insertions(+), 7 deletions(-) diff --git a/olmo/config.py b/olmo/config.py index 7c294b2db..96c761899 100644 --- a/olmo/config.py +++ b/olmo/config.py @@ -349,6 +349,8 @@ class ModelConfig(BaseConfig): to ``False``. """ + layer_norm_eps: float = 1e-05 + attention_layer_norm_with_affine: bool = True """ Toggle affine transform for the QK norms. 
diff --git a/olmo/model.py b/olmo/model.py index aaaa10b77..f902c2463 100644 --- a/olmo/model.py +++ b/olmo/model.py @@ -136,11 +136,10 @@ def __init__( *, size: Optional[int] = None, elementwise_affine: Optional[bool] = True, - eps: float = 1e-05, ): super().__init__() self.config = config - self.eps = eps + self.eps = config.layer_norm_eps self.normalized_shape = (size or config.d_model,) if elementwise_affine or (elementwise_affine is None and self.config.layer_norm_with_affine): self.weight = nn.Parameter(torch.ones(self.normalized_shape, device=config.init_device)) @@ -199,9 +198,8 @@ def __init__( size: Optional[int] = None, low_precision: bool = False, elementwise_affine: Optional[bool] = None, - eps: float = 1e-05, ): - super().__init__(config, size=size, elementwise_affine=elementwise_affine, eps=eps) + super().__init__(config, size=size, elementwise_affine=elementwise_affine) self.low_precision = low_precision def forward(self, x: torch.Tensor) -> torch.Tensor: @@ -230,9 +228,8 @@ def __init__( config: ModelConfig, size: Optional[int] = None, elementwise_affine: Optional[bool] = None, - eps: float = 1e-6, ): - super().__init__(config, size=size, elementwise_affine=elementwise_affine, eps=eps) + super().__init__(config, size=size, elementwise_affine=elementwise_affine) def forward(self, x: torch.Tensor) -> torch.Tensor: with torch.autocast(enabled=False, device_type=x.device.type): diff --git a/pyproject.toml b/pyproject.toml index 18314fc64..2d8927fd1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,7 +14,7 @@ requires-python = ">=3.8" license = { file = "LICENSE" } dependencies = [ "numpy", - "torch>=2.1,<2.3", + "torch>=2.1,<=2.3", "ai2-olmo-core==0.1.0", "omegaconf", "rich", From 7e4f0df9630f8d95d91581bf90a030b4d016724f Mon Sep 17 00:00:00 2001 From: Dustin Date: Wed, 12 Jun 2024 16:16:09 -0700 Subject: [PATCH 27/40] turn off fg activation checkpointing --- configs/amberish1-s3.yaml | 2 -- 1 file changed, 2 deletions(-) diff --git 
a/configs/amberish1-s3.yaml b/configs/amberish1-s3.yaml index dc64ce6bd..5c47eeb99 100644 --- a/configs/amberish1-s3.yaml +++ b/configs/amberish1-s3.yaml @@ -43,8 +43,6 @@ compile: null fused_loss: false -activation_checkpointing: fine_grained - optimizer: name: adamw learning_rate: 4.0e-4 From c41b2edcc739bcba139bdf526f1c389d278ec94b Mon Sep 17 00:00:00 2001 From: Dustin Date: Wed, 12 Jun 2024 16:19:21 -0700 Subject: [PATCH 28/40] rename config and scripts --- configs/llm-360-amber-s3.yaml | 516 ------------------ ...{amberish1-s3.yaml => llm-360-amber1.yaml} | 0 ...mal-launch.sh => llm-360-amber1-launch.sh} | 2 +- ...{amberish1-normal.sh => llm-360-amber1.sh} | 2 +- 4 files changed, 2 insertions(+), 518 deletions(-) delete mode 100644 configs/llm-360-amber-s3.yaml rename configs/{amberish1-s3.yaml => llm-360-amber1.yaml} (100%) rename scripts/beaker/{amberish1-normal-launch.sh => llm-360-amber1-launch.sh} (87%) rename scripts/beaker/{amberish1-normal.sh => llm-360-amber1.sh} (96%) diff --git a/configs/llm-360-amber-s3.yaml b/configs/llm-360-amber-s3.yaml deleted file mode 100644 index 5bcb3a634..000000000 --- a/configs/llm-360-amber-s3.yaml +++ /dev/null @@ -1,516 +0,0 @@ -run_name: llm-306-amber-data-repro-db-normal-init-2 -seed: 6198 -dry_run: false - -wandb: - name: ${run_name} - project: olmo-medium - group: llm-306-amber-data-repro-match-dirk-baseline-normal-init - -model: - d_model: 4096 - n_heads: 32 - n_kv_heads: null - clip_qkv: 8.0 - n_layers: 32 - mlp_ratio: 4 - mlp_hidden_size: 22016 - activation_type: swiglu - block_type: sequential - block_group_size: 1 - alibi: false - alibi_bias_max: 8.0 - rope: true - rope_full_precision: true - flash_attention: true - attention_dropout: 0.0 - multi_query_attention: null - attention_layer_norm: false - residual_dropout: 0.0 - embedding_dropout: 0.0 - layer_norm_type: default - layer_norm_with_affine: false - attention_layer_norm_with_affine: false - max_sequence_length: 2048 - include_bias: false - 
bias_for_layer_norm: false - scale_logits: false - vocab_size: 32000 - embedding_size: 32000 - weight_tying: false - eos_token_id: 2 - pad_token_id: 2 - init_device: meta - init_fn: normal - init_std: 0.02 - init_cutoff_factor: null - -compile: null - -optimizer: - name: adamw - learning_rate: 3.0e-4 - weight_decay: 0.1 - decay_norm_and_bias: true - decay_embeddings: false - betas: - - 0.9 - - 0.95 - metrics_log_interval: 1 - -scheduler: - name: cosine_with_warmup - units: tokens - warmup_min_lr: 0 - t_warmup: 20971520000 - t_max: 3e12 - alpha_f: 0.1 - grad_clip_warmup_steps: null - grad_clip_warmup_factor: 5 - -tokenizer: - identifier: tokenizers/nousresearch-llama-2-7b-hf.json - truncate_direction: right - -save_folder: runs/${run_name} -remote_save_folder: s3://ai2-llm/checkpoints/OLMo-medium/${run_name} -save_overwrite: false - -save_interval: 1000 -save_interval_ephemeral: null -save_num_checkpoints_to_keep: -1 -sharded_checkpointer: olmo_core - -save_interval_unsharded: null -save_num_unsharded_checkpoints_to_keep: -1 - -load_path: null - -max_duration: 2ep -global_train_batch_size: 1024 -device_train_microbatch_size: 2 - -precision: amp_bf16 - -fsdp: - use_orig_params: true - sharding_strategy: ShardingStrategy.SHARD_GRAD_OP - wrapping_strategy: by_block_and_size - precision: mixed - -max_grad_norm: 1.0 -max_grad_norm_ratio: null - -speed_monitor: - window_size: 1 - -eval_interval: 1000 -eval_subset_num_batches: -1 -device_eval_batch_size: ${device_train_microbatch_size} -evaluators: - # - label: all-small-ppl-validation - # data: - # num_workers: 0 - # drop_last: true - # datasets: - # c4_en-validation: - # - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/c4_en/val/part-0-00000.npy - # dolma_books-validation: - # - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_books/val/part-0-00000.npy - # dolma_common-crawl-validation: - # - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_common-crawl/val/part-0-00000.npy - # 
dolma_pes2o-validation: - # - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_pes2o/val/part-0-00000.npy - # dolma_reddit-validation: - # - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_reddit/val/part-0-00000.npy - # dolma_stack-validation: - # - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_stack/val/part-0-00000.npy - # dolma_wiki-validation: - # - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_wiki/val/part-0-00000.npy - # ice-validation: - # - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/ice/val/part-0-00000.npy - # m2d2_s2orc-validation: - # - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/m2d2_s2orc/val/part-0-00000.npy - # pile-validation: - # - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/pile/val/part-0-00000.npy - # wikitext_103-validation: - # - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/wikitext_103/val/part-0-00000.npy - - ########################## - # Downstream evaluations # - ########################## - - - label: hellaswag - type: downstream - - -data: - pad_direction: right - num_workers: 32 - drop_last: true - pin_memory: true - prefetch_factor: 8 - persistent_workers: true - timeout: 0 - paths: - ######### Amber ######### - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_000.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_001.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_002.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_003.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_004.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_005.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_006.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_007.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_008.npy - - 
s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_009.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_010.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_011.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_012.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_013.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_014.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_015.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_016.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_017.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_018.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_019.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_020.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_021.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_022.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_023.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_024.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_025.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_026.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_027.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_028.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_029.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_030.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_031.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_032.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_033.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_034.npy - - 
s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_035.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_036.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_037.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_038.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_039.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_040.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_041.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_042.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_043.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_044.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_045.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_046.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_047.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_048.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_049.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_050.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_051.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_052.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_053.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_054.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_055.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_056.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_057.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_058.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_059.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_060.npy - - 
s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_061.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_062.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_063.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_064.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_065.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_066.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_067.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_068.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_069.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_070.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_071.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_072.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_073.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_074.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_075.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_076.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_077.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_078.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_079.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_080.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_081.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_082.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_083.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_084.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_085.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_086.npy - - 
s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_087.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_088.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_089.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_090.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_091.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_092.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_093.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_094.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_095.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_096.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_097.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_098.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_099.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_100.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_101.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_102.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_103.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_104.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_105.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_106.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_107.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_108.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_109.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_110.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_111.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_112.npy - - 
s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_113.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_114.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_115.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_116.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_117.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_118.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_119.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_120.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_121.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_122.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_123.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_124.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_125.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_126.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_127.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_128.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_129.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_130.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_131.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_132.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_133.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_134.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_135.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_136.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_137.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_138.npy - - 
s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_139.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_140.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_141.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_142.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_143.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_144.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_145.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_146.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_147.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_148.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_149.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_150.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_151.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_152.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_153.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_154.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_155.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_156.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_157.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_158.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_159.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_160.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_161.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_162.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_163.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_164.npy - - 
s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_165.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_166.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_167.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_168.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_169.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_170.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_171.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_172.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_173.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_174.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_175.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_176.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_177.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_178.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_179.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_180.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_181.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_182.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_183.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_184.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_185.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_186.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_187.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_188.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_189.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_190.npy - - 
s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_191.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_192.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_193.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_194.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_195.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_196.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_197.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_198.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_199.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_200.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_201.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_202.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_203.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_204.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_205.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_206.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_207.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_208.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_209.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_210.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_211.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_212.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_213.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_214.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_215.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_216.npy - - 
s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_217.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_218.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_219.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_220.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_221.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_222.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_223.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_224.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_225.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_226.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_227.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_228.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_229.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_230.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_231.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_232.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_233.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_234.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_235.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_236.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_237.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_238.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_239.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_240.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_241.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_242.npy - - 
s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_243.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_244.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_245.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_246.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_247.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_248.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_249.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_250.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_251.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_252.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_253.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_254.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_255.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_256.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_257.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_258.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_259.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_260.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_261.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_262.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_263.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_264.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_265.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_266.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_267.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_268.npy - - 
s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_269.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_270.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_271.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_272.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_273.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_274.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_275.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_276.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_277.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_278.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_279.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_280.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_281.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_282.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_283.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_284.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_285.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_286.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_287.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_288.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_289.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_290.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_291.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_292.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_293.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_294.npy - - 
s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_295.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_296.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_297.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_298.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_299.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_300.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_301.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_302.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_303.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_304.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_305.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_306.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_307.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_308.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_309.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_310.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_311.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_312.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_313.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_314.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_315.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_316.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_317.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_318.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_319.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_320.npy - - 
s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_321.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_322.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_323.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_324.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_325.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_326.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_327.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_328.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_329.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_330.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_331.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_332.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_333.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_334.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_335.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_336.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_337.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_338.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_339.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_340.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_341.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_342.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_343.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_344.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_345.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_346.npy - - 
s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_347.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_348.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_349.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_350.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_351.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_352.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_353.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_354.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_355.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_356.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_357.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_358.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_359.npy diff --git a/configs/amberish1-s3.yaml b/configs/llm-360-amber1.yaml similarity index 100% rename from configs/amberish1-s3.yaml rename to configs/llm-360-amber1.yaml diff --git a/scripts/beaker/amberish1-normal-launch.sh b/scripts/beaker/llm-360-amber1-launch.sh similarity index 87% rename from scripts/beaker/amberish1-normal-launch.sh rename to scripts/beaker/llm-360-amber1-launch.sh index 9f2e0f209..92f7f5d6d 100755 --- a/scripts/beaker/amberish1-normal-launch.sh +++ b/scripts/beaker/llm-360-amber1-launch.sh @@ -30,4 +30,4 @@ gantry run \ --venv base \ --yes \ --timeout=-1 \ - -- /bin/bash -c "scripts/beaker/amberish1-normal.sh \$BEAKER_LEADER_REPLICA_HOSTNAME ${NUM_NODES} \$BEAKER_REPLICA_RANK" + -- /bin/bash -c "scripts/beaker/llm-360-amber1.sh \$BEAKER_LEADER_REPLICA_HOSTNAME ${NUM_NODES} \$BEAKER_REPLICA_RANK" diff --git a/scripts/beaker/amberish1-normal.sh b/scripts/beaker/llm-360-amber1.sh similarity index 96% rename from scripts/beaker/amberish1-normal.sh rename to 
scripts/beaker/llm-360-amber1.sh index a4805049d..cc60b732c 100755 --- a/scripts/beaker/amberish1-normal.sh +++ b/scripts/beaker/llm-360-amber1.sh @@ -28,7 +28,7 @@ torchrun \ --node_rank=$BEAKER_REPLICA_RANK \ --rdzv_conf="read_timeout=420" \ scripts/train.py \ - configs/amberish1-s3.yaml \ + configs/llm-360-amber1.yaml \ --gen1_gc_interval=null \ --save_folder=runs/ \ --save_interval=250 \ From c63d82190cca1d469a15c8ec85c71036a89adc31 Mon Sep 17 00:00:00 2001 From: Dustin Date: Wed, 12 Jun 2024 16:31:59 -0700 Subject: [PATCH 29/40] removed files not intended for the amberish PR --- configs/llamaish1-s3.yaml | 598 ---------- configs/llamaish7-s3.yaml | 1296 --------------------- scripts/beaker/llamaish1-normal-launch.sh | 33 - scripts/beaker/llamaish1-normal.sh | 54 - scripts/beaker/llamaish7-normal.sh | 57 - 5 files changed, 2038 deletions(-) delete mode 100644 configs/llamaish1-s3.yaml delete mode 100644 configs/llamaish7-s3.yaml delete mode 100755 scripts/beaker/llamaish1-normal-launch.sh delete mode 100755 scripts/beaker/llamaish1-normal.sh delete mode 100755 scripts/beaker/llamaish7-normal.sh diff --git a/configs/llamaish1-s3.yaml b/configs/llamaish1-s3.yaml deleted file mode 100644 index ceeca397e..000000000 --- a/configs/llamaish1-s3.yaml +++ /dev/null @@ -1,598 +0,0 @@ -run_name: llamaish1-amber-data-sgo-002 -seed: 6198 -dry_run: false - -wandb: - name: ${run_name} - project: olmo-small - group: llamaish1-amber-data-sgo - -model: - d_model: 2048 - n_heads: 16 - n_layers: 16 - mlp_ratio: 8 - # mlp_hidden_size: 22016 - weight_tying: false - alibi: false - rope: true - flash_attention: true - attention_dropout: 0.0 - attention_layer_norm: false - clip_qkv: 8.0 - include_bias: false - block_type: sequential - layer_norm_type: default - layer_norm_with_affine: false - bias_for_layer_norm: false - attention_layer_norm_with_affine: false - activation_type: swiglu - residual_dropout: 0.0 - embedding_dropout: 0.0 - max_sequence_length: 2048 - vocab_size: 32000 - 
embedding_size: 32000 - eos_token_id: 2 - pad_token_id: 2 - init_device: meta - init_fn: normal - init_std: 0.02 - init_cutoff_factor: 3 - -compile: null - -optimizer: - name: adamw - learning_rate: 4.0e-4 - weight_decay: 0.1 - decay_norm_and_bias: true - decay_embeddings: false - betas: - - 0.9 - - 0.95 - metrics_log_interval: 10 - -scheduler: - name: cosine_with_warmup - units: tokens - t_warmup: 10485760000 - t_max: 3e12 - alpha_f: 0.1 - grad_clip_warmup_steps: 2097152000 - grad_clip_warmup_factor: 5 - warmup_min_lr: 0 - -tokenizer: - - identifier: huggyllama/llama-7b - truncate_direction: right - -save_folder: runs/${run_name} -remote_save_folder: s3://ai2-llm/checkpoints/OLMo-small/${run_name} -save_overwrite: false - -save_interval: 1000 -save_interval_ephemeral: null -save_num_checkpoints_to_keep: -1 -sharded_checkpointer: olmo_core - -save_interval_unsharded: null -save_num_unsharded_checkpoints_to_keep: -1 - -load_path: null - -max_duration: 2ep -global_train_batch_size: 512 -device_train_microbatch_size: 4 - -precision: amp_bf16 - -fsdp: - wrapping_strategy: by_block_and_size - precision: mixed - -max_grad_norm: 1.0 -max_grad_norm_ratio: null - -speed_monitor: - window_size: 1 - -eval_interval: 1000 -eval_subset_num_batches: -1 -device_eval_batch_size: ${device_train_microbatch_size} -evaluators: - # - label: all-small-ppl-validation - # data: - # num_workers: 0 - # drop_last: true - # datasets: - # c4_en-validation: - # - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/c4_en/val/part-0-00000.npy - # dolma_books-validation: - # - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_books/val/part-0-00000.npy - # dolma_common-crawl-validation: - # - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_common-crawl/val/part-0-00000.npy - # dolma_pes2o-validation: - # - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_pes2o/val/part-0-00000.npy - # dolma_reddit-validation: - # - 
s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_reddit/val/part-0-00000.npy - # dolma_stack-validation: - # - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_stack/val/part-0-00000.npy - # dolma_wiki-validation: - # - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_wiki/val/part-0-00000.npy - # ice-validation: - # - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/ice/val/part-0-00000.npy - # m2d2_s2orc-validation: - # - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/m2d2_s2orc/val/part-0-00000.npy - # pile-validation: - # - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/pile/val/part-0-00000.npy - # wikitext_103-validation: - # - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/wikitext_103/val/part-0-00000.npy - - ########################## - # Downstream evaluations # - ########################## - - label: piqa - type: downstream - - - label: hellaswag - type: downstream - - - label: winogrande - type: downstream - - - label: openbook_qa - type: downstream - - - label: boolq - type: downstream - - - label: sciq - type: downstream - - - label: arc_easy - type: downstream - - - label: arc_challenge - type: downstream - - - label: copa - type: downstream - - #- label: rte - # type: downstream - - #- label: commitment_bank - # type: downstream - - #- label: sst2 - # type: downstream - - - label: commonsense_qa - type: downstream - - - label: social_iqa - type: downstream - - # Doesn't work from cache. 
- # - label: basic_arithmetic - # type: downstream - - - label: mmlu_stem_var - type: downstream - - - label: mmlu_humanities_var - type: downstream - - - label: mmlu_social_sciences_var - type: downstream - - - label: mmlu_other_var - type: downstream - - - label: mmlu_stem_mc_5shot - type: downstream - - - label: mmlu_humanities_mc_5shot - type: downstream - - - label: mmlu_social_sciences_mc_5shot - type: downstream - - - label: mmlu_other_mc_5shot - type: downstream - - - label: mmlu_stem_mc_5shot_test - type: downstream - - - label: mmlu_humanities_mc_5shot_test - type: downstream - - - label: mmlu_social_sciences_mc_5shot_test - type: downstream - - - label: mmlu_other_mc_5shot_test - type: downstream - - # - label: basic_arithmetic - # type: downstream - - # - label: trivia_qa_wiki_ppl - # type: downstream - - # - label: natural_qs_open_ppl - # type: downstream - - # - label: arc_easy_ppl - # type: downstream - -data: - pad_direction: right - num_workers: 32 - drop_last: true - pin_memory: true - prefetch_factor: 8 - persistent_workers: true - timeout: 0 - paths: - ######### Amber ######### - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_000.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_001.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_002.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_003.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_004.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_005.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_006.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_007.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_008.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_009.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_010.npy - - 
s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_011.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_012.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_013.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_014.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_015.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_016.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_017.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_018.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_019.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_020.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_021.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_022.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_023.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_024.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_025.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_026.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_027.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_028.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_029.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_030.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_031.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_032.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_033.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_034.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_035.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_036.npy - - 
s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_037.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_038.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_039.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_040.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_041.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_042.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_043.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_044.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_045.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_046.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_047.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_048.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_049.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_050.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_051.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_052.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_053.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_054.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_055.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_056.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_057.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_058.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_059.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_060.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_061.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_062.npy - - 
s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_063.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_064.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_065.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_066.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_067.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_068.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_069.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_070.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_071.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_072.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_073.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_074.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_075.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_076.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_077.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_078.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_079.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_080.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_081.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_082.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_083.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_084.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_085.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_086.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_087.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_088.npy - - 
s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_089.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_090.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_091.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_092.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_093.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_094.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_095.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_096.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_097.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_098.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_099.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_100.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_101.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_102.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_103.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_104.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_105.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_106.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_107.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_108.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_109.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_110.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_111.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_112.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_113.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_114.npy - - 
s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_115.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_116.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_117.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_118.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_119.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_120.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_121.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_122.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_123.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_124.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_125.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_126.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_127.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_128.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_129.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_130.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_131.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_132.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_133.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_134.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_135.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_136.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_137.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_138.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_139.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_140.npy - - 
s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_141.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_142.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_143.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_144.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_145.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_146.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_147.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_148.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_149.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_150.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_151.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_152.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_153.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_154.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_155.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_156.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_157.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_158.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_159.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_160.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_161.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_162.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_163.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_164.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_165.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_166.npy - - 
s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_167.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_168.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_169.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_170.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_171.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_172.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_173.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_174.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_175.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_176.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_177.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_178.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_179.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_180.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_181.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_182.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_183.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_184.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_185.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_186.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_187.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_188.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_189.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_190.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_191.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_192.npy - - 
s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_193.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_194.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_195.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_196.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_197.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_198.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_199.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_200.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_201.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_202.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_203.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_204.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_205.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_206.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_207.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_208.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_209.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_210.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_211.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_212.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_213.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_214.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_215.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_216.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_217.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_218.npy - - 
s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_219.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_220.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_221.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_222.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_223.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_224.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_225.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_226.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_227.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_228.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_229.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_230.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_231.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_232.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_233.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_234.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_235.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_236.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_237.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_238.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_239.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_240.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_241.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_242.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_243.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_244.npy - - 
s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_245.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_246.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_247.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_248.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_249.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_250.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_251.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_252.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_253.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_254.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_255.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_256.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_257.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_258.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_259.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_260.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_261.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_262.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_263.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_264.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_265.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_266.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_267.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_268.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_269.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_270.npy - - 
s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_271.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_272.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_273.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_274.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_275.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_276.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_277.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_278.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_279.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_280.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_281.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_282.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_283.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_284.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_285.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_286.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_287.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_288.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_289.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_290.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_291.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_292.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_293.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_294.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_295.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_296.npy - - 
s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_297.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_298.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_299.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_300.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_301.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_302.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_303.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_304.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_305.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_306.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_307.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_308.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_309.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_310.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_311.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_312.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_313.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_314.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_315.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_316.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_317.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_318.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_319.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_320.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_321.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_322.npy - - 
s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_323.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_324.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_325.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_326.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_327.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_328.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_329.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_330.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_331.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_332.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_333.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_334.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_335.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_336.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_337.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_338.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_339.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_340.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_341.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_342.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_343.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_344.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_345.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_346.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_347.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_348.npy - - 
s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_349.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_350.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_351.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_352.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_353.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_354.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_355.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_356.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_357.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_358.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_359.npy \ No newline at end of file diff --git a/configs/llamaish7-s3.yaml b/configs/llamaish7-s3.yaml deleted file mode 100644 index e928af92f..000000000 --- a/configs/llamaish7-s3.yaml +++ /dev/null @@ -1,1296 +0,0 @@ -run_name: llamaish7-001 -seed: 6198 -dry_run: false - -wandb: - name: ${run_name} - project: olmo-medium - group: llamaish7 - -model: - d_model: 4096 - n_heads: 32 - n_layers: 32 - # mlp_ratio: 6 - mlp_hidden_size: 22016 - weight_tying: false - alibi: false - rope: true - flash_attention: true - attention_dropout: 0.0 - attention_layer_norm: false - clip_qkv: 8.0 - include_bias: false - block_type: sequential - layer_norm_type: default - layer_norm_with_affine: false - bias_for_layer_norm: false - attention_layer_norm_with_affine: false - activation_type: swiglu - residual_dropout: 0.0 - embedding_dropout: 0.0 - max_sequence_length: 4096 - vocab_size: 50280 - embedding_size: 50304 - eos_token_id: 0 - pad_token_id: 1 - init_device: meta - init_fn: full_megatron - init_std: 0.006 - init_cutoff_factor: 3 - -compile: null - -optimizer: - name: adamw - learning_rate: 3.0e-4 - weight_decay: 0.1 - decay_norm_and_bias: true - decay_embeddings: 
false - betas: - - 0.9 - - 0.95 - metrics_log_interval: 10 - -scheduler: - name: cosine_with_warmup - units: tokens - t_warmup: 20971520000 - t_max: 3e12 - alpha_f: 0.1 - grad_clip_warmup_steps: 2097152000 - grad_clip_warmup_factor: 5 - -tokenizer: - identifier: tokenizers/allenai_gpt-neox-olmo-dolma-v1_5.json - truncate_direction: right - -save_folder: runs/${run_name} -remote_save_folder: s3://ai2-llm/checkpoints/OLMo-medium/${run_name} -save_overwrite: false - -save_interval: 1000 -save_interval_ephemeral: null -save_num_checkpoints_to_keep: -1 -sharded_checkpointer: olmo_core - -save_interval_unsharded: null -save_num_unsharded_checkpoints_to_keep: -1 - -load_path: null - -max_duration: 2ep -global_train_batch_size: 512 -device_train_microbatch_size: 2 - -precision: amp_bf16 - -fsdp: - wrapping_strategy: by_block_and_size - precision: mixed - -max_grad_norm: 1.0 -max_grad_norm_ratio: null - -speed_monitor: - window_size: 1 - -eval_interval: 1000 -eval_subset_num_batches: -1 -device_eval_batch_size: ${device_train_microbatch_size} -evaluators: - - label: all-small-ppl-validation - data: - num_workers: 0 - drop_last: true - datasets: - c4_en-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/c4_en/val/part-0-00000.npy - dolma_books-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_books/val/part-0-00000.npy - dolma_common-crawl-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_common-crawl/val/part-0-00000.npy - dolma_pes2o-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_pes2o/val/part-0-00000.npy - dolma_reddit-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_reddit/val/part-0-00000.npy - dolma_stack-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_stack/val/part-0-00000.npy - dolma_wiki-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_wiki/val/part-0-00000.npy - ice-validation: - - 
s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/ice/val/part-0-00000.npy - m2d2_s2orc-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/m2d2_s2orc/val/part-0-00000.npy - pile-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/pile/val/part-0-00000.npy - wikitext_103-validation: - - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/wikitext_103/val/part-0-00000.npy - - ########################## - # Downstream evaluations # - ########################## - - label: piqa - type: downstream - - - label: hellaswag - type: downstream - - - label: winogrande - type: downstream - - - label: openbook_qa - type: downstream - - - label: boolq - type: downstream - - - label: sciq - type: downstream - - - label: arc_easy - type: downstream - - - label: arc_challenge - type: downstream - - - label: copa - type: downstream - - #- label: rte - # type: downstream - - #- label: commitment_bank - # type: downstream - - #- label: sst2 - # type: downstream - - - label: commonsense_qa - type: downstream - - - label: social_iqa - type: downstream - - # Doesn't work from cache. 
- # - label: basic_arithmetic - # type: downstream - - - label: mmlu_stem_var - type: downstream - - - label: mmlu_humanities_var - type: downstream - - - label: mmlu_social_sciences_var - type: downstream - - - label: mmlu_other_var - type: downstream - - - label: mmlu_stem_mc_5shot - type: downstream - - - label: mmlu_humanities_mc_5shot - type: downstream - - - label: mmlu_social_sciences_mc_5shot - type: downstream - - - label: mmlu_other_mc_5shot - type: downstream - - - label: mmlu_stem_mc_5shot_test - type: downstream - - - label: mmlu_humanities_mc_5shot_test - type: downstream - - - label: mmlu_social_sciences_mc_5shot_test - type: downstream - - - label: mmlu_other_mc_5shot_test - type: downstream - - # - label: basic_arithmetic - # type: downstream - - # - label: trivia_qa_wiki_ppl - # type: downstream - - # - label: natural_qs_open_ppl - # type: downstream - - # - label: arc_easy_ppl - # type: downstream - -data: - pad_direction: right - num_workers: 32 - drop_last: true - pin_memory: true - prefetch_factor: 8 - persistent_workers: true - timeout: 0 - instance_filter: - repetition_max_period: 13 - repetition_min_period: 1 - repetition_max_count: 32 - paths: - ######### NON WEB DATA ######### - # ~> GUTENBERG BOOKS (5.256 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-2-00000.npy - # ~> PES2O STEM PAPERS (57.21 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - 
s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - 
s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - # ~> WIKIPEDIA & WIKIBOOKS (3.689 GT), repeated twice to up-sample - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - # MEGAWIKA v1 (4.6 GT) - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-01-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-02-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-03-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-04-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-05-00001.npy - - 
s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-06-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-07-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-08-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-09-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-10-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-11-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-12-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-13-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-14-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-16-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-17-00001.npy - - 
s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-18-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-19-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-20-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-21-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-22-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-23-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-23-00002.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-24-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-25-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-26-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-27-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy - - 
s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-30-00001.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy - - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-31-00001.npy - # ~> REDPAJAMA STACK-EXCHANGE (19.63 GT) - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - 
s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - # ~> REDPAJAMA ARXIV (27.97 GT) - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - 
s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - 
s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy - - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy - # ~> PROOFPILE2 ALGEBRAIC STACK (12.623 GT) - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - 
s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - # ~> PROOFPILE2 OPENWEBMATH (12.734 GT) - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - 
s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - # ~> TULU FLAN V1 (16.5 G v2-decontaminated-60M-shots_all-upweight_1-dialog_true-sep_newline) - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - 
s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - 
s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy - - 
s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-32-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-33-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-34-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-35-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-36-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-37-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-38-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-39-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-40-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-41-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-42-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-43-00000.npy - - 
s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-44-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-45-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-46-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-47-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-48-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-49-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-50-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-51-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-52-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-53-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-54-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-55-00000.npy - - 
s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-56-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-57-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-58-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-59-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-60-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-61-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-62-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-63-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-64-00000.npy - - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-65-00000.npy - # ~> CC NEWS (14.3 GT) - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-0-00001.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-0-00002.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-0-00003.npy - - 
s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-1-00001.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-1-00002.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-1-00003.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-2-00000.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-2-00001.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-2-00002.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-2-00003.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-3-00000.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-3-00001.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-3-00002.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-3-00003.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-4-00000.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-4-00001.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-4-00002.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-4-00003.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-5-00000.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-5-00001.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-6-00000.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-6-00001.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-6-00002.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-6-00003.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-7-00000.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-7-00001.npy - - 
s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-7-00002.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-7-00003.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-8-00000.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-8-00001.npy - - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-8-00002.npy - #################################### - ######### CODE ######### - # ~> STARCODER (263.775 GT) - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-00-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-03-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-04-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-05-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-06-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - 
s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-07-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-08-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-09-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-10-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-11-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-12-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-13-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-14-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-15-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - 
s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-16-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-17-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-18-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-19-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-20-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-21-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-22-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-23-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-24-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - - 
s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-25-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-26-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-27-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-30-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-31-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-32-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-32-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-33-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-33-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-34-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-34-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-35-00000.npy - - 
s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-35-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-36-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-36-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-37-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-37-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-38-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-38-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-39-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-39-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-40-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-40-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-41-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-41-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-42-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-42-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-43-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-43-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-44-00000.npy - - 
s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-44-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-45-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-46-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-46-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-47-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-47-00001.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-48-00000.npy - - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-48-00001.npy - #################################### - ######### WEB HIGH QUALITY ######### - # ~> C4 (138.4 GT) - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy - - 
s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-009-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-010-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-011-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-012-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-013-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-014-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-015-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-016-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-017-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-018-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-019-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-020-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-021-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-022-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-023-00000.npy - - 
s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-024-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-025-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-026-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-027-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-028-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-029-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-030-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-031-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-032-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-033-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-034-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-035-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-036-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-037-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-038-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-039-00000.npy - - 
s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-040-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-041-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-042-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-043-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-044-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-045-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-046-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-047-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-048-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-049-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-050-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-051-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-052-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-053-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-054-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-055-00000.npy - - 
s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-056-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-057-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-058-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-059-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-060-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-061-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-062-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-063-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-064-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-065-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-066-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-067-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-068-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-069-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-070-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-071-00000.npy - - 
s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-072-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-073-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-074-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-075-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-076-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-077-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-078-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-079-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-080-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-081-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-082-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-083-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-084-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-085-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-086-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-087-00000.npy - - 
s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-088-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-089-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-090-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-091-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-092-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-093-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-094-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-095-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-096-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-097-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-098-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-099-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-100-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-101-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-102-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-103-00000.npy - - 
s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-104-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-105-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-106-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-107-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-108-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-109-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-110-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-111-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-112-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-113-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-114-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-115-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-116-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-117-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-118-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-119-00000.npy - - 
s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-120-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-121-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-122-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-123-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-124-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-125-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-126-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-127-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-128-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-129-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-130-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-131-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-132-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-133-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-134-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-135-00000.npy - - 
s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-136-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-137-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-138-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-139-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-140-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-141-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-142-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-143-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-144-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-145-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-146-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-147-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-148-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-149-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-150-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-151-00000.npy - - 
s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-152-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-153-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-154-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-155-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-156-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-157-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-158-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-159-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-160-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-161-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-162-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-163-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-164-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-165-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-166-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-167-00000.npy - - 
s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-168-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-169-00000.npy - - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-170-00000.npy - # ~> REDDIT (79.9 GT) - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy - - 
s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy - - 
s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-32-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-33-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-34-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-35-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-36-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-37-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-38-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-39-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-40-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-41-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-42-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-43-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-44-00000.npy - - 
s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-45-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-46-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-47-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-48-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-49-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-50-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-51-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-52-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-53-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-54-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-55-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-56-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-57-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-58-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-59-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-60-00000.npy - - 
s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-61-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-62-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-63-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-64-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-65-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-66-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-67-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-68-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-69-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-70-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-71-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-72-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-73-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-74-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-75-00000.npy - - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-76-00000.npy - - 
s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-77-00000.npy - # ~> FALCON (547.341 GT) - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-009-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-010-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-011-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-012-00000.npy - - 
s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-013-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-014-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-015-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-016-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-017-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-018-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-019-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-020-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-021-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-022-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-023-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-024-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-025-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-026-00000.npy - - 
s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-027-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-028-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-029-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-030-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-031-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-032-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-033-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-034-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-035-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-036-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-037-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-038-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-039-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-040-00000.npy - - 
s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-041-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-042-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-043-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-044-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-045-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-046-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-047-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-048-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-049-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-050-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-051-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-052-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-053-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-054-00000.npy - - 
s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-055-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-056-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-057-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-058-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-059-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-060-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-061-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-062-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-063-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-064-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-065-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-066-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-067-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-068-00000.npy - - 
s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-069-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-070-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-071-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-072-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-073-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-074-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-075-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-076-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-077-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-078-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-079-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-080-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-081-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-082-00000.npy - - 
s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-083-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-084-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-085-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-086-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-087-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-088-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-089-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-090-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-091-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-092-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-093-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-094-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-095-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-096-00000.npy - - 
s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-097-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-098-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-099-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-100-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-101-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-102-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-103-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-104-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-105-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-106-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-107-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-108-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-109-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-110-00000.npy - - 
s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-111-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-112-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-113-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-114-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-115-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-116-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-117-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-118-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-119-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-120-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-121-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-122-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-123-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-124-00000.npy - - 
s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-125-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-126-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-127-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-128-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-129-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-130-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-131-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-132-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-133-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-134-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-135-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-136-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-137-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-138-00000.npy - - 
s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-139-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-140-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-141-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-142-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-143-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-144-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-145-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-146-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-147-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-148-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-149-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-150-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-151-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-152-00000.npy - - 
s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-153-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-154-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-155-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-156-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-157-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-158-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-159-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-160-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-161-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-162-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-163-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-164-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-165-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-166-00000.npy - - 
s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-167-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-168-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-169-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-170-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-171-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-172-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-173-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-174-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-175-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-176-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-177-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-178-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-179-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-180-00000.npy - - 
s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-181-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-182-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-183-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-184-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-185-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-186-00000.npy - - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-187-00000.npy - #################################### - ######### WEB REST ######### - # ~> DOLMA CC HEAD 50% (178.4 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy 
- - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-006-00001.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-009-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-010-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-011-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-012-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-013-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-014-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-015-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-016-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-017-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-018-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-019-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-020-00000.npy - - 
s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-021-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-022-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-023-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-024-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-025-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-026-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-027-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-028-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-029-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-030-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-031-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-032-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-033-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-034-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-035-00000.npy - - 
s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-036-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-037-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-038-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-039-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-040-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-041-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-042-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-043-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-044-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-045-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-046-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-047-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-048-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-049-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-050-00000.npy - - 
s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-051-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-052-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-053-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-054-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-055-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-056-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-057-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-058-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-059-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-060-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-061-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-062-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-063-00000.npy - # ~> DOLMA CC MIDDLE 33% (242.05 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - 
s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-009-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-010-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-011-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-012-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-013-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-014-00000.npy - - 
s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-015-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-016-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-017-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-018-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-019-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-020-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-021-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-022-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-023-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-024-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-025-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-026-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-027-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-028-00000.npy - - 
s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-029-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-030-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-031-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-032-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-033-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-034-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-035-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-036-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-037-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-038-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-039-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-040-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-041-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-042-00000.npy - - 
s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-043-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-044-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-045-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-046-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-047-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-048-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-049-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-050-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-051-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-052-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-053-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-054-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-055-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-056-00000.npy - - 
s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-057-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-058-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-059-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-060-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-061-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-062-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-063-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-064-00000.npy - # ~> DOLMA CC TAIL 33% (191.4 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy - - 
s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-009-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-010-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-011-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-012-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-013-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-014-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-015-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-016-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-017-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-018-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-019-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-020-00000.npy - - 
s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-021-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-022-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-023-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-024-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-025-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-026-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-027-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-028-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-029-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-030-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-031-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-032-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-033-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-034-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-035-00000.npy - - 
s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-036-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-037-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-038-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-039-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-040-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-041-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-042-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-043-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-044-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-045-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-046-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-047-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-048-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-049-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-050-00000.npy - - 
s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-051-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-052-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-053-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-054-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-055-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-056-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-057-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-058-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-059-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-060-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-061-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-062-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-063-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-064-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-065-00000.npy - - 
s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-066-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-067-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-068-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-069-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-070-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-071-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-072-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-073-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-074-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-075-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-076-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-077-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-078-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-079-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-080-00000.npy - - 
s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-081-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-082-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-083-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-084-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-085-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-086-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-087-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-088-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-089-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-090-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-091-00000.npy - - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-092-00000.npy diff --git a/scripts/beaker/llamaish1-normal-launch.sh b/scripts/beaker/llamaish1-normal-launch.sh deleted file mode 100755 index 33fb66476..000000000 --- a/scripts/beaker/llamaish1-normal-launch.sh +++ /dev/null @@ -1,33 +0,0 @@ -#!/usr/bin/env bash - -set -ex - -NUM_NODES=8 - -gantry run \ - --workspace ai2/OLMo-training \ - --task-name llamaish1-normal-amber-data-sgo \ - --description "OLMo small - 1B - Llamaish 
Normal with Amber data, huggyface tokenizer" \ - --priority urgent \ - --preemptible \ - --beaker-image petew/olmo-torch23-gantry \ - --cluster ai2/jupiter-cirrascale-2 \ - --gpus 8 \ - --replicas "${NUM_NODES}" \ - --leader-selection \ - --host-networking \ - --budget ai2/oe-training \ - --no-nfs \ - --propagate-failure \ - --synchronized-start-timeout 20m \ - --env LOG_FILTER_TYPE=local_rank0_only \ - --env OMP_NUM_THREADS=8 \ - --env OLMO_TASK=model \ - --env-secret WANDB_API_KEY=DUSTINS_WANDB_API_KEY \ - --env-secret AWS_ACCESS_KEY_ID=DUSTINS_AWS_ACCESS_KEY_ID \ - --env-secret AWS_SECRET_ACCESS_KEY=DUSTINS_AWS_SECRET_ACCESS_KEY \ - --shared-memory 10GiB \ - --venv base \ - --yes \ - --timeout=-1 \ - -- /bin/bash -c "scripts/beaker/llamaish1-normal.sh \$BEAKER_LEADER_REPLICA_HOSTNAME ${NUM_NODES} \$BEAKER_REPLICA_RANK" diff --git a/scripts/beaker/llamaish1-normal.sh b/scripts/beaker/llamaish1-normal.sh deleted file mode 100755 index 78ab758c5..000000000 --- a/scripts/beaker/llamaish1-normal.sh +++ /dev/null @@ -1,54 +0,0 @@ -#!/usr/bin/env bash -set -exuo pipefail -IFS=$'\n\t' - -BEAKER_LEADER_REPLICA_HOSTNAME=$1 -shift - -NUM_NODES=$1 -shift - -BEAKER_REPLICA_RANK=$1 -shift - -# Warm HF cache -mkdir -p /root/.cache -pushd /root/.cache -curl "https://storage.googleapis.com/dirkgr-public/huggingface_cache_v3.tar.gz" | tar --keep-newer-files -xzf - -popd -export HF_DATASETS_OFFLINE=1 - - -torchrun \ - --nnodes ${NUM_NODES}:${NUM_NODES} \ - --nproc-per-node 8 \ - --rdzv_id=12347 \ - --rdzv_backend=static \ - --rdzv_endpoint=$BEAKER_LEADER_REPLICA_HOSTNAME:29400 \ - --node_rank=$BEAKER_REPLICA_RANK \ - --rdzv_conf="read_timeout=420" \ - scripts/train.py \ - configs/llamaish1-s3.yaml \ - --model.flash_attention=true \ - --fsdp.wrapping_strategy=by_block_and_size \ - --fsdp.sharding_strategy=SHARD_GRAD_OP \ - --gen1_gc_interval=null \ - --save_folder=runs/ \ - --activation_checkpointing=fine_grained \ - --fused_loss=false \ - --device_train_microbatch_size=4 \ - 
--global_train_batch_size=512 \ - --save_interval=250 \ - --eval_interval=250 \ - --optimizer.metrics_log_interval=1 \ - --save_overwrite \ - --model.init_fn=normal \ - --model.init_std=0.02 \ - --model.init_cutoff_factor=3 \ - --model.clip_qkv=null \ - --save_num_checkpoints_to_keep=3 \ - --scheduler.warmup_min_lr=0 \ - --scheduler.grad_clip_warmup_steps=null \ - --scheduler.units=steps \ - --scheduler.t_warmup=2000 - '--load_path=${path.last_checkpoint:s3://ai2-llm/checkpoints/OLMo-small/llm-306-amber-data-repro-db-normal-init-2}' diff --git a/scripts/beaker/llamaish7-normal.sh b/scripts/beaker/llamaish7-normal.sh deleted file mode 100755 index dd956702f..000000000 --- a/scripts/beaker/llamaish7-normal.sh +++ /dev/null @@ -1,57 +0,0 @@ -#!/usr/bin/env bash -set -exuo pipefail -IFS=$'\n\t' - -BEAKER_LEADER_REPLICA_HOSTNAME=$1 -shift - -NUM_NODES=$1 -shift - -BEAKER_REPLICA_RANK=$1 -shift - -# Warm HF cache -mkdir -p /root/.cache -pushd /root/.cache -curl "https://storage.googleapis.com/dirkgr-public/huggingface_cache_v3.tar.gz" | tar --keep-newer-files -xzf - -popd -export HF_DATASETS_OFFLINE=1 - -export EXPERIMENT=llamaish7-normal - -torchrun \ - --nnodes ${NUM_NODES}:${NUM_NODES} \ - --nproc-per-node 8 \ - --rdzv_id=12347 \ - --rdzv_backend=static \ - --rdzv_endpoint=$BEAKER_LEADER_REPLICA_HOSTNAME:29400 \ - --node_rank=$BEAKER_REPLICA_RANK \ - --rdzv_conf="read_timeout=420" \ - scripts/train.py \ - configs/llamaish7-s3.yaml \ - --run_name=$EXPERIMENT \ - --wandb.name=$EXPERIMENT \ - --wandb.group=$EXPERIMENT \ - --model.flash_attention=true \ - --fsdp.wrapping_strategy=by_block_and_size \ - --fsdp.sharding_strategy=SHARD_GRAD_OP \ - --save_folder=runs/ \ - --activation_checkpointing=fine_grained \ - --fused_loss=false \ - --device_train_microbatch_size=2 \ - --global_train_batch_size=1024 \ - --save_interval=250 \ - --eval_interval=250 \ - --optimizer.metrics_log_interval=1 \ - --save_overwrite \ - --model.init_fn=normal \ - --model.init_std=0.02 \ - 
--model.init_cutoff_factor=3 \ - --model.clip_qkv=null \ - --save_num_checkpoints_to_keep=3 \ - --scheduler.warmup_min_lr=0 \ - --scheduler.grad_clip_warmup_steps=null \ - --scheduler.units=steps \ - --scheduler.t_warmup=2000 - # '--load_path=${path.last_checkpoint:s3://ai2-llm/checkpoints/OLMo-medium/llamaish7-normal/}' From dc1c6562752a96556addfd441734df115f59d2f1 Mon Sep 17 00:00:00 2001 From: Dustin Date: Wed, 12 Jun 2024 16:32:55 -0700 Subject: [PATCH 30/40] added one file I didn't want to remove --- configs/llamaish7-s3.yaml | 1284 +++++++++++++++++++++++++++++++++++++ 1 file changed, 1284 insertions(+) create mode 100644 configs/llamaish7-s3.yaml diff --git a/configs/llamaish7-s3.yaml b/configs/llamaish7-s3.yaml new file mode 100644 index 000000000..5d52eeef6 --- /dev/null +++ b/configs/llamaish7-s3.yaml @@ -0,0 +1,1284 @@ +run_name: llamaish7-001 +seed: 6198 +dry_run: false + +wandb: + name: ${run_name} + project: olmo-medium + group: llamaish7 + +model: + d_model: 4096 + n_heads: 32 + n_layers: 32 + # mlp_ratio: 6 + mlp_hidden_size: 22016 + weight_tying: false + alibi: false + rope: true + flash_attention: true + attention_dropout: 0.0 + attention_layer_norm: false + clip_qkv: 8.0 + include_bias: false + block_type: sequential + layer_norm_type: default + layer_norm_with_affine: false + bias_for_layer_norm: false + attention_layer_norm_with_affine: false + activation_type: swiglu + residual_dropout: 0.0 + embedding_dropout: 0.0 + max_sequence_length: 4096 + vocab_size: 50280 + embedding_size: 50304 + eos_token_id: 0 + pad_token_id: 1 + init_device: meta + init_fn: full_megatron + init_std: 0.006 + init_cutoff_factor: 3 + +compile: null + +optimizer: + name: adamw + learning_rate: 3.0e-4 + weight_decay: 0.1 + decay_norm_and_bias: true + decay_embeddings: false + betas: + - 0.9 + - 0.95 + metrics_log_interval: 10 + +scheduler: + name: cosine_with_warmup + units: tokens + t_warmup: 20971520000 + t_max: 3e12 + alpha_f: 0.1 + grad_clip_warmup_steps: 2097152000 
+ grad_clip_warmup_factor: 5 + +tokenizer: + identifier: tokenizers/allenai_gpt-neox-olmo-dolma-v1_5.json + truncate_direction: right + +save_folder: runs/${run_name} +remote_save_folder: s3://ai2-llm/checkpoints/OLMo-medium/${run_name} +save_overwrite: false + +save_interval: 1000 +save_interval_ephemeral: null +save_num_checkpoints_to_keep: -1 +sharded_checkpointer: olmo_core + +save_interval_unsharded: null +save_num_unsharded_checkpoints_to_keep: -1 + +load_path: null + +max_duration: 2ep +global_train_batch_size: 512 +device_train_microbatch_size: 2 + +precision: amp_bf16 + +fsdp: + wrapping_strategy: by_block_and_size + precision: mixed + +max_grad_norm: 1.0 +max_grad_norm_ratio: null + +speed_monitor: + window_size: 1 + +eval_interval: 1000 +eval_subset_num_batches: -1 +device_eval_batch_size: ${device_train_microbatch_size} +evaluators: + - label: all-small-ppl-validation + data: + num_workers: 0 + drop_last: true + datasets: + c4_en-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/c4_en/val/part-0-00000.npy + dolma_books-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_books/val/part-0-00000.npy + dolma_common-crawl-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_common-crawl/val/part-0-00000.npy + dolma_pes2o-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_pes2o/val/part-0-00000.npy + dolma_reddit-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_reddit/val/part-0-00000.npy + dolma_stack-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_stack/val/part-0-00000.npy + dolma_wiki-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_wiki/val/part-0-00000.npy + ice-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/ice/val/part-0-00000.npy + m2d2_s2orc-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/m2d2_s2orc/val/part-0-00000.npy + 
pile-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/pile/val/part-0-00000.npy + wikitext_103-validation: + - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/wikitext_103/val/part-0-00000.npy + + ########################## + # Downstream evaluations # + ########################## + - label: piqa + type: downstream + + - label: hellaswag + type: downstream + + - label: winogrande + type: downstream + + - label: openbook_qa + type: downstream + + - label: boolq + type: downstream + + - label: sciq + type: downstream + + - label: arc_easy + type: downstream + + - label: arc_challenge + type: downstream + + - label: copa + type: downstream + + #- label: rte + # type: downstream + + #- label: commitment_bank + # type: downstream + + #- label: sst2 + # type: downstream + + - label: commonsense_qa + type: downstream + + - label: social_iqa + type: downstream + + # Doesn't work from cache. + # - label: basic_arithmetic + # type: downstream + + - label: mmlu_stem_var + type: downstream + + - label: mmlu_humanities_var + type: downstream + + - label: mmlu_social_sciences_var + type: downstream + + - label: mmlu_other_var + type: downstream + + - label: mmlu_stem_mc_5shot + type: downstream + + - label: mmlu_humanities_mc_5shot + type: downstream + + - label: mmlu_social_sciences_mc_5shot + type: downstream + + - label: mmlu_other_mc_5shot + type: downstream + + - label: mmlu_stem_mc_5shot_test + type: downstream + + - label: mmlu_humanities_mc_5shot_test + type: downstream + + - label: mmlu_social_sciences_mc_5shot_test + type: downstream + + - label: mmlu_other_mc_5shot_test + type: downstream + +data: + pad_direction: right + num_workers: 32 + drop_last: true + pin_memory: true + prefetch_factor: 8 + persistent_workers: true + timeout: 0 + instance_filter: + repetition_max_period: 13 + repetition_min_period: 1 + repetition_max_count: 32 + paths: + ######### NON WEB DATA ######### + # ~> GUTENBERG BOOKS (5.256 GT) + - 
s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-2-00000.npy + # ~> PES2O STEM PAPERS (57.21 GT) + - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy + - 
s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy + # ~> WIKIPEDIA & WIKIBOOKS (3.689 GT), repeated twice to up-sample + - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy + # MEGAWIKA v1 (4.6 GT) + - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy + - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy + - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-01-00001.npy + - 
s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy + - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-02-00001.npy + - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy + - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-03-00001.npy + - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy + - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-04-00001.npy + - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy + - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-05-00001.npy + - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy + - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-06-00001.npy + - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy + - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-07-00001.npy + - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy + - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-08-00001.npy + - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy + - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-09-00001.npy + - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy + - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-10-00001.npy + - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy + - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-11-00001.npy + - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy + - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-12-00001.npy + - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy + - 
s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-13-00001.npy + - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy + - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-14-00001.npy + - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy + - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy + - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-16-00001.npy + - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy + - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-17-00001.npy + - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy + - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-18-00001.npy + - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy + - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-19-00001.npy + - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy + - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-20-00001.npy + - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy + - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-21-00001.npy + - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy + - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-22-00001.npy + - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy + - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-23-00001.npy + - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-23-00002.npy + - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy + - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-24-00001.npy + - 
s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy + - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-25-00001.npy + - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy + - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-26-00001.npy + - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy + - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-27-00001.npy + - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy + - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy + - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy + - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-30-00001.npy + - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy + - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-31-00001.npy + # ~> REDPAJAMA STACK-EXCHANGE (19.63 GT) + - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy + - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy + - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy + - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy + - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy + - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy + - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy + - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy + - 
s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy + - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy + - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy + - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy + - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy + - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy + - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy + - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy + - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy + - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy + - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy + - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy + - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy + - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy + - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy + - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy + - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy + - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy + # ~> REDPAJAMA ARXIV 
(27.97 GT) + - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy + - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy + - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy + - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy + - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy + - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy + - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy + - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy + - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy + - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy + - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy + - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy + - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy + - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy + - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy + - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy + - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy + - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy + - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy + - 
s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy + - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy + - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy + - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy + - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy + - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy + - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy + - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy + - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy + - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy + - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy + - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy + - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy + # ~> PROOFPILE2 ALGEBRAIC STACK (12.623 GT) + - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy + - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy + - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy + - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy + - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy + 
- s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy + - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy + - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy + - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy + - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy + - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy + - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy + - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy + - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy + - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy + - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy + # ~> PROOFPILE2 OPENWEBMATH (12.734 GT) + - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy + - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy + - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy + - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy + - 
s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy + - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy + - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy + - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy + - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy + - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy + - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy + - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy + - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy + # ~> TULU FLAN V1 (16.5 G v2-decontaminated-60M-shots_all-upweight_1-dialog_true-sep_newline) + - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-32-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-33-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-34-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-35-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-36-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-37-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-38-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-39-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-40-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-41-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-42-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-43-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-44-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-45-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-46-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-47-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-48-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-49-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-50-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-51-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-52-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-53-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-54-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-55-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-56-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-57-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-58-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-59-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-60-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-61-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-62-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-63-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-64-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-65-00000.npy + # ~> CC NEWS (14.3 GT) + - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy + - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-0-00001.npy + - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-0-00002.npy + - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-0-00003.npy + - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy + - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-1-00001.npy + - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-1-00002.npy + - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-1-00003.npy + - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-2-00000.npy + - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-2-00001.npy + - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-2-00002.npy + - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-2-00003.npy + - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-3-00000.npy + - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-3-00001.npy + - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-3-00002.npy + - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-3-00003.npy + - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-4-00000.npy + - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-4-00001.npy + - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-4-00002.npy + - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-4-00003.npy + - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-5-00000.npy + - 
s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-5-00001.npy + - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-6-00000.npy + - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-6-00001.npy + - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-6-00002.npy + - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-6-00003.npy + - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-7-00000.npy + - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-7-00001.npy + - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-7-00002.npy + - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-7-00003.npy + - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-8-00000.npy + - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-8-00001.npy + - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-8-00002.npy + #################################### + ######### CODE ######### + # ~> STARCODER (263.775 GT) + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-00-00001.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-03-00001.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-04-00001.npy + - 
s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-05-00001.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-06-00001.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-07-00001.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-08-00001.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-09-00001.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-10-00001.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-11-00001.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-12-00001.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-13-00001.npy + - 
s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-14-00001.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-15-00001.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-16-00001.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-17-00001.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-18-00001.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-19-00001.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-20-00001.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-21-00001.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-22-00001.npy + - 
s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-23-00001.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-24-00001.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-25-00001.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-26-00001.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-27-00001.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-30-00001.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-31-00001.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-32-00000.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-32-00001.npy + - 
s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-33-00000.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-33-00001.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-34-00000.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-34-00001.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-35-00000.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-35-00001.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-36-00000.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-36-00001.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-37-00000.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-37-00001.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-38-00000.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-38-00001.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-39-00000.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-39-00001.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-40-00000.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-40-00001.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-41-00000.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-41-00001.npy + - 
s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-42-00000.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-42-00001.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-43-00000.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-43-00001.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-44-00000.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-44-00001.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-45-00000.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-46-00000.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-46-00001.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-47-00000.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-47-00001.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-48-00000.npy + - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-48-00001.npy + #################################### + ######### WEB HIGH QUALITY ######### + # ~> C4 (138.4 GT) + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy + - 
s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-009-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-010-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-011-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-012-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-013-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-014-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-015-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-016-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-017-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-018-00000.npy + - 
s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-019-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-020-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-021-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-022-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-023-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-024-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-025-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-026-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-027-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-028-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-029-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-030-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-031-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-032-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-033-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-034-00000.npy + - 
s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-035-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-036-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-037-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-038-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-039-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-040-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-041-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-042-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-043-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-044-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-045-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-046-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-047-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-048-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-049-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-050-00000.npy + - 
s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-051-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-052-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-053-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-054-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-055-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-056-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-057-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-058-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-059-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-060-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-061-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-062-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-063-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-064-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-065-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-066-00000.npy + - 
s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-067-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-068-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-069-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-070-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-071-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-072-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-073-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-074-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-075-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-076-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-077-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-078-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-079-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-080-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-081-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-082-00000.npy + - 
s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-083-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-084-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-085-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-086-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-087-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-088-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-089-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-090-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-091-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-092-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-093-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-094-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-095-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-096-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-097-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-098-00000.npy + - 
s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-099-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-100-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-101-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-102-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-103-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-104-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-105-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-106-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-107-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-108-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-109-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-110-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-111-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-112-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-113-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-114-00000.npy + - 
s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-115-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-116-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-117-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-118-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-119-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-120-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-121-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-122-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-123-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-124-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-125-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-126-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-127-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-128-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-129-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-130-00000.npy + - 
s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-131-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-132-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-133-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-134-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-135-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-136-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-137-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-138-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-139-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-140-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-141-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-142-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-143-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-144-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-145-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-146-00000.npy + - 
s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-147-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-148-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-149-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-150-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-151-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-152-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-153-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-154-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-155-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-156-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-157-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-158-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-159-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-160-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-161-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-162-00000.npy + - 
s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-163-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-164-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-165-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-166-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-167-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-168-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-169-00000.npy + - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-170-00000.npy + # ~> REDDIT (79.9 GT) + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy + - 
s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy + - 
s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-32-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-33-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-34-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-35-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-36-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-37-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-38-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-39-00000.npy + - 
s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-40-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-41-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-42-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-43-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-44-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-45-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-46-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-47-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-48-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-49-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-50-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-51-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-52-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-53-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-54-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-55-00000.npy + - 
s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-56-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-57-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-58-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-59-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-60-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-61-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-62-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-63-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-64-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-65-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-66-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-67-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-68-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-69-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-70-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-71-00000.npy + - 
s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-72-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-73-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-74-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-75-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-76-00000.npy + - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-77-00000.npy + # ~> FALCON (547.341 GT) + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy + - 
s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-009-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-010-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-011-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-012-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-013-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-014-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-015-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-016-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-017-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-018-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-019-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-020-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-021-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-022-00000.npy + - 
s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-023-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-024-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-025-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-026-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-027-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-028-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-029-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-030-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-031-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-032-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-033-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-034-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-035-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-036-00000.npy + - 
s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-037-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-038-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-039-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-040-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-041-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-042-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-043-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-044-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-045-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-046-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-047-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-048-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-049-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-050-00000.npy + - 
s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-051-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-052-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-053-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-054-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-055-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-056-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-057-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-058-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-059-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-060-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-061-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-062-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-063-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-064-00000.npy + - 
s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-065-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-066-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-067-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-068-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-069-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-070-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-071-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-072-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-073-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-074-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-075-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-076-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-077-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-078-00000.npy + - 
s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-079-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-080-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-081-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-082-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-083-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-084-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-085-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-086-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-087-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-088-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-089-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-090-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-091-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-092-00000.npy + - 
s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-093-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-094-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-095-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-096-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-097-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-098-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-099-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-100-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-101-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-102-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-103-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-104-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-105-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-106-00000.npy + - 
s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-107-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-108-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-109-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-110-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-111-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-112-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-113-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-114-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-115-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-116-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-117-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-118-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-119-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-120-00000.npy + - 
s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-121-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-122-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-123-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-124-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-125-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-126-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-127-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-128-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-129-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-130-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-131-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-132-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-133-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-134-00000.npy + - 
s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-135-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-136-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-137-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-138-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-139-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-140-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-141-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-142-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-143-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-144-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-145-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-146-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-147-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-148-00000.npy + - 
s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-149-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-150-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-151-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-152-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-153-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-154-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-155-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-156-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-157-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-158-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-159-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-160-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-161-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-162-00000.npy + - 
s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-163-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-164-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-165-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-166-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-167-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-168-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-169-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-170-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-171-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-172-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-173-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-174-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-175-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-176-00000.npy + - 
s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-177-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-178-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-179-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-180-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-181-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-182-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-183-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-184-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-185-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-186-00000.npy + - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-187-00000.npy + #################################### + ######### WEB REST ######### + # ~> DOLMA CC HEAD 50% (178.4 GT) + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy + - 
s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-006-00001.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-009-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-010-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-011-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-012-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-013-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-014-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-015-00000.npy + - 
s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-016-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-017-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-018-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-019-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-020-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-021-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-022-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-023-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-024-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-025-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-026-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-027-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-028-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-029-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-030-00000.npy + - 
s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-031-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-032-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-033-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-034-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-035-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-036-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-037-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-038-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-039-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-040-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-041-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-042-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-043-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-044-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-045-00000.npy + - 
s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-046-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-047-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-048-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-049-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-050-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-051-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-052-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-053-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-054-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-055-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-056-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-057-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-058-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-059-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-060-00000.npy + - 
s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-061-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-062-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-063-00000.npy + # ~> DOLMA CC MIDDLE 33% (242.05 GT) + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-009-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-010-00000.npy + - 
s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-011-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-012-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-013-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-014-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-015-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-016-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-017-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-018-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-019-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-020-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-021-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-022-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-023-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-024-00000.npy + - 
s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-025-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-026-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-027-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-028-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-029-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-030-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-031-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-032-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-033-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-034-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-035-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-036-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-037-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-038-00000.npy + - 
s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-039-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-040-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-041-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-042-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-043-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-044-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-045-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-046-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-047-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-048-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-049-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-050-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-051-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-052-00000.npy + - 
s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-053-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-054-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-055-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-056-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-057-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-058-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-059-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-060-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-061-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-062-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-063-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-064-00000.npy + # ~> DOLMA CC TAIL 33% (191.4 GT) + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy + - 
s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-009-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-010-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-011-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-012-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-013-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-014-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-015-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-016-00000.npy + - 
s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-017-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-018-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-019-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-020-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-021-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-022-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-023-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-024-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-025-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-026-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-027-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-028-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-029-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-030-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-031-00000.npy + - 
s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-032-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-033-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-034-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-035-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-036-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-037-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-038-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-039-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-040-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-041-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-042-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-043-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-044-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-045-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-046-00000.npy + - 
s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-047-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-048-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-049-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-050-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-051-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-052-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-053-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-054-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-055-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-056-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-057-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-058-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-059-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-060-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-061-00000.npy + - 
s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-062-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-063-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-064-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-065-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-066-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-067-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-068-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-069-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-070-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-071-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-072-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-073-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-074-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-075-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-076-00000.npy + - 
s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-077-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-078-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-079-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-080-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-081-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-082-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-083-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-084-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-085-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-086-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-087-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-088-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-089-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-090-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-091-00000.npy + - 
s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-092-00000.npy From 0dfba0d6cc6825c97608b340da3c69dbe5fb3df1 Mon Sep 17 00:00:00 2001 From: Dustin Schwenk Date: Wed, 12 Jun 2024 16:40:07 -0700 Subject: [PATCH 31/40] Update configs/llm-360-amber1.yaml Co-authored-by: Pete --- configs/llm-360-amber1.yaml | 3 --- 1 file changed, 3 deletions(-) diff --git a/configs/llm-360-amber1.yaml b/configs/llm-360-amber1.yaml index 5c47eeb99..5b0bde3b4 100644 --- a/configs/llm-360-amber1.yaml +++ b/configs/llm-360-amber1.yaml @@ -62,9 +62,6 @@ scheduler: t_max: 1.25e12 alpha_f: 0.1 warmup_min_lr: 0 - grad_clip_warmup_steps: null - grad_clip_warmup_factor: 5 - warmup_min_lr: 0 tokenizer: identifier: huggyllama/llama-7b From c849064303bcf86450788007cb5eeecbe68b678d Mon Sep 17 00:00:00 2001 From: Dustin Date: Wed, 12 Jun 2024 16:45:07 -0700 Subject: [PATCH 32/40] clear out redundant settings --- configs/llm-360-amber1.yaml | 3 --- 1 file changed, 3 deletions(-) diff --git a/configs/llm-360-amber1.yaml b/configs/llm-360-amber1.yaml index 5c47eeb99..5b0bde3b4 100644 --- a/configs/llm-360-amber1.yaml +++ b/configs/llm-360-amber1.yaml @@ -62,9 +62,6 @@ scheduler: t_max: 1.25e12 alpha_f: 0.1 warmup_min_lr: 0 - grad_clip_warmup_steps: null - grad_clip_warmup_factor: 5 - warmup_min_lr: 0 tokenizer: identifier: huggyllama/llama-7b From be47e5c2510f5c5be4d8d84eb9ecda3f044fc62f Mon Sep 17 00:00:00 2001 From: Dustin Date: Wed, 12 Jun 2024 16:57:17 -0700 Subject: [PATCH 33/40] change opt eps key name --- configs/llm-360-amber1.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/llm-360-amber1.yaml b/configs/llm-360-amber1.yaml index 5b0bde3b4..6b440ee7e 100644 --- a/configs/llm-360-amber1.yaml +++ b/configs/llm-360-amber1.yaml @@ -49,7 +49,7 @@ optimizer: weight_decay: 0.1 decay_norm_and_bias: true decay_embeddings: false - epsilon: 1.0E-08 + eps: 1.0E-08 betas: - 0.9 - 0.95 From 
412fdb4ac7e5a782f14e7dc477b9cc38cb64ede8 Mon Sep 17 00:00:00 2001 From: Dustin Date: Wed, 12 Jun 2024 17:00:58 -0700 Subject: [PATCH 34/40] reduce N nodes --- scripts/beaker/llm-360-amber1-launch.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/beaker/llm-360-amber1-launch.sh b/scripts/beaker/llm-360-amber1-launch.sh index 92f7f5d6d..8907e48b5 100755 --- a/scripts/beaker/llm-360-amber1-launch.sh +++ b/scripts/beaker/llm-360-amber1-launch.sh @@ -2,7 +2,7 @@ set -ex -NUM_NODES=8 +NUM_NODES=4 gantry run \ --workspace ai2/OLMo-training \ From 9e20d153e2044c3d3fd9411c46940d79bd05b242 Mon Sep 17 00:00:00 2001 From: Dustin Date: Wed, 12 Jun 2024 22:33:32 -0700 Subject: [PATCH 35/40] change layer norm eps, remove fused loss --- configs/llm-360-amber1.yaml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/configs/llm-360-amber1.yaml b/configs/llm-360-amber1.yaml index 6b440ee7e..84c0326a5 100644 --- a/configs/llm-360-amber1.yaml +++ b/configs/llm-360-amber1.yaml @@ -24,6 +24,7 @@ model: block_type: sequential layer_norm_type: rms layer_norm_with_affine: false + layer_norm_eps: 1e-6 bias_for_layer_norm: false attention_layer_norm_with_affine: false activation_type: swiglu @@ -41,8 +42,6 @@ model: compile: null -fused_loss: false - optimizer: name: adamw learning_rate: 4.0e-4 From 963576adf0e4a040c44c6f38c76272d3ae840bbb Mon Sep 17 00:00:00 2001 From: Dustin Date: Fri, 14 Jun 2024 18:41:03 -0700 Subject: [PATCH 36/40] move to WEKA, turn on additional evals --- configs/llm-360-amber1.yaml | 764 ++++++++++++------------ scripts/beaker/llm-360-amber1-launch.sh | 1 + scripts/beaker/llm-360-amber1.sh | 8 +- 3 files changed, 387 insertions(+), 386 deletions(-) diff --git a/configs/llm-360-amber1.yaml b/configs/llm-360-amber1.yaml index 84c0326a5..4b019407b 100644 --- a/configs/llm-360-amber1.yaml +++ b/configs/llm-360-amber1.yaml @@ -66,9 +66,9 @@ tokenizer: identifier: huggyllama/llama-7b truncate_direction: right -save_folder: 
runs/${run_name} -remote_save_folder: s3://ai2-llm/checkpoints/OLMo-small/${run_name} -save_overwrite: false +save_folder: /weka/oe-training-default/ai2-llm/checkpoints/OLMo-medium/dustins-stability/${run_name} +# remote_save_folder:/weka/ai2-llm/checkpoints/OLMo-small/${run_name} +save_overwrite: true save_interval: 1000 save_interval_ephemeral: null @@ -107,27 +107,27 @@ evaluators: # drop_last: true # datasets: # c4_en-validation: - # - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/c4_en/val/part-0-00000.npy + # -/weka/ai2-llm/eval-data/perplexity/v3_small_gptneox20b/c4_en/val/part-0-00000.npy # dolma_books-validation: - # - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_books/val/part-0-00000.npy + # -/weka/ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_books/val/part-0-00000.npy # dolma_common-crawl-validation: - # - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_common-crawl/val/part-0-00000.npy + # -/weka/ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_common-crawl/val/part-0-00000.npy # dolma_pes2o-validation: - # - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_pes2o/val/part-0-00000.npy + # -/weka/ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_pes2o/val/part-0-00000.npy # dolma_reddit-validation: - # - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_reddit/val/part-0-00000.npy + # -/weka/ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_reddit/val/part-0-00000.npy # dolma_stack-validation: - # - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_stack/val/part-0-00000.npy + # -/weka/ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_stack/val/part-0-00000.npy # dolma_wiki-validation: - # - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_wiki/val/part-0-00000.npy + # -/weka/ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_wiki/val/part-0-00000.npy # ice-validation: - # - 
s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/ice/val/part-0-00000.npy + # -/weka/ai2-llm/eval-data/perplexity/v3_small_gptneox20b/ice/val/part-0-00000.npy # m2d2_s2orc-validation: - # - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/m2d2_s2orc/val/part-0-00000.npy + # -/weka/ai2-llm/eval-data/perplexity/v3_small_gptneox20b/m2d2_s2orc/val/part-0-00000.npy # pile-validation: - # - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/pile/val/part-0-00000.npy + # -/weka/ai2-llm/eval-data/perplexity/v3_small_gptneox20b/pile/val/part-0-00000.npy # wikitext_103-validation: - # - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/wikitext_103/val/part-0-00000.npy + # -/weka/ai2-llm/eval-data/perplexity/v3_small_gptneox20b/wikitext_103/val/part-0-00000.npy ########################## # Downstream evaluations # @@ -214,17 +214,17 @@ evaluators: - label: mmlu_other_mc_5shot_test type: downstream - # - label: basic_arithmetic - # type: downstream + - label: basic_arithmetic + type: downstream - # - label: trivia_qa_wiki_ppl - # type: downstream + - label: trivia_qa_wiki_ppl + type: downstream - # - label: natural_qs_open_ppl - # type: downstream + - label: natural_qs_open_ppl + type: downstream - # - label: arc_easy_ppl - # type: downstream + - label: arc_easy_ppl + type: downstream data: pad_direction: right @@ -236,363 +236,363 @@ data: timeout: 0 paths: ######### Amber ######### - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_000.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_001.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_002.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_003.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_004.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_005.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_006.npy - - 
s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_007.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_008.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_009.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_010.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_011.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_012.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_013.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_014.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_015.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_016.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_017.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_018.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_019.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_020.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_021.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_022.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_023.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_024.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_025.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_026.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_027.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_028.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_029.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_030.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_031.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_032.npy - - 
s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_033.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_034.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_035.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_036.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_037.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_038.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_039.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_040.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_041.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_042.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_043.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_044.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_045.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_046.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_047.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_048.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_049.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_050.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_051.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_052.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_053.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_054.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_055.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_056.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_057.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_058.npy - - 
s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_059.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_060.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_061.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_062.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_063.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_064.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_065.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_066.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_067.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_068.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_069.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_070.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_071.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_072.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_073.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_074.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_075.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_076.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_077.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_078.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_079.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_080.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_081.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_082.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_083.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_084.npy - - 
s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_085.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_086.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_087.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_088.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_089.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_090.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_091.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_092.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_093.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_094.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_095.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_096.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_097.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_098.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_099.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_100.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_101.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_102.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_103.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_104.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_105.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_106.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_107.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_108.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_109.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_110.npy - - 
s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_111.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_112.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_113.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_114.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_115.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_116.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_117.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_118.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_119.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_120.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_121.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_122.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_123.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_124.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_125.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_126.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_127.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_128.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_129.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_130.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_131.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_132.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_133.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_134.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_135.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_136.npy - - 
s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_137.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_138.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_139.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_140.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_141.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_142.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_143.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_144.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_145.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_146.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_147.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_148.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_149.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_150.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_151.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_152.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_153.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_154.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_155.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_156.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_157.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_158.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_159.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_160.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_161.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_162.npy - - 
s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_163.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_164.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_165.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_166.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_167.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_168.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_169.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_170.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_171.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_172.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_173.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_174.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_175.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_176.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_177.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_178.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_179.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_180.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_181.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_182.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_183.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_184.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_185.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_186.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_187.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_188.npy - - 
s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_189.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_190.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_191.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_192.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_193.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_194.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_195.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_196.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_197.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_198.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_199.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_200.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_201.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_202.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_203.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_204.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_205.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_206.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_207.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_208.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_209.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_210.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_211.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_212.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_213.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_214.npy - - 
s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_215.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_216.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_217.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_218.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_219.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_220.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_221.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_222.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_223.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_224.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_225.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_226.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_227.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_228.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_229.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_230.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_231.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_232.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_233.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_234.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_235.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_236.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_237.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_238.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_239.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_240.npy - - 
s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_241.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_242.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_243.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_244.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_245.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_246.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_247.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_248.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_249.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_250.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_251.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_252.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_253.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_254.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_255.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_256.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_257.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_258.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_259.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_260.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_261.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_262.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_263.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_264.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_265.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_266.npy - - 
s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_267.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_268.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_269.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_270.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_271.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_272.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_273.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_274.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_275.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_276.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_277.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_278.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_279.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_280.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_281.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_282.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_283.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_284.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_285.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_286.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_287.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_288.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_289.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_290.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_291.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_292.npy - - 
s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_293.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_294.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_295.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_296.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_297.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_298.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_299.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_300.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_301.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_302.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_303.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_304.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_305.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_306.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_307.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_308.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_309.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_310.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_311.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_312.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_313.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_314.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_315.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_316.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_317.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_318.npy - - 
s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_319.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_320.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_321.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_322.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_323.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_324.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_325.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_326.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_327.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_328.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_329.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_330.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_331.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_332.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_333.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_334.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_335.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_336.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_337.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_338.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_339.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_340.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_341.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_342.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_343.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_344.npy - - 
s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_345.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_346.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_347.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_348.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_349.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_350.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_351.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_352.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_353.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_354.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_355.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_356.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_357.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_358.npy - - s3://ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_359.npy \ No newline at end of file + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_000.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_001.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_002.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_003.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_004.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_005.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_006.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_007.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_008.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_009.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_010.npy 
+ -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_011.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_012.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_013.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_014.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_015.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_016.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_017.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_018.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_019.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_020.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_021.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_022.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_023.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_024.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_025.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_026.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_027.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_028.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_029.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_030.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_031.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_032.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_033.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_034.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_035.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_036.npy + 
-/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_037.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_038.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_039.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_040.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_041.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_042.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_043.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_044.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_045.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_046.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_047.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_048.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_049.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_050.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_051.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_052.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_053.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_054.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_055.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_056.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_057.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_058.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_059.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_060.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_061.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_062.npy + 
-/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_063.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_064.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_065.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_066.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_067.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_068.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_069.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_070.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_071.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_072.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_073.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_074.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_075.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_076.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_077.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_078.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_079.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_080.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_081.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_082.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_083.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_084.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_085.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_086.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_087.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_088.npy + 
-/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_089.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_090.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_091.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_092.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_093.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_094.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_095.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_096.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_097.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_098.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_099.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_100.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_101.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_102.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_103.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_104.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_105.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_106.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_107.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_108.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_109.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_110.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_111.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_112.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_113.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_114.npy + 
-/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_115.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_116.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_117.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_118.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_119.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_120.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_121.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_122.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_123.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_124.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_125.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_126.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_127.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_128.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_129.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_130.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_131.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_132.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_133.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_134.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_135.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_136.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_137.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_138.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_139.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_140.npy + 
-/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_141.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_142.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_143.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_144.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_145.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_146.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_147.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_148.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_149.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_150.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_151.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_152.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_153.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_154.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_155.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_156.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_157.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_158.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_159.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_160.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_161.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_162.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_163.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_164.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_165.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_166.npy + 
-/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_167.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_168.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_169.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_170.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_171.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_172.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_173.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_174.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_175.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_176.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_177.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_178.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_179.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_180.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_181.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_182.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_183.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_184.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_185.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_186.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_187.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_188.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_189.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_190.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_191.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_192.npy + 
-/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_193.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_194.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_195.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_196.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_197.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_198.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_199.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_200.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_201.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_202.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_203.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_204.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_205.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_206.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_207.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_208.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_209.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_210.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_211.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_212.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_213.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_214.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_215.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_216.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_217.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_218.npy + 
-/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_219.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_220.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_221.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_222.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_223.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_224.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_225.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_226.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_227.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_228.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_229.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_230.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_231.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_232.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_233.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_234.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_235.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_236.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_237.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_238.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_239.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_240.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_241.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_242.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_243.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_244.npy + 
-/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_245.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_246.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_247.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_248.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_249.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_250.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_251.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_252.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_253.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_254.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_255.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_256.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_257.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_258.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_259.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_260.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_261.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_262.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_263.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_264.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_265.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_266.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_267.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_268.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_269.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_270.npy + 
-/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_271.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_272.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_273.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_274.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_275.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_276.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_277.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_278.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_279.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_280.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_281.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_282.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_283.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_284.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_285.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_286.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_287.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_288.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_289.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_290.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_291.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_292.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_293.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_294.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_295.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_296.npy + 
-/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_297.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_298.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_299.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_300.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_301.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_302.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_303.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_304.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_305.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_306.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_307.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_308.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_309.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_310.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_311.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_312.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_313.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_314.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_315.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_316.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_317.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_318.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_319.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_320.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_321.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_322.npy + 
-/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_323.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_324.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_325.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_326.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_327.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_328.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_329.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_330.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_331.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_332.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_333.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_334.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_335.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_336.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_337.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_338.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_339.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_340.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_341.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_342.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_343.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_344.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_345.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_346.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_347.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_348.npy + 
-/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_349.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_350.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_351.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_352.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_353.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_354.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_355.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_356.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_357.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_358.npy + -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_359.npy \ No newline at end of file diff --git a/scripts/beaker/llm-360-amber1-launch.sh b/scripts/beaker/llm-360-amber1-launch.sh index 8907e48b5..3ac881088 100755 --- a/scripts/beaker/llm-360-amber1-launch.sh +++ b/scripts/beaker/llm-360-amber1-launch.sh @@ -18,6 +18,7 @@ gantry run \ --host-networking \ --budget ai2/oe-training \ --no-nfs \ + --weka oe-training-default:/weka/oe-training-default \ --propagate-failure \ --synchronized-start-timeout 20m \ --env LOG_FILTER_TYPE=local_rank0_only \ diff --git a/scripts/beaker/llm-360-amber1.sh b/scripts/beaker/llm-360-amber1.sh index cc60b732c..aaa923838 100755 --- a/scripts/beaker/llm-360-amber1.sh +++ b/scripts/beaker/llm-360-amber1.sh @@ -14,7 +14,7 @@ shift # Warm HF cache mkdir -p /root/.cache pushd /root/.cache -curl "https://storage.googleapis.com/dirkgr-public/huggingface_cache_v3.tar.gz" | tar --keep-newer-files -xzf - +curl "https://storage.googleapis.com/dirkgr-public/huggingface_cache_v4.tar.gz" | tar --keep-newer-files -xzf - popd export HF_DATASETS_OFFLINE=1 @@ -31,9 +31,9 @@ torchrun \ configs/llm-360-amber1.yaml \ --gen1_gc_interval=null \ --save_folder=runs/ \ - --save_interval=250 \ - 
--eval_interval=250 \ + --save_interval=1000 \ + --eval_interval=1000 \ --optimizer.metrics_log_interval=1 \ --save_overwrite \ --save_num_checkpoints_to_keep=3 \ - # '--load_path=${path.last_checkpoint:s3://ai2-llm/checkpoints/OLMo-small/llm-306-amber-data-repro-db-normal-init-2}' + '--load_path=${path.last_checkpoint:s3://ai2-llm/checkpoints/OLMo-small/amberish1-base/}' From 47c953d2fc25ca854c9004f0d17a26e2d3d7c1d5 Mon Sep 17 00:00:00 2001 From: Dustin Date: Fri, 14 Jun 2024 18:45:38 -0700 Subject: [PATCH 37/40] fix hf_cache uri --- scripts/beaker/llm-360-amber1.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/beaker/llm-360-amber1.sh b/scripts/beaker/llm-360-amber1.sh index aaa923838..e929c4d83 100755 --- a/scripts/beaker/llm-360-amber1.sh +++ b/scripts/beaker/llm-360-amber1.sh @@ -14,7 +14,7 @@ shift # Warm HF cache mkdir -p /root/.cache pushd /root/.cache -curl "https://storage.googleapis.com/dirkgr-public/huggingface_cache_v4.tar.gz" | tar --keep-newer-files -xzf - +curl "https://storage.googleapis.com/hf-cache/huggingface_cache_v4.tar.gz" | tar --keep-newer-files -xzf - popd export HF_DATASETS_OFFLINE=1 From 7a5367433ecb857de0dff7e80f9bd6c7db5a1085 Mon Sep 17 00:00:00 2001 From: Dustin Date: Fri, 14 Jun 2024 18:52:19 -0700 Subject: [PATCH 38/40] change load path --- scripts/beaker/llm-360-amber1.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/beaker/llm-360-amber1.sh b/scripts/beaker/llm-360-amber1.sh index e929c4d83..1cd2cf315 100755 --- a/scripts/beaker/llm-360-amber1.sh +++ b/scripts/beaker/llm-360-amber1.sh @@ -36,4 +36,4 @@ torchrun \ --optimizer.metrics_log_interval=1 \ --save_overwrite \ --save_num_checkpoints_to_keep=3 \ - '--load_path=${path.last_checkpoint:s3://ai2-llm/checkpoints/OLMo-small/amberish1-base/}' + '--load_path=s3://ai2-llm/checkpoints/OLMo-small/${run_name}/step69750/' From afd65d9c58b42f4dcd1cb7fa60ad6aa46bfedc97 Mon Sep 17 00:00:00 2001 From: Dustin Date: Fri, 14 Jun 2024 
20:46:06 -0700 Subject: [PATCH 39/40] fix data path --- configs/llm-360-amber1.yaml | 746 ++++++++++++++++++------------------ 1 file changed, 373 insertions(+), 373 deletions(-) diff --git a/configs/llm-360-amber1.yaml b/configs/llm-360-amber1.yaml index 4b019407b..aa9c5b5aa 100644 --- a/configs/llm-360-amber1.yaml +++ b/configs/llm-360-amber1.yaml @@ -66,8 +66,8 @@ tokenizer: identifier: huggyllama/llama-7b truncate_direction: right -save_folder: /weka/oe-training-default/ai2-llm/checkpoints/OLMo-medium/dustins-stability/${run_name} -# remote_save_folder:/weka/ai2-llm/checkpoints/OLMo-small/${run_name} +save_folder: /weka//oe-training-default/ai2-llm/checkpoints/OLMo-medium/dustins-stability/${run_name} +# remote_save_folder: /weka//ai2-llm/checkpoints/OLMo-small/${run_name} save_overwrite: true save_interval: 1000 @@ -107,27 +107,27 @@ evaluators: # drop_last: true # datasets: # c4_en-validation: - # -/weka/ai2-llm/eval-data/perplexity/v3_small_gptneox20b/c4_en/val/part-0-00000.npy + # - /weka//ai2-llm/eval-data/perplexity/v3_small_gptneox20b/c4_en/val/part-0-00000.npy # dolma_books-validation: - # -/weka/ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_books/val/part-0-00000.npy + # - /weka//ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_books/val/part-0-00000.npy # dolma_common-crawl-validation: - # -/weka/ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_common-crawl/val/part-0-00000.npy + # - /weka//ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_common-crawl/val/part-0-00000.npy # dolma_pes2o-validation: - # -/weka/ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_pes2o/val/part-0-00000.npy + # - /weka//ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_pes2o/val/part-0-00000.npy # dolma_reddit-validation: - # -/weka/ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_reddit/val/part-0-00000.npy + # - /weka//ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_reddit/val/part-0-00000.npy # dolma_stack-validation: - # 
-/weka/ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_stack/val/part-0-00000.npy + # - /weka//ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_stack/val/part-0-00000.npy # dolma_wiki-validation: - # -/weka/ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_wiki/val/part-0-00000.npy + # - /weka//ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_wiki/val/part-0-00000.npy # ice-validation: - # -/weka/ai2-llm/eval-data/perplexity/v3_small_gptneox20b/ice/val/part-0-00000.npy + # - /weka//ai2-llm/eval-data/perplexity/v3_small_gptneox20b/ice/val/part-0-00000.npy # m2d2_s2orc-validation: - # -/weka/ai2-llm/eval-data/perplexity/v3_small_gptneox20b/m2d2_s2orc/val/part-0-00000.npy + # - /weka//ai2-llm/eval-data/perplexity/v3_small_gptneox20b/m2d2_s2orc/val/part-0-00000.npy # pile-validation: - # -/weka/ai2-llm/eval-data/perplexity/v3_small_gptneox20b/pile/val/part-0-00000.npy + # - /weka//ai2-llm/eval-data/perplexity/v3_small_gptneox20b/pile/val/part-0-00000.npy # wikitext_103-validation: - # -/weka/ai2-llm/eval-data/perplexity/v3_small_gptneox20b/wikitext_103/val/part-0-00000.npy + # - /weka//ai2-llm/eval-data/perplexity/v3_small_gptneox20b/wikitext_103/val/part-0-00000.npy ########################## # Downstream evaluations # @@ -236,363 +236,363 @@ data: timeout: 0 paths: ######### Amber ######### - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_000.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_001.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_002.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_003.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_004.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_005.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_006.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_007.npy - 
-/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_008.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_009.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_010.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_011.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_012.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_013.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_014.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_015.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_016.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_017.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_018.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_019.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_020.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_021.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_022.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_023.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_024.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_025.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_026.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_027.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_028.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_029.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_030.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_031.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_032.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_033.npy - 
-/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_034.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_035.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_036.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_037.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_038.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_039.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_040.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_041.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_042.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_043.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_044.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_045.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_046.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_047.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_048.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_049.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_050.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_051.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_052.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_053.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_054.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_055.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_056.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_057.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_058.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_059.npy - 
-/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_060.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_061.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_062.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_063.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_064.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_065.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_066.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_067.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_068.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_069.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_070.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_071.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_072.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_073.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_074.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_075.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_076.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_077.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_078.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_079.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_080.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_081.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_082.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_083.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_084.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_085.npy - 
-/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_086.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_087.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_088.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_089.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_090.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_091.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_092.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_093.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_094.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_095.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_096.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_097.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_098.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_099.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_100.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_101.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_102.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_103.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_104.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_105.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_106.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_107.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_108.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_109.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_110.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_111.npy - 
-/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_112.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_113.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_114.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_115.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_116.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_117.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_118.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_119.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_120.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_121.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_122.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_123.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_124.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_125.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_126.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_127.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_128.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_129.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_130.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_131.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_132.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_133.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_134.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_135.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_136.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_137.npy - 
-/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_138.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_139.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_140.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_141.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_142.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_143.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_144.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_145.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_146.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_147.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_148.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_149.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_150.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_151.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_152.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_153.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_154.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_155.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_156.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_157.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_158.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_159.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_160.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_161.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_162.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_163.npy - 
-/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_164.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_165.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_166.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_167.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_168.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_169.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_170.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_171.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_172.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_173.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_174.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_175.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_176.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_177.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_178.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_179.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_180.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_181.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_182.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_183.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_184.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_185.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_186.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_187.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_188.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_189.npy - 
-/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_190.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_191.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_192.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_193.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_194.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_195.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_196.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_197.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_198.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_199.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_200.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_201.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_202.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_203.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_204.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_205.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_206.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_207.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_208.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_209.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_210.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_211.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_212.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_213.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_214.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_215.npy - 
-/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_216.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_217.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_218.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_219.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_220.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_221.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_222.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_223.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_224.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_225.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_226.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_227.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_228.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_229.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_230.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_231.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_232.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_233.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_234.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_235.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_236.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_237.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_238.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_239.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_240.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_241.npy - 
-/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_242.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_243.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_244.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_245.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_246.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_247.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_248.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_249.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_250.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_251.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_252.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_253.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_254.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_255.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_256.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_257.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_258.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_259.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_260.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_261.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_262.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_263.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_264.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_265.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_266.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_267.npy - 
-/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_268.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_269.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_270.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_271.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_272.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_273.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_274.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_275.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_276.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_277.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_278.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_279.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_280.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_281.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_282.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_283.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_284.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_285.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_286.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_287.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_288.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_289.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_290.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_291.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_292.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_293.npy - 
-/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_294.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_295.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_296.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_297.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_298.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_299.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_300.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_301.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_302.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_303.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_304.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_305.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_306.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_307.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_308.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_309.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_310.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_311.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_312.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_313.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_314.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_315.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_316.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_317.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_318.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_319.npy - 
-/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_320.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_321.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_322.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_323.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_324.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_325.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_326.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_327.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_328.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_329.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_330.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_331.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_332.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_333.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_334.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_335.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_336.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_337.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_338.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_339.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_340.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_341.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_342.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_343.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_344.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_345.npy - 
-/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_346.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_347.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_348.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_349.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_350.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_351.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_352.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_353.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_354.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_355.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_356.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_357.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_358.npy - -/weka/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_359.npy \ No newline at end of file + - /weka///ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_000.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_001.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_002.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_003.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_004.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_005.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_006.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_007.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_008.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_009.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_010.npy + - 
/weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_011.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_012.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_013.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_014.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_015.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_016.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_017.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_018.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_019.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_020.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_021.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_022.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_023.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_024.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_025.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_026.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_027.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_028.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_029.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_030.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_031.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_032.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_033.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_034.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_035.npy + - 
/weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_036.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_037.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_038.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_039.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_040.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_041.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_042.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_043.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_044.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_045.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_046.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_047.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_048.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_049.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_050.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_051.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_052.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_053.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_054.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_055.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_056.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_057.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_058.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_059.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_060.npy + - 
/weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_061.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_062.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_063.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_064.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_065.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_066.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_067.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_068.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_069.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_070.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_071.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_072.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_073.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_074.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_075.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_076.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_077.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_078.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_079.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_080.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_081.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_082.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_083.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_084.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_085.npy + - 
/weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_086.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_087.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_088.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_089.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_090.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_091.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_092.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_093.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_094.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_095.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_096.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_097.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_098.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_099.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_100.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_101.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_102.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_103.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_104.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_105.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_106.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_107.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_108.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_109.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_110.npy + - 
/weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_111.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_112.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_113.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_114.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_115.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_116.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_117.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_118.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_119.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_120.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_121.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_122.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_123.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_124.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_125.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_126.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_127.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_128.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_129.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_130.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_131.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_132.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_133.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_134.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_135.npy + - 
/weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_136.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_137.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_138.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_139.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_140.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_141.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_142.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_143.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_144.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_145.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_146.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_147.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_148.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_149.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_150.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_151.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_152.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_153.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_154.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_155.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_156.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_157.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_158.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_159.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_160.npy + - 
/weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_161.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_162.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_163.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_164.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_165.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_166.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_167.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_168.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_169.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_170.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_171.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_172.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_173.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_174.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_175.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_176.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_177.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_178.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_179.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_180.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_181.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_182.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_183.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_184.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_185.npy + - 
/weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_186.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_187.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_188.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_189.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_190.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_191.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_192.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_193.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_194.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_195.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_196.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_197.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_198.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_199.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_200.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_201.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_202.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_203.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_204.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_205.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_206.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_207.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_208.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_209.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_210.npy + - 
/weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_211.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_212.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_213.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_214.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_215.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_216.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_217.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_218.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_219.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_220.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_221.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_222.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_223.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_224.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_225.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_226.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_227.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_228.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_229.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_230.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_231.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_232.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_233.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_234.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_235.npy + - 
/weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_236.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_237.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_238.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_239.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_240.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_241.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_242.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_243.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_244.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_245.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_246.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_247.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_248.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_249.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_250.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_251.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_252.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_253.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_254.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_255.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_256.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_257.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_258.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_259.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_260.npy + - 
/weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_261.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_262.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_263.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_264.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_265.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_266.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_267.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_268.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_269.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_270.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_271.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_272.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_273.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_274.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_275.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_276.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_277.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_278.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_279.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_280.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_281.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_282.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_283.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_284.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_285.npy + - 
/weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_286.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_287.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_288.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_289.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_290.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_291.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_292.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_293.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_294.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_295.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_296.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_297.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_298.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_299.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_300.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_301.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_302.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_303.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_304.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_305.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_306.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_307.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_308.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_309.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_310.npy + - 
/weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_311.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_312.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_313.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_314.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_315.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_316.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_317.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_318.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_319.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_320.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_321.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_322.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_323.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_324.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_325.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_326.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_327.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_328.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_329.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_330.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_331.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_332.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_333.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_334.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_335.npy + - 
/weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_336.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_337.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_338.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_339.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_340.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_341.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_342.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_343.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_344.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_345.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_346.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_347.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_348.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_349.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_350.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_351.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_352.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_353.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_354.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_355.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_356.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_357.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_358.npy + - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_359.npy \ No newline at end of file From f7aa4245675316339bce81195fda9e984e06b709 Mon Sep 17 00:00:00 2001 From: Dustin Date: Fri, 14 Jun 
2024 21:07:46 -0700 Subject: [PATCH 40/40] one last data path fix --- configs/llm-360-amber1.yaml | 746 ++++++++++++++++++------------------ 1 file changed, 373 insertions(+), 373 deletions(-) diff --git a/configs/llm-360-amber1.yaml b/configs/llm-360-amber1.yaml index aa9c5b5aa..17951ce85 100644 --- a/configs/llm-360-amber1.yaml +++ b/configs/llm-360-amber1.yaml @@ -66,8 +66,8 @@ tokenizer: identifier: huggyllama/llama-7b truncate_direction: right -save_folder: /weka//oe-training-default/ai2-llm/checkpoints/OLMo-medium/dustins-stability/${run_name} -# remote_save_folder: /weka//ai2-llm/checkpoints/OLMo-small/${run_name} +save_folder: /weka/oe-training-default/oe-training-default/ai2-llm/checkpoints/OLMo-medium/dustins-stability/${run_name} +# remote_save_folder: /weka/oe-training-default/ai2-llm/checkpoints/OLMo-small/${run_name} save_overwrite: true save_interval: 1000 @@ -107,27 +107,27 @@ evaluators: # drop_last: true # datasets: # c4_en-validation: - # - /weka//ai2-llm/eval-data/perplexity/v3_small_gptneox20b/c4_en/val/part-0-00000.npy + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_gptneox20b/c4_en/val/part-0-00000.npy # dolma_books-validation: - # - /weka//ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_books/val/part-0-00000.npy + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_books/val/part-0-00000.npy # dolma_common-crawl-validation: - # - /weka//ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_common-crawl/val/part-0-00000.npy + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_common-crawl/val/part-0-00000.npy # dolma_pes2o-validation: - # - /weka//ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_pes2o/val/part-0-00000.npy + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_pes2o/val/part-0-00000.npy # dolma_reddit-validation: - # - 
/weka//ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_reddit/val/part-0-00000.npy + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_reddit/val/part-0-00000.npy # dolma_stack-validation: - # - /weka//ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_stack/val/part-0-00000.npy + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_stack/val/part-0-00000.npy # dolma_wiki-validation: - # - /weka//ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_wiki/val/part-0-00000.npy + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_wiki/val/part-0-00000.npy # ice-validation: - # - /weka//ai2-llm/eval-data/perplexity/v3_small_gptneox20b/ice/val/part-0-00000.npy + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_gptneox20b/ice/val/part-0-00000.npy # m2d2_s2orc-validation: - # - /weka//ai2-llm/eval-data/perplexity/v3_small_gptneox20b/m2d2_s2orc/val/part-0-00000.npy + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_gptneox20b/m2d2_s2orc/val/part-0-00000.npy # pile-validation: - # - /weka//ai2-llm/eval-data/perplexity/v3_small_gptneox20b/pile/val/part-0-00000.npy + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_gptneox20b/pile/val/part-0-00000.npy # wikitext_103-validation: - # - /weka//ai2-llm/eval-data/perplexity/v3_small_gptneox20b/wikitext_103/val/part-0-00000.npy + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_gptneox20b/wikitext_103/val/part-0-00000.npy ########################## # Downstream evaluations # @@ -236,363 +236,363 @@ data: timeout: 0 paths: ######### Amber ######### - - /weka///ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_000.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_001.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_002.npy - - 
/weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_003.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_004.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_005.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_006.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_007.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_008.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_009.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_010.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_011.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_012.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_013.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_014.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_015.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_016.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_017.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_018.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_019.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_020.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_021.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_022.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_023.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_024.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_025.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_026.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_027.npy - - 
/weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_028.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_029.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_030.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_031.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_032.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_033.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_034.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_035.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_036.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_037.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_038.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_039.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_040.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_041.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_042.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_043.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_044.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_045.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_046.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_047.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_048.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_049.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_050.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_051.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_052.npy - - 
/weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_053.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_054.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_055.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_056.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_057.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_058.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_059.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_060.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_061.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_062.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_063.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_064.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_065.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_066.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_067.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_068.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_069.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_070.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_071.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_072.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_073.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_074.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_075.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_076.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_077.npy - - 
/weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_078.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_079.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_080.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_081.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_082.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_083.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_084.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_085.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_086.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_087.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_088.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_089.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_090.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_091.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_092.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_093.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_094.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_095.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_096.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_097.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_098.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_099.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_100.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_101.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_102.npy - - 
/weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_103.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_104.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_105.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_106.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_107.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_108.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_109.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_110.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_111.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_112.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_113.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_114.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_115.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_116.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_117.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_118.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_119.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_120.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_121.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_122.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_123.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_124.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_125.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_126.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_127.npy - - 
/weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_128.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_129.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_130.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_131.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_132.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_133.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_134.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_135.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_136.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_137.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_138.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_139.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_140.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_141.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_142.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_143.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_144.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_145.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_146.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_147.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_148.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_149.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_150.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_151.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_152.npy - - 
/weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_153.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_154.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_155.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_156.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_157.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_158.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_159.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_160.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_161.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_162.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_163.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_164.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_165.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_166.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_167.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_168.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_169.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_170.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_171.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_172.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_173.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_174.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_175.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_176.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_177.npy - - 
/weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_178.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_179.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_180.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_181.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_182.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_183.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_184.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_185.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_186.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_187.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_188.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_189.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_190.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_191.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_192.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_193.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_194.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_195.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_196.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_197.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_198.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_199.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_200.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_201.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_202.npy - - 
/weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_203.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_204.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_205.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_206.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_207.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_208.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_209.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_210.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_211.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_212.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_213.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_214.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_215.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_216.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_217.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_218.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_219.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_220.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_221.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_222.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_223.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_224.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_225.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_226.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_227.npy - - 
/weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_228.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_229.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_230.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_231.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_232.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_233.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_234.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_235.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_236.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_237.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_238.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_239.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_240.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_241.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_242.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_243.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_244.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_245.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_246.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_247.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_248.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_249.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_250.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_251.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_252.npy - - 
/weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_253.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_254.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_255.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_256.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_257.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_258.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_259.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_260.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_261.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_262.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_263.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_264.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_265.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_266.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_267.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_268.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_269.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_270.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_271.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_272.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_273.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_274.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_275.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_276.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_277.npy - - 
/weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_278.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_279.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_280.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_281.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_282.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_283.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_284.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_285.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_286.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_287.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_288.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_289.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_290.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_291.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_292.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_293.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_294.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_295.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_296.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_297.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_298.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_299.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_300.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_301.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_302.npy - - 
/weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_303.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_304.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_305.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_306.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_307.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_308.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_309.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_310.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_311.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_312.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_313.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_314.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_315.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_316.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_317.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_318.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_319.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_320.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_321.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_322.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_323.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_324.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_325.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_326.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_327.npy - - 
/weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_328.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_329.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_330.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_331.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_332.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_333.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_334.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_335.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_336.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_337.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_338.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_339.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_340.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_341.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_342.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_343.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_344.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_345.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_346.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_347.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_348.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_349.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_350.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_351.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_352.npy - - 
/weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_353.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_354.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_355.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_356.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_357.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_358.npy - - /weka//ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_359.npy \ No newline at end of file + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_005.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_006.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_007.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_008.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_009.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_010.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_011.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_012.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_013.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_014.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_015.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_016.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_017.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_018.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_019.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_020.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_021.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_022.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_023.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_024.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_025.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_026.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_027.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_028.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_029.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_030.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_031.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_032.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_033.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_034.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_035.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_036.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_037.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_038.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_039.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_040.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_041.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_042.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_043.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_044.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_045.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_046.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_047.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_048.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_049.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_050.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_051.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_052.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_053.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_054.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_055.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_056.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_057.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_058.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_059.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_060.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_061.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_062.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_063.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_064.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_065.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_066.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_067.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_068.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_069.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_070.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_071.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_072.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_073.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_074.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_075.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_076.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_077.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_078.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_079.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_080.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_081.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_082.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_083.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_084.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_085.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_086.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_087.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_088.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_089.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_090.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_091.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_092.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_093.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_094.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_095.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_096.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_097.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_098.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_099.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_100.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_101.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_102.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_103.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_104.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_105.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_106.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_107.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_108.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_109.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_110.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_111.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_112.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_113.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_114.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_115.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_116.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_117.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_118.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_119.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_120.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_121.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_122.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_123.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_124.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_125.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_126.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_127.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_128.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_129.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_130.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_131.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_132.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_133.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_134.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_135.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_136.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_137.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_138.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_139.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_140.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_141.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_142.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_143.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_144.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_145.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_146.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_147.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_148.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_149.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_150.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_151.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_152.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_153.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_154.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_155.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_156.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_157.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_158.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_159.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_160.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_161.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_162.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_163.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_164.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_165.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_166.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_167.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_168.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_169.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_170.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_171.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_172.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_173.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_174.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_175.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_176.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_177.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_178.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_179.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_180.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_181.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_182.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_183.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_184.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_185.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_186.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_187.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_188.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_189.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_190.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_191.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_192.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_193.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_194.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_195.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_196.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_197.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_198.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_199.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_200.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_201.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_202.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_203.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_204.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_205.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_206.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_207.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_208.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_209.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_210.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_211.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_212.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_213.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_214.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_215.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_216.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_217.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_218.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_219.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_220.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_221.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_222.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_223.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_224.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_225.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_226.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_227.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_228.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_229.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_230.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_231.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_232.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_233.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_234.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_235.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_236.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_237.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_238.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_239.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_240.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_241.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_242.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_243.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_244.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_245.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_246.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_247.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_248.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_249.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_250.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_251.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_252.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_253.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_254.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_255.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_256.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_257.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_258.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_259.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_260.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_261.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_262.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_263.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_264.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_265.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_266.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_267.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_268.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_269.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_270.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_271.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_272.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_273.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_274.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_275.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_276.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_277.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_278.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_279.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_280.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_281.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_282.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_283.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_284.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_285.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_286.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_287.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_288.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_289.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_290.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_291.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_292.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_293.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_294.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_295.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_296.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_297.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_298.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_299.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_300.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_301.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_302.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_303.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_304.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_305.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_306.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_307.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_308.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_309.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_310.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_311.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_312.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_313.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_314.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_315.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_316.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_317.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_318.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_319.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_320.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_321.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_322.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_323.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_324.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_325.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_326.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_327.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_328.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_329.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_330.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_331.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_332.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_333.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_334.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_335.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_336.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_337.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_338.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_339.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_340.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_341.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_342.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_343.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_344.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_345.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_346.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_347.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_348.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_349.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_350.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_351.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_352.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_353.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_354.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_355.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_356.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_357.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_358.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_359.npy \ No newline at end of file