diff --git a/classifiers/scripts/fineweb_100b.sh b/classifiers/scripts/fineweb_100b.sh
index 224e011f..45298889 100644
--- a/classifiers/scripts/fineweb_100b.sh
+++ b/classifiers/scripts/fineweb_100b.sh
@@ -42,4 +42,4 @@ gantry run \
     --shared-memory 10GiB \
     --install "pip install -e classifiers/" \
     --yes \
-    -- /bin/bash -c "huggingface-cli download ${MODEL_NAME} && torchrun --nnodes "${NUM_NODES}:${NUM_NODES}" --nproc-per-node 8 --rdzv_id 12347 --rdzv_backend static --rdzv_endpoint "\${BEAKER_LEADER_REPLICA_HOSTNAME}:29400" --node_rank "\${BEAKER_REPLICA_RANK}" --rdzv_conf 'read_timeout=420' -m dolma_classifiers.inference --source-prefix ${DOCUMENTS} --batch-size ${BATCH_SIZE} --use-wandb --wandb-project 'dolma-classifiers' --wandb-entity ai2-llm --model-name ${MODEL_NAME} --num-workers 8 --prefetch-factor 8"
+    -- /bin/bash -c "huggingface-cli download ${MODEL_NAME} && torchrun --nnodes "${NUM_NODES}:${NUM_NODES}" --nproc-per-node 8 --rdzv_id 12347 --rdzv_backend static --rdzv_endpoint "\${BEAKER_LEADER_REPLICA_HOSTNAME}:29400" --node_rank "\${BEAKER_REPLICA_RANK}" --rdzv_conf 'read_timeout=3600' -m dolma_classifiers.inference --source-prefix ${DOCUMENTS} --batch-size ${BATCH_SIZE} --use-wandb --wandb-project 'dolma-classifiers' --wandb-entity ai2-llm --model-name ${MODEL_NAME} --num-workers 8 --prefetch-factor 8"
diff --git a/classifiers/scripts/fineweb_40b.sh b/classifiers/scripts/fineweb_40b.sh
index fcd8cd66..5c53820e 100644
--- a/classifiers/scripts/fineweb_40b.sh
+++ b/classifiers/scripts/fineweb_40b.sh
@@ -41,4 +41,4 @@ gantry run \
     --shared-memory 10GiB \
     --install "pip install -e classifiers/" \
     --yes \
-    -- /bin/bash -c "huggingface-cli download ${MODEL_NAME} && torchrun --nnodes "${NUM_NODES}:${NUM_NODES}" --nproc-per-node 8 --rdzv_id 12347 --rdzv_backend static --rdzv_endpoint "\${BEAKER_LEADER_REPLICA_HOSTNAME}:29400" --node_rank "\${BEAKER_REPLICA_RANK}" --rdzv_conf 'read_timeout=420' -m dolma_classifiers.inference --source-prefix ${DOCUMENTS} --batch-size ${BATCH_SIZE} --use-wandb --wandb-project 'dolma-classifiers' --wandb-entity ai2-llm --model-name ${MODEL_NAME} --num-workers 4 --prefetch-factor 8"
+    -- /bin/bash -c "huggingface-cli download ${MODEL_NAME} && torchrun --nnodes "${NUM_NODES}:${NUM_NODES}" --nproc-per-node 8 --rdzv_id 12347 --rdzv_backend static --rdzv_endpoint "\${BEAKER_LEADER_REPLICA_HOSTNAME}:29400" --node_rank "\${BEAKER_REPLICA_RANK}" --rdzv_conf 'read_timeout=3600' -m dolma_classifiers.inference --source-prefix ${DOCUMENTS} --batch-size ${BATCH_SIZE} --use-wandb --wandb-project 'dolma-classifiers' --wandb-entity ai2-llm --model-name ${MODEL_NAME} --num-workers 4 --prefetch-factor 8"
diff --git a/classifiers/scripts/fineweb_50b_extra.sh b/classifiers/scripts/fineweb_50b_extra.sh
index 5017bf98..d80909c8 100644
--- a/classifiers/scripts/fineweb_50b_extra.sh
+++ b/classifiers/scripts/fineweb_50b_extra.sh
@@ -42,4 +42,4 @@ gantry run \
     --shared-memory 10GiB \
     --install "pip install -e classifiers/" \
     --yes \
-    -- /bin/bash -c "huggingface-cli download ${MODEL_NAME} && torchrun --nnodes "${NUM_NODES}:${NUM_NODES}" --nproc-per-node 8 --rdzv_id 12347 --rdzv_backend static --rdzv_endpoint "\${BEAKER_LEADER_REPLICA_HOSTNAME}:29400" --node_rank "\${BEAKER_REPLICA_RANK}" --rdzv_conf 'read_timeout=420' -m dolma_classifiers.inference --source-prefix ${DOCUMENTS} --batch-size ${BATCH_SIZE} --use-wandb --wandb-project 'dolma-classifiers' --wandb-entity ai2-llm --model-name ${MODEL_NAME} --num-workers 8 --prefetch-factor 8"
+    -- /bin/bash -c "huggingface-cli download ${MODEL_NAME} && torchrun --nnodes "${NUM_NODES}:${NUM_NODES}" --nproc-per-node 8 --rdzv_id 12347 --rdzv_backend static --rdzv_endpoint "\${BEAKER_LEADER_REPLICA_HOSTNAME}:29400" --node_rank "\${BEAKER_REPLICA_RANK}" --rdzv_conf 'read_timeout=3600' -m dolma_classifiers.inference --source-prefix ${DOCUMENTS} --batch-size ${BATCH_SIZE} --use-wandb --wandb-project 'dolma-classifiers' --wandb-entity ai2-llm --model-name ${MODEL_NAME} --num-workers 8 --prefetch-factor 8"
diff --git a/classifiers/scripts/fineweb_dclm07.sh b/classifiers/scripts/fineweb_dclm07.sh
index 6b49b05f..bb41d32b 100644
--- a/classifiers/scripts/fineweb_dclm07.sh
+++ b/classifiers/scripts/fineweb_dclm07.sh
@@ -42,4 +42,4 @@ gantry run \
     --shared-memory 10GiB \
     --install "pip install -e classifiers/" \
     --yes \
-    -- /bin/bash -c "huggingface-cli download ${MODEL_NAME} && torchrun --nnodes "${NUM_NODES}:${NUM_NODES}" --nproc-per-node 8 --rdzv_id 12347 --rdzv_backend static --rdzv_endpoint "\${BEAKER_LEADER_REPLICA_HOSTNAME}:29400" --node_rank "\${BEAKER_REPLICA_RANK}" --rdzv_conf 'read_timeout=420' -m dolma_classifiers.inference --source-prefix ${DOCUMENTS} --batch-size ${BATCH_SIZE} --use-wandb --wandb-project 'dolma-classifiers' --wandb-entity ai2-llm --model-name ${MODEL_NAME} --num-workers 4"
+    -- /bin/bash -c "huggingface-cli download ${MODEL_NAME} && torchrun --nnodes "${NUM_NODES}:${NUM_NODES}" --nproc-per-node 8 --rdzv_id 12347 --rdzv_backend static --rdzv_endpoint "\${BEAKER_LEADER_REPLICA_HOSTNAME}:29400" --node_rank "\${BEAKER_REPLICA_RANK}" --rdzv_conf 'read_timeout=3600' -m dolma_classifiers.inference --source-prefix ${DOCUMENTS} --batch-size ${BATCH_SIZE} --use-wandb --wandb-project 'dolma-classifiers' --wandb-entity ai2-llm --model-name ${MODEL_NAME} --num-workers 4"
diff --git a/classifiers/scripts/nvidia-deberta-40b.sh b/classifiers/scripts/nvidia-deberta-40b.sh
index 101a5cd8..b3a57c14 100644
--- a/classifiers/scripts/nvidia-deberta-40b.sh
+++ b/classifiers/scripts/nvidia-deberta-40b.sh
@@ -42,4 +42,4 @@ gantry run \
     --shared-memory 10GiB \
     --install "pip install -e classifiers/" \
     --yes \
-    -- /bin/bash -c "huggingface-cli download ${MODEL_NAME} && torchrun --nnodes "${NUM_NODES}:${NUM_NODES}" --nproc-per-node 8 --rdzv_id 12347 --rdzv_backend static --rdzv_endpoint "\${BEAKER_LEADER_REPLICA_HOSTNAME}:29400" --node_rank "\${BEAKER_REPLICA_RANK}" --rdzv_conf 'read_timeout=420' -m dolma_classifiers.inference --source-prefix ${DOCUMENTS} --batch-size ${BATCH_SIZE} --use-wandb --wandb-project 'dolma-classifiers' --wandb-entity ai2-llm --model-name ${MODEL_NAME} --num-workers 4 --model-compile --max-length 1024"
+    -- /bin/bash -c "huggingface-cli download ${MODEL_NAME} && torchrun --nnodes "${NUM_NODES}:${NUM_NODES}" --nproc-per-node 8 --rdzv_id 12347 --rdzv_backend static --rdzv_endpoint "\${BEAKER_LEADER_REPLICA_HOSTNAME}:29400" --node_rank "\${BEAKER_REPLICA_RANK}" --rdzv_conf 'read_timeout=3600' -m dolma_classifiers.inference --source-prefix ${DOCUMENTS} --batch-size ${BATCH_SIZE} --use-wandb --wandb-project 'dolma-classifiers' --wandb-entity ai2-llm --model-name ${MODEL_NAME} --num-workers 4 --model-compile --max-length 1024"
diff --git a/classifiers/scripts/nvidia-deberta-50_extra.sh b/classifiers/scripts/nvidia-deberta-50_extra.sh
index 49c8cc9b..cdd1f7b5 100644
--- a/classifiers/scripts/nvidia-deberta-50_extra.sh
+++ b/classifiers/scripts/nvidia-deberta-50_extra.sh
@@ -42,4 +42,4 @@ gantry run \
     --shared-memory 10GiB \
     --install "pip install -e classifiers/" \
     --yes \
-    -- /bin/bash -c "huggingface-cli download ${MODEL_NAME} && torchrun --nnodes "${NUM_NODES}:${NUM_NODES}" --nproc-per-node 8 --rdzv_id 12347 --rdzv_backend static --rdzv_endpoint "\${BEAKER_LEADER_REPLICA_HOSTNAME}:29400" --node_rank "\${BEAKER_REPLICA_RANK}" --rdzv_conf 'read_timeout=420' -m dolma_classifiers.inference --source-prefix ${DOCUMENTS} --batch-size ${BATCH_SIZE} --use-wandb --wandb-project 'dolma-classifiers' --wandb-entity ai2-llm --model-name ${MODEL_NAME} --num-workers 4 --model-compile --max-length 1024"
+    -- /bin/bash -c "huggingface-cli download ${MODEL_NAME} && torchrun --nnodes "${NUM_NODES}:${NUM_NODES}" --nproc-per-node 8 --rdzv_id 12347 --rdzv_backend static --rdzv_endpoint "\${BEAKER_LEADER_REPLICA_HOSTNAME}:29400" --node_rank "\${BEAKER_REPLICA_RANK}" --rdzv_conf 'read_timeout=3600' -m dolma_classifiers.inference --source-prefix ${DOCUMENTS} --batch-size ${BATCH_SIZE} --use-wandb --wandb-project 'dolma-classifiers' --wandb-entity ai2-llm --model-name ${MODEL_NAME} --num-workers 4 --model-compile --max-length 1024"
diff --git a/classifiers/src/dolma_classifiers/inference.py b/classifiers/src/dolma_classifiers/inference.py
index d8c81aa4..1ca59a26 100644
--- a/classifiers/src/dolma_classifiers/inference.py
+++ b/classifiers/src/dolma_classifiers/inference.py
@@ -294,7 +294,7 @@ def process_documents(
             scores = classifier.score(**inputs)
 
             attributes = [
-                {"id": doc_id, "attributes": {pred.label: [0, doc_length, pred.score] for pred in doc_preds}}
+                {"id": doc_id, "attributes": {pred.label: [[0, doc_length, pred.score]] for pred in doc_preds}}
                 for doc_preds, doc_id, doc_length in zip(scores, batch.ids, batch.lengths)
             ]
             scores_queue.put_nowait(AttributeRow(sources=batch.sources, attributes=attributes))