Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix models path #25

Merged
merged 22 commits into from
Sep 10, 2024
Merged
Show file tree
Hide file tree
Changes from 16 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# python
__pycache__
.venv

# vscode
.vscode/
Expand Down
30,786 changes: 15,380 additions & 15,406 deletions paths.json

Large diffs are not rendered by default.

35 changes: 25 additions & 10 deletions results.py
KennethEnevoldsen marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""MTEB Results"""

import json
import os
Samoed marked this conversation as resolved.
Show resolved Hide resolved

import datasets

Expand Down Expand Up @@ -55,7 +56,6 @@
"DanskBERT",
"FollowIR-7B",
"GritLM-7B",
"GritLM-7B-noinstruct",
KennethEnevoldsen marked this conversation as resolved.
Show resolved Hide resolved
"LASER2",
"LLM2Vec-Llama-2-supervised",
"LLM2Vec-Llama-2-unsupervised",
Expand Down Expand Up @@ -106,7 +106,6 @@
"contriever-base-msmarco",
"cross-en-de-roberta-sentence-transformer",
"dfm-encoder-large-v1",
"dfm-sentence-encoder-large-1",
"distilbert-base-25lang-cased",
"distilbert-base-en-fr-cased",
"distilbert-base-en-fr-es-pt-it-cased",
Expand All @@ -129,6 +128,7 @@
"elser-v2",
"embedder-100p",
"facebook-dpr-ctx_encoder-multiset-base",
"facebookdragon-plus-context-encoder",
"flan-t5-base",
"flan-t5-large",
"flaubert_base_cased",
Expand Down Expand Up @@ -193,14 +193,19 @@
"sentence-t5-large",
"sentence-t5-xl",
"sentence-t5-xxl",
"sentence-transformers__LaBSE",
"sentence-transformers__all-MiniLM-L12-v2",
KennethEnevoldsen marked this conversation as resolved.
Show resolved Hide resolved
"sentence-transformers__all-MiniLM-L6-v2",
"sentence-transformers__all-mpnet-base-v2",
"sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2",
"sentence-transformers__paraphrase-multilingual-mpnet-base-v2",
"all-MiniLM-L12-v2",
"sgpt-bloom-1b7-nli",
"sgpt-bloom-7b1-msmarco",
"SGPT-125M-weightedmean-nli-bitfit",
"SGPT-1.3B-weightedmean-msmarco-specb-bitfit",
"SGPT-5.8B-weightedmean-msmarco-specb-bitfit-que",
"SGPT-5.8B-weightedmean-msmarco-specb-bitfit",
"SGPT-5.8B-weightedmean-nli-bitfit",
"SGPT-2.7B-weightedmean-msmarco-specb-bitfit",
"SGPT-125M-weightedmean-msmarco-specb-bitfit-que",
"SGPT-125M-weightedmean-msmarco-specb-bitfit-doc",
"text-embedding-preview-0409-768",
"SGPT-125M-weightedmean-msmarco-specb-bitfit",
"silver-retriever-base-v1",
"st-polish-paraphrase-from-distilroberta",
"st-polish-paraphrase-from-mpnet",
Expand Down Expand Up @@ -254,6 +259,13 @@
]


def get_model_for_current_dir(dir_name: str) -> str | None:
for model in MODELS:
if model == dir_name or ("__" in dir_name and dir_name.split("__")[1] == model):
return model
return None


# Needs to be run whenever new files are added
def get_paths():
import collections, json, os
Expand All @@ -263,13 +275,17 @@ def get_paths():
if not os.path.isdir(results_model_dir):
print(f"Skipping {results_model_dir}")
continue
model_name = get_model_for_current_dir(model_dir)
if model_name is None:
print(f"Skipping {model_dir} model dir")
continue
for revision_folder in os.listdir(results_model_dir):
if not os.path.isdir(os.path.join(results_model_dir, revision_folder)):
continue
for res_file in os.listdir(os.path.join(results_model_dir, revision_folder)):
if (res_file.endswith(".json")) and not(res_file.endswith(("overall_results.json", "model_meta.json"))):
results_model_file = os.path.join(results_model_dir, revision_folder, res_file)
files[model_dir].append(results_model_file)
files[model_name].append(results_model_file)
with open("paths.json", "w") as f:
json.dump(files, f, indent=2)
return files
Expand Down Expand Up @@ -305,7 +321,6 @@ def _info(self):

def _split_generators(self, dl_manager):
path_file = dl_manager.download_and_extract(URL)
# Local debugging:
with open(path_file) as f:
files = json.load(f)
downloaded_files = dl_manager.download_and_extract(files[self.config.name])
Expand Down
Empty file added tests/__init__.py
Empty file.
11 changes: 11 additions & 0 deletions tests/test_load_datasets.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
from pathlib import Path
import pytest
from datasets import load_dataset
from results import MODELS


@pytest.mark.parametrize("model", MODELS)
def test_load_results_from_datasets(model):
    """Check that every entry in MODELS loads through the results.py script."""
    # results.py sits one directory above the directory containing this test.
    repo_root = Path(__file__).parent.parent
    script_path = repo_root / "results.py"
    # The returned dataset is not inspected; the test passes when loading
    # the given model configuration does not raise.
    load_dataset(str(script_path.absolute()), model, trust_remote_code=True)
2 changes: 1 addition & 1 deletion tests/test_load_results.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,4 +23,4 @@ def test_load_results():
known_model = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
known_revision = "bf3bf13ab40c3157080a7ab344c831b9ad18b5eb"
assert known_model in results
assert known_revision in results[known_model]
assert known_revision in results[known_model]
Loading