[feat] add hfd flags
huyiwen committed Aug 16, 2024
1 parent d213d9c commit fbe273a
Showing 3 changed files with 37 additions and 7 deletions.
utilization/load_dataset.py (11 additions & 4 deletions)

@@ -112,6 +112,7 @@ def get_subsets(
     paths = [str(get_script_path(cache_path))] + paths
 
     found_config = False
+    s = None
     for path in paths:
         if path is None:
             continue
@@ -121,7 +122,11 @@ def get_subsets(
             found_config = True
             break
         except Exception as e:
-            logger.info(f"Failed when trying to get_dataset_config_names({path}): {e}. Trying another method...")
+            logger.info(f"Failed when trying to get_dataset_config_names({path}): {e}")
+    if s is None:
+        raise RuntimeError(
+            f"Failed to get dataset config names for {dataset_cls}. Please check the dataset path or the internet connection. If you have problem connecting to HuggingFace, you can set `--hf_mirror` to use a mirror site."
+        )
 
     logger.debug(f"get_dataset_config_names({path}): {s}")
 
@@ -238,7 +243,7 @@ def load_dataset(
     dataset_classes = import_dataset_classes(dataset_name)
     cache_paths = []
     for dcls in dataset_classes:
-        if dcls.load_args is None:
+        if not isinstance(dcls.load_args, tuple):
             continue
         elif len(dcls.load_args) > 0:
             cache_paths.append(
@@ -247,7 +252,8 @@ def load_dataset(
                     args.hfd_cache_path,
                     hf_username=evaluation_args.hf_username,
                     hf_token=evaluation_args.hf_token,
-                    mirror=args.hf_mirror
+                    mirror=args.hf_mirror,
+                    evaluation_args=evaluation_args,
                 )
             )
         else:
@@ -258,7 +264,8 @@ def load_dataset(
                     args.hfd_cache_path,
                     hf_username=evaluation_args.hf_username,
                     hf_token=evaluation_args.hf_token,
-                    mirror=args.hf_mirror
+                    mirror=args.hf_mirror,
+                    evaluation_args=evaluation_args,
                 )
             )
     available_subsets_by_cls = get_subsets(dataset_name, dataset_classes, args, cache_paths)
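
The get_subsets change makes subset discovery fail fast: each candidate path is tried in turn, the first successful get_dataset_config_names call wins, and a RuntimeError is raised only if every path fails. A minimal, self-contained sketch of that pattern; the resolver stub and the paths below are hypothetical, not the project's code:

from typing import List, Optional

def get_dataset_config_names(path: str) -> List[str]:
    # Stand-in for datasets.get_dataset_config_names, which queries the Hub.
    if "local" not in path:
        raise ConnectionError("cannot reach HuggingFace")
    return ["default"]

paths: List[Optional[str]] = [None, "openai/gsm8k", "/data/local/gsm8k"]

s = None
for path in paths:
    if path is None:
        continue  # unset candidate, same skip as in get_subsets
    try:
        s = get_dataset_config_names(path)
        break  # first success wins
    except Exception as e:
        print(f"Failed when trying to get_dataset_config_names({path}): {e}")
if s is None:
    # Mirrors the new RuntimeError: nothing resolved on any path.
    raise RuntimeError("Failed to get dataset config names for any path.")
print(s)  # ['default']
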
utilization/utils/arguments.py (15 additions & 2 deletions)

@@ -11,7 +11,6 @@
 
 import tiktoken
 
-from ..chat_templates import DEFAULT_CHAT_CONFIGS
 from ..dataset_enum import DEFAULT_VLLM_DATASETS
 from ..model_enum import (
     ANTHROPIC_CHAT_COMPLETIONS_ARGS, API_MODELS, DASHSCOPE_CHAT_COMPLETIONS_ARGS, HUGGINGFACE_ARGS,
@@ -488,7 +487,6 @@ class DatasetArguments:
     )
 
     continue_from: ClassVar[int] = 0
-    """The number of instances (lines) in .json file to resume from. This is set in `PredictionWriter.write_metainfo`."""
 
     # set in `set_logging` with format "{evaluation_results_dir}/{log_filename}.json"
     evaluation_results_path: ClassVar[Optional[str]] = None
@@ -515,6 +513,9 @@ def __post_init__(self):
                 f"Invalid batch size: {self.batch_size}. Specify an integer (e.g., '10') to use a fixed batch size for all iterations. Alternatively, append ':auto' (e.g., '10:auto') to start with the specified batch size and automatically adjust it in subsequent iterations to maintain constant CUDA memory usage"
             )
 
+        if self.hf_mirror:
+            os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"
+
 
 @dataclass
 class EvaluationArguments:
@@ -572,6 +573,18 @@ class EvaluationArguments:
         default=False,
         help="Whether to skip metrics evaluation and only do inference",
     )
+    hfd_exclude_pattern: str = HfArg(
+        default=None,
+        help="The exclude pattern for datasets downloaded with hfd.sh",
+    )
+    hfd_include_pattern: str = HfArg(
+        default=None,
+        help="The include pattern for datasets downloaded with hfd.sh",
+    )
+    hfd_skip_check: bool = HfArg(
+        default=False,
+        help="Whether to skip the check of hfd.sh",
+    )
 
     _redact = {"hf_token"}
 
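
Two behaviors land in arguments.py. First, --hf_mirror now works by exporting HF_ENDPOINT in DatasetArguments.__post_init__; huggingface_hub reads that variable when it is imported, so the switch has to run before the first Hub call. Second, the three new hfd_* fields are declared with HfArg, so they presumably surface as --hfd_exclude_pattern, --hfd_include_pattern, and --hfd_skip_check on the command line. A minimal sketch of the mirror switch, using a stand-in dataclass rather than the project's DatasetArguments:

import os
from dataclasses import dataclass

@dataclass
class Args:
    hf_mirror: bool = False

    def __post_init__(self):
        # Same effect as the new branch in DatasetArguments.__post_init__.
        if self.hf_mirror:
            os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"

Args(hf_mirror=True)
print(os.environ["HF_ENDPOINT"])  # https://hf-mirror.com
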
utilization/utils/hfd.py (11 additions & 1 deletion)

@@ -41,6 +41,7 @@ def huggingface_download(
     hf_token: Optional[str] = None,
     old: str = "https://huggingface.co",
     new: str = "https://hf-mirror.com",
+    evaluation_args=None,
 ) -> Optional[str]:
     """Download a dataset from Hugging Face Hub to a local directory using hfd.sh."""
 
@@ -62,14 +63,23 @@
     logger.debug(f"Downloading {path} to {repo_path}")
 
     mirror_flag = " --mirror" if mirror else ""
+
     if hf_username and hf_token:
         auth = f" --hf_username {hf_username} --hf_token {hf_token}"
     elif hf_token:
         auth = f" --hf_token {hf_token}"
     else:
         auth = ""
 
+    if evaluation_args is not None:
+        if evaluation_args.hfd_exclude_pattern is not None:
+            auth += f" --exclude-pattern {evaluation_args.hfd_exclude_pattern}"
+        if evaluation_args.hfd_include_pattern is not None:
+            auth += f" --include-pattern {evaluation_args.hfd_include_pattern}"
+
     command = f"bash {hfd_cli.as_posix()} {path} --dataset --local-dir {repo_path.as_posix()}{auth}"
-    os.system(command + mirror_flag)
+    if not evaluation_args.hfd_skip_check:
+        os.system(command + mirror_flag)
 
     update_script(load_script_path, mirror, old, new)
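
The new flags ultimately just splice extra hfd.sh options into the shell command that huggingface_download assembles, and hfd_skip_check gates whether the command is executed at all. A rough sketch of that assembly, with a SimpleNamespace standing in for EvaluationArguments; the repo id and local path are made up for illustration:

from types import SimpleNamespace

evaluation_args = SimpleNamespace(
    hfd_exclude_pattern="*.zip",
    hfd_include_pattern=None,
    hfd_skip_check=False,
)

flags = ""  # credentials omitted; the real code also appends --hf_username/--hf_token here
if evaluation_args.hfd_exclude_pattern is not None:
    flags += f" --exclude-pattern {evaluation_args.hfd_exclude_pattern}"
if evaluation_args.hfd_include_pattern is not None:
    flags += f" --include-pattern {evaluation_args.hfd_include_pattern}"

command = f"bash hfd.sh openai/gsm8k --dataset --local-dir /tmp/hfd/gsm8k{flags}"
if not evaluation_args.hfd_skip_check:
    print("would run:", command)
# would run: bash hfd.sh openai/gsm8k --dataset --local-dir /tmp/hfd/gsm8k --exclude-pattern *.zip
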
